def create_data_with_duplicate(filename, pos_dict, neg_dict, min_len, max_len, num, get_difference):
    """Sample labelled strings for each length and write them to ``filename``.

    For every length in ``min_len..max_len`` (inclusive) this samples ``num``
    paths from ``pos_dict[length]`` and then from ``neg_dict[length]`` with a
    fixed seed of 0 and uniform selection. Each string returned by
    ``list_string_set`` for the sample is written as one tab-separated line,
    labelled ``TRUE`` for positive samples and ``FALSE`` for negative ones.
    No de-duplication is done here beyond whatever ``list_string_set`` does.

    Args:
        filename: Path of the output file; opened in ``"w+"`` mode, so any
            existing content is truncated.
        pos_dict: Mapping from length to the acceptor sampled for ``TRUE`` lines.
        neg_dict: Mapping from length to the acceptor sampled for ``FALSE`` lines.
        min_len: Smallest length key to process.
        max_len: Largest length key to process (inclusive).
        num: Value passed to ``pynini.randgen`` as ``npath``.
        get_difference: When equal to ``1``, the sample is subtracted from the
            corresponding dictionary entry, which mutates the caller's dict.

    Returns:
        The ``(pos_dict, neg_dict)`` pair that was passed in (same objects).
    """
    with open(filename, "w+") as out:
        for length in range(min_len, max_len + 1):
            # Positive samples are always produced and written before the
            # negative ones for the same length.
            for fsa_by_len, label in ((pos_dict, "TRUE\n"), (neg_dict, "FALSE\n")):
                sampled = pynini.randgen(
                    fsa_by_len[length],
                    npath=num,
                    seed=0,
                    select="uniform",
                    max_length=2147483647,
                    weighted=False,
                )
                if get_difference == 1:
                    # Remove what was just sampled so later draws from this
                    # entry cannot return the same strings again.
                    fsa_by_len[length] = pynini.difference(fsa_by_len[length], sampled)
                for text in list_string_set(sampled):
                    out.write(text + "\t" + label)
    return pos_dict, neg_dict
def cody_rand_gen_no_duplicate(acceptor, n):
    """Collect ``n`` distinct random strings from ``acceptor``, retrying with new seeds.

    Each attempt samples ``int(n + n * attempt * 0.1)`` paths and merges the
    resulting strings into a running set. While the set holds fewer than ``n``
    strings (and attempts remain) the seed is bumped and sampling repeats.
    Once enough strings exist, or on the final attempt, the set is shuffled
    with the global ``random`` state, cut to ``n`` items and sorted.

    NOTE(review): only the sample from the final attempt is subtracted from
    ``acceptor``; strings gathered in earlier attempts may still be accepted
    by the returned machine. Confirm whether that is intended.

    Args:
        acceptor: The machine to sample from.
        n: Number of distinct strings wanted.

    Returns:
        ``(remaining, strings)`` where ``remaining`` is ``acceptor`` minus the
        last sample and ``strings`` is a sorted list of at most ``n`` strings.
    """
    max_attempts = 50000
    collected = set()
    seed = 0
    for attempt in range(max_attempts):
        print('started loop ' + str(attempt))
        path_count = int(n + n * attempt * 0.1)
        sample = pynini.randgen(
            acceptor,
            npath=path_count,
            seed=seed,
            select='uniform',
            max_length=2147483647,
            weighted=False,
        )
        collected = collected.union(set(list_string_set(sample)))

        if len(collected) < n and attempt < max_attempts - 1:
            print('insufficient random strings')
            seed += 1
            continue

        # Enough strings gathered, or this is the last permitted attempt.
        chosen = list(collected)
        random.shuffle(chosen)
        chosen = sorted(chosen[:n])
        remaining = pynini.difference(acceptor, sample)
        print('returning')
        if len(chosen) >= n:
            print('got full rand_list\n')
        return remaining, chosen
def alternate_rand_gen_no_duplicate(acceptor, n):
    """Accumulate up to ``n`` distinct random strings over at most 10 rounds.

    Every round samples ``int(n + n * i * .01)`` paths from the current
    ``acceptor``, de-duplicates them, and appends the strings not collected
    before. As soon as ``n`` strings have been collected the function returns.
    Otherwise the round's sample is subtracted from ``acceptor``, the seed is
    incremented and the next round starts. Progress is reported with ``print``.

    Membership is tracked in a set alongside the ordered result list, so each
    "already collected?" check is constant-time instead of a list scan. The
    strings returned and their order are the same as with the list scan.

    NOTE(review): on the early return the sample of the current round has not
    been subtracted from ``acceptor``, so the returned machine may still accept
    strings that are in the returned list. Confirm whether callers rely on
    this.

    Args:
        acceptor: The machine to sample from.
        n: Number of distinct strings wanted.

    Returns:
        ``(acceptor, rand_list)``: the (possibly reduced) machine and the
        collected strings in order of discovery. The list holds fewer than
        ``n`` strings if all rounds were used up.
    """
    rand_list = []
    seen = set()  # mirrors rand_list for O(1) membership tests
    loop = 10
    seed = 0
    for i in range(loop):
        print('(alternate) trying to generate random strings (' + str(i) + ')')
        num = int(n + n * i * .01)
        temp = pynini.randgen(
            acceptor,
            npath=num,
            seed=seed,
            select='uniform',
            max_length=2147483647,
            weighted=False,
        )
        print('made new `temp`')
        temp_list = list_string_set(temp)
        print('temp got ' + str(len(temp_list)) + ' random strings')
        temp_list = list(set(temp_list))

        # temp_list holds no duplicates, so filtering against the strings seen
        # in earlier rounds yields exactly the ones that will be appended.
        new_strings = [t for t in temp_list if t not in seen]
        print('got ' + str(len(new_strings)) + ' new strings')
        for t in new_strings:
            rand_list.append(t)
            seen.add(t)
            if len(rand_list) == n:
                print('rand_list now has ' + str(len(rand_list)) + ' strings')
                print('finally got enough strings in rand_list; i=' + str(i))
                return acceptor, rand_list

        # Not enough yet: drop this round's sample from the machine and retry
        # with a different seed.
        acceptor = pynini.difference(acceptor, temp)
        seed += 1
        print('rand_list now has ' + str(len(rand_list)) + ' strings')
        print('need to add strings to rand_list (' + str(i) + ')')
    print('finished loop; returning incomplete set')
    return acceptor, rand_list
def findAmbiguity(self, strictness=100):
    """Search a random sample of paths for evidence that the FST is not functional.

    Following Allauzen and Mohri, an FST ``f`` is functional (one-to-one or
    many-to-one) iff its inverse composed with ``f`` is an identity relation.
    Rather than running their exact decision procedure, this combines an
    inverted copy of ``self.fsm`` with ``self.fsm`` (``*`` is presumably the
    composition operator of the pynini version in use) and checks identity
    only on randomly generated paths. A ``None`` result therefore means no
    counterexample was sampled, not that the FST is proven functional.

    Args:
        strictness: Used both as the number of paths to sample and as the
            maximum length of each sampled path.

    Returns:
        ``(clean(top), clean(bottom))`` for the first sampled path whose input
        and output strings differ, or ``None`` if every sampled path is an
        identity pair.
    """
    round_trip = self.fsm.copy().invert() * self.fsm
    sampled = pynini.randgen(round_trip, npath=strictness, max_length=strictness)
    path_iter = sampled.paths(input_token_type="symbol", output_token_type="symbol")
    for upper, lower, _weight in path_iter:
        if upper == lower:
            continue
        return (clean(upper), clean(lower))
    return None
def by_len(ex, f, count):
    """Write sampled input/output string pairs to up to three output files.

    Random paths are generated from ``ex`` (the literal ``10000`` is passed as
    the second positional argument of ``pynini.randgen``, presumably the path
    count; confirm against the installed pynini). For every path whose input
    and output strings are both non-empty, the input string is written with a
    ``TRUE`` label and the output string with a ``FALSE`` label:

    * always to ``f[0]``;
    * additionally to ``f[1]`` when the running counter is a multiple of 10;
    * additionally to ``f[2]`` when the running counter is a multiple of 100.

    The counter advances once per path, whether or not anything was written.

    Args:
        ex: The machine to sample from.
        f: Indexable collection whose items 0, 1 and 2 provide ``write``.
        count: Starting value of the running counter.

    Returns:
        None. The advanced counter is local and is not handed back.
    """
    sample = pynini.randgen(ex, 10000)
    ps = sample.paths(input_token_type="utf8", output_token_type="utf8")
    while not ps.done():
        if ps.istring() and ps.ostring():
            positive_line = ps.istring() + "\tTRUE\n"
            negative_line = ps.ostring() + "\tFALSE\n"

            # Select the destinations for this pair, in write order.
            tiers = [0]
            if count % 10 == 0:
                tiers.append(1)
            if count % 100 == 0:
                tiers.append(2)

            for tier in tiers:
                f[tier].write(positive_line)
                f[tier].write(negative_line)
        ps.next()
        count += 1
def rand_gen_no_duplicate(acceptor, n):
    """Draw ``n`` distinct random strings from ``acceptor``, oversampling on retry.

    Up to 10 attempts are made. Attempt ``k`` (from 0) samples
    ``int(n + n*k*0.1)`` paths with a fixed seed of 0 and de-duplicates the
    resulting strings. If fewer than ``n`` distinct strings came back and
    attempts remain, a message is printed and the next, larger attempt runs.
    Otherwise the distinct strings are shuffled with the global ``random``
    state, cut to ``n`` items and sorted, and the whole sample is subtracted
    from ``acceptor``.

    Args:
        acceptor: The machine to sample from.
        n: Number of distinct strings wanted.

    Returns:
        ``(remaining, strings)``: ``acceptor`` minus the final sample, and a
        sorted list of at most ``n`` strings (fewer only if the last attempt
        still fell short).
    """
    attempts = 10
    for attempt in range(attempts):
        path_count = int(n + n*attempt*0.1)
        sample = pynini.randgen(
            acceptor,
            npath=path_count,
            seed=0,
            select="uniform",
            max_length=2147483647,
            weighted=False,
        )
        unique = list(set(list_string_set(sample)))

        if len(unique) < n and attempt < attempts - 1:
            print('insufficient random strings')
            continue

        # Enough strings, or the final attempt: finalise and return.
        random.shuffle(unique)
        picked = sorted(unique[:n])
        return pynini.difference(acceptor, sample), picked
def main(unused_argv):
    """Sample output strings from one rule of a FAR and print their frequencies.

    The archive at ``FLAGS.far`` is opened and the FST stored under
    ``FLAGS.rule`` is fetched. If ``FLAGS.push`` is set, weights are pushed
    towards the initial state first. ``FLAGS.npaths`` paths are then sampled
    with ``select="log_prob"``, seeded from the current time, and a ``Counter``
    of the sampled output strings is printed.

    Args:
        unused_argv: Ignored.
    """
    far = py.Far(FLAGS.far)
    rule_fst = far[FLAGS.rule]

    # Weight pushing is optional because neither setting is clean. Pushing
    # weights to the start was tried to avoid spurious selection of "free"
    # cases where the first byte of a UTF-8 character carries no weight, but
    # it seems to produce artifacts of its own, such as endless series of
    # Greek roots starting with "drai". Without it, PAN gets endless roots
    # starting with ñ.
    if FLAGS.push:
        rule_fst = py.push(rule_fst, push_weights=True, to_final=False)

    sampled = py.randgen(
        rule_fst,
        npath=FLAGS.npaths,
        seed=int(time.time()),
        select="log_prob",
        weighted=True,
    )
    print(Counter(sampled.paths().ostrings()))
def create_adversarial_data(filename, pos_dict, neg_dict, min_len, max_len, num):
    """Write positive strings paired with a nearby negative string to ``filename``.

    For each length in ``min_len..max_len`` (inclusive), strings are drawn from
    ``pos_dict[length]`` with ``rand_gen_no_duplicate`` (the reduced acceptor it
    returns is discarded). For each drawn string, the machine produced by
    ``get_one_edit_distance_fsa(A(string))`` (presumably the strings one edit
    away; confirm against that helper) is composed with the negative acceptors
    for the same length and for the adjacent lengths that lie inside the range.
    One string is sampled from the union of those compositions. If one exists,
    the positive string is written with a ``TRUE`` label followed by the
    sampled string with a ``FALSE`` label; otherwise a message is printed and
    the positive string is skipped.

    Args:
        filename: Path of the output file; opened in ``"w+"`` mode.
        pos_dict: Mapping from length to the positive acceptor.
        neg_dict: Mapping from length to the negative acceptor.
        min_len: Smallest length to process.
        max_len: Largest length to process (inclusive).
        num: Number of positive strings requested per length.
    """
    with open(filename, "w+") as out:
        for length in range(min_len, max_len + 1):
            # Adjacent lengths are consulted only when they are in range,
            # shorter neighbour first.
            neighbour_lengths = [
                other for other in (length - 1, length + 1)
                if min_len <= other <= max_len
            ]
            _, positives = rand_gen_no_duplicate(pos_dict[length], num)
            for positive in positives:
                near_misses = get_one_edit_distance_fsa(A(positive))
                candidates = pynini.compose(near_misses, neg_dict[length])
                for other in neighbour_lengths:
                    candidates = candidates | pynini.compose(near_misses, neg_dict[other])

                picked = list_string_set(
                    pynini.randgen(
                        candidates,
                        npath=1,
                        seed=0,
                        select="uniform",
                        max_length=2147483647,
                        weighted=False,
                    )
                )
                if not picked:
                    print('insufficient adversarial data')
                    continue
                out.write(positive + "\t" + "TRUE\n")
                out.write(picked[0] + "\t" + "FALSE\n")
def _assert_fst_sampled_behavior(
        self,
        fsts: List[pynini.Fst],
        token_type: pynini.TokenType,
        samples: int,
        assert_function: Callable[[pynini.Fst, pynini.Fst], None]) -> None:
    """Asserts that the FSTs, composed on sampled inputs, follow a given behavior.

    Inputs are sampled from the input projection of the first FST and each
    sample is composed with the full list of FSTs; the result is handed to
    `assert_function`. Sampling is used in lieu of statically verifying the
    property on the composition, which is not easy to answer for
    non-deterministic FSTs.

    If `token_type` is "byte", the input projection is first intersected with
    the closure over valid UTF-8 characters so that every sample is a
    well-formed UTF-8 string that Python can handle. Sample length is capped
    at `_MAX_SAMPLE_LENGTH` labels.

    Args:
      fsts: List of FSTs applied, in order, to each sample. The first one
        also supplies the language that samples are drawn from.
      token_type: The token type used to derive the FSTs. It is also the
        default token type in effect while `assert_function` runs.
      samples: The number of input samples to draw.
      assert_function: Callable taking the input string FSA and the composed
        output FST. It is expected to raise AssertionError on failure.
    """
    sample_space = pynini.project(fsts[0], "input")
    if token_type == "byte":
        # NOTE: Sampling straight from the byte machine can yield sequences
        # that are not well-formed UTF-8 and so cannot become a Python str;
        # restrict the language to valid UTF-8 first.
        sample_space = pynini.intersect(sample_space, utf8.VALID_UTF8_CHAR.star)

    sampled_paths = pynini.randgen(
        sample_space, npath=samples, max_length=_MAX_SAMPLE_LENGTH)

    with pynini.default_token_type(token_type):
        for labels in _olabels_iter(sampled_paths):
            sample_fsa = _label_list_to_string_fsa(labels)
            composed = rewrite.ComposeFsts([sample_fsa] + fsts)
            assert_function(sample_fsa, composed)
import pynini as pn
import random

# Demo script: builds a small FST, samples it repeatedly with random seeds and
# prints the distinct strings produced.
#
# Operator shorthands noted by the original author:
# compose - *
# concat - +
# union - |
#
# `+` binds tighter than `|` in Python, so the expression below is the union of
# two concatenations:
#   ((a | e) + t(a, 0{0,5}))  |  (t(a*, 0) + xxx)
# NOTE(review): `pn.a` / `pn.t` are presumably this pynini version's shorthands
# for building an acceptor / a transducer — confirm against the installed API.
fst = (pn.a("a") | pn.a("e")) + pn.t("a", pn.a("0").closure(0, 5)) | pn.t(
    pn.a("a").star, "0") + pn.a("xxx")
fst = fst.optimize()

# Draw 10000 samples, each with a fresh random third argument, and keep the
# distinct results.
# NOTE(review): the positional arguments `1` and `random.randint(...)` are
# presumably `npath` and `seed` — confirm against the randgen signature in use.
output_strings = set()
for i in range(10000):
    s = pn.randgen(fst, 1, random.randint(0, 100000)).stringify()
    output_strings.add(s)

# Report how many distinct strings were seen, then list them (set order).
print(len(output_strings))
for output_string in output_strings:
    print(output_string)


def top_paths(fst, count=100):
    """Return the sorted, de-duplicated second field of the `count` shortest paths.

    Each item yielded by `.paths()` is indexed with `[1]`.
    NOTE(review): that field is presumably the path's output string — confirm
    against the pynini version in use.
    """
    return sorted(
        set(p[1] for p in pn.shortestpath(fst, nshortest=count).paths()))


# Section header for the report; `top_paths` is not called in the visible part
# of the script.
print("INPUTS")
print("\t")