Ejemplo n.º 1
0
def create_data_with_duplicate(filename, pos_dict, neg_dict, min_len, max_len, num, get_difference):
    """Write sampled positive and negative strings, length by length, to a file.

    For every length ``i`` from ``min_len`` through ``max_len`` (inclusive) a
    sample of ``num`` paths is requested from ``pos_dict[i]`` and then from
    ``neg_dict[i]`` (uniform selection, fixed seed 0). Each sampled string is
    written as ``<string>\\tTRUE`` or ``<string>\\tFALSE`` respectively.

    Args:
        filename: Path of the output file; it is opened in ``"w+"`` mode.
        pos_dict: Mapping from length to the FSA of positive strings.
        neg_dict: Mapping from length to the FSA of negative strings.
        min_len: Smallest length to sample.
        max_len: Largest length to sample (inclusive).
        num: Number of paths requested per length and per class.
        get_difference: When equal to ``1``, each sampled FSA is subtracted
            from its dictionary entry, which mutates the caller's dictionaries.

    Returns:
        The ``(pos_dict, neg_dict)`` pair, possibly updated in place.
    """
    # Positive data is always handled before negative data for a given length.
    labelled_sources = ((pos_dict, "TRUE\n"), (neg_dict, "FALSE\n"))
    with open(filename, "w+") as out:
        for length in range(min_len, max_len + 1):
            for source, label in labelled_sources:
                sampled = pynini.randgen(
                    source[length],
                    npath=num,
                    seed=0,
                    select="uniform",
                    max_length=2147483647,
                    weighted=False,
                )
                if get_difference == 1:
                    source[length] = pynini.difference(source[length], sampled)
                # `list_string_set` is defined elsewhere; presumably it lists
                # the strings accepted by the sampled FSA.
                for text in list_string_set(sampled):
                    out.write(text + "\t" + label)
    return pos_dict, neg_dict
Ejemplo n.º 2
0
def cody_rand_gen_no_duplicate(acceptor, n):
    """Draw up to ``n`` distinct random strings and remove the samples from ``acceptor``.

    The function samples repeatedly (at most 50000 rounds), each round with a
    new seed and a request size that grows by 10% of ``n``, and pools the
    distinct strings until at least ``n`` have been collected or the round
    budget is exhausted.

    Every round's sample is subtracted from the returned acceptor. Subtracting
    only the final round's sample would leave strings collected in earlier
    rounds (drawn with different seeds) in the acceptor, so a later call could
    return them again.

    Args:
        acceptor: FSA to sample from.
        n: Number of distinct strings wanted.

    Returns:
        A pair ``(acceptor, rand_list)``: the input acceptor minus everything
        that was sampled (including sampled strings that were not selected),
        and a sorted list of at most ``n`` distinct strings chosen at random
        from the pool. The list is shorter than ``n`` only when the round
        budget ran out.
    """
    loop = 50000
    result_set = set()
    sampled = None  # Union of the sample FSAs from every round so far.
    seed = 0
    for i in range(loop):
        print('started loop ' + str(i))
        num = int(n + n * i * 0.1)
        temp = pynini.randgen(acceptor,
                              npath=num,
                              seed=seed,
                              select='uniform',
                              max_length=2147483647,
                              weighted=False)
        sampled = temp if sampled is None else sampled | temp
        result_set.update(list_string_set(temp))
        uniq_len = len(result_set)
        if uniq_len < n and i < loop - 1:
            print('insufficient random strings')
            seed += 1
            continue
        # Either enough strings were pooled or this is the last round.
        rand_list = list(result_set)
        random.shuffle(rand_list)
        rand_list = rand_list[:n]
        rand_list.sort()
        # Remove the samples of *all* rounds, not just the last one.
        acceptor = pynini.difference(acceptor, sampled)
        print('returning')
        if len(rand_list) >= n:
            print('got full rand_list\n')
        return acceptor, rand_list
Ejemplo n.º 3
0
def alternate_rand_gen_no_duplicate(acceptor, n):
    """Collect ``n`` distinct random strings, shrinking ``acceptor`` every round.

    Up to 10 rounds are run. Each round samples from the current acceptor with
    a new seed and a request size that grows by 1% of ``n``, subtracts the
    sample from the acceptor, and appends the strings not seen before to the
    result list. The function returns as soon as the list holds ``n`` strings.

    The sample is subtracted *before* the strings are appended so that the
    acceptor returned on the early-exit path no longer contains the strings
    just handed out. Subtracting only on incomplete rounds would return the
    acceptor unchanged whenever the first round already yields enough strings.

    Args:
        acceptor: FSA to sample from.
        n: Number of distinct strings wanted.

    Returns:
        A pair ``(acceptor, rand_list)``: the acceptor minus every sample
        drawn, and the collected strings in the order they were first seen.
        The list may hold fewer than ``n`` strings if all rounds are used up.
    """
    rand_list = []
    seen = set()  # Mirrors rand_list for O(1) membership tests.
    loop = 10
    seed = 0
    for i in range(loop):
        print('(alternate) trying to generate random strings (' + str(i) + ')')
        num = int(n + n * i * .01)
        temp = pynini.randgen(acceptor,
                              npath=num,
                              seed=seed,
                              select='uniform',
                              max_length=2147483647,
                              weighted=False)
        print('made new `temp`')
        temp_list = list_string_set(temp)
        print('temp got ' + str(len(temp_list)) + ' random strings')
        new_strings = [t for t in set(temp_list) if t not in seen]
        print('got ' + str(len(new_strings)) + ' new strings')
        # Remove this round's sample on every path, including the early return.
        acceptor = pynini.difference(acceptor, temp)
        for t in new_strings:
            seen.add(t)
            rand_list.append(t)
            if len(rand_list) == n:
                print('rand_list now has ' + str(len(rand_list)) +
                      ' strings')
                print('finally got enough strings in rand_list; i=' +
                      str(i))
                return acceptor, rand_list
        seed += 1
        print('rand_list now has ' + str(len(rand_list)) + ' strings')
        print('need to add strings to rand_list (' + str(i) + ')')
    print('finished loop; returning incomplete set')
    return acceptor, rand_list
Ejemplo n.º 4
0
 def findAmbiguity(self, strictness=100):
     """Look for a sampled path showing that ``self.fsm`` is not functional.

     Allauzen and Mohri: an FST f is functional (one-to-one or many-to-one)
     iff f' .o. f is the identity function over f's domain. Instead of
     implementing their algorithm for deciding strict identity, this method
     checks identity on a random sample of paths, so a ``None`` result is
     not a proof of functionality.

     Args:
         strictness: Used both as the number of paths to sample and as the
             maximum path length.

     Returns:
         ``(clean(top), clean(bottom))`` for the first sampled path whose two
         sides differ, or ``None`` when every sampled path is an identity.
     """
     # Inverse of a copy combined with the original via `*`.
     round_trip = self.fsm.copy().invert() * self.fsm
     sample = pynini.randgen(round_trip,
                             npath=strictness,
                             max_length=strictness)
     sampled_paths = sample.paths(input_token_type="symbol",
                                  output_token_type="symbol")
     for upper, lower, _weight in sampled_paths:
         if upper == lower:
             continue
         return (clean(upper), clean(lower))
     return None
Ejemplo n.º 5
0
def by_len(ex, f, count):
    """Write sampled input/output string pairs to up to three files.

    10000 paths are requested from ``ex``. For every path whose input and
    output strings are both non-empty, the input string is written with a
    ``TRUE`` label and the output string with a ``FALSE`` label to ``f[0]``;
    the pair also goes to ``f[1]`` when ``count`` is a multiple of 10, and
    additionally to ``f[2]`` when it is a multiple of 100.

    Args:
        ex: FST to sample paths from.
        f: Indexable collection of at least three writable file objects.
        count: Starting value of the running path counter; it advances by one
            for every path visited, including skipped ones.

    NOTE(review): the advanced counter is never returned, so the caller's
    value is unchanged after the call; confirm that this is intended.
    """
    sampled = pynini.randgen(ex, 10000)
    paths = sampled.paths(input_token_type="utf8", output_token_type="utf8")

    while not paths.done():
        accepted = paths.istring()
        rejected = paths.ostring() if accepted else None
        if accepted and rejected:
            true_line = accepted + "\tTRUE\n"
            false_line = rejected + "\tFALSE\n"

            def _emit(handle):
                # One positive line followed by its negative counterpart.
                handle.write(true_line)
                handle.write(false_line)

            _emit(f[0])
            if count % 10 == 0:
                _emit(f[1])
                if count % 100 == 0:
                    _emit(f[2])
        paths.next()
        count += 1
Ejemplo n.º 6
0
def rand_gen_no_duplicate(acceptor, n):
    """Sample up to ``n`` distinct strings from ``acceptor``.

    At most 10 attempts are made. Each attempt re-samples with the same seed
    (0) but asks for 10% of ``n`` more paths than the previous one. The first
    attempt that yields at least ``n`` distinct strings is used; the last
    attempt is used regardless of how many distinct strings it yields.

    Args:
        acceptor: FSA to sample from.
        n: Number of distinct strings wanted.

    Returns:
        A pair ``(acceptor, rand_list)``: the acceptor minus the sample of the
        attempt that was used, and a sorted list of at most ``n`` strings
        picked at random from that sample.
    """
    max_attempts = 10
    for attempt in range(max_attempts):
        request_size = int(n + n * attempt * 0.1)
        sampled = pynini.randgen(
            acceptor,
            npath=request_size,
            seed=0,
            select="uniform",
            max_length=2147483647,
            weighted=False,
        )
        distinct = list(set(list_string_set(sampled)))
        out_of_attempts = attempt >= max_attempts - 1
        if len(distinct) >= n or out_of_attempts:
            # Shuffle before truncating so the kept subset is random, then
            # sort it so the returned order does not depend on the shuffle.
            random.shuffle(distinct)
            chosen = distinct[:n]
            chosen.sort()
            return pynini.difference(acceptor, sampled), chosen
        print('insufficient random strings')
def main(unused_argv):
    """Sample paths from one rule of a FAR and print output-string counts.

    The FAR path, rule name, number of paths and the optional weight-pushing
    switch are all read from ``FLAGS``. Sampling is seeded from the current
    time, so successive runs differ.

    Args:
        unused_argv: Ignored.
    """
    archive = py.Far(FLAGS.far)
    rule_fst = archive[FLAGS.rule]
    # Pushing weights toward the initial state was tried so that sampling does
    # not spuriously favour "free" cases where the first byte of a UTF-8
    # character carries no weight:
    #
    #   fst = py.push(fst, push_weights=True, to_final=False)
    #
    # However, this seems to produce artifacts of its own, such as endless
    # series of Greek roots starting with "drai". On the other hand, without
    # it PAN gets endless roots starting with ñ. Hence the flag.
    if FLAGS.push:
        rule_fst = py.push(rule_fst, push_weights=True, to_final=False)
    sampled = py.randgen(
        rule_fst,
        npath=FLAGS.npaths,
        seed=int(time.time()),
        select="log_prob",
        weighted=True,
    )
    output_counts = Counter(sampled.paths().ostrings())
    print(output_counts)
Ejemplo n.º 8
0
def create_adversarial_data(filename, pos_dict, neg_dict, min_len, max_len, num):
    """Write positive strings paired with a nearby negative string.

    For every length ``i`` from ``min_len`` through ``max_len`` (inclusive),
    ``num`` positive strings are requested via ``rand_gen_no_duplicate``. For
    each one, ``get_one_edit_distance_fsa`` (defined elsewhere; presumably the
    FSA of strings one edit away) is composed with the negative FSAs of length
    ``i`` and, where they lie inside the length range, ``i - 1`` and
    ``i + 1``. One string is sampled from the union. When a string is found,
    the positive string is written with ``TRUE`` and the sampled one with
    ``FALSE``; otherwise the positive string is skipped.

    Args:
        filename: Path of the output file; it is opened in ``"w+"`` mode.
        pos_dict: Mapping from length to the FSA of positive strings.
        neg_dict: Mapping from length to the FSA of negative strings.
        min_len: Smallest length to process.
        max_len: Largest length to process (inclusive).
        num: Number of positive strings requested per length.
    """
    with open(filename, "w+") as out:
        for length in range(min_len, max_len + 1):
            _, positives = rand_gen_no_duplicate(pos_dict[length], num)
            for positive in positives:
                neighbours = get_one_edit_distance_fsa(A(positive))

                # Same length first, then the shorter and longer neighbours
                # when those lengths exist in the requested range.
                candidate_lengths = [length]
                if length - 1 >= min_len:
                    candidate_lengths.append(length - 1)
                if length + 1 <= max_len:
                    candidate_lengths.append(length + 1)

                candidates = None
                for neg_length in candidate_lengths:
                    piece = pynini.compose(neighbours, neg_dict[neg_length])
                    candidates = piece if candidates is None else candidates | piece

                picked = list_string_set(
                    pynini.randgen(
                        candidates,
                        npath=1,
                        seed=0,
                        select="uniform",
                        max_length=2147483647,
                        weighted=False,
                    )
                )
                if picked:
                    out.write(positive + "\t" + "TRUE\n")
                    out.write(picked[0] + "\t" + "FALSE\n")
                else:
                    print('insufficient adversarial data')
Ejemplo n.º 9
0
    def _assert_fst_sampled_behavior(
            self,
            fsts: List[pynini.Fst],
            token_type: pynini.TokenType,
            samples: int,
            assert_function: Callable[[pynini.Fst, pynini.Fst], None],
    ) -> None:
        """Asserts that the FSTs, composed on sampled inputs, follow a behavior.

        Inputs are sampled from the input projection of the first FST and each
        sample is composed with the whole FST list. This stands in for a
        static proof of the property, which is not easy to obtain for
        non-deterministic FSTs. When `token_type` is "byte", the input
        projection is first intersected with the closure over valid UTF-8
        characters so that every sample is a well-formed UTF-8 string that
        Python can handle. Sample length is capped at `_MAX_SAMPLE_LENGTH`.

        Args:
          fsts: List of FSTs applied to each sample; the composition result is
              what `assert_function` inspects.
          token_type: The token type used to derive the FSTs.
          samples: The number of input samples to draw.
          assert_function: Called with the input string FSA and the composed
              output FST, inside a `pynini.default_token_type(token_type)`
              context. It raises AssertionError on failure.
        """
        sample_source = pynini.project(fsts[0], "input")
        if token_type == "byte":
            # Sampling straight from the byte machine can yield sequences that
            # are not well-formed UTF-8 and so cannot become a Python str.
            sample_source = pynini.intersect(sample_source,
                                             utf8.VALID_UTF8_CHAR.star)
        sampled_inputs = pynini.randgen(sample_source,
                                        npath=samples,
                                        max_length=_MAX_SAMPLE_LENGTH)
        with pynini.default_token_type(token_type):
            for label_seq in _olabels_iter(sampled_inputs):
                sample_fsa = _label_list_to_string_fsa(label_seq)
                pipeline = [sample_fsa] + fsts
                composed = rewrite.ComposeFsts(pipeline)
                assert_function(sample_fsa, composed)
Ejemplo n.º 10
0
import pynini as pn
import random

# Operator shorthand used in this script:
#   *  compose
#   +  concat
#   |  union

# `+` binds tighter than `|`, so the grammar is the union of two
# concatenations; each branch is named to make that grouping explicit.
first_branch = (pn.a("a") | pn.a("e")) + pn.t("a", pn.a("0").closure(0, 5))
second_branch = pn.t(pn.a("a").star, "0") + pn.a("xxx")
fst = (first_branch | second_branch).optimize()

# Draw 10000 single-path samples, each with a fresh random seed, and keep the
# distinct strings that `stringify()` produces for them.
output_strings = {
    pn.randgen(fst, 1, random.randint(0, 100000)).stringify()
    for _ in range(10000)
}

print(len(output_strings))

for output_string in output_strings:
    print(output_string)


def top_paths(fst, count=100):
    """Return the sorted, de-duplicated second fields of the shortest paths.

    Args:
        fst: FST to search.
        count: Number of shortest paths requested from ``pn.shortestpath``.

    Returns:
        A sorted list of the distinct values found at index 1 of each path
        item (presumably the output string; confirm against the pynini
        version in use).
    """
    best = pn.shortestpath(fst, nshortest=count)
    unique_outputs = {path[1] for path in best.paths()}
    return sorted(unique_outputs)


# Prints the literal heading "INPUTS" followed by a line holding a single tab.
# NOTE(review): the script appears to continue past this excerpt.
print("INPUTS")
print("\t")