def correct_to_incorrect(x_fst, side):
    """Build a transformer mapping correct examples into incorrect ones.

    Used when creating negative examples for <= rules: some correct
    input:output pairs must be changed so that the output part becomes
    different.  The computed encoded FST maps correct inputs to any
    possible outputs (correct or incorrect).

    x_fst -- the FST for the X part of the rule
    side -- either "input" or "output"

    Returns an FST (encoded as a FSA) which maps correct examples into
    incorrect examples.
    """
    global pistar_fst, pistar_fsa
    # Scramble whichever side of X the caller asked for.
    if side == "input":
        variations_fsa = mix_input(x_fst)
    else:
        variations_fsa = mix_output(x_fst)
    center_fst = hfst.fst_to_fsa(x_fst, separator="^")
    # After the cross product, center_fst maps correct X to all variations.
    center_fst.cross_product(variations_fsa)
    # twbt.ppfst(center_fst, True) ##
    # Allow arbitrary correct pair strings on both sides of the center.
    result_fst = pistar_fsa.copy()
    result_fst.concatenate(center_fst)
    result_fst.concatenate(pistar_fsa)
    result_fst.minimize()  # now complete
    result_fst.set_name("Correct to incorrect")
    return result_fst
def incorrect_to_correct(x_fst):
    """Compute a transformation for right-arrow (=>) rules.

    In order to make negative examples for the => rules we need to
    modify the examples so that some correct occurrences of X are
    modified so that the output part of X becomes something else,
    i.e. incorrect because it is in an unexpected context.

    x_fst -- FST for the center part (X) of a rule

    Returns: scrambler_fst -- an encoded FST which maps encoded
    instances of X into all possible correct and incorrect pairs
    (where the input symbol is the same but the output symbol perhaps
    different).
    """
    global pistar_fst, pistar_fsa
    encoded_x_fsa = hfst.fst_to_fsa(x_fst, separator="^")
    variations = mix_output(x_fst)  # still an encoded fsa
    variations.cross_product(encoded_x_fsa)  # now an fst
    # Embed the scrambled center between arbitrary correct pair strings.
    scrambler_fst = pistar_fsa.copy()
    scrambler_fst.concatenate(variations)
    scrambler_fst.concatenate(pistar_fsa)
    scrambler_fst.minimize()  # now complete
    scrambler_fst.set_name("Scrambler " + x_fst.get_name())
    return scrambler_fst
def mix_output(x_fst):
    """Compute an FSA that is used when creating negative examples.

    First, it computes an expression Y which represents all possible
    (correct and incorrect) realizations of the input side of X.
    Then, Y is transformed into an encoded FSA which can be a
    component of the transformation of correct examples into
    incorrect ones.

    x_fst -- the center FST (X part) of a rule

    Returns [X.u .o. PI*] encoded as an FSA (i.e. maps pairs to
    themselves).
    """
    global pistar_fst
    upper_side_fst = x_fst.copy()
    upper_side_fst.input_project()
    # Pair the input side of X with every possible pair-string realization.
    upper_side_fst.compose(pistar_fst)
    upper_side_fst.minimize()
    encoded_fsa = hfst.fst_to_fsa(upper_side_fst, separator="^")
    # twbt.ppfst(encoded_fsa, True) ##
    return encoded_fsa
def init():
    """Initialize the module by computing several common FSTs.

    Assumes that twexamp.read_fst() has read in cfg.examples_fst and
    initialized some symbol sets.
    """
    global pistar_fst, pistar_fsa, diamond_sym, diamond_fst
    global trim_pre_fst, trim_post_fst
    assert cfg.examples_fst, "cfg.examples_fst not loaded (by twexamp module)"

    cfg.definitions["PAIRS"] = cfg.all_pairs_fst.copy()
    cfg.definitions["PI"] = cfg.all_pairs_fst.copy()

    diamond_sym = 'DIAMOND'
    diamond_fst = hfst.regex(diamond_sym)

    # PI = one arbitrary feasible pair; PI* = any string of feasible pairs.
    one_pair_fst = cfg.all_pairs_fst.copy()
    pistar_fst = cfg.all_pairs_fst.copy()
    pistar_fst.repeat_star()
    pistar_fst.remove_epsilons()
    pistar_fst.minimize()
    pistar_fsa = hfst.fst_to_fsa(pistar_fst, separator='^')

    # Input/output projections of PI and PI*.
    pair_input_fst = one_pair_fst.copy()
    pair_input_fst.input_project()
    pair_output_fst = one_pair_fst.copy()
    pair_output_fst.output_project()
    pistar_input_fst = pistar_fst.copy()
    pistar_input_fst.input_project()
    pistar_output_fst = pistar_fst.copy()
    pistar_output_fst.output_project()

    if cfg.verbosity >= 20:
        twbt.ppfst(pistar_fst, title="pistar_fst")

    # trim_pre_fst corresponds to the regex:
    #   [[ZERO .x. [PI].u]* ZERO:BEGIN]* [[PI].u]* [ZERO:END [ZERO .x. [PI].u]*]*
    zeros_before = fs.star(fs.crossprod(fs.expr("ZERO"), pair_input_fst))
    begin_marks = fs.star(fs.concat(zeros_before, fs.expr("ZERO:BEGIN")))
    body_part = fs.concat(begin_marks, pistar_input_fst)
    end_marks = fs.star(
        fs.concat(fs.expr("ZERO:END"),
                  fs.star(fs.crossprod(fs.expr("ZERO"), pair_input_fst))))
    trim_pre_fst = fs.concat(body_part, end_marks)
    trim_pre_fst.set_name("trim_pre_fst")

    # trim_post_fst corresponds to the regex:
    #   [[[PI].l .x. ZERO]* BEGIN:ZERO]* [[PI].l]* [END:ZERO [[PI].l .x. ZERO]*]*
    zeros_before = fs.star(fs.crossprod(pair_output_fst, fs.expr("ZERO")))
    begin_marks = fs.star(fs.concat(zeros_before, fs.expr("BEGIN:ZERO")))
    body_part = fs.concat(begin_marks, pistar_output_fst)
    end_marks = fs.star(
        fs.concat(fs.expr("END:ZERO"),
                  fs.star(fs.crossprod(pair_output_fst, fs.expr("ZERO")))))
    trim_post_fst = fs.concat(body_part, end_marks)
    trim_post_fst.set_name("trim_post_fst")

    if cfg.verbosity >= 20:
        twbt.ppfst(trim_pre_fst)
        twbt.ppfst(trim_post_fst)
    return
# NOTE(review): this fragment starts mid-scope (`args` is defined out of
# view) and is truncated at the end (the `if line == "START":` branch has
# no body here) — it appears to be part of a larger rule-compiling routine.
cfg.verbosity = args.verbosity
if args.recursion:
    # Deeply nested rule expressions may exceed the default limit.
    sys.setrecursionlimit(args.recursion)
# Examples come either as a precompiled FST or as a text example file.
if args.examples.endswith(".fst"):
    twexamp.read_fst(args.examples)
else:
    twexamp.read_examples(args.examples)
if cfg.verbosity >= 30:
    twbt.ppfst(cfg.examples_fst, title="examples_fst")
parser = twparser.init()
# Encoded FSA version of the examples (pairs encoded as single symbols).
examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")
# Input-side projection of the examples.
examples_up_fsa = cfg.examples_fst.copy()
examples_up_fsa.input_project()
if cfg.verbosity >= 30:
    twbt.ppfst(examples_up_fsa, title="examples_up_fsa")
twrule.init()
skip = False
all_rules_fst_lst = []
# NOTE(review): file is opened without `with`; it is presumably closed (or
# leaked) further down, outside this view — confirm.
rule_file = open(args.rules, 'r')
line_lst = []
for line_nl in rule_file:
    # Strip trailing comment (everything after '!') and whitespace.
    line = line_nl.split('!', maxsplit=1)[0].strip()
    if line == "START":
# NOTE(review): this fragment starts mid-loop (`state`, `index`, `f` and
# `fsm` come from an enclosing scope out of view — presumably an
# enumeration over the states of `fsm`) and is truncated at the end.
# It looks like demo/test code exercising the hfst Python API.
for transition in state:
    # One tab-separated AT&T-style line per transition:
    # source, target, input symbol, output symbol, weight.
    print('%u\t%u\t%s\t%s\t%.2f' % (index,
                                    transition.get_target_state(),
                                    transition.get_input_symbol(),
                                    transition.get_output_symbol(),
                                    transition.get_weight()),
          file=f)
if fsm.is_final_state(index):
    # Final states get their own line: state number and final weight.
    print('%s\t%.2f' % (index, fsm.get_final_weight(index)), file=f)
index = index + 1
print(fsm, file=f)
f.close()

# Symbol substitution on a basic transducer: plain symbols and pairs.
tr = hfst.HfstBasicTransducer(hfst.regex('foo'))
tr.substitute({'foo': 'bar'})
tr.substitute({('foo', 'foo'): ('bar', 'bar')})

# Round-trip check: fst -> encoded fsa -> fst must preserve the transducer.
tr = hfst.fst({'foo': 'bar'})
fst = hfst.HfstBasicTransducer(tr)
fsa = hfst.fst_to_fsa(fst, '^')
fst = hfst.fsa_to_fst(fsa, '^')
TR = hfst.HfstTransducer(fst)
assert(TR.compare(tr))

# In-place modification of arcs via states_and_transitions().
tr = hfst.regex('{foo}:{bar}|{FOO}:{BAR}')
fsm = hfst.HfstBasicTransducer(tr)
net = fsm.states_and_transitions()
for state in net:
    for arc in state:
        arc.set_input_symbol(arc.get_input_symbol() + '>')
        arc.set_output_symbol('<' + arc.get_output_symbol())
        arc.set_weight(arc.get_weight() - 0.5)
# Truncated here: the body of this loop is outside this view.
for state, arcs in enumerate(fsm):
    for arc in arcs:
# NOTE(review): `arpar` (an argparse.ArgumentParser, judging by the calls)
# is created out of view before this fragment.
arpar.add_argument("-d", "--debug",
                   help="level of PLY debugging output",
                   type=int, default=0)
arpar.add_argument("-p", "--parser",
                   help="which parser to use: ply or tatsu",
                   default="ply")
args = arpar.parse_args()

print('Reading examples from:', args.examples)
twex.read_fst(args.examples)
# Encoded FSA version of the examples (pairs encoded as single symbols).
examples_fsa = twex.EXAMPLES.copy()
examples_fsa = hfst.fst_to_fsa(examples_fsa, separator="^")
# Input-side projection of the examples.
examples_up_fsa = twex.EXAMPLES.copy()
examples_up_fsa.input_project()
twrl.init(args.verbosity)

# Select and initialize the requested rule parser backend.
if args.parser == "ply":
    import plytw
    plytw.init(args.verbosity)
elif args.parser == "tatsu":
    import twolcsyntax
    twolcsyntax.init()
else:
    print("--parser must be either 'tatsu' or 'ply', not", args.parser)
def main():
    """Compile two-level rules and test them against the example corpus.

    Reads examples (FST or text), parses rule files, compiles each rule
    into an FST, optionally tests each rule against positive and
    negative examples, and optionally writes lost/wrong examples and the
    compiled rules to FST files.
    """
    version = cfg.timestamp(__file__)
    import argparse
    arpar = argparse.ArgumentParser(
        description="A compiler and tester for two-level rules."
        " Version {}."
        " See https://pytwolc.readthedocs.io/en/latest/index.html"
        " or https://github.com/koskenni/twol"
        " for more information.".format(version))
    arpar.add_argument(
        "-e", "--examples", action='store', nargs='+',
        help="""Either one name of a FST file that contains the examples or
        a list of names of files which contain the PSTR form examples
        used for compiling the rules.""",
        default=[None])
    arpar.add_argument(
        "-r", "--rules", action='store', nargs='+',
        help="""One or more files which contain the rules, either just
        one rule file or a file of defines as the first one and a part
        of the whole rule set as the second""",
        default=[None])
    arpar.add_argument(
        "-o", "--output",
        help="File to which write the compiled rules if a name is given",
        default="")
    arpar.add_argument(
        "-l", "--lost",
        help="File to which write the examples"
        " that were not accepted by all rules"
        " -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-w", "--wrong",
        help="file to which write the wrong strings"
        " that are accepted by all rules -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-t", "--thorough",
        help="test each rule separately: 0 if no testing is desired,"
        " 1 if against positive examples,"
        " 2 against both positive and negative examples."
        " Default is 2.",
        type=int, choices=[0, 1, 2], default=2)
    arpar.add_argument(
        "--recursion",
        help="set the limit for recursion depth",
        type=int)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of diagnostic output",
        type=int, default=0)
    args = arpar.parse_args()

    cfg.verbosity = args.verbosity
    if args.recursion:
        # Deeply nested rule expressions may exceed the default limit.
        sys.setrecursionlimit(args.recursion)

    # Examples come either as one precompiled FST or as text example files.
    if len(args.examples) == 1 and args.examples[0].endswith(".fst"):
        twexamp.read_fst(args.examples[0])
    else:
        twexamp.read_examples(args.examples)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.examples_fst, title="examples_fst")

    # NOTE(review): looks like this was meant to be twparser.init() (as in
    # the parallel code elsewhere in this file) — confirm that a function
    # named twparser_init is actually in scope.
    parser = twparser_init()

    # Encoded FSA version of the examples (pairs encoded as single symbols)
    # and the input-side projection of the examples.
    examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")
    examples_up_fsa = cfg.examples_fst.copy()
    examples_up_fsa.input_project()
    if cfg.verbosity >= 30:
        twbt.ppfst(examples_up_fsa, title="examples_up_fsa")
    twrule.init()

    i = 0                      # current line number over all rule files
    skip = False               # True while inside a STOP..START region
    all_rules_fst_lst = []     # compiled rule FSTs (for -o/-l/-w)
    line_lst = []              # lines of the rule currently being collected
    for line_nl in fileinput.input(args.rules):
        i += 1
        # Collect the raw lines of the current rule for error reporting.
        if not line_lst:
            line_nl_lst = []
        line_nl_lst.append(line_nl)
        # Strip trailing comment (everything after '!') and whitespace.
        line = line_nl.split('!', maxsplit=1)[0].strip()
        # START/STOP let the rule writer disable regions of the file.
        if line == "START":
            skip = False
            continue
        elif line == "STOP":
            skip = True
        if skip or (not line) or line.startswith("!"):
            continue
        line_lst.append(line)
        if not line.endswith(";"):
            # Rule continues on the next line.
            continue
        else:
            rule_str = " ".join(line_lst)
            line_lst = []
            op, left, right = parse_rule(parser, rule_str, i, line_nl_lst)
            if op == "?" or not (left and right):
                # Parse failure or incomplete rule: already reported.
                continue
            if (args.thorough > 0 and op != "=") or cfg.verbosity > 0:
                print("\n")
                print(rule_str)
            if op == "=":
                # A define, not a rule: nothing to compile or test.
                # if cfg.verbosity > 0:
                #     print(line)
                if cfg.verbosity >= 10:
                    print(left, op)
                    twbt.ppfst(right)
                continue
            elif op == "=>":
                R, selector_fst, MIXe = twrule.rightarrow(line, left, *right)
            elif op == "<=":
                R, selector_fst, MIXe = twrule.output_coercion(line, left,
                                                               *right)
            elif op == "<--":
                R, selector_fst, MIXe = twrule.input_coercion(line, left,
                                                              *right)
            elif op == "<=>":
                R, selector_fst, MIXe = twrule.doublearrow(line, left, *right)
            elif op == "/<=":
                R, selector_fst, MIXe = twrule.center_exclusion(line, left,
                                                                *right)
            else:
                print("Error: not a valid type of a rule", op)
                continue
            if cfg.verbosity >= 10:
                twbt.ppfst(R)
            if args.lost or args.wrong or args.output:
                all_rules_fst_lst.append(R)
            if args.thorough > 0:
                # Restrict the selector to the actual examples.
                selector_fst.intersect(cfg.examples_fst)
                # selector_fst.n_best(5)
                selector_fst.minimize()
                if cfg.verbosity >= 20:
                    paths = selector_fst.extract_paths(output='raw')
                    print_raw_paths(paths[0:20])
            # Positive examples that this rule accepts.
            passed_pos_examples_fst = selector_fst.copy()
            passed_pos_examples_fst.intersect(R)
            if args.thorough > 0:
                if passed_pos_examples_fst.compare(selector_fst):
                    print("All positive examples accepted")
                else:
                    lost_examples_fst = selector_fst.copy()
                    lost_examples_fst.minus(passed_pos_examples_fst)
                    lost_examples_fst.minimize()
                    print("** Some positive examples were rejected:")
                    lost_paths = lost_examples_fst.extract_paths(output='raw')
                    print_raw_paths(lost_paths[0:20])
            if args.thorough > 1 and op in {"=>", "<=", "<=>", "<--"}:
                # Build negative examples with the rule's scrambler (MIXe)
                # and remove anything that is actually a correct example.
                neg_examples_fsa = examples_fsa.copy()
                neg_examples_fsa.compose(MIXe)
                neg_examples_fsa.output_project()
                neg_examples_fst = hfst.fsa_to_fst(neg_examples_fsa,
                                                   separator="^")
                neg_examples_fst.minus(cfg.examples_fst)
                NG = examples_up_fsa.copy()
                NG.compose(neg_examples_fst)
                npaths = NG.extract_paths(output='raw')
                # print_raw_paths(npaths)
                # Negative examples the rule (incorrectly) accepts.
                passed_neg_examples_fst = NG.copy()
                passed_neg_examples_fst.intersect(R)
                if passed_neg_examples_fst.compare(hfst.empty_fst()):
                    print("All negative examples rejected")
                else:
                    print("** Some negative examples accepted:")
                    npaths = passed_neg_examples_fst.extract_paths(
                        output='raw')
                    print_raw_paths(npaths[0:20])

    if args.lost or args.wrong:
        # Examples (input side) composed with the intersection of all rules.
        RESU = examples_up_fsa.copy()
        print(RESU.number_of_arcs(), "arcs in RESU")
        RESU.compose_intersect(tuple(all_rules_fst_lst))
        RESU.minimize()
    if args.lost:
        # Correct examples rejected by at least one rule.
        lost_positive_examples_fst = cfg.examples_fst.copy()
        lost_positive_examples_fst.minus(RESU)
        lost_positive_examples_fst.minimize()
        lost_stream = hfst.HfstOutputStream(filename=args.lost)
        lost_stream.write(lost_positive_examples_fst)
        lost_stream.flush()
        lost_stream.close()
        print("wrote lost examples to", args.lost)
    if args.wrong:
        # Strings accepted by all rules that are not correct examples.
        # NOTE(review): uses subtract() here but minus() above — presumably
        # aliases in the hfst API; confirm.
        WRONG = RESU.copy()
        WRONG.subtract(cfg.examples_fst)
        WRONG.minimize()
        wrong_stream = hfst.HfstOutputStream(filename=args.wrong)
        wrong_stream.write(WRONG)
        wrong_stream.flush()
        wrong_stream.close()
        print("wrote wrongly accepted examples to", args.wrong)
    if args.output:
        outstream = hfst.HfstOutputStream(filename=args.output)
        for fst in all_rules_fst_lst:
            outstream.write(fst)
        outstream.flush()
        outstream.close()
        print("wrote {} rule transducers to {}".format(len(all_rules_fst_lst),
                                                       args.output))
    return