def input_coercion(name, x_fst, *contexts):
    """Compiles rules like X <-- LC1 _ RC1, ..., LCk _ RCk

    name -- name to be given to the rule FST

    x_fst -- the center (X) of the rule

    \*contexts -- list of contexts, i.e. pairs of left and right context

    Returns a triple:

    rule_fst -- the compiled rule

    selector_fst -- FST which selects examples which are relevant for this rule

    scrambler_fst -- an encoded FST which produces negative examples
    """
    global pistar_fst
    postcondition_fst = x_to_condition(x_fst)
    x_all_fst = pistar_fst.copy()
    temp_fst = x_fst.copy()
    temp_fst.output_project()
    x_all_fst.compose(temp_fst)                 # PI* .o. X.l
    precondition_fst = x_to_condition(x_all_fst)
    context_condition_fst = contexts_to_condition(*contexts)
    precondition_fst.intersect(context_condition_fst)
    rule_fst = generalized_restriction(precondition_fst, postcondition_fst)
    rule_fst.set_name(name)
    if cfg.verbosity >= 20:
        twbt.ppfst(rule_fst, True)
    selector_fst = selector_from_x(x_all_fst)
    scrambler_fst = correct_to_incorrect(x_fst, "input")
    return rule_fst, selector_fst, scrambler_fst
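# Hypothetical usage sketch (not part of the original module): assumes init()
# has been called so that pistar_fst and the "PI" definition exist, and that
# e() compiles two-level regular expressions as defined below.  The rule name,
# center and context expressions are illustrative only, modelled on the test
# cases at the end of this file.
def _demo_input_coercion():
    rule_fst, selector_fst, scrambler_fst = input_coercion(
        "{ao}:o <-- _ {ij}:",               # name used in diagnostics
        e("%{ao%}:o"),                      # the center X of the rule
        (e("[]"), e("[%{ij%} .o. PI]")))    # one (left context, right context) pair
    twbt.ppfst(rule_fst, True)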
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file
    """
    import hfst
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        # print(in_quoted, outsym) ### tilts if insym contains bad chars
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
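# Minimal usage sketch (an assumption, not part of the original module): the
# "x-pair_symbols" property read above is written by read_examples() below as
# a space separated list of pair symbols, e.g. "a:a k:k {ao}:o" (the concrete
# symbols are invented for illustration).
def _demo_read_fst():
    read_fst("examples.fst")
    print(cfg.examples_fst.get_property("x-pair_symbols"))
    print(sorted(cfg.pair_symbol_set))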
def read_examples(filename="test.pstr", build_fsts=True):
    """Reads the examples from the file whose name is 'filename'.

    The file must contain one example per line and each line consists of
    a space separated sequence of pair-symbols.  The examples are
    processed into cfg.example_lst and cfg.example_set, their pairs into
    the symbol sets, and, if build_fsts is True, into cfg.examples_fst.
    """
    if build_fsts:
        import hfst
        examples_bfst = hfst.HfstBasicTransducer()
    exfile = open(filename, "r")
    for line_nl in exfile:
        line = line_nl.strip()
        if not line or line.startswith("!"):
            continue
        pairsym_lst = re.split(r"\s+", line)
        symbol_pair_lst = [cfg.pairsym2sympair(pairsym)
                           for pairsym in pairsym_lst]
        # print("symbol_pair_lst:", symbol_pair_lst) ##
        pair_symbol_str = " ".join([cfg.sympair2pairsym(insym, outsym)
                                    for insym, outsym in symbol_pair_lst])
        # print("pair_symbol_lst:", pair_symbol_lst) ##
        cfg.example_lst.append(pair_symbol_str)
        cfg.example_set.add(pair_symbol_str)    # spaces normalized
        # LINE_FST = hfst.tokenized_fst(symbol_pair_lst)
        # twbt.printfst(LINE_FST, True) ##
        if build_fsts:
            examples_bfst.disjunct(symbol_pair_lst, 0)
        for insym, outsym in symbol_pair_lst:
            cfg.symbol_pair_set.add((insym, outsym))
    exfile.close()
    if cfg.verbosity >= 30:
        print("List of examples:", cfg.example_lst)
        print("List of alphabet symbol pairs:", sorted(cfg.symbol_pair_set))
    if build_fsts:
        cfg.examples_fst = hfst.HfstTransducer(examples_bfst)
        cfg.examples_fst.set_name(filename)
        cfg.examples_fst.minimize()
        if cfg.verbosity >= 30:
            twbt.ppfst(cfg.examples_fst, False, title="Example file as FST") ##
    for insym, outsym in cfg.symbol_pair_set:
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    for insym, outsym in cfg.symbol_pair_set:
        pair_symbol = cfg.sympair2pairsym(insym, outsym)
        cfg.pair_symbol_set.add(pair_symbol)
    if build_fsts:
        pair_symbol_lst = [insym + ':' + outsym
                           for insym, outsym in cfg.symbol_pair_set]
        pair_symbol_str = " ".join(sorted(pair_symbol_lst))
        # print("symbol pairs:", pair_symbol_str) ##
        cfg.examples_fst.set_property("x-pair_symbols", pair_symbol_str)
    return
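# Illustrative sketch of the expected input (an assumption based on the
# docstring above): one example per line, each line a space separated
# sequence of pair symbols, and "!" starting a comment line.  The word and
# the morphophonemes below are invented for illustration:
#
#     ! example pair strings
#     k a l {ao}:o {ij}:i s s a
#
def _demo_read_examples():
    read_examples("test.pstr", build_fsts=True)
    print(cfg.example_lst[:3])
    print(sorted(cfg.symbol_pair_set))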
def e(str):
    """Convert a two-level component expression into a FST.

    str -- a string containing a (two-level) regular expression

    Returns an FST which performs the mapping represented by the expression.
    """
    global XRC
    # print("Regex string:", str) ##
    if str == "":
        return XRC.compile("[]")
    F = XRC.compile(str)
    F.minimize()
    F.set_name(str)
    if cfg.verbosity >= 5:
        twbt.ppfst(F) ##
    return F
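# Illustrative calls (hypothetical, assuming XRC has been initialised with the
# pair alphabet): the expressions follow the xfst-style syntax used elsewhere
# in this file, e.g. in the commented-out define() calls of the test section.
def _demo_e():
    vowels_fst = e("PI .o. [a|e|i|o|ä|ö]")   # surface vowels among the pairs
    center_fst = e("%{ao%}:o")               # a single morphophonemic pair
    twbt.ppfst(vowels_fst)
    twbt.ppfst(center_fst)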
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file
    """
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
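# pairs_to_fst() is not shown in this excerpt.  A minimal sketch consistent
# with the inline construction used in the earlier read_fst() variant might
# look like the following (an assumption, not the original implementation):
def _pairs_to_fst_sketch(symbol_pair_set):
    union_fst = hfst.empty_fst()
    for insym, outsym in symbol_pair_set:
        in_quoted = re.sub(r"([{}])", r"%\1", insym)    # quote braces for hfst.regex
        union_fst.disjunct(hfst.regex(in_quoted + ':' + outsym))
    union_fst.remove_epsilons()
    union_fst.minimize()
    return union_fst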
def init():
    """Initializes the module by computing several common FSTs

    Assumes that twexamp.read_fst() has read in cfg.examples_fst and
    initialized some symbol sets.
    """
    global pistar_fst, pistar_fsa, diamond_sym, diamond_fst
    global trim_pre_fst, trim_post_fst
    assert cfg.examples_fst, "cfg.examples_fst not loaded (by twexamp module)"
    cfg.definitions["PAIRS"] = cfg.all_pairs_fst.copy()
    cfg.definitions["PI"] = cfg.all_pairs_fst.copy()

    diamond_sym = 'DIAMOND'
    diamond_fst = hfst.regex(diamond_sym)
    pi_fst = cfg.all_pairs_fst.copy()
    pistar_fst = cfg.all_pairs_fst.copy()
    pistar_fst.repeat_star()
    pistar_fst.remove_epsilons()
    pistar_fst.minimize()
    pistar_fsa = hfst.fst_to_fsa(pistar_fst, separator='^')
    pi_in_fst = pi_fst.copy()
    pi_in_fst.input_project()
    pi_out_fst = pi_fst.copy()
    pi_out_fst.output_project()
    pi_in_star_fst = pistar_fst.copy()
    pi_in_star_fst.input_project()
    pi_out_star_fst = pistar_fst.copy()
    pi_out_star_fst.output_project()
    if cfg.verbosity >= 20:
        twbt.ppfst(pistar_fst, title="pistar_fst")

    fst1 = fs.star(fs.crossprod(fs.expr("ZERO"), pi_in_fst))
    fst2 = fs.star(fs.concat(fst1, fs.expr("ZERO:BEGIN")))
    fst3 = fs.concat(fst2, pi_in_star_fst)
    fst4 = fs.star(fs.concat(fs.expr("ZERO:END"),
                             fs.star(fs.crossprod(fs.expr("ZERO"), pi_in_fst))))
    trim_pre_fst = fs.concat(fst3, fst4)
    trim_pre_fst.set_name("trim_pre_fst")
    #trim_pre_fst = XRC.compile(
    #    "[[ZERO .x. [PI].u]* ZERO:BEGIN]* " \
    #    "[[PI].u]* " \
    #    "[ZERO:END [ZERO .x. [PI].u]*]*"
    #)

    fst1 = fs.star(fs.crossprod(pi_out_fst, fs.expr("ZERO")))
    fst2 = fs.star(fs.concat(fst1, fs.expr("BEGIN:ZERO")))
    fst3 = fs.concat(fst2, pi_out_star_fst)
    fst4 = fs.star(fs.concat(fs.expr("END:ZERO"),
                             fs.star(fs.crossprod(pi_out_fst, fs.expr("ZERO")))))
    trim_post_fst = fs.concat(fst3, fst4)
    trim_post_fst.set_name("trim_post_fst")
    #trim_post_fst = XRC.compile(
    #    "[[[PI].l .x. ZERO]* BEGIN:ZERO]* " \
    #    "[[PI].l]* " \
    #    "[END:ZERO [[PI].l .x. ZERO]*]*"
    #)

    if cfg.verbosity >= 20:
        twbt.ppfst(trim_pre_fst)
        twbt.ppfst(trim_post_fst)
    return
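# Typical call order (a sketch following the command-line driver further
# below, not a fixed API): load the example FST first so that
# cfg.examples_fst and cfg.all_pairs_fst exist, then build the common FSTs.
def _demo_init():
    read_fst("examples.fst")   # or twexamp.read_fst(...) when used as a module
    init()                     # builds pistar_fst, trim_pre_fst, trim_post_fst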
    selector_fst -- FST which selects examples which are relevant for this rule

    scrambler_fst -- empty_fst (negative examples not relevant for these rules)
    """
    context_condition_fst = contexts_to_condition(*contexts)
    x_condition_fst = x_to_condition(x_fst)
    context_condition_fst.intersect(x_condition_fst)
    null_fst = hfst.empty_fst()
    rule_fst = generalized_restriction(context_condition_fst, null_fst)
    rule_fst.set_name(name)
    # twbt.ppfst(rule_fst, True) ##
    selector_fst = selector_from_x(x_fst)
    scrambler_fst = hfst.empty_fst()
    return rule_fst, selector_fst, scrambler_fst

if __name__ == "__main__":
    twex.read_examples()
    init(1)
    #define("V", "PI .o. [a|e|i|o|ä|ö]")
    #define("C", "[PI .o. [h|l|n|s|t|v]] | %{ij%}:j")
    R1 = doublearrow("{ao}:o <=> _ {ij}:",
                     e("%{ao%}:o"),
                     (e("[]"), e("[%{ij%} .o. PI]")))
    twbt.ppfst(R1, True)
    rule2_fst = doublearrow("{ij}:j <=> V :Ø* _ :Ø* V",
                            "%{ij%}:j",
                            ("V [PI .o. Ø]*", "[PI .o. Ø]* V"))
    twbt.ppfst(rule2_fst, True)
    R3 = doublearrow("{tl}:l <=> _ CLOSED",
                     "%{tl%}:l",
                     ("[]", "V %{ij%}:i* C [C | [PI .o. Ø]* END]"))
    twbt.ppfst(R3, True)
    description="A compiler and tester for two-level rules")
arpar.add_argument("start", help="start parsing from", default="expr_start")
args = arpar.parse_args()

twexamp.read_fst(filename="nounex.fst")
parser = init()
for line_nl in sys.stdin:
    line = line_nl.strip()
    #print(line)
    result = parser.parse(line, start=args.start, semantics=TwolFstSemantics())
    if args.start == "def_start":
        op, left, right, source = result
        print(left, "=")
        twbt.ppfst(right)
    elif args.start == "rul_start":
        op, left, right, source = result
        twbt.ppfst(left)
        print(op)
        for lc, rc in right:
            twbt.ppfst(lc, title="left context")
            twbt.ppfst(rc, title="right context")
    elif args.start == "expr_start":
        fst = result
        #print(fst)
        twbt.ppfst(fst, True)
    elif op == "?":
        print("Incorrect: " + line)
    help="name of the examples fst or example pair symbol string file",
    default="examples.fst")
arpar.add_argument("rules",
    help="name of the rule file",
    default="test.rules")
args = arpar.parse_args()

cfg.verbosity = args.verbosity
if args.recursion:
    sys.setrecursionlimit(args.recursion)

if args.examples.endswith(".fst"):
    twexamp.read_fst(args.examples)
else:
    twexamp.read_examples(args.examples)
if cfg.verbosity >= 30:
    twbt.ppfst(cfg.examples_fst, title="examples_fst")

parser = twparser.init()

examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

examples_up_fsa = cfg.examples_fst.copy()
examples_up_fsa.input_project()
if cfg.verbosity >= 30:
    twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

twrule.init()

skip = False
all_rules_fst_lst = []
rule_file = open(args.rules, 'r')