def center_exclusion(name, x_fst, *contexts):
    """Compiles rules like X /<= (LC1, RC1), ..., (LCk, RCk)

    name -- name to be given to the rule FST

    x_fst -- the center (X) of the rule

    *contexts -- list of contexts, i.e. pairs of left and right context

    Returns a triple:

    rule_fst -- the compiled rule

    selector_fst -- FST which selects examples which are relevant for this rule

    scrambler_fst -- empty_fst (negative examples not relevant for these rules)
    """
    context_condition_fst = contexts_to_condition(*contexts)
    x_condition_fst = x_to_condition(x_fst)
    context_condition_fst.intersect(x_condition_fst)
    null_fst = hfst.empty_fst()
    rule_fst = generalized_restriction(context_condition_fst, null_fst)
    rule_fst.set_name(name)
    # twbt.ppfst(rule_fst, True) ##
    selector_fst = selector_from_x(x_fst)
    scrambler_fst = hfst.empty_fst()
    return rule_fst, selector_fst, scrambler_fst
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file."""
    import hfst
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        # print(in_quoted, outsym)  ### fails if insym contains bad chars
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
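# A minimal sketch of the read pattern used above, limited to the generic HFST
# calls (the cfg bookkeeping is omitted). "examples.fst" is assumed to be a
# file written earlier by the example compiler.
import hfst

instream = hfst.HfstInputStream("examples.fst")
examples_fst = instream.read()
instream.close()
# properties are plain strings stored in the FST header
print(examples_fst.get_property("x-pair_symbols"))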
def __init__(self, smoothing=0.0, alpha=0.05, freq_threshold=1) -> None:
    self.automaton = hfst.empty_fst()
    self.smoothing = smoothing
    self.alpha = alpha
    self.freq_threshold = freq_threshold
    if self.smoothing > 0:
        self.smoothing_model = UnigramRootModel()
def _compose_block(block, delenv, right_tr, tokenizer):
    tr = hfst.empty_fst()
    for word in block:
        tr.disjunct(hfst.tokenized_fst(tokenizer.tokenize(word)))
    tr.minimize()
    tr.compose(delenv)
    tr.minimize()
    tr.compose(right_tr)
    tr.minimize()
    return tr
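# Usage sketch for _compose_block: the deletion environment and the right-hand
# transducer are stood in for by identity transducers built from plain regexes,
# purely for illustration.
import hfst

tokenizer = hfst.HfstTokenizer()
identity = hfst.regex("?*")        # copies any string to itself
block_tr = _compose_block(["cat", "cats"], identity, identity.copy(), tokenizer)
print(block_tr.extract_paths(output="text"))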
def __init__(self, gen, penal_method="matching"):
    """
    :param gen: A candidate generator (either Generator or HfstTransducer object)
    :param penal_method: Use matching (default) or counting approach for removal of marked candidates
    """
    self._gen = gen.generate() if isinstance(gen, Generator) else gen
    self._penal_method = penal_method
    self._constraints = list()
    self._runnable = hfst.empty_fst()  # Final FST for simple lookup
    self._stepwise = list()            # Intermediate FSTs for candidate tracing
def aligner(words, max_zeros_in_longest, line):
    """Aligns a list of words according to the similarity of their phonemes.

    words -- a list of words (or morphs) to be aligned

    max_zeros_in_longest -- maximum number of zeros to be inserted into
    the longest word

    line -- the input line (used only in warning messages)

    cfg.all_zero_weight -- weight used when the phoneme set is {"Ø"} (default 100.0)

    Returns the best alignment as a list of raw morphophonemes.
    """
    max_length = max([grapheme.length(x) for x in words])
    weighted_fsa = hfst.empty_fst()
    for m in range(max_length, max_length + max_zeros_in_longest + 1):
        R = multialign(words, m)
        if R.compare(hfst.empty_fst()):
            if cfg.verbosity > 1:
                print("target length", m, "failed")
            continue
        weighted_fsa.disjunct(R)
    weighted_fsa.minimize()
    weighted_fsa.n_best(10)        # keep the 10 best results
    weighted_fsa.minimize()
    results = weighted_fsa.extract_paths(output="raw")
    if cfg.verbosity >= 5:
        for w, sym_pair_seq in results:
            lst = [isym for isym, outsym in sym_pair_seq]
            mpw = ["{}::{:.2f}".format(x, mphon_weight(x)) for x in lst]
            print(" ".join(mpw), "total weight = {:.3f}".format(w))
    if len(results) < 1:
        print("*** NO ALIGNMENTS FOR:", line, "***", results)
        return []
    best_syl_struct = prefer_syl_struct(results)
    if cfg.final:
        best = prefer_final_zeros(best_syl_struct)
    else:
        best = best_syl_struct[0]
    return best
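# The core pattern in aligner(): disjunct weighted candidate automata, then
# prune with n_best. The strings and weights below are made up for
# illustration only.
import hfst

cands = hfst.empty_fst()
for rx in ['{kala}::1.0', '{kalaa}::2.5', '{kallo}::7.0']:
    cands.disjunct(hfst.regex(rx))    # weighted candidate strings
cands.minimize()
cands.n_best(2)                       # keep only the two lightest paths
for w, sym_pair_seq in cands.extract_paths(output="raw"):
    print(w, "".join(isym for isym, _outsym in sym_pair_seq))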
def main():
    """Invoke a simple CLI analyser."""
    argp = ArgumentParser()
    argp.add_argument('-a', '--analyser', metavar='FSA', required=True,
                      help="Path to FSA analyser")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data in CONLLU")
    options = argp.parse_args()
    analyser = load_analyser(options.analyser)
    sentence = hfst.epsilon_fst()
    if not options.infile:
        options.infile = stdin
    for line in options.infile:
        line = line.strip()
        if not line:
            print("@SENTENCE_SEPARATOR@")
        elif line.startswith('#'):
            print(line)
        else:
            refs = line.strip().split('\t')
            anals = analyse(analyser, refs[1])
            if anals:
                lattice = hfst.empty_fst()
                for anal in anals:
                    surf = refs[1]
                    deep = anal[0]
                    weight = anal[1]  # weight currently unused
                    print(surf, deep)
                    bleh = hfst.fst({surf: deep})
                    lattice.disjunct(bleh)
                sentence.concatenate(lattice)
            else:
                surf = refs[1]
                deep = refs[1] + "|NOUN|Case=Nom|Number=Sing|Guess=Yes|nsubj"
                print(surf, deep)
                bleh = hfst.fst({surf: deep})
                sentence.concatenate(bleh)
            print("@TOKEN_SEPARATOR@")
            foo = hfst.fst("@TOKEN_SEPARATOR@")
            sentence.concatenate(foo)
    exit(0)
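# Stripped-down sketch of the lattice-building pattern used in main(): each
# token's analyses are disjuncted into a small transducer and the token
# lattices are concatenated. The analyses below are invented.
import hfst

sentence = hfst.epsilon_fst()
token_analyses = {"cats": ["cat|NOUN|Number=Plur"]}
for surf, deeps in token_analyses.items():
    lattice = hfst.empty_fst()
    for deep in deeps:
        lattice.disjunct(hfst.fst({surf: deep}))
    sentence.concatenate(lattice)
    sentence.concatenate(hfst.fst("@TOKEN_SEPARATOR@"))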
print("** Some positive examples were rejected:") lost_paths = lost_examples_fst.extract_paths(output='raw') print_raw_paths(lost_paths[0:20]) if args.thorough > 1 and op in {"=>", "<=", "<=>", "<--"}: neg_examples_fsa = examples_fsa.copy() neg_examples_fsa.compose(MIXe) neg_examples_fsa.output_project() neg_examples_fst = hfst.fsa_to_fst(neg_examples_fsa, separator="^") neg_examples_fst.minus(cfg.examples_fst) NG = examples_up_fsa.copy() NG.compose(neg_examples_fst) npaths = NG.extract_paths(output='raw') #print_raw_paths(npaths) passed_neg_examples_fst = NG.copy() passed_neg_examples_fst.intersect(R) if passed_neg_examples_fst.compare(hfst.empty_fst()): print("All negative examples rejected") else: print("** Some negative examples accepted:") npaths = passed_neg_examples_fst.extract_paths(output='raw') print_raw_paths(npaths[0:20]) if args.lost or args.wrong: RESU = examples_up_fsa.copy() print(RESU.number_of_arcs(), "arcs in RESU") RESU.compose_intersect(tuple(all_rules_fst_lst)) RESU.minimize() if args.lost: lost_positive_examples_fst = cfg.examples_fst.copy() lost_positive_examples_fst.minus(RESU) lost_positive_examples_fst.minimize()
tok.add_multichar_symbol(hfst.EPSILON)  # TODO: should this be included by default???
test_tokenized(tok, '@_EPSILON_SYMBOL_@foo', None, '[f o o]')

if not hfst.tokenized_fst([(hfst.EPSILON, 'b'), ('f', 'a'), ('o', 'a'), ('o', 'r')]).compare(hfst.regex('[0:b f:a o:a o:r]')):
    raise RuntimeError(get_linenumber())

# Is this ok???
if not hfst.regex('"' + hfst.EPSILON + '"').compare(hfst.regex('[0]')):
    raise RuntimeError(get_linenumber())
if not hfst.regex('"' + hfst.IDENTITY + '"').compare(hfst.regex('[?]')):
    raise RuntimeError(get_linenumber())
if not hfst.regex('"' + hfst.UNKNOWN + '":"' + hfst.UNKNOWN + '"').compare(hfst.regex('[?:?]')):
    raise RuntimeError(get_linenumber())

# other python functions
if not hfst.empty_fst().compare(hfst.regex('[0-0]')):
    raise RuntimeError(get_linenumber())
if not hfst.epsilon_fst().compare(hfst.regex('[0]')):
    raise RuntimeError(get_linenumber())
if not hfst.epsilon_fst(-1.5).compare(hfst.regex('[0]::-1.5')):
    raise RuntimeError(get_linenumber())

# Non-ascii characters and unknowns/identities
tr1 = hfst.regex('Ä:é å ?;')
tr2 = hfst.regex('? Ö;')
tr1.concatenate(tr2)
result = hfst.regex('Ä:é å [Ä|é|å|Ö|?] [Ä|é|å|Ö|?] Ö;')
if not tr1.compare(result):
    raise RuntimeError(get_linenumber())
tr1 = hfst.regex('ñ ?:á;')
def syllabify(self):
    """
    Build a syllabifier FST with the specified settings.

    :return: An HfstTransducer for inserting syllable boundaries into candidates
    """
    return hfst.empty_fst()
def generate(self):
    """
    Build a generator FST with the specified settings.

    :return: An HfstTransducer for generating candidates
    """
    return hfst.empty_fst()
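# Both stubs return hfst.empty_fst(), which accepts nothing and therefore
# serves as a neutral starting point when real candidates are later added
# with disjunct(). The strings below are arbitrary.
import hfst

acc = hfst.empty_fst()
print(acc.extract_paths())            # {} -- the empty FST accepts nothing
for cand in ["ta.lo", "ta.los.sa"]:
    acc.disjunct(hfst.fst(cand))
acc.minimize()
print(sorted(acc.extract_paths()))    # both candidates are now accepted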
ifile.close()
print("Read %i transducers in total" % len(transducers))

# read_att from string
# att_str = """0 1 a b
# 1 2 c d
# 2
# """
# print(att_str)
# tr = hfst.read_att(att_str, '@0@')
# print(tr)
# exit(0)

# write_att
tr1 = hfst.regex('[foo:bar baz:0 " "]::0.3')
tr2 = hfst.empty_fst()
tr3 = hfst.epsilon_fst(0.5)
tr4 = hfst.regex('[foo]')
tr5 = hfst.empty_fst()
f = open('testfile3.att', 'w')
for tr in [tr1, tr2, tr3, tr4]:
    tr.write_att(f)
    f.write('--\n')
tr5.write_att(f)
f.close()

# extract_paths
tr = hfst.regex('a:b+ (a:c+)')
print(tr)
print(tr.extract_paths(max_cycles=1, output='text'))
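# Hedged round-trip sketch for the AT&T file written above: hfst.AttReader
# iterates over the '--'-separated sections of testfile3.att.
with open('testfile3.att', 'r') as f:
    read_back = [tr for tr in hfst.AttReader(f, '@0@')]
print("read", len(read_back), "transducers back from testfile3.att")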
def main():
    version = cfg.timestamp(__file__)

    import argparse
    arpar = argparse.ArgumentParser(
        description="A compiler and tester for two-level rules."
        " Version {}."
        " See https://pytwolc.readthedocs.io/en/latest/index.html"
        " or https://github.com/koskenni/twol"
        " for more information.".format(version))
    arpar.add_argument(
        "-e", "--examples", action='store', nargs='+',
        help="""Either one name of a FST file that contains the examples or
            a list of names of files which contain the PSTR form examples
            used for compiling the rules.""",
        default=[None])
    arpar.add_argument(
        "-r", "--rules", action='store', nargs='+',
        help="""One or more files which contain the rules, either just one
            rule file or a file of defines as the first one and a part of
            the whole rule set as the second""",
        default=[None])
    arpar.add_argument(
        "-o", "--output",
        help="File to which write the compiled rules if a name is given",
        default="")
    arpar.add_argument(
        "-l", "--lost",
        help="File to which write the examples"
        " that were not accepted by all rules"
        " -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-w", "--wrong",
        help="file to which write the wrong strings"
        " that are accepted by all rules -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-t", "--thorough",
        help="test each rule separately: 0 if no testing is desired,"
        " 1 if against positive examples,"
        " 2 against both positive and negative examples."
        " Default is 2.",
        type=int, choices=[0, 1, 2], default=2)
    arpar.add_argument(
        "--recursion",
        help="set the limit for recursion depth",
        type=int)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of diagnostic output",
        type=int, default=0)
    args = arpar.parse_args()

    cfg.verbosity = args.verbosity
    if args.recursion:
        sys.setrecursionlimit(args.recursion)

    if len(args.examples) == 1 and args.examples[0].endswith(".fst"):
        twexamp.read_fst(args.examples[0])
    else:
        twexamp.read_examples(args.examples)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.examples_fst, title="examples_fst")

    parser = twparser_init()

    examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

    examples_up_fsa = cfg.examples_fst.copy()
    examples_up_fsa.input_project()
    if cfg.verbosity >= 30:
        twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

    twrule.init()

    i = 0
    skip = False
    all_rules_fst_lst = []
    line_lst = []

    for line_nl in fileinput.input(args.rules):
        i += 1
        if not line_lst:
            line_nl_lst = []
        line_nl_lst.append(line_nl)
        line = line_nl.split('!', maxsplit=1)[0].strip()
        if line == "START":
            skip = False
            continue
        elif line == "STOP":
            skip = True
        if skip or (not line) or line.startswith("!"):
            continue
        line_lst.append(line)
        if not line.endswith(";"):
            continue
        else:
            rule_str = " ".join(line_lst)
            line_lst = []

        op, left, right = parse_rule(parser, rule_str, i, line_nl_lst)
        if op == "?" or not (left and right):
            continue
        if (args.thorough > 0 and op != "=") or cfg.verbosity > 0:
            print("\n")
            print(rule_str)
        if op == "=":
            # if cfg.verbosity > 0:
            #     print(line)
            if cfg.verbosity >= 10:
                print(left, op)
                twbt.ppfst(right)
            continue
        elif op == "=>":
            R, selector_fst, MIXe = twrule.rightarrow(line, left, *right)
        elif op == "<=":
            R, selector_fst, MIXe = twrule.output_coercion(line, left, *right)
        elif op == "<--":
            R, selector_fst, MIXe = twrule.input_coercion(line, left, *right)
        elif op == "<=>":
            R, selector_fst, MIXe = twrule.doublearrow(line, left, *right)
        elif op == "/<=":
            R, selector_fst, MIXe = twrule.center_exclusion(line, left, *right)
        else:
            print("Error: not a valid type of a rule", op)
            continue
        if cfg.verbosity >= 10:
            twbt.ppfst(R)
        if args.lost or args.wrong or args.output:
            all_rules_fst_lst.append(R)
        if args.thorough > 0:
            selector_fst.intersect(cfg.examples_fst)
            # selector_fst.n_best(5)
            selector_fst.minimize()
            if cfg.verbosity >= 20:
                paths = selector_fst.extract_paths(output='raw')
                print_raw_paths(paths[0:20])
            passed_pos_examples_fst = selector_fst.copy()
            passed_pos_examples_fst.intersect(R)
            if args.thorough > 0:
                if passed_pos_examples_fst.compare(selector_fst):
                    print("All positive examples accepted")
                else:
                    lost_examples_fst = selector_fst.copy()
                    lost_examples_fst.minus(passed_pos_examples_fst)
                    lost_examples_fst.minimize()
                    print("** Some positive examples were rejected:")
                    lost_paths = lost_examples_fst.extract_paths(output='raw')
                    print_raw_paths(lost_paths[0:20])
        if args.thorough > 1 and op in {"=>", "<=", "<=>", "<--"}:
            neg_examples_fsa = examples_fsa.copy()
            neg_examples_fsa.compose(MIXe)
            neg_examples_fsa.output_project()
            neg_examples_fst = hfst.fsa_to_fst(neg_examples_fsa, separator="^")
            neg_examples_fst.minus(cfg.examples_fst)
            NG = examples_up_fsa.copy()
            NG.compose(neg_examples_fst)
            npaths = NG.extract_paths(output='raw')
            # print_raw_paths(npaths)
            passed_neg_examples_fst = NG.copy()
            passed_neg_examples_fst.intersect(R)
            if passed_neg_examples_fst.compare(hfst.empty_fst()):
                print("All negative examples rejected")
            else:
                print("** Some negative examples accepted:")
                npaths = passed_neg_examples_fst.extract_paths(output='raw')
                print_raw_paths(npaths[0:20])

    if args.lost or args.wrong:
        RESU = examples_up_fsa.copy()
        print(RESU.number_of_arcs(), "arcs in RESU")
        RESU.compose_intersect(tuple(all_rules_fst_lst))
        RESU.minimize()
    if args.lost:
        lost_positive_examples_fst = cfg.examples_fst.copy()
        lost_positive_examples_fst.minus(RESU)
        lost_positive_examples_fst.minimize()
        lost_stream = hfst.HfstOutputStream(filename=args.lost)
        lost_stream.write(lost_positive_examples_fst)
        lost_stream.flush()
        lost_stream.close()
        print("wrote lost examples to", args.lost)
    if args.wrong:
        WRONG = RESU.copy()
        WRONG.subtract(cfg.examples_fst)
        WRONG.minimize()
        wrong_stream = hfst.HfstOutputStream(filename=args.wrong)
        wrong_stream.write(WRONG)
        wrong_stream.flush()
        wrong_stream.close()
        print("wrote wrongly accepted examples to", args.wrong)
    if args.output:
        outstream = hfst.HfstOutputStream(filename=args.output)
        for fst in all_rules_fst_lst:
            outstream.write(fst)
        outstream.flush()
        outstream.close()
        print("wrote {} rule transducers to {}".format(
            len(all_rules_fst_lst), args.output))
    return