Example #1
def center_exclusion(name, x_fst, *contexts):
    """Compiles rules like X /<= [LC1,RC1),...(LCk,RCk)]
    
    name -- name to be given to the rule FST
    
    x_fst -- the center (X) of the rule
    
    *contexts -- list of contexts, i.e. pairs of left and right context FSTs
    
    Returns a triple:
    
    rule_fst -- the compiled rule
    
    selector_fst -- FST which selects examples which are relevant for this rule
    
    scrambler_fst -- empty_fst (negative examples not relevant for these rules)
    """
    context_condition_fst = contexts_to_condition(*contexts)
    x_condition_fst = x_to_condition(x_fst)
    context_condition_fst.intersect(x_condition_fst)
    null_fst = hfst.empty_fst()
    rule_fst = generalized_restriction(context_condition_fst, null_fst)
    rule_fst.set_name(name)
    # twbt.ppfst(rule_fst, True) ##
    selector_fst = selector_from_x(x_fst)
    scrambler_fst = hfst.empty_fst()
    return rule_fst, selector_fst, scrambler_fst
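A hypothetical invocation (a sketch): center_exclusion depends on helpers from its own module (contexts_to_condition, x_to_condition, generalized_restriction, selector_from_x), so the FSTs below are illustrative, and the (left, right) pair format is assumed from the docstring.

x_fst = hfst.regex("k:v")                # the center X
left = hfst.regex("[?*]")                # any left context
right = hfst.regex("[i ?*]")             # right context starting with i
rule_fst, selector_fst, scrambler_fst = center_exclusion(
    "k:v /<= _ i", x_fst, (left, right))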
Example #2
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file
    """
    import hfst
    import re
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        # print(in_quoted, outsym)  # crashes if insym contains unescaped characters
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
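Why the "%"-escaping above matters (a sketch): in HFST regular expressions, curly braces spell out a string symbol by symbol, so a morphophoneme name such as {ao} must have its braces quoted before hfst.regex will treat it as a single multicharacter symbol.

import hfst
import re

insym = "{ao}"
in_quoted = re.sub(r"([{}])", r"%\1", insym)   # -> "%{ao%}"
pair_fst = hfst.regex(in_quoted + ":a")        # one arc labelled {ao}:a
print(pair_fst.extract_paths(output="text"))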
Example #3
def __init__(self, smoothing=0.0, alpha=0.05, freq_threshold=1) -> None:
    self.automaton = hfst.empty_fst()
    self.smoothing = smoothing
    self.alpha = alpha
    self.freq_threshold = freq_threshold
    if self.smoothing > 0:
        self.smoothing_model = UnigramRootModel()
Example #4
def _compose_block(block, delenv, right_tr, tokenizer):
    """Disjoin the words of a block into a single FST, then compose it
    with delenv and with right_tr, minimizing after each step."""
    tr = hfst.empty_fst()
    for word in block:
        tr.disjunct(hfst.tokenized_fst(tokenizer.tokenize(word)))
    tr.minimize()
    tr.compose(delenv)
    tr.minimize()
    tr.compose(right_tr)
    tr.minimize()
    return tr
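A minimal sketch of calling _compose_block; delenv and right_tr are placeholders here (identity over any string), not the real transducers of the source project.

import hfst

tokenizer = hfst.HfstTokenizer()
identity = hfst.regex("?*")
tr = _compose_block(["cat", "cart"], identity.copy(), identity.copy(), tokenizer)
print(tr.extract_paths(output="text"))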
Example #5
def __init__(self, gen, penal_method="matching"):
    """
    :param gen: A candidate generator (either a Generator or an HfstTransducer object)
    :param penal_method: Use the matching (default) or counting approach for removing marked candidates
    """
    self._gen = gen.generate() if isinstance(gen, Generator) else gen
    self._penal_method = penal_method
    self._constraints = list()
    self._runnable = hfst.empty_fst()  # Final FST for simple lookup
    self._stepwise = list()  # Intermediate FSTs for candidate tracing
Example #6
def aligner(words, max_zeros_in_longest, line):
    """Aligns a list of words according to similarity of their phonemes

    words -- a list of words (or morphs) to be aligned

    max_zeros_in_longest -- maximum number of zeros to be inserted into
    the longest word

    line -- the input line (used only in warning messages)

    cfg.all_zero_weight -- weight used when the phoneme set is {"Ø"} (default 100.0)

    Returns the best alignment as a list of raw morphophonemes.
    """
    max_length = max([grapheme.length(x) for x in words])
    weighted_fsa = hfst.empty_fst()
    for m in range(max_length, max_length + max_zeros_in_longest + 1):
        R = multialign(words, m)
        if R.compare(hfst.empty_fst()):
            if cfg.verbosity > 1:
                print("target length", m, "failed")
            continue
        weighted_fsa.disjunct(R)
        weighted_fsa.minimize()
    weighted_fsa.n_best(10)  # keep the 10 best alignments
    weighted_fsa.minimize()
    results = weighted_fsa.extract_paths(output="raw")
    if cfg.verbosity >= 5:
        for w, sym_pair_seq in results:
            lst = [isym for isym, outsym in sym_pair_seq]
            mpw = ["{}::{:.2f}".format(x, mphon_weight(x)) for x in lst]
            print(" ".join(mpw), "total weight = {:.3f}".format(w))
    if len(results) < 1:
        print("*** NO ALIGNMENTS FOR:", line, "***", results)
        return []
    best_syl_struct = prefer_syl_struct(results)
    if cfg.final:
        best = prefer_final_zeros(best_syl_struct)
    else:
        best = best_syl_struct[0]
    return best
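The pruning idiom used above, isolated on a toy weighted FST (a sketch that does not depend on multialign): disjoin the alternatives, keep the n best paths, and read them out as raw (weight, symbol-pair-sequence) tuples.

import hfst

tr = hfst.regex('[{kata}:{kata}]::1.0 | [{kata}:{kada}]::0.5')
tr.n_best(1)       # keep only the cheapest path
tr.minimize()
for w, sym_pair_seq in tr.extract_paths(output="raw"):
    print("weight", w, ["{}:{}".format(i, o) for i, o in sym_pair_seq])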
Example #7
def main():
    """Invoke a simple CLI analyser."""
    argp = ArgumentParser()
    argp.add_argument('-a', '--analyser', metavar='FSA', required=True,
                      help="Path to FSA analyser")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data in CONLLU")
    options = argp.parse_args()
    analyser = load_analyser(options.analyser)
    sentence = hfst.epsilon_fst()
    if not options.infile:
        options.infile = stdin
    for line in options.infile:
        line = line.strip()
        if not line:
            print("@SENTENCE_SEPARATOR@")
        elif line.startswith('#'):
            print(line)
        else:
            refs = line.strip().split('\t')
            anals = analyse(analyser, refs[1])
            if anals:
                lattice = hfst.empty_fst()
                for anal in anals:
                    surf = refs[1]
                    deep = anal[0]
                    weight = anal[1]
                    print(surf, deep)
                    bleh = hfst.fst({surf: deep})
                    lattice.disjunct(bleh)
                sentence.concatenate(lattice)
            else:
                surf = refs[1]
                deep = refs[1] + "|NOUN|Case=Nom|Number=Sing|Guess=Yes|nsubj"
                print(surf, deep)
                bleh = hfst.fst({surf: deep})
                sentence.concatenate(bleh)
            print("@TOKEN SEPARATOR@")
            foo = hfst.fst("@TOKEN_SEPARATOR@")
            sentence.concatenate(foo)
    exit(0)
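The lattice idiom from the loop above, in isolation (a sketch with made-up analyses): each token contributes a disjunction of its readings, and the tokens are concatenated into a single sentence FST.

import hfst

sentence = hfst.epsilon_fst()
for surf, readings in [("cats", ["cat|NOUN|Number=Plur"]),
                       ("sleep", ["sleep|VERB"])]:
    lattice = hfst.empty_fst()
    for deep in readings:
        lattice.disjunct(hfst.fst({surf: deep}))
    sentence.concatenate(lattice)
    sentence.concatenate(hfst.fst("@TOKEN_SEPARATOR@"))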
Example #8
    tok.add_multichar_symbol(hfst.EPSILON) # TODO: should this be included by default???
    test_tokenized(tok, '@_EPSILON_SYMBOL_@foo', None, '[f o o]')

    if not hfst.tokenized_fst([(hfst.EPSILON,'b'),('f','a'),('o','a'),('o','r')]).compare(hfst.regex('[0:b f:a o:a o:r]')):
        raise RuntimeError(get_linenumber())

    # Is this ok???
    if not hfst.regex('"' + hfst.EPSILON + '"').compare(hfst.regex('[0]')):
        raise RuntimeError(get_linenumber())
    if not hfst.regex('"' + hfst.IDENTITY + '"').compare(hfst.regex('[?]')):
        raise RuntimeError(get_linenumber())
    if not hfst.regex('"' + hfst.UNKNOWN + '":"' + hfst.UNKNOWN + '"').compare(hfst.regex('[?:?]')):
        raise RuntimeError(get_linenumber())

    # other python functions
    if not hfst.empty_fst().compare(hfst.regex('[0-0]')):
        raise RuntimeError(get_linenumber())
    if not hfst.epsilon_fst().compare(hfst.regex('[0]')):
        raise RuntimeError(get_linenumber())
    if not hfst.epsilon_fst(-1.5).compare(hfst.regex('[0]::-1.5')):
        raise RuntimeError(get_linenumber())

    # Non-ascii characters and unknowns/identities
    tr1 = hfst.regex('Ä:é å ?;')
    tr2 = hfst.regex('? Ö;')
    tr1.concatenate(tr2)
    result = hfst.regex('Ä:é å [Ä|é|å|Ö|?] [Ä|é|å|Ö|?] Ö;')
    if not tr1.compare(result):
        raise RuntimeError(get_linenumber())

    tr1 = hfst.regex('ñ ?:á;')
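The distinction the comparisons above rely on (a sketch): hfst.empty_fst() accepts no string at all, while hfst.epsilon_fst() accepts exactly the empty string.

import hfst

print(hfst.empty_fst().lookup(""))    # no results
print(hfst.epsilon_fst().lookup(""))  # the empty string, at weight 0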
Example #9
def syllabify(self):
    """
    Build a syllabifier FST with the specified settings.
    :return: An HfstTransducer for inserting syllable boundaries into candidates
    """
    return hfst.empty_fst()
Example #10
def generate(self):
    """
    Build a generator FST with the specified settings.
    :return: An HfstTransducer for generating candidates
    """
    return hfst.empty_fst()
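A note on the stub pattern shared by the syllabify and generate methods above (a sketch): an empty FST is the identity element of disjunction, so callers that accumulate candidates with disjunct() can start from these placeholders safely.

import hfst

acc = hfst.empty_fst()
acc.disjunct(hfst.regex('{foo}'))
acc.minimize()
# acc now accepts exactly "foo"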
Example #11
        ifile.close()
        print("Read %i transducers in total" % len(transducers))

# read_att from string
#att_str = """0 1 a b
#1 2 c d
#2
#"""
#print(att_str)
#tr = hfst.read_att(att_str, '@0@')
#print(tr)
#exit(0)

        # write_att
        tr1 = hfst.regex('[foo:bar baz:0 " "]::0.3')
        tr2 = hfst.empty_fst()
        tr3 = hfst.epsilon_fst(0.5)
        tr4 = hfst.regex('[foo]')
        tr5 = hfst.empty_fst()

        f = open('testfile3.att', 'w')
        for tr in [tr1, tr2, tr3, tr4]:
            tr.write_att(f)
            f.write('--\n')
        tr5.write_att(f)
        f.close()

        # extract_paths
        tr = hfst.regex('a:b+ (a:c+)')
        print(tr)
        print(tr.extract_paths(max_cycles=1, output='text'))
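Reading testfile3.att back (a sketch): hfst.AttReader iterates over the transducers in an AT&T format file, treating the '--' lines written above as separators.

import hfst

with open('testfile3.att', 'r') as f:
    for tr in hfst.AttReader(f):
        print(tr)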
Example #12
def main():

    version = cfg.timestamp(__file__)
    import argparse
    arpar = argparse.ArgumentParser(
        description="A compiler and tester for two-level rules."\
        " Version {}."\
        " See https://pytwolc.readthedocs.io/en/latest/index.html"\
        " or https://github.com/koskenni/twol"\
        " for more information.".format(version))
    arpar.add_argument(
        "-e", "--examples", action='store', nargs='+',
        help="""Either one name of a FST file that contains the examples or
            a list of names of files which contain the PSTR form examples
            used for compiling the rules.""",
        default=[None])
    arpar.add_argument(
        "-r", "--rules", action='store', nargs='+',
        help="""One or more files which contain the rules,
             either just one rule file or a file of defines
             as the first one and a part of the whole rule set
             as the second""",
        default=[None])
    arpar.add_argument(
        "-o", "--output",
        help="File to which write the compiled rules if a name is given",
        default="")
    arpar.add_argument(
        "-l", "--lost",
        help="File to which write the examples"\
        " that were not accepted by all rules"\
        " -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-w", "--wrong",
        help="file to which write the wrong strings"\
        " that are accepted by all rules -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-t", "--thorough",
        help="test each rule separately: 0 if no testing is desired,"\
        " 1 if against positive examples,"
        " 2 against both positive and negative examples."\
        " Default is 2.",
        type=int, choices=[0, 1, 2], default=2)
    arpar.add_argument(
        "--recursion",
        help="set the limit for recursion depth",
        type=int)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of  diagnostic output",
        type=int, default=0)

    args = arpar.parse_args()

    cfg.verbosity = args.verbosity
    if args.recursion:
        sys.setrecursionlimit(args.recursion)

    if len(args.examples) == 1 and args.examples[0].endswith(".fst"):
        twexamp.read_fst(args.examples[0])
    else:
        twexamp.read_examples(args.examples)

    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.examples_fst, title="examples_fst")

    parser = twparser_init()

    examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

    examples_up_fsa = cfg.examples_fst.copy()
    examples_up_fsa.input_project()
    if cfg.verbosity >= 30:
        twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

    twrule.init()

    i = 0
    skip = False
    all_rules_fst_lst = []
    line_lst = []

    for line_nl in fileinput.input(args.rules):
        i += 1
        if not line_lst:
            line_nl_lst = []
        line_nl_lst.append(line_nl)
        line = line_nl.split('!', maxsplit=1)[0].strip()
        if line == "START":
            skip = False
            continue
        elif line == "STOP":
            skip = True
        if skip or (not line) or line.startswith("!"):
            continue
        line_lst.append(line)
        if not line.endswith(";"):
            continue
        else:
            rule_str = " ".join(line_lst)
            line_lst = []

        op, left, right = parse_rule(parser, rule_str, i, line_nl_lst)
        if op == "?" or not (left and right):
            continue

        if (args.thorough > 0 and op != "=") or cfg.verbosity > 0:
            print("\n")
            print(rule_str)

        if op == "=":
            #        if cfg.verbosity > 0:
            #            print(line)
            if cfg.verbosity >= 10:
                print(left, op)
                twbt.ppfst(right)
            continue
        elif op == "=>":
            R, selector_fst, MIXe = twrule.rightarrow(line, left, *right)
        elif op == "<=":
            R, selector_fst, MIXe = twrule.output_coercion(line, left, *right)
        elif op == "<--":
            R, selector_fst, MIXe = twrule.input_coercion(line, left, *right)
        elif op == "<=>":
            R, selector_fst, MIXe = twrule.doublearrow(line, left, *right)
        elif op == "/<=":
            R, selector_fst, MIXe = twrule.center_exclusion(line, left, *right)
        else:
            print("Error: not a valid type of a rule", op)
            continue
        if cfg.verbosity >= 10:
            twbt.ppfst(R)
        if args.lost or args.wrong or args.output:
            all_rules_fst_lst.append(R)
        if args.thorough > 0:
            selector_fst.intersect(cfg.examples_fst)
            # selector_fst.n_best(5)
            selector_fst.minimize()
            if cfg.verbosity >= 20:
                paths = selector_fst.extract_paths(output='raw')
                print_raw_paths(paths[0:20])
            passed_pos_examples_fst = selector_fst.copy()
            passed_pos_examples_fst.intersect(R)
            if passed_pos_examples_fst.compare(selector_fst):
                print("All positive examples accepted")
            else:
                lost_examples_fst = selector_fst.copy()
                lost_examples_fst.minus(passed_pos_examples_fst)
                lost_examples_fst.minimize()
                print("** Some positive examples were rejected:")
                lost_paths = lost_examples_fst.extract_paths(output='raw')
                print_raw_paths(lost_paths[0:20])
        if args.thorough > 1 and op in {"=>", "<=", "<=>", "<--"}:
            neg_examples_fsa = examples_fsa.copy()
            neg_examples_fsa.compose(MIXe)
            neg_examples_fsa.output_project()
            neg_examples_fst = hfst.fsa_to_fst(neg_examples_fsa, separator="^")
            neg_examples_fst.minus(cfg.examples_fst)
            NG = examples_up_fsa.copy()
            NG.compose(neg_examples_fst)
            npaths = NG.extract_paths(output='raw')
            #print_raw_paths(npaths)
            passed_neg_examples_fst = NG.copy()
            passed_neg_examples_fst.intersect(R)
            if passed_neg_examples_fst.compare(hfst.empty_fst()):
                print("All negative examples rejected")
            else:
                print("** Some negative examples accepted:")
                npaths = passed_neg_examples_fst.extract_paths(output='raw')
                print_raw_paths(npaths[0:20])

    if args.lost or args.wrong:
        RESU = examples_up_fsa.copy()
        print(RESU.number_of_arcs(), "arcs in RESU")
        RESU.compose_intersect(tuple(all_rules_fst_lst))
        RESU.minimize()
    if args.lost:
        lost_positive_examples_fst = cfg.examples_fst.copy()
        lost_positive_examples_fst.minus(RESU)
        lost_positive_examples_fst.minimize()
        lost_stream = hfst.HfstOutputStream(filename=args.lost)
        lost_stream.write(lost_positive_examples_fst)
        lost_stream.flush()
        lost_stream.close()
        print("wrote lost examples to", args.lost)
    if args.wrong:
        WRONG = RESU.copy()
        WRONG.subtract(cfg.examples_fst)
        WRONG.minimize()
        wrong_stream = hfst.HfstOutputStream(filename=args.wrong)
        wrong_stream.write(WRONG)
        wrong_stream.flush()
        wrong_stream.close()
        print("wrote wrongly accepted examples to", args.wrong)
    if args.output:
        outstream = hfst.HfstOutputStream(filename=args.output)
        for fst in all_rules_fst_lst:
            outstream.write(fst)
        outstream.flush()
        outstream.close()
        print("wrote {} rule transducers to {}".format(len(all_rules_fst_lst),
                                                        args.output))
    return
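The encode/decode pair used in main() (a sketch): hfst.fst_to_fsa re-codes each input:output transition as one symbol joined by the separator, so that rule transducers can be combined as plain automata, and hfst.fsa_to_fst undoes the encoding.

import hfst

fst = hfst.regex('k:v a')
fsa = hfst.fst_to_fsa(fst, separator='^')   # arcs labelled like "k^v"
back = hfst.fsa_to_fst(fsa, separator='^')
assert back.compare(fst)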