Beispiel #1
0
def input_coercion(name, x_fst, *contexts):
    r"""Compile a rule like X <-- LC1 _ RC1, ..., LCk _ RCk.

    name -- name to be given to the rule FST

    x_fst -- the center (X) of the rule

    \*contexts -- list of contexts, i.e. pairs of left and right context

    Returns a triple:
    rule_fst -- the compiled rule

    selector_fst -- FST which selects examples which are relevant for
    this rule

    scrambler_fst -- an encoded FST which produces negative examples
    """
    # The postcondition is built from the centre X exactly as given.
    postcondition_fst = x_to_condition(x_fst)

    # Work on copies: the hfst operations below are used for their
    # in-place effect, and neither the module-level pistar_fst nor the
    # caller's x_fst may be altered.
    x_all_fst = pistar_fst.copy()
    temp_fst = x_fst.copy()
    temp_fst.output_project()
    x_all_fst.compose(temp_fst)  # PI* .o. X.l

    # The precondition is the composed centre restricted to the contexts.
    precondition_fst = x_to_condition(x_all_fst)
    context_condition_fst = contexts_to_condition(*contexts)
    precondition_fst.intersect(context_condition_fst)

    rule_fst = generalized_restriction(precondition_fst, postcondition_fst)
    rule_fst.set_name(name)
    if cfg.verbosity >= 20:
        twbt.ppfst(rule_fst, True)

    selector_fst = selector_from_x(x_all_fst)
    scrambler_fst = correct_to_incorrect(x_fst, "input")
    return rule_fst, selector_fst, scrambler_fst
Beispiel #2
0
def read_fst(filename="examples.fst"):
    """Read in a previously stored example FST file.

    filename -- name of the file holding the stored example FST

    The transducer read from the file is stored in cfg.examples_fst.
    Its "x-pair_symbols" property (pair symbols separated by spaces) is
    used for filling cfg.pair_symbol_set, cfg.symbol_pair_set,
    cfg.input_symbol_set and cfg.output_symbol_set.  Finally
    cfg.all_pairs_fst is built as the disjunction of all symbol pairs.

    Returns nothing; all results are stored in cfg.
    """
    import hfst
    exfile = hfst.HfstInputStream(filename)
    try:
        cfg.examples_fst = exfile.read()
    finally:
        # Release the underlying file even if reading fails.
        exfile.close()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        # Curly braces are special in the regex syntax, so they are
        # quoted with "%" in the input symbol.
        # NOTE(review): only insym is quoted; other special characters
        # and outsym are passed through unchanged -- confirm this is
        # sufficient for the alphabets in use.
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
Beispiel #3
0
def read_examples(filename="test.pstr", build_fsts=True):
    """Read the examples from the file whose name is 'filename'.

    The file must contain one example per line and each line consists of
    a space separated sequence of pair-symbols.  Empty lines and lines
    starting with "!" are skipped.

    filename -- name of the example file

    build_fsts -- if true, hfst is imported and the examples are also
    compiled into cfg.examples_fst

    The examples are processed into the following members of cfg:
    example_lst, example_set -- the examples as strings of pair symbols
    separated by single spaces
    symbol_pair_set -- all (insym, outsym) pairs occurring in the examples
    input_symbol_set, output_symbol_set -- the two sides of those pairs
    pair_symbol_set -- the pairs as pair symbols
    examples_fst -- (only if build_fsts) the minimized FST of all
    examples, named after the file and carrying the sorted list of
    "insym:outsym" strings in its "x-pair_symbols" property

    Returns nothing.
    """
    if build_fsts:
        import hfst
        examples_bfst = hfst.HfstBasicTransducer()
    # The with-statement closes the file even if a line fails to parse.
    with open(filename, "r") as exfile:
        for line_nl in exfile:
            line = line_nl.strip()
            if not line or line.startswith("!"):
                continue
            pairsym_lst = re.split(r"\s+", line)
            symbol_pair_lst = [
                cfg.pairsym2sympair(pairsym) for pairsym in pairsym_lst
            ]
            # Converting back and forth normalizes the pair symbols and
            # the spacing between them.
            pair_symbol_str = " ".join(
                cfg.sympair2pairsym(insym, outsym)
                for insym, outsym in symbol_pair_lst
            )
            cfg.example_lst.append(pair_symbol_str)
            cfg.example_set.add(pair_symbol_str)  # spaces normalized
            if build_fsts:
                examples_bfst.disjunct(symbol_pair_lst, 0)
            for insym, outsym in symbol_pair_lst:
                cfg.symbol_pair_set.add((insym, outsym))
    if cfg.verbosity >= 30:
        print("List of examples:", cfg.example_lst)
        print("List of alphabet symbol pairs:", sorted(cfg.symbol_pair_set))
    if build_fsts:
        cfg.examples_fst = hfst.HfstTransducer(examples_bfst)
        cfg.examples_fst.set_name(filename)
        cfg.examples_fst.minimize()
        if cfg.verbosity >= 30:
            twbt.ppfst(cfg.examples_fst, False,
                       title="Example file as FST")
    for insym, outsym in cfg.symbol_pair_set:
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
        cfg.pair_symbol_set.add(cfg.sympair2pairsym(insym, outsym))
    if build_fsts:
        # NOTE(review): the property is built with plain "insym:outsym"
        # concatenation rather than cfg.sympair2pairsym -- confirm that
        # the reader of "x-pair_symbols" expects this form.
        pair_symbol_lst = [
            insym + ':' + outsym for insym, outsym in cfg.symbol_pair_set
        ]
        pair_symbol_str = " ".join(sorted(pair_symbol_lst))
        cfg.examples_fst.set_property("x-pair_symbols", pair_symbol_str)
Beispiel #4
0
def e(str):
    """Compile a two-level component expression into an FST.

    str -- a string containing a (two-level) regular expression

    Returns the FST which XRC compiles from str.  A non-empty expression
    is minimized and named after its source string; the empty string is
    compiled as "[]" and returned as such.
    """
    if str != "":
        compiled_fst = XRC.compile(str)
        compiled_fst.minimize()
        compiled_fst.set_name(str)
        # Show the result only at verbosity 5 or above.
        if cfg.verbosity >= 5:
            twbt.ppfst(compiled_fst)
        return compiled_fst
    # Empty input: compile the empty expression instead.
    return XRC.compile("[]")
Beispiel #5
0
def read_fst(filename="examples.fst"):
    """Read in a previously stored example FST file.

    filename -- name of the file holding the stored example FST

    The transducer read from the file is stored in cfg.examples_fst.
    Its "x-pair_symbols" property (pair symbols separated by spaces) is
    used for filling cfg.pair_symbol_set, cfg.symbol_pair_set,
    cfg.input_symbol_set and cfg.output_symbol_set.  Finally
    cfg.all_pairs_fst is built from cfg.symbol_pair_set by pairs_to_fst().

    Returns nothing; all results are stored in cfg.
    """
    exfile = hfst.HfstInputStream(filename)
    try:
        cfg.examples_fst = exfile.read()
    finally:
        # Release the underlying file even if reading fails.
        exfile.close()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
Beispiel #6
0
def init():
    """Initialize the module by computing several common FSTs.

    Assumes that twexamp.read_fst() has read in cfg.examples_fst and
    initialized some symbol sets.

    Sets the module-level names pistar_fst, pistar_fsa, diamond_sym,
    diamond_fst, trim_pre_fst and trim_post_fst, and stores copies of
    cfg.all_pairs_fst in cfg.definitions under "PAIRS" and "PI".
    """
    global pistar_fst, pistar_fsa, diamond_sym, diamond_fst
    global trim_pre_fst, trim_post_fst

    assert cfg.examples_fst, "cfg.examples_fst not loaded (by twexamp module)"

    # Each definition gets its own independent copy of the pair alphabet.
    for definition_name in ("PAIRS", "PI"):
        cfg.definitions[definition_name] = cfg.all_pairs_fst.copy()

    diamond_sym = 'DIAMOND'
    diamond_fst = hfst.regex(diamond_sym)

    # Any sequence of symbol pairs, as an FST and as an FSA.
    pair_fst = cfg.all_pairs_fst.copy()
    pistar_fst = cfg.all_pairs_fst.copy()
    pistar_fst.repeat_star()
    pistar_fst.remove_epsilons()
    pistar_fst.minimize()
    pistar_fsa = hfst.fst_to_fsa(pistar_fst, separator='^')

    # Input and output projections of a single pair and of pair sequences.
    input_side_fst = pair_fst.copy()
    input_side_fst.input_project()
    output_side_fst = pair_fst.copy()
    output_side_fst.output_project()
    input_side_star_fst = pistar_fst.copy()
    input_side_star_fst.input_project()
    output_side_star_fst = pistar_fst.copy()
    output_side_star_fst.output_project()
    if cfg.verbosity >= 20:
        twbt.ppfst(pistar_fst, title="pistar_fst")

    # trim_pre_fst; the author's regular expression for it was:
    #    "[[ZERO .x. [PI].u]* ZERO:BEGIN]* " \
    #    "[[PI].u]* " \
    #    "[ZERO:END [ZERO .x. [PI].u]*]*"
    pre_inserted = fs.star(fs.crossprod(fs.expr("ZERO"), input_side_fst))
    pre_leading = fs.star(fs.concat(pre_inserted, fs.expr("ZERO:BEGIN")))
    pre_head = fs.concat(pre_leading, input_side_star_fst)
    pre_end_mark = fs.expr("ZERO:END")
    pre_after_end = fs.star(fs.crossprod(fs.expr("ZERO"), input_side_fst))
    pre_trailing = fs.star(fs.concat(pre_end_mark, pre_after_end))
    trim_pre_fst = fs.concat(pre_head, pre_trailing)
    trim_pre_fst.set_name("trim_pre_fst")

    # trim_post_fst; the author's regular expression for it was:
    #    "[[[PI].l .x. ZERO]* BEGIN:ZERO]* " \
    #    "[[PI].l]* " \
    #    "[END:ZERO [[PI].l .x. ZERO]*]*"
    post_deleted = fs.star(fs.crossprod(output_side_fst, fs.expr("ZERO")))
    post_leading = fs.star(fs.concat(post_deleted, fs.expr("BEGIN:ZERO")))
    post_head = fs.concat(post_leading, output_side_star_fst)
    post_end_mark = fs.expr("END:ZERO")
    post_after_end = fs.star(fs.crossprod(output_side_fst, fs.expr("ZERO")))
    post_trailing = fs.star(fs.concat(post_end_mark, post_after_end))
    trim_post_fst = fs.concat(post_head, post_trailing)
    trim_post_fst.set_name("trim_post_fst")

    if cfg.verbosity >= 20:
        for trimmer_fst in (trim_pre_fst, trim_post_fst):
            twbt.ppfst(trimmer_fst)
    return
Beispiel #7
0
    selector_fst -- FST which selects examples which are relevant for this rule
    
    scrambler_fst -- empty_fst (negative examples not relevant for these rules)
    """
    context_condition_fst = contexts_to_condition(*contexts)
    x_condition_fst = x_to_condition(x_fst)
    context_condition_fst.intersect(x_condition_fst)
    null_fst = hfst.empty_fst()
    rule_fst = generalized_restriction(context_condition_fst, null_fst)
    rule_fst.set_name(name)
    # twbt.ppfst(rule_fst, True) ##
    selector_fst = selector_from_x(x_fst)
    scrambler_fst = hfst.empty_fst()
    return rule_fst, selector_fst, scrambler_fst


# Ad-hoc demonstration; runs only when the module is executed as a script.
if __name__ == "__main__":
    # Load the examples using read_examples()'s default arguments.
    twex.read_examples()
    init(1)
    # Earlier experiments with named definitions, left disabled:
    #define("V", "PI .o.[a|e|i|o|ä|ö]")
    #define("C", "[PI .o. [h|l|n|s|t|v]] | %{ij%}:j")

    # Three sample double-arrow rules, each compiled and then shown with
    # twbt.ppfst.  The first argument is the rule text used as the name,
    # the second the centre, the third a (left, right) context pair.
    # NOTE(review): R1 passes the centre and contexts as FSTs built by
    # e(), whereas the two rules below pass plain expression strings --
    # confirm which form doublearrow() expects.
    R1 = doublearrow("{ao}:o <=> _ {ij}:", e("%{ao%}:o"),
                     (e("[]"), e("[%{ij%} .o. PI]")))
    twbt.ppfst(R1, True)
    # NOTE(review): these contexts refer to V and C, whose define() calls
    # above are commented out -- verify that they are defined elsewhere.
    rule2_fst = doublearrow("{ij}:j <=> V :Ø* _ :Ø* V", "%{ij%}:j",
                            ("V [PI .o. Ø]*", "[PI .o. Ø]* V"))
    twbt.ppfst(rule2_fst, True)
    R3 = doublearrow("{tl}:l <=> _ CLOSED", "%{tl%}:l",
                     ("[]", "V %{ij%}:i* C [C | [PI .o. Ø]* END]"))
    twbt.ppfst(R3, True)
Beispiel #8
0
        description="A compiler and tester for two-level rules")
    arpar.add_argument("start",
                       help="start parseing from",
                       default="expr_start")
    args = arpar.parse_args()
    twexamp.read_fst(filename="nounex.fst")
    parser = init()
    for line_nl in sys.stdin:
        line = line_nl.strip()
        #print(line)
        result = parser.parse(line,
                              start=args.start,
                              semantics=TwolFstSemantics())
        if args.start == "def_start":
            op, left, right, source = result
            print(left, "=")
            twbt.ppfst(right)
        elif args.start == "rul_start":
            op, left, right, source = result
            twbt.ppfst(left)
            print(op)
            for lc, rc in right:
                twbt.ppfst(lc, title="left context")
                twbt.ppfst(rc, title="right context")
        elif args.start == "expr_start":
            fst = result
            #print(fst)
            twbt.ppfst(fst, True)
        elif op == "?":
            print("Incorrect: " + line)
Beispiel #9
0
    help="name of the examples fst or example pair symbol string file",
    default="examples.fst")
# NOTE(review): argparse uses `default` for a positional argument only when
# nargs is '?' or '*'; as written, "rules" is required and "test.rules" is
# presumably never used -- confirm the intended command line.
arpar.add_argument("rules", help="name of the rule file", default="test.rules")
args = arpar.parse_args()

# Make the requested verbosity available to the other modules through cfg.
cfg.verbosity = args.verbosity
# Change the interpreter's recursion limit only if a (non-zero) value was given.
if args.recursion:
    sys.setrecursionlimit(args.recursion)

# The file name extension decides how the examples are loaded: a stored
# FST (".fst") or anything else, which is read with read_examples().
if args.examples.endswith(".fst"):
    twexamp.read_fst(args.examples)
else:
    twexamp.read_examples(args.examples)

if cfg.verbosity >= 30:
    twbt.ppfst(cfg.examples_fst, title="examples_fst")

parser = twparser.init()

# The examples converted with hfst.fst_to_fsa, using "^" as the separator.
examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

# Input projection of the examples, computed on a copy so that
# cfg.examples_fst itself is left untouched.
examples_up_fsa = cfg.examples_fst.copy()
examples_up_fsa.input_project()
if cfg.verbosity >= 30:
    twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

twrule.init()

# State for the rule-processing code that follows this excerpt.
skip = False
all_rules_fst_lst = []
# NOTE(review): opened without a with-statement; no close() is visible in
# this part of the file -- verify that the file is closed further down.
rule_file = open(args.rules, 'r')