Example #1
0
def compile(filename, outfile, feats_file):
    """Compile a rule list into an optimized-lookup HFST transducer file.

    filename -- path of the UTF-8 rule file; trailing spaces/semicolons are
                stripped and the text is split into rules on ',,' + newline
    outfile -- path of the transducer file that is written
    feats_file -- path handed to get_chars() to obtain ``chars``
    """
    stderr.write('Compiling into FST...\n')

    # Use a context manager so the rule file is closed even if reading fails.
    with open(filename, 'r', encoding='utf8') as rule_file:
        rlist = rule_file.read().rstrip(' ;').split(',,\n')

    chars = get_chars(feats_file)

    fst = hfst.regex('0 -> "<S>"')
    fst.compose(hfst.regex('0 -> "<P>" || .#. _ ,, 0 -> "<P>" || _ .#.'))
    double(fst, chars)
    string2string(fst, rlist)

    # Delete preceding input-level symbol
    single(fst, chars)
    # Delete auxiliary symbols
    delete_aux(fst)
    # Minimize and write into .hfst file
    fst.minimize()

    fst.convert(hfst.ImplementationType.HFST_OLW_TYPE)

    ostr = hfst.HfstOutputStream(filename=outfile,
                                 type=hfst.ImplementationType.HFST_OLW_TYPE)
    try:
        ostr.write(fst)
        ostr.flush()
    finally:
        # Release the output stream even when writing raises.
        ostr.close()
    stderr.write('Done.\n')
Example #2
0
def get_fst(start_rule, end_rule, *args):
    """Return a lookup-optimized FST built from a subset of the g2p.twolc rules.

    Rule 0 is always selected, together with rules start_rule..end_rule
    (inclusive). When extra positional arguments are given, the inclusive
    range args[0]..args[1] is selected as well.
    """
    twolc_source = Path('g2p.twolc')
    compiled_rules = Path('g2p_test_from_py.tmp.hfst')
    hfst.compile_twolc_file(twolc_source.name, compiled_rules.name,
                            resolve_left_conflicts=True)
    print('Preparing rule transducers for composition...', file=sys.stderr)
    rule_stream = hfst.HfstInputStream(compiled_rules.name)

    # Positions (in stream order) of the rule transducers to keep.
    wanted = {0}
    wanted.update(range(start_rule, end_rule + 1))
    if args:
        wanted.update(range(args[0], args[1] + 1))

    selected_rules = [rule for position, rule in enumerate(rule_stream)
                      if position in wanted]

    print('Creating universal language FST...', file=sys.stderr)
    universal = hfst.regex('?* ;')
    print('Compose-intersecting rules with universal FST...', file=sys.stderr)
    universal.compose_intersect(selected_rules)
    print('Optimizing for fast lookup...', file=sys.stderr)
    universal.lookup_optimize()
    return universal
Example #3
0
def read_fst(filename="examples.fst"):
    """Load a stored example FST and fill the symbol collections in cfg.

    The transducer read from filename is stored in cfg.examples_fst. Its
    "x-pair_symbols" property is split on runs of spaces; every pair symbol
    is recorded in cfg.pair_symbol_set, cfg.symbol_pair_set,
    cfg.input_symbol_set and cfg.output_symbol_set. Finally
    cfg.all_pairs_fst is built as the minimized union of all symbol pairs.
    """
    import hfst
    stream = hfst.HfstInputStream(filename)
    cfg.examples_fst = stream.read()
    symbols_property = cfg.examples_fst.get_property("x-pair_symbols")
    for pairsym in re.split(r" +", symbols_property):
        cfg.pair_symbol_set.add(pairsym)
        insym, outsym = cfg.pairsym2sympair(pairsym)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)

    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        # Curly braces must be %-escaped before the symbol goes into a regex.
        escaped_insym = re.sub(r"([{}])", r"%\1", insym)
        cfg.all_pairs_fst.disjunct(
            hfst.regex("{}:{}".format(escaped_insym, outsym)))
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()

    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
Example #4
0
def get_fst():
    """Compile g2p.twolc, compose all rules, and return the optimized FST.

    The result is also written to 'g2p_from_py.hfstol'. The rules are
    recompiled on every call; no modification-time caching is performed.
    """
    twolc_source = Path('g2p.twolc')
    compiled_rules = Path('g2p_from_py.tmp.hfst')
    lookup_file = Path('g2p_from_py.hfstol')

    print('Compiling twolc rules...', file=sys.stderr)
    hfst.compile_twolc_file(twolc_source.name, compiled_rules.name,
                            resolve_left_conflicts=True)

    print('Preparing rule transducers for composition...', file=sys.stderr)
    all_rules = list(hfst.HfstInputStream(compiled_rules.name))

    print('Creating universal language FST...', file=sys.stderr)
    universal = hfst.regex('?* ;')
    print('Compose-intersecting rules with universal FST...', file=sys.stderr)
    universal.compose_intersect(all_rules)
    print('Optimizing for fast lookup...', file=sys.stderr)
    universal.lookup_optimize()

    print('Writing out final FST...', file=sys.stderr)
    universal.write_to_file(lookup_file.name)
    return universal
Example #5
0
 def test_tokenized(tok, pathin, pathout, exp, weight=0):
     """Check that tokenizing with tok yields the FST described by regex exp.

     When pathout is None, pathin is tokenized with tok.tokenize_one_level();
     otherwise the pair (pathin, pathout) is tokenized with tok.tokenize().
     Raises RuntimeError naming the input(s) when the result differs.
     """
     # Identity test: None is a singleton and "== None" can be overridden.
     one_level = pathout is None
     if one_level:
         tokenized = tok.tokenize_one_level(pathin)
     else:
         tokenized = tok.tokenize(pathin, pathout)
     if hfst.tokenized_fst(tokenized, weight).compare(hfst.regex(exp)):
         return
     message = 'test_tokenized failed with input: ' + pathin
     if not one_level:
         message += ", " + pathout
     raise RuntimeError(message)
Example #6
0
 def test_tokenized(tok, pathin, pathout, exp, weight=0):
     """Check that tokenizing with tok yields the FST described by regex exp.

     When pathout is None, pathin is tokenized with tok.tokenize_one_level();
     otherwise the pair (pathin, pathout) is tokenized with tok.tokenize().
     Raises RuntimeError naming the input(s) when the result differs.
     """
     # Identity test: None is a singleton and "== None" can be overridden.
     one_level = pathout is None
     if one_level:
         tokenized = tok.tokenize_one_level(pathin)
     else:
         tokenized = tok.tokenize(pathin, pathout)
     if hfst.tokenized_fst(tokenized, weight).compare(hfst.regex(exp)):
         return
     message = 'test_tokenized failed with input: ' + pathin
     if not one_level:
         message += ", " + pathout
     raise RuntimeError(message)
Example #7
0
    def syllabify(self):
        """Build and return the minimized syllabification transducer.

        Syllabic phonemes are wrapped in nucl_bound, syl_bound is inserted
        between nuclei, and the whole string is surrounded by syl_bound.
        Two optional filters are composed on top, depending on
        self._fill_onset and self._sonorous.
        """
        v = "[ " + out_prefix + " " + _build_regex(
            self._alph.get_phonemes("+syllabic")) + " ]"
        c = "[ " + out_prefix + " " + _build_regex(
            self._alph.get_phonemes("-syllabic")) + " ]"

        fill_nucl = hfst.regex(v + " -> " + nucl_bound + " ... " + nucl_bound)
        # "\\/" spells the same backslash-slash operator as before, without
        # relying on the invalid escape sequence "\/".
        syl = hfst.regex("0 -> " + syl_bound + " \\/ " + nucl_bound + " " + c +
                         "* _ " + c + "* " + nucl_bound)
        surround = hfst.regex("?* -> " + syl_bound + " ... " + syl_bound +
                              " || .#. _ .#.")
        fill_nucl.compose(syl)
        fill_nucl.compose(surround)

        if self._fill_onset:
            # Filter out strings in which a symbol other than v directly
            # precedes a syllable boundary followed by a nucleus boundary.
            no_vowstart = hfst.regex("~[ $[ \\" + v + " " + syl_bound + " " +
                                     nucl_bound + " ] ]")
            fill_nucl.compose(no_vowstart)

        if self._sonorous:
            son_scale = self._alph.get_sonority_scale()
            # One bracketed expression per non-empty layer; the first layer of
            # the scale is skipped.
            scale = [
                "[ " + out_prefix + " " + _build_regex(layer) + " ]"
                for layer in son_scale[1:]
                if len(layer) != 0
            ]
            suffix = "".join(" " + layer + "*" for layer in scale)
            prefix = "".join(layer + "* " for layer in reversed(scale))

            son_filter = hfst.regex(syl_bound + " [ " + prefix + " " +
                                    nucl_bound + " " + v + " " + nucl_bound +
                                    " " + suffix + " " + syl_bound + " ]+ ")
            fill_nucl.compose(son_filter)

        fill_nucl.minimize()
        return fill_nucl
 def __init__(self):
     """Build the Cyrillic-to-Latin transliteration transducer self.tr.

     One replace rule is compiled per lower-case and upper-case Cyrillic
     letter, and all rules are composed into a single transducer.
     """
     letters_cyr = 'йцукенгшщзхъфывапролджэячсмитьбюё'
     # 'sh' must not carry a leading space: the capitalisation below
     # upper-cases the first character, which would otherwise be the space
     # and leave upper-case Ш mapped to lower-case "sh".
     letters_lat = ['j', 'c', 'u', 'k', 'e', 'n', 'g', 'sh', 'shch', 'z', 'kh', 'ie', 'f', 'y', 'v', 'a', 'p',
                    'r', 'o', 'l', 'd', 'zh', 'e', 'ia', 'ch', 's', 'm', 'i', 't', '0', 'b', 'iu', 'e']
     letters_cyr += letters_cyr.upper()
     letters_lat += [trans[0].upper() + trans[1:] for trans in letters_lat]
     regexes = [hfst.regex(' {0} -> {1} || _'.format(cyr, lat))
                for cyr, lat in zip(letters_cyr, letters_lat)]
     # compose() works in place, so the first rule accumulates all the others.
     tr = regexes[0]
     for reg in regexes[1:]:
         tr.compose(reg)
     self.tr = tr
Example #9
0
def get_fst(src):
    """Compile the twolc rules in src and return a lookup-optimized FST.

    src -- object with a ``name`` attribute naming the twolc rule file
    """
    # NOTE(review): only the final path components (src.name and
    # 'g2p_from_py.hfst') reach HFST, so the '../res/' directory is not
    # used -- confirm this is intended.
    compiled_rules = Path('../res/g2p_from_py.hfst')
    print('Compiling twolc rules...', file=sys.stderr)
    hfst.compile_twolc_file(src.name, compiled_rules.name,
                            resolve_left_conflicts=True)

    print('Preparing rule transducers for composition...', file=sys.stderr)
    all_rules = list(hfst.HfstInputStream(compiled_rules.name))

    print('Creating universal language FST...', file=sys.stderr)
    universal = hfst.regex('?* ;')
    print('Compose-intersecting rules with universal FST...', file=sys.stderr)
    universal.compose_intersect(all_rules)
    print('Optimizing for fast lookup...', file=sys.stderr)
    universal.lookup_optimize()
    return universal
Example #10
0
def get_all_forms(word,
                  pos,
                  language,
                  descrpitive=True,
                  limit_forms=-1,
                  filter_out=("#", "+Der", "+Cmp", "+Err")):
    """Extract the paths for word + pos from the analyzer of language.

    word -- string placed in curly braces in the query regex
    pos -- tag appended after a literal '+'
    language -- forwarded to get_transducer()
    descrpitive -- forwarded to get_transducer() (the spelling is part of
                   the public signature and is kept for callers)
    limit_forms -- passed to extract_paths() as max_number
    filter_out -- prefixes; alphabet symbols starting with any of them are
                  excluded from the part of the path that follows the tag

    Returns a list of (first field, float(second field)) tuples taken from
    the tab-separated path lines -- presumably (form, weight).
    """
    analyzer = get_transducer(language,
                              descrpitive=descrpitive,
                              analyzer=True,
                              convert_to_openfst=True,
                              cache=False)
    # The default is an immutable tuple rather than a shared mutable list;
    # any iterable of prefixes is still accepted.
    excluded_prefixes = tuple(filter_out)
    excluded = []
    flags = []
    for symbol in analyzer.get_alphabet():
        if symbol.startswith(excluded_prefixes):
            excluded.append(__regex_escape(symbol))
        if "@" in symbol and "@_" not in symbol:
            flags.append("\"" + symbol + "\"")

    if flags:
        flag_union = " [ " + " | ".join(flags)
        leading = flag_union + "]* "
        trailing_open = flag_union + " | "
        trailing_close = "]"
    else:
        leading = trailing_open = trailing_close = ""
    reg_text = (leading + "{" + word + "} %+" + pos + trailing_open +
                " [ ? -  [ " + " | ".join(excluded) + " ]]" +
                trailing_close + "*")

    # compose() modifies the analyzer in place.
    analyzer.compose(hfst.regex(reg_text))
    paths_text = analyzer.extract_paths(max_cycles=1,
                                        max_number=limit_forms,
                                        output='text')
    forms = []
    for line in paths_text.replace("@_EPSILON_SYMBOL_@", "").split("\n"):
        if not line:
            continue
        fields = line.split('\t')
        forms.append((fields[0], float(fields[1])))
    return forms
Example #11
0
 def apply(self,
           candidates,
           no_penalty=False,
           no_pardon=False,
           method="matching"):
     """
     Compose the candidate set with this constraint's FST, which inserts the
     violation marks, and optionally penalize the result.
     :param candidates: The FST generating the current candidate set (modified in place)
     :param no_penalty: Skip the penalize() step (losers are kept) if True
     :param no_pardon: Forwarded to penalize(); violation marks are kept if True
     :param method: The penalization method forwarded to penalize(), matching (default) or counting
     :return: The updated candidate set FST
     """
     constraint_fst = hfst.regex(self._regex)
     candidates.compose(constraint_fst)
     if no_penalty:
         return candidates
     penalize(candidates, n=self._n, no_pardon=no_pardon, method=method)
     return candidates
Example #12
0
    def generate(self):
        """Build and return the minimized candidate generator transducer.

        The generator strips syllable boundaries from the input, pairs each
        input symbol with an output symbol, optionally allows insertions and
        deletions, optionally restricts and syllabifies the result, and
        finally surrounds everything with word boundaries.
        """
        alph = _build_regex(self._alph.get_alphabet())
        mut = _build_regex(self._mut.get_alphabet())
        has_ignore = self._ignore != ""
        ignore = " | " + self._ignore if has_ignore else ""

        marked_input = "[ " + in_sym + " " + alph + " ]"
        marked_output = "[ " + out_sym + " " + mut + " ]"

        # Step 1: drop syllable boundaries from the input and tag every
        # input symbol with in_sym.
        gen = hfst.regex("[ " + syl_bound + ":0 | [ 0:" + in_sym + " " + alph +
                         " ]" + ignore + " ]*")

        # Step 2: one mutation step maps a tagged input symbol to a tagged
        # output symbol ...
        step = hfst.regex(marked_input + " 0:" + marked_output)
        # ... or passes an ignored character through (the leading " | " is
        # cut off the alternative again) ...
        if has_ignore:
            step.disjunct(hfst.regex(ignore[3:]))
        # ... or inserts a character ...
        if self._allow_ins:
            insertion = ("0:[ " + in_sym + " " + no_sym + " " + out_sym +
                         " " + mut + " ]")
            step.disjunct(hfst.regex(insertion))
        # ... or deletes an input character.
        if self._allow_del:
            deletion = marked_input + " 0:[ " + out_sym + " " + no_sym + " ]"
            step.disjunct(hfst.regex(deletion))

        # Repeat the mutation step and attach it to the input tagger.
        step.repeat_star()
        gen.compose(step)

        # Restrict insertions if desired
        if self._max_ins > 0:
            gen.compose(hfst.regex(at_most_n_of(ins_sym, self._max_ins)))

        # Insert syllable boundaries if required
        if self._syl is not None:
            gen.compose(self._syl.syllabify())

        # Insert word boundaries
        gen.compose(
            hfst.regex("?* -> " + word_bound + " ... " + word_bound +
                       " || .#. _ .#."))

        gen.minimize()
        return gen
Example #13
0
def serial_compile(regexs):
    """Compile each regex and compose the results, smallest FSTs first.

    regexs -- iterable of regular expression strings for hfst.regex()

    The two transducers with the fewest states are composed (in place, into
    the smaller one) and the result is put back into the queue until a
    single transducer remains, which is returned. An empty input raises
    IndexError, as there is nothing to return.
    """

    def state_count(fst):
        return fst.number_of_states()

    # Compile each rule individually, ordered by number of states.
    queue = sorted((hfst.regex(regex) for regex in regexs), key=state_count)

    # Compose smallest two, move resulting FST back and re-sort.
    while len(queue) > 1:
        smallest, second = queue[0], queue[1]
        smallest.compose(second)
        queue = queue[2:] + [smallest]
        queue.sort(key=state_count)

    return queue[0]
Example #14
0
def shuffle_with_zeros(string, target_length):
    """Return a fsa accepting string with zeros inserted in all possible ways

    string -- the string to which zero symbols (Ø) are inserted

    target_length -- how long the strings after insertions must be

    When string is shorter than target_length, the missing number of zero
    symbols is shuffled into it; otherwise the fsa is built from string
    alone. The fsa is minimized and named after string.
    """
    result_fsa = hfst.fst(string)
    missing = target_length - len(string)
    if missing > 0:
        # 'Ø Ø ... Ø': a sequence of exactly `missing` zero symbols.
        zeros_regex = ' '.join(missing * 'Ø')
        result_fsa.shuffle(hfst.regex(zeros_regex))
    result_fsa.minimize()
    result_fsa.set_name(string)
    if cfg.verbosity >= 30:
        print("shuffle_with_zeros:")
        print(result_fsa)
    return result_fsa
Example #15
0
def shuffle_with_zeros(string, target_length):
    """Return a fsa accepting string with zeros inserted in all possible ways

    string -- the string to which zero symbols (Ø) are inserted

    target_length -- how long the strings after insertions must be

    The length of string is counted with grapheme.length(). When it is
    smaller than target_length, the missing number of zero symbols is
    shuffled into the fsa. The fsa is minimized and named after string.
    """
    # fs.string_to_fsa is used instead of hfst.fst(string), which is not
    # correct for composed graphemes.
    result_fsa = fs.string_to_fsa(string)
    missing = target_length - grapheme.length(string)
    if missing > 0:
        # "Ø Ø ... Ø": a sequence of exactly `missing` zero symbols.
        zeros_regex = " ".join(missing * "Ø")
        result_fsa.shuffle(hfst.regex(zeros_regex))
    result_fsa.minimize()
    result_fsa.set_name(string)
    if cfg.verbosity >= 30:
        print("shuffle_with_zeros:")
        print(result_fsa)
    return result_fsa
Example #16
0
def penalize(candidates, n=10, no_pardon=False, method="matching"):
    """
    Remove losing candidates from the candidate set (in place).
    :param candidates: Current candidate set
    :param n: The penalization precision for the counting approach
    :param no_pardon: Do not call pardon() on the result if True
    :param method: "counting" selects the counting approach; any other value
                   (default "matching") selects the matching approach
    :return: Updated candidate set FST
    """
    if method == "counting":
        # Leniently compose with "exactly i marks" for i = n down to 0.
        for mark_count in range(n, -1, -1):
            candidates.lenient_composition(
                hfst.regex(only_n_of(mark_sym, mark_count)))
    else:
        # Remove modifications of gen, keep input characters and violation marks
        strip = hfst.regex("[ [ " + in_sym + ":0 [ " + no_sym +
                           ":0 .P. ? ] ]" + " | [ " + out_sym + " ? ]:0 | " +
                           bound_syms + ":0 | " + mark_sym + " ]*")
        # Insert at least one violation mark into the string
        insert_marks = hfst.regex("[ ?* 0:" + mark_sym + "+ ?* ]+")
        # Randomly insert new output characters
        mutate_output = hfst.regex("[ ? | 0:? ]*")
        # Randomly scatter violation marks throughout the string
        permute1 = hfst.regex("[ ?* " + mark_sym + ":0 ?* 0:" + mark_sym +
                              " ?* ]*")
        permute2 = hfst.regex("[ ?* 0:" + mark_sym + " ?* " + mark_sym +
                              ":0 ?* ]*")

        # Build the set of strictly worse candidates stage by stage ...
        worse = candidates.copy()
        for stage in (strip, insert_marks, permute1, permute2, mutate_output):
            worse.compose(stage)
        # ... and subtract it from the actual candidates.
        candidates.subtract(worse)
        candidates.minimize()

    if no_pardon:
        return candidates
    pardon(candidates)
    return candidates
Example #17
0
    if not hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.TROPICAL_OPENFST_TYPE):
        sys.exit(77)
    hfst.set_default_fst_type(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
else:
    raise RuntimeError('implementation format not recognized')

# Read transducers from an input stream opened without a filename until EOF.
transducers = []
istr = hfst.HfstInputStream()
while not istr.is_eof():
    transducers.append(istr.read())
istr.close()

if len(transducers) != 3:
    raise RuntimeError('Wrong number of transducers read.')

# Each transducer must be equivalent to its expected regular expression.
for transducer, expression in zip(transducers, ['föö:bär', '0', '0-0']):
    if not transducer.compare(hfst.regex(expression)):
        raise RuntimeError('Transducers are not equivalent.')

# Print all transducers in AT&T format, separated by '--' lines.
f = sys.stdout
for position, transducer in enumerate(transducers):
    if position > 0:
        f.write('--\n')
    transducer.write_att(f)
Example #18
0
# -*- coding: utf-8 -*-
import hfst

import sys
# Map the command-line backend name to its HFST implementation type.
backend_types = {
    'sfst': hfst.ImplementationType.SFST_TYPE,
    'foma': hfst.ImplementationType.FOMA_TYPE,
    'openfst': hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
}
requested_backend = sys.argv[1]
if requested_backend not in backend_types:
    raise RuntimeError('implementation format not recognized')
if not hfst.HfstTransducer.is_implementation_type_available(
        backend_types[requested_backend]):
    # Exit status 77 when the requested backend is not compiled in.
    sys.exit(77)
hfst.set_default_fst_type(backend_types[requested_backend])

tr1, tr2, tr3 = (hfst.regex(expression)
                 for expression in ('föö:bär', '0', '0-0'))

# Write the three transducers to an output stream opened without a filename.
ostr = hfst.HfstOutputStream()
for transducer in (tr1, tr2, tr3):
    ostr.write(transducer)
ostr.flush()
ostr.close()
Example #19
0
# Distances between any two consonants:
cclist = featmetr(consonants, consonants, posdist, adist, adist)

vowl = sorted(vowels.keys())
cons = sorted(consonants.keys())
letters = sorted(vowl + cons)

# Each entry below is a weighted regex fragment "...::weight".
# Deletion of a letter possible at a fairly high cost (3):
dellist = ['{}:Ø::3'.format(l) for l in letters]
# Insertion of a letter possible at a fairly high cost (3):
epelist = ['Ø:{}::3'.format(l) for l in letters]
# Doubling only after the letter, not before (cost 2):
dbllist = ['{0} Ø:{0}::2'.format(l) for l in letters]
# Shortening the second of two identical letters only (cost 2):
sholist = ['{0} {0}:Ø::2'.format(l) for l in letters]
# Individual treatment of some pairs or sequences:
speclist = [
    'k:c::0 k::0', 'k:x s:Ø::0', 't:d s:z::0', 'Ø:d s:z::3',
    'i:j::1', 'j:i::1', 'i j:Ø::0', 'i i:j::0',
    'f:p Ø:h::0', 'u:v::1', 'v:u::1', 'u:w::1', 'k:c::1',
    '[o:Ø o:?]::5', '[ö:Ø ö:?]::5',
]

# NOTE(review): `all` and `re` shadow the builtin and the stdlib module name;
# the names are kept here because later code may refer to them.
all = vvlist + cclist + dbllist + sholist + dellist + epelist + speclist
re = '[' + ' | '.join(all) + ']*'
print(re) ##
algfst = hfst.regex(re)

# Store the compiled alignment transducer.
algfile = hfst.HfstOutputStream(filename="chardist.fst")
algfile.write(algfst)
algfile.flush()
algfile.close()
Example #20
0
    def build(self, verbosity=1):
        """
        Build the tableau FST from the submitted gen and constraints.

        The result is stored in self._runnable. For every constraint a pair of
        snapshots (before penalization, after penalization) is appended to
        self._stepwise.
        :param verbosity: Amount of information to be printed during building. 0 = print nothing,
                          1 = print progress in single line (default), 2+ = print time and FST size
                          for each constraint
        """
        start = time.time()

        # Copy gen while it is not lookup-optimized, then optimize the
        # original again; all further work happens on the copy.
        self._gen.remove_optimization()
        self._runnable = self._gen.copy()
        self._optimize_lookup(self._gen)
        if verbosity > 1:
            print("Gen: %d states, %d arcs" %
                  (self._runnable.number_of_states(),
                   self._runnable.number_of_arcs()),
                  flush=True)

        n = len(self._constraints)
        for (i, constraint) in enumerate(self._constraints):
            c_start = time.time()
            if verbosity == 1:
                # "\r" rewrites the same console line; i is the 0-based index.
                print("\rApplying constraints... (%d/%d)" % (i, n),
                      end="",
                      flush=True)
            elif verbosity > 1:
                print("Constraint %d: " % i, end="", flush=True)
            # Step 1: insert violation marks only (no penalization yet).
            constraint.apply(self._runnable, no_penalty=True)
            self._runnable.minimize()
            # Snapshot with marks, before losers are removed.
            before = self._runnable.copy()
            self._optimize_lookup(before)
            # Step 2: remove losers but keep the violation marks.
            penalize(self._runnable,
                     constraint.n(),
                     no_pardon=True,
                     method=self._penal_method)
            self._runnable.minimize()
            # Snapshot after penalization, marks still present.
            after = self._runnable.copy()
            self._optimize_lookup(after)
            # Step 3: pardon the remaining candidates before the next constraint.
            pardon(self._runnable)
            self._runnable.minimize()
            self._stepwise.append((before, after))
            if verbosity > 1:
                c_end = time.time()
                print("%d states, %d arcs (%.2f sec.)" %
                      (self._runnable.number_of_states(),
                       self._runnable.number_of_arcs(), c_end - c_start),
                      flush=True)

        # Final clean-up: rewrite out_prefix, adjacent word/syllable boundary
        # pairs and nucl_bound to 0, then rewrite no_sym to 0.
        finish = hfst.regex(out_prefix + " | " + word_bound + " " + syl_bound +
                            " | " + syl_bound + " " + word_bound + " | " +
                            nucl_bound + " -> 0")
        finish2 = hfst.regex(no_sym + " -> 0")
        finish.compose(finish2)
        self._runnable.compose(finish)
        self._runnable.minimize()
        if verbosity > 1:
            print("Final: %d states, %d arcs" %
                  (self._runnable.number_of_states(),
                   self._runnable.number_of_arcs()),
                  flush=True)
        self._optimize_lookup(self._runnable)

        end = time.time()
        if verbosity > 0:
            # In single-line mode, return to the line start before the summary.
            if verbosity == 1: print("\r", end="")
            print("Build complete in %.2f seconds." % (end - start),
                  flush=True)
Example #21
0
# Collect the implementation types that are available in this HFST build.
types = [
    candidate
    for candidate in (hfst.ImplementationType.SFST_TYPE,
                      hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
                      hfst.ImplementationType.FOMA_TYPE)
    if hfst.HfstTransducer.is_implementation_type_available(candidate)
]

for type in types:
    if hfst.HfstTransducer.is_implementation_type_available(type):

        hfst.set_default_fst_type(type)

        # StreamIsClosedException: writing to a stream after close().
        try:
            tr = hfst.regex('foo')
            outstr = hfst.HfstOutputStream(filename='testfile')
            outstr.close()
            outstr.write(tr)
        except hfst.exceptions.StreamIsClosedException:
            print("Could not write transducer: stream to file was closed.")

        # TransducerIsCyclicException: extracting paths without a cycle limit.
        transducer = hfst.regex('[a:b]*')
        try:
            results = transducer.extract_paths(output='text')
            print("The transducer has %i paths:" % len(results))
            print(results)
        except hfst.exceptions.TransducerIsCyclicException:
            print("The transducer is cyclic and has an infinite number of paths. Some of them:")
            # Retry with a bound on how often cycles may be followed.
            results = transducer.extract_paths(output='text', max_cycles=5)
Example #22
0
def expr(e):
    """Compile the XFST regular expression e and return the minimized FST"""
    compiled = hfst.regex(e)
    compiled.minimize()
    return compiled
Example #23
0
 def boundary(self, ast):
     """Return the FST of the symbol END, named ".#."; ast is not used."""
     end_fst = hfst.regex("END")
     end_fst.set_name(".#.")
     return end_fst
Example #24
0
def delete_aux(fst):
    """Compose fst in place with a rule rewriting the auxiliary symbols to 0."""
    aux_symbols = ('"<P>"', '"<S>"', '"<E>"', '"<D>"', '"<.>"')
    deletion_rule = '[ ' + ' | '.join(aux_symbols) + ' ] -> 0'
    fst.compose(hfst.regex(deletion_rule))
Example #25
0
# -*- coding: utf-8 -*-
import sys
if len(sys.argv) > 1:
    sys.path.insert(0, sys.argv[1])
import hfst

for type in [hfst.ImplementationType.SFST_TYPE, hfst.ImplementationType.TROPICAL_OPENFST_TYPE, hfst.ImplementationType.FOMA_TYPE]:
    if hfst.HfstTransducer.is_implementation_type_available(type):

        hfst.set_default_fst_type(type)
        # (regex, expected output of the single lookup result for 'foo')
        for expression, expected in (
                ('[foo:bar] | [?:B ?:A ?:R]', 'bar'),
                ('[f:0 o:0 o:foo]', '@_EPSILON_SYMBOL_@@_EPSILON_SYMBOL_@foo'),
                ('[foo:bar]|[f:B o:A o:R]', 'bar')):
            tr = hfst.regex(expression)
            result = tr.lookup('foo')
            assert len(result) == 1
            assert result[0][0] == expected
Example #26
0
def init():
    """Initializes the module by computing several common FSTs
    
    Assumes that twexamp.read_fst() has read in cfg.examples_fst and
    initialized some symbol sets.

    Sets the module globals pistar_fst, pistar_fsa, diamond_sym, diamond_fst,
    trim_pre_fst and trim_post_fst, and stores copies of cfg.all_pairs_fst
    in cfg.definitions under "PAIRS" and "PI".
    """
    global pistar_fst, pistar_fsa, diamond_sym, diamond_fst
    global trim_pre_fst, trim_post_fst

    assert cfg.examples_fst, "cfg.examples_fst not loaded (by twexamp module)"

    cfg.definitions["PAIRS"] = cfg.all_pairs_fst.copy()
    cfg.definitions["PI"] = cfg.all_pairs_fst.copy()

    diamond_sym = 'DIAMOND'
    diamond_fst = hfst.regex(diamond_sym)
    # pi = one symbol pair; pistar = any sequence of symbol pairs.
    pi_fst = cfg.all_pairs_fst.copy()
    pistar_fst = cfg.all_pairs_fst.copy()
    pistar_fst.repeat_star()
    pistar_fst.remove_epsilons()
    pistar_fst.minimize()
    pistar_fsa = hfst.fst_to_fsa(pistar_fst, separator='^')
    # Input-side and output-side projections; each is taken on a fresh copy
    # because the projection methods are called on the copy itself.
    pi_in_fst = pi_fst.copy()
    pi_in_fst.input_project()
    pi_out_fst = pi_fst.copy()
    pi_out_fst.output_project()
    pi_in_star_fst = pistar_fst.copy()
    pi_in_star_fst.input_project()
    pi_out_star_fst = pistar_fst.copy()
    pi_out_star_fst.output_project()
    if cfg.verbosity >= 20:
        twbt.ppfst(pistar_fst, title="pistar_fst")

    # trim_pre_fst is assembled piecewise from fs helpers; the commented
    # XRC expression below shows the intended regular expression.
    fst1 = fs.star(fs.crossprod(fs.expr("ZERO"), pi_in_fst))
    fst2 = fs.star(fs.concat(fst1, fs.expr("ZERO:BEGIN")))
    fst3 = fs.concat(fst2, pi_in_star_fst)
    fst4 = fs.star(
        fs.concat(fs.expr("ZERO:END"),
                  fs.star(fs.crossprod(fs.expr("ZERO"), pi_in_fst))))
    trim_pre_fst = fs.concat(fst3, fst4)
    trim_pre_fst.set_name("trim_pre_fst")
    #trim_pre_fst =  XRC.compile(
    #    "[[ZERO .x. [PI].u]* ZERO:BEGIN]* " \
    #    "[[PI].u]* " \
    #    "[ZERO:END [ZERO .x. [PI].u]*]*"
    #)

    # trim_post_fst mirrors trim_pre_fst using the output-side projections.
    fst1 = fs.star(fs.crossprod(pi_out_fst, fs.expr("ZERO")))
    fst2 = fs.star(fs.concat(fst1, fs.expr("BEGIN:ZERO")))
    fst3 = fs.concat(fst2, pi_out_star_fst)
    fst4 = fs.star(
        fs.concat(fs.expr("END:ZERO"),
                  fs.star(fs.crossprod(pi_out_fst, fs.expr("ZERO")))))
    trim_post_fst = fs.concat(fst3, fst4)
    trim_post_fst.set_name("trim_post_fst")
    #trim_post_fst = XRC.compile(
    #    "[[[PI].l .x. ZERO]* BEGIN:ZERO]* " \
    #    "[[PI].l]* " \
    #    "[END:ZERO [[PI].l .x. ZERO]*]*"
    #)
    if cfg.verbosity >= 20:
        twbt.ppfst(trim_pre_fst)
        twbt.ppfst(trim_post_fst)
    return
    for tr in r:
        transducers.append(tr)

assert(f.closed)
assert(len(transducers)) == 4

transducers = []

# A malformed AT&T file: reading stops with NotValidAttFormatException, but
# the transducers read before the bad line stay in the list.
with open('testfile_fail.att', 'r') as f:
    try:
        r = hfst.AttReader(f, "<eps>")
        for tr in r:
            transducers.append(tr)
    except hfst.exceptions.NotValidAttFormatException as e:
        complaint = e.what()
        assert "1      baz    baz      0.3" in complaint
        assert "line: 11" in e.what()

assert f.closed
assert len(transducers) == 4

transducers = []
# A file with non-ASCII symbols, read with the default epsilon symbol.
with open('testfile_unicode.att', 'r') as f:
    r = hfst.AttReader(f)
    for tr in r:
        transducers.append(tr)

assert f.closed
assert len(transducers) == 1
TR = hfst.regex('föö:bär::0.5')
assert TR.compare(transducers[0])
Example #28
0
 def test_fst(input, result):
     """Raise RuntimeError unless hfst.fst(input) equals the regex result."""
     if hfst.fst(input).compare(hfst.regex(result)):
         return
     raise RuntimeError('test_fst failed with input: ' + input)
Example #29
0
    print('HERE!!!')

for type in types:

    print('\n--- Testing implementation type %s ---\n' % hfst.fst_type_to_string(type))

    hfst.set_default_fst_type(type)

    tr1 = None
    tr2 = None
    tr3 = None

    type_ = hfst.ImplementationType.TROPICAL_OPENFST_TYPE
    ostr = hfst.HfstOutputStream(filename='foobar.hfst', type=type_)

    tr_ = hfst.regex('{foo}:{bar}::0.5')
    tr_.convert(type_)

    ostr.write(tr_)
    ostr.write(tr_)
    ostr.flush()
    ostr.close()

    if not os.path.isfile('foobar.hfst'):
        raise RuntimeError('Missing file: foobar.hfst')

    istr = hfst.HfstInputStream('foobar.hfst')
    numtr = 0
    try:
        tr1 = istr.read()
        numtr += 1
Example #30
0
# Register each backend in `types` only when this HFST build provides it.
# (`types` itself is created before the visible part of this snippet.)
if hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.TROPICAL_OPENFST_TYPE):
    types.append(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
if hfst.HfstTransducer.is_implementation_type_available(hfst.ImplementationType.FOMA_TYPE):
    types.append(hfst.ImplementationType.FOMA_TYPE)

from hfst.xerox_rules import *
from hfst import regex

# Check, for every available backend, that replace() on a Rule gives the same
# transducer as the corresponding replace-rule regex.
for type in types:
    if hfst.HfstTransducer.is_implementation_type_available(type):

        hfst.set_default_fst_type(type)

        rule = Rule() # just testing the default constructor

        # Single mapping a -> b; the second argument of replace() selects
        # the optional variant "(->)".
        mapping = ( (regex('a'),regex('b')), )
        rule = Rule(mapping)
        assert(replace(rule, False).compare(regex('a -> b')))
        assert(replace(rule, True).compare(regex('a (->) b')))

        # Two parallel mappings: a -> b and b -> a.
        mapping = ( (regex('a'),regex('b')), (regex('b'),regex('a')) )
        rule = Rule(mapping)
        assert(replace(rule, False).compare(regex('a -> b, b -> a')))
        assert(replace(rule, True).compare(regex('a (->) b, b (->) a')))

        # Each entry pairs a ReplaceType with the context operator that
        # spells it in regex syntax.
        for repl_type in [(ReplaceType.REPL_UP, '||'), (ReplaceType.REPL_DOWN, '\\/'), (ReplaceType.REPL_LEFT, '\\\\'), (ReplaceType.REPL_RIGHT,'//')]:

            mapping1 = ( (regex('a'),regex('b')), )
            context1 = ( (regex('c'),regex('c')), )

            # Rule a -> b with context (c, c) and the current replace type.
            rule1 = Rule(mapping1, context1, repl_type[0])
Example #31
0
    types.append(hfst.ImplementationType.FOMA_TYPE)

for type in types:

    print('\n--- Testing implementation type %s ---\n' % hfst.fst_type_to_string(type))

    hfst.set_default_fst_type(type)

    tr1 = None
    tr2 = None
    tr3 = None

    type_ = hfst.ImplementationType.TROPICAL_OPENFST_TYPE
    ostr = hfst.HfstOutputStream(filename='foobar.hfst', type=type_)

    tr_ = hfst.regex('{foo}:{bar}::0.5')
    tr_.convert(type_)

    ostr.write(tr_)
    ostr.write(tr_)
    ostr.flush()
    ostr.close()

    if not os.path.isfile('foobar.hfst'):
        raise RuntimeError('Missing file: foobar.hfst')

    istr = hfst.HfstInputStream('foobar.hfst')
    numtr = 0
    try:
        tr1 = istr.read()
        numtr += 1
Example #32
0
def expand(fst):
    """Compose fst in place with a rule expanding "<S>" into five symbols."""
    expansion = ' '.join(('"<S>"', '"<E>"', '"<.>"', '"<E>"', '"<S>"'))
    fst.compose(hfst.regex('"<S>" -> [ ' + expansion + ' ]'))
Example #33
0
# Every listed exception class must be constructible from three positional
# arguments; `e` ends up bound to the last instance created.
for _exception_name in (
        'ContextTransducersAreNotAutomataException',
        'TransducersAreNotAutomataException',
        'StateIndexOutOfBoundsException',
        'TransducerHeaderException',
        'MissingOpenFstInputSymbolTableException',
        'TransducerTypeMismatchException',
        'EmptySetOfContextsException',
        'SpecifiedTypeRequiredException',
        'HfstFatalException',
        'TransducerHasWrongTypeException',
        'IncorrectUtf8CodingException',
        'EmptyStringException',
        'SymbolNotFoundException',
        'MetadataException',
        'FlagDiacriticsAreNotIdentitiesException'):
    e = getattr(hfst.exceptions, _exception_name)('foo', 'bar', 10)

import hfst

# Test that importing exceptions via a package works: concatenating a foma
# transducer with a tropical OpenFst one is expected to raise
# hfst.exceptions.TransducerTypeMismatchException. The check only runs when
# both implementation types are available.
if (hfst.HfstTransducer.is_implementation_type_available(
        hfst.ImplementationType.FOMA_TYPE)
        and hfst.HfstTransducer.is_implementation_type_available(
            hfst.ImplementationType.TROPICAL_OPENFST_TYPE)):
    try:
        foo = hfst.regex('foo')
        bar = hfst.regex('bar')
        foo.convert(hfst.ImplementationType.FOMA_TYPE)
        bar.convert(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
        foo.concatenate(bar)
        # Raise explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let a missing exception go
        # unnoticed. The exception type is the same as before.
        raise AssertionError(
            'TransducerTypeMismatchException was not raised')
    except hfst.exceptions.TransducerTypeMismatchException as e:
        # Expected outcome: the type mismatch was detected.
        pass

Example #34
0
 def test_fst(input, result):
     """Check that ``hfst.fst(input)`` equals ``hfst.regex(result)``.

     Args:
         input: value handed unchanged to ``hfst.fst()``.
         result: regular expression, compiled with ``hfst.regex()``, that
             the transducer built from *input* must be equivalent to.

     Raises:
         RuntimeError: if the two transducers do not compare equal.
     """
     built = hfst.fst(input)
     expected = hfst.regex(result)
     if not built.compare(expected):
         # Format rather than concatenate: hfst.fst() is documented to accept
         # lists, tuples and dicts as well as strings, and `str + input`
         # would raise TypeError for those, hiding the real failure. For a
         # str input the message is unchanged.
         raise RuntimeError('test_fst failed with input: {}'.format(input))
Example #35
0
 def symbol_or_pair(self, ast):
     """Build an FST for a symbol pair, a defined name or a plain symbol.

     The stripped ``ast.token`` is interpreted in one of two ways:

     * ``up:lo`` (either side may be empty). Non-empty sides must be in
       ``cfg.input_symbol_set`` / ``cfg.output_symbol_set``, and a full pair
       must be in ``cfg.symbol_pair_set``. A full pair is compiled directly;
       a one-sided pair is composed with ``cfg.all_pairs_fst``; a bare colon
       gives a copy of ``cfg.all_pairs_fst`` named "PI". Braces in the upper
       side are escaped with ``%`` before compilation.
     * a plain name: a copy of ``cfg.definitions[name]`` if defined,
       otherwise a compiled symbol when the name is in both symbol sets or
       is ``BEGIN``/``END``.

     Returns:
         The resulting FST, named after the token (or "PI").

     Raises:
         FailedSemantics: if the token fits neither form or uses symbols
             outside the alphabet; the message is also stored in
             ``cfg.error_message``.
     """
     token = ast.token.strip()
     pair_re = re.compile(
         r"""^
     (?P<up>[a-zšžåäöüõA-ZÅÄÖ0-9'´`]*
      |
      \{[a-zåäöüõA-ZÅÄÖØ'´`]+\})
     :
     (?P<lo>[a-zšžåäöüõA-ZÅÄÖØ'´`]*)
     $""", re.X)
     pair = pair_re.match(token)
     if pair is not None:  # the token contains a colon
         up = pair.group("up")
         up_quoted = re.sub(r"([{}])", r"%\1", up)
         lo = pair.group("lo")

         # Collect every alphabet violation so that one message reports all.
         problems = []
         if up and up not in cfg.input_symbol_set:
             problems.append("input symbol '{}'".format(up))
         if lo and lo not in cfg.output_symbol_set:
             problems.append("output symbol '{}'".format(lo))
         if up and lo and (up, lo) not in cfg.symbol_pair_set:
             problems.append("symbol pair '{}'".format(token))
         if problems:
             cfg.error_message = " and ".join(problems) + " not in alphabet"
             raise FailedSemantics(cfg.error_message)

         fst_name = token
         if up and lo:
             # Fully specified pair.
             pair_fst = hfst.regex(up_quoted + ':' + lo)
         elif up:
             # Only the input side is given: restrict all pairs by it.
             pair_fst = hfst.regex(up_quoted)
             pair_fst.compose(cfg.all_pairs_fst)
         elif lo:
             # Only the output side is given.
             pair_fst = cfg.all_pairs_fst.copy()
             pair_fst.compose(hfst.regex(lo))
         else:
             # A bare colon stands for any pair.
             pair_fst = cfg.all_pairs_fst.copy()
             fst_name = "PI"
         pair_fst.set_name(fst_name)
         return pair_fst

     if re.fullmatch(r"[a-zåäöšžüõA-ZÅÄÖØ'´`]+", token):
         # Either a defined name or a symbol valid on both levels.
         if token in cfg.definitions:
             defined_fst = cfg.definitions[token].copy()
             defined_fst.set_name(token)
             return defined_fst
         on_both_levels = (token in cfg.output_symbol_set
                           and token in cfg.input_symbol_set)
         if on_both_levels or token in {'BEGIN', 'END'}:
             symbol_fst = hfst.regex(token)
             symbol_fst.set_name(token)
             return symbol_fst

     # NOTE(review): "definend" looks like a typo for "defined"; the message
     # is kept as is because it is runtime text.
     cfg.error_message = "'" + token + "' is an invalid pair/definend symbol"
     raise FailedSemantics(cfg.error_message)
Example #36
0
def separators(fst):
    """Compose *fst* with the replace rule ``0 -> "<S>"``.

    The rule is compiled with ``hfst.regex()`` and applied through
    ``fst.compose()``, whose return value is discarded; the function itself
    returns None.
    """
    separator_rule = hfst.regex('0 -> "<S>"')
    fst.compose(separator_rule)
    # NOTE(review): this excerpt starts inside an if/elif chain that selects
    # the default implementation type from sys.argv[1]; the opening `if`
    # branch is not visible here.
    hfst.set_default_fst_type(hfst.ImplementationType.SFST_TYPE)
elif sys.argv[1] == 'foma':
    if not hfst.HfstTransducer.is_implementation_type_available(
            hfst.ImplementationType.FOMA_TYPE):
        # Requested backend is not available: exit with status 77
        # (presumably the test harness's "skipped" code -- confirm).
        sys.exit(77)
    hfst.set_default_fst_type(hfst.ImplementationType.FOMA_TYPE)
elif sys.argv[1] == 'openfst':
    if not hfst.HfstTransducer.is_implementation_type_available(
            hfst.ImplementationType.TROPICAL_OPENFST_TYPE):
        # Same exit status as for the other unavailable backends.
        sys.exit(77)
    hfst.set_default_fst_type(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
else:
    # Any other argument value is an error.
    raise RuntimeError('implementation format not recognized')

# Read AT&T-format transducers from standard input until hfst signals the end
# of the stream with EndOfStreamException.
transducers = []

try:
    while True:
        transducers.append(hfst.read_att_transducer(sys.stdin))
except hfst.exceptions.EndOfStreamException:
    # Expected terminator of the read loop, not an error.
    pass

# Regular expressions the transducers must be equivalent to, in input order.
expected_regexes = ['föö:bär', '0', '0-0']

if len(transducers) != len(expected_regexes):
    raise RuntimeError('Wrong number of transducers read.')

# Pair each transducer with its expected regex instead of keeping a manual
# index, and avoid naming the loop variable `re`, which would shadow the
# standard-library module of that name at module scope.
for transducer, expected in zip(transducers, expected_regexes):
    if not transducer.compare(hfst.regex(expected)):
        raise RuntimeError('Transducers are not equivalent.')
Example #38
0
def double(fst, chars):
    """Compose *fst* with two rules built around the "<D>" marker.

    The first rule inserts "<D>" "<.>" after an "<S>" that is followed by a
    symbol other than "<S>", and at the very beginning of the string. The
    second is a parallel rule with one clause per entry of *chars*: each
    clause rewrites "<D>" "<.>" c as c "<.>" c.

    Args:
        fst: transducer to compose with; ``fst.compose()`` is called twice
            and its return values are discarded.
        chars: iterable of symbols, interpolated verbatim into the rule text.
    """
    insert_marker = '0 -> "<D>" "<.>" || "<S>" _ [ ? - "<S>" ] ,, 0 -> [ "<D>" "<.>" ] || .#. _ '
    fst.compose(hfst.regex(insert_marker))

    clause_template = '"<D>" "<.>" %s -> %s "<.>" %s'
    clauses = []
    for symbol in chars:
        clauses.append(clause_template % (symbol, symbol, symbol))
    fst.compose(hfst.regex(' ,, '.join(clauses)))
Example #39
0
import argparse

# Epsilon as defined by hfst, and the quoted padding symbol.
eps = hfst.EPSILON
pad = '"<P>"'

# Two-element tuples repeating the same symbol on both sides.
eps_pair = (eps, eps)
pad_pair = (pad, pad)

tok = hfst.HfstTokenizer()

# Closure over five weighted alternatives: `?` and `0:0` with weight 0, and
# `?:?`, `0:?` and `?:0` with weight 1 each.
levenshtein = hfst.regex('[ ?::0 | ?:?::1 | 0:?::1 | ?:0::1 | 0:0::0 ]*')

# Replacement table used by clean(): every key is replaced by its value.
cldict = {'\\': '\\\\', '\x84': ''}


def clean(s):
    """Return *s* with every key of ``cldict`` replaced by its value.

    With the table above each backslash is doubled and each U+0084
    character is removed. Replacements are applied one after another, in
    the iteration order of ``cldict``.
    """
    cleaned = s
    for target in cldict:
        cleaned = cleaned.replace(target, cldict[target])
    return cleaned
Example #40
0
# -*- coding: utf-8 -*-
import sys
if len(sys.argv) > 1:
    sys.path.insert(0, sys.argv[1])
import hfst

# For every implementation type that is available, read the transducers
# stored in cats_and_dogs.prolog one after another, compare each with the
# regular expression it is expected to equal, and write it to the file 'tmp'
# in prolog format followed by a newline.
# NOTE(review): in the visible code `type` is only used for the availability
# check (it also shadows the builtin), `re` shadows the standard-library
# module name, and neither `f` nor `F` is closed; the rest of the loop body
# is not visible in this excerpt.
for type in [hfst.ImplementationType.SFST_TYPE, hfst.ImplementationType.TROPICAL_OPENFST_TYPE, hfst.ImplementationType.FOMA_TYPE]:
    if hfst.HfstTransducer.is_implementation_type_available(type):
        
        f = open('cats_and_dogs.prolog', 'r')
        F = open('tmp', 'w')
        
        # First transducer in the file: expected to equal {cat}.
        tr = hfst.read_prolog_transducer(f)
        re = hfst.regex('{cat}')
        assert(tr.compare(re))
        tr.write_prolog(F, True)
        F.write('\n')
        
        # Second transducer: expected to equal `0 - 0`.
        tr = hfst.read_prolog_transducer(f)
        re = hfst.regex('0 - 0')
        assert(tr.compare(re))
        tr.write_prolog(F, True)
        F.write('\n')
        
        # Third transducer: expected to equal {dog}:{cat} with weight 0.5.
        tr = hfst.read_prolog_transducer(f)
        re = hfst.regex('{dog}:{cat}::0.5')
        assert(tr.compare(re))
        tr.write_prolog(F, True)
        F.write('\n')
        
        tr = hfst.read_prolog_transducer(f)
# Instantiate each exception class through the hfst.exceptions namespace,
# passing three positional arguments. Each assignment rebinds `e`, so only
# the last instance stays bound.
# NOTE(review): the arguments look like (name, file, line) -- confirm against
# the hfst.exceptions API.
# NOTE(review): in this excerpt `import hfst` only appears after these lines;
# presumably an earlier import exists above the visible part.
e = hfst.exceptions.StateIndexOutOfBoundsException('foo', 'bar', 10)
e = hfst.exceptions.TransducerHeaderException('foo', 'bar', 10)
e = hfst.exceptions.MissingOpenFstInputSymbolTableException('foo', 'bar', 10)
e = hfst.exceptions.TransducerTypeMismatchException('foo', 'bar', 10)
e = hfst.exceptions.EmptySetOfContextsException('foo', 'bar', 10)
e = hfst.exceptions.SpecifiedTypeRequiredException('foo', 'bar', 10)
e = hfst.exceptions.HfstFatalException('foo', 'bar', 10)
e = hfst.exceptions.TransducerHasWrongTypeException('foo', 'bar', 10)
e = hfst.exceptions.IncorrectUtf8CodingException('foo', 'bar', 10)
e = hfst.exceptions.EmptyStringException('foo', 'bar', 10)
e = hfst.exceptions.SymbolNotFoundException('foo', 'bar', 10)
e = hfst.exceptions.MetadataException('foo', 'bar', 10)
e = hfst.exceptions.FlagDiacriticsAreNotIdentitiesException('foo', 'bar', 10)

import hfst

# Test that importing exceptions via a package works: concatenating a foma
# transducer with a tropical OpenFst one is expected to raise
# hfst.exceptions.TransducerTypeMismatchException. The check only runs when
# both implementation types are available.
if hfst.HfstTransducer.is_implementation_type_available(
        hfst.ImplementationType.FOMA_TYPE
) and hfst.HfstTransducer.is_implementation_type_available(
        hfst.ImplementationType.TROPICAL_OPENFST_TYPE):
    try:
        foo = hfst.regex('foo')
        bar = hfst.regex('bar')
        foo.convert(hfst.ImplementationType.FOMA_TYPE)
        bar.convert(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
        foo.concatenate(bar)
        # Raise explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let a missing exception go
        # unnoticed. The exception type is the same as before.
        raise AssertionError(
            'TransducerTypeMismatchException was not raised')
    except hfst.exceptions.TransducerTypeMismatchException as e:
        # Expected outcome: the type mismatch was detected.
        pass
Example #42
0
import sys, fileinput, io, hfst
# Open s2m.fst and read one transducer from it; `s2e` is the transducer each
# input word is composed with further down.
# NOTE(review): the input stream is not closed in the visible code.
s2e_file = hfst.HfstInputStream("s2m.fst")
s2e = s2e_file.read()
# print(s2e.number_of_states())

def print_results(paths):
    """Print the part before the first colon of every line in *paths*.

    *paths* is stripped of surrounding whitespace and split on newlines;
    each resulting line is printed on its own line, prefixed with a tab and
    cut at its first ':' (a line without a colon is printed whole).
    """
    for entry in paths.strip().split('\n'):
        head, _sep, _rest = entry.partition(':')
        print("\t" + head)

# Interactive loop: the outer loop resets `res` and prints a prompt, the inner
# loop handles one input line at a time. End of input terminates the program.
# NOTE(review): the remainder of the inner loop is not visible in this
# excerpt.
while True:
    res = hfst.regex("?*")
    # print(res)
    print("Enter forms of the next lemma")
    while True:
        try:
            line = input()
        except EOFError:
            sys.exit()
        # Separate the characters of the word with spaces before compiling
        # it as a regular expression.
        l = " ".join(list(line.strip()))
        # print("word = " + l)
        a = hfst.regex(l)
        # Compose the word with s2e and keep only the output side.
        a.compose(s2e)
        a.output_project()
        a.minimize()
        # NOTE(review): the return value of this call is discarded.
        a.extract_paths(max_number=10)
        a.minimize()
    
        nps = a.extract_paths(output='text')
        # print("    tentative new entries = ")
        # print_results(nps)
        
Example #43
0
import hfst

# Exercise hfst.XreCompiler definitions once per available implementation
# type.
# NOTE(review): `type` only gates the availability check; the compiler is
# created with hfst.get_default_fst_type(), so the visible code does not
# switch backends between iterations -- confirm whether that is intended.
for type in [
        hfst.ImplementationType.SFST_TYPE,
        hfst.ImplementationType.TROPICAL_OPENFST_TYPE,
        hfst.ImplementationType.FOMA_TYPE,
]:
    if not hfst.HfstTransducer.is_implementation_type_available(type):
        continue

    comp = hfst.XreCompiler(hfst.get_default_fst_type())
    comp.set_expand_definitions(True)

    # Define names from a regex string and from a ready-made transducer;
    # 'Bar' is defined and then undefined again.
    comp.define_xre('FooStar', '[foo]*')
    tr = hfst.regex('[foo]+')
    comp.define_transducer('FooPlus', tr)
    comp.define_xre('Bar', 'bar')
    comp.undefine('Bar')

    # The compiled expression must equal the regex in which the two
    # remaining definitions are written out and `Bar` is left as is.
    TR = comp.compile('FooStar a FooPlus Bar')
    TR1 = hfst.regex('[foo* a foo+ Bar]')
    assert TR1.compare(TR)

    # `.l` applied to the foo:bar definition must equal `bar`.
    tr = hfst.regex('foo:bar')
    comp.define_transducer('FooBar', tr)
    TR = comp.compile('FooBar.l')
    TR1 = hfst.regex('bar')
    assert TR1.compare(TR)
Example #44
0
def single(fst, chars):
    regex = '"<S>" ? -> 0'
    fst.compose(hfst.regex(regex))