Example #1
0
def get_fst():
    """Build (or reuse) the lookup-optimized g2p transducer.

    Recompiles g2p.twolc into a temporary rule file and recomposes the
    final .hfstol only when file modification times show the cached
    artifacts are stale; otherwise the cached .hfstol is read back.

    Returns:
        An hfst transducer, lookup-optimized for fast g2p lookups.
    """
    src = Path('g2p.twolc')
    tmp = Path('g2p_from_py.tmp.hfst')
    final = Path('g2p_from_py.hfstol')
    # Recompile the twolc rules when the compiled copy is missing or is
    # older than the rule source.
    if (not tmp.exists()) or (src.stat().st_mtime > tmp.stat().st_mtime):
        print('Compiling twolc rules...', file=sys.stderr)
        hfst.compile_twolc_file(src.name,
                                tmp.name,
                                resolve_left_conflicts=True)
    # Rebuild the final FST unless the mtimes prove the src -> tmp -> final
    # chain is strictly newer at each step (i.e. fully up to date).
    if (not final.exists()) or not (src.stat().st_mtime < tmp.stat().st_mtime <
                                    final.stat().st_mtime):
        print('Preparing rule transducers for composition...', file=sys.stderr)
        rule_fsts_stream = hfst.HfstInputStream(tmp.name)
        rule_fsts = [t for t in rule_fsts_stream]
        print('Creating universal language FST...', file=sys.stderr)
        # '?* ;' is the universal language; compose-intersecting the rules
        # against it yields their combined constraint transducer.
        output = hfst.regex('?* ;')
        print('Compose-intersecting rules with universal FST...',
              file=sys.stderr)
        output.compose_intersect(rule_fsts)
        print('Optimizing for fast lookup...', file=sys.stderr)
        output.lookup_optimize()
        print('Writing out final FST...', file=sys.stderr)
        output.write_to_file(final.name)
    else:
        # Cache hit: read back the previously written lookup-optimized FST.
        ol_fst_stream = hfst.HfstInputStream(final.name)
        output = next(ol_fst_stream)
    return output
Example #2
0
def get_input_stream(filename):
    """Open an HFST input stream for *filename*.

    Args:
        filename: '-' to read from standard input, a path string to read
            from a file, or None.

    Returns:
        An ``hfst.HfstInputStream`` over stdin for '-', a stream over the
        named file for any other string, or None when *filename* is None.
    """
    if filename == '-':
        # HfstInputStream() with no argument reads from standard input.
        return hfst.HfstInputStream()
    # Fix: use identity comparison with None (was: filename != None).
    if filename is not None:
        return hfst.HfstInputStream(filename)
    return None
Example #3
0
def load_cascade(filename):
    """Read every transducer stored in *filename* and return them as a tuple."""
    stream = hfst.HfstInputStream(full_path(filename))
    cascade = []
    # Drain the stream; each read() yields one transducer of the cascade.
    while not stream.is_eof():
        cascade.append(stream.read())
    stream.close()
    return tuple(cascade)
Example #4
0
 def load_filename(self, fsa):
     """Read all transducers from *fsa* and keep the first on self.transducer."""
     stream = hfst.HfstInputStream(fsa)
     loaded = []
     while not stream.is_eof():
         loaded.append(stream.read())
     stream.close()
     # Only the first transducer of the file is retained.
     self.transducer = loaded[0]
Example #5
0
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file
    """
    import hfst
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    # The pair symbols used by the examples are stored as a space-separated
    # string in a custom FST property.
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    # Populate the global symbol registries in cfg from the pair symbols.
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    # Build an FST accepting exactly one occurrence of any known pair.
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        # Escape '{' and '}' with '%' so hfst.regex treats them literally.
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        #print(in_quoted, outsym)### tilts if insym contains bad chars
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
Example #6
0
def get_fst(start_rule, end_rule, *args):
    """Compile g2p.twolc and compose-intersect a subset of its rules.

    Rule 0 plus rules start_rule..end_rule (inclusive) are selected; an
    optional extra (first, last) pair may be passed via *args.
    """
    src = Path('g2p.twolc')
    tmp = Path('g2p_test_from_py.tmp.hfst')
    hfst.compile_twolc_file(src.name, tmp.name, resolve_left_conflicts=True)
    print('Preparing rule transducers for composition...', file=sys.stderr)
    stream = hfst.HfstInputStream(tmp.name)

    # Rule 0 is always included.
    wanted = {0}
    wanted.update(range(start_rule, end_rule + 1))
    if args:
        wanted.update(range(args[0], args[1] + 1))

    # Keep only the rule transducers whose position is in the wanted set.
    selected = [fst for index, fst in enumerate(stream) if index in wanted]

    print('Creating universal language FST...', file=sys.stderr)
    result = hfst.regex('?* ;')
    print('Compose-intersecting rules with universal FST...', file=sys.stderr)
    result.compose_intersect(selected)
    print('Optimizing for fast lookup...', file=sys.stderr)
    result.lookup_optimize()
    return result
Example #7
0
def segment(word, segmenter, sep=' '):
    """
    Segments a word and returns the segmentations that correspond
    to the original spelling.

    :param word: a word (str)
    :param segmenter: a path to the HFST transducer for segmentation (str)
                      or HFST transducer (libhfst.HfstTransducer)
    :param sep: separator inserted between segments (str)
    :return: a list of segmentations in the original spelling
    :raises ValueError: if *segmenter* is a path that does not exist
    """

    if isinstance(segmenter, str):
        if not os.path.exists(segmenter):
            raise ValueError('The segmenter could not be found!')
        # BUG FIX: this previously read hfst.HfstInputStream(path), but
        # 'path' is undefined in this scope -- the string argument itself
        # is the transducer path.
        segmenter = hfst.HfstInputStream(segmenter).read()

    segmentation = segmenter.lookup(word)

    res = []
    for seg in segmentation:
        # Collapse runs of the boundary mark into the requested separator.
        seg = re.sub('·+', sep, seg[0])

        # Keep only segmentations that map back onto the original spelling,
        # without duplicates, preserving lookup order.
        reverted = revert_spellrelax(word, seg)
        if reverted and reverted not in res:
            res.append(reverted)

    return res
Example #8
0
def __analyze_locally(query, language, cache=True, descrpitive=True):
    """Look up *query* in the locally installed analyzer for *language*.

    The loaded transducer is memoized in analyzer_cache unless cache=False.
    """
    key = language + str(descrpitive)
    if cache and key in analyzer_cache:
        generator = analyzer_cache[key]
    else:
        model = os.path.join(__where_models(language),
                             __analyzer_model_name(descrpitive))
        generator = hfst.HfstInputStream(model).read()
        analyzer_cache[key] = generator
    return generator.lookup(query)
Example #9
0
 def isWord(self, ls):
     """Analyse the string formed by joining *ls*.

     Returns False when the analyser yields no output, otherwise a Unit
     built from the stem and tag string of the first analysis.
     """
     string = ''.join(ls)
     filename = "apertium-jpn/jpn.automorf.hfst"
     # NOTE(review): the transducer is re-read from disk on every call;
     # caching it on self would avoid repeated loads -- confirm call rate.
     input_stream = hfst.HfstInputStream(filename)
     analyser = input_stream.read()
     input_stream.close()  # fix: the stream was previously left open
     fullout = analyser.lookup(string)
     if len(fullout) == 0:
         return False
     output = fullout[0][0]
     # Split "stem<tag...>" into the surface form and its tag string.
     items = output.split("<", 1)
     token = Unit(items[0], items[1])
     return token
Example #10
0
def get_transducer(language,
                   cache=True,
                   analyzer=True,
                   descrpitive=True,
                   dictionary_forms=True,
                   convert_to_openfst=False):
    """Load (and optionally cache) a transducer for *language*.

    :param language: language code used to locate the model directory
    :param cache: reuse/populate the module-level cache dicts
    :param analyzer: True -> analyzer model, False -> generator model
    :param descrpitive: descriptive vs. normative model (name kept as-is
        for interface compatibility)
    :param dictionary_forms: generator only; part of its cache key
    :param convert_to_openfst: convert the loaded FST to tropical OpenFst
    :return: the loaded (possibly converted) transducer
    """
    conversion_type = hfst.ImplementationType.TROPICAL_OPENFST_TYPE
    # The two original branches differed only in cache dict, cache key and
    # model filename; compute the key first so a cache hit stays cheap.
    if analyzer:
        store = analyzer_cache
        key = language + str(descrpitive) + str(convert_to_openfst)
    else:
        store = generator_cache
        key = (language + str(descrpitive) + str(dictionary_forms) +
               str(convert_to_openfst))
    if cache and key in store:
        return store[key]
    # Cache miss: resolve the model path lazily, exactly as the original
    # did (no __where_models call on the hit path).
    if analyzer:
        filename = os.path.join(__where_models(language),
                                __analyzer_model_name(descrpitive))
    else:
        filename = os.path.join(
            __where_models(language),
            __generator_model_name(descrpitive, dictionary_forms))
    input_stream = hfst.HfstInputStream(filename)
    generator = input_stream.read()
    if convert_to_openfst:
        generator.convert(conversion_type)
    store[key] = generator
    return generator
Example #11
0
def _load_transducer(filename, invert):
    """Load an FST from *filename*, dispatching on its declared type.

    A metadata.json next to the file may declare "fst_type": "foma", in
    which case a FomaFSTWrapper is returned; otherwise the file is read as
    a native HFST transducer.

    :param filename: path to the transducer binary
    :param invert: passed through to FomaFSTWrapper for foma models
    """
    metadata_filename = os.path.join(os.path.dirname(filename),
                                     "metadata.json")
    try:
        metadata = mikatools.json_load(metadata_filename)
    # Fix: was a bare 'except:' which also swallowed SystemExit and
    # KeyboardInterrupt; best-effort metadata loading is preserved.
    except Exception:
        # No crash if JSON is not found or malformed for some reason
        metadata = {}
    if metadata.get("fst_type") == "foma":
        return FomaFSTWrapper(filename, invert)
    input_stream = hfst.HfstInputStream(filename)
    try:
        return input_stream.read()
    finally:
        input_stream.close()  # fix: the stream was previously left open
Example #12
0
def get_fst(src):
    """Compile the twolc rule file *src* into a lookup-optimized FST."""
    tmp = Path('../res/g2p_from_py.hfst')
    print('Compiling twolc rules...', file=sys.stderr)
    hfst.compile_twolc_file(src.name, tmp.name, resolve_left_conflicts=True)
    print('Preparing rule transducers for composition...', file=sys.stderr)
    # The compiled file holds one transducer per rule.
    rules = list(hfst.HfstInputStream(tmp.name))
    print('Creating universal language FST...', file=sys.stderr)
    result = hfst.regex('?* ;')
    print('Compose-intersecting rules with universal FST...', file=sys.stderr)
    result.compose_intersect(rules)
    print('Optimizing for fast lookup...', file=sys.stderr)
    result.lookup_optimize()
    return result
Example #13
0
def load_analyser(filename: str):
    """Load an automaton from file.

    Args:
        filename:  containing single hfst automaton binary.

    Throws:
        FileNotFoundError if file is not found
    """
    try:
        # Both opening the stream and the first read may raise
        # NotTransducerStreamException for a missing/invalid file.
        stream = hfst.HfstInputStream(filename)
        return stream.read()
    except libhfst.NotTransducerStreamException:
        # errno 2 (ENOENT) makes OSError resolve to FileNotFoundError;
        # 'from None' hides the hfst-level traceback.
        raise IOError(2, filename) from None
Example #14
0
def hello_world():
    """Flask endpoint: analyse the JSON field 'item' with the FinnTreeBank
    transducer and return the first analysis string."""
    req_data = request.get_json()
    item = req_data['item']

    filename = "./finntreebank.hfst"
    # NOTE(review): the transducer is re-read from disk on every request;
    # loading it once at module level would be cheaper -- confirm.
    input_stream = hfst.HfstInputStream(filename)
    analyser = input_stream.read()
    input_stream.close()  # fix: the stream was never closed
    result = analyser.lookup(item)
    # result is a sequence of (output, weight) pairs; return the first
    # output string. (Removed two redundant debug prints of the same value.)
    return result[0][0]
Example #15
0
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file
    """
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    # Pair symbols are stored as a space-separated string in a custom
    # FST property.
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    # Register every pair symbol and its input/output sides in cfg.
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    # Delegate the pair-FST construction to the shared helper.
    cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
Example #16
0
def __generate_locally(query,
                       language,
                       cache=True,
                       descrpitive=False,
                       dictionary_forms=True):
    """Generate forms for *query* with the local generator for *language*.

    The loaded transducer is memoized in generator_cache unless cache=False.
    """
    key = language + str(descrpitive) + str(dictionary_forms)
    if cache and key in generator_cache:
        generator = generator_cache[key]
    else:
        model = os.path.join(
            __where_models(language),
            __generator_model_name(descrpitive, dictionary_forms))
        generator = hfst.HfstInputStream(model).read()
        generator_cache[key] = generator
    return generator.lookup(query)
Example #17
0
    def load(cls, tab_file, fst_file):
        """
        Load a Tableau object from file.
        :param tab_file: The .tableau file
        :param fst_file: The .hfst file
        :return: A Tableau object
        """
        with open(tab_file, "r", encoding="utf8") as t_file:
            lines = t_file.readlines()
        # First line of the .tableau file names the penalization method.
        penal_method = lines[0].rstrip("\n")

        # The first transducer in the .hfst file is the GEN automaton.
        f_file = hfst.HfstInputStream(fst_file)
        gen = f_file.read()
        cls._optimize_lookup(gen)

        tab = cls(gen, penal_method=penal_method)

        # Remaining transducers are consumed in (fst_1, fst_2) pairs; a
        # trailing unpaired transducer becomes tab._runnable instead.
        stepwise = list()
        while not f_file.is_eof():
            fst_1 = f_file.read()
            cls._optimize_lookup(fst_1)
            if f_file.is_eof():
                tab._runnable = fst_1
            else:
                fst_2 = f_file.read()
                cls._optimize_lookup(fst_2)
                stepwise.append((fst_1, fst_2))
        tab._stepwise = stepwise
        f_file.close()

        # Each remaining .tableau line is: name <TAB> n <TAB> regex.
        for line in lines[1:]:
            fields = line.rstrip("\n").split("\t")
            name = fields[0]
            n = int(fields[1])
            regex = fields[2]
            constraint = Constraint(regex, n=n, name=name)
            tab.add_constraint(constraint)

        return tab
Example #18
0
    # (fragment of the option-parsing loop of hfst-fst2strings.py; the
    # surrounding for-loop and variable initialization are above this view)
    elif arg == '--xfst' or arg == '-X':
        xfst = '<next>'  # the next argv item carries the --xfst variable
    elif nrandom == '<next>':
        nrandom = int(arg)
    elif xfst == '<next>':
        if arg == 'obey-flags':
            obeyflags = True
            xfst = None
        else:
            raise RuntimeError(
                'error: hfst-fst2strings.py: option --xfst supports only variable obey-flags'
            )
    else:
        infile = arg

# Enumerate string paths of every transducer in the input file.
istr = hfst.HfstInputStream(infile)
for tr in istr:
    # SFST and foma backends are unweighted; everything else is weighted.
    weighted = tr.get_type(
    ) != hfst.ImplementationType.SFST_TYPE and tr.get_type(
    ) != hfst.ImplementationType.FOMA_TYPE
    paths = None
    if nrandom != None:
        paths = tr.extract_paths(obey_flags=obeyflags, random=nrandom)
    else:
        paths = tr.extract_paths(obey_flags=obeyflags)
    for key, values in paths.items():
        for value in values:
            print(key, end='')
            # Print "input:output" only when the two sides differ.
            if (key != value[0]):
                print(':' + value[0], end='')
            #if weighted:
Example #19
0
import hfst
import hfst_commandline

# hfst-determinize.py: determinize every transducer in INFILE and write
# the results to the default output stream.
options = hfst_commandline.hfst_getopt('', [], 1)

if len(options[1]) == 0:
    raise RuntimeError('Usage: hfst-determinize.py INFILE')

istr = hfst.HfstInputStream(options[1][0])
ostr = hfst.HfstOutputStream(type=istr.get_type())

while not istr.is_eof():
    transducer = istr.read()
    transducer.determinize()
    transducer.write(ostr)
    ostr.flush()

istr.close()
ostr.close()
Example #20
0
    '<futnear>'
]
person = ['<p1>', '<p2>', '<p3>']

# Imperative tag combinations: proximal/remote x person x number.
imp = [
    '<imp><prox><p1><sg>', '<imp><prox><p2><sg>', '<imp><prox><p3><sg>',
    '<imp><prox><p1><pe>', '<imp><prox><p1><pi>', '<imp><prox><p2><pl>',
    '<imp><prox><p3><pl>', '<imp><rem><p1><sg>', '<imp><rem><p2><sg>',
    '<imp><rem><p3><sg>', '<imp><rem><p1><pe>', '<imp><rem><p1><pi>',
    '<imp><rem><p2><pl>', '<imp><rem><p3><pl>'
]

# The generator transducer location comes from settings.ini ([HFST] section).
config = configparser.ConfigParser()
config.read('settings.ini')
path = config['HFST']['BinaryFilePath']
transducer_gen = hfst.HfstInputStream(path).read()


def generate_result(input_file, result_file):
    with open(input_file, 'r', encoding='utf-8') as inp_file:
        data = inp_file.readlines()

    errors = []
    with open(result_file, 'w+', encoding='utf-8') as res_file:
        for word in data:
            list_of_wf = []
            stem = word.split()[0]
            pos = word.split()[1]
            if pos == 'n':
                for n in num:
                    for c in case:
Example #21
0
retval = 0
from sys import argv
# NOTE(review): silent, harmonize, in1 and in2 are presumably initialized
# above this fragment -- confirm against the full script.
if len(argv) < 3:
    raise RuntimeError('Usage: hfst-compose.py INFILE1 INFILE2')
for arg in argv[1:]:
    if arg == '-s' or arg == '--silent' or arg == '-q' or arg == '--quiet':
        silent = True
    elif arg == '-H' or arg == '--do-not-harmonize':
        harmonize = False
    else:
        # Positional arguments: first fills in1, second fills in2.
        if in1 == None:
            in1 = arg
        else:
            in2 = arg

istr1 = hfst.HfstInputStream(in1)
istr2 = hfst.HfstInputStream(in2)
if (istr1.get_type() != istr2.get_type()):
    raise RuntimeError('Error: transducer types differ in ' + in1 + ' and ' +
                       in2)

# Compare the streams pairwise; retval becomes 1 if any pair differs.
while ((not istr1.is_eof()) and (not istr2.is_eof())):
    tr1 = istr1.read()
    tr2 = istr2.read()
    if (tr1.compare(tr2, harmonize)):
        if not silent:
            print(tr1.get_name() + ' == ' + tr2.get_name())
    else:
        if not silent:
            print(tr1.get_name() + ' != ' + tr2.get_name())
        retval = 1
Example #22
0
import hfst
from sys import argv

# hfst-disjunct.py: pairwise disjunction of the transducers in two files.
if len(argv) != 3:
    raise RuntimeError('Usage: hfst-disjunct.py INFILE1 INFILE2')

istr1 = hfst.HfstInputStream(argv[1])
istr2 = hfst.HfstInputStream(argv[2])
# Both files must use the same backend implementation.
if istr1.get_type() != istr2.get_type():
    raise RuntimeError('Error: transducer types differ in ' + argv[1] +
                       ' and ' + argv[2])
ostr = hfst.HfstOutputStream(type=istr1.get_type())

while not (istr1.is_eof() or istr2.is_eof()):
    first = istr1.read()
    second = istr2.read()
    first.disjunct(second)
    first.write(ostr)
    ostr.flush()

istr1.close()
istr2.close()
Example #23
0
        # (fragment of the option-parsing loop of hfst-compose-intersect.py)
        infile1 = '<next>'
    elif arg == '-2':
        infile2 = '<next>'
    elif infile1 == '<next>':
        infile1 = arg
    elif infile2 == '<next>':
        infile2 = arg
    elif infile1 == None:
        infile1 = arg
    elif infile2 == None:
        infile2 = arg
    else:
        raise RuntimeError(
            'Usage: hfst-compose-intersect.py [-1] INFILE1 [-2] INFILE2')

istr1 = hfst.HfstInputStream(infile1)
istr2 = hfst.HfstInputStream(infile2)
if (istr1.get_type() != istr2.get_type()):
    raise RuntimeError('Error: transducer types differ in ' + infile1 +
                       ' and ' + infile2)

# The first file must contain exactly one transducer.
tr1 = istr1.read()
if not istr1.is_eof():
    raise RuntimeError('Error: ' + infile1 +
                       ' must contain exactly one transducer')
istr1.close()

# Read every transducer from the second file.
transducers = []
while (not istr2.is_eof()):
    transducers.append(istr2.read())
istr2.close()
Example #24
0
import hfst

# Load a single transducer from the analyser file listed above.
stream = hfst.HfstInputStream(
    'c:/Users/user/Compling/apertium-evn-master/evn.automorf.hfst')
transducer = stream.read()

# Look up one wordform and show the analyses.
result = transducer.lookup("хороки")
print(result)
Example #25
0
from manageXML.constants import SPECIFICATION
from manageXML.inflector import Inflector
from collections import defaultdict
from django.conf import settings
import os
import hfst
from uralicNLP import uralicApi
import csv
from manageXML.utils import *

register = template.Library()
_inflector = Inflector()
# Normative dictionary generator shipped under local/transducers/.
transducer_path = os.path.join(
    settings.BASE_DIR, '../local/transducers/generator-dict-gt-norm.hfstol')
if os.path.exists(transducer_path) and os.path.isfile(transducer_path):
    input_stream = hfst.HfstInputStream(transducer_path)
    # NOTE(review): 'synthetiser' exists only when the transducer file is
    # present; later uses presumably guard on that -- confirm.
    synthetiser = input_stream.read()


@register.filter(name='tex_escape')
def tex_escape(text):
    """
        :param text: a plain text message
        :return: the message escaped to appear correctly in LaTeX
    """
    conv = {
        '&': r'\&',
        '%': r'\%',
        '$': r'\$',
        '#': r'\#',
        '_': r'\_',
Example #26
0
 # TransducerIsCyclicException: [a:b]* is cyclic, so extracting all paths
 # fails and a bounded extraction (max_cycles) is used instead.
 transducer = hfst.regex('[a:b]*')
 try:
     results = transducer.extract_paths(output='text')
     print("The transducer has %i paths:" % len(results))
     print(results)
 except hfst.exceptions.TransducerIsCyclicException:
     print("The transducer is cyclic and has an infinite number of paths. Some of them:")
     results = transducer.extract_paths(output='text', max_cycles=5)
     print(results)
 
 # NotTransducerStreamException: a plain text file is not a transducer stream.
 f = open('foofile', 'w')
 f.write('This is an ordinary text file.\n')
 f.close()
 try:
     instr = hfst.HfstInputStream('foofile')
     tr = instr.read()
     print(tr)
     instr.close()
 except hfst.exceptions.NotTransducerStreamException:
     print("Could not print transducer: the file does not contain binary transducers.")
 
 # NotValidAttFormatException: the second line ('1 2 c') is an incomplete
 # ATT transition, so parsing is expected to fail.
 f = open('testfile1.att', 'w')
 f.write('0 1 a b\n1 2 c\n2\n')
 f.close()
 f = open('testfile1.att', 'r')
 try:
     tr = hfst.read_att_transducer(f)
 except hfst.exceptions.NotValidAttFormatException:
     print('Could not read file: it is not in valid ATT format.')
 f.close()
Example #27
0
argparser = argparse.ArgumentParser(
    "python3 gyessbygenerating.py",
    description="Guess lexicon entries from generated forms of them")
argparser.add_argument("guesser",
                       help="Guesser file FST",
                       default="ofi-guess-n.fst")
argparser.add_argument("rules", help="name of the two-level rule file")
argparser.add_argument("-v",
                       "--verbosity",
                       default=0,
                       type=int,
                       help="level of diagnostic output")
args = argparser.parse_args()

# Load the guesser FST and prepare it for fast lookup.
guesser_fil = hfst.HfstInputStream(args.guesser)
guesser_fst = guesser_fil.read()
guesser_fil.close()
#guesser_fst.invert()
guesser_fst.minimize()
guesser_fst.lookup_optimize()

import sys, re
import generate

# Candidate suffixes keyed by continuation class; {..} are multicharacter
# symbols -- presumably Finnish morphophonemes, confirm against the rules.
suf = {"/s": ["", "n", "{nrs}{aä}", "{ij}{Øt}{aä}"]}

print()
# Look up each stdin line in the guesser (loop continues past this view).
for line_nl in sys.stdin:
    line = line_nl.strip()
    res = guesser_fst.lookup(line, output="tuple")
Example #28
0
import hfst

force = False
from sys import argv

# hfst-prune-alphabet.py: prune the alphabet of every transducer read from
# standard input and write the results out.
if len(argv) > 3:
    raise RuntimeError(
        'Usage: hfst-prune-alphabet.py [-f|--force] [-S|--safe]')
for arg in argv[1:]:
    if arg == '-f' or arg == '--force':
        force = True
    elif arg == '-S' or arg == '--safe':
        force = False
    else:
        raise RuntimeError('unknown option: ' + arg)

istr = hfst.HfstInputStream()
ostr = hfst.HfstOutputStream(type=istr.get_type())

while not istr.is_eof():
    # prune_alphabet is only available on the basic (mutable) transducer,
    # so round-trip through HfstBasicTransducer.
    basic = hfst.HfstBasicTransducer(istr.read())
    basic.prune_alphabet(force)
    pruned = hfst.HfstTransducer(basic, istr.get_type())
    pruned.write(ostr)
    ostr.flush()

istr.close()
ostr.close()
Example #29
0
# NOTE(review): 'argparser' is created above this fragment.
argparser.add_argument(
    "ksk", help="input file, e.g. ~/Dropbox/lang/fin/ksk/ksk-v.dic")
argparser.add_argument("fst", help=" conversion fst, e.g. ofi-conv-n.fst")
argparser.add_argument("name", help="name of the lexicon for entries created")
argparser.add_argument("lexc", help="output file, e.g. ofi-words-n.lexc")
argparser.add_argument("-c", "--codes", help="infl-codes.text")
argparser.add_argument("-v",
                       "--verbosity",
                       type=int,
                       default=0,
                       help="level of diagnostic info printed")
args = argparser.parse_args()

ksk_file = open(args.ksk, "r")

# Conversion FST, lookup-optimized for repeated queries.
fstfile = hfst.HfstInputStream(args.fst)
fst = fstfile.read()
fst.lookup_optimize()

outf = open(args.lexc, "w")

# Whitespace-separated inflection codes accepted for entries.
infl_set = set(open(args.codes).read().split())
if args.verbosity >= 5:
    print("infl_set =", infl_set)  ###

entrylist = []

multiharacters = set()  # NOTE(review): likely a typo for 'multicharacters'


def find_multichars(str):
Example #30
0
    # Write the same weighted transducer twice to foobar.hfst, then verify
    # that reading the file back yields exactly two transducers.
    type_ = hfst.ImplementationType.TROPICAL_OPENFST_TYPE
    ostr = hfst.HfstOutputStream(filename='foobar.hfst', type=type_)

    # {foo}:{bar} with weight 0.5, converted to the tropical OpenFst backend.
    tr_ = hfst.regex('{foo}:{bar}::0.5')
    tr_.convert(type_)

    ostr.write(tr_)
    ostr.write(tr_)
    ostr.flush()
    ostr.close()

    if not os.path.isfile('foobar.hfst'):
        raise RuntimeError('Missing file: foobar.hfst')

    istr = hfst.HfstInputStream('foobar.hfst')
    numtr = 0
    try:
        tr1 = istr.read()
        numtr += 1
        tr2 = istr.read()
        numtr += 1
        # The third read is expected to hit end-of-stream.
        tr3 = istr.read()
        numtr += 1
    except hfst.exceptions.EndOfStreamException:
        pass
    except:
        # Any other failure is a test error at this source line.
        raise RuntimeError(get_linenumber())
    istr.close()

    if numtr != 2: