def get_fst():
    """Build (or load a cached copy of) the lookup-optimized g2p FST.

    The twolc source is recompiled only when the intermediate .hfst cache
    is missing or older than the source, and the final .hfstol is rebuilt
    only when the mtime chain src < tmp < final is broken.
    """
    src = Path('g2p.twolc')
    tmp = Path('g2p_from_py.tmp.hfst')
    final = Path('g2p_from_py.hfstol')
    # Recompile when there is no intermediate cache or the source is newer.
    if (not tmp.exists()) or (src.stat().st_mtime > tmp.stat().st_mtime):
        print('Compiling twolc rules...', file=sys.stderr)
        hfst.compile_twolc_file(src.name, tmp.name, resolve_left_conflicts=True)
    # Rebuild the final FST unless every cache stage is newer than its input.
    if (not final.exists()) or not (src.stat().st_mtime < tmp.stat().st_mtime < final.stat().st_mtime):
        print('Preparing rule transducers for composition...', file=sys.stderr)
        rule_stream = hfst.HfstInputStream(tmp.name)
        rules = list(rule_stream)
        print('Creating universal language FST...', file=sys.stderr)
        result = hfst.regex('?* ;')
        print('Compose-intersecting rules with universal FST...', file=sys.stderr)
        result.compose_intersect(rules)
        print('Optimizing for fast lookup...', file=sys.stderr)
        result.lookup_optimize()
        print('Writing out final FST...', file=sys.stderr)
        result.write_to_file(final.name)
    else:
        # Cache is up to date: load the first transducer from the .hfstol.
        cached_stream = hfst.HfstInputStream(final.name)
        result = next(cached_stream)
    return result
def get_input_stream(filename):
    """Open an HFST input stream for *filename*.

    :param filename: path to a transducer file, '-' for standard input,
        or None.
    :return: an hfst.HfstInputStream, or None when filename is None.
    """
    if filename == '-':
        # '-' conventionally means "read transducers from stdin".
        return hfst.HfstInputStream()
    # Fix: compare to None by identity, not with '!=' (PEP 8).
    if filename is not None:
        return hfst.HfstInputStream(filename)
    return None
def load_cascade(filename):
    """Read every transducer stored in *filename* and return them as a tuple."""
    stream = hfst.HfstInputStream(full_path(filename))
    cascade = []
    while not stream.is_eof():
        cascade.append(stream.read())
    stream.close()
    return tuple(cascade)
def load_filename(self, fsa):
    """Load the first transducer found in the file *fsa* onto self.transducer."""
    stream = hfst.HfstInputStream(fsa)
    loaded = []
    while not stream.is_eof():
        loaded.append(stream.read())
    stream.close()
    # Only the first transducer in the file is kept; extras are discarded.
    self.transducer = loaded[0]
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file

    Side effects: populates cfg.pair_symbol_set, cfg.symbol_pair_set,
    cfg.input_symbol_set and cfg.output_symbol_set from the FST's
    "x-pair_symbols" property, and rebuilds cfg.all_pairs_fst as the
    disjunction of all attested input:output symbol pairs.
    """
    import hfst
    exfile = hfst.HfstInputStream(filename)
    cfg.examples_fst = exfile.read()
    # The pair alphabet was stored as a space-separated property string.
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        (insym, outsym) = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    # Union of one regex per attested pair.
    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        # Escape curly braces, which are special in hfst regexes, with '%'.
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        # print(in_quoted, outsym)  ### tilts if insym contains bad chars
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
def get_fst(start_rule, end_rule, *args):
    """Compile the twolc rules and compose only a selected subset of them.

    Rule 0 is always included, plus rules start_rule..end_rule inclusive;
    an optional extra pair in *args adds a second inclusive range.
    """
    src = Path('g2p.twolc')
    tmp = Path('g2p_test_from_py.tmp.hfst')
    hfst.compile_twolc_file(src.name, tmp.name, resolve_left_conflicts=True)
    print('Preparing rule transducers for composition...', file=sys.stderr)
    rule_stream = hfst.HfstInputStream(tmp.name)
    # Indices of the rules we want to keep (rule 0 is always kept).
    wanted = {0} | set(range(start_rule, end_rule + 1))
    if args:
        wanted |= set(range(args[0], args[1] + 1))
    selected = [rule for index, rule in enumerate(rule_stream) if index in wanted]
    print('Creating universal language FST...', file=sys.stderr)
    output = hfst.regex('?* ;')
    print('Compose-intersecting rules with universal FST...', file=sys.stderr)
    output.compose_intersect(selected)
    print('Optimizing for fast lookup...', file=sys.stderr)
    output.lookup_optimize()
    return output
def segment(word, segmenter, sep=' '):
    """
    Segments a word and returns the segmentations that correspond to the
    original spelling.

    :param word: a word (str)
    :param segmenter: a path to the HFST transducer for segmentation (str)
        or an already-loaded HFST transducer
    :param sep: separator inserted between segments
    :return: a list of segmentations in the original spelling
    :raises ValueError: if a segmenter path is given but does not exist
    """
    if isinstance(segmenter, str):
        if not os.path.exists(segmenter):
            raise ValueError('The segmenter could not be found!')
        # BUG FIX: previously passed the undefined name 'path' to
        # HfstInputStream, which raised NameError for every str segmenter.
        segmenter = hfst.HfstInputStream(segmenter).read()
    segmentation = segmenter.lookup(word)
    res = []
    for seg in segmentation:
        # Collapse the '·' boundary markers into the requested separator.
        seg = re.sub('·+', sep, seg[0])
        reverted = revert_spellrelax(word, seg)
        if reverted and reverted not in res:
            res.append(reverted)
    return res
def __analyze_locally(query, language, cache=True, descrpitive=True):
    """Look up *query* with the (cached) analyzer model for *language*."""
    key = language + str(descrpitive)
    if cache and key in analyzer_cache:
        analyzer = analyzer_cache[key]
    else:
        filename = os.path.join(__where_models(language),
                                __analyzer_model_name(descrpitive))
        analyzer = hfst.HfstInputStream(filename).read()
        # Note: mirrors the original — the model is cached even when
        # cache=False (the flag only skips the cache read).
        analyzer_cache[key] = analyzer
    return analyzer.lookup(query)
def isWord(self, ls):
    """Analyse the characters in *ls* as a single word.

    :param ls: iterable of characters forming the candidate word
    :return: a Unit built from the first analysis (lemma, tags), or
        False when the analyser yields no result or no tag is present.
    """
    string = ''.join(ls)
    filename = "apertium-jpn/jpn.automorf.hfst"
    # NOTE(review): the transducer is re-read from disk on every call;
    # consider caching it on self if this is a hot path.
    input_stream = hfst.HfstInputStream(filename)
    analyser = input_stream.read()
    fullout = analyser.lookup(string)
    if len(fullout) == 0:
        return False
    output = fullout[0][0]
    # Fix: an analysis with no '<tag>' previously raised IndexError on
    # items[1]; treat it as "not a word" instead.
    if "<" not in output:
        return False
    items = output.split("<", 1)
    token = Unit(items[0], items[1])
    return token
def get_transducer(language,
                   cache=True,
                   analyzer=True,
                   descrpitive=True,
                   dictionary_forms=True,
                   convert_to_openfst=False):
    """Load (and cache) an analyzer or generator transducer for *language*.

    :param language: language code used to locate the model directory
    :param cache: when True, return a previously loaded model if present
    :param analyzer: True for the analyzer model, False for the generator
    :param descrpitive: descriptive vs. normative model (name kept as-is
        for backward compatibility)
    :param dictionary_forms: generator-only model variant selector
    :param convert_to_openfst: convert the model to tropical OpenFst type
    :return: the loaded hfst transducer

    The original implementation duplicated the whole key-build / load /
    convert / cache sequence in both branches; this version keeps the
    exact same cache keys and behavior with the logic written once.
    """
    conversion_type = hfst.ImplementationType.TROPICAL_OPENFST_TYPE
    if analyzer:
        store = analyzer_cache
        key = language + str(descrpitive) + str(convert_to_openfst)
    else:
        store = generator_cache
        key = (language + str(descrpitive) + str(dictionary_forms)
               + str(convert_to_openfst))
    if cache and key in store:
        return store[key]
    # Cache miss: resolve the model file name only now (as before).
    if analyzer:
        model_name = __analyzer_model_name(descrpitive)
    else:
        model_name = __generator_model_name(descrpitive, dictionary_forms)
    filename = os.path.join(__where_models(language), model_name)
    transducer = hfst.HfstInputStream(filename).read()
    if convert_to_openfst:
        transducer.convert(conversion_type)
    # Matches the original: stored even when cache=False.
    store[key] = transducer
    return transducer
def _load_transducer(filename, invert):
    """Load a transducer, dispatching on the sidecar metadata.json.

    :param filename: path to the transducer binary
    :param invert: passed through to FomaFSTWrapper for foma models
    :return: a FomaFSTWrapper when metadata declares fst_type "foma",
        otherwise the first transducer read via hfst.
    """
    metadata_filename = os.path.join(os.path.dirname(filename), "metadata.json")
    try:
        metadata = mikatools.json_load(metadata_filename)
    except Exception:
        # Best-effort: no crash if JSON is missing or malformed. Fix:
        # narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        metadata = {}
    if metadata.get("fst_type") == "foma":
        return FomaFSTWrapper(filename, invert)
    input_stream = hfst.HfstInputStream(filename)
    return input_stream.read()
def get_fst(src):
    """Compile the twolc rules in *src* and return a lookup-optimized FST."""
    tmp = Path('../res/g2p_from_py.hfst')
    print('Compiling twolc rules...', file=sys.stderr)
    hfst.compile_twolc_file(src.name, tmp.name, resolve_left_conflicts=True)
    print('Preparing rule transducers for composition...', file=sys.stderr)
    rules = list(hfst.HfstInputStream(tmp.name))
    print('Creating universal language FST...', file=sys.stderr)
    fst = hfst.regex('?* ;')
    print('Compose-intersecting rules with universal FST...', file=sys.stderr)
    fst.compose_intersect(rules)
    print('Optimizing for fast lookup...', file=sys.stderr)
    fst.lookup_optimize()
    return fst
def load_analyser(filename: str):
    """Load an automaton from file.

    Args:
        filename: containing single hfst automaton binary.

    Throws:
        FileNotFoundError if file is not found
    """
    try:
        stream = hfst.HfstInputStream(filename)
        return stream.read()
    except libhfst.NotTransducerStreamException:
        # OSError with errno 2 (ENOENT) is instantiated as
        # FileNotFoundError, matching the documented contract.
        raise IOError(2, filename) from None
def hello_world():
    """Flask endpoint: analyse the JSON field 'item' with the treebank FST."""
    payload = request.get_json()
    word = payload['item']
    # The transducer is loaded on every request (as in the original).
    analyser = hfst.HfstInputStream("./finntreebank.hfst").read()
    analyses = analyser.lookup(word)
    print(analyses[0][0])
    best = analyses[0][0]
    print(best)
    return best
def read_fst(filename="examples.fst"):
    """Reads in a previously stored example FST file

    Refreshes cfg's pair-symbol registries from the FST's
    "x-pair_symbols" property and rebuilds cfg.all_pairs_fst.
    """
    stream = hfst.HfstInputStream(filename)
    cfg.examples_fst = stream.read()
    # The pair alphabet was stored as a space-separated property string.
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # print("pair_symbols", pair_symbols) ##
    for pairsym in re.split(r" +", pair_symbols):
        cfg.pair_symbol_set.add(pairsym)
        insym, outsym = cfg.pairsym2sympair(pairsym)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
def __generate_locally(query, language, cache=True, descrpitive=False,
                       dictionary_forms=True):
    """Look up *query* with the (cached) generator model for *language*."""
    key = language + str(descrpitive) + str(dictionary_forms)
    if cache and key in generator_cache:
        generator = generator_cache[key]
    else:
        filename = os.path.join(
            __where_models(language),
            __generator_model_name(descrpitive, dictionary_forms))
        generator = hfst.HfstInputStream(filename).read()
        # Note: mirrors the original — stored even when cache=False.
        generator_cache[key] = generator
    return generator.lookup(query)
def load(cls, tab_file, fst_file):
    """
    Load a Tableau object from file.

    :param tab_file: The .tableau file
    :param fst_file: The .hfst file
    :return: A Tableau object
    """
    with open(tab_file, "r", encoding="utf8") as t_file:
        lines = t_file.readlines()
    # First line of the .tableau file names the penalization method.
    penal_method = lines[0].rstrip("\n")
    f_file = hfst.HfstInputStream(fst_file)
    # The first transducer in the stream is GEN; it seeds the Tableau.
    gen = f_file.read()
    cls._optimize_lookup(gen)
    tab = cls(gen, penal_method=penal_method)
    stepwise = list()
    # Remaining transducers come in pairs (stepwise constraint FSTs).
    # If a final unpaired transducer is left over, it is the runnable
    # cascade rather than part of a pair.
    while not f_file.is_eof():
        fst_1 = f_file.read()
        cls._optimize_lookup(fst_1)
        if f_file.is_eof():
            tab._runnable = fst_1
        else:
            fst_2 = f_file.read()
            cls._optimize_lookup(fst_2)
            stepwise.append((fst_1, fst_2))
    tab._stepwise = stepwise
    f_file.close()
    # Remaining .tableau lines: one constraint per line as
    # "name <TAB> n <TAB> regex".
    for line in lines[1:]:
        fields = line.rstrip("\n").split("\t")
        name = fields[0]
        n = int(fields[1])
        regex = fields[2]
        constraint = Constraint(regex, n=n, name=name)
        tab.add_constraint(constraint)
    return tab
elif arg == '--xfst' or arg == '-X': xfst = '<next>' elif nrandom == '<next>': nrandom = int(arg) elif xfst == '<next>': if arg == 'obey-flags': obeyflags = True xfst = None else: raise RuntimeError( 'error: hfst-fst2strings.py: option --xfst supports only variable obey-flags' ) else: infile = arg istr = hfst.HfstInputStream(infile) for tr in istr: weighted = tr.get_type( ) != hfst.ImplementationType.SFST_TYPE and tr.get_type( ) != hfst.ImplementationType.FOMA_TYPE paths = None if nrandom != None: paths = tr.extract_paths(obey_flags=obeyflags, random=nrandom) else: paths = tr.extract_paths(obey_flags=obeyflags) for key, values in paths.items(): for value in values: print(key, end='') if (key != value[0]): print(':' + value[0], end='') #if weighted:
# hfst-determinize: determinize every transducer read from INFILE and
# write the results to an output stream of the same implementation type.
import hfst
import hfst_commandline

options = hfst_commandline.hfst_getopt('', [], 1)
if len(options[1]) == 0:
    raise RuntimeError('Usage: hfst-determinize.py INFILE')

istr = hfst.HfstInputStream(options[1][0])
ostr = hfst.HfstOutputStream(type=istr.get_type())
while not istr.is_eof():
    transducer = istr.read()
    transducer.determinize()
    transducer.write(ostr)
ostr.flush()
istr.close()
ostr.close()
'<futnear>' ] person = ['<p1>', '<p2>', '<p3>'] imp = [ '<imp><prox><p1><sg>', '<imp><prox><p2><sg>', '<imp><prox><p3><sg>', '<imp><prox><p1><pe>', '<imp><prox><p1><pi>', '<imp><prox><p2><pl>', '<imp><prox><p3><pl>', '<imp><rem><p1><sg>', '<imp><rem><p2><sg>', '<imp><rem><p3><sg>', '<imp><rem><p1><pe>', '<imp><rem><p1><pi>', '<imp><rem><p2><pl>', '<imp><rem><p3><pl>' ] config = configparser.ConfigParser() config.read('settings.ini') path = config['HFST']['BinaryFilePath'] transducer_gen = hfst.HfstInputStream(path).read() def generate_result(input_file, result_file): with open(input_file, 'r', encoding='utf-8') as inp_file: data = inp_file.readlines() errors = [] with open(result_file, 'w+', encoding='utf-8') as res_file: for word in data: list_of_wf = [] stem = word.split()[0] pos = word.split()[1] if pos == 'n': for n in num: for c in case:
retval = 0 from sys import argv if len(argv) < 3: raise RuntimeError('Usage: hfst-compose.py INFILE1 INFILE2') for arg in argv[1:]: if arg == '-s' or arg == '--silent' or arg == '-q' or arg == '--quiet': silent = True elif arg == '-H' or arg == '--do-not-harmonize': harmonize = False else: if in1 == None: in1 = arg else: in2 = arg istr1 = hfst.HfstInputStream(in1) istr2 = hfst.HfstInputStream(in2) if (istr1.get_type() != istr2.get_type()): raise RuntimeError('Error: transducer types differ in ' + in1 + ' and ' + in2) while ((not istr1.is_eof()) and (not istr2.is_eof())): tr1 = istr1.read() tr2 = istr2.read() if (tr1.compare(tr2, harmonize)): if not silent: print(tr1.get_name() + ' == ' + tr2.get_name()) else: if not silent: print(tr1.get_name() + ' != ' + tr2.get_name()) retval = 1
# hfst-disjunct: pairwise disjunction of transducers from two input files.
import hfst
from sys import argv

if len(argv) != 3:
    raise RuntimeError('Usage: hfst-disjunct.py INFILE1 INFILE2')
istr1 = hfst.HfstInputStream(argv[1])
istr2 = hfst.HfstInputStream(argv[2])
# Both streams must hold the same implementation type to be combined.
if (istr1.get_type() != istr2.get_type()):
    raise RuntimeError('Error: transducer types differ in ' + argv[1] +
                       ' and ' + argv[2])
ostr = hfst.HfstOutputStream(type=istr1.get_type())
while ((not istr1.is_eof()) and (not istr2.is_eof())):
    tr1 = istr1.read()
    tr2 = istr2.read()
    tr1.disjunct(tr2)
    tr1.write(ostr)
ostr.flush()
istr1.close()
istr2.close()
# Fix: the output stream was never closed (cf. the determinize tool,
# which closes all three streams).
ostr.close()
infile1 = '<next>' elif arg == '-2': infile2 = '<next>' elif infile1 == '<next>': infile1 = arg elif infile2 == '<next>': infile2 = arg elif infile1 == None: infile1 = arg elif infile2 == None: infile2 = arg else: raise RuntimeError( 'Usage: hfst-compose-intersect.py [-1] INFILE1 [-2] INFILE2') istr1 = hfst.HfstInputStream(infile1) istr2 = hfst.HfstInputStream(infile2) if (istr1.get_type() != istr2.get_type()): raise RuntimeError('Error: transducer types differ in ' + infile1 + ' and ' + infile2) tr1 = istr1.read() if not istr1.is_eof(): raise RuntimeError('Error: ' + infile1 + ' must contain exactly one transducer') istr1.close() transducers = [] while (not istr2.is_eof()): transducers.append(istr2.read()) istr2.close()
import hfst

# Path to the Evenki morphological analyser compiled with apertium.
ANALYSER_PATH = 'c:/Users/user/Compling/apertium-evn-master/evn.automorf.hfst'

# Load the first transducer from the file listed above.
transducer = hfst.HfstInputStream(ANALYSER_PATH).read()

# Get the analyses for one wordform and show them.
result = transducer.lookup("хороки")
print(result)
from manageXML.constants import SPECIFICATION from manageXML.inflector import Inflector from collections import defaultdict from django.conf import settings import os import hfst from uralicNLP import uralicApi import csv from manageXML.utils import * register = template.Library() _inflector = Inflector() transducer_path = os.path.join( settings.BASE_DIR, '../local/transducers/generator-dict-gt-norm.hfstol') if os.path.exists(transducer_path) and os.path.isfile(transducer_path): input_stream = hfst.HfstInputStream(transducer_path) synthetiser = input_stream.read() @register.filter(name='tex_escape') def tex_escape(text): """ :param text: a plain text message :return: the message escaped to appear correctly in LaTeX """ conv = { '&': r'\&', '%': r'\%', '$': r'\$', '#': r'\#', '_': r'\_',
# Demo: the exceptions raised by common hfst operations.

# TransducerIsCyclicException: extracting all paths of a cyclic FST.
transducer = hfst.regex('[a:b]*')
try:
    results = transducer.extract_paths(output='text')
    print("The transducer has %i paths:" % len(results))
    print(results)
except hfst.exceptions.TransducerIsCyclicException:
    # Infinitely many paths; sample a bounded number of cycles instead.
    print("The transducer is cyclic and has an infinite number of paths. Some of them:")
    results = transducer.extract_paths(output='text', max_cycles=5)
    print(results)

# NotTransducerStreamException: opening a plain text file as a stream.
with open('foofile', 'w') as handle:
    handle.write('This is an ordinary text file.\n')
try:
    instr = hfst.HfstInputStream('foofile')
    tr = instr.read()
    print(tr)
    instr.close()
except hfst.exceptions.NotTransducerStreamException:
    print("Could not print transducer: the file does not contain binary transducers.")

# NotValidAttFormatException: an ATT file with a malformed line.
with open('testfile1.att', 'w') as handle:
    handle.write('0 1 a b\n1 2 c\n2\n')
with open('testfile1.att', 'r') as att_file:
    try:
        tr = hfst.read_att_transducer(att_file)
    except hfst.exceptions.NotValidAttFormatException:
        print('Could not read file: it is not in valid ATT format.')
# Command-line interface for guessing lexicon entries from generated forms.
argparser = argparse.ArgumentParser(
    "python3 gyessbygenerating.py",
    description="Guess lexicon entries from generated forms of them")
argparser.add_argument("guesser",
                       help="Guesser file FST",
                       default="ofi-guess-n.fst")
argparser.add_argument("rules", help="name of the two-level rule file")
argparser.add_argument("-v", "--verbosity",
                       default=0,
                       type=int,
                       help="level of diagnostic output")
args = argparser.parse_args()

# Load the guesser transducer and prepare it for fast lookup.
guesser_fil = hfst.HfstInputStream(args.guesser)
guesser_fst = guesser_fil.read()
guesser_fil.close()
#guesser_fst.invert()
guesser_fst.minimize()
guesser_fst.lookup_optimize()

import sys, re
import generate

# Suffix samples per inflection class used when generating test forms.
suf = {"/s": ["", "n", "{nrs}{aä}", "{ij}{Øt}{aä}"]}
print()
# Read one word per line from stdin and look it up in the guesser.
for line_nl in sys.stdin:
    line = line_nl.strip()
    res = guesser_fst.lookup(line, output="tuple")
# hfst-prune-alphabet: prune the alphabet of every transducer read from
# standard input and write the results back out.
import hfst

force = False
from sys import argv
if len(argv) > 3:
    raise RuntimeError(
        'Usage: hfst-prune-alphabet.py [-f|--force] [-S|--safe]')
for option in argv[1:]:
    if option == '-f' or option == '--force':
        force = True
    elif option == '-S' or option == '--safe':
        force = False
    else:
        raise RuntimeError('unknown option: ' + option)

# Read from stdin; mirror the input's implementation type on output.
istr = hfst.HfstInputStream()
ostr = hfst.HfstOutputStream(type=istr.get_type())
while not istr.is_eof():
    # Pruning is a basic-transducer operation, so round-trip the type.
    basic = hfst.HfstBasicTransducer(istr.read())
    basic.prune_alphabet(force)
    hfst.HfstTransducer(basic, istr.get_type()).write(ostr)
ostr.flush()
istr.close()
ostr.close()
argparser.add_argument( "ksk", help="input file, e.g. ~/Dropbox/lang/fin/ksk/ksk-v.dic") argparser.add_argument("fst", help=" conversion fst, e.g. ofi-conv-n.fst") argparser.add_argument("name", help="name of the lexicon for entries created") argparser.add_argument("lexc", help="output file, e.g. ofi-words-n.lexc") argparser.add_argument("-c", "--codes", help="infl-codes.text") argparser.add_argument("-v", "--verbosity", type=int, default=0, help="level of diagnostic info printed") args = argparser.parse_args() ksk_file = open(args.ksk, "r") fstfile = hfst.HfstInputStream(args.fst) fst = fstfile.read() fst.lookup_optimize() outf = open(args.lexc, "w") infl_set = set(open(args.codes).read().split()) if args.verbosity >= 5: print("infl_set =", infl_set) ### entrylist = [] multiharacters = set() def find_multichars(str):
type_ = hfst.ImplementationType.TROPICAL_OPENFST_TYPE ostr = hfst.HfstOutputStream(filename='foobar.hfst', type=type_) tr_ = hfst.regex('{foo}:{bar}::0.5') tr_.convert(type_) ostr.write(tr_) ostr.write(tr_) ostr.flush() ostr.close() if not os.path.isfile('foobar.hfst'): raise RuntimeError('Missing file: foobar.hfst') istr = hfst.HfstInputStream('foobar.hfst') numtr = 0 try: tr1 = istr.read() numtr += 1 tr2 = istr.read() numtr += 1 tr3 = istr.read() numtr += 1 except hfst.exceptions.EndOfStreamException: pass except: raise RuntimeError(get_linenumber()) istr.close() if numtr != 2: