def lookup(self):
    # getting analysis data used in test
    analysis_stream = libhfst.HfstInputStream(self.morph_path).read()
    for section in self.sections:
        for test in section.tests:
            for result in analysis_stream.lookup(test.right):
                test.ana_result.append(result[0])
    # getting generation data used in test
    generation_stream = libhfst.HfstInputStream(self.gen_path).read()
    for section in self.sections:
        for test in section.tests:
            for result in generation_stream.lookup(test.left):
                test.gen_result.append(result[0])
def get_transducer(fsa):
    istr = libhfst.HfstInputStream(fsa)
    transducers = []
    while not istr.is_eof():
        transducers.append(istr.read())
    istr.close()
    return transducers[0]
def getTransducer(fsa):
    """Return the first transducer read from a finite-state automaton file."""
    istr = libhfst.HfstInputStream(fsa)
    transducers = []
    while not istr.is_eof():
        transducers.append(istr.read())
    istr.close()
    return transducers[0]
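# Hypothetical usage sketch for the loaders above. The file name and the input
# word are assumptions, not part of the original examples; they only illustrate
# that lookup() on the loaded transducer returns (output_string, weight) pairs,
# as the other snippets in this collection rely on.
analyser = getTransducer('english.analyser.hfst')  # assumed .hfst file
for output, weight in analyser.lookup('dogs'):      # assumed input form
    print(output, weight)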
def load_hfst(self, f):
    """Load an automaton from file.

    @param f  file containing a single hfst automaton binary.
    """
    try:
        his = libhfst.HfstInputStream(f)
        return his.read()
    except libhfst.NotTransducerStreamException:
        raise IOError
def generate(parse_str):
    '''
    Generate a word given a parser output (lemma + tags).
    '''
    global generator
    s = parse_str.replace('#', '+Use/NoHyphens#')
    if not generator:
        generator = libhfst.HfstInputStream(generator_file).read()
    g = generator.lookup(s, output='tuple')
    try:
        g = g[0][0]
        return g
    except IndexError:
        return s.split('+')[0]
def load_analyser(filename):
    """Load an automaton from file.

    Args:
        filename: path to a file containing a single hfst automaton binary.

    Throws:
        FileNotFoundError if file is not found
    """
    try:
        his = libhfst.HfstInputStream(filename)
        return his.read()
    except libhfst.NotTransducerStreamException:
        raise IOError(2, filename) from None
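# Hypothetical usage sketch for load_analyser. The file name and the Finnish
# surface form are illustrative assumptions; result[0] is the analysis string
# and result[1] its weight, matching the lookup patterns used elsewhere here.
try:
    analyser = load_analyser('omorfi.analyse.hfst')  # assumed file name
except IOError:
    analyser = None
if analyser is not None:
    for analysis, weight in analyser.lookup('talossa'):  # assumed input
        print(analysis, weight)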
def loadTransducer():
    try:
        istr = libhfst.HfstInputStream(r"italian_verb_analyzer.hfst")
        transducers = []
        while not istr.is_eof():
            transducers.append(istr.read())
        istr.close()
        td = hfst.HfstBasicTransducer(transducers[1])
        return td
    except:
        print('Transducer file was invalid or not found.')
        time.sleep(3)
        exit()
        freq_rel_feat[feat] = {}
        freq_rel_feat[feat][rel] = int(freq)
        if ':' in rel:
            rel = rel.split(':')[0]
            freq_rel_feat[feat][rel] = int(freq)
    return freq_rel_feat

###############################################################################

if len(sys.argv) < 3: #{
    print('conllu-morph.py <fst> <tsv>')
    sys.exit(-1)
#}

istr = libhfst.HfstInputStream(sys.argv[1])
morf = istr.read()
#morf.remove_epsilons();

af = open(sys.argv[2])
apertium_symbs = read_rules(af)

freq_rel_feat = {}
if len(sys.argv) == 4:
    freq_rel_feat = read_rel_feat(sys.argv[3])
#print(freq_rel_feat, file=sys.stderr)
#print(apertium_symbs);

unknown = 0
    results = transducer.extract_paths(output='text')
    print("The transducer has %i paths:" % len(results))
    print(results)
except libhfst.TransducerIsCyclicException:
    print("The transducer is cyclic and has an infinite number of paths. Some of them:")
    results = transducer.extract_paths(output='text', max_cycles=5)
    print(results)

# NotTransducerStreamException
f = open('foofile', 'w')
f.write('This is an ordinary text file.\n')
f.close()
try:
    instr = libhfst.HfstInputStream('foofile')
    tr = instr.read()
    print(tr)
    instr.close()
except libhfst.NotTransducerStreamException:
    print("Could not print transducer: the file does not contain binary transducers.")

f = open('testfile1.att', 'w')
f.write('0 1 a b\n\
1 2 c\n\
2\n')
f.close()
f = libhfst.hfst_open('testfile1.att', 'r')
try:
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

ttype = 0
if sys.argv[1] == 'sfst':
    ttype = libhfst.SFST_TYPE
elif sys.argv[1] == 'openfst':
    ttype = libhfst.TROPICAL_OPENFST_TYPE
elif sys.argv[1] == 'foma':
    ttype = libhfst.FOMA_TYPE
else:
    print("ERROR: could not parse transducer format argument.")
    sys.exit(1)

transducers_in_stream = int(sys.argv[2])

istr = libhfst.HfstInputStream()
ostr = libhfst.HfstOutputStream(ttype)

transducers_read = 0
transducers_written = 0

while True:
    try:
        tr = libhfst.HfstTransducer(istr)
        transducers_read += 1
        ostr.redirect(tr)
        transducers_written += 1
    except:  # libhfst.EndOfStreamException:
        assert(libhfst.hfst_get_exception() == "EndOfStreamException")
        break

if transducers_read != transducers_in_stream:
    print("ERROR: wrong number of transducers read")
def __init__(self, lang, analyzer_file, **kwargs):
    self.language = lang
    self.analyzer = libhfst.HfstInputStream(analyzer_file).read()
def load_filename(self, path, **include):
    """Load omorfi automaton from filename and guess its use.

    A file name should consist of three parts separated by full stop.
    The second part must be a keyword describing the use of the automaton,
    first part is parsed as an identifier typically starting with the word
    omorfi, followed by any extras, such as the tagset for analysis or
    generation.

    The named arguments can include a name of automaton type as name, and
    truth value as value, for types of automata allowed to load. By default,
    the names `analyse`, `generate` and `segment` are loaded. Names not
    included are defaulted to False. E.g.,
    `omorfi.load_filename(fn, analyse=True)` will only load file named fn if
    it can be identified as omorfi analyser. This is best used in conjunction
    with omorfi.load_from_dir.
    """
    if len(include) == 0:
        include['analyse'] = True
        include['generate'] = True
        include['segment'] = True
        include['accept'] = True
    for ttype in ['analyse', 'generate', 'accept', 'tokenise', 'lemmatise',
                  'hyphenate', 'segment', 'labelsegment']:
        if ttype not in include:
            include[ttype] = False
    his = None
    if self._verbosity:
        print('Opening file', path)
    if access(path, F_OK):
        his = libhfst.HfstInputStream(path)
    else:
        # FIXME: should fail
        if self._verbosity:
            print('No access to ', path, file=stderr)
        pass
    parts = path[path.rfind('/') + 1:path.rfind('.')].split('.')
    if len(parts) != 2:
        if self._verbosity:
            print('not loaded', path)
    elif not parts[0].startswith('omorfi'):
        if self._verbosity:
            print('not omorfi', path)
    elif parts[1] == 'analyse' and include['analyse']:
        if self._verbosity:
            print('analyser', parts[0])
        self.analysers[parts[0]] = his.read()
    elif parts[1] == 'generate' and include['generate']:
        if self._verbosity:
            print('generator', parts[0])
        self.generators[parts[0]] = his.read()
    elif parts[1] == 'accept' and include['accept']:
        if self._verbosity:
            print('acceptor', parts[0])
        self.acceptors[parts[0]] = his.read()
    elif parts[1] == 'tokenise' and include['tokenise']:
        if self._verbosity:
            print('tokeniser', parts[0])
        self.tokenisers[parts[0]] = his.read()
    elif parts[1] == 'lemmatise' and include['lemmatise']:
        if self._verbosity:
            print('lemmatiser', parts[0])
        self.lemmatisers[parts[0]] = his.read()
    elif parts[1] == 'hyphenate' and include['hyphenate']:
        if self._verbosity:
            print('hyphenator', parts[0])
        self.hyphenators[parts[0]] = his.read()
    elif parts[1] == 'segment' and include['segment']:
        if self._verbosity:
            print('segmenter', parts[0])
        self.segmenters[parts[0]] = his.read()
    elif parts[1] == 'labelsegment' and include['labelsegment']:
        if self._verbosity:
            print('labelsegmenter', parts[0])
        self.labelsegmenters[parts[0]] = his.read()
    elif self._verbosity:
        print('skipped', parts)
print("The transducer has {0} paths".format(len(results))) assert (False) except: # libhfst.TransducerIsCyclicException: print("The transducer is cyclic and has an infinite number of paths.") # The stream does not contain transducers. # ---------------------------------------- print("NotTransducerStreamException") foofile = open('foofile', 'wb') foofile.write('This is a text file.\n'.encode('ascii')) foofile.write('Here is another line.\n'.encode('ascii')) foofile.write('The file ends here.'.encode('ascii')) foofile.close() try: instr = libhfst.HfstInputStream("foofile") except: # libhfst.NotTransducerStreamException: print("file does not contain transducers.") # The stream is not in valid AT&T format. # --------------------------------------- print("NotValidAttFormatException") testfile_att = open("testfile.att", "wb") testfile_att.write('0 1 a b\n'.encode('ascii')) testfile_att.write('1\n'.encode('ascii')) testfile_att.write('c\n'.encode('ascii')) testfile_att.close() for type in types: transducers = []
def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   required=True, dest="outfile", help="log file name")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar='THOLD', default=99, type=int,
                   help="require THOLD %% coverage or exit 1 (for testing)")
    options = a.parse_args()
    his = libhfst.HfstInputStream(options.fsa)
    omorfi = his.read()
    # libhfst.HfstTransducer(libhfst.HfstInputStream(options.fsa))
    # statistics
    tokens = 0
    uniqs = 0
    found_tokens = 0
    found_uniqs = 0
    missed_tokens = 0
    missed_uniqs = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        tokens += freq
        uniqs += 1
        if options.verbose:
            print(tokens, "(", freq, ')...', end='\r')
        anals = omorfi.lookup(surf)
        if surf[0].isupper():
            anals += omorfi.lookup(surf[0].lower() + surf[1:])
        if surf.isupper():
            anals += omorfi.lookup(surf.lower())
        if surf.isupper():
            anals += omorfi.lookup(surf[0] + surf[1:].lower())
        if len(anals) > 0:
            found_tokens += freq
            found_uniqs += 1
        else:
            missed_tokens += freq
            missed_uniqs += 1
            print(freq, surf, "? (missed)", sep="\t", file=options.outfile)
    if options.verbose:
        print()
    cpuend = process_time()
    realend = perf_counter()
    print("cpu time: ", cpuend - cpustart, "real time:", realend - realstart)
    print("Tokens", "Matches", "Misses", "%", sep="\t")
    print(tokens, found_tokens, missed_tokens,
          found_tokens / tokens * 100 if tokens != 0 else 0, sep="\t")
    print("Uniqs", "Matches", "Misses", "%", sep="\t")
    print(uniqs, found_uniqs, missed_uniqs,
          found_uniqs / uniqs * 100 if uniqs != 0 else 0, sep="\t")
    if tokens == 0 or (found_tokens / tokens * 100 < options.threshold):
        print("needs to have", options.threshold,
              "% non-unique matches to pass regress test\n", file=stderr)
        exit(1)
    else:
        exit(0)
if len(sys.argv) <= 1: #{
    print('test.py <lang code>');
    sys.exit(-1);
#}
lang = sys.argv[1];

testf = [];
if len(sys.argv) == 3: #{
    testf = [sys.argv[2]];
else:
    testf = glob.glob('*.tsv');
#}

istr1 = libhfst.HfstInputStream('../'+lang+'.automorf.hfst');
anal = istr1.read();
#anal.remove_epsilons();

istr2 = libhfst.HfstInputStream('../'+lang+'.autogen.hfst');
gene = istr2.read();
#gene.remove_epsilons();

print(testf);

err_g = 0;
corr_g = 0;
total_g = 0;

err_a = 0;
corr_a = 0;
total_a = 0;
'''This is a demo python script to show how you might do lookup through
libhfst, in this case using an omorfi installation.'''
# NOTE: written for Python 2 (raw_input, print statement, itertools.ifilterfalse).

import os, sys
from itertools import ifilterfalse as ffilter
import libhfst

datadir = "/usr/local/share/hfst/fi"
omorfipath = os.path.abspath(datadir + "/morphology.finntreebank.hfstol")

def process_result_vector(vector):
    results = []
    for entry in vector:
        if len(entry) < 2:
            continue
        weight = entry[0]
        # drop flag-diacritic symbols from the output string
        string = ''.join(ffilter(libhfst.FdOperation.is_diacritic, entry[1]))
        results.append((string, weight))
    return results

istr = libhfst.HfstInputStream(omorfipath)
transducer = libhfst.HfstTransducer(istr)

input = raw_input()
while input:
    results = process_result_vector(libhfst.vectorize(transducer.lookup_fd(input)))
    for result in results:
        print result[0] + '\t' + str(result[1])
    try:
        input = raw_input()
    except EOFError:
        sys.exit()
    return cf.f_back.f_lineno

for type in (libhfst.TROPICAL_OPENFST_TYPE, libhfst.FOMA_TYPE):

    print('\n--- Testing implementation type %s ---\n' % libhfst.fst_type_to_string(type))

    libhfst.set_default_fst_type(type)

    tr1 = None
    tr2 = None
    tr3 = None

    if not os.path.isfile('foobar.hfst'):
        raise RuntimeError('Missing file: foobar.hfst')

    istr = libhfst.HfstInputStream('foobar.hfst')
    numtr = 0
    try:
        tr1 = istr.read()
        numtr += 1
        tr2 = istr.read()
        numtr += 1
        tr3 = istr.read()
        numtr += 1
    except libhfst.EndOfStreamException:
        pass
    except:
        raise RuntimeError(get_linenumber())
    istr.close()

    if numtr != 2:
    #os.remove('foo.hfst')
    pass

for ttype in (libhfst.SFST_TYPE, libhfst.TROPICAL_OPENFST_TYPE, libhfst.FOMA_TYPE):

    tr1 = libhfst.HfstTransducer('a', 'b', ttype)
    tr2 = libhfst.HfstTransducer('c', 'd', ttype)

    ostr = libhfst.HfstOutputStream('foo.hfst', tr1.get_type())
    ostr.redirect(tr1)
    ostr.redirect(tr2)
    ostr.close()

    att_file = libhfst.hfst_open('foo.att', 'w')

    istr = libhfst.HfstInputStream('foo.hfst')
    transducers_read = 0
    while True:
        try:
            tr = libhfst.HfstTransducer(istr)
            transducers_read += 1
            if transducers_read == 1:
                if not tr.compare(tr1):
                    print("ERROR: transducer 1 changed.")
                    remove_generated_files()
                    sys.exit(1)
            if transducers_read == 2:
                if not tr.compare(tr2):
                    print("ERROR: transducer 2 changed.")
                    remove_generated_files()
def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", required=True,
                   type=FileType('w'), dest="outfile", help="result file")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-a', '--additional-mapping', default="", metavar="MAP",
                   help="Also try using MAP to match analyses and lemmas",
                   choices=["ftb3.1", ""])
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher than FREQ")
    options = a.parse_args()
    his = libhfst.HfstInputStream(options.fsa)
    omorfi = his.read()
    if not options.statfile:
        options.statfile = stdout
    # basic statistics
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # known bugs by type (FTB 3.1)
    deduct_forgn = 0
    deduct_advposman = 0
    deduct_oliprt = 0
    deduct_abbr_prop = 0
    deduct_unkwn = 0
    # known bugs by statistic to deduct (all)
    deduct_lemma = 0
    deduct_anal = 0
    deduct_matches = 0
    deduct_results = 0
    # for make check target
    threshold = 90
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 4:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        lemma = fields[2]
        analysis = fields[3]
        lines += freq
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        anals = omorfi.lookup(surf)
        if not options.no_casing:
            if surf[0].isupper():
                anals += omorfi.lookup(surf[0].lower() + surf[1:])
            if surf.isupper():
                anals += omorfi.lookup(surf.lower())
            if surf.isupper():
                anals += omorfi.lookup(surf[0] + surf[1:].lower())
        found_anals = False
        found_lemma = False
        print_in = True
        for anal in anals:
            if analysis in anal[0]:
                found_anals = True
            if lemma in anal[0]:
                found_lemma = True
            if not options.no_casing:
                if lemma.lower() in anal[0]:
                    found_lemma = True
                elif lemma.upper() in anal[0]:
                    found_lemma = True
        if len(anals) == 0:
            print_in = False
            no_results += freq
            if options.additional_mapping == "ftb3.1":
                if 'Forgn' in analysis:
                    deduct_forgn += freq
                    deduct_results += freq
                    print_in = False
                elif 'Unkwn' in analysis:
                    deduct_unkwn += freq
                    deduct_results += freq
                    print_in = False
                else:
                    print("NORESULTS:", freq, surf, lemma, anals,
                          sep="\t", file=options.outfile)
                    if options.verbose:
                        print("?", end='', file=stderr)
            else:
                print("NORESULTS:", freq, surf, lemma, anals,
                      sep="\t", file=options.outfile)
                if options.verbose:
                    print("?", end='', file=stderr)
        elif not found_anals and not found_lemma:
            no_matches += freq
            if options.additional_mapping == "ftb3.1":
                if 'Adv Pos Man' in analysis:
                    deduct_advposman += freq
                    deduct_matches += freq
                    print_in = False
                elif 'Unkwn' in analysis:
                    deduct_unkwn += 1
                    deduct_matches += 1
                    print_in = False
                else:
                    print("NOMATCH:", freq, surf, lemma + " " + analysis,
                          sep="\t", end="\t", file=options.outfile)
                    if options.verbose:
                        print("!", end='', file=stderr)
            else:
                print("NOMATCH:", freq, surf, lemma + " " + analysis,
                      sep="\t", end="\t", file=options.outfile)
                if options.verbose:
                    print("!", end='', file=stderr)
        elif not found_anals:
            lemma_matches += freq
            if options.additional_mapping == "ftb3.1":
                if 'Adv Pos Man' in analysis:
                    deduct_advposman += freq
                    deduct_lemma += freq
                    print_in = False
                elif 'V Prt Act' in analysis and surf.startswith('oli'):
                    deduct_oliprt += freq
                    deduct_lemma += freq
                    print_in = False
                elif 'Forgn' in analysis:
                    deduct_forgn += freq
                    deduct_lemma += freq
                    print_in = False
                elif 'Abbr' in analysis:
                    propfail = False
                    for anal in anals:
                        if 'Abbr Prop' in anal[0]:
                            propfail = True
                    if propfail:
                        deduct_abbr_prop += freq
                        deduct_lemma += freq
                        print_in = False
                    else:
                        print("NOANALMATCH:", freq, surf, analysis,
                              sep="\t", end="\t", file=options.outfile)
                        if options.verbose:
                            print("@", end='', file=stderr)
                elif 'Unkwn' in analysis:
                    deduct_unkwn += freq
                    deduct_lemma += freq
                    print_in = False
                else:
                    if options.verbose:
                        print("@", end='', file=stderr)
                    print("NOANALMATCH:", freq, surf, analysis,
                          sep="\t", end="\t", file=options.outfile)
            else:
                if options.verbose:
                    print("@", end='', file=stderr)
                print("NOANALMATCH:", freq, surf, analysis,
                      sep="\t", end="\t", file=options.outfile)
        elif not found_lemma:
            anal_matches += freq
            print("NOLEMMAMATCH:", freq, surf, lemma,
                  sep="\t", end="\t", file=options.outfile)
            if options.verbose:
                print("#", end='', file=stderr)
        else:
            if options.verbose:
                print(".", end='', file=stderr)
            full_matches += freq
            print_in = False
        if print_in:
            print(":IN:", end="\t", file=options.outfile)
            for anal in anals:
                print(anal[0], end='\t', file=options.outfile)
            print(file=options.outfile)
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
          sep="\t", file=options.statfile)
    print(lines, full_matches, lemma_matches, anal_matches, no_matches,
          no_results, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          full_matches / lines * 100 if lines != 0 else 0,
          lemma_matches / lines * 100 if lines != 0 else 0,
          anal_matches / lines * 100 if lines != 0 else 0,
          no_matches / lines * 100 if lines != 0 else 0,
          no_results / lines * 100 if lines != 0 else 0,
          sep="\t", file=options.statfile)
    if options.additional_mapping == "ftb3.1":
        print("Deducting known bugs...\n",
              "Forgn:", deduct_forgn,
              "\nAdv Pos Man:", deduct_advposman,
              "\noli V Prt Act:", deduct_oliprt,
              "\nAbbr Prop:", deduct_abbr_prop,
              "\nUnkwn:", deduct_unkwn, file=options.statfile)
        lines = (lines - deduct_forgn - deduct_advposman - deduct_oliprt -
                 deduct_abbr_prop - deduct_unkwn)
        no_results -= deduct_results
        no_matches -= deduct_matches
        lemma_matches -= deduct_lemma
    if options.additional_mapping != '':
        print(lines, full_matches, lemma_matches, anal_matches, no_matches,
              no_results, sep="\t", file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t", file=options.statfile)
    if lines == 0 or (full_matches / lines * 100 < threshold):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
    tot = 0
    for m, o in odistr:
        outputs[i].append(o)
        tot += m
        if m >= TH:
            break

outputs['#'].append('#')
outputs['_#_'].append('_#_')

out = libhfst.create_hfst_output_stream("", libhfst.TROPICAL_OPENFST_TYPE, 1)

ustr_model = libhfst.HfstInputStream(argv[2]).read()
str_model = libhfst.HfstInputStream(argv[3]).read()

for i, line in enumerate(imap(lambda x: x.strip(), stdin)):
    stderr.write("LINE: %u\r" % i)
    expr = ''
    if line == '':
        continue
    chars = ('_#_ _#_ # ' + line.replace('0', '"0"') + ' # _#_ _#_').split(' ')
    for char in chars:
        expr += ('%s [%s] £ ' % (escape(char), '|'.join([escape(c) for c in outputs[char]])))
    re = libhfst.regex(expr)
    re.compose(ustr_model)
from sys import stdin, argv
from itertools import imap

import libhfst

dictionary = libhfst.HfstInputStream(argv[1]).read()

first_candidate = None
dict_candidate = None

for line in imap(lambda x: x.strip(), stdin):
    if line == '<SEP>':
        if dict_candidate != None:
            print dict_candidate
        else:
            print first_candidate
        dict_candidate = None
        first_candidate = None
    else:
        if first_candidate == None:
            first_candidate = line
        if dict_candidate == None and len(dictionary.lookup(line)) != 0:
            dict_candidate = line

if dict_candidate != None:
    print dict_candidate
elif first_candidate != None:
    print first_candidate
def __init__(self, filename):
    self.istr = libhfst.HfstInputStream(filename)
    self.transducer = libhfst.HfstTransducer(self.istr)
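# A minimal self-contained sketch of the same pattern as the constructor above,
# assuming 'analyser.hfst' holds a binary transducer (the file name and the
# looked-up word are assumptions): read one transducer from the stream, close
# the stream, then use lookup() on the transducer.
import libhfst

istr = libhfst.HfstInputStream('analyser.hfst')
transducer = libhfst.HfstTransducer(istr)
istr.close()
for output, weight in transducer.lookup('example'):
    print(output, weight)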