def _analyse(this, token, automaton):
    res = libhfst.detokenize_paths(this.analysers[automaton].lookup_fd(token))
    if len(token) > 2 and token[0].islower() and not token[1:].islower() and this.can_titlecase:
        tcres = libhfst.detokenize_paths(this.analysers[automaton].lookup_fd(
            token[0].upper() + token[1:].lower()))
        for r in tcres:
            r.output = r.output + convert_omor_tag('[CASECHANGE=TITLECASED]', automaton)
        res = res + tcres
    if not token.isupper() and this.can_uppercase:
        upres = libhfst.detokenize_paths(this.analysers[automaton].lookup_fd(token.upper()))
        for r in upres:
            r.output = r.output + convert_omor_tag('[CASECHANGE=UPPERCASED]', automaton)
        res = res + upres
    if not token.islower() and this.can_lowercase:
        lowres = libhfst.detokenize_paths(this.analysers[automaton].lookup_fd(token.lower()))
        for r in lowres:
            r.output = r.output + convert_omor_tag('[CASECHANGE=LOWERCASED]', automaton)
        res = res + lowres
    for r in res:
        r.output = r.output + convert_omor_tag('[WEIGHT=%f]' % (r.weight), automaton)
    return res
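# A minimal, libhfst-free sketch of the case-change cascade above: which
# extra surface forms _analyse would look up for a token, and the
# CASECHANGE tag each variant would get. The helper name case_variants
# is hypothetical, for illustration only.
def case_variants(token):
    variants = []
    if len(token) > 2 and token[0].islower() and not token[1:].islower():
        variants.append((token[0].upper() + token[1:].lower(), 'TITLECASED'))
    if not token.isupper():
        variants.append((token.upper(), 'UPPERCASED'))
    if not token.islower():
        variants.append((token.lower(), 'LOWERCASED'))
    return variants

print(case_variants('iPhone'))
# [('Iphone', 'TITLECASED'), ('IPHONE', 'UPPERCASED'), ('iphone', 'LOWERCASED')]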
import libhfst

tr = libhfst.HfstTransducer('a', 'b', libhfst.TROPICAL_OPENFST_TYPE)
paths = libhfst.extract_paths(tr)
for path in libhfst.detokenize_paths(paths):
    print("{0}:{1} {2}".format(path.input, path.output, path.weight))

tr = libhfst.HfstTransducer('a', 'b', libhfst.TROPICAL_OPENFST_TYPE)
tr.convert(libhfst.HFST_OLW_TYPE)
for path in libhfst.detokenize_paths(tr.lookup("a")):
    print("{0} {1}".format(path.output, path.weight))
try:
    out.close()
    out.redirect(tr)
    assert(False)
except:  # libhfst.StreamIsClosedException
    assert(libhfst.hfst_get_exception() == "StreamIsClosedException")
    pass

# Transducer is cyclic.
# ---------------------
print("TransducerIsCyclicException")
for type in types:
    transducer = libhfst.HfstTransducer("a", "b", type)
    transducer.repeat_star()
    try:
        results = libhfst.detokenize_paths(libhfst.extract_paths(transducer))
        print("The transducer has {0} paths".format(len(results)))
        assert(False)
    except:  # libhfst.TransducerIsCyclicException
        print("The transducer is cyclic and has an infinite number of paths.")

# The stream does not contain transducers.
# ----------------------------------------
print("NotTransducerStreamException")
foofile = open('foofile', 'wb')
foofile.write('This is a text file.\n'.encode('ascii'))
foofile.write('Here is another line.\n'.encode('ascii'))
foofile.write('The file ends here.'.encode('ascii'))
foofile.close()
try:
    instream = libhfst.HfstInputStream('foofile')
    assert(False)
except:  # libhfst.NotTransducerStreamException
    assert(libhfst.hfst_get_exception() == "NotTransducerStreamException")
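# The bare except: clauses above reflect the SWIG-era bindings, where
# HFST's C++ exceptions are not mapped onto distinct Python exception
# classes, so the pending exception has to be identified by name instead.
# A sketch of the cyclic-transducer check written defensively in that
# style, using only calls already present above:
try:
    results = libhfst.detokenize_paths(libhfst.extract_paths(transducer))
except:
    if libhfst.hfst_get_exception() == "TransducerIsCyclicException":
        print("The transducer is cyclic and has an infinite number of paths.")
    else:
        raise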
from argparse import ArgumentParser, FileType
from sys import stderr, stdin, stdout

import libhfst


def main():
    a = ArgumentParser(
        description="Tokeniser for plain text data using HFST automata. "
        "Takes a text stream as input and outputs a TSV token stream where "
        "one line is one token. Tokens should include white-space tokens, "
        "but this decision is solely up to the output of the automata used. "
        "Some automata may be able to parse non-plain marked-up text.",
        epilog="If INFILE or OFILE is omitted, standard streams will be used.\n"
        "If DISAMB is omitted, greedy LRLM will be used.")
    a.add_argument('inputs', metavar='INFILE', type=open, nargs='*',
                   help="files to process with the corpus tool")
    a.add_argument('--output', '-o', metavar='OFILE', type=FileType('w'),
                   help="store result in OFILE")
    a.add_argument('--tokeniser', '-t', action='append', metavar='TFILE',
                   help="pre-process input stream with automata from TFILE")
    a.add_argument('--disambiguation', '-d', metavar='DISAMB',
                   choices=['LRLM'], default='LRLM',
                   help="use DISAMB tactic to select from multiple paths")
    a.add_argument('--verbose', '-v', action='store_true',
                   help="print verbosely while processing")
    opts = a.parse_args()
    tokenisers = list()
    if not opts.output:
        if opts.verbose:
            print("printing output to stdout, disabling verbose", file=stderr)
            opts.verbose = False
        opts.output = stdout
    if not opts.tokeniser:
        if opts.verbose:
            print("Using Unicode tokeniser with character classes")
        tokeniserstream = libhfst.HfstInputStream("tokeniser-unicode.openfst.hfst")
        t = libhfst.HfstTransducer(tokeniserstream)
        tokenisers.append(t)
    else:
        for tokeniserfile in opts.tokeniser:
            if opts.verbose:
                print("Reading from", tokeniserfile)
            tokeniserstream = libhfst.HfstInputStream(tokeniserfile)
            t = libhfst.HfstTransducer(tokeniserstream)
            if opts.verbose:
                print("Read tokeniser", t.get_property('name'))
            tokenisers.append(t)
    if len(opts.inputs) < 1:
        if opts.verbose:
            print("Reading corpus data from <stdin>")
        opts.inputs = [stdin]
    if opts.verbose:
        print("Creating UTF-8 character tokeniser for HFST")
    hfst_tokeniser = libhfst.HfstTokenizer()
    for inputfile in opts.inputs:
        print("# hfst-tokenise.py TSV token stream 1", file=opts.output)
        print("# From input file", inputfile.name, file=opts.output)
        print("# Next line is a header line", file=opts.output)
        print("Token", file=opts.output)
        for line in inputfile:
            line = line.strip('\n')
            if not line:
                print('\\n', file=opts.output)
                continue
            could_tokenise = False
            for tokeniser in tokenisers:
                if tokeniser.get_type() == libhfst.TROPICAL_OPENFST_TYPE:
                    pathmaton = libhfst.HfstTransducer(line, hfst_tokeniser,
                                                       libhfst.TROPICAL_OPENFST_TYPE)
                    tokenisation = libhfst.extract_paths_fd(pathmaton.compose(tokeniser))
                    paths = libhfst.detokenize_paths(tokenisation)
                    tokens = None
                    if opts.disambiguation == 'LRLM':
                        tokens = take_greedy_lrlm_tokens(paths)
                    else:
                        print("What is this DISAMB?", opts.disambiguation, file=stderr)
                    if tokens:
                        for token in tokens:
                            print(token.replace('@_EPSILON_SYMBOL_@', ''), file=opts.output)
                        could_tokenise = True
                        break
                    else:
                        if opts.verbose:
                            print("Got no tokens using", opts.disambiguation)
                else:
                    print("Tokenisation not implemented for non-OpenFst transducer types",
                          file=stderr)
                    exit(2)
            if not could_tokenise:
                for token in line.split():
                    print(token, file=opts.output)
            print("\\n", file=opts.output)
    exit(0)
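# take_greedy_lrlm_tokens() is referenced above but not defined in this
# excerpt. As a rough illustration of the greedy left-to-right
# longest-match (LRLM) tactic it names, here over a plain set of known
# tokens rather than over HFST paths; this function is an assumption,
# not the real implementation:
def greedy_lrlm(text, lexicon):
    tokens = []
    i = 0
    while i < len(text):
        for j in range(len(text), i, -1):  # try the longest span first
            if text[i:j] in lexicon:
                tokens.append(text[i:j])
                i = j
                break
        else:
            tokens.append(text[i])  # unknown character becomes its own token
            i += 1
    return tokens

print(greedy_lrlm("foobar", {"foo", "foob", "ar", "bar"}))  # ['foob', 'ar']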
from argparse import ArgumentParser, FileType
from sys import stderr, stdout

import libhfst


def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST optimised-lookup binary data for the transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", required=True,
                   type=FileType('w'), dest="outfile", help="result file")
    a.add_argument('-X', '--statistics', metavar="STATFILE", type=FileType('w'),
                   dest="statfile", help="statistics")
    options = a.parse_args()
    omorfi = libhfst.HfstTransducer(libhfst.HfstInputStream(options.fsa))
    if not options.statfile:
        options.statfile = stdout
    # basic statistics
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # known bugs by type
    deduct_forgn = 0
    deduct_advposman = 0
    deduct_oliprt = 0
    deduct_abbr_prop = 0
    # known bugs by statistic to deduct
    deduct_lemma = 0
    deduct_anal = 0
    deduct_matches = 0
    deduct_results = 0
    # for make check target
    threshold = 90
    for line in options.infile:
        conllxes = line.split('\t')
        if len(conllxes) < 10:
            if not line.startswith("<"):
                print("ERROR: Skipping line", line, file=stderr)
            continue
        lines += 1
        ftbsurf = conllxes[1]
        ftblemma = conllxes[2]
        ftbanals = conllxes[5]
        anals = libhfst.detokenize_paths(omorfi.lookup_fd(ftbsurf))
        if ftbsurf[0].isupper():
            anals += libhfst.detokenize_paths(
                omorfi.lookup_fd(ftbsurf[0].lower() + ftbsurf[1:]))
        if ftbsurf.isupper():
            anals += libhfst.detokenize_paths(omorfi.lookup_fd(ftbsurf.lower()))
        if ftbsurf.isupper():
            anals += libhfst.detokenize_paths(
                omorfi.lookup_fd(ftbsurf[0] + ftbsurf[1:].lower()))
        found_anals = False
        found_lemma = False
        print_in = True
        for anal in anals:
            if ftbanals in anal.output:
                found_anals = True
            if ftblemma in anal.output:
                found_lemma = True
        if len(anals) == 0:
            print_in = False
            no_results += 1
            if 'Forgn' in ftbanals:
                deduct_forgn += 1
                deduct_results += 1
            else:
                print("NORESULTS:", ftbsurf, ftblemma, ftbanals,
                      sep="\t", file=options.outfile)
        elif not found_anals and not found_lemma:
            no_matches += 1
            if 'Adv Pos Man' in ftbanals:
                deduct_advposman += 1
                deduct_matches += 1
                print_in = False
            else:
                print("NOMATCH:", ftbsurf, ftblemma, ftbanals,
                      sep="\t", end="\t", file=options.outfile)
        elif not found_anals:
            lemma_matches += 1
            if 'Adv Pos Man' in ftbanals:
                deduct_advposman += 1
                deduct_lemma += 1
                print_in = False
            elif 'V Prt Act' in ftbanals and ftbsurf.startswith('oli'):
                deduct_oliprt += 1
                deduct_lemma += 1
                print_in = False
            elif 'Forgn' in ftbanals:
                deduct_forgn += 1
                deduct_lemma += 1
                print_in = False
            elif 'Abbr' in ftbanals:
                propfail = False
                for anal in anals:
                    if 'Abbr Prop' in anal.output:
                        propfail = True
                if propfail:
                    deduct_abbr_prop += 1
                    deduct_lemma += 1
                    print_in = False
                else:
                    print("NOANALMATCH:", ftbsurf, ftbanals,
                          sep="\t", end="\t", file=options.outfile)
            else:
                print("NOANALMATCH:", ftbsurf, ftbanals,
                      sep="\t", end="\t", file=options.outfile)
        elif not found_lemma:
            anal_matches += 1
            print("NOLEMMAMATCH:", ftbsurf, ftblemma,
                  sep="\t", end="\t", file=options.outfile)
        else:
            full_matches += 1
            print_in = False
        if print_in:
            print(":IN:", end="\t", file=options.outfile)
            for anal in anals:
                print(anal.output, end='\t', file=options.outfile)
            print(file=options.outfile)
    print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
          sep="\t", file=options.statfile)
    print(lines, full_matches, lemma_matches, anal_matches,
          no_matches, no_results, sep="\t", file=options.statfile)
    print(lines / lines * 100, full_matches / lines * 100,
          lemma_matches / lines * 100, anal_matches / lines * 100,
          no_matches / lines * 100, no_results / lines * 100,
          sep="\t", file=options.statfile)
    print("Deducting known bugs...\n",
          "Forgn:", deduct_forgn,
          "\nAdv Pos Man:", deduct_advposman,
          "\noli V Prt Act:", deduct_oliprt,
          "\nAbbr Prop:", deduct_abbr_prop,
          file=options.statfile)
    lines = lines - deduct_forgn - deduct_advposman - deduct_oliprt - deduct_abbr_prop
    no_results -= deduct_results
    no_matches -= deduct_matches
    lemma_matches -= deduct_lemma
    print(lines, full_matches, lemma_matches, anal_matches,
          no_matches, no_results, sep="\t", file=options.statfile)
    print(lines / lines * 100, full_matches / lines * 100,
          lemma_matches / lines * 100, anal_matches / lines * 100,
          no_matches / lines * 100, no_results / lines * 100,
          sep="\t", file=options.statfile)
    if full_matches / lines * 100 < threshold:
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
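# For reference, the column indices used above follow the CoNLL-X field
# order (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, ...). The example row
# below is invented for illustration:
line = "1\tkoiran\tkoira\tN\tN\tN Gen Sg\t2\tposs\t_\t_"
conllxes = line.split('\t')
ftbsurf, ftblemma, ftbanals = conllxes[1], conllxes[2], conllxes[5]
print(ftbsurf, ftblemma, ftbanals)  # koiran koira N Gen Sg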
hippopotamus2.set_final_weights(1.4)
animals.disjunct(hippopotamus1)
animals.disjunct(hippopotamus2)
animals.minimize()
results = libhfst.extract_paths(animals, 5, 0)
# print(results)

# Convert into optimized lookup format
animals_ol = libhfst.HfstTransducer(animals)
if type == libhfst.TROPICAL_OPENFST_TYPE:
    animals_ol.convert(libhfst.HFST_OLW_TYPE)
else:
    animals_ol.convert(libhfst.HFST_OL_TYPE)
result = libhfst.detokenize_paths(animals_ol.lookup("hippopotamus"))
# for res in result:
#     print(res[0])
#     print(res[1])

if type == libhfst.TROPICAL_OPENFST_TYPE:
    best_animals = libhfst.HfstTransducer(animals)
    best_animals.n_best(3)
    best_animals.convert(libhfst.HFST_OLW_TYPE)
    assert(libhfst.detokenize_paths(best_animals.lookup("mouse"))[0].output == "mice")
    assert(libhfst.detokenize_paths(best_animals.lookup("hippopotamus"))[0].output == "hippopotami")
    assert(libhfst.detokenize_paths(best_animals.lookup("hippopotamus"))[1].output == "hippopotamuses")

print("Function insert_freely")
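# The final print above introduces the insert_freely test. As a minimal
# sketch of what the operation does: inserting a transducer "freely"
# allows it to occur at any position along any path. The Python
# signature below is an assumption based on the C++
# HfstTransducer::insert_freely overload that takes a transducer.
tr = libhfst.HfstTransducer("a", "b", libhfst.TROPICAL_OPENFST_TYPE)
ins = libhfst.HfstTransducer("x", "y", libhfst.TROPICAL_OPENFST_TYPE)
tr.insert_freely(ins)  # paths of tr may now contain any number of x:y pairs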