def main(): """Invoke a simple CLI analyser.""" a = ArgumentParser() a.add_argument('-f', '--fsa', metavar='FSAPATH', help="Path to directory of HFST format automata") a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-v', '--verbose', action='store_true', help="print verbosely while processing") a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile", help="print output into OUTFILE", type=FileType('w')) a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile", help="print statistics to STATFILE", type=FileType('w')) options = a.parse_args() omorfi = Omorfi(options.verbose) if options.fsa: if options.verbose: print("reading language models in", options.fsa) omorfi.load_from_dir(options.fsa, analyse=True, accept=True) else: if options.verbose: print("reading language models in default dirs") omorfi.load_from_dir() if not options.infile: options.infile = stdin if options.verbose: print("analysing", options.infile.name) if not options.outfile: options.outfile = stdout if options.verbose: print("writing to", options.outfile.name) if not options.statfile: options.statfile = stdout # statistics realstart = perf_counter() cpustart = process_time() tokens = 0 unknowns = 0 for line in options.infile: line = line if not line or line == '': continue surfs = omorfi.tokenise(line) for surf in surfs: tokens += 1 anals = omorfi.analyse(surf) print_analyses_vislcg3(surf, anals, options.outfile) if len(anals) == 0 or (len(anals) == 1 and 'UNKNOWN' in anals[0][0]): unknowns += 1 cpuend = process_time() realend = perf_counter() print("Tokens:", tokens, "Unknown:", unknowns, unknowns / tokens * 100, "%", file=options.statfile) print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart, file=options.statfile) print("Tokens per timeunit:", tokens / (realend - realstart), file=options.statfile) exit(0)
class FinnishParser:

    def __init__(self):
        self.omorfi = Omorfi()
        self.omorfi.load_from_dir()
        # raw string: the pattern contains regex escapes like \w and \$
        self.tokenizer = RegexpTokenizer(
            r'\w+\-\w+|\w+|\$[\d\.]+|\.\.\.|[",!\.\(\)]|\S+')

    @staticmethod
    def omorfi_to_base(omorfi_form):
        return re.search(r"\[WORD_ID=(.*?)\]", omorfi_form).group(1)

    @staticmethod
    def omorfi_to_grammar(omorfi_form):
        return re.sub(r"\[WORD_ID=.*?\]", "", omorfi_form)

    def tokenize(self, text):
        text = re.sub(r"\[\d+\]|\ufeff", "", text)
        return self.tokenizer.tokenize(text)

    def get_sentence_start_indexes(self, tokens):
        start_indexes = []
        sentence_ended = False
        sentence_end_regex = r"\.\.\.|[\.!\?:;]"
        for i, token in enumerate(tokens):
            if re.match(sentence_end_regex, token):
                sentence_ended = True
            else:
                if sentence_ended:
                    start_indexes.append(i)
                    sentence_ended = False
        return start_indexes

    def parse(self, text):
        tokens = self.tokenize(text)
        parsed_words = [self.analyse(t) for t in tokens]
        sentence_start_indexes = self.get_sentence_start_indexes(tokens)
        return parsed_words, tokens, sentence_start_indexes

    def analyse(self, word):
        omorfi_form = self.omorfi.analyse(word)
        first_form = omorfi_form[0][0]
        return AnalysedWord(self.omorfi_to_base(first_form),
                            self.omorfi_to_grammar(first_form))

    def is_valid_word(self, word):
        return word.grammar != "[GUESS=UNKNOWN][WEIGHT=inf]"
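# Minimal usage sketch for FinnishParser above; the sample sentence is
# illustrative only, and AnalysedWord / RegexpTokenizer / Omorfi are assumed
# to be imported elsewhere in this file, as the class itself assumes.
if __name__ == '__main__':
    parser = FinnishParser()
    words, tokens, starts = parser.parse("Kissa istui. Koira haukkui.")
    print("tokens:", tokens)
    print("sentence starts at indexes:", starts)
    print("valid words:", [w for w in words if parser.is_valid_word(w)])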
def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the "
                   "transducer to be applied")
    options = a.parse_args()
    omorfi = Omorfi()
    omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    tokens = omorfi.python_tokenise(WEIRD_TOK)
    # Check tokens are in the same order as the text
    start = 0
    for token in tokens:
        start = WEIRD_TOK.index(token['surf'], start)
def get_omorfi():
    """
    Get an Omorfi instance with everything possible enabled.

    Reuses the existing instance if already called once.
    """
    from omorfi.omorfi import Omorfi
    global _omorfi
    if _omorfi is None:
        _omorfi = Omorfi()
        for var, fn in FSTS:
            getattr(_omorfi, "load_" + var)(
                "/usr/local/share/omorfi/omorfi." + fn)
    return _omorfi
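# Sketch of the module-level state get_omorfi() assumes; in a real module
# these would precede the function. Hedged: the variant/filename pairs below
# are illustrative, the real FSTS table may list more automata.
_omorfi = None
FSTS = [
    ("analyser", "analyse.hfst"),    # -> _omorfi.load_analyser(...)
    ("generator", "generate.hfst"),  # -> _omorfi.load_generator(...)
]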
def stream(text):
    om = Omorfi()
    om.load_from_dir('/usr/local/share/omorfi/', analyse=True)
    for token in om.tokenise(text):
        yield "%s\n" % token[0]
        for analyse_res in om.analyse(token):
            text, weight = analyse_res[:2]
            if len(analyse_res) > 2:
                rest = " ".join([str(x) for x in analyse_res[2:]])
            else:
                rest = ''
            yield "%s %s %s\n" % (text, weight, rest)
        yield "\n"
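# Hypothetical driver for the stream() generator above: read text from
# stdin and write one analysis block per token to stdout.
if __name__ == '__main__':
    from sys import stdin
    for chunk in stream(stdin.read()):
        print(chunk, end='')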
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    # multiword token range lines have no analysis of their own
                    continue
                else:
                    print("Cannot figure out token index", fields[0],
                          file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if anals and len(anals) > 0:
                if options.debug:
                    debug_analyses_conllu(fields, index, surf, anals,
                                          options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
            if not anals or len(anals) == 0 or (len(anals) == 1 and
                                                'UNKNOWN' in anals[0][0]):
                unknowns += 1
        elif line.startswith('# doc-name:') or \
                line.startswith('# sentence-text:'):
            # these comments are known to need retaining as-is
            print(line.strip(), file=options.outfile)
        elif line.startswith('#'):
            # unknown comment
            print(line.strip(), file=options.outfile)
            if options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n')
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format:", line, sep='\n', file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from omorfi.omorfi import Omorfi

omorfi = Omorfi()
omorfi.load_analyser("/usr/local/share/omorfi/omorfi.analyse.hfst")
omorfi.load_generator("/usr/local/share/omorfi/omorfi.generate.hfst")

import settings

PROPERTIES = {
    "nominatiivi": [("CASE", "NOM")],
    "genetiivi": [("CASE", "GEN")],
    "partitiivi": [("CASE", "PAR")],
    "translatiivi": [("CASE", "TRA")],
    "essiivi": [("CASE", "ESS")],
    "inessiivi": [("CASE", "INE")],
    "elatiivi": [("CASE", "ELA")],
    "illatiivi": [("CASE", "ILL")],
    "adessiivi": [("CASE", "ADE")],
def main():
    global sent
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the "
                   "transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=str,
                   required=True, dest="infile",
                   help="source of analysis data")
    a.add_argument('-m', '--master', metavar="TSVFILE", type=str,
                   required=True, dest="tsvfile",
                   help="source of existing lexical data")
    opts = a.parse_args()
    # NB: --input is required above, so the glob fallback never triggers
    if opts.infile:
        test_corpora_files = [opts.infile]
    else:
        test_corpora_files = glob("*.text")
    # hard-coded logs for now
    lemma_log = open('missing_word_ids.log', 'w')
    case_log = open('missing_nominal_cases.log', 'w')
    comp_log = open('missing_comparatives.log', 'w')
    adposition_log = open('adposition_complements.log', 'w')
    adposition_stats = open('adposition_complements_full.log', 'w')
    adjective_log = open('adjective_agreements.log', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    gather_lemmas(open(opts.tsvfile))
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus", test_corpus_file, ":", ioe)
    for test_corpus in test_corpora:
        print('lines from', test_corpus.name)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 200000) == 0:
                print(linen, "...! Time to reload everything because "
                      "memory is leaking very badly indeed!")
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()
            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            for punct in "\".,:;?!()":
                line = line.replace(punct, " " + punct + " ")
            for token in line.split():
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
                stat_nominal_cases(token, analyses, case_log)
                stat_adjective_comps(token, analyses, comp_log)
    print("Testing statistics")
    test_zero_lemmas(lemma_log)
    test_zero_cases(case_log)
    test_zero_comps(comp_log)
    # test_case_deviations()
    test_adposition_complements(adposition_log)
    test_adjective_agreements(adjective_log)
    print("Writing accurate statistics")
    print_adposition_stats(adposition_stats)
    print_lemma_stats(open('lemmas.freqs', 'w'))
    print_case_stats(open('cases.freqs', 'w'))
    exit(0)
def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the "
                   "transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   required=True, dest="infile",
                   help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   required=True, dest="outfile", help="log file name")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher "
                   "than FREQ")
    # NB: %% because argparse %-interpolates help strings
    a.add_argument('-t', '--threshold', metavar='THOLD', default=99,
                   type=int,
                   help="require THOLD %% coverage or exit 1 (for testing)")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    # statistics
    tokens = 0
    uniqs = 0
    found_tokens = 0
    found_uniqs = 0
    missed_tokens = 0
    missed_uniqs = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        tokens += freq
        uniqs += 1
        if options.verbose:
            print(tokens, "(", freq, ')...', end='\r')
        anals = omorfi.analyse(surf)
        if len(anals) > 0 and "GUESS=UNKNOWN" not in anals[0][0]:
            found_tokens += freq
            found_uniqs += 1
        else:
            missed_tokens += freq
            missed_uniqs += 1
            print(freq, surf, "? (missed)", sep="\t", file=options.outfile)
    if options.verbose:
        print()
    cpuend = process_time()
    realend = perf_counter()
    print("cpu time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Tokens", "Matches", "Misses", "%", sep="\t")
    print(tokens, found_tokens, missed_tokens,
          found_tokens / tokens * 100 if tokens != 0 else 0, sep="\t")
    print("Uniqs", "Matches", "Misses", "%", sep="\t")
    print(uniqs, found_uniqs, missed_uniqs,
          found_uniqs / uniqs * 100 if uniqs != 0 else 0, sep="\t")
    if tokens == 0 or (found_tokens / tokens * 100 < options.threshold):
        print("needs to have", options.threshold,
              "% non-unique matches to pass regress test\n", file=stderr)
        exit(1)
    else:
        exit(0)
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE', required=True,
                   help="load tokeniser model from (analyser) AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses", help="format output for OUTFORMAT",
                   choices=['moses', 'conllu', 'json', 'ftb3'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading language model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed for tokenisation", file=stderr)
        exit(1)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# new doc id=", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf['surf'] for surf in surfs]),
                  file=options.outfile)
        elif options.output_format == 'json':
            # the json module has no encode(); dumps() serialises to a string
            print(json.dumps(surfs), file=options.outfile)
        elif options.output_format == 'conllu':
            print("# sent_id =", lines, file=options.outfile)
            print("# text =", line.rstrip("\n"), file=options.outfile)
            i = 1
            for surf in surfs:
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      format_misc_ud(surf), sep="\t", file=options.outfile)
                i += 1
        elif options.output_format == 'ftb3':
            print("<s><loc file=\"", options.infile.name, "\" line=\"",
                  lines, "\" />", file=options.outfile, sep="")
            i = 1
            for surf in surfs:
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      "_", sep="\t", file=options.outfile)
                i += 1
            print("</s>", file=options.outfile)
        if options.output_format == 'conllu':
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("Lines:", lines, "Tokens:", tokens, "Ratio:", tokens / lines,
          "tokens/line", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-F', '--format', metavar="INFORMAT", default='text',
                   help="read input using INFORMAT tokenisation",
                   choices=['text', 'vislcg', 'conllu'])
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is required for vislcg output", file=stderr)
        exit(4)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        if options.outfile == stdout:
            options.statfile = stdout
        else:
            options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    last = None
    for line in options.infile:
        surfs = []
        if options.format == 'vislcg':
            surfs = get_line_tokens_vislcg(line, last)
        elif options.format == 'text':
            surfs = get_line_tokens(line, omorfi)
        elif options.format == 'conllu':
            surfs = get_line_tokens_conllu(line, last)
        else:
            print("input format missing implementation", options.format,
                  file=stderr)
            exit(2)
        for surf in surfs:
            if 'conllu_form' in surf:
                # skip conllu special forms in input for now:
                # (ellipsis and MWE magics)
                continue
            elif 'surf' in surf:
                tokens += 1
                anals = omorfi.analyse(surf)
                if len(anals) == 0 or (len(anals) == 1 and
                                       'UNKNOWN' in anals[0]['anal']):
                    unknowns += 1
                    anals = omorfi.guess(surf)
                print_analyses_vislcg3(surf, anals, options.outfile)
            elif 'comment' in surf:
                if surf['comment'].startswith(';') or \
                        surf['comment'].startswith('\t'):
                    continue
                else:
                    print(surf['comment'], file=options.outfile)
            elif 'error' in surf:
                print(surf['error'], file=stderr)
                exit(2)
            last = surf
    cpuend = process_time()
    realend = perf_counter()
    print("# Tokens:", tokens, "\n# Unknown:", unknowns,
          unknowns / tokens * 100, "%", file=options.statfile)
    print("# CPU time:", cpuend - cpustart,
          "\n# Real time:", realend - realstart, file=options.statfile)
    print("# Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--split-words', action="store_true", default=True,
                   help="split on word boundaries")
    a.add_argument('--split-new-words', action="store_true", default=True,
                   help="split on new word boundaries "
                   "(prev. unattested compounds)")
    a.add_argument('--split-morphs', action="store_true", default=True,
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=True,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default=' ', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        print("writing to", outfile.name)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            segments = omorfi.segment(surf)
            labelsegments = omorfi.labelsegment(surf)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, surf,
                                            outfile)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, surf, outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE', required=True,
                   help="read analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed for ftb3", file=stderr)
        exit(4)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    lexprobs = None
    tagprobs = None
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0],
                      file=stderr)
                exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or len(anals) == 0 or (len(anals) == 1 and
                                                'OOV' in anals[0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals and len(anals) > 0:
                if options.oracle:
                    try_analyses_ftb(fields, index, surf, anals,
                                     options.outfile)
                else:
                    print_analyses_ftb(index, surf, anals[0],
                                       options.outfile)
            else:
                print("Failed:", fields)
                exit(1)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            print(line.strip(), file=options.outfile)
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def stream(text):
    om = Omorfi()
    om.load_from_dir('/usr/local/share/omorfi/', lemmatise=True)
    for token in om.tokenise(text):
        yield " ".join(str(x) for x in om.lemmatise(token[0]))
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses", help="format output for OUTFORMAT",
                   choices=['moses', 'conllu'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# doc-name:", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf[0] for surf in surfs]),
                  file=options.outfile)
        else:
            print("# sentence-text:", line.rstrip("\n"),
                  file=options.outfile)
            i = 1
            for surf in surfs:
                print(i, surf[0], "_", "_", "_", "_", "_", "_", "_",
                      surf[1], sep="\t", file=options.outfile)
                i += 1
        if options.output_format == 'conllu':
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("Lines:", lines, "Tokens:", tokens, "Ratio:", tokens / lines,
          "tokens/line", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling."""
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE', required=True,
                   help="load analyser model from AFILE")
    a.add_argument('-s', '--segmenter', metavar='SFILE', required=True,
                   help="load segmenter model from SFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        print("writing to", outfile.name)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        tokens = omorfi.tokenise_sentence(line)
        for token in tokens:
            if not token.surf:
                continue
            anals = omorfi.analyse(token)
            pos = "X"
            mrds = ["?"]
            lemmas = [token.surf]
            if anals:
                anal = token.get_best()
                pos = anal.get_upos()
                mrds = anal.get_ufeats()
                lemmas = anal.get_lemmas()
            segments = omorfi.segment(token)
            morphs = "0"
            if segments:
                segment = token.get_best_segments()
                if segment:
                    parts = segment.get_segments()
                    morphs = ".".join(parts)
                else:
                    morphs = token.surf
            print(token.surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
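# The loop above emits Moses factored tokens of the form
# surface|lemmas|POS|feats|morphs, one sentence per line. An illustrative
# (not authoritative) output token might look like:
#
#   taloissa|talo|NOUN|Case=Ine.Number=Plur|talo.i.ssa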
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="do not split on word boundaries")
    a.add_argument('--no-split-new-words', action="store_false",
                   default=True, dest="split_new_words",
                   help="do not split on new word boundaries "
                   "(prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="do not split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token[0])
            labelsegments = omorfi.labelsegment(token[0])
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments,
                                            token[0], outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token[0], outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f', '--format', metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"], default="coverage")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher "
                   "than FREQ")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if options.format != 'coverage':
            lemma = fields[2]
            analysis = fields[3]
        lines += freq
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        anals = omorfi.analyse(surf)
        if not is_tokenlist_oov(anals):
            covered += freq
        else:
            no_results += freq
            print("OOV", surf, sep='\t', file=options.outfile)
        found_anals = False
        found_lemma = False
        for anal in anals:
            if options.format == 'ftb3.1':
                anal_ftb3 = format_feats_ftb(anal)
                lemma_ftb3 = '#'.join(get_lemmas(anal))
                # hacks ftb3:
                analysis = analysis.replace(" >>>", "")
                if analysis == anal_ftb3:
                    found_anals = True
                    print("ANALHIT", analysis, anal_ftb3,
                          file=options.outfile)
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    found_anals = True
                    print("PERMUTAHIT", analysis, anal_ftb3,
                          file=options.outfile)
                else:
                    print("ANALMISS", analysis, anal_ftb3,
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    found_lemma = True
                    print("LEMMAHIT", lemma, lemma_ftb3,
                          file=options.outfile)
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    found_lemma = True
                    print("LEMMARECOMP", lemma, lemma_ftb3,
                          file=options.outfile)
                else:
                    print("LEMMAMISS", lemma, lemma_ftb3,
                          file=options.outfile)
        if options.format != 'coverage':
            if not found_anals and not found_lemma:
                no_matches += freq
                print("NOHITS!", surf, sep='\t', file=options.outfile)
            elif found_anals and found_lemma:
                full_matches += freq
                print("HIT", surf, sep='\t', file=options.outfile)
            elif not found_anals:
                # lemma matched but analysis did not
                lemma_matches += freq
                print("LEMMANOANAL", surf, sep='\t', file=options.outfile)
            elif not found_lemma:
                # analysis matched but lemma did not
                anal_matches += freq
                print("ANALNOLEMMA", surf, sep='\t', file=options.outfile)
            else:
                print("Logical error, kill everyone")
                exit(13)
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t", file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines", "Matches", "Lemma", "Anals", "Mismatch",
              "No results", sep="\t", file=options.statfile)
        print(lines, full_matches, lemma_matches, anal_matches, no_matches,
              no_results, sep="\t", file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t", file=options.statfile)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= int(options.threshold)):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= int(options.threshold)):
        print("needs to have", threshold,
              "% coverage to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-s', '--segmenter', metavar='SFILE', required=True,
                   help="load segmenter from SFILE")
    a.add_argument('-S', '--labeller', metavar='LSFILE', required=True,
                   help="load labelsegmenter from LSFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT", required=True,
                   help="format output suitable for OFORMAT",
                   choices=["moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="do not split on word boundaries")
    a.add_argument('--no-split-new-words', action="store_false",
                   default=True, dest="split_new_words",
                   help="do not split on new word boundaries "
                   "(prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="do not split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    a.add_argument('--show-ambiguous', default=False, metavar='ASEP',
                   help="separate ambiguous segmentations with ASEP")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("segmenter is needed for segmenting", file=stderr)
        exit(2)
    if options.labeller:
        if options.verbose:
            print("Reading labelsegmenter", options.labeller)
        omorfi.load_labelsegmenter(options.labeller)
    if not omorfi.can_segment:
        print("Could not load segmenter(s), re-compile them or use -f option")
        print()
        print("To compile segmenter, use --enable-segmenter, and/or",
              "--enable-labeled-segments")
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token)
            labelsegments = omorfi.labelsegment(token)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, token,
                                            outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token, outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling."""
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE', required=True,
                   help="load analyser model from AFILE")
    a.add_argument('-s', '--segmenter', metavar='SFILE', required=True,
                   help="load segmenter model from SFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        print("writing to", outfile.name)
    # raw strings: the patterns contain regex escapes like \[
    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[UPOS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)]")
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        surfs = line.split()
        for surf in surfs:
            anals = omorfi.analyse(surf)
            segments = omorfi.segment(surf)
            pos_matches = re_pos.finditer(anals[0]['anal'])
            pos = "UNK"
            mrds = []
            lemmas = []
            for pm in pos_matches:
                pos = pm.group(1)
            lemma_matches = re_lemma.finditer(anals[0]['anal'])
            for lm in lemma_matches:
                lemmas += [lm.group(1)]
            mrd_matches = re_mrd.finditer(anals[0]['anal'])
            for mm in mrd_matches:
                if mm.group(1) == 'WORD_ID':
                    mrds = []
                elif mm.group(1) == 'WEIGHT':
                    pass
                else:
                    mrds += [mm.group(2)]
            parts = segments[0]['segments']
            if '{DB}' in parts:
                suffixes = parts[parts.rfind('{DB}') + 4:]
            elif '{WB}' in parts:
                suffixes = parts[parts.rfind('{WB}') + 4:]
            elif '{hyph?}' in parts:
                suffixes = parts[parts.rfind('{hyph?}') + 6:]
            else:
                suffixes = "0"
            morphs = suffixes[suffixes.find("{"):].replace("{MB}", ".")
            print(surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u', '--udpipe', metavar="UDPIPE",
                   help="use UDPIPE for additional guesses (experimental)")
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, guesser=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    recognised_comments = ['sent_id =', 'text =', 'doc-name:',
                           'sentence-text:']
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    # MWE
                    continue
                elif '.' in fields[0]:
                    # a ghost
                    continue
                else:
                    print("Cannot figure out token index", fields[0],
                          file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or len(anals) == 0 or (len(anals) == 1 and
                                                'UNKNOWN' in anals[0][0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals and len(anals) > 0:
                if options.debug:
                    debug_analyses_conllu(fields, index, surf, anals,
                                          options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
        elif line.startswith('#'):
            print(line.strip(), file=options.outfile)
            recognised = False
            for rec in recognised_comments:
                if line.startswith('# ' + rec):
                    recognised = True
            if not recognised and options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n')
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format:", line, sep='\n', file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir()
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        print("writing to", outfile.name)
    # raw strings: the patterns contain regex escapes like \[
    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[POS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)]")
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        surfs = line.split()
        for surf in surfs:
            anals = omorfi.analyse(surf)
            segments = omorfi.segment(surf)
            pos_matches = re_pos.finditer(anals[0][0])
            pos = "UNK"
            mrds = []
            lemmas = []
            for pm in pos_matches:
                pos = pm.group(1)
            lemma_matches = re_lemma.finditer(anals[0][0])
            for lm in lemma_matches:
                lemmas += [lm.group(1)]
            mrd_matches = re_mrd.finditer(anals[0][0])
            for mm in mrd_matches:
                if mm.group(1) == 'WORD_ID':
                    mrds = []
                elif mm.group(1) == 'WEIGHT':
                    pass
                else:
                    mrds += [mm.group(2)]
            stemfixes = segments[0][0][
                segments[0][0].rfind("{STUB}"):].replace("{STUB}", "")
            if '{' in stemfixes:
                morphs = stemfixes[stemfixes.find("{"):].replace("{MB}", ".")
            else:
                morphs = '0'
            print(surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)