# Shared imports for the CLI snippets below (module paths assumed:
# standard library plus the omorfi Python package):
from argparse import ArgumentParser, FileType
from sys import stdin, stdout, stderr
from time import perf_counter, process_time

from omorfi.omorfi import Omorfi


def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    for line in options.infile:
        if not line or line == '':
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            tokens += 1
            anals = omorfi.analyse(surf)
            print_analyses_vislcg3(surf, anals, options.outfile)
            if len(anals) == 0 or (len(anals) == 1 and
                                   'UNKNOWN' in anals[0][0]):
                unknowns += 1
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Unknown:", unknowns,
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
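# print_analyses_vislcg3() is defined elsewhere in the package; the sketch
# below shows one plausible shape for it, assuming the VISL CG-3 cohort
# convention of a quoted surface line followed by tab-indented readings.
# The reading format here is an illustration, not the project's actual
# output.
def print_analyses_vislcg3(surf, anals, outfile):
    """Sketch: print one token as a VISL CG-3 style cohort."""
    print('"<%s>"' % surf, file=outfile)
    for anal in anals:
        # each analysis is assumed to be a (tag-string, weight, ...) tuple
        print('\t%s <W=%s>' % (anal[0], anal[1]), file=outfile)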
def stream(text):
    om = Omorfi()
    om.load_from_dir('/usr/local/share/omorfi/', analyse=True)
    for token in om.tokenise(text):
        yield "%s\n" % token[0]
        # analyse the surface form (token[0]), as in the yield above
        for analyse_res in om.analyse(token[0]):
            anal, weight = analyse_res[:2]
            if len(analyse_res) > 2:
                rest = " ".join([str(x) for x in analyse_res[2:]])
            else:
                rest = ''
            yield "%s %s %s\n" % (anal, weight, rest)
        yield "\n"
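# Quick usage check for stream() above; assumes omorfi's analyser automata
# are installed under the hard-coded /usr/local/share/omorfi/ path:
for chunk in stream("Kissa istuu puussa."):
    print(chunk, end='')  # token line, then "analysis weight [extras]" lines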
import re
from collections import namedtuple

from nltk.tokenize import RegexpTokenizer

# AnalysedWord is assumed here to be a simple (base, grammar) record;
# the original project defines it elsewhere.
AnalysedWord = namedtuple("AnalysedWord", ["base", "grammar"])


class FinnishParser:

    def __init__(self):
        self.omorfi = Omorfi()
        self.omorfi.load_from_dir()
        self.tokenizer = RegexpTokenizer(
            r'\w+\-\w+|\w+|\$[\d\.]+|\.\.\.|[",!\.\(\)]|\S+')

    @staticmethod
    def omorfi_to_base(omorfi_form):
        return re.search(r"\[WORD_ID=(.*?)\]", omorfi_form).group(1)

    @staticmethod
    def omorfi_to_grammar(omorfi_form):
        return re.sub(r"\[WORD_ID=.*?\]", "", omorfi_form)

    def tokenize(self, text):
        text = re.sub(r"\[\d+\]|\ufeff", "", text)
        return self.tokenizer.tokenize(text)

    def get_sentence_start_indexes(self, tokens):
        start_indexes = []
        sentence_ended = False
        sentence_end_regex = r"\.\.\.|[\.!\?:;]"
        for i, token in enumerate(tokens):
            if re.match(sentence_end_regex, token):
                sentence_ended = True
            else:
                if sentence_ended:
                    start_indexes.append(i)
                    sentence_ended = False
        return start_indexes

    def parse(self, text):
        tokens = self.tokenize(text)
        parsed_words = [self.analyse(t) for t in tokens]
        sentence_start_indexes = self.get_sentence_start_indexes(tokens)
        return parsed_words, tokens, sentence_start_indexes

    def analyse(self, word):
        omorfi_form = self.omorfi.analyse(word)
        first_form = omorfi_form[0][0]
        return AnalysedWord(self.omorfi_to_base(first_form),
                            self.omorfi_to_grammar(first_form))

    def is_valid_word(self, word):
        return word.grammar != "[GUESS=UNKNOWN][WEIGHT=inf]"
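# Example use of FinnishParser (requires NLTK and omorfi's language models
# in their default location):
parser = FinnishParser()
words, tokens, starts = parser.parse("Kissa istuu puussa. Koira haukkuu.")
print(words[0].base)  # lemma of the first token, e.g. "kissa"
print(starts)         # with this tokeniser, [4]: "Koira" starts sentence 2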
def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the "
                        "transducer to be applied")
    options = a.parse_args()
    omorfi = Omorfi()
    omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    # WEIRD_TOK is a module-level test string defined elsewhere
    tokens = omorfi.python_tokenise(WEIRD_TOK)
    # Check tokens are in the same order as the text
    start = 0
    for token in tokens:
        start = WEIRD_TOK.index(token['surf'], start)
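# The module's real WEIRD_TOK constant is not shown here; for running the
# order check standalone, any tricky string mixing punctuation, hyphens and
# whitespace will do. The value below is a made-up placeholder:
WEIRD_TOK = "Juha-Pekka ei tullut... vai tuliko, 3.5 %:n varmuudella?"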
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u', '--udpipe', metavar="UDPIPE",
                   help="use UDPIPE for additional guesses (experimental)")
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, guesser=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    recognised_comments = ['sent_id =', 'text =', 'doc-name:',
                           'sentence-text:']
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is a 10-field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    # multi-word expression
                    continue
                elif '.' in fields[0]:
                    # an empty ("ghost") node
                    continue
                else:
                    print("Cannot figure out token index", fields[0],
                          file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or len(anals) == 0 or (len(anals) == 1 and
                                                'UNKNOWN' in anals[0][0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals and len(anals) > 0:
                if options.debug:
                    debug_analyses_conllu(fields, index, surf, anals,
                                          options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
        elif line.startswith('#'):
            print(line.strip(), file=options.outfile)
            recognised = False
            for rec in recognised_comments:
                if line.startswith('# ' + rec):
                    recognised = True
            if not recognised and options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n')
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format:", line, sep='\n', file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
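# print_analyses_conllu() (and its try_/debug_ variants) are defined
# elsewhere in the package. A minimal sketch of the plain printer, assuming
# the analysis tuple's first element is an omorfi tag string as in the other
# snippets; the real helper also converts omorfi tags to UD UPOS/FEATS,
# which is elided here:
def print_analyses_conllu(index, surf, anal, outfile, hacks=None):
    """Sketch: emit one 10-field CONLL-U row for the best analysis."""
    lemma_match = re.search(r"\[WORD_ID=(.*?)\]", anal[0])
    lemma = lemma_match.group(1) if lemma_match else surf
    # ID, FORM, LEMMA, then UPOS..MISC left as underscores in this sketch
    print(index, surf, lemma, "_", "_", "_", "_", "_", "_", "_",
          sep="\t", file=outfile)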
def main():
    """Invoke a simple CLI tokeniser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses", help="format output for OUTFORMAT",
                   choices=['moses', 'conllu'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# doc-name:", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf[0] for surf in surfs]), file=options.outfile)
        else:
            print("# sentence-text:", line.rstrip("\n"), file=options.outfile)
            i = 1
            for surf in surfs:
                print(i, surf[0], "_", "_", "_", "_", "_", "_", "_", surf[1],
                      sep="\t", file=options.outfile)
                i += 1
        if options.output_format == 'conllu':
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("Lines:", lines, "Tokens:", tokens, "Ratio:",
          tokens / lines if lines != 0 else 0, "tokens/line",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="do not split on word boundaries")
    a.add_argument('--no-split-new-words', action="store_false", default=True,
                   dest="split_new_words",
                   help="do not split on new word boundaries "
                        "(prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="do not split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token[0])
            labelsegments = omorfi.labelsegment(token[0])
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, token[0],
                                            outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token[0], outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is a 10-field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    continue
                else:
                    print("Cannot figure out token index", fields[0],
                          file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if anals and len(anals) > 0:
                if options.debug:
                    debug_analyses_conllu(fields, index, surf, anals,
                                          options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
            if not anals or len(anals) == 0 or (len(anals) == 1 and
                                                'UNKNOWN' in anals[0][0]):
                unknowns += 1
        elif line.startswith('# doc-name:') or \
                line.startswith('# sentence-text:'):
            # these comments I know need to be retained as-is
            print(line.strip(), file=options.outfile)
        elif line.startswith('#'):
            # unknown comment
            print(line.strip(), file=options.outfile)
            if options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n')
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format:", line, sep='\n', file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the "
                        "transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   required=True, dest="infile",
                   help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   required=True, dest="outfile", help="log file name")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="print verbosely while processing")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar='THOLD', default=99, type=int,
                   help="require THOLD %% coverage or exit 1 (for testing)")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    # statistics
    tokens = 0
    uniqs = 0
    found_tokens = 0
    found_uniqs = 0
    missed_tokens = 0
    missed_uniqs = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        tokens += freq
        uniqs += 1
        if options.verbose:
            print(tokens, "(", freq, ')...', end='\r')
        anals = omorfi.analyse(surf)
        if len(anals) > 0 and "GUESS=UNKNOWN" not in anals[0][0]:
            found_tokens += freq
            found_uniqs += 1
        else:
            missed_tokens += freq
            missed_uniqs += 1
            print(freq, surf, "? (missed)", sep="\t", file=options.outfile)
    if options.verbose:
        print()
    cpuend = process_time()
    realend = perf_counter()
    print("cpu time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Tokens", "Matches", "Misses", "%", sep="\t")
    print(tokens, found_tokens, missed_tokens,
          found_tokens / tokens * 100 if tokens != 0 else 0, sep="\t")
    print("Uniqs", "Matches", "Misses", "%", sep="\t")
    print(uniqs, found_uniqs, missed_uniqs,
          found_uniqs / uniqs * 100 if uniqs != 0 else 0, sep="\t")
    if tokens == 0 or (found_tokens / tokens * 100 < options.threshold):
        print("needs to have", options.threshold,
              "% non-unique matches to pass regress test\n", file=stderr)
        exit(1)
    else:
        exit(0)
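# The coverage test expects frequency-sorted "FREQ WORDFORM" lines, with the
# first space or tab separating the two fields (see the split() above). A
# constructed example of parsing one such line:
line = "6789 kissa"
fields = line.strip().replace(' ', '\t', 1).split('\t')
print(fields)  # ['6789', 'kissa']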
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--split-words', action="store_true", default=True,
                   help="split on word boundaries")
    a.add_argument('--split-new-words', action="store_true", default=True,
                   help="split on new word boundaries "
                        "(prev. unattested compounds)")
    a.add_argument('--split-morphs', action="store_true", default=True,
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=True,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default=' ', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            segments = omorfi.segment(surf)
            labelsegments = omorfi.labelsegment(surf)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, surf,
                                            outfile)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, surf, outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir()
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writing to", options.output)
    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[POS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)]")
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        surfs = line.split()
        for surf in surfs:
            anals = omorfi.analyse(surf)
            segments = omorfi.segment(surf)
            pos_matches = re_pos.finditer(anals[0][0])
            pos = "UNK"
            mrds = []
            lemmas = []
            for pm in pos_matches:
                pos = pm.group(1)
            lemma_matches = re_lemma.finditer(anals[0][0])
            for lm in lemma_matches:
                lemmas += [lm.group(1)]
            mrd_matches = re_mrd.finditer(anals[0][0])
            for mm in mrd_matches:
                if mm.group(1) == 'WORD_ID':
                    mrds = []
                elif mm.group(1) == 'WEIGHT':
                    pass
                else:
                    mrds += [mm.group(2)]
            stemfixes = segments[0][0][
                segments[0][0].rfind("{STUB}"):].replace("{STUB}", "")
            if '{' in stemfixes:
                morphs = stemfixes[stemfixes.find("{"):].replace("{MB}", ".")
            else:
                morphs = '0'
            print(surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
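# A constructed peek at what the tag-plucking regexes above return; the
# exact tag inventory varies by omorfi version:
import re
sample = "[WORD_ID=kissa][POS=NOUN][NUM=SG][CASE=INE][WEIGHT=0.0]"
print(re.findall(r"\[WORD_ID=([^]]*)\]", sample))  # ['kissa']
print(re.findall(r"\[POS=([^]]*)\]", sample))      # ['NOUN']
# For this analysis the factor line would look roughly like
#   kissassa|kissa|NOUN|NOUN.SG.INE|...
# where the last (morph) factor depends on the segmenter's output.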
def stream(text):
    om = Omorfi()
    om.load_from_dir('/usr/local/share/omorfi/', lemmatise=True)
    for token in om.tokenise(text):
        yield " ".join(str(x) for x in om.lemmatise(token[0]))
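# Example: lemmatise a sentence token by token (assumes the lemmatiser
# automaton is installed under the hard-coded path above):
for lemmas in stream("Kissat istuivat puissa."):
    print(lemmas)  # all lemma candidates for one token, space-separated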
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT", required=True,
                   choices=["moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="do not split on word boundaries")
    a.add_argument('--no-split-new-words', action="store_false", default=True,
                   dest="split_new_words",
                   help="do not split on new word boundaries "
                        "(prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="do not split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    a.add_argument('--show-ambiguous', default=False, metavar='ASEP',
                   help="separate ambiguous segmentations with ASEP")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True,
                             accept=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True, accept=True)
    if not omorfi.can_segment:
        print("Could not load segmenter(s), re-compile them or use -f option")
        print()
        print("To compile segmenter, use --enable-segmenter, and/or",
              "--enable-labeled-segments")
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token[0])
            labelsegments = omorfi.labelsegment(token[0])
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, token[0],
                                            outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token[0], outfile,
                               options)
        print(file=outfile)
    exit(0)
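# print_segments() and print_moses_factor_segments() are defined elsewhere
# in the package. A minimal sketch of the former, assuming omorfi-style
# boundary markers in the segmented string ({STUB} and {MB} appear in the
# factored-preprocessing snippet above; {WB}/{DB} for word/derivation
# boundaries are assumed here) and printing only the best segmentation:
def print_segments(segments, labelsegments, surf, outfile, options):
    seg = segments[0][0]
    seg = seg.replace("{WB}",
                      options.segment_marker if options.split_words else "")
    seg = seg.replace("{MB}",
                      options.segment_marker if options.split_morphs else "")
    seg = seg.replace("{DB}",
                      options.segment_marker if options.split_derivs else "")
    seg = seg.replace("{STUB}", "")  # stem-internal marker, never a split
    print(seg, file=outfile)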