def main():
    """Analyse FTB3-formatted input with omorfi from the command line.

    Reads 10-field FTB3 lines from --input (default stdin), analyses each
    token, guesses OOV tokens, and writes analyses (oracle-matched when
    --oracle is given) to --output.  Throughput statistics go to
    --statistics.  Exits 0 on success, 1 on malformed input.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="read analyser model from AFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # unreachable in practice (--analyser is required=True); kept as a
        # belt-and-braces guard
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0], file=stderr)
                exit(1)
            token = Token(fields[1])
            # BUGFIX: reuse the validated index instead of re-parsing
            # fields[0] a second time
            token.pos = index
            omorfi.analyse(token)
            if token.is_oov():
                unknowns += 1
                omorfi.guess(token)
            if options.oracle:
                try_analyses_ftb(fields, index, token, options.outfile)
            else:
                print_analyses_ftb(index, token, options.outfile)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            # pass XML-ish markup lines through unchanged
            print(line.strip(), file=options.outfile)
        elif line.strip() == '':
            # (dropped dead "not line" disjunct: file iteration never yields
            # an empty string)
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Command-line interface for omorfi's sort | uniq -c tester.

    Reads ``freq word [lemma analysis]`` lines (sorted by descending
    frequency), analyses every word-form, and accumulates coverage and —
    for the ftb3.1 format — lemma/analysis match statistics, weighted by
    frequency.  Mismatch logs go to --output, statistics to --statistics.
    Exits 1 when the result falls below --threshold, 2 on empty input.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f', '--format', metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"], default="coverage")
    # IMPROVEMENT: parse numeric options once via argparse instead of
    # calling int() on every loop iteration / comparison
    a.add_argument('-c', '--count', metavar="FREQ", default=0, type=int,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99, type=int,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics (token counts weighted by corpus frequency)
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    only_permuted = 0
    only_rehashed = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # type (unique word-form) counts
    types_covered = 0
    types_no_results = 0
    types = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        # "freq word..." may be space- or tab-separated; normalise first gap
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < options.count:
            # input is sorted by descending frequency, so we can stop here
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if options.format != 'coverage':
            lemma = fields[2]
            analysis = fields[3]
        lines += freq
        types += 1
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        token = Token(surf)
        # pos 1 triggers acceptable detitlecasing
        token.pos = 1
        omorfi.analyse(token)
        if token.is_oov():
            omorfi.guess(token)
        if not token.is_oov():
            covered += freq
            types_covered += 1
        else:
            no_results += freq
            types_no_results += 1
            print(freq, "OOV", surf, sep='\t', file=options.outfile)
        found_anals = False
        found_lemma = False
        rehashed = True
        permuted = True
        for anal in token.analyses:
            if options.format == 'ftb3.1':
                anal_ftb3 = ' '.join(anal.get_ftb_feats())
                lemma_ftb3 = '#'.join(anal.get_lemmas())
                # hacks ftb3:
                analysis = analysis.replace(" >>>", "")
                if analysis == anal_ftb3:
                    found_anals = True
                    permuted = False
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    # same tags, different order
                    found_anals = True
                    print(freq, "PERMUTAHIT", analysis, anal_ftb3, sep='\t',
                          file=options.outfile)
                else:
                    print(freq, "ANALMISS", analysis, anal_ftb3, sep='\t',
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    found_lemma = True
                    rehashed = False
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    # same lemma, different compound-boundary hashing
                    found_lemma = True
                    print(freq, "LEMMARECOMP", lemma, lemma_ftb3, sep='\t',
                          file=options.outfile)
                else:
                    print(freq, "LEMMAMISS", lemma, lemma_ftb3, sep='\t',
                          file=options.outfile)
        if options.format != 'coverage':
            if not found_anals and not found_lemma:
                no_matches += freq
                print(freq, "NOHITS!", surf, sep='\t', file=options.outfile)
            elif found_anals and found_lemma:
                full_matches += freq
            elif not found_anals:
                anal_matches += freq
                print(freq, "LEMMANOANAL", surf, sep='\t',
                      file=options.outfile)
            elif not found_lemma:
                lemma_matches += freq
                print(freq, "ANALNOLEMMA", surf, sep='\t',
                      file=options.outfile)
            else:
                print("Logical error, kill everyone")
                exit(13)
            if rehashed:
                only_rehashed += freq
            if permuted:
                only_permuted += freq
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t", file=options.statfile)
    print("Types", "Covered", "OOV", sep="\t", file=options.statfile)
    print(types, types_covered, types - types_covered, sep="\t",
          file=options.statfile)
    print(types / types * 100 if types != 0 else 0,
          types_covered / types * 100 if types != 0 else 0,
          (types - types_covered) / types * 100 if types != 0 else 0,
          sep="\t", file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
              sep="\t", file=options.statfile)
        print(lines, full_matches, lemma_matches, anal_matches, no_matches,
              no_results, sep="\t", file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t", file=options.statfile)
        print("Of which", "Tag permuations", "Lemma rehashing", sep='\t',
              file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              only_permuted / lines * 100 if lines != 0 else 0,
              only_rehashed / lines * 100 if lines != 0 else 0,
              sep='\t', file=options.statfile)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= options.threshold):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= options.threshold):
        print("needs to have", threshold, "% coverage to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
def main():
    """Command-line interface for omorfi's sort | uniq -c tester.

    Reads tab-separated ``lemma<TAB>surface<TAB>unimorph`` lines, analyses
    each surface form, and tallies how often the analyses and lemmas match
    the gold data.  Mismatch logs go to --output, statistics to
    --statistics.  Exits 1 when the match rate or coverage falls at or
    below --threshold, 2 on empty input.
    """
    argp = ArgumentParser()
    argp.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                      help="load analyser from FSAFILE")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data")
    argp.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                      dest="outfile", help="log outputs to OUTFILE")
    argp.add_argument('-X', '--statistics', metavar="STATFILE",
                      type=FileType('w'), dest="statfile", help="statistics")
    argp.add_argument('-v', '--verbose', action="store_true", default=False,
                      help="Print verbosely while processing")
    argp.add_argument('-C', '--no-casing', action="store_true", default=False,
                      help="Do not try to recase input and output when matching")
    argp.add_argument('-t', '--threshold', metavar="THOLD", default=99,
                      help="if coverage is less than THOLD exit with error")
    opts = argp.parse_args()
    omorfi = Omorfi(opts.verbose)
    try:
        if opts.analyser:
            if opts.verbose:
                print("reading analyser from", opts.analyser)
            omorfi.load_analyser(opts.analyser)
        if not opts.infile:
            opts.infile = stdin
            print("reading from <stdin>")
        if not opts.statfile:
            opts.statfile = stdout
        if not opts.outfile:
            opts.outfile = stdout
    except IOError:
        print("Could not process file", opts.analyser, file=stderr)
        exit(2)
    # basic statistics
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    only_permuted = 0
    accfails = 0
    lines = 0
    # for make check target
    threshold = opts.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in opts.infile:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        if ' ' in fields[1] or ' ' in fields[0]:
            # multiword entries are skipped
            continue
        lines += 1
        if opts.verbose and lines % 1000 == 0:
            print(lines, '...')
        lemma, surf = fields[0], fields[1]
        # normalise gold tags before comparison
        unimorph = fields[2].replace('ACC', 'NOM').replace('GEADJ', 'GEN')
        token = Token(surf)
        omorfi.analyse(token)
        if token.is_oov():
            no_results += 1
            print(1, "OOV", surf, sep='\t', file=opts.outfile)
        else:
            covered += 1
        found_anals = False
        found_lemma = False
        permuted = True
        accfail = False
        for hyp in token.analyses:
            hyp_uni = hyp.printable_unimorph()
            hyp_lemma = ''.join(hyp.get_lemmas())
            if hyp_uni == unimorph:
                found_anals = True
                permuted = False
            elif set(hyp_uni.split(';')) == set(unimorph.split(';')):
                # same tags in a different order still counts as a hit
                found_anals = True
            if lemma == hyp_lemma:
                found_lemma = True
        if not found_anals and not found_lemma:
            no_matches += 1
            print("NOHITS!", surf, lemma, unimorph,
                  [h.printable_unimorph() for h in token.analyses],
                  sep='\t', file=opts.outfile)
        elif found_anals and found_lemma:
            full_matches += 1
        elif not found_anals:
            anal_matches += 1
            print("LEMMANOANAL", surf, unimorph,
                  [h.printable_unimorph() for h in token.analyses],
                  sep='\t', file=opts.outfile)
        elif not found_lemma:
            lemma_matches += 1
            print("ANALNOLEMMA", surf, lemma,
                  [h.get_lemmas() for h in token.analyses],
                  sep='\t', file=opts.outfile)
        else:
            print("Logical error, kill everyone")
            exit(13)
        if permuted:
            only_permuted += 1
        if accfail:
            # NOTE(review): accfail is never set True above, so this counter
            # stays 0 — preserved as-is
            accfails += 1
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    print("Lines", "Covered", "OOV", sep="\t", file=opts.statfile)
    print(lines, covered, lines - covered, sep="\t", file=opts.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t", file=opts.statfile)
    print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
          sep="\t", file=opts.statfile)
    print(lines, full_matches, lemma_matches, anal_matches, no_matches,
          no_results, sep="\t", file=opts.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          full_matches / lines * 100 if lines != 0 else 0,
          lemma_matches / lines * 100 if lines != 0 else 0,
          anal_matches / lines * 100 if lines != 0 else 0,
          no_matches / lines * 100 if lines != 0 else 0,
          no_results / lines * 100 if lines != 0 else 0,
          sep="% \t", file=opts.statfile)
    print("Of which", "Tag permuations", sep='\t', file=opts.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          only_permuted / lines * 100 if lines != 0 else 0,
          sep='\t', file=opts.statfile)
    if full_matches / lines * 100 <= int(opts.threshold):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", opts.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif covered / lines * 100 <= int(opts.threshold):
        print("needs to have", threshold, "% coverage to pass regress test\n",
              "please examine", opts.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
def main():
    """CLI entry point: analyse CONLL-U input with omorfi.

    Streams CONLL-U sentences from --input, analyses every token (guessing
    the OOV ones), runs the disamparsulator rules over each sentence, and
    prints the result.  Summary statistics land in --statistics.
    """
    argp = ArgumentParser()
    argp.add_argument('-a', '--analyser', metavar='AFILE', required=True,
                      help="read analyser model from AFILE")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data")
    argp.add_argument('-v', '--verbose', action='store_true',
                      help="print verbosely while processing")
    argp.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                      help="print output into OUTFILE", type=FileType('w'))
    argp.add_argument('-x', '--statistics', metavar="STATFILE",
                      dest="statfile", help="print statistics to STATFILE",
                      type=FileType('w'))
    argp.add_argument('-O', '--oracle', action='store_true',
                      help="match to values in input when parsing if possible")
    argp.add_argument('-u', '--udpipe', metavar="UDPIPE",
                      help='use UDPIPE for additional guesses (experi-mental)')
    argp.add_argument('--hacks', metavar='HACKS',
                      help="mangle analyses to match HACKS version of UD",
                      choices=['ftb'])
    argp.add_argument('-X', '--frequencies', metavar="FREQDIR",
                      help="read frequencies from FREQDIR/*.freqs")
    argp.add_argument('--not-rules', metavar="RULEFILE", type=open,
                      required=True, help="read non-rules from RULEFILE")
    argp.add_argument('--debug', action='store_true',
                      help="print lots of debug info while processing")
    opts = argp.parse_args()
    if opts.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(opts.verbose)
    if opts.analyser:
        if opts.verbose:
            print("reading analyser model", opts.analyser)
        omorfi.load_analyser(opts.analyser)
    else:
        print("analyser is needed to conllu", file=stderr)
        exit(4)
    disamparsulator = Disamparsulator()
    if opts.not_rules:
        if opts.verbose:
            print("Loading", opts.not_rules)
        disamparsulator.frobblesnizz(opts.not_rules)
    if opts.udpipe:
        if opts.verbose:
            print("Loading udpipe", opts.udpipe)
        omorfi.load_udpipe(opts.udpipe)
    if not opts.infile:
        print("reading from <stdin>")
        opts.infile = stdin
    if opts.verbose:
        print("analysing", opts.infile.name)
    if not opts.outfile:
        opts.outfile = stdout
    if opts.verbose:
        print("writing to", opts.outfile.name)
    if not opts.statfile:
        opts.statfile = stdout
    if opts.frequencies:
        with open(opts.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(opts.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # timing / throughput statistics
    wall_start = perf_counter()
    cpu_start = process_time()
    token_count = 0
    oov_count = 0
    sentence_count = 0
    at_eof = False
    while not at_eof:
        sentplus = next_conllu(opts.infile)
        if not sentplus:
            break
        for token in sentplus:
            if token.nontoken:
                if token.nontoken == 'comment':
                    pass
                elif token.nontoken == 'eof':
                    at_eof = True
                    break
                elif token.nontoken == 'separator':
                    sentence_count += 1
                elif token.nontoken == 'error':
                    print("Unrecognisable line:", token.error, file=stderr)
                    exit(1)
                else:
                    print("Error:", token, file=stderr)
                    exit(1)
                continue
            elif not token.surf:
                print("No surface in CONLL-U?", token, file=stderr)
                exit(1)
            token_count += 1
            omorfi.analyse(token)
            if token.is_oov():
                oov_count += 1
                omorfi.guess(token)
        disamparsulator.linguisticate(sentplus)
        print_analyses(sentplus, opts)
    cpu_end = process_time()
    wall_end = perf_counter()
    print("Tokens:", token_count, "Sentences:", sentence_count,
          file=opts.statfile)
    print("Unknowns / OOV:", oov_count, "=",
          oov_count / token_count * 100 if token_count != 0 else 0,
          "%", file=opts.statfile)
    print("CPU time:", cpu_end - cpu_start,
          "Real time:", wall_end - wall_start, file=opts.statfile)
    print("Tokens per timeunit:", token_count / (wall_end - wall_start),
          file=opts.statfile)
    print("Sentences per timeunit:", sentence_count / (wall_end - wall_start),
          file=opts.statfile)
    exit(0)
def main():
    """CLI entry point: analyse tokenised text and emit VISL-CG3 output.

    Reads --input in the tokenisation given by --format, analyses (and
    guesses OOV) tokens, optionally applies disamparsulation rules, and
    prints each token in VISL-CG3 form to --output.  Statistics go to
    --statistics (stderr by default when the payload goes elsewhere).
    """
    argp = ArgumentParser()
    argp.add_argument('-a', '--analyser', metavar='AFILE',
                      help="load analyser model from AFILE")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data")
    argp.add_argument('-v', '--verbose', action='store_true',
                      help="print verbosely while processing")
    argp.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                      help="print output into OUTFILE", type=FileType('w'))
    argp.add_argument('-F', '--format', metavar="INFORMAT", default='text',
                      help="read input using INFORMAT tokenisation",
                      choices=['text', 'vislcg', 'conllu', 'sentences'])
    argp.add_argument('-x', '--statistics', metavar="STATFILE",
                      dest="statfile", help="print statistics to STATFILE",
                      type=FileType('w'))
    argp.add_argument('--not-rules', metavar="RULEFILE", type=open,
                      help="read non-rules from RULEFILE")
    opts = argp.parse_args()
    omorfi = Omorfi(opts.verbose)
    if not opts.analyser:
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    if opts.verbose:
        print("reading analyser model", opts.analyser)
    omorfi.load_analyser(opts.analyser)
    disamparsulator = None
    if opts.not_rules:
        if opts.verbose:
            print("Reading rulestuff", opts.not_rules.name)
        disamparsulator = Disamparsulator()
        disamparsulator.frobblesnizz(opts.not_rules)
    if not opts.infile:
        opts.infile = stdin
    if opts.verbose:
        print("analysing", opts.infile.name)
    if not opts.outfile:
        opts.outfile = stdout
    if opts.verbose:
        print("writing to", opts.outfile.name)
    if not opts.statfile:
        # keep statistics out of the payload stream unless both share stdout
        opts.statfile = stdout if opts.outfile == stdout else stderr
    # timing statistics
    wall_start = perf_counter()
    cpu_start = process_time()
    token_count = 0
    oov_count = 0
    at_eof = False
    while not at_eof:
        if opts.format == 'vislcg':
            tokens = next_vislcg(opts.infile)
        elif opts.format == 'text':
            tokens = next_plaintext(opts.infile)
        elif opts.format == 'conllu':
            tokens = next_conllu(opts.infile)
        else:
            # 'sentences' is accepted by argparse but has no reader yet
            print("input format missing implementation", opts.format,
                  file=stderr)
            exit(2)
        if not tokens:
            break
        for token in tokens:
            if token.surf:
                token_count += 1
                omorfi.analyse(token)
                if token.is_oov():
                    oov_count += 1
                    omorfi.guess(token)
            elif token.error or token.nontoken:
                pass
            else:
                print("Unrecognised", token, file=stderr)
                exit(2)
        if disamparsulator:
            disamparsulator.linguisticate(tokens)
        for token in tokens:
            if token.nontoken and token.nontoken == "eof":
                at_eof = True
                break
            print(token.printable_vislcg(), file=opts.outfile)
    cpu_end = process_time()
    wall_end = perf_counter()
    print("# Tokens:", token_count, "\n# Unknown:", oov_count,
          oov_count / token_count * 100 if token_count > 0 else 0,
          "%", file=opts.statfile)
    print("# CPU time:", cpu_end - cpu_start,
          "\n# Real time:", wall_end - wall_start, file=opts.statfile)
    print("# Tokens per timeunit:", token_count / (wall_end - wall_start),
          file=opts.statfile)
    exit(0)
def main():
    """Command-line interface for testing generation on SIGMORPHON data.

    Reads shared-task TSV lines in one of three formats (--format 1/2/3),
    derives a lemma and an omor tag string for each, generates the surface
    form with omorfi, and counts exact matches against the gold column.
    Statistics go to --statistics.  Exits 1 on a format error, 2 on empty
    input.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-g', '--generator', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99,
                   help="if coverage is less than THOLD exit with error")
    a.add_argument('-F', '--format', metavar="FMT", required=True,
                   help="which SIGMORHON shared task format is used")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if options.generator:
            if options.verbose:
                print("reading generator from", options.generator)
            omorfi.load_generator(options.generator)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics
    correct = 0
    incorrect = 0
    oov = 0
    lines = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        omors = None
        lemma = None
        print("<<<", fields)
        if options.format == '1':
            # task 1: lemma + target tags given directly
            lemma = fields[0]
            omors = unimorph2omor(fields[1])
        elif options.format == '2':
            # task 2: infer the lemma by analysing the source form
            srcomors = unimorph2omor(fields[0])
            srchyps = omorfi.analyse(fields[1])
            for srchyp in srchyps:
                if srcomors in srchyp.raw and len(srchyp.get_lemmas()) == 1:
                    lemma = srchyp.get_lemmas()[0]
            if not lemma:
                # NOTE(review): raises IndexError when srchyps is empty —
                # confirm omorfi.analyse always returns at least one hypothesis
                lemma = ''.join(srchyps[0].get_lemmas())
            omors = unimorph2omor(fields[2])
        elif options.format == '3':
            # task 3: only the source form is given
            srchyps = omorfi.analyse(fields[0])
            for srchyp in srchyps:
                if len(srchyp.get_lemmas()) == 1:
                    lemma = srchyp.get_lemmas()[0]
            if not lemma:
                lemma = ''.join(srchyps[0].get_lemmas())
            omors = unimorph2omor(fields[1])
        else:
            print("format fail", options.format)
            exit(1)
        genomor = '[WORD_ID=' + lemma + ']' + omors
        print(">>> ", genomor)
        generations = omorfi.generate(genomor)
        if not generations or '[' in generations:
            # leftover brackets mean the generator failed to realise the tags
            oov += 1
            genat1 = lemma
            print("OOV", genat1)
        else:
            genat1 = generations.split('/')[0]
            print("@1 ", genat1)
        if options.format == '1':
            if genat1 == fields[2]:
                correct += 1
            else:
                print("MIS", genat1, "!=", fields[2])
                incorrect += 1
        elif options.format == '2':
            if genat1 == fields[3]:
                correct += 1
            else:
                # BUGFIX: diagnostic previously printed fields[2] although
                # the comparison is against fields[3]
                print("MIS", genat1, "!=", fields[3])
                incorrect += 1
        elif options.format == '3':
            if genat1 == fields[2]:
                correct += 1
            else:
                print("MIS", genat1, "!=", fields[2])
                incorrect += 1
        lines += 1
        if options.verbose and lines % 1000 == 0:
            print(lines, '...')
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    # NOTE(review): "Corect" is misspelt in the output header; left as-is in
    # case downstream scripts grep for it
    print("Lines", "Corect", "OOV", sep="\t", file=options.statfile)
    print(lines, correct, oov, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          correct / lines * 100 if lines != 0 else 0,
          oov / lines * 100, sep="\t", file=options.statfile)
    exit(0)
def main():
    """Tokenise text with an omorfi model from the command line.

    Reads plain text lines from --input and writes the tokenisation in the
    chosen --output-format (moses, conllu, json or ftb3) to --output.
    Line/token statistics go to --statistics (stderr by default).
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load tokeniser model from (analyser) AFILE",
                   required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses", help="format output for OUTFORMAT",
                   choices=['moses', 'conllu', 'json', 'ftb3'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading language model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed for tokenisation", file=stderr)
        exit(1)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# new doc id=", options.infile.name, file=options.outfile)
    for line in options.infile:
        # (dropped dead "line = line" no-op)
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf.surf for surf in surfs]),
                  file=options.outfile)
        elif options.output_format == 'json':
            # NOTE(review): stdlib json has no encode(); presumably this
            # should be json.dumps — confirm which json module is imported.
            # Also note this bypasses options.outfile and prints to stdout.
            print(json.encode(surfs))
        elif options.output_format == 'conllu':
            print("# sent_id =", lines, file=options.outfile)
            print("# text =", line.rstrip("\n"), file=options.outfile)
            # BUGFIX: tokens are the same objects as in the moses branch
            # (attribute access surf.surf), not dicts
            i = 1
            for surf in surfs:
                print(i, surf.surf, "_", "_", "_", "_", "_", "_", "_", "_",
                      sep="\t", file=options.outfile)
                i += 1
        elif options.output_format == 'ftb3':
            print("<s><loc file=\"", options.infile.name, "\" line=\"",
                  lines, "\" />", file=options.outfile, sep="")
            i = 1
            for surf in surfs:
                print(i, surf.surf, "_", "_", "_", "_", "_", "_", "_", "_",
                      sep="\t", file=options.outfile)
                i += 1
            print("</s>", file=options.outfile)
        if options.output_format == 'conllu':
            # blank line terminates each CoNLL-U sentence
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    # BUGFIX: guard the ratio against division by zero on empty input
    print("Lines:", lines, "Tokens:", tokens, "Ratio:",
          tokens / lines if lines != 0 else 0,
          "tokens/line", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)