def main():
    """Command-line interface for omorfi's sort | uniq -c tester.

    Reads frequency-sorted word lists ("<freq> <surface>" or, for the
    ftb3.1 format, tab-separated freq/surface/lemma/analysis fields),
    analyses every surface form, and reports coverage statistics.
    Exits non-zero when the measured rate is at or below the threshold,
    so it can drive a `make check` style regression target.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile",
                   help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f', '--format', metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"], default="coverage")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        # default any unset stream to the standard streams
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser,
              file=stderr)
        exit(2)
    # basic statistics
    # token-level counters are weighted by the input frequency column;
    # the types_* counters count each distinct word-form once
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    only_permuted = 0
    only_rehashed = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # types
    types_covered = 0
    types_no_results = 0
    types = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        # first space separates the frequency from the rest
        # (`sort | uniq -c` output); remaining fields are tab-separated
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        # NOTE(review): break (not continue) assumes the input is sorted
        # by descending frequency — confirm against the pipeline
        if freq < int(options.count):
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if options.format != 'coverage':
            # ftb3.1 input carries gold lemma and analysis columns
            lemma = fields[2]
            analysis = fields[3]
        lines += freq
        types += 1
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        token = Token(surf)
        # pos 1 triggers acceptable detitlecasing
        token.pos = 1
        omorfi.analyse(token)
        if token.is_oov():
            # fall back to the guesser before judging coverage
            omorfi.guess(token)
        if not token.is_oov():
            covered += freq
            types_covered += 1
        else:
            no_results += freq
            types_no_results += 1
            print(freq, "OOV", surf, sep='\t', file=options.outfile)
        # match-tracking flags for the gold comparison below:
        # rehashed/permuted stay True until an exact match is seen
        found_anals = False
        found_lemma = False
        rehashed = True
        permuted = True
        for anal in token.analyses:
            if options.format == 'ftb3.1':
                anal_ftb3 = ' '.join(anal.get_ftb_feats())
                lemma_ftb3 = '#'.join(anal.get_lemmas())
                # hacks ftb3:
                analysis = analysis.replace(" >>>", "")
                if analysis == anal_ftb3:
                    # exact tag-string match
                    found_anals = True
                    permuted = False
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    # same tags in a different order
                    found_anals = True
                    print(freq, "PERMUTAHIT", analysis, anal_ftb3, sep='\t',
                          file=options.outfile)
                else:
                    print(freq, "ANALMISS", analysis, anal_ftb3, sep='\t',
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    # exact lemma match (including '#' compound boundaries)
                    found_lemma = True
                    rehashed = False
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    # same lemma, compound boundaries hashed differently
                    found_lemma = True
                    print(freq, "LEMMARECOMP", lemma, lemma_ftb3, sep='\t',
                          file=options.outfile)
                else:
                    print(freq, "LEMMAMISS", lemma, lemma_ftb3, sep='\t',
                          file=options.outfile)
        if options.format != 'coverage':
            # classify the word-form by which parts matched the gold data
            if not found_anals and not found_lemma:
                no_matches += freq
                print(freq, "NOHITS!", surf, sep='\t', file=options.outfile)
            elif found_anals and found_lemma:
                full_matches += freq
            elif not found_anals:
                anal_matches += freq
                print(freq, "LEMMANOANAL", surf, sep='\t',
                      file=options.outfile)
            elif not found_lemma:
                lemma_matches += freq
                print(freq, "ANALNOLEMMA", surf, sep='\t',
                      file=options.outfile)
            else:
                # unreachable: the four cases above are exhaustive
                print("Logical error, kill everyone")
                exit(13)
            if rehashed:
                only_rehashed += freq
            if permuted:
                only_permuted += freq
    realend = perf_counter()
    cpuend = process_time()
    # timing goes to stdout; all other statistics go to statfile
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    # same three columns as percentages (first column is always 100%)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t", file=options.statfile)
    print("Types", "Covered", "OOV", sep="\t", file=options.statfile)
    print(types, types_covered, types - types_covered, sep="\t",
          file=options.statfile)
    print(types / types * 100 if types != 0 else 0,
          types_covered / types * 100 if types != 0 else 0,
          (types - types_covered) / types * 100 if types != 0 else 0,
          sep="\t", file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
              sep="\t", file=options.statfile)
        print(lines, full_matches, lemma_matches, anal_matches, no_matches,
              no_results, sep="\t", file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t", file=options.statfile)
        print("Of which", "Tag permuations", "Lemma rehashing", sep='\t',
              file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              only_permuted / lines * 100 if lines != 0 else 0,
              only_rehashed / lines * 100 if lines != 0 else 0,
              sep='\t', file=options.statfile)
    # exit status drives the regression test: 2 = no data, 1 = below
    # threshold, 0 = pass
    if lines == 0:
        print("Needs more than 0 lines to determine something",
              file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= int(options.threshold)):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= int(options.threshold)):
        print("needs to have", threshold, "% coverage to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
def main():
    """Invoke a simple CLI analyser.

    Reads CONLL-U input sentence by sentence, analyses (and, for OOV
    tokens, guesses) each token, runs the disambiguation rules over the
    sentence, and prints the analyses.  Timing and coverage statistics
    are written to the statistics stream at the end.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE', required=True,
                   help="read analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u', '--udpipe', metavar="UDPIPE",
                   help='use UDPIPE for additional guesses (experi-mental)')
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--not-rules', metavar="RULEFILE", type=open,
                   required=True, help="read non-rules from RULEFILE")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # unreachable in practice: -a is required=True above
        print("analyser is needed to conllu", file=stderr)
        exit(4)
    disamparsulator = Disamparsulator()
    if options.not_rules:
        if options.verbose:
            # NOTE(review): prints the open file object, not its name
            print("Loading", options.not_rules)
        disamparsulator.frobblesnizz(options.not_rules)
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    # default unset streams to the standard streams
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    eoffed = False
    while not eoffed:
        # one sentence (plus comments/separators) per iteration
        sentplus = next_conllu(options.infile)
        if not sentplus:
            eoffed = True
            break
        for token in sentplus:
            if token.nontoken:
                # non-token entries: comments pass through, separators
                # delimit sentences, eof/error terminate processing
                if token.nontoken == 'comment':
                    pass
                elif token.nontoken == 'eof':
                    eoffed = True
                    break
                elif token.nontoken == 'separator':
                    sentences += 1
                elif token.nontoken == 'error':
                    print("Unrecognisable line:", token.error,
                          file=stderr)
                    exit(1)
                else:
                    print("Error:", token, file=stderr)
                    exit(1)
                continue
            elif not token.surf:
                print("No surface in CONLL-U?", token,
                      file=stderr)
                exit(1)
            tokens += 1
            omorfi.analyse(token)
            if token.is_oov():
                unknowns += 1
                omorfi.guess(token)
        # rule-based disambiguation over the whole sentence
        disamparsulator.linguisticate(sentplus)
        print_analyses(sentplus, options)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    print("Sentences per timeunit:", sentences / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser for FTB3-formatted input.

    Reads 10-field FTB3 lines, analyses (and, for OOV tokens, guesses)
    each token, and prints the analyses either verbatim or matched
    against the input values (oracle mode).  Markup lines (<...>) are
    passed through; blank lines delimit sentences.  Timing and coverage
    statistics are written to the statistics stream at the end.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="read analyser model from AFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # unreachable in practice: -a is required=True above
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    # default unset streams to the standard streams
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0],
                      file=stderr)
                exit(1)
            token = Token(fields[1])
            # reuse the index validated above instead of re-parsing
            # fields[0]; position 1 marks the sentence-initial token
            token.pos = index
            omorfi.analyse(token)
            if token.is_oov():
                unknowns += 1
                omorfi.guess(token)
            if options.oracle:
                try_analyses_ftb(fields, index, token, options.outfile)
            else:
                print_analyses_ftb(index, token, options.outfile)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            # pass markup lines through unchanged
            print(line.strip(), file=options.outfile)
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser.

    Reads input in one of several tokenisation formats, analyses (and,
    for OOV tokens, guesses) each token, optionally applies
    disambiguation rules, and prints the tokens in VISL-CG3 format.
    Timing and coverage statistics are written to the statistics stream
    at the end.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-F', '--format', metavar="INFORMAT", default='text',
                   help="read input using INFORMAT tokenisation",
                   choices=['text', 'vislcg', 'conllu', 'sentences'])
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('--not-rules', metavar="RULEFILE", type=open,
                   help="read non-rules from RULEFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # NOTE(review): unlike the sibling tools, -a is not required=True
        # here, so this branch is reachable
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    # disambiguation is optional: only built when rules are supplied
    disamparsulator = None
    if options.not_rules:
        if options.verbose:
            print("Reading rulestuff", options.not_rules.name)
        disamparsulator = Disamparsulator()
        disamparsulator.frobblesnizz(options.not_rules)
    # default unset streams to the standard streams
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        # keep statistics out of the data stream when output goes
        # to a file is not the concern here: stats share stdout only
        # when the analyses also go to stdout, else go to stderr
        if options.outfile == stdout:
            options.statfile = stdout
        else:
            options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokencount = 0
    unknowns = 0
    eoffed = False
    while not eoffed:
        # dispatch on input format; each call yields the next chunk
        # of tokens
        if options.format == 'vislcg':
            tokens = next_vislcg(options.infile)
        elif options.format == 'text':
            tokens = next_plaintext(options.infile)
        elif options.format == 'conllu':
            tokens = next_conllu(options.infile)
        else:
            # 'sentences' is accepted by argparse but has no reader yet
            print("input format missing implementation", options.format,
                  file=stderr)
            exit(2)
        if not tokens:
            break
        # pass 1: analyse the surface tokens
        for token in tokens:
            if token.surf:
                tokencount += 1
                omorfi.analyse(token)
                if token.is_oov():
                    unknowns += 1
                    omorfi.guess(token)
            elif token.error or token.nontoken:
                pass
            else:
                print("Unrecognised", token, file=stderr)
                exit(2)
        if disamparsulator:
            disamparsulator.linguisticate(tokens)
        # pass 2: emit everything up to (not including) the eof marker
        for token in tokens:
            if token.nontoken and token.nontoken == "eof":
                eoffed = True
                break
            print(token.printable_vislcg(), file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("# Tokens:", tokencount, "\n# Unknown:", unknowns,
          unknowns / tokencount * 100 if tokencount > 0 else 0,
          "%", file=options.statfile)
    print("# CPU time:", cpuend - cpustart, "\n# Real time:",
          realend - realstart, file=options.statfile)
    print("# Tokens per timeunit:",
          tokencount / (realend - realstart),
          file=options.statfile)
    exit(0)