def main():
    """Invoke a simple CLI analyser.

    Parses the command line, loads the analyser model given with
    ``--analyser``, tokenises every input line according to ``--format``
    and hands each token together with its analyses to
    ``print_analyses_vislcg3``.  Tokens with no analyses, or with a single
    analysis containing ``UNKNOWN``, are counted as unknown and
    re-analysed with ``omorfi.guess``.  Token counts and timings are
    written to the statistics file once the input is exhausted.

    When ``--statistics`` is not given, statistics go to stdout if the
    output is stdout and to stderr otherwise.

    Exit status: 0 on success, 2 on an unimplemented input format or an
    error token from the tokeniser, 4 when no analyser model was given.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-F', '--format', metavar="INFORMAT", default='text',
                   help="read input using INFORMAT tokenisation",
                   choices=['text', 'vislcg', 'conllu'])
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    options = a.parse_args()

    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        # keep statistics out of a redirected output file
        if options.outfile == stdout:
            options.statfile = stdout
        else:
            options.statfile = stderr

    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    # previous token; the vislcg and conllu tokenisers take it as context
    last = None
    for line in options.infile:
        surfs = []
        if options.format == 'vislcg':
            surfs = get_line_tokens_vislcg(line, last)
        elif options.format == 'text':
            surfs = get_line_tokens(line, omorfi)
        elif options.format == 'conllu':
            surfs = get_line_tokens_conllu(line, last)
        else:
            print("input format missing implementation", options.format,
                  file=stderr)
            exit(2)
        for surf in surfs:
            if 'conllu_form' in surf:
                # skip conllu special forms in input for now:
                # (ellipsis and MWE magics)
                continue
            elif 'surf' in surf:
                tokens += 1
                anals = omorfi.analyse(surf)
                if len(anals) == 0 or (len(anals) == 1 and
                                       'UNKNOWN' in anals[0]['anal']):
                    unknowns += 1
                    anals = omorfi.guess(surf)
                print_analyses_vislcg3(surf, anals, options.outfile)
            elif 'comment' in surf:
                # comments starting with ';' or a tab are dropped,
                # everything else is echoed to the output
                if surf['comment'].startswith((';', '\t')):
                    continue
                print(surf['comment'], file=options.outfile)
            elif 'error' in surf:
                print(surf['error'], file=stderr)
                exit(2)
            # NOTE(review): assumed to sit inside the per-token loop, so
            # skipped tokens (the `continue` branches) do not update it.
            last = surf
    cpuend = process_time()
    realend = perf_counter()

    # Empty or comment-only input yields zero tokens; report 0 % instead
    # of crashing with ZeroDivisionError after all output was written.
    unknown_percent = unknowns / tokens * 100 if tokens != 0 else 0
    print("# Tokens:", tokens, "\n# Unknown:", unknowns,
          unknown_percent, "%", file=options.statfile)
    print("# CPU time:", cpuend - cpustart,
          "\n# Real time:", realend - realstart, file=options.statfile)
    print("# Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser.

    Reads 10-column tab-separated (CoNLL-U) input line by line, analyses
    the form in the second column and prints the result with one of
    ``debug_analyses_conllu`` (``--debug``), ``try_analyses_conllu``
    (``--oracle``) or ``print_analyses_conllu``.  Lines starting with
    ``#`` are echoed to the output, blank lines are echoed and counted as
    sentence boundaries.  Statistics are written to the statistics file
    (stdout unless ``--statistics`` is given) at the end.

    Diagnostics that are not part of the analysis output (the stdin
    notice and the unrecognised-comment warning) are written to stderr so
    that they cannot end up inside the CoNLL-U stream when the output is
    stdout.

    Exit status: 0 on success, 1 on an unparseable token index or a line
    that is neither a token, a comment nor blank.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u', '--udpipe', metavar="UDPIPE",
                   help='use UDPIPE for additional guesses (experi-mental)')
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle anaelyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()

    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, guesser=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    if not options.infile:
        # stderr: stdout is the default destination of the analyses
        print("reading from <stdin>", file=stderr)
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout

    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    recognised_comments = ['sent_id =', 'text =', 'doc-name:',
                           'sentence-text:']
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is 10 field format
            # NOTE(review): MWE and ghost lines below are counted as
            # tokens although they are neither analysed nor echoed.
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    # MWE
                    continue
                elif '.' in fields[0]:
                    # a ghost
                    continue
                else:
                    print("Cannot figure out token index", fields[0],
                          file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or (len(anals) == 1 and 'UNKNOWN' in anals[0][0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            # a token for which even the guesser returns nothing is
            # silently left out of the output
            if anals:
                if options.debug:
                    debug_analyses_conllu(fields, index, surf, anals,
                                          options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
        elif line.startswith('#'):
            print(line.strip(), file=options.outfile)
            recognised = any(line.startswith('# ' + rec)
                             for rec in recognised_comments)
            if not recognised and options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n',
                      file=stderr)
        elif not line.strip():
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format:", line, sep='\n', file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()

    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser.

    Reads 10-column tab-separated (FTB3) input line by line, analyses the
    form in the second column with the model given by ``--analyser`` and
    prints the result with ``try_analyses_ftb`` (``--oracle``) or
    ``print_analyses_ftb``.  Lines of the form ``<...>`` are echoed to the
    output, blank lines are echoed and counted as sentence boundaries.
    With ``--frequencies`` the files ``lexemes.freqs`` and ``omors.freqs``
    from the given directory are loaded into the analyser first.
    Statistics are written to the statistics file (stdout unless
    ``--statistics`` is given) at the end.

    Diagnostics that are not part of the analysis output (the stdin
    notice and the failure message) are written to stderr so that they
    cannot end up inside the analysis stream when the output is stdout.

    Exit status: 0 on success, 1 on an unparseable token index, a token
    without any analysis or guess, or a malformed line, 4 when no
    analyser model was given.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="read analyser model from AFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()

    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # only reachable with an empty value, the option itself is required
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    if not options.infile:
        # stderr: stdout is the default destination of the analyses
        print("reading from <stdin>", file=stderr)
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout

    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)

    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0],
                      file=stderr)
                exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or (len(anals) == 1 and 'OOV' in anals[0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals:
                if options.oracle:
                    try_analyses_ftb(fields, index, surf, anals,
                                     options.outfile)
                else:
                    print_analyses_ftb(index, surf, anals[0],
                                       options.outfile)
            else:
                # fatal diagnostic, keep it out of the analysis output
                print("Failed:", fields, file=stderr)
                exit(1)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            print(line.strip(), file=options.outfile)
        elif not line.strip():
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()

    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)