Example #1
def main():
    """Command-line interface for omorfi's sort | uniq -c tester."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='FSAFILE',
                   required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   type=FileType('w'),
                   dest="outfile",
                   help="log outputs to OUTFILE")
    a.add_argument('-X',
                   '--statistics',
                   metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile",
                   help="statistics")
    a.add_argument('-v',
                   '--verbose',
                   action="store_true",
                   default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C',
                   '--no-casing',
                   action="store_true",
                   default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f',
                   '--format',
                   metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"],
                   default="coverage")
    a.add_argument('-c',
                   '--count',
                   metavar="FREQ",
                   default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t',
                   '--threshold',
                   metavar="THOLD",
                   default=99,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    only_permuted = 0
    only_rehashed = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # types
    types_covered = 0
    types_no_results = 0
    types = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if options.format != 'coverage':
            lemma = fields[2]
            analysis = fields[3]
        lines += freq
        types += 1
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        token = Token(surf)
        # pos 1 triggers acceptable detitlecasing
        token.pos = 1
        omorfi.analyse(token)
        if token.is_oov():
            omorfi.guess(token)
        if not token.is_oov():
            covered += freq
            types_covered += 1
        else:
            no_results += freq
            types_no_results += 1
            print(freq, "OOV", surf, sep='\t', file=options.outfile)
        found_anals = False
        found_lemma = False
        rehashed = True
        permuted = True
        for anal in token.analyses:
            if options.format == 'ftb3.1':
                anal_ftb3 = ' '.join(anal.get_ftb_feats())
                lemma_ftb3 = '#'.join(anal.get_lemmas())
                # hacks ftb3:
                analysis = analysis.replace(" >>>", "")
                if analysis == anal_ftb3:
                    found_anals = True
                    permuted = False
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    found_anals = True
                    print(freq,
                          "PERMUTAHIT",
                          analysis,
                          anal_ftb3,
                          sep='\t',
                          file=options.outfile)
                else:
                    print(freq,
                          "ANALMISS",
                          analysis,
                          anal_ftb3,
                          sep='\t',
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    found_lemma = True
                    rehashed = False
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    found_lemma = True
                    print(freq,
                          "LEMMARECOMP",
                          lemma,
                          lemma_ftb3,
                          sep='\t',
                          file=options.outfile)
                else:
                    print(freq,
                          "LEMMAMISS",
                          lemma,
                          lemma_ftb3,
                          sep='\t',
                          file=options.outfile)
        if options.format != 'coverage':
            if not found_anals and not found_lemma:
                no_matches += freq
                print(freq, "NOHITS!", surf, sep='\t', file=options.outfile)
            elif found_anals and found_lemma:
                full_matches += freq
            elif not found_anals:
                anal_matches += freq
                print(freq,
                      "LEMMANOANAL",
                      surf,
                      sep='\t',
                      file=options.outfile)
            elif not found_lemma:
                lemma_matches += freq
                print(freq,
                      "ANALNOLEMMA",
                      surf,
                      sep='\t',
                      file=options.outfile)
            else:
                print("Logical error, kill everyone")
                exit(13)
            if rehashed:
                only_rehashed += freq
            if permuted:
                only_permuted += freq
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t",
          file=options.statfile)
    print("Types", "Covered", "OOV", sep="\t", file=options.statfile)
    print(types,
          types_covered,
          types - types_covered,
          sep="\t",
          file=options.statfile)
    print(types / types * 100 if types != 0 else 0,
          types_covered / types * 100 if types != 0 else 0,
          (types - types_covered) / types * 100 if types != 0 else 0,
          sep="\t",
          file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines",
              "Matches",
              "Lemma",
              "Anals",
              "Mismatch",
              "No results",
              sep="\t",
              file=options.statfile)
        print(lines,
              full_matches,
              lemma_matches,
              anal_matches,
              no_matches,
              no_results,
              sep="\t",
              file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t",
              file=options.statfile)
        print("Of which",
              "Tag permuations",
              "Lemma rehashing",
              sep='\t',
              file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              only_permuted / lines * 100 if lines != 0 else 0,
              only_rehashed / lines * 100 if lines != 0 else 0,
              sep='\t',
              file=options.statfile)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= int(options.threshold)):
        print("needs to have",
              threshold,
              "% matches to pass regress test\n",
              "please examine",
              options.outfile.name,
              "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= int(options.threshold)):
        print("needs to have",
              threshold,
              "% coverage to pass regress test\n",
              "please examine",
              options.outfile.name,
              "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
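
# A minimal, self-contained sketch of the frequency-list parsing used above;
# typical `sort | uniq -c` output has a count, whitespace, then the word-form.
# The sample line is hypothetical.
sample = "   1234 kissa"
fields = sample.strip().replace(' ', '\t', 1).split('\t')
assert fields == ['1234', 'kissa']
freq, surf = int(fields[0]), fields[1]  # frequency count and surface form
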
Example #2
def main():
    """Command-line interface for omorfi's sort | uniq -c tester."""
    a = ArgumentParser()
    a.add_argument('-g', '--generator', metavar='FSAFILE', required=True,
                   help="load generator from FSAFILE")
    a.add_argument('-w', '--word', metavar="WORD_ID", required=True,
                   help="generate forms of word WORD_ID")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   default="markdown",
                   help="Create output table in OFORMAT")
    a.add_argument('-u', '--upos', metavar="UPOS", required=True,
                   choices=["ADJ", "NOUN", "VERB", "NUM", "X"],
                   help="generate inflection table for UPOS")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.generator:
            if options.verbose:
                print("reading generator from", options.generator)
            omorfi.load_generator(options.generator)
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.generator, file=stderr)
        exit(2)
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    print("### Inflection of", options.word, file=options.outfile)
    print(file=options.outfile)
    if options.upos == 'NOUN':
        print_nominals(omorfi, options.word, options.upos, options.outfile)
    elif options.upos == 'ADJ':
        print_comparatives(omorfi, options.word, options.upos, 'POS',
                           options.outfile)
        # comparisons
        print(file=options.outfile)
        print_comparatives(omorfi, options.word, options.upos, "CMP",
                           options.outfile)
        print(file=options.outfile)
        print_comparatives(omorfi, options.word, options.upos, "SUP",
                           options.outfile)
    elif options.upos == 'NUM':
        print_numerals(omorfi, options.word, options.upos, options.outfile)
    elif options.upos == 'VERB':
        print_finites(omorfi, options.word, options.upos, options.outfile)
        print(file=options.outfile)
        print_infinites(omorfi, options.word, options.upos, options.outfile)
    print(file=options.outfile)
    print("_Note:_ the inflection tables cover small percentage of the " +
          "whole inflectional paradigm, for full list, see [" +
          options.word + " full form list](" + options.word + ".html)",
          file=options.outfile)
    print(file=options.outfile)
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    exit(0)
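
# A small sketch of the markdown footer assembled above: the word id is
# spliced into a link to a per-word full form list. "kissa" is hypothetical.
word_id = "kissa"
link = "[" + word_id + " full form list](" + word_id + ".html)"
assert link == "[kissa full form list](kissa.html)"
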
Example #3
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   help="read analyser model from AFILE",
                   required=True)
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    a.add_argument('-O',
                   '--oracle',
                   action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X',
                   '--frequencies',
                   metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug',
                   action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout

    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)

    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0], file=stderr)
                exit(1)
            token = Token(fields[1])
            token.pos = int(fields[0])
            omorfi.analyse(token)
            if token.is_oov():
                unknowns += 1
                omorfi.guess(token)
            if options.oracle:
                try_analyses_ftb(fields, index, token, options.outfile)
            else:
                print_analyses_ftb(index, token, options.outfile)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            print(line.strip(), file=options.outfile)
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:",
          unknowns,
          "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%",
          file=options.statfile)
    print("CPU time:",
          cpuend - cpustart,
          "Real time:",
          realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:",
          tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
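
# A minimal sketch of the 10-field FTB3 rows the loop above expects; the row
# below is hypothetical, and only the index and surface fields are used here.
row = "1\tkissa\tkissa\tN\tN Sg Nom\t2\tsubj\t_\t_\t_"
fields = row.strip().split('\t')
assert len(fields) == 10
index, surf = int(fields[0]), fields[1]
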
Example #4
def main():
    global total_token_count, sent
    a = ArgumentParser()
    a.add_argument(
        '-f', '--fsa', metavar='FSAFILE', required=True,
        help="HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument(
        '-i', '--input', metavar="INFILE", type=str, required=True,
        dest="infile", help="source of analysis data")
    a.add_argument(
        '-m', '--master', metavar="TSVFILE", type=str, required=True,
        dest="tsvfile", help="source of existing lexical data")
    opts = a.parse_args()
    if opts.infile:
        test_corpora_files = glob(opts.infile)
    else:
        test_corpora_files = glob("*.text")
    # hard-coded logs for now
    # lemma_log = open('missing_word_ids.log', 'w')
    # case_log = open('missing_nominal_cases.log', 'w')
    # comp_log = open('missing_comparatives.log', 'w')
    # adposition_log = open('adposition_complements.log', 'w')
    # adposition_stats = open('adposition_complements_full.log', 'w')
    # adjective_log = open('adjective_agreements.log', 'w')
    proper_stats = open('proper_contexts_full.log', 'w')
    # open('../src/probabilistics/lemmas.freqs', 'w')
    lemma_stats = open('lemmas.freqs', 'w')
    # case_stats = open('../src/probabilistics/cases.freqs', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    gather_lemmas(open(opts.tsvfile))
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus ", test_corpus_file, ":", ioe)
    for test_corpus in test_corpora:
        print('lines from', test_corpus.name)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 500000) == 0:
                print(
                    linen, "...! Time to reload everything because memory is leaking very badly indeed!")
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()

            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            for punct in ".,:;?!()":
                line = line.replace(punct, " " + punct + " ")
            for token in line.split():
                total_token_count += 1
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
                # stat_nominal_cases(token, analyses, case_log)
                # stat_adjective_comps(token, analyses, comp_log)
    print("Testing statistics")
    # test_zero_lemmas(lemma_log)
    # test_zero_cases(case_log)
    # test_zero_comps(comp_log)
    # test_case_deviations()
    # test_adposition_complements(adposition_log)
    # test_adjective_agreements(adjective_log)
    print("Writing accurate statistics")
    # print_adposition_stats(adposition_stats)
    print_proper_stats(proper_stats)
    print_lemma_stats(lemma_stats)
    # print_case_stats(case_stats)
    exit(0)
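
# A self-contained sketch of the naive tokenisation above: punctuation gets
# padded with spaces so a plain whitespace split separates it from the words.
line = "kissa, koira; ja hiiri."
for punct in ".,:;?!()":
    line = line.replace(punct, " " + punct + " ")
assert line.split() == ['kissa', ',', 'koira', ';', 'ja', 'hiiri', '.']
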
Example #5
def main():
    """Command-line interface for omorfi's sort | uniq -c tester."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='FSAFILE',
                   required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   type=FileType('w'),
                   dest="outfile",
                   help="log outputs to OUTFILE")
    a.add_argument('-X',
                   '--statistics',
                   metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile",
                   help="statistics")
    a.add_argument('-v',
                   '--verbose',
                   action="store_true",
                   default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C',
                   '--no-casing',
                   action="store_true",
                   default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-t',
                   '--threshold',
                   metavar="THOLD",
                   default=99,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    only_permuted = 0
    accfails = 0
    lines = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        if ' ' in fields[1] or ' ' in fields[0]:
            continue
        lines += 1
        if options.verbose and lines % 1000 == 0:
            print(lines, '...')
        lemma = fields[0]
        surf = fields[1]
        unimorph = fields[2].replace('ACC', 'NOM').replace('GEADJ', 'GEN')
        token = Token(surf)
        omorfi.analyse(token)
        if not token.is_oov():
            covered += 1
        else:
            no_results += 1
            print(1, "OOV", surf, sep='\t', file=options.outfile)
        found_anals = False
        found_lemma = False
        permuted = True
        accfail = False
        for anal in token.analyses:
            analhyp = anal.printable_unimorph()
            lemmahyp = ''.join(anal.get_lemmas())
            if analhyp == unimorph:
                found_anals = True
                permuted = False
            elif set(analhyp.split(';')) == set(unimorph.split(';')):
                found_anals = True
                # print("PERMUTAHIT", analhyp, unimorph, sep='\t',
                #      file=options.outfile)
            else:
                pass
                # print("ANALMISS", analhyp, unimorph, sep='\t',
                #      file=options.outfile)
            if lemma == lemmahyp:
                found_lemma = True
            else:
                pass
                # print("LEMMAMISS", lemmahyp, lemma, sep='\t',
                #      file=options.outfile)
        if not found_anals and not found_lemma:
            no_matches += 1
            print("NOHITS!",
                  surf,
                  lemma,
                  unimorph, [a.printable_unimorph() for a in token.analyses],
                  sep='\t',
                  file=options.outfile)
        elif found_anals and found_lemma:
            full_matches += 1
        elif not found_anals:
            anal_matches += 1
            print("LEMMANOANAL",
                  surf,
                  unimorph, [a.printable_unimorph() for a in token.analyses],
                  sep='\t',
                  file=options.outfile)
        elif not found_lemma:
            lemma_matches += 1
            print("ANALNOLEMMA",
                  surf,
                  lemma, [a.get_lemmas() for a in token.analyses],
                  sep='\t',
                  file=options.outfile)
        else:
            print("Logical error, kill everyone")
            exit(13)
        if permuted:
            only_permuted += 1
        if accfail:
            accfails += 1
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t",
          file=options.statfile)
    print("Lines",
          "Matches",
          "Lemma",
          "Anals",
          "Mismatch",
          "No results",
          sep="\t",
          file=options.statfile)
    print(lines,
          full_matches,
          lemma_matches,
          anal_matches,
          no_matches,
          no_results,
          sep="\t",
          file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          full_matches / lines * 100 if lines != 0 else 0,
          lemma_matches / lines * 100 if lines != 0 else 0,
          anal_matches / lines * 100 if lines != 0 else 0,
          no_matches / lines * 100 if lines != 0 else 0,
          no_results / lines * 100 if lines != 0 else 0,
          sep="% \t",
          file=options.statfile)
    print("Of which", "Tag permuations", sep='\t', file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          only_permuted / lines * 100 if lines != 0 else 0,
          sep='\t',
          file=options.statfile)
    if full_matches / lines * 100 <= int(options.threshold):
        print("needs to have",
              threshold,
              "% matches to pass regress test\n",
              "please examine",
              options.outfile.name,
              "for regressions",
              file=stderr)
        exit(1)
    elif covered / lines * 100 <= int(options.threshold):
        print("needs to have",
              threshold,
              "% coverage to pass regress test\n",
              "please examine",
              options.outfile.name,
              "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
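
# A minimal sketch of the permutation matching above: two unimorph analyses
# count as a "PERMUTAHIT" when their ;-separated tag sets match in any order.
hyp, gold = "N;NOM;SG", "N;SG;NOM"
assert hyp != gold
assert set(hyp.split(';')) == set(gold.split(';'))
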
Example #6
def main():
    print("""Please note that the licence of FTC does not allow you to do much
    with the results or anything, other than including approximate numbers of
    recall for scientific purposes. Please do not look at the differences or
    do any processing with any of the data since it will automatically make
    your versions of all your future work on this or any other analysers of
    the Finnish language illegal and other bad things.""", file=stderr)
    password = "******"
    userpass = input("Write '%s': " % (password))
    if userpass != password:
        print(
            "You have chosen not to use badly licenced FTC data", file=stderr)
        exit(2)
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSADIR', required=True,
                   help="Location of omorfi automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", required=True,
                   type=FileType('w'),
                   dest="outfile", help="result file")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile", help="statistics")
    options = a.parse_args()
    omorfi = Omorfi()
    omorfi.load_from_dir(options.fsa)
    if not options.statfile:
        options.statfile = stdout
    # basic statistics
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # for make check target
    threshold = 0
    for line in options.infile:
        if '<w lemma' not in line or 'msd=' not in line:
            continue
        matches = re.search('<w.*lemma="([^"]*).*msd="([^"]*)".*>([^<]*)</w>',
                            line)
        if not matches:
            print("ERROR: Skipping line", line, file=stderr)
            continue
        lines += 1
        if lines % 100000 == 0:
            print(lines, "...", file=stderr)
        ftcsurf = matches.group(3)
        ftclemma = matches.group(1)
        ftcanals = matches.group(2)
        omors = omorfi.analyse(ftcsurf)
        anals = []
        for omor in omors:
            anals.append(convert_omor_string(omor.output, 'ftc'))
        found_anals = False
        found_lemma = False
        print_in = True
        for anal in anals:
            if ftcanals in anal:
                found_anals = True
            if ftclemma in anal:
                found_lemma = True
        if len(anals) == 0:
            print_in = False
            no_results += 1
            print("NORESULTS:", ftcsurf, ftclemma, ftcanals, sep="\t",
                  file=options.outfile)
        elif not found_anals and not found_lemma:
            no_matches += 1
            print("NOMATCH:", ftcsurf, ftclemma, ftcanals, sep="\t", end="\t",
                  file=options.outfile)
        elif not found_anals:
            lemma_matches += 1
            print("NOANALMATCH:", ftcsurf, ftcanals, sep="\t", end="\t",
                  file=options.outfile)
        elif not found_lemma:
            anal_matches += 1
            print("NOLEMMAMATCH:", ftcsurf, ftclemma, sep="\t", end="\t",
                  file=options.outfile)
        else:
            full_matches += 1
            print_in = False
        if print_in:
            print(":IN:", end="\t", file=options.outfile)
            for anal in anals:
                print(anal, end='\t', file=options.outfile)
            print(file=options.outfile)
    print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results", sep="\t",
          file=options.statfile)
    print(lines, full_matches, lemma_matches, anal_matches, no_matches,
          no_results,
          sep="\t", file=options.statfile)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    print(lines / lines * 100, full_matches / lines * 100,
          lemma_matches / lines * 100, anal_matches / lines * 100,
          no_matches / lines * 100, no_results / lines * 100,
          sep="\t", file=options.statfile)
    if (full_matches / lines * 100 < threshold):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
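
# A self-contained sketch of the FTC extraction regex used above, applied to
# a minimal, hypothetical <w> element.
import re
line = '<w lemma="kissa" msd="N Nom Sg">kissa</w>'
matches = re.search('<w.*lemma="([^"]*).*msd="([^"]*)".*>([^<]*)</w>', line)
assert matches.group(1) == 'kissa'     # lemma
assert matches.group(2) == 'N Nom Sg'  # morphosyntactic description
assert matches.group(3) == 'kissa'     # surface form
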
Example #7
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   required=True,
                   help="read analyser model from AFILE")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    a.add_argument('-O',
                   '--oracle',
                   action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u',
                   '--udpipe',
                   metavar="UDPIPE",
                   help='use UDPIPE for additional guesses (experimental)')
    a.add_argument('--hacks',
                   metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('-X',
                   '--frequencies',
                   metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--not-rules',
                   metavar="RULEFILE",
                   type=open,
                   required=True,
                   help="read non-rules from RULEFILE")
    a.add_argument('--debug',
                   action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed to conllu", file=stderr)
        exit(4)
    disamparsulator = Disamparsulator()
    if options.not_rules:
        if options.verbose:
            print("Loading", options.not_rules)
        disamparsulator.frobblesnizz(options.not_rules)
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout

    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)

    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    eoffed = False
    while not eoffed:
        sentplus = next_conllu(options.infile)
        if not sentplus:
            eoffed = True
            break
        for token in sentplus:
            if token.nontoken:
                if token.nontoken == 'comment':
                    pass
                elif token.nontoken == 'eof':
                    eoffed = True
                    break
                elif token.nontoken == 'separator':
                    sentences += 1
                elif token.nontoken == 'error':
                    print("Unrecognisable line:", token.error, file=stderr)
                    exit(1)
                else:
                    print("Error:", token, file=stderr)
                    exit(1)
                continue
            elif not token.surf:
                print("No surface in CONLL-U?", token, file=stderr)
                exit(1)
            tokens += 1
            omorfi.analyse(token)
            if token.is_oov():
                unknowns += 1
                omorfi.guess(token)
        disamparsulator.linguisticate(sentplus)
        print_analyses(sentplus, options)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:",
          unknowns,
          "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%",
          file=options.statfile)
    print("CPU time:",
          cpuend - cpustart,
          "Real time:",
          realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:",
          tokens / (realend - realstart),
          file=options.statfile)
    print("Sentences per timeunit:",
          sentences / (realend - realstart),
          file=options.statfile)
    exit(0)
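
# A minimal stand-in for the token shape the CoNLL-U loop above relies on:
# next_conllu yields real surface tokens plus sentinel "nontoken" entries
# (comment, separator, error, eof). This class is hypothetical and only
# illustrates the attributes the loop checks.
class SketchToken:
    def __init__(self, surf=None, nontoken=None, error=None):
        self.surf = surf          # surface form, when this is a real token
        self.nontoken = nontoken  # 'comment' | 'separator' | 'error' | 'eof'
        self.error = error        # message accompanying an 'error' entry

eof_marker = SketchToken(nontoken='eof')  # the entry that ends the while loop
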
Example #8
def main():
    global total_token_count
    a = ArgumentParser()
    a.add_argument(
        '-f',
        '--fsa',
        metavar='FSAFILE',
        required=True,
        help="HFST's optimised lookup binary data for the "
        "transducer to be applied")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=str,
                   required=True,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-m',
                   '--master',
                   metavar="TSVFILE",
                   type=str,
                   required=True,
                   dest="tsvfile",
                   help="source of existing lexical data")
    opts = a.parse_args()
    if opts.infile:
        test_corpora_files = glob(opts.infile)
    else:
        test_corpora_files = glob("*.text")
    # hard-coded logs for now
    # lemma_log = open('missing_word_ids.log', 'w')
    # case_log = open('missing_nominal_cases.log', 'w')
    # comp_log = open('missing_comparatives.log', 'w')
    # adposition_log = open('adposition_complements.log', 'w')
    # adposition_stats = open('adposition_complements_full.log', 'w')
    # adjective_log = open('adjective_agreements.log', 'w')
    proper_stats = open('proper_contexts_full.log', 'w')
    # open('../src/probabilistics/lemmas.freqs', 'w')
    lemma_stats = open('lemmas.freqs', 'w')
    # case_stats = open('../src/probabilistics/cases.freqs', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    gather_lemmas(open(opts.tsvfile))
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus ", test_corpus_file, ":", ioe)
    for test_corpus in test_corpora:
        print('lines from', test_corpus.name)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 500000) == 0:
                print(
                    linen,
                    "...! Time to reload everything because memory is leaking very badly indeed!"
                )
                previous = list()
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()

            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            for punct in ".,:;?!()":
                line = line.replace(punct, " " + punct + " ")
            for token in line.split():
                total_token_count += 1
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
                # stat_nominal_cases(token, analyses, case_log)
                # stat_adjective_comps(token, analyses, comp_log)
    print("Testing statistics")
    # test_zero_lemmas(lemma_log)
    # test_zero_cases(case_log)
    # test_zero_comps(comp_log)
    # test_case_deviations()
    # test_adposition_complements(adposition_log)
    # test_adjective_agreements(adjective_log)
    print("Writing accurate statistics")
    # print_adposition_stats(adposition_stats)
    print_proper_stats(proper_stats)
    print_lemma_stats(lemma_stats)
    # print_case_stats(case_stats)
    exit(0)
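
# A self-contained sketch of the periodic reload workaround above: drop every
# reference to the leaky state, then force a garbage-collection cycle.
import gc
state = [bytearray(1024) for _ in range(1000)]  # stand-in for leaked objects
state = None   # drop the only reference, as the loop does with its analyser
gc.collect()   # reclaim the now-unreachable memory immediately
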
Example #9
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   help="load analyser model from AFILE")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-F',
                   '--format',
                   metavar="INFORMAT",
                   default='text',
                   help="read input using INFORMAT tokenisation",
                   choices=['text', 'vislcg', 'conllu', 'sentences'])
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    a.add_argument('--not-rules',
                   metavar="RULEFILE",
                   type=open,
                   help="read non-rules from RULEFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    disamparsulator = None
    if options.not_rules:
        if options.verbose:
            print("Reading rulestuff", options.not_rules.name)
        disamparsulator = Disamparsulator()
        disamparsulator.frobblesnizz(options.not_rules)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        if options.outfile == stdout:
            options.statfile = stdout
        else:
            options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokencount = 0
    unknowns = 0
    eoffed = False
    while not eoffed:
        if options.format == 'vislcg':
            tokens = next_vislcg(options.infile)
        elif options.format == 'text':
            tokens = next_plaintext(options.infile)
        elif options.format == 'conllu':
            tokens = next_conllu(options.infile)
        else:
            print("input format missing implementation",
                  options.format,
                  file=stderr)
            exit(2)
        if not tokens:
            break
        for token in tokens:
            if token.surf:
                tokencount += 1
                omorfi.analyse(token)
                if token.is_oov():
                    unknowns += 1
                    omorfi.guess(token)
            elif token.error or token.nontoken:
                pass
            else:
                print("Unrecognised", token, file=stderr)
                exit(2)
        if disamparsulator:
            disamparsulator.linguisticate(tokens)
        for token in tokens:
            if token.nontoken and token.nontoken == "eof":
                eoffed = True
                break
            print(token.printable_vislcg(), file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("# Tokens:",
          tokencount,
          "\n# Unknown:",
          unknowns,
          unknowns / tokencount * 100 if tokencount > 0 else 0,
          "%",
          file=options.statfile)
    print("# CPU time:",
          cpuend - cpustart,
          "\n# Real time:",
          realend - realstart,
          file=options.statfile)
    print("# Tokens per timeunit:",
          tokencount / (realend - realstart),
          file=options.statfile)
    exit(0)
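
# A minimal sketch of the timing pattern used above: perf_counter measures
# wall-clock time, process_time measures CPU time, and throughput is the
# token count divided by the wall-clock delta.
from time import perf_counter, process_time
realstart, cpustart = perf_counter(), process_time()
work = sum(range(1000000))  # stand-in workload
print("# CPU time:", process_time() - cpustart,
      "\n# Real time:", perf_counter() - realstart)
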
Example #10
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-s',
                   '--segmenter',
                   metavar='SFILE',
                   help="load segmenter from SFILE",
                   required=True)
    a.add_argument('-S',
                   '--labeller',
                   metavar='LSFILE',
                   help="load labelsegmenter from LSFILE",
                   required=True)
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O',
                   '--output-format',
                   metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   required=True,
                   choices=["moses-factors", "segments"])
    a.add_argument('--no-split-words',
                   action="store_false",
                   default=True,
                   dest="split_words",
                   help="split on word boundaries")
    a.add_argument('--no-split-new-words',
                   action="store_false",
                   default=True,
                   dest="split_new_words",
                   help="split on new word boundaries " +
                   "(prev. unattested compounds)")
    a.add_argument('--no-split-morphs',
                   action="store_false",
                   default=True,
                   dest="split_morphs",
                   help="split on morph boundaries")
    a.add_argument('--split-derivs',
                   action="store_true",
                   default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords',
                   action="store_true",
                   default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker',
                   default='→ ←',
                   metavar='SEG',
                   help="mark segment boundaries with SEG")
    a.add_argument('--show-ambiguous',
                   default=False,
                   metavar='ASEP',
                   help="separate ambiguous segmentations with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("segmenter is needed for segmenting", file=stderr)
        exit(2)
    if options.labeller:
        if options.verbose:
            print("Reading labelsegmenter", options.labeller)
        omorfi.load_labelsegmenter(options.labeller)
    if not omorfi.can_segment or not omorfi.can_labelsegment:
        print("Could not load segmenter(s), re-compile them or use -f option")
        print()
        print("To compile segmenter, use --enable-segmenter, and/or",
              "--enable-labeled-segments")
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writign to", options.output)

    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            omorfi.segment(token)
            omorfi.labelsegment(token)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(token, outfile, options)
            elif options.output_format == 'segments':
                print_segments(token, outfile, options)
        print(file=outfile)
    exit(0)
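
# A small sketch of how the segment marker above is meant to appear in the
# output; the segmentation of "taloissa" below is hypothetical.
segments = ['talo', 'i', 'ssa']
print('→ ←'.join(segments))  # talo→ ←i→ ←ssa
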
Example #11
def main():
    """Command-line interface for omorfi's sort | uniq -c tester."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='FSAFILE',
                   required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-g',
                   '--generator',
                   metavar='FSAFILE',
                   required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   type=FileType('w'),
                   dest="outfile",
                   help="log outputs to OUTFILE")
    a.add_argument('-X',
                   '--statistics',
                   metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile",
                   help="statistics")
    a.add_argument('-v',
                   '--verbose',
                   action="store_true",
                   default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C',
                   '--no-casing',
                   action="store_true",
                   default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-t',
                   '--threshold',
                   metavar="THOLD",
                   default=99,
                   help="if coverage is less than THOLD exit with error")
    a.add_argument('-F',
                   '--format',
                   metavar="FMT",
                   required=True,
                   help="which SIGMORHON shared task format is used")

    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if options.generator:
            if options.verbose:
                print("reading generator from", options.generator)
            omorfi.load_generator(options.generator)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics
    correct = 0
    incorrect = 0
    oov = 0
    lines = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        omors = None
        lemma = None
        print("<<<", fields)
        if options.format == '1':
            lemma = fields[0]
            omors = unimorph2omor(fields[1])
        elif options.format == '2':
            srcomors = unimorph2omor(fields[0])
            srchyps = omorfi.analyse(fields[1])
            for srchyp in srchyps:
                if srcomors in srchyp.raw and len(srchyp.get_lemmas()) == 1:
                    lemma = srchyp.get_lemmas()[0]
            if not lemma:
                lemma = ''.join(srchyps[0].get_lemmas())
            omors = unimorph2omor(fields[2])
        elif options.format == '3':
            srchyps = omorfi.analyse(fields[0])
            for srchyp in srchyps:
                if len(srchyp.get_lemmas()) == 1:
                    lemma = srchyp.get_lemmas()[0]
            if not lemma:
                lemma = ''.join(srchyps[0].get_lemmas())
            omors = unimorph2omor(fields[1])
        else:
            print("format fail", options.format)
            exit(1)
        genomor = '[WORD_ID=' + lemma + ']' + omors
        print(">>> ", genomor)
        generations = omorfi.generate(genomor)
        if not generations or '[' in generations:
            oov += 1
            genat1 = lemma
            print("OOV", genat1)
        else:
            genat1 = generations.split('/')[0]
            print("@1 ", genat1)
        if options.format == '1':
            if genat1 == fields[2]:
                correct += 1
            else:
                print("MIS", genat1, "!=", fields[2])
                incorrect += 1
        elif options.format == '2':
            if genat1 == fields[3]:
                correct += 1
            else:
                print("MIS", genat1, "!=", fields[2])
                incorrect += 1
        elif options.format == '3':
            if genat1 == fields[2]:
                correct += 1
            else:
                print("MIS", genat1, "!=", fields[2])
                incorrect += 1
        lines += 1
        if options.verbose and lines % 1000 == 0:
            print(lines, '...')
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    print("Lines", "Corect", "OOV", sep="\t", file=options.statfile)
    print(lines, correct, oov, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          correct / lines * 100 if lines != 0 else 0,
          oov / lines * 100,
          sep="\t",
          file=options.statfile)
    exit(0)
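
# A minimal sketch of the generator input assembled above: an omor analysis
# string with the lemma spliced in front of the tag sequence. The tags below
# are hypothetical stand-ins for unimorph2omor output.
lemma = 'kissa'
omors = '[UPOS=NOUN][NUM=SG][CASE=NOM]'
genomor = '[WORD_ID=' + lemma + ']' + omors
assert genomor == '[WORD_ID=kissa][UPOS=NOUN][NUM=SG][CASE=NOM]'
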
Example #12
def main():
    print("""Please note that the licence of FTC does not allow you to do much
    with the results or anything, other than including approximate numbers of
    recall for scientific purposes. Please do not look at the differences or
    do any processing with any of the data since it will automatically make
    your versions of all your future work on this or any other analysers of
    the Finnish language illegal and other bad things.""",
          file=stderr)
    password = "******"
    userpass = input("Write '%s': " % (password))
    if userpass != password:
        print("You have chosen not to use badly licenced FTC data",
              file=stderr)
        exit(2)
    a = ArgumentParser()
    a.add_argument('-f',
                   '--fsa',
                   metavar='FSADIR',
                   required=True,
                   help="Location of omorfi automata")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   required=True,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   required=True,
                   type=FileType('w'),
                   dest="outfile",
                   help="result file")
    a.add_argument('-X',
                   '--statistics',
                   metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile",
                   help="statistics")
    options = a.parse_args()
    omorfi = Omorfi()
    omorfi.load_from_dir(options.fsa)
    if not options.statfile:
        options.statfile = stdout
    # basic statistics
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # for make check target
    threshold = 0
    for line in options.infile:
        if '<w lemma' not in line or 'msd=' not in line:
            continue
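        # Extract lemma (group 1), msd (group 2) and surface form (group 3)
        # from the FTC <w ...>...</w> element.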
        matches = re.search('<w.*lemma="([^"]*).*msd="([^"]*)".*>([^<]*)</w>',
                            line)
        if not matches:
            print("ERROR: Skipping line", line, file=stderr)
            continue
        lines += 1
        if lines % 100000 == 0:
            print(lines, "...", file=stderr)
        ftcsurf = matches.group(3)
        ftclemma = matches.group(1)
        ftcanals = matches.group(2)
        omors = omorfi.analyse(ftcsurf)
        anals = []
        for omor in omors:
            anals.append(convert_omor_string(omor.output, 'ftc'))
        found_anals = False
        found_lemma = False
        print_in = True
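        # Substring containment counts as a hit: FTC's lemma and msd strings
        # need only appear somewhere inside an omorfi analysis.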
        for anal in anals:
            if ftcanals in anal:
                found_anals = True
            if ftclemma in anal:
                found_lemma = True
        if len(anals) == 0:
            print_in = False
            no_results += 1
            print("NORESULTS:",
                  ftcsurf,
                  ftclemma,
                  ftcanals,
                  sep="\t",
                  file=options.outfile)
        elif not found_anals and not found_lemma:
            no_matches += 1
            print("NOMATCH:",
                  ftcsurf,
                  ftclemma,
                  ftcanals,
                  sep="\t",
                  end="\t",
                  file=options.outfile)
        elif not found_anals:
            lemma_matches += 1
            print("NOANALMATCH:",
                  ftcsurf,
                  ftcanals,
                  sep="\t",
                  end="\t",
                  file=options.outfile)
        elif not found_lemma:
            anal_matches += 1
            print("NOLEMMAMATCH:",
                  ftcsurf,
                  ftclemma,
                  sep="\t",
                  end="\t",
                  file=options.outfile)
        else:
            full_matches += 1
            print_in = False
        if print_in:
            print(":IN:", end="\t", file=options.outfile)
            for anal in anals:
                print(anal, end='\t', file=options.outfile)
            print(file=options.outfile)
    print("Lines",
          "Matches",
          "Lemma",
          "Anals",
          "Mismatch",
          "No results",
          sep="\t",
          file=options.statfile)
    print(lines,
          full_matches,
          lemma_matches,
          anal_matches,
          no_matches,
          no_results,
          sep="\t",
          file=options.statfile)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    print(lines / lines * 100,
          full_matches / lines * 100,
          lemma_matches / lines * 100,
          anal_matches / lines * 100,
          no_matches / lines * 100,
          no_results / lines * 100,
          sep="\t",
          file=options.statfile)
    if full_matches / lines * 100 < threshold:
        print("needs to have", threshold,
              "% matches to pass the regression test;",
              "please examine", options.outfile.name,
              "for regressions", file=stderr)
        exit(1)
    else:
        exit(0)
Example #13
0
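# Imports assumed for this snippet (not part of the scraped excerpt):
import json
from argparse import ArgumentParser, FileType
from sys import stdin, stdout, stderr
from time import perf_counter, process_time
from omorfi.omorfi import Omorfi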
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load tokeniser model from (analyser) AFILE",
                   required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses",
                   help="format output for OUTFORMAT", choices=['moses',
                                                                'conllu',
                                                                'json',
                                                                'ftb3'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading language model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed for tokenisation", file=stderr)
        exit(1)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# new doc id=", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
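        # Emit the tokens in the selected output format.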
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf.surf for surf in surfs]),
                  file=options.outfile)
        elif options.output_format == 'json':
            # json has no encode(); dump the surface forms, since Token
            # objects are not JSON-serialisable as-is
            print(json.dumps([surf.surf for surf in surfs]),
                  file=options.outfile)
        elif options.output_format == 'conllu':
            print("# sent_id =", lines, file=options.outfile)
            print("# text =", line.rstrip("\n"), file=options.outfile)
            i = 1
            for surf in surfs:
                # surf.surf, matching the moses branch above: tokenise()
                # returns Token objects here, not dicts
                print(i, surf.surf, "_", "_", "_", "_", "_", "_", "_", "_",
                      sep="\t", file=options.outfile)
                i += 1
        elif options.output_format == 'ftb3':
            print("<s><loc file=\"", options.infile.name, "\" line=\"",
                  lines, "\" />", file=options.outfile, sep="")
            i = 1
            for surf in surfs:
                print(i, surf.surf, "_", "_", "_", "_", "_", "_", "_", "_",
                      sep="\t", file=options.outfile)
                i += 1
            print("</s>", file=options.outfile)
        if options.output_format == 'conllu':
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("Lines:", lines, "Tokens:", tokens, "Ratio:", tokens / lines,
          "tokens/line", file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)