def main():
    """Command-line interface for omorfi's sort | uniq -c tester.

    Reads frequency-sorted word lists ("<freq> <surface>" or, for the
    ftb3.1 format, tab-separated freq/surface/lemma/analysis fields),
    analyses every surface form, and reports coverage statistics.
    Exits non-zero when the measured rate is at or below the threshold,
    so it can drive a `make check` style regression target.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile",
                   help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f', '--format', metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"], default="coverage")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        # default any unset stream to the standard streams
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser,
              file=stderr)
        exit(2)
    # basic statistics
    # token-level counters are weighted by the input frequency column;
    # the types_* counters count each distinct word-form once
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    only_permuted = 0
    only_rehashed = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # types
    types_covered = 0
    types_no_results = 0
    types = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        # first space separates the frequency from the rest
        # (`sort | uniq -c` output); remaining fields are tab-separated
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        # NOTE(review): break (not continue) assumes the input is sorted
        # by descending frequency — confirm against the pipeline
        if freq < int(options.count):
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if options.format != 'coverage':
            # ftb3.1 input carries gold lemma and analysis columns
            lemma = fields[2]
            analysis = fields[3]
        lines += freq
        types += 1
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        token = Token(surf)
        # pos 1 triggers acceptable detitlecasing
        token.pos = 1
        omorfi.analyse(token)
        if token.is_oov():
            # fall back to the guesser before judging coverage
            omorfi.guess(token)
        if not token.is_oov():
            covered += freq
            types_covered += 1
        else:
            no_results += freq
            types_no_results += 1
            print(freq, "OOV", surf, sep='\t', file=options.outfile)
        # match-tracking flags for the gold comparison below:
        # rehashed/permuted stay True until an exact match is seen
        found_anals = False
        found_lemma = False
        rehashed = True
        permuted = True
        for anal in token.analyses:
            if options.format == 'ftb3.1':
                anal_ftb3 = ' '.join(anal.get_ftb_feats())
                lemma_ftb3 = '#'.join(anal.get_lemmas())
                # hacks ftb3:
                analysis = analysis.replace(" >>>", "")
                if analysis == anal_ftb3:
                    # exact tag-string match
                    found_anals = True
                    permuted = False
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    # same tags in a different order
                    found_anals = True
                    print(freq, "PERMUTAHIT", analysis, anal_ftb3, sep='\t',
                          file=options.outfile)
                else:
                    print(freq, "ANALMISS", analysis, anal_ftb3, sep='\t',
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    # exact lemma match (including '#' compound boundaries)
                    found_lemma = True
                    rehashed = False
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    # same lemma, compound boundaries hashed differently
                    found_lemma = True
                    print(freq, "LEMMARECOMP", lemma, lemma_ftb3, sep='\t',
                          file=options.outfile)
                else:
                    print(freq, "LEMMAMISS", lemma, lemma_ftb3, sep='\t',
                          file=options.outfile)
        if options.format != 'coverage':
            # classify the word-form by which parts matched the gold data
            if not found_anals and not found_lemma:
                no_matches += freq
                print(freq, "NOHITS!", surf, sep='\t', file=options.outfile)
            elif found_anals and found_lemma:
                full_matches += freq
            elif not found_anals:
                anal_matches += freq
                print(freq, "LEMMANOANAL", surf, sep='\t',
                      file=options.outfile)
            elif not found_lemma:
                lemma_matches += freq
                print(freq, "ANALNOLEMMA", surf, sep='\t',
                      file=options.outfile)
            else:
                # unreachable: the four cases above are exhaustive
                print("Logical error, kill everyone")
                exit(13)
            if rehashed:
                only_rehashed += freq
            if permuted:
                only_permuted += freq
    realend = perf_counter()
    cpuend = process_time()
    # timing goes to stdout; all other statistics go to statfile
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    # same three columns as percentages (first column is always 100%)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t", file=options.statfile)
    print("Types", "Covered", "OOV", sep="\t", file=options.statfile)
    print(types, types_covered, types - types_covered, sep="\t",
          file=options.statfile)
    print(types / types * 100 if types != 0 else 0,
          types_covered / types * 100 if types != 0 else 0,
          (types - types_covered) / types * 100 if types != 0 else 0,
          sep="\t", file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
              sep="\t", file=options.statfile)
        print(lines, full_matches, lemma_matches, anal_matches, no_matches,
              no_results, sep="\t", file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t", file=options.statfile)
        print("Of which", "Tag permuations", "Lemma rehashing", sep='\t',
              file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              only_permuted / lines * 100 if lines != 0 else 0,
              only_rehashed / lines * 100 if lines != 0 else 0,
              sep='\t', file=options.statfile)
    # exit status drives the regression test: 2 = no data, 1 = below
    # threshold, 0 = pass
    if lines == 0:
        print("Needs more than 0 lines to determine something",
              file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= int(options.threshold)):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= int(options.threshold)):
        print("needs to have", threshold, "% coverage to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
def main():
    """Invoke a simple CLI analyser.

    Reads CONLL-U input sentence by sentence, analyses (and, for OOV
    tokens, guesses) each token, runs the disambiguation rules over the
    sentence, and prints the analyses.  Timing and coverage statistics
    are written to the statistics stream at the end.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE', required=True,
                   help="read analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u', '--udpipe', metavar="UDPIPE",
                   help='use UDPIPE for additional guesses (experi-mental)')
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--not-rules', metavar="RULEFILE", type=open,
                   required=True, help="read non-rules from RULEFILE")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # unreachable in practice: -a is required=True above
        print("analyser is needed to conllu", file=stderr)
        exit(4)
    disamparsulator = Disamparsulator()
    if options.not_rules:
        if options.verbose:
            # NOTE(review): prints the open file object, not its name
            print("Loading", options.not_rules)
        disamparsulator.frobblesnizz(options.not_rules)
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    # default unset streams to the standard streams
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    eoffed = False
    while not eoffed:
        # one sentence (plus comments/separators) per iteration
        sentplus = next_conllu(options.infile)
        if not sentplus:
            eoffed = True
            break
        for token in sentplus:
            if token.nontoken:
                # non-token entries: comments pass through, separators
                # delimit sentences, eof/error terminate processing
                if token.nontoken == 'comment':
                    pass
                elif token.nontoken == 'eof':
                    eoffed = True
                    break
                elif token.nontoken == 'separator':
                    sentences += 1
                elif token.nontoken == 'error':
                    print("Unrecognisable line:", token.error,
                          file=stderr)
                    exit(1)
                else:
                    print("Error:", token, file=stderr)
                    exit(1)
                continue
            elif not token.surf:
                print("No surface in CONLL-U?", token,
                      file=stderr)
                exit(1)
            tokens += 1
            omorfi.analyse(token)
            if token.is_oov():
                unknowns += 1
                omorfi.guess(token)
        # rule-based disambiguation over the whole sentence
        disamparsulator.linguisticate(sentplus)
        print_analyses(sentplus, options)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    print("Sentences per timeunit:", sentences / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser for FTB3-formatted input.

    Reads 10-field FTB3 lines, analyses (and, for OOV tokens, guesses)
    each token, and prints the analyses either verbatim or matched
    against the input values (oracle mode).  Markup lines (<...>) are
    passed through; blank lines delimit sentences.  Timing and coverage
    statistics are written to the statistics stream at the end.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="read analyser model from AFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # unreachable in practice: -a is required=True above
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    # default unset streams to the standard streams
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0],
                      file=stderr)
                exit(1)
            token = Token(fields[1])
            # reuse the index validated above instead of re-parsing
            # fields[0]; position 1 marks the sentence-initial token
            token.pos = index
            omorfi.analyse(token)
            if token.is_oov():
                unknowns += 1
                omorfi.guess(token)
            if options.oracle:
                try_analyses_ftb(fields, index, token, options.outfile)
            else:
                print_analyses_ftb(index, token, options.outfile)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            # pass markup lines through unchanged
            print(line.strip(), file=options.outfile)
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser.

    Reads input in one of several tokenisation formats, analyses (and,
    for OOV tokens, guesses) each token, optionally applies
    disambiguation rules, and prints the tokens in VISL-CG3 format.
    Timing and coverage statistics are written to the statistics stream
    at the end.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-F', '--format', metavar="INFORMAT", default='text',
                   help="read input using INFORMAT tokenisation",
                   choices=['text', 'vislcg', 'conllu', 'sentences'])
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('--not-rules', metavar="RULEFILE", type=open,
                   help="read non-rules from RULEFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # NOTE(review): unlike the sibling tools, -a is not required=True
        # here, so this branch is reachable
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    # disambiguation is optional: only built when rules are supplied
    disamparsulator = None
    if options.not_rules:
        if options.verbose:
            print("Reading rulestuff", options.not_rules.name)
        disamparsulator = Disamparsulator()
        disamparsulator.frobblesnizz(options.not_rules)
    # default unset streams to the standard streams
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        # keep statistics out of the data stream when output goes
        # to a file is not the concern here: stats share stdout only
        # when the analyses also go to stdout, else go to stderr
        if options.outfile == stdout:
            options.statfile = stdout
        else:
            options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokencount = 0
    unknowns = 0
    eoffed = False
    while not eoffed:
        # dispatch on input format; each call yields the next chunk
        # of tokens
        if options.format == 'vislcg':
            tokens = next_vislcg(options.infile)
        elif options.format == 'text':
            tokens = next_plaintext(options.infile)
        elif options.format == 'conllu':
            tokens = next_conllu(options.infile)
        else:
            # 'sentences' is accepted by argparse but has no reader yet
            print("input format missing implementation", options.format,
                  file=stderr)
            exit(2)
        if not tokens:
            break
        # pass 1: analyse the surface tokens
        for token in tokens:
            if token.surf:
                tokencount += 1
                omorfi.analyse(token)
                if token.is_oov():
                    unknowns += 1
                    omorfi.guess(token)
            elif token.error or token.nontoken:
                pass
            else:
                print("Unrecognised", token, file=stderr)
                exit(2)
        if disamparsulator:
            disamparsulator.linguisticate(tokens)
        # pass 2: emit everything up to (not including) the eof marker
        for token in tokens:
            if token.nontoken and token.nontoken == "eof":
                eoffed = True
                break
            print(token.printable_vislcg(), file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("# Tokens:", tokencount, "\n# Unknown:", unknowns,
          unknowns / tokencount * 100 if tokencount > 0 else 0,
          "%", file=options.statfile)
    print("# CPU time:", cpuend - cpustart, "\n# Real time:",
          realend - realstart, file=options.statfile)
    print("# Tokens per timeunit:",
          tokencount / (realend - realstart),
          file=options.statfile)
    exit(0)