def main():
    """Analyse FTB3-formatted input with omorfi from the command line.

    Reads 10-field FTB3 lines from --input (default stdin), analyses each
    token, guesses OOV tokens, and writes analyses (oracle-matched when
    --oracle is given) to --output.  Throughput statistics go to
    --statistics.  Exits 0 on success, 1 on malformed input.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="read analyser model from AFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # unreachable in practice (--analyser is required=True); kept as a
        # belt-and-braces guard
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0], file=stderr)
                exit(1)
            token = Token(fields[1])
            # BUGFIX: reuse the validated index instead of re-parsing
            # fields[0] a second time
            token.pos = index
            omorfi.analyse(token)
            if token.is_oov():
                unknowns += 1
                omorfi.guess(token)
            if options.oracle:
                try_analyses_ftb(fields, index, token, options.outfile)
            else:
                print_analyses_ftb(index, token, options.outfile)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            # pass XML-ish markup lines through unchanged
            print(line.strip(), file=options.outfile)
        elif line.strip() == '':
            # (dropped dead "not line" disjunct: file iteration never yields
            # an empty string)
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Command-line interface for omorfi's sort | uniq -c tester.

    Reads ``freq word [lemma analysis]`` lines (sorted by descending
    frequency), analyses every word-form, and accumulates coverage and —
    for the ftb3.1 format — lemma/analysis match statistics, weighted by
    frequency.  Mismatch logs go to --output, statistics to --statistics.
    Exits 1 when the result falls below --threshold, 2 on empty input.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f', '--format', metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"], default="coverage")
    # IMPROVEMENT: parse numeric options once via argparse instead of
    # calling int() on every loop iteration / comparison
    a.add_argument('-c', '--count', metavar="FREQ", default=0, type=int,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99, type=int,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics (token counts weighted by corpus frequency)
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    only_permuted = 0
    only_rehashed = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # type (unique word-form) counts
    types_covered = 0
    types_no_results = 0
    types = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        # "freq word..." may be space- or tab-separated; normalise first gap
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < options.count:
            # input is sorted by descending frequency, so we can stop here
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if options.format != 'coverage':
            lemma = fields[2]
            analysis = fields[3]
        lines += freq
        types += 1
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        token = Token(surf)
        # pos 1 triggers acceptable detitlecasing
        token.pos = 1
        omorfi.analyse(token)
        if token.is_oov():
            omorfi.guess(token)
        if not token.is_oov():
            covered += freq
            types_covered += 1
        else:
            no_results += freq
            types_no_results += 1
            print(freq, "OOV", surf, sep='\t', file=options.outfile)
        found_anals = False
        found_lemma = False
        rehashed = True
        permuted = True
        for anal in token.analyses:
            if options.format == 'ftb3.1':
                anal_ftb3 = ' '.join(anal.get_ftb_feats())
                lemma_ftb3 = '#'.join(anal.get_lemmas())
                # hacks ftb3:
                analysis = analysis.replace(" >>>", "")
                if analysis == anal_ftb3:
                    found_anals = True
                    permuted = False
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    # same tags, different order
                    found_anals = True
                    print(freq, "PERMUTAHIT", analysis, anal_ftb3, sep='\t',
                          file=options.outfile)
                else:
                    print(freq, "ANALMISS", analysis, anal_ftb3, sep='\t',
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    found_lemma = True
                    rehashed = False
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    # same lemma, different compound-boundary hashing
                    found_lemma = True
                    print(freq, "LEMMARECOMP", lemma, lemma_ftb3, sep='\t',
                          file=options.outfile)
                else:
                    print(freq, "LEMMAMISS", lemma, lemma_ftb3, sep='\t',
                          file=options.outfile)
        if options.format != 'coverage':
            if not found_anals and not found_lemma:
                no_matches += freq
                print(freq, "NOHITS!", surf, sep='\t', file=options.outfile)
            elif found_anals and found_lemma:
                full_matches += freq
            elif not found_anals:
                anal_matches += freq
                print(freq, "LEMMANOANAL", surf, sep='\t',
                      file=options.outfile)
            elif not found_lemma:
                lemma_matches += freq
                print(freq, "ANALNOLEMMA", surf, sep='\t',
                      file=options.outfile)
            else:
                print("Logical error, kill everyone")
                exit(13)
            if rehashed:
                only_rehashed += freq
            if permuted:
                only_permuted += freq
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t", file=options.statfile)
    print("Types", "Covered", "OOV", sep="\t", file=options.statfile)
    print(types, types_covered, types - types_covered, sep="\t",
          file=options.statfile)
    print(types / types * 100 if types != 0 else 0,
          types_covered / types * 100 if types != 0 else 0,
          (types - types_covered) / types * 100 if types != 0 else 0,
          sep="\t", file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
              sep="\t", file=options.statfile)
        print(lines, full_matches, lemma_matches, anal_matches, no_matches,
              no_results, sep="\t", file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t", file=options.statfile)
        print("Of which", "Tag permuations", "Lemma rehashing", sep='\t',
              file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              only_permuted / lines * 100 if lines != 0 else 0,
              only_rehashed / lines * 100 if lines != 0 else 0,
              sep='\t', file=options.statfile)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= options.threshold):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= options.threshold):
        print("needs to have", threshold, "% coverage to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
def main():
    """Command-line interface for omorfi's sort | uniq -c tester.

    Reads tab-separated ``lemma<TAB>surface<TAB>unimorph`` lines, analyses
    each surface form, and tallies how often the analyses and lemmas match
    the gold data.  Mismatch logs go to --output, statistics to
    --statistics.  Exits 1 when the match rate or coverage falls at or
    below --threshold, 2 on empty input.
    """
    argp = ArgumentParser()
    argp.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                      help="load analyser from FSAFILE")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data")
    argp.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                      dest="outfile", help="log outputs to OUTFILE")
    argp.add_argument('-X', '--statistics', metavar="STATFILE",
                      type=FileType('w'), dest="statfile", help="statistics")
    argp.add_argument('-v', '--verbose', action="store_true", default=False,
                      help="Print verbosely while processing")
    argp.add_argument('-C', '--no-casing', action="store_true", default=False,
                      help="Do not try to recase input and output when matching")
    argp.add_argument('-t', '--threshold', metavar="THOLD", default=99,
                      help="if coverage is less than THOLD exit with error")
    opts = argp.parse_args()
    omorfi = Omorfi(opts.verbose)
    try:
        if opts.analyser:
            if opts.verbose:
                print("reading analyser from", opts.analyser)
            omorfi.load_analyser(opts.analyser)
        if not opts.infile:
            opts.infile = stdin
            print("reading from <stdin>")
        if not opts.statfile:
            opts.statfile = stdout
        if not opts.outfile:
            opts.outfile = stdout
    except IOError:
        print("Could not process file", opts.analyser, file=stderr)
        exit(2)
    # basic statistics
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    only_permuted = 0
    accfails = 0
    lines = 0
    # for make check target
    threshold = opts.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in opts.infile:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        if ' ' in fields[1] or ' ' in fields[0]:
            # multiword entries are skipped
            continue
        lines += 1
        if opts.verbose and lines % 1000 == 0:
            print(lines, '...')
        lemma, surf = fields[0], fields[1]
        # normalise gold tags before comparison
        unimorph = fields[2].replace('ACC', 'NOM').replace('GEADJ', 'GEN')
        token = Token(surf)
        omorfi.analyse(token)
        if token.is_oov():
            no_results += 1
            print(1, "OOV", surf, sep='\t', file=opts.outfile)
        else:
            covered += 1
        found_anals = False
        found_lemma = False
        permuted = True
        accfail = False
        for hyp in token.analyses:
            hyp_uni = hyp.printable_unimorph()
            hyp_lemma = ''.join(hyp.get_lemmas())
            if hyp_uni == unimorph:
                found_anals = True
                permuted = False
            elif set(hyp_uni.split(';')) == set(unimorph.split(';')):
                # same tags in a different order still counts as a hit
                found_anals = True
            if lemma == hyp_lemma:
                found_lemma = True
        if not found_anals and not found_lemma:
            no_matches += 1
            print("NOHITS!", surf, lemma, unimorph,
                  [h.printable_unimorph() for h in token.analyses],
                  sep='\t', file=opts.outfile)
        elif found_anals and found_lemma:
            full_matches += 1
        elif not found_anals:
            anal_matches += 1
            print("LEMMANOANAL", surf, unimorph,
                  [h.printable_unimorph() for h in token.analyses],
                  sep='\t', file=opts.outfile)
        elif not found_lemma:
            lemma_matches += 1
            print("ANALNOLEMMA", surf, lemma,
                  [h.get_lemmas() for h in token.analyses],
                  sep='\t', file=opts.outfile)
        else:
            print("Logical error, kill everyone")
            exit(13)
        if permuted:
            only_permuted += 1
        if accfail:
            # NOTE(review): accfail is never set True above, so this counter
            # stays 0 — preserved as-is
            accfails += 1
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    print("Lines", "Covered", "OOV", sep="\t", file=opts.statfile)
    print(lines, covered, lines - covered, sep="\t", file=opts.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t", file=opts.statfile)
    print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
          sep="\t", file=opts.statfile)
    print(lines, full_matches, lemma_matches, anal_matches, no_matches,
          no_results, sep="\t", file=opts.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          full_matches / lines * 100 if lines != 0 else 0,
          lemma_matches / lines * 100 if lines != 0 else 0,
          anal_matches / lines * 100 if lines != 0 else 0,
          no_matches / lines * 100 if lines != 0 else 0,
          no_results / lines * 100 if lines != 0 else 0,
          sep="% \t", file=opts.statfile)
    print("Of which", "Tag permuations", sep='\t', file=opts.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          only_permuted / lines * 100 if lines != 0 else 0,
          sep='\t', file=opts.statfile)
    if full_matches / lines * 100 <= int(opts.threshold):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", opts.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif covered / lines * 100 <= int(opts.threshold):
        print("needs to have", threshold, "% coverage to pass regress test\n",
              "please examine", opts.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
def main():
    """CLI entry point: analyse CONLL-U input with omorfi.

    Streams CONLL-U sentences from --input, analyses every token (guessing
    the OOV ones), runs the disamparsulator rules over each sentence, and
    prints the result.  Summary statistics land in --statistics.
    """
    argp = ArgumentParser()
    argp.add_argument('-a', '--analyser', metavar='AFILE', required=True,
                      help="read analyser model from AFILE")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data")
    argp.add_argument('-v', '--verbose', action='store_true',
                      help="print verbosely while processing")
    argp.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                      help="print output into OUTFILE", type=FileType('w'))
    argp.add_argument('-x', '--statistics', metavar="STATFILE",
                      dest="statfile", help="print statistics to STATFILE",
                      type=FileType('w'))
    argp.add_argument('-O', '--oracle', action='store_true',
                      help="match to values in input when parsing if possible")
    argp.add_argument('-u', '--udpipe', metavar="UDPIPE",
                      help='use UDPIPE for additional guesses (experi-mental)')
    argp.add_argument('--hacks', metavar='HACKS',
                      help="mangle analyses to match HACKS version of UD",
                      choices=['ftb'])
    argp.add_argument('-X', '--frequencies', metavar="FREQDIR",
                      help="read frequencies from FREQDIR/*.freqs")
    argp.add_argument('--not-rules', metavar="RULEFILE", type=open,
                      required=True, help="read non-rules from RULEFILE")
    argp.add_argument('--debug', action='store_true',
                      help="print lots of debug info while processing")
    opts = argp.parse_args()
    if opts.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(opts.verbose)
    if opts.analyser:
        if opts.verbose:
            print("reading analyser model", opts.analyser)
        omorfi.load_analyser(opts.analyser)
    else:
        print("analyser is needed to conllu", file=stderr)
        exit(4)
    disamparsulator = Disamparsulator()
    if opts.not_rules:
        if opts.verbose:
            print("Loading", opts.not_rules)
        disamparsulator.frobblesnizz(opts.not_rules)
    if opts.udpipe:
        if opts.verbose:
            print("Loading udpipe", opts.udpipe)
        omorfi.load_udpipe(opts.udpipe)
    if not opts.infile:
        print("reading from <stdin>")
        opts.infile = stdin
    if opts.verbose:
        print("analysing", opts.infile.name)
    if not opts.outfile:
        opts.outfile = stdout
    if opts.verbose:
        print("writing to", opts.outfile.name)
    if not opts.statfile:
        opts.statfile = stdout
    if opts.frequencies:
        with open(opts.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(opts.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # timing / throughput statistics
    wall_start = perf_counter()
    cpu_start = process_time()
    token_count = 0
    oov_count = 0
    sentence_count = 0
    at_eof = False
    while not at_eof:
        sentplus = next_conllu(opts.infile)
        if not sentplus:
            break
        for token in sentplus:
            if token.nontoken:
                if token.nontoken == 'comment':
                    pass
                elif token.nontoken == 'eof':
                    at_eof = True
                    break
                elif token.nontoken == 'separator':
                    sentence_count += 1
                elif token.nontoken == 'error':
                    print("Unrecognisable line:", token.error, file=stderr)
                    exit(1)
                else:
                    print("Error:", token, file=stderr)
                    exit(1)
                continue
            elif not token.surf:
                print("No surface in CONLL-U?", token, file=stderr)
                exit(1)
            token_count += 1
            omorfi.analyse(token)
            if token.is_oov():
                oov_count += 1
                omorfi.guess(token)
        disamparsulator.linguisticate(sentplus)
        print_analyses(sentplus, opts)
    cpu_end = process_time()
    wall_end = perf_counter()
    print("Tokens:", token_count, "Sentences:", sentence_count,
          file=opts.statfile)
    print("Unknowns / OOV:", oov_count, "=",
          oov_count / token_count * 100 if token_count != 0 else 0,
          "%", file=opts.statfile)
    print("CPU time:", cpu_end - cpu_start,
          "Real time:", wall_end - wall_start, file=opts.statfile)
    print("Tokens per timeunit:", token_count / (wall_end - wall_start),
          file=opts.statfile)
    print("Sentences per timeunit:", sentence_count / (wall_end - wall_start),
          file=opts.statfile)
    exit(0)
def main():
    """CLI entry point: analyse tokenised text and emit VISL-CG3 output.

    Reads --input in the tokenisation given by --format, analyses (and
    guesses OOV) tokens, optionally applies disamparsulation rules, and
    prints each token in VISL-CG3 form to --output.  Statistics go to
    --statistics (stderr by default when the payload goes elsewhere).
    """
    argp = ArgumentParser()
    argp.add_argument('-a', '--analyser', metavar='AFILE',
                      help="load analyser model from AFILE")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data")
    argp.add_argument('-v', '--verbose', action='store_true',
                      help="print verbosely while processing")
    argp.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                      help="print output into OUTFILE", type=FileType('w'))
    argp.add_argument('-F', '--format', metavar="INFORMAT", default='text',
                      help="read input using INFORMAT tokenisation",
                      choices=['text', 'vislcg', 'conllu', 'sentences'])
    argp.add_argument('-x', '--statistics', metavar="STATFILE",
                      dest="statfile", help="print statistics to STATFILE",
                      type=FileType('w'))
    argp.add_argument('--not-rules', metavar="RULEFILE", type=open,
                      help="read non-rules from RULEFILE")
    opts = argp.parse_args()
    omorfi = Omorfi(opts.verbose)
    if not opts.analyser:
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    if opts.verbose:
        print("reading analyser model", opts.analyser)
    omorfi.load_analyser(opts.analyser)
    disamparsulator = None
    if opts.not_rules:
        if opts.verbose:
            print("Reading rulestuff", opts.not_rules.name)
        disamparsulator = Disamparsulator()
        disamparsulator.frobblesnizz(opts.not_rules)
    if not opts.infile:
        opts.infile = stdin
    if opts.verbose:
        print("analysing", opts.infile.name)
    if not opts.outfile:
        opts.outfile = stdout
    if opts.verbose:
        print("writing to", opts.outfile.name)
    if not opts.statfile:
        # keep statistics out of the payload stream unless both share stdout
        opts.statfile = stdout if opts.outfile == stdout else stderr
    # timing statistics
    wall_start = perf_counter()
    cpu_start = process_time()
    token_count = 0
    oov_count = 0
    at_eof = False
    while not at_eof:
        if opts.format == 'vislcg':
            tokens = next_vislcg(opts.infile)
        elif opts.format == 'text':
            tokens = next_plaintext(opts.infile)
        elif opts.format == 'conllu':
            tokens = next_conllu(opts.infile)
        else:
            # 'sentences' is accepted by argparse but has no reader yet
            print("input format missing implementation", opts.format,
                  file=stderr)
            exit(2)
        if not tokens:
            break
        for token in tokens:
            if token.surf:
                token_count += 1
                omorfi.analyse(token)
                if token.is_oov():
                    oov_count += 1
                    omorfi.guess(token)
            elif token.error or token.nontoken:
                pass
            else:
                print("Unrecognised", token, file=stderr)
                exit(2)
        if disamparsulator:
            disamparsulator.linguisticate(tokens)
        for token in tokens:
            if token.nontoken and token.nontoken == "eof":
                at_eof = True
                break
            print(token.printable_vislcg(), file=opts.outfile)
    cpu_end = process_time()
    wall_end = perf_counter()
    print("# Tokens:", token_count, "\n# Unknown:", oov_count,
          oov_count / token_count * 100 if token_count > 0 else 0,
          "%", file=opts.statfile)
    print("# CPU time:", cpu_end - cpu_start,
          "\n# Real time:", wall_end - wall_start, file=opts.statfile)
    print("# Tokens per timeunit:", token_count / (wall_end - wall_start),
          file=opts.statfile)
    exit(0)
def main():
    """Command-line interface for testing generation on SIGMORPHON data.

    Reads shared-task TSV lines in one of three formats (--format 1/2/3),
    derives a lemma and an omor tag string for each, generates the surface
    form with omorfi, and counts exact matches against the gold column.
    Statistics go to --statistics.  Exits 1 on a format error, 2 on empty
    input.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-g', '--generator', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99,
                   help="if coverage is less than THOLD exit with error")
    a.add_argument('-F', '--format', metavar="FMT", required=True,
                   help="which SIGMORHON shared task format is used")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if options.generator:
            if options.verbose:
                print("reading generator from", options.generator)
            omorfi.load_generator(options.generator)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics
    correct = 0
    incorrect = 0
    oov = 0
    lines = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        omors = None
        lemma = None
        print("<<<", fields)
        if options.format == '1':
            # task 1: lemma + target tags given directly
            lemma = fields[0]
            omors = unimorph2omor(fields[1])
        elif options.format == '2':
            # task 2: infer the lemma by analysing the source form
            srcomors = unimorph2omor(fields[0])
            srchyps = omorfi.analyse(fields[1])
            for srchyp in srchyps:
                if srcomors in srchyp.raw and len(srchyp.get_lemmas()) == 1:
                    lemma = srchyp.get_lemmas()[0]
            if not lemma:
                # NOTE(review): raises IndexError when srchyps is empty —
                # confirm omorfi.analyse always returns at least one hypothesis
                lemma = ''.join(srchyps[0].get_lemmas())
            omors = unimorph2omor(fields[2])
        elif options.format == '3':
            # task 3: only the source form is given
            srchyps = omorfi.analyse(fields[0])
            for srchyp in srchyps:
                if len(srchyp.get_lemmas()) == 1:
                    lemma = srchyp.get_lemmas()[0]
            if not lemma:
                lemma = ''.join(srchyps[0].get_lemmas())
            omors = unimorph2omor(fields[1])
        else:
            print("format fail", options.format)
            exit(1)
        genomor = '[WORD_ID=' + lemma + ']' + omors
        print(">>> ", genomor)
        generations = omorfi.generate(genomor)
        if not generations or '[' in generations:
            # leftover brackets mean the generator failed to realise the tags
            oov += 1
            genat1 = lemma
            print("OOV", genat1)
        else:
            genat1 = generations.split('/')[0]
            print("@1 ", genat1)
        if options.format == '1':
            if genat1 == fields[2]:
                correct += 1
            else:
                print("MIS", genat1, "!=", fields[2])
                incorrect += 1
        elif options.format == '2':
            if genat1 == fields[3]:
                correct += 1
            else:
                # BUGFIX: diagnostic previously printed fields[2] although
                # the comparison is against fields[3]
                print("MIS", genat1, "!=", fields[3])
                incorrect += 1
        elif options.format == '3':
            if genat1 == fields[2]:
                correct += 1
            else:
                print("MIS", genat1, "!=", fields[2])
                incorrect += 1
        lines += 1
        if options.verbose and lines % 1000 == 0:
            print(lines, '...')
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    # NOTE(review): "Corect" is misspelt in the output header; left as-is in
    # case downstream scripts grep for it
    print("Lines", "Corect", "OOV", sep="\t", file=options.statfile)
    print(lines, correct, oov, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          correct / lines * 100 if lines != 0 else 0,
          oov / lines * 100, sep="\t", file=options.statfile)
    exit(0)
def main():
    """Tokenise text with an omorfi model from the command line.

    Reads plain text lines from --input and writes the tokenisation in the
    chosen --output-format (moses, conllu, json or ftb3) to --output.
    Line/token statistics go to --statistics (stderr by default).
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load tokeniser model from (analyser) AFILE",
                   required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses", help="format output for OUTFORMAT",
                   choices=['moses', 'conllu', 'json', 'ftb3'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading language model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed for tokenisation", file=stderr)
        exit(1)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# new doc id=", options.infile.name, file=options.outfile)
    for line in options.infile:
        # (dropped dead "line = line" no-op)
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf.surf for surf in surfs]),
                  file=options.outfile)
        elif options.output_format == 'json':
            # NOTE(review): stdlib json has no encode(); presumably this
            # should be json.dumps — confirm which json module is imported.
            # Also note this bypasses options.outfile and prints to stdout.
            print(json.encode(surfs))
        elif options.output_format == 'conllu':
            print("# sent_id =", lines, file=options.outfile)
            print("# text =", line.rstrip("\n"), file=options.outfile)
            # BUGFIX: tokens are the same objects as in the moses branch
            # (attribute access surf.surf), not dicts
            i = 1
            for surf in surfs:
                print(i, surf.surf, "_", "_", "_", "_", "_", "_", "_", "_",
                      sep="\t", file=options.outfile)
                i += 1
        elif options.output_format == 'ftb3':
            print("<s><loc file=\"", options.infile.name, "\" line=\"",
                  lines, "\" />", file=options.outfile, sep="")
            i = 1
            for surf in surfs:
                print(i, surf.surf, "_", "_", "_", "_", "_", "_", "_", "_",
                      sep="\t", file=options.outfile)
                i += 1
            print("</s>", file=options.outfile)
        if options.output_format == 'conllu':
            # blank line terminates each CoNLL-U sentence
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    # BUGFIX: guard the ratio against division by zero on empty input
    print("Lines:", lines, "Tokens:", tokens, "Ratio:",
          tokens / lines if lines != 0 else 0,
          "tokens/line", file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)