Example #1
 def _analyse(this, token, automaton):
     res = libhfst.detokenize_paths(this.analysers[automaton].lookup_fd(token))
     if len(token) > 2 and token[0].islower() and not token[1:].islower() and this.can_titlecase:
         tcres = libhfst.detokenize_paths(this.analysers[automaton].lookup_fd(token[0].upper() + token[1:].lower()))
         for r in tcres:
             r.output = r.output + convert_omor_tag('[CASECHANGE=TITLECASED]',
                     automaton)
         res = res + tcres
     if not token.isupper() and this.can_uppercase:
         upres = libhfst.detokenize_paths(this.analysers[automaton].lookup_fd(token.upper()))
         for r in upres:
             r.output = r.output + convert_omor_tag('[CASECHANGE=UPPERCASED]',
                     automaton)
         res = res + upres
     if not token.islower() and this.can_lowercase:
         lowres = libhfst.detokenize_paths(this.analysers[automaton].lookup_fd(token.lower()))
         for r in lowres:
             r.output = r.output + convert_omor_tag('[CASECHANGE=LOWERCASED]',
                     automaton)
         res += lowres
     for r in res:
         r.output = r.output + convert_omor_tag('[WEIGHT=%f]' %(r.weight),
                 automaton)
     return res
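
Stripped of its class context, the lookup-and-retag pattern the method relies on can be sketched standalone. This is only an illustration, not code from the source module: the transducer file name is a placeholder, and only calls already used in the examples on this page appear in it.

import libhfst

# Load one analyser transducer; 'analyser.hfst' is a placeholder name.
stream = libhfst.HfstInputStream('analyser.hfst')
analyser = libhfst.HfstTransducer(stream)

token = 'KISSA'
# Look the token up as written, then also try the lowercased variant and
# mark those extra analyses so later stages can see the case change.
results = libhfst.detokenize_paths(analyser.lookup_fd(token))
if not token.islower():
    lowres = libhfst.detokenize_paths(analyser.lookup_fd(token.lower()))
    for r in lowres:
        r.output = r.output + '[CASECHANGE=LOWERCASED]'
    results += lowres
for r in results:
    print(r.input, r.output, r.weight)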
Example #2
        out.redirect(tr)
        assert(False)
    except: # libhfst.StreamIsClosedException:
        assert(libhfst.hfst_get_exception() == "StreamIsClosedException")
        pass


# Transducer is cyclic. 
# ---------------------
print("TransducerIsCyclicException")

for type in types:
    transducer = libhfst.HfstTransducer("a", "b", type)
    transducer.repeat_star()
    try:
        results = libhfst.detokenize_paths(libhfst.extract_paths(transducer))
        print("The transducer has {0} paths".format(len(results)))
        assert(False)
    except: # libhfst.TransducerIsCyclicException:
        print("The transducer is cyclic and has an infinite number of paths.")


# The stream does not contain transducers. 
# ----------------------------------------
print("NotTransducerStreamException")

foofile = open('foofile', 'wb')
foofile.write('This is a text file.\n'.encode('ascii'))
foofile.write('Here is another line.\n'.encode('ascii'))
foofile.write('The file ends here.'.encode('ascii'))
foofile.close()
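
The snippet stops right after writing the text file. Judging from the earlier checks (and from the try: that Example #4 breaks off on), the test presumably goes on to open the file as a transducer stream and expects the matching exception. A hedged sketch of that continuation, reusing only calls already shown above:

try:
    # 'foofile' holds plain text, not transducers, so constructing an
    # input stream over it is expected to fail.
    instream = libhfst.HfstInputStream('foofile')
    assert(False)
except: # libhfst.NotTransducerStreamException:
    assert(libhfst.hfst_get_exception() == "NotTransducerStreamException")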
Example #3
import libhfst
import io

tr = libhfst.HfstTransducer('a', 'b', libhfst.TROPICAL_OPENFST_TYPE)
paths = libhfst.extract_paths(tr)
for path in libhfst.detokenize_paths(paths):
    print("{0}:{1}  {2}".format(path.input, path.output, path.weight))

tr = libhfst.HfstTransducer('a', 'b', libhfst.TROPICAL_OPENFST_TYPE)
tr.convert(libhfst.HFST_OLW_TYPE)
for path in libhfst.detokenize_paths(tr.lookup("a")):
    print("{0}  {1}".format(path.output, path.weight))

Example #4
        out.close()
        out.redirect(tr)
        assert (False)
    except:  # libhfst.StreamIsClosedException:
        assert (libhfst.hfst_get_exception() == "StreamIsClosedException")
        pass

# Transducer is cyclic.
# ---------------------
print("TransducerIsCyclicException")

for type in types:
    transducer = libhfst.HfstTransducer("a", "b", type)
    transducer.repeat_star()
    try:
        results = libhfst.detokenize_paths(libhfst.extract_paths(transducer))
        print("The transducer has {0} paths".format(len(results)))
        assert (False)
    except:  # libhfst.TransducerIsCyclicException:
        print("The transducer is cyclic and has an infinite number of paths.")

# The stream does not contain transducers.
# ----------------------------------------
print("NotTransducerStreamException")

foofile = open('foofile', 'wb')
foofile.write('This is a text file.\n'.encode('ascii'))
foofile.write('Here is another line.\n'.encode('ascii'))
foofile.write('The file ends here.'.encode('ascii'))
foofile.close()
try:
Example #5
from argparse import ArgumentParser, FileType
from sys import stdin, stdout, stderr

import libhfst


def main():
    a = ArgumentParser(
            description="Tokeniser for plain text data using HFST automata. "
            "Takes a text stream input and outputs TSV token stream where "
            "one line is one token. Tokens should include white-space tokens,"
            "but this decision is solely up to output of used automata. "
            "Some automata may be able to parse non-plain marked up text.",
            epilog="If INFILE or OFILE is omitted, standard streams will be "
            "used.\n"
            "If DISAMB is omitted, greedy LRLM will be used.")
    a.add_argument('inputs', metavar='INFILE', type=open,
            nargs='*', help="Files to process with corpus tool")
    a.add_argument('--output', '-o', metavar='OFILE',
            type=FileType('w'), help="store result in OFILE")
    a.add_argument('--tokeniser', '-t', action='append', metavar='TFILE',
            help="Pre-process input stream with automata from TFILE")
    a.add_argument('--disambiguation', '-d', metavar='DISAMB', 
            choices=['LRLM'], default='LRLM',
            help="use DISAMB tactic to select from multiple paths")
    a.add_argument("--verbose", '-v', action='store_true',
            help="print verbosely while processing")
    opts = a.parse_args()
    tokenisers = list()
    if not opts.output:
        if opts.verbose:
            print("printing output to stdout, disabling verbose", stderr)
            opts.verbose = False
        opts.output = stdout
    if not opts.tokeniser:
        if opts.verbose:
            print("Using Unicode tokeniser with character classes")
        tokeniserstream = libhfst.HfstInputStream("tokeniser-unicode.openfst.hfst")
        t = libhfst.HfstTransducer(tokeniserstream)
        tokenisers.append(t)
    else:
        for tokeniserfile in opts.tokeniser:
            if opts.verbose:
                print("Reading from", tokeniserfile)
            tokeniserstream = libhfst.HfstInputStream(tokeniserfile)
            t = libhfst.HfstTransducer(tokeniserstream)
            if opts.verbose:
                print("Read tokeniser", t.get_property('name'))
            tokenisers.append(t)
    if len(opts.inputs) < 1:
        if opts.verbose:
            print("Reading corpus data from <stdin>")
        opts.inputs = [stdin]
    if opts.verbose:
        print("Creating UTF-8 character tokeniser for HFST")
    hfst_tokeniser = libhfst.HfstTokenizer()
    for inputfile in opts.inputs:
        print("# hfst-tokenise.py TSV token stream 1", file=opts.output)
        print("# From input file", inputfile, file=opts.output)
        print("# Next line is a header line", file=opts.output)
        print("Token", file=opts.output)
        for line in inputfile:
            line = line.strip('\n')
            if not line or line == '':
                print('\\n', file=opts.output)
                continue
            could_tokenise = False
            for tokeniser in tokenisers:
                if tokeniser.get_type() == libhfst.TROPICAL_OPENFST_TYPE:
                    pathmaton = libhfst.HfstTransducer(line, hfst_tokeniser,
                            libhfst.TROPICAL_OPENFST_TYPE)
                    tokenisation = libhfst.extract_paths_fd(pathmaton.compose(tokeniser))
                    paths = libhfst.detokenize_paths(tokenisation)
                    tokens = None
                    if opts.disambiguation == 'LRLM':
                        tokens = take_greedy_lrlm_tokens(paths)
                    else:
                        print("What is this DISAMB?", opts.disambiguation,
                                file=stderr)
                    if tokens:
                        for token in tokens:
                            print(token.replace('@_EPSILON_SYMBOL_@', ''),
                                    file=opts.output)
                        could_tokenise = True
                        break
                    else:
                        if opts.verbose:
                            print("Got no tokens with FOO using",
                                    opts.disambiguation)
                else:
                    print("Not impl !OFST", file=stderr)
                    exit(2)
            if not could_tokenise:
                for token in line.split():
                    print(token, file=opts.output)
            print("\\n", file=opts.output)
    exit(0)
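
The disambiguation step calls take_greedy_lrlm_tokens, which is not part of the snippet. Purely as an illustration of the idea (greedy left-to-right longest match over the detokenised paths), a hypothetical stand-in might look like the sketch below; it assumes each path's output encodes one token per line, and the real helper in the source may well differ:

def take_greedy_lrlm_tokens(paths):
    # Hypothetical stand-in, not the original helper: among the candidate
    # tokenisations, keep the one whose first differing token is longer
    # (left-to-right longest match). Assumes one token per line in output.
    best = None
    for path in paths:
        tokens = path.output.split('\n')
        if best is None:
            best = tokens
            continue
        for old, new in zip(best, tokens):
            if len(new) != len(old):
                if len(new) > len(old):
                    best = tokens
                break
    return best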
Example #6
from argparse import ArgumentParser, FileType
from sys import stdout, stderr

import libhfst


def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
            help="HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True,
            dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", required=True,
            type=FileType('w'),
            dest="outfile", help="result file")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
            type=FileType('w'),
            dest="statfile", help="statistics")
    options = a.parse_args()
    omorfi = libhfst.HfstTransducer(libhfst.HfstInputStream(options.fsa))
    if not options.statfile:
        options.statfile = stdout
    # basic statistics
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # known bugs by type
    deduct_forgn = 0
    deduct_advposman = 0
    deduct_oliprt = 0
    deduct_abbr_prop = 0
    # known bugs by statistic to deduct
    deduct_lemma = 0
    deduct_anal = 0
    deduct_matches = 0
    deduct_results = 0
    # for make check target
    threshold = 90
    for line in options.infile:
        conllxes = line.split('\t')
        if len(conllxes) < 10:
            if not line.startswith("<"):
                print("ERROR: Skipping line", line, file=stderr)
            continue
        lines += 1
        ftbsurf = conllxes[1]
        ftblemma = conllxes[2]
        ftbanals = conllxes[5]
        anals = libhfst.detokenize_paths(omorfi.lookup_fd(ftbsurf))
        if ftbsurf[0].isupper():
            anals += libhfst.detokenize_paths(omorfi.lookup_fd(ftbsurf[0].lower() + ftbsurf[1:]))
        if ftbsurf.isupper():
            anals += libhfst.detokenize_paths(omorfi.lookup_fd(ftbsurf.lower()))
        if ftbsurf.isupper():
            anals += libhfst.detokenize_paths(omorfi.lookup_fd(ftbsurf[0] + ftbsurf[1:].lower()))
        found_anals = False
        found_lemma = False
        print_in = True
        for anal in anals:
            if ftbanals in anal.output:
                found_anals = True
            if ftblemma in anal.output:
                found_lemma = True
        if len(anals) == 0:
            print_in = False
            no_results += 1
            if 'Forgn' in ftbanals:
                deduct_forgn += 1
                deduct_results += 1
                print_in = False
            else:
                print("NORESULTS:", ftbsurf, ftblemma, ftbanals, sep="\t",
                    file=options.outfile)
        elif not found_anals and not found_lemma:
            no_matches += 1
            if 'Adv Pos Man' in ftbanals:
                deduct_advposman += 1
                deduct_matches += 1
                print_in = False
            else:
                print("NOMATCH:", ftbsurf, ftblemma, ftbanals, sep="\t", end="\t",
                    file=options.outfile)
        elif not found_anals:
            lemma_matches += 1
            if 'Adv Pos Man' in ftbanals:
                deduct_advposman += 1
                deduct_lemma += 1
                print_in = False
            elif 'V Prt Act' in ftbanals and ftbsurf.startswith('oli'):
                deduct_oliprt += 1
                deduct_lemma += 1
                print_in = False
            elif 'Forgn' in ftbanals:
                deduct_forgn += 1
                deduct_lemma += 1
                print_in = False
            elif 'Abbr' in ftbanals:
                propfail = False
                for anal in anals:
                    if 'Abbr Prop' in anal.output:
                        propfail = True
                if propfail:
                    deduct_abbr_prop += 1
                    deduct_lemma += 1
                    print_in = False
                else:
                    print("NOANALMATCH:", ftbsurf, ftbanals, sep="\t", end="\t",
                        file=options.outfile)
            else:
                print("NOANALMATCH:", ftbsurf, ftbanals, sep="\t", end="\t",
                    file=options.outfile)
        elif not found_lemma:
            anal_matches += 1
            print("NOLEMMAMATCH:", ftbsurf, ftblemma, sep="\t", end="\t",
                    file=options.outfile)
        else:
            full_matches += 1
            print_in = False
        if print_in:
            print(":IN:", end="\t", file=options.outfile)
            for anal in anals:
                print(anal.output, end='\t', file=options.outfile)
            print(file=options.outfile)
    print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results", sep="\t",
            file=options.statfile)
    print(lines, full_matches, lemma_matches, anal_matches, no_matches,
            no_results,
            sep="\t", file=options.statfile)
    print(lines / lines * 100, full_matches / lines * 100,
            lemma_matches / lines * 100, anal_matches / lines * 100,
            no_matches / lines * 100, no_results / lines * 100,
            sep="\t", file=options.statfile)
    print("Deducting known bugs...\n",
            "Forgn:", deduct_forgn,
            "\nAdv Pos Man:", deduct_advposman,
            "\noli V Prt Act:", deduct_oliprt,
            "\nAbbr Prop:", deduct_abbr_prop,
            file=options.statfile)
    lines = lines - deduct_forgn - deduct_advposman - deduct_oliprt - deduct_abbr_prop
    no_results -= deduct_results
    no_matches -= deduct_matches
    lemma_matches -= deduct_lemma
    print(lines, full_matches, lemma_matches, anal_matches, no_matches,
            no_results,
            sep="\t", file=options.statfile)
    print(lines / lines * 100, full_matches / lines * 100,
            lemma_matches / lines * 100, anal_matches / lines * 100,
            no_matches / lines * 100, no_results / lines * 100,
            sep="\t", file=options.statfile)
    if (full_matches / lines * 100 < threshold):
        print("needs to have", threshold, "% matches to pass regress test\n",
                "please examine", options.outfile.name, "for regressions",
                file=stderr)
        exit(1)
    else:
        exit(0)
Example #7
    hippopotamus2.set_final_weights(1.4)
    animals.disjunct(hippopotamus1)
    animals.disjunct(hippopotamus2)
    animals.minimize()

    results = libhfst.extract_paths(animals, 5, 0)
    #print results

    # Convert into optimized lookup format
    animals_ol = libhfst.HfstTransducer(animals)
    if type == libhfst.TROPICAL_OPENFST_TYPE:
        animals_ol.convert(libhfst.HFST_OLW_TYPE)
    else:
        animals_ol.convert(libhfst.HFST_OL_TYPE)
    
    result = libhfst.detokenize_paths(animals_ol.lookup("hippopotamus"))
    #for res in result:
    #    print res[0]
    #    print res[1]

    if type == libhfst.TROPICAL_OPENFST_TYPE:
        best_animals = libhfst.HfstTransducer(animals)
        best_animals.n_best(3)
        best_animals.convert(libhfst.HFST_OLW_TYPE)
        assert(libhfst.detokenize_paths(best_animals.lookup("mouse"))[0].output == "mice")
        assert(libhfst.detokenize_paths(best_animals.lookup("hippopotamus"))[0].output == "hippopotami")
        assert(libhfst.detokenize_paths(best_animals.lookup("hippopotamus"))[1].output == "hippopotamuses")
    
    
    print("Function insert_freely")