Example 1
 def lookup(self):
     # Read the analysis transducer and collect its results for each test.
     analyser = libhfst.HfstInputStream(self.morph_path).read()
     for section in self.sections:
         for test in section.tests:
             for result in analyser.lookup(test.right):
                 test.ana_result.append(result[0])

     # Read the generation transducer and collect its results for each test.
     generator = libhfst.HfstInputStream(self.gen_path).read()
     for section in self.sections:
         for test in section.tests:
             for result in generator.lookup(test.left):
                 test.gen_result.append(result[0])
Example 2
def get_transducer(fsa):
    """Read all transducers from an HFST binary stream and return the first."""
    istr = libhfst.HfstInputStream(fsa)
    transducers = []
    while not istr.is_eof():
        transducers.append(istr.read())
    istr.close()
    return transducers[0]
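This helper reads the whole stream but keeps only the first transducer. A minimal usage sketch; the file name is a placeholder:

tr = get_transducer('analyser.hfst')  # placeholder path
print(tr.lookup('example'))  # look up a string in that transducer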
Example 3
def getTransducer(fsa):
    """Return the first transducer read from a finite-state automaton file."""
    istr = libhfst.HfstInputStream(fsa)
    transducers = []
    while not istr.is_eof():
        transducers.append(istr.read())
    istr.close()
    return transducers[0]
Example 4
    def load_hfst(self, f):
        """Load an automaton from a file.

        @param f: path to a file containing a single hfst automaton binary.
        """
        try:
            his = libhfst.HfstInputStream(f)
            return his.read()
        except libhfst.NotTransducerStreamException:
            raise IOError('not an HFST transducer stream: %s' % f)
Example 5
def generate(parse_str):
    '''
    Generate a word form from parser output (lemma + tags).
    Falls back to the bare lemma if generation yields no result.
    '''
    global generator
    s = parse_str.replace('#', '+Use/NoHyphens#')
    if not generator:
        # Lazily load the generator transducer on first use.
        generator = libhfst.HfstInputStream(generator_file).read()
    g = generator.lookup(s, output='tuple')
    try:
        return g[0][0]
    except IndexError:
        return s.split('+')[0]
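A minimal usage sketch for generate(); the tag string below is hypothetical, since the real tagset depends on generator_file:

# Hypothetical analysis string; the actual tags depend on generator_file.
surface = generate('talo+N+Sg+Gen')
print(surface)  # the generated form, or the bare lemma 'talo' on failure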
Example 6
def load_analyser(filename):
    """Load an automaton from a file.

    Args:
        filename: path to a file containing a single hfst automaton binary.

    Throws:
        FileNotFoundError if the file is not found or does not contain a
        transducer
    """
    try:
        his = libhfst.HfstInputStream(filename)
        return his.read()
    except libhfst.NotTransducerStreamException:
        # errno 2 (ENOENT) makes this OSError a FileNotFoundError in Python 3.
        raise IOError(2, filename) from None
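A minimal usage sketch, with a placeholder path:

# 'analyser.hfst' is a placeholder path for this sketch.
try:
    analyser = load_analyser('analyser.hfst')
except FileNotFoundError:
    print('no usable analyser found')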
Example 7
def loadTransducer():
    try:
        istr = libhfst.HfstInputStream(r"italian_verb_analyzer.hfst")
        transducers = []
        while not istr.is_eof():
            transducers.append(istr.read())
        istr.close()
        # The file is expected to hold several transducers; use the second.
        td = libhfst.HfstBasicTransducer(transducers[1])
        return td
    except Exception:
        print('Transducer file was invalid or not found.')
        time.sleep(3)
        exit()
Example 8
            freq_rel_feat[feat] = {}
        freq_rel_feat[feat][rel] = int(freq)
        if ':' in rel:
            rel = rel.split(':')[0]
            freq_rel_feat[feat][rel] = int(freq)
    return freq_rel_feat


###############################################################################

if len(sys.argv) < 3:  #{
    print('conllu-morph.py <fst> <tsv> [rel-feat-tsv]')
    sys.exit(-1)
#}

istr = libhfst.HfstInputStream(sys.argv[1])
morf = istr.read()
#morf.remove_epsilons();

af = open(sys.argv[2])
apertium_symbs = read_rules(af)

freq_rel_feat = {}
if len(sys.argv) == 4:
    freq_rel_feat = read_rel_feat(sys.argv[3])

#print(freq_rel_feat, file=sys.stderr)

#print(apertium_symbs);

unknown = 0
Example 9
    results = transducer.extract_paths(output='text')
    print("The transducer has %i paths:" % len(results))
    print(results)
except libhfst.TransducerIsCyclicException:
    print(
        "The transducer is cyclic and has an infinite number of paths. Some of them:"
    )
    results = transducer.extract_paths(output='text', max_cycles=5)
    print(results)

# NotTransducerStreamException
f = open('foofile', 'w')
f.write('This is an ordinary text file.\n')
f.close()
try:
    instr = libhfst.HfstInputStream('foofile')
    tr = instr.read()
    print(tr)
    instr.close()
except libhfst.NotTransducerStreamException:
    print(
        "Could not print transducer: the file does not contain binary transducers."
    )

f = open('testfile1.att', 'w')
f.write('0 1 a b\n\
1 2 c\n\
2\n')
f.close()
f = libhfst.hfst_open('testfile1.att', 'r')
try:
Example 10
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

ttype = 0
if sys.argv[1] == 'sfst':
    ttype = libhfst.SFST_TYPE
elif sys.argv[1] == 'openfst':
    ttype = libhfst.TROPICAL_OPENFST_TYPE
elif sys.argv[1] == 'foma':
    ttype = libhfst.FOMA_TYPE
else:
    print("ERROR: could not parse transducer format argument.")
    sys.exit(1)
    
transducers_in_stream = int(sys.argv[2])

istr = libhfst.HfstInputStream()
ostr = libhfst.HfstOutputStream(ttype)
transducers_read = 0
transducers_written = 0

while True:
    try:
        tr = libhfst.HfstTransducer(istr)
        transducers_read += 1
        ostr.redirect(tr)
        transducers_written += 1
    except: # libhfst.EndOfStreamException:
        assert(libhfst.hfst_get_exception() == "EndOfStreamException")
        break
if transducers_read != transducers_in_stream:
    print("ERROR: wrong number of transducers read")
Example 11
 def __init__(self, lang, analyzer_file, **kwargs):
     self.language = lang
     self.analyzer = libhfst.HfstInputStream(analyzer_file).read()
Example 12
    def load_filename(self, path, **include):
        """Load omorfi automaton from filename and guess its use.

        A file name should consist of three parts separated by full stops.
        The second part must be a keyword describing the use of the
        automaton; the first part is parsed as an identifier, typically
        starting with the word omorfi, followed by any extras, such as the
        tagset for analysis or generation.

        The named arguments map automaton type names to truth values,
        selecting which types of automata are allowed to load.
        By default, the names `analyse`, `generate`, `segment` and `accept`
        are loaded. Names not included default to False. E.g.,
        `omorfi.load_filename(fn, analyse=True)`
        will only load the file named fn if it can be identified as an
        omorfi analyser. This is best used in conjunction with
        omorfi.load_from_dir.
        """
        if len(include) == 0:
            include['analyse'] = True
            include['generate'] = True
            include['segment'] = True
            include['accept'] = True
        for ttype in [
                'analyse', 'generate', 'accept', 'tokenise', 'lemmatise',
                'hyphenate', 'segment', 'labelsegment'
        ]:
            if ttype not in include:
                include[ttype] = False
        his = None
        if self._verbosity:
            print('Opening file', path)
        if access(path, F_OK):
            his = libhfst.HfstInputStream(path)
        else:
            # FIXME: should fail
            if self._verbosity:
                print('No access to ', path, file=stderr)
            pass
        parts = path[path.rfind('/') + 1:path.rfind('.')].split('.')
        if len(parts) != 2:
            if self._verbosity:
                print('not loaded', path)
        elif not parts[0].startswith('omorfi'):
            if self._verbosity:
                print('not omorfi', path)
        elif parts[1] == 'analyse' and include['analyse']:
            if self._verbosity:
                print('analyser', parts[0])
            self.analysers[parts[0]] = his.read()
        elif parts[1] == 'generate' and include['generate']:
            if self._verbosity:
                print('generator', parts[0])
            self.generators[parts[0]] = his.read()
        elif parts[1] == 'accept' and include['accept']:
            if self._verbosity:
                print('acceptor', parts[0])
            self.acceptors[parts[0]] = his.read()
        elif parts[1] == 'tokenise' and include['tokenise']:
            if self._verbosity:
                print('tokeniser', parts[0])
            self.tokenisers[parts[0]] = his.read()
        elif parts[1] == 'lemmatise' and include['lemmatise']:
            if self._verbosity:
                print('lemmatiser', parts[0])
            self.lemmatisers[parts[0]] = his.read()
        elif parts[1] == 'hyphenate' and include['hyphenate']:
            if self._verbosity:
                print('hyphenator', parts[0])
            self.hyphenators[parts[0]] = his.read()
        elif parts[1] == 'segment' and include['segment']:
            if self._verbosity:
                print('segmenter', parts[0])
            self.segmenters[parts[0]] = his.read()
        elif parts[1] == 'labelsegment' and include['labelsegment']:
            if self._verbosity:
                print('labelsegmenter', parts[0])
            self.labelsegmenters[parts[0]] = his.read()
        elif self._verbosity:
            print('skipped', parts)
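A minimal usage sketch of the naming convention described in the docstring; the path is a hypothetical example that load_filename would classify as an analyser:

# Hypothetical path: 'omorfi-ftb3' is the identifier, 'analyse' the use.
omorfi.load_filename('/usr/share/omorfi/omorfi-ftb3.analyse.hfst',
                     analyse=True)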
Example 13
        print("The transducer has {0} paths".format(len(results)))
        assert False
    except:  # libhfst.TransducerIsCyclicException:
        print("The transducer is cyclic and has an infinite number of paths.")

# The stream does not contain transducers.
# ----------------------------------------
print("NotTransducerStreamException")

foofile = open('foofile', 'wb')
foofile.write('This is a text file.\n'.encode('ascii'))
foofile.write('Here is another line.\n'.encode('ascii'))
foofile.write('The file ends here.'.encode('ascii'))
foofile.close()
try:
    instr = libhfst.HfstInputStream("foofile")
except:  # libhfst.NotTransducerStreamException:
    print("file does not contain transducers.")

# The stream is not in valid AT&T format.
# ---------------------------------------
print("NotValidAttFormatException")

testfile_att = open("testfile.att", "wb")
testfile_att.write('0 1 a b\n'.encode('ascii'))
testfile_att.write('1\n'.encode('ascii'))
testfile_att.write('c\n'.encode('ascii'))
testfile_att.close()

for type in types:
    transducers = []
Example 14
def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
            help="HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True,
            dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="outFILE", type=FileType('w'),
            required=True,
            dest="outfile", help="log file name")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
            help="Print verbosely while processing")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
            help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar='THOLD', default=99, type=int,
            help="require THOLD %% coverage or exit 1 (for testing)")
    options = a.parse_args()
    his = libhfst.HfstInputStream(options.fsa)
    omorfi = his.read()
    #libhfst.HfstTransducer(libhfst.HfstInputStream(options.fsa))
    # statistics
    tokens = 0
    uniqs = 0
    found_tokens = 0
    found_uniqs = 0
    missed_tokens = 0
    missed_uniqs = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        tokens += freq
        uniqs += 1
        if options.verbose:
            print(tokens, "(", freq, ')...', end='\r')
        anals = omorfi.lookup(surf)
        if surf[0].isupper():
            anals += omorfi.lookup(surf[0].lower() + surf[1:])
        if surf.isupper():
            anals += omorfi.lookup(surf.lower())
        if surf.isupper():
            anals += omorfi.lookup(surf[0] + surf[1:].lower())
        if len(anals) > 0:
            found_tokens += freq
            found_uniqs += 1
        else:
            missed_tokens += freq
            missed_uniqs += 1
            print(freq, surf, "? (missed)", sep="\t", file=options.outfile)
    if options.verbose:
        print()
    cpuend = process_time()
    realend = perf_counter()
    print("cpu time: ", cpuend - cpustart,
            "real time:", realend - realstart)
    print("Tokens", "Matches", "Misses", "%", sep="\t")
    print(tokens, found_tokens, missed_tokens, 
            found_tokens / tokens * 100 if tokens != 0 else 0,
            sep="\t")
    print("Uniqs", "Matches", "Misses", "%", sep="\t")
    print(uniqs, found_uniqs, missed_uniqs, 
            found_uniqs / uniqs * 100 if uniqs != 0 else 0,
            sep="\t")
    if tokens == 0 or (found_tokens / tokens * 100 < options.threshold):
        print("needs to have", options.threshold,
                "% non-unique matches to pass regress test\n",
                file=stderr)
        exit(1)
    else:
        exit(0)
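The script expects frequency-sorted input with the count first and the surface form second, separated by a space or a tab. Hypothetical input lines (the words and counts are invented):

# <freq> <word form>
# 9001	vuonna
# 1234	taloissa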
Example 15
if len(sys.argv) <= 1: #{
	print('test.py <lang code> [tsv file]');
	sys.exit(-1);
#}

lang = sys.argv[1];

testf = [];

if len(sys.argv) == 3: #{
	testf = [sys.argv[2]];
else:
	testf = glob.glob('*.tsv');
#}

istr1 = libhfst.HfstInputStream('../'+lang+'.automorf.hfst');
anal = istr1.read();
#anal.remove_epsilons();

istr2 = libhfst.HfstInputStream('../'+lang+'.autogen.hfst');
gene = istr2.read();
#gene.remove_epsilons();


print(testf);
err_g = 0;
corr_g = 0;
total_g = 0;
err_a = 0;
corr_a = 0;
total_a = 0;
Example 16
'''This is a demo python script to show how you might do lookup through
libhfst, in this case using an omorfi installation. Note that it targets
Python 2 (raw_input, print statement) and the legacy libhfst SWIG API.'''

import os, sys
from itertools import ifilterfalse as ffilter
import libhfst
datadir = "/usr/local/share/hfst/fi"
omorfipath = os.path.abspath(datadir + "/morphology.finntreebank.hfstol")

def process_result_vector(vector):
    results = []
    for entry in vector:
        if len(entry) < 2:
            continue
        weight = entry[0]
        string = ''.join(ffilter(libhfst.FdOperation.is_diacritic, entry[1]))
        results.append((string, weight))
    return results

istr = libhfst.HfstInputStream(omorfipath)
transducer = libhfst.HfstTransducer(istr)
input = raw_input()
while input:
    results = process_result_vector(libhfst.vectorize(transducer.lookup_fd(input)))
    for result in results:
        print result[0] + '\t' + str(result[1])
    try:
        input = raw_input()
    except EOFError:
        sys.exit()
Example 17
    return cf.f_back.f_lineno

for type in (libhfst.TROPICAL_OPENFST_TYPE, libhfst.FOMA_TYPE):

    print('\n--- Testing implementation type %s ---\n' % libhfst.fst_type_to_string(type))

    libhfst.set_default_fst_type(type)

    tr1 = None
    tr2 = None
    tr3 = None

    if not os.path.isfile('foobar.hfst'):
        raise RuntimeError('Missing file: foobar.hfst')

    istr = libhfst.HfstInputStream('foobar.hfst')
    numtr = 0
    try:
        tr1 = istr.read()
        numtr += 1
        tr2 = istr.read()
        numtr += 1
        tr3 = istr.read()
        numtr += 1
    except libhfst.EndOfStreamException:
        pass
    except:
        raise RuntimeError(get_linenumber())
    istr.close()

    if numtr != 2:
Example 18
    #os.remove('foo.hfst')
    pass


for ttype in (libhfst.SFST_TYPE, libhfst.TROPICAL_OPENFST_TYPE,
              libhfst.FOMA_TYPE):

    tr1 = libhfst.HfstTransducer('a', 'b', ttype)
    tr2 = libhfst.HfstTransducer('c', 'd', ttype)
    ostr = libhfst.HfstOutputStream('foo.hfst', tr1.get_type())
    ostr.redirect(tr1)
    ostr.redirect(tr2)
    ostr.close()
    att_file = libhfst.hfst_open('foo.att', 'w')

    istr = libhfst.HfstInputStream('foo.hfst')

    transducers_read = 0
    while True:
        try:
            tr = libhfst.HfstTransducer(istr)
            transducers_read += 1
            if transducers_read == 1:
                if not tr.compare(tr1):
                    print("ERROR: transducer 1 changed.")
                    remove_generated_files()
                    sys.exit(1)
            if transducers_read == 2:
                if not tr.compare(tr2):
                    print("ERROR: transducer 2 changed.")
                    remove_generated_files()
Example 19
def main():
    a = ArgumentParser()
    a.add_argument(
        '-f',
        '--fsa',
        metavar='FSAFILE',
        required=True,
        help=
        "HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   required=True,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   required=True,
                   type=FileType('w'),
                   dest="outfile",
                   help="result file")
    a.add_argument('-X',
                   '--statistics',
                   metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile",
                   help="statistics")
    a.add_argument('-v',
                   '--verbose',
                   action="store_true",
                   default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C',
                   '--no-casing',
                   action="store_true",
                   default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-a',
                   '--additional-mapping',
                   default="",
                   metavar="MAP",
                   help="Also try using MAP to match analyses and lemmas",
                   choices=["ftb3.1", ""])
    a.add_argument('-c',
                   '--count',
                   metavar="FREQ",
                   default=0,
                   help="test only word-forms with frequency higher than FREQ")
    options = a.parse_args()
    his = libhfst.HfstInputStream(options.fsa)
    omorfi = his.read()
    if not options.statfile:
        options.statfile = stdout
    # basic statistics
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # known bugs by type (FTB 3.1)
    deduct_forgn = 0
    deduct_advposman = 0
    deduct_oliprt = 0
    deduct_abbr_prop = 0
    deduct_unkwn = 0
    # known bugs by statistic to deduct (all)
    deduct_lemma = 0
    deduct_anal = 0
    deduct_matches = 0
    deduct_results = 0
    # for make check target
    threshold = 90
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 4:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        lemma = fields[2]
        analysis = fields[3]
        lines += freq
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        anals = omorfi.lookup(surf)
        if not options.no_casing:
            if surf[0].isupper():
                anals += omorfi.lookup(surf[0].lower() + surf[1:])
            if surf.isupper():
                anals += omorfi.lookup(surf.lower())
            if surf.isupper():
                anals += omorfi.lookup(surf[0] + surf[1:].lower())
        found_anals = False
        found_lemma = False
        print_in = True
        for anal in anals:
            if analysis in anal[0]:
                found_anals = True
            if lemma in anal[0]:
                found_lemma = True
            if not options.no_casing:
                if lemma.lower() in anal[0]:
                    found_lemma = True
                elif lemma.upper() in anal[0]:
                    found_lemma = True
        if len(anals) == 0:
            print_in = False
            no_results += freq
            if options.additional_mapping == "ftb3.1":
                if 'Forgn' in analysis:
                    deduct_forgn += freq
                    deduct_results += freq
                    print_in = False
                elif 'Unkwn' in analysis:
                    deduct_unkwn += freq
                    deduct_results += freq
                    print_in = False
                else:
                    print("NORESULTS:",
                          freq,
                          surf,
                          lemma,
                          anals,
                          sep="\t",
                          file=options.outfile)
                    if options.verbose:
                        print("?", end='', file=stderr)
            else:
                print("NORESULTS:",
                      freq,
                      surf,
                      lemma,
                      anals,
                      sep="\t",
                      file=options.outfile)
                if options.verbose:
                    print("?", end='', file=stderr)
        elif not found_anals and not found_lemma:
            no_matches += freq
            if options.additional_mapping == "ftb3.1":
                if 'Adv Pos Man' in analysis:
                    deduct_advposman += freq
                    deduct_matches += freq
                    print_in = False
                elif 'Unkwn' in analysis:
                    deduct_unkwn += freq
                    deduct_matches += freq
                    print_in = False
                else:
                    print("NOMATCH:",
                          freq,
                          surf,
                          lemma + " " + analysis,
                          sep="\t",
                          end="\t",
                          file=options.outfile)
                    if options.verbose:
                        print("!", end='', file=stderr)
            else:
                print("NOMATCH:",
                      freq,
                      surf,
                      lemma + " " + analysis,
                      sep="\t",
                      end="\t",
                      file=options.outfile)
                if options.verbose:
                    print("!", end='', file=stderr)
        elif not found_anals:
            lemma_matches += freq
            if options.additional_mapping == "ftb3.1":
                if 'Adv Pos Man' in analysis:
                    deduct_advposman += freq
                    deduct_lemma += freq
                    print_in = False
                elif 'V Prt Act' in analysis and surf.startswith('oli'):
                    deduct_oliprt += freq
                    deduct_lemma += freq
                    print_in = False
                elif 'Forgn' in analysis:
                    deduct_forgn += freq
                    deduct_lemma += freq
                    print_in = False
                elif 'Abbr' in analysis:
                    propfail = False
                    for anal in anals:
                        if 'Abbr Prop' in anal[0]:
                            propfail = True
                    if propfail:
                        deduct_abbr_prop += freq
                        deduct_lemma += freq
                        print_in = False
                    else:
                        print("NOANALMATCH:",
                              freq,
                              surf,
                              analysis,
                              sep="\t",
                              end="\t",
                              file=options.outfile)
                    if options.verbose:
                        print("@", end='', file=stderr)
                elif 'Unkwn' in analysis:
                    deduct_unkwn += freq
                    deduct_lemma += freq
                    print_in = False
                else:
                    if options.verbose:
                        print("@", end='', file=stderr)
                    print("NOANALMATCH:",
                          freq,
                          surf,
                          analysis,
                          sep="\t",
                          end="\t",
                          file=options.outfile)
            else:
                if options.verbose:
                    print("@", end='', file=stderr)
                print("NOANALMATCH:",
                      freq,
                      surf,
                      analysis,
                      sep="\t",
                      end="\t",
                      file=options.outfile)
        elif not found_lemma:
            anal_matches += freq
            print("NOLEMMAMATCH:",
                  freq,
                  surf,
                  lemma,
                  sep="\t",
                  end="\t",
                  file=options.outfile)
            if options.verbose:
                print("#", end='', file=stderr)
        else:
            if options.verbose:
                print(".", end='', file=stderr)
            full_matches += freq
            print_in = False
        if print_in:
            print(":IN:", end="\t", file=options.outfile)
            for anal in anals:
                print(anal[0], end='\t', file=options.outfile)
            print(file=options.outfile)
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines",
          "Matches",
          "Lemma",
          "Anals",
          "Mismatch",
          "No results",
          sep="\t",
          file=options.statfile)
    print(lines,
          full_matches,
          lemma_matches,
          anal_matches,
          no_matches,
          no_results,
          sep="\t",
          file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          full_matches / lines * 100 if lines != 0 else 0,
          lemma_matches / lines * 100 if lines != 0 else 0,
          anal_matches / lines * 100 if lines != 0 else 0,
          no_matches / lines * 100 if lines != 0 else 0,
          no_results / lines * 100 if lines != 0 else 0,
          sep="\t",
          file=options.statfile)
    if options.additional_mapping == "ftb3.1":
        print("Deducting known bugs...\n",
              "Forgn:",
              deduct_forgn,
              "\nAdv Pos Man:",
              deduct_advposman,
              "\noli V Prt Act:",
              deduct_oliprt,
              "\nAbbr Prop:",
              deduct_abbr_prop,
              "\nUnkwn:",
              deduct_unkwn,
              file=options.statfile)
        lines = lines - deduct_forgn - deduct_advposman - deduct_oliprt - deduct_abbr_prop - deduct_unkwn
        no_results -= deduct_results
        no_matches -= deduct_matches
        lemma_matches -= deduct_lemma
    if options.additional_mapping != '':
        print(lines,
              full_matches,
              lemma_matches,
              anal_matches,
              no_matches,
              no_results,
              sep="\t",
              file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t",
              file=options.statfile)
    if lines == 0 or (full_matches / lines * 100 < threshold):
        print("needs to have",
              threshold,
              "% matches to pass regress test\n",
              "please examine",
              options.outfile.name,
              "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
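Here each input line needs four fields (frequency, surface form, lemma, analysis); the first separator may be a space, the rest must be tabs. A hypothetical FTB-style line (tags invented for illustration):

# <freq> <surf>\t<lemma>\t<analysis>
# 1234 talossa	talo	N Ine Sg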
Example 20
    tot = 0

    for m, o in odistr:
        outputs[i].append(o)
        tot += m
        if m >= TH:
            break

outputs['#'].append('#')
outputs['_#_'].append('_#_')

out = libhfst.create_hfst_output_stream("",
                                        libhfst.TROPICAL_OPENFST_TYPE, 1)

ustr_model = libhfst.HfstInputStream(argv[2]).read()
str_model = libhfst.HfstInputStream(argv[3]).read()

for i, line in enumerate(imap(lambda x: x.strip(), stdin)):
    stderr.write("LINE: %u\r" % i)
    expr = ''

    if line == '':
        continue
    chars = ('_#_ _#_ # ' + line.replace('0','"0"') + ' # _#_ _#_').split(' ')
    
    for char in chars:
        expr += ('%s [%s] £ ' % (escape(char),
                                 '|'.join([escape(c) for c in outputs[char]])))
    re = libhfst.regex(expr)
    re.compose(ustr_model)
Example 21
from sys import stdin, argv

import libhfst

dictionary = libhfst.HfstInputStream(argv[1]).read()

first_candidate = None
dict_candidate = None

for line in (l.strip() for l in stdin):
    if line == '<SEP>':
        if dict_candidate is not None:
            print(dict_candidate)
        else:
            print(first_candidate)
        dict_candidate = None
        first_candidate = None
    else:
        if first_candidate is None:
            first_candidate = line
        if dict_candidate is None and len(dictionary.lookup(line)) != 0:
            dict_candidate = line

if dict_candidate is not None:
    print(dict_candidate)
elif first_candidate is not None:
    print(first_candidate)
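For each <SEP>-delimited block of candidates read from stdin, the script prints the first candidate accepted by the dictionary transducer, falling back to the first candidate of the block. A hypothetical run:

# Hypothetical stdin block:
#   catx
#   cat
#   <SEP>
# Prints 'cat' if only 'cat' is in the dictionary, otherwise 'catx'.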
Example 22
 def __init__(self, filename):
     self.istr = libhfst.HfstInputStream(filename)
     self.transducer = libhfst.HfstTransducer(self.istr)