Example #1
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    for line in options.infile:
        if not line.strip():
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            tokens += 1
            anals = omorfi.analyse(surf)
            print_analyses_vislcg3(surf, anals, options.outfile)
            if len(anals) == 0 or (len(anals) == 1 and
                                   'UNKNOWN' in anals[0][0]):
                unknowns += 1
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Unknown:", unknowns, unknowns / tokens * 100,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
Example #2
class FinnishParser:
    def __init__(self):
        self.omorfi = Omorfi()
        self.omorfi.load_from_dir()
        self.tokenizer = RegexpTokenizer(
            r'\w+\-\w+|\w+|\$[\d\.]+|\.\.\.|[",!\.\(\)]|\S+')

    @staticmethod
    def omorfi_to_base(omorfi_form):
        return re.search(r"\[WORD_ID=(.*?)\]", omorfi_form).group(1)

    @staticmethod
    def omorfi_to_grammar(omorfi_form):
        return re.sub(r"\[WORD_ID=.*?\]", "", omorfi_form)

    def tokenize(self, text):
        text = re.sub(r"\[\d+\]|\ufeff", "", text)
        return self.tokenizer.tokenize(text)

    def get_sentence_start_indexes(self, tokens):
        start_indexes = []
        sentence_ended = False
        sentence_end_regex = r"\.\.\.|[\.!\?:;]"
        for i, token in enumerate(tokens):
            if re.match(sentence_end_regex, token):
                sentence_ended = True
            else:
                if sentence_ended:
                    start_indexes.append(i)
                sentence_ended = False
        return start_indexes

    def parse(self, text):
        tokens = self.tokenize(text)
        parsed_words = [self.analyse(t) for t in tokens]
        sentence_start_indexes = self.get_sentence_start_indexes(tokens)
        return parsed_words, tokens, sentence_start_indexes

    def analyse(self, word):
        omorfi_form = self.omorfi.analyse(word)
        first_form = omorfi_form[0][0]
        return AnalysedWord(self.omorfi_to_base(first_form),
                            self.omorfi_to_grammar(first_form))

    def is_valid_word(self, word):
        return word.grammar != "[GUESS=UNKNOWN][WEIGHT=inf]"
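
A minimal usage sketch for the parser above. The sample sentence is illustrative, RegexpTokenizer is presumably NLTK's, and filtering goes through is_valid_word only, since AnalysedWord's field names are not shown in this snippet:

# hypothetical usage of FinnishParser
parser = FinnishParser()
words, tokens, sentence_starts = parser.parse("Kissa istuu puussa. Koira haukkuu.")
# keep only the words omorfi actually recognised
valid_words = [w for w in words if parser.is_valid_word(w)]
print(len(valid_words), "of", len(tokens), "tokens analysed")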
Example #3
def main():
    a = ArgumentParser()
    a.add_argument('-f',
                   '--fsa',
                   metavar='FSAFILE',
                   required=True,
                   help="HFST's optimised lookup binary data for the "
                   "transducer to be applied")
    options = a.parse_args()
    omorfi = Omorfi()
    omorfi.load_from_dir(options.fsa, analyse=True, accept=True)

    tokens = omorfi.python_tokenise(WEIRD_TOK)
    # Check tokens are in same order as text
    start = 0
    for token in tokens:
        start = WEIRD_TOK.index(token['surf'], start)
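
Note that str.index raises ValueError when a surface form cannot be found after the previous match, so the loop above doubles as an in-order containment test. An equivalent sketch that makes the failure explicit, assuming the same token dicts with a 'surf' key:

start = 0
for token in tokens:
    # find() returns -1 instead of raising, so the failure can carry a message
    pos = WEIRD_TOK.find(token['surf'], start)
    assert pos != -1, "token %r missing or out of order" % token['surf']
    start = pos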
Example #5
def get_omorfi():
    """
    Gets an Omorfi instance with everything possible enabled. Reuses the
    existing instance if already called once.
    """
    from omorfi.omorfi import Omorfi

    global _omorfi
    if _omorfi is None:
        _omorfi = Omorfi()
        for var, fn in FSTS:
            getattr(_omorfi, "load_" + var)(
                "/usr/local/share/omorfi/omorfi." + fn
            )
    return _omorfi
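
The function above depends on module-level _omorfi and FSTS bindings that are not shown. A plausible reconstruction, assuming the (attribute, filename) pairs map onto the load_analyser/load_generator style loaders and the installed omorfi.*.hfst automata seen in the other examples here:

# hypothetical module-level state for get_omorfi()
_omorfi = None

# each pair becomes _omorfi.load_<var>("/usr/local/share/omorfi/omorfi." + fn)
FSTS = [
    ("analyser", "analyse.hfst"),
    ("generator", "generate.hfst"),
]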
Example #6
    def stream(text):
        om = Omorfi()
        om.load_from_dir('/usr/local/share/omorfi/', analyse=True)
        for token in om.tokenise(text):
            yield "%s\n" % token[0]
            for analyse_res in om.analyse(token):
                # unpack into a fresh name so the 'text' argument is not shadowed
                anal, weight = analyse_res[:2]
                if len(analyse_res) > 2:
                    rest = " ".join([str(x) for x in analyse_res[2:]])
                else:
                    rest = ''

                yield "%s %s %s\n" % (anal, weight, rest)

            yield "\n"
Example #7
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle anaelyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    continue
                else:
                    print(
                        "Cannot figure out token index", fields[0], file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if anals and len(anals) > 0:
                if options.debug:
                    debug_analyses_conllu(
                        fields, index, surf, anals, options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
            if not anals or len(anals) == 0 or (len(anals) == 1 and
                                                'UNKNOWN' in anals[0][0]):
                unknowns += 1
        elif line.startswith('# doc-name:') or line.startswith('# sentence-text:'):
            # these comments I know need to be retained as-is
            print(line.strip(), file=options.outfile)
        elif line.startswith('#'):
            # unknown comment
            print(line.strip(), file=options.outfile)
            if options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n')
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format:", line, sep='\n', file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%", file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
Example #8
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from omorfi.omorfi import Omorfi

omorfi = Omorfi()
omorfi.load_analyser("/usr/local/share/omorfi/omorfi.analyse.hfst")
omorfi.load_generator("/usr/local/share/omorfi/omorfi.generate.hfst")

import settings

PROPERTIES = {
    "nominatiivi": [("CASE", "NOM")],
    "genetiivi": [("CASE", "GEN")],
    "partitiivi": [("CASE", "PAR")],
    "translatiivi": [("CASE", "TRA")],
    "essiivi": [("CASE", "ESS")],
    "inessiivi": [("CASE", "INE")],
    "elatiivi": [("CASE", "ELA")],
    "illatiivi": [("CASE", "ILL")],
    "adessiivi": [("CASE", "ADE")],
Example #9
def main():
    global sent
    a = ArgumentParser()
    a.add_argument(
        '-f',
        '--fsa',
        metavar='FSAFILE',
        required=True,
        help=
        "HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=str,
                   required=True,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-m',
                   '--master',
                   metavar="TSVFILE",
                   type=str,
                   required=True,
                   dest="tsvfile",
                   help="source of existing lexical data")
    opts = a.parse_args()
    if opts.infile:
        test_corpora_files = [opts.infile]
    else:
        test_corpora_files = glob("*.text")
    # hard-coded logs for now
    lemma_log = open('missing_word_ids.log', 'w')
    case_log = open('missing_nominal_cases.log', 'w')
    comp_log = open('missing_comparatives.log', 'w')
    adposition_log = open('adposition_complements.log', 'w')
    adposition_stats = open('adposition_complements_full.log', 'w')
    adjective_log = open('adjective_agreements.log', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    gather_lemmas(open(opts.tsvfile))
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus ", test_corpus_file, ":", ioe)
    for test_corpus in test_corpora:
        print('lines from', test_corpus.name)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 200000) == 0:
                print(
                    linen,
                    "...! Time to reload everything because memory is leaking very badly indeed!"
                )
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()

            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            for punct in "\".,:;?!()":
                line = line.replace(punct, " " + punct + " ")
            for token in line.split():
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
                stat_nominal_cases(token, analyses, case_log)
                stat_adjective_comps(token, analyses, comp_log)
    print("Testing statistics")
    test_zero_lemmas(lemma_log)
    test_zero_cases(case_log)
    test_zero_comps(comp_log)
    # test_case_deviations()
    test_adposition_complements(adposition_log)
    test_adjective_agreements(adjective_log)
    print("Writing accurate statistics")
    print_adposition_stats(adposition_stats)
    print_lemma_stats(open('lemmas.freqs', 'w'))
    print_case_stats(open('cases.freqs', 'w'))
    exit(0)
Example #10
def main():
    a = ArgumentParser()
    a.add_argument(
        '-f',
        '--fsa',
        metavar='FSAFILE',
        required=True,
        help=
        "HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   required=True,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-o',
                   '--output',
                   metavar="outFILE",
                   type=FileType('w'),
                   required=True,
                   dest="outfile",
                   help="log file name")
    a.add_argument('-v',
                   '--verbose',
                   action="store_true",
                   default=False,
                   help="Print verbosely while processing")
    a.add_argument('-c',
                   '--count',
                   metavar="FREQ",
                   default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t',
                   '--threshold',
                   metavar='THOLD',
                   default=99,
                   type=int,
                   help="require THOLD % coverage or exit 1 (for testing)")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    # statistics
    tokens = 0
    uniqs = 0
    found_tokens = 0
    found_uniqs = 0
    missed_tokens = 0
    missed_uniqs = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        tokens += freq
        uniqs += 1
        if options.verbose:
            print(tokens, "(", freq, ')...', end='\r')
        anals = omorfi.analyse(surf)
        if len(anals) > 0 and "GUESS=UNKNOWN" not in anals[0][0]:
            found_tokens += freq
            found_uniqs += 1
        else:
            missed_tokens += freq
            missed_uniqs += 1
            print(freq, surf, "? (missed)", sep="\t", file=options.outfile)
    if options.verbose:
        print()
    cpuend = process_time()
    realend = perf_counter()
    print("cpu time: ", cpuend - cpustart, "real time:", realend - realstart)
    print("Tokens", "Matches", "Misses", "%", sep="\t")
    print(tokens,
          found_tokens,
          missed_tokens,
          found_tokens / tokens * 100 if tokens != 0 else 0,
          sep="\t")
    print("Uniqs", "Matches", "Misses", "%", sep="\t")
    print(uniqs,
          found_uniqs,
          missed_uniqs,
          found_uniqs / uniqs * 100 if uniqs != 0 else 0,
          sep="\t")
    if tokens == 0 or (found_tokens / tokens * 100 < options.threshold):
        print("needs to have",
              options.threshold,
              "% non-unique matches to pass regress test\n",
              file=stderr)
        exit(1)
    else:
        exit(0)
Example #11
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load tokeniser model from (analyser) AFILE",
                   required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses",
                   help="format output for OUTFORMAT", choices=['moses',
                       'conllu', 'json', 'ftb3'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading language model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed for tokenisation", file=stderr)
        exit(1)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# new doc id=", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf['surf'] for surf in surfs]), file=options.outfile)
        elif options.output_format == 'json':
            print(json.dumps(surfs), file=options.outfile)
        elif options.output_format == 'conllu':
            print("# sent_id =", lines, file=options.outfile)
            print("# text =", line.rstrip("\n"), file=options.outfile)
            i = 1
            for surf in surfs:
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      format_misc_ud(surf),
                      sep="\t", file=options.outfile)
                i += 1
        elif options.output_format == 'ftb3':
            print("<s><loc file=\"", options.infile.name, "\" line=\"",
                    lines, "\" />", file=options.outfile, sep="")
            i = 1
            for surf in surfs:
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_", "_",
                        sep="\t", file=options.outfile)
                i += 1
            print("</s>", file=options.outfile)
        if options.output_format == 'conllu':
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("Lines:", lines, "Tokens:", tokens, "Ratio:", tokens / lines,
          "tokens/line", file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
Example #12
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   help="load analyser model from AFILE")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-F',
                   '--format',
                   metavar="INFORMAT",
                   default='text',
                   help="read input using INFORMAT tokenisation",
                   choices=['text', 'vislcg', 'conllu'])
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        if options.outfile == stdout:
            options.statfile = stdout
        else:
            options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    last = None
    for line in options.infile:
        surfs = []
        if options.format == 'vislcg':
            surfs = get_line_tokens_vislcg(line, last)
        elif options.format == 'text':
            surfs = get_line_tokens(line, omorfi)
        elif options.format == 'conllu':
            surfs = get_line_tokens_conllu(line, last)
        else:
            print("input format missing implementation",
                  options.format,
                  file=stderr)
            exit(2)
        for surf in surfs:
            if 'conllu_form' in surf:
                # skip conllu special forms in input for now:
                # (ellipsis and MWE magics)
                continue
            elif 'surf' in surf:
                tokens += 1
                anals = omorfi.analyse(surf)
                if len(anals) == 0 or (len(anals) == 1
                                       and 'UNKNOWN' in anals[0]['anal']):
                    unknowns += 1
                    anals = omorfi.guess(surf)
                print_analyses_vislcg3(surf, anals, options.outfile)
            elif 'comment' in surf:
                if surf['comment'].startswith(';') or \
                       surf['comment'].startswith('\t'):
                    continue
                else:
                    print(surf['comment'], file=options.outfile)
            elif 'error' in surf:
                print(surf['error'], file=stderr)
                exit(2)
            last = surf
    cpuend = process_time()
    realend = perf_counter()
    print("# Tokens:",
          tokens,
          "\n# Unknown:",
          unknowns,
          unknowns / tokens * 100,
          "%",
          file=options.statfile)
    print("# CPU time:",
          cpuend - cpustart,
          "\n# Real time:",
          realend - realstart,
          file=options.statfile)
    print("# Tokens per timeunit:",
          tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
Example #13
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-f',
                   '--fsa',
                   metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O',
                   '--output-format',
                   metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--no-split-words',
                   action="store_false",
                   dest="split_words",
                   default=True,
                   help="do not split on word boundaries")
    a.add_argument(
        '--no-split-new-words',
        action="store_false",
        dest="split_new_words",
        default=True,
        help="do not split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs',
                   action="store_false",
                   dest="split_morphs",
                   default=True,
                   help="do not split on morph boundaries")
    a.add_argument('--split-derivs',
                   action="store_true",
                   default=False,
                   help="split on derivation boundaries")
    a.add_argument('--no-split-nonwords',
                   action="store_false",
                   dest="split_nonwords",
                   default=True,
                   help="do not split on other boundaries")
    a.add_argument('--segment-marker',
                   default=' ',
                   metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
        print("writing to", outfile.name)

    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            segments = omorfi.segment(surf)
            labelsegments = omorfi.labelsegment(surf)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, surf,
                                            outfile)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, surf, outfile, options)
        print(file=outfile)
    exit(0)
Example #14
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   help="read analyser model from AFILE",
                   required=True)
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    a.add_argument('-O',
                   '--oracle',
                   action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X',
                   '--frequencies',
                   metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug',
                   action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    lexprobs = None
    tagprobs = None

    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)

    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0], file=stderr)
                exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or len(anals) == 0 or (len(anals) == 1
                                                and 'OOV' in anals[0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals and len(anals) > 0:
                if options.oracle:
                    try_analyses_ftb(fields, index, surf, anals,
                                     options.outfile)
                else:
                    print_analyses_ftb(index, surf, anals[0], options.outfile)
            else:
                print("Failed:", fields)
                exit(1)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            print(line.strip(), file=options.outfile)
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:",
          unknowns,
          "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%",
          file=options.statfile)
    print("CPU time:",
          cpuend - cpustart,
          "Real time:",
          realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:",
          tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
Example #16
 def stream(text):
     om = Omorfi()
     om.load_from_dir('/usr/local/share/omorfi/', lemmatise=True)
     for token in om.tokenise(text):
         yield " ".join(map(lambda x: str(x), om.lemmatise(token[0])))
Example #17
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses",
                   help="format output for OUTFORMAT", choices=['moses', 'conllu'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# doc-name:", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf[0] for surf in surfs]), file=options.outfile)
        else:
            print("# sentence-text:", line.rstrip("\n"), file=options.outfile)
            i = 1
            for surf in surfs:
                print(i, surf[0], "_", "_", "_", "_", "_", "_", "_",
                      surf[1],
                      sep="\t", file=options.outfile)
                i += 1
        if options.output_format == 'conllu':
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    print("Lines:", lines, "Tokens:", tokens, "Ratio:", tokens / lines,
          "tokens/line", file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
Example #18
def main():
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="outFILE", type=FileType('w'),
                   required=True,
                   dest="outfile", help="log file name")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar='THOLD', default=99, type=int,
                   help="require THOLD % coverage or exit 1 (for testing)")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    # statistics
    tokens = 0
    uniqs = 0
    found_tokens = 0
    found_uniqs = 0
    missed_tokens = 0
    missed_uniqs = 0
    # for make check target
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        tokens += freq
        uniqs += 1
        if options.verbose:
            print(tokens, "(", freq, ')...', end='\r')
        anals = omorfi.analyse(surf)
        if len(anals) > 0 and "GUESS=UNKNOWN" not in anals[0][0]:
            found_tokens += freq
            found_uniqs += 1
        else:
            missed_tokens += freq
            missed_uniqs += 1
            print(freq, surf, "? (missed)", sep="\t", file=options.outfile)
    if options.verbose:
        print()
    cpuend = process_time()
    realend = perf_counter()
    print("cpu time: ", cpuend - cpustart,
          "real time:", realend - realstart)
    print("Tokens", "Matches", "Misses", "%", sep="\t")
    print(tokens, found_tokens, missed_tokens,
          found_tokens / tokens * 100 if tokens != 0 else 0,
          sep="\t")
    print("Uniqs", "Matches", "Misses", "%", sep="\t")
    print(uniqs, found_uniqs, missed_uniqs,
          found_uniqs / uniqs * 100 if uniqs != 0 else 0,
          sep="\t")
    if tokens == 0 or (found_tokens / tokens * 100 < options.threshold):
        print("needs to have", options.threshold,
              "% non-unique matches to pass regress test\n",
              file=stderr)
        exit(1)
    else:
        exit(0)
Example #19
def main():
    global sent
    a = ArgumentParser()
    a.add_argument(
        '-f', '--fsa', metavar='FSAFILE', required=True,
        help="HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument(
        '-i', '--input', metavar="INFILE", type=str, required=True,
        dest="infile", help="source of analysis data")
    a.add_argument(
        '-m', '--master', metavar="TSVFILE", type=str, required=True,
        dest="tsvfile", help="source of existing lexical data")
    opts = a.parse_args()
    if opts.infile:
        test_corpora_files = [opts.infile]
    else:
        test_corpora_files = glob("*.text")
    # hard-coded logs for now
    lemma_log = open('missing_word_ids.log', 'w')
    case_log = open('missing_nominal_cases.log', 'w')
    comp_log = open('missing_comparatives.log', 'w')
    adposition_log = open('adposition_complements.log', 'w')
    adposition_stats = open('adposition_complements_full.log', 'w')
    adjective_log = open('adjective_agreements.log', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    gather_lemmas(open(opts.tsvfile))
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus ", test_corpus_file, ":", ioe)
    for test_corpus in test_corpora:
        print('lines from', test_corpus.name)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 200000) == 0:
                print(
                    linen, "...! Time to reload everything because memory is leaking very badly indeed!")
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()

            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            for punct in "\".,:;?!()":
                line = line.replace(punct, " " + punct + " ")
            for token in line.split():
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
                stat_nominal_cases(token, analyses, case_log)
                stat_adjective_comps(token, analyses, comp_log)
    print("Testing statistics")
    test_zero_lemmas(lemma_log)
    test_zero_cases(case_log)
    test_zero_comps(comp_log)
    # test_case_deviations()
    test_adposition_complements(adposition_log)
    test_adjective_agreements(adjective_log)
    print("Writing accurate statistics")
    print_adposition_stats(adposition_stats)
    print_lemma_stats(open('lemmas.freqs', 'w'))
    print_case_stats(open('cases.freqs', 'w'))
    exit(0)
Example #20
def main():
    """Preprocess text for moses factored modeling."""
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE", required=True)
    a.add_argument('-s', '--segmenter', metavar='SFILE',
                   help="load segmenter model from SFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        print("writign to", outfile.name)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            continue
        tokens = omorfi.tokenise_sentence(line)
        for token in tokens:
            if not token.surf:
                continue
            anals = omorfi.analyse(token)
            pos = "X"
            mrds = ["?"]
            lemmas = [token.surf]
            if anals:
                anal = token.get_best()
                pos = anal.get_upos()
                mrds = anal.get_ufeats()
                lemmas = anal.get_lemmas()
            segments = omorfi.segment(token)
            morphs = "0"
            if segments:
                segment = token.get_best_segments()
                if segment:
                    parts = segment.get_segments()
                    morphs = ".".join(parts)
                else:
                    morphs = token.surf
            print(token.surf, '+'.join(lemmas), pos, '.'.join(mrds),
                  morphs, sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
Example #21
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="split on word boundaries")
    a.add_argument('--no-split-new-words', action="store_false", default=True,
                   dest="split_new_words",
                   help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True,
                             labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writign to", options.output)

    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token[0])
            labelsegments = omorfi.labelsegment(token[0])
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(
                    segments, labelsegments, token[0], outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token[0], outfile,
                               options)
        print(file=outfile)
    exit(0)
Example #22
def main():
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='FSAFILE',
                   required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   type=FileType('w'),
                   dest="outfile",
                   help="log outputs to OUTFILE")
    a.add_argument('-X',
                   '--statistics',
                   metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile",
                   help="statistics")
    a.add_argument('-v',
                   '--verbose',
                   action="store_true",
                   default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C',
                   '--no-casing',
                   action="store_true",
                   default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f',
                   '--format',
                   metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"],
                   default="coverage")
    a.add_argument('-c',
                   '--count',
                   metavar="FREQ",
                   default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t',
                   '--threshold',
                   metavar="THOLD",
                   default=99,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        if freq < int(options.count):
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if options.format != 'coverage':
            lemma = fields[2]
            analysis = fields[3]
        lines += freq
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        anals = omorfi.analyse(surf)
        if not is_tokenlist_oov(anals):
            covered += freq
        else:
            no_results += freq
            print("OOV", surf, sep='\t', file=options.outfile)
        found_anals = False
        found_lemma = False
        for anal in anals:
            if options.format == 'ftb3.1':
                anal_ftb3 = format_feats_ftb(anal)
                lemma_ftb3 = '#'.join(get_lemmas(anal))
                # hacks ftb3:
                analysis = analysis.replace(" >>>", "")
                if analysis == anal_ftb3:
                    found_anals = True
                    print("ANALHIT", analysis, anal_ftb3, file=options.outfile)
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    found_anals = True
                    print("PERMUTAHIT",
                          analysis,
                          anal_ftb3,
                          file=options.outfile)
                else:
                    print("ANALMISS",
                          analysis,
                          anal_ftb3,
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    found_lemma = True
                    print("LEMMAHIT", lemma, lemma_ftb3, file=options.outfile)
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    found_lemma = True
                    print("LEMMARECOMP",
                          lemma,
                          lemma_ftb3,
                          file=options.outfile)
                else:
                    print("LEMMAMISS", lemma, lemma_ftb3, file=options.outfile)
        if options.format != 'coverage':
            if not found_anals and not found_lemma:
                no_matches += freq
                print("NOHITS!", surf, sep='\t', file=options.outfile)
            elif found_anals and found_lemma:
                print("HIT", surf, sep='\t', file=options.outfile)
                full_matches += freq
            elif not found_anals:
                # lemma matched but analysis did not
                lemma_matches += freq
                print("LEMMANOANAL", surf, sep='\t', file=options.outfile)
            elif not found_lemma:
                # analysis matched but lemma did not
                anal_matches += freq
                print("ANALNOLEMMA", surf, sep='\t', file=options.outfile)
            else:
                print("Logical error, kill everyone")
                exit(13)
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t",
          file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines",
              "Matches",
              "Lemma",
              "Anals",
              "Mismatch",
              "No results",
              sep="\t",
              file=options.statfile)
        print(lines,
              full_matches,
              lemma_matches,
              anal_matches,
              no_matches,
              no_results,
              sep="\t",
              file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t",
              file=options.statfile)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= int(options.threshold)):
        print("needs to have",
              threshold,
              "% matches to pass regress test\n",
              "please examine",
              options.outfile.name,
              "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= int(options.threshold)):
        print("needs to have",
              threshold,
              "% coverage to pass regress test\n",
              "please examine",
              options.outfile.name,
              "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)
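
A minimal sketch of the frequency-weighted coverage bookkeeping used above, with a stand-in OOV predicate in place of the omorfi analyser (the predicate and the sample data are illustrative only; the input format and percentage maths mirror the script):

def coverage_sketch(lines_in, is_oov):
    """Compute frequency-weighted coverage over FREQ<TAB>SURFACE lines."""
    total = covered = 0
    for line in lines_in:
        freq, surf = line.strip().split('\t', 1)
        total += int(freq)
        if not is_oov(surf):
            covered += int(freq)
    return covered / total * 100 if total else 0

# coverage_sketch(["9000\tja", "1\tfoobarbaz"], lambda s: s == "foobarbaz")
# -> 99.98889..., which would pass the default threshold of 99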
Example no. 23
def main():
    """Segment text in some formats."""
    a = ArgumentParser()
    a.add_argument('-s',
                   '--segmenter',
                   metavar='SFILE',
                   help="load segmenter from SFILE",
                   required=True)
    a.add_argument('-S',
                   '--labeller',
                   metavar='LSFILE',
                   help="load labelsegmenter from LSFILE",
                   required=True)
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O',
                   '--output-format',
                   metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   required=True,
                   choices=["moses-factors", "segments"])
    a.add_argument('--no-split-words',
                   action="store_false",
                   default=True,
                   dest="split_words",
                   help="split on word boundaries")
    a.add_argument(
        '--no-split-new-words',
        action="store_false",
        default=True,
        dest="split_new_words",
        help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs',
                   action="store_false",
                   default=True,
                   dest="split_morphs",
                   help="split on morph boundaries")
    a.add_argument('--split-derivs',
                   action="store_true",
                   default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords',
                   action="store_true",
                   default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker',
                   default=None,
                   metavar='SEG',
                   help="mark segment boundaries with SEG (default: '→ ←')")
    a.add_argument('--show-ambiguous',
                   default=False,
                   metavar='ASEP',
                   help="separate ambiguous segmentations with ASEP")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("segmenter is needed for segmenting", file=stderr)
        exit(2)
    if options.labeller:
        if options.verbose:
            print("Reading labelsegmenter", options.labeller)
        omorfi.load_labelsegmenter(options.labeller)
    if not omorfi.can_segment:
        print("Could not load segmenter(s); re-compile them or pass other",
              "model files with -s / -S", file=stderr)
        print()
        print("To compile the segmenters, use --enable-segmenter and/or",
              "--enable-labeled-segments")
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        print("writign to", options.output)

    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token)
            labelsegments = omorfi.labelsegment(token)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, token,
                                            outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token, outfile,
                               options)
        print(file=outfile)
    exit(0)
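
For reference, the segment marker simply joins the retained pieces at each boundary; a hand-written illustration (the segmentation of "talossani" is constructed for the example, not actual omorfi output):

segments = ["talo", "ssa", "ni"]   # talo+ssa+ni, 'in my house'
print("→ ←".join(segments))        # talo→ ←ssa→ ←ni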
Example no. 24
def main():
    """Preprocess text for moses factored modeling."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   help="load analyser model from AFILE",
                   required=True)
    a.add_argument('-s',
                   '--segmenter',
                   metavar='SFILE',
                   help="load segmenter model from SFILE",
                   required=True)
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        print("writign to", outfile.name)
    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[UPOS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)\]")
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            continue
        surfs = line.split()
        for surf in surfs:
            anals = omorfi.analyse(surf)
            segments = omorfi.segment(surf)
            pos_matches = re_pos.finditer(anals[0]['anal'])
            pos = "UNK"
            mrds = []
            lemmas = []
            for pm in pos_matches:
                pos = pm.group(1)
            lemma_matches = re_lemma.finditer(anals[0]['anal'])
            for lm in lemma_matches:
                lemmas += [lm.group(1)]
            mrd_matches = re_mrd.finditer(anals[0]['anal'])
            for mm in mrd_matches:
                if mm.group(1) == 'WORD_ID':
                    mrds = []
                elif mm.group(1) == 'WEIGHT':
                    pass
                else:
                    mrds += [mm.group(2)]
            parts = segments[0]['segments']
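            # segment strings mark boundaries inline: {DB} derivation
            # boundary, {WB} word boundary, {MB} morph boundary and
            # {hyph?} a potential hyphenation point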
            if '{DB}' in parts:
                suffixes = parts[parts.rfind('{DB}') + 4:]
            elif '{WB}' in parts:
                suffixes = parts[parts.rfind('{WB}') + 4:]
            elif '{hyph?}' in parts:
                suffixes = parts[parts.rfind('{hyph?}') + 7:]
            else:
                suffixes = "0"
            if '{' in suffixes:
                morphs = suffixes[suffixes.find("{"):].replace("{MB}", ".")
            else:
                morphs = "0"
            print(surf,
                  '+'.join(lemmas),
                  pos,
                  '.'.join(mrds),
                  morphs,
                  sep='|',
                  end=' ',
                  file=outfile)
        print(file=outfile)
    exit(0)
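
The factor string written for each token above is pipe-separated: surface|lemmas|UPOS|morphosyntactic tags|suffix morphs. A hand-constructed illustration of the shape (the values are made up for the example):

token = '|'.join(["taloissa", "talo", "NOUN", "PL.INE", ".i.ssa"])
print(token, end=' ')  # taloissa|talo|NOUN|PL.INE|.i.ssa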
Example no. 25
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f',
                   '--fsa',
                   metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    a.add_argument('-O',
                   '--oracle',
                   action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u',
                   '--udpipe',
                   metavar="UDPIPE",
                   help='use UDPIPE for additional guesses (experimental)')
    a.add_argument('--hacks',
                   metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('--debug',
                   action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, guesser=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    recognised_comments = [
        'sent_id =', 'text =', 'doc-name:', 'sentence-text:'
    ]
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    # MWE
                    continue
                elif '.' in fields[0]:
                    # a ghost
                    continue
                else:
                    print("Cannot figure out token index",
                          fields[0],
                          file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or (len(anals) == 1 and 'UNKNOWN' in anals[0][0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals:
                if options.debug:
                    debug_analyses_conllu(fields, index, surf, anals,
                                          options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
        elif line.startswith('#'):
            print(line.strip(), file=options.outfile)
            recognised = False
            for rec in recognised_comments:
                if line.startswith('# ' + rec):
                    recognised = True
            if not recognised and options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n')
        elif not line.strip():
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format:", line, sep='\n', file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:",
          unknowns,
          "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%",
          file=options.statfile)
    print("CPU time:",
          cpuend - cpustart,
          "Real time:",
          realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:",
          tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
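
The ID-field triage above follows CoNLL-U conventions: plain integers are ordinary tokens, ranges such as "1-2" are multiword tokens and decimals such as "5.1" are empty nodes. A minimal sketch of the same classification:

def classify_conllu_id(id_field):
    """Classify a CoNLL-U ID field the way the loop above does."""
    try:
        int(id_field)
        return 'token'
    except ValueError:
        if '-' in id_field:
            return 'mwe'    # multiword token range, e.g. "1-2"
        if '.' in id_field:
            return 'empty'  # empty ("ghost") node, e.g. "5.1"
        raise

# classify_conllu_id("3") -> 'token'; classify_conllu_id("1-2") -> 'mwe'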
Example no. 26
def main():
    """Preprocess text for moses factored modeling."""
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir()
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        print("writing to", outfile.name)

    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[POS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)\]")
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            continue
        surfs = line.split()
        for surf in surfs:
            anals = omorfi.analyse(surf)
            segments = omorfi.segment(surf)
            pos_matches = re_pos.finditer(anals[0][0])
            pos = "UNK"
            mrds = []
            lemmas = []
            for pm in pos_matches:
                pos = pm.group(1)
            lemma_matches = re_lemma.finditer(anals[0][0])
            for lm in lemma_matches:
                lemmas += [lm.group(1)]
            mrd_matches = re_mrd.finditer(anals[0][0])
            for mm in mrd_matches:
                if mm.group(1) == 'WORD_ID':
                    mrds = []
                elif mm.group(1) == 'WEIGHT':
                    pass
                else:
                    mrds += [mm.group(2)]
            stemfixes = segments[0][0][
                segments[0][0].rfind("{STUB}"):].replace("{STUB}", "")
            if '{' in stemfixes:
                morphs = stemfixes[stemfixes.find("{"):].replace("{MB}", ".")
            else:
                morphs = '0'
            print(surf, '+'.join(lemmas), pos, '.'.join(mrds),
                  morphs, sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
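
For reference, what the three regular expressions above pull out of an omorfi-style analysis string; the string below is hand-written for the illustration:

import re

anal = "[WORD_ID=talo][POS=NOUN][NUM=PL][CASE=INE][WEIGHT=0.0]"
print(re.search(r"\[WORD_ID=([^]]*)\]", anal).group(1))  # talo
print(re.search(r"\[POS=([^]]*)\]", anal).group(1))      # NOUN
print([m.group(2) for m in re.finditer(r"\[([^=]*)=([^]]*)\]", anal)
       if m.group(1) not in ('WORD_ID', 'WEIGHT')])      # ['NOUN', 'PL', 'INE']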