Exemple #1
0
 def test_parse_file(self):
     """parse_file on a file object holding self.s3 yields one unit, 'vino'."""
     with tempfile.TemporaryFile(mode='w+') as stream:
         # Write the fixture, then rewind so the parser reads from the start.
         stream.write(self.s3)
         stream.seek(0)
         units = list(parse_file(stream))
         unit_count = len(units)
         self.assertEqual(unit_count, 1)
         first_unit = units[0]
         self.assertEqual(first_unit.wordform, 'vino')
Exemple #2
0
 def test_parse_file(self):
     """Round-trip self.s3 through a temporary file and check the parsed unit."""
     with tempfile.TemporaryFile(mode='w+') as handle:
         handle.write(self.s3)
         # Rewind: the write left the position at end-of-file.
         handle.seek(0)
         parsed = list(parse_file(handle))
         self.assertEqual(len(parsed), 1)
         only_unit = parsed[0]
         self.assertEqual(only_unit.wordform, 'vino')
Exemple #3
0
#!/usr/bin/env python3

from streamparser import parse_file, readingToString, known
import sys


def stash(wf, r):
    """Return a bracketed ``<apertium-notrans>`` block for a wordform/reading.

    Args:
        wf: The wordform text to embed.
        r: The reading to embed; it is rendered with ``readingToString``.

    Returns:
        The string ``[<apertium-notrans>WF\\/READING<\\/apertium-notrans>]``
        where each ``/`` separator is preceded by a literal backslash.
    """
    # Use the ``wf`` argument rather than reaching for the module-level loop
    # variable ``lu``, so the function works for any caller.
    # "\\/" is the explicit spelling of the backslash + slash that the
    # unrecognised escape "\/" used to produce; the output is unchanged but
    # no invalid-escape warning is raised.
    return "[<apertium-notrans>{}\\/{}<\\/apertium-notrans>]".format(
        wf, readingToString(r))


# For every known lexical unit with more than one reading, print one line
# holding a stashed + plain copy of the unit per non-compound reading.
for blank, lu in parse_file(sys.stdin, withText=True):
    if not (lu.knownness == known and len(lu.readings) > 1):
        continue
    pieces = []
    for reading in lu.readings:
        # Skip compounds:
        if len(reading) != 1:
            continue
        pieces.append("{}^{}/{}$ ^./.<sent><clb>$".format(
            stash(lu.wordform, reading),
            lu.wordform,
            readingToString(reading)))
    print(" ".join(pieces), end="\n")
Exemple #4
0
## A script for converting kaz-tagger mode's output into a tab-separated,
## easy-to-annotate format.
##
## INPUT: apertium-kaz$ echo "бақшада ма, қайда?" | apertium -d . kaz-tagger
## ^бақшада ма/бақша<n><loc>+ма<qst>$^,/,<cm>$ ^қайда/қайда<adv><itg>+е<cop><aor><p3><sg>$^?/?<sent>$^./.<sent>$
##
## OUTPUT: apertium-kaz$ echo "бақшада ма, қайда?" | apertium -d . kaz-tagger | python3 corpus/format.py
## бақшада ма      бақша   n loc +ма qst
## ,       ,       cm
## қайда   қайда   adv itg +е cop aor p3 sg
## ?       ?       sent
## .       .       sent

import sys, streamparser

# One output line per lexical unit: the wordform, then for each reading its
# first baseform and tags, followed by any further sub-readings as "+baseform".
for lu in streamparser.parse_file(sys.stdin):
    print(lu.wordform, end="\t")
    for reading in lu.readings:
        head = reading[0]
        # Baseform and space-joined tags, separated by a tab.
        print(head.baseform, " ".join(head.tags), sep="\t", end="")
        # An empty slice simply skips the loop when there is a single part.
        for sub in reading[1:]:
            print(" +" + sub.baseform, " ".join(sub.tags), end="")
    # Close the line with three trailing tabs.
    print("\t\t\t")
Exemple #5
0
        post = "\033[0m"
    elif arg == "--context":
        context = True


def fst(x):
    """Return the first item of *x*, which must unpack into exactly two items."""
    first, _second = x
    return first


def snd(x):
    """Return the second item of *x*, which must unpack into exactly two items."""
    _first, second = x
    return second


# Walk the stream; for each lexical unit collect the "@"-prefixed tags found
# in any of its readings and print them wrapped in pre/post.
for blank, lu in streamparser.parse_file(sys.stdin, withText=True):
    if context:
        # In context mode the text preceding each unit is echoed as-is.
        print(blank, end="")
    # Distinct (tag, baseform) pairs over every part of every reading.
    matches = {
        (tag_name, part.baseform)
        for reading in lu.readings
        for part in reading
        for tag_name in part.tags
        if tag_name.startswith("@")
    }
    if matches:
        # Both joins iterate the same unmodified set, so tags and baseforms
        # come out in matching order.
        marked = pre + "".join(pair[0] for pair in matches) + post
        if context:
            print(lu.wordform + marked, end="")
        else:
            print("/".join(pair[1] for pair in matches) + marked)
    elif context:
        print(lu.wordform, end="")
Exemple #6
0
def prepros(filename):
    """Open *filename* and return an iterator over what ``parse_file`` yields.

    The file is opened immediately, so a missing or unreadable path still
    raises at call time. Unlike a bare ``parse_file(open(filename))``, the
    file handle is closed once the returned iterator is exhausted or closed,
    instead of being left to the garbage collector.

    Args:
        filename: Path of the file to parse.

    Returns:
        A generator yielding the items produced by ``parse_file``.
    """
    # NOTE(review): assumes parse_file returns a lazy iterator (as
    # streamparser.parse_file appears to) -- confirm against the import.
    handle = open(filename)

    def _cohorts():
        # Keep the file open only for as long as it is being consumed.
        with handle:
            yield from parse_file(handle)

    return _cohorts()