def test_parse_file(self):
    """parse_file() on a file holding ``self.s3`` yields exactly one unit, 'vino'."""
    with tempfile.TemporaryFile(mode='w+') as stream:
        stream.write(self.s3)
        # Rewind so that parse_file() reads what was just written.
        stream.seek(0)
        units = list(parse_file(stream))

    self.assertEqual(len(units), 1)
    first_unit = units[0]
    self.assertEqual(first_unit.wordform, 'vino')
#!/usr/bin/env python3
# Reads an Apertium stream from stdin.  For every known lexical unit that has
# more than one reading, prints one line holding one entry per non-compound
# reading.  Each entry is the bracketed <apertium-notrans> block produced by
# stash(), followed by the wordform/reading pair as a lexical unit of its own
# and a "." sentence unit.

from streamparser import parse_file, readingToString, known
import sys


def stash(wf, r):
    """Return the bracketed <apertium-notrans> block for a wordform/reading pair.

    wf -- the surface form to record.
    r  -- the reading to record; it is rendered with readingToString().

    The slashes inside the block are written backslash-escaped ("\\/").
    """
    # Raw string: "\/" is not a recognised Python escape sequence, so the
    # non-raw form only works by accident and triggers a warning on current
    # interpreters.  The emitted text is unchanged.
    return r"[<apertium-notrans>{}\/{}<\/apertium-notrans>]".format(
        wf, readingToString(r))


for blank, lu in parse_file(sys.stdin, withText=True):
    # Only known units with more than one reading produce output.
    if lu.knownness == known and len(lu.readings) > 1:
        entries = (
            "{}^{}/{}$ ^./.<sent><clb>$".format(
                stash(lu.wordform, r), lu.wordform, readingToString(r))
            for r in lu.readings
            # Skip compounds:
            if len(r) == 1
        )
        print(" ".join(entries), end="\n")
## Converts the output of the kaz-tagger mode into a tab-separated layout
## that is convenient to annotate by hand.
##
## INPUT:  apertium-kaz$ echo "бақшада ма, қайда?" | apertium -d . kaz-tagger
## ^бақшада ма/бақша<n><loc>+ма<qst>$^,/,<cm>$ ^қайда/қайда<adv><itg>+е<cop><aor><p3><sg>$^?/?<sent>$^./.<sent>$
##
## OUTPUT: apertium-kaz$ echo "бақшада ма, қайда?" | apertium -d . kaz-tagger | python3 corpus/format.py
## (one lexical unit per line; the columns are separated by tabs)
## бақшада ма    бақша    n loc +ма qst
## ,    ,    cm
## қайда    қайда    adv itg +е cop aor p3 sg
## ?    ?    sent
## .    .    sent

import sys
import streamparser

for unit in streamparser.parse_file(sys.stdin):
    # Column 1: the surface form.
    print(unit.wordform, end="\t")
    for reading in unit.readings:
        # Columns 2 and 3: lemma and space-separated tags of the first
        # subreading.
        head = reading[0]
        print(head.baseform, " ".join(head.tags), sep="\t", end="")
        # Any further subreadings are appended to the tag column as
        # " +lemma tags".
        for sub in reading[1:]:
            print(" +" + sub.baseform + " " + " ".join(sub.tags), end="")
    # Three trailing tabs (empty columns), then the newline ending the row.
    print("\t\t\t")
# NOTE(review): this fragment starts inside an option-handling if/elif chain
# whose opening lines are not visible here; `arg`, `pre` and the initial value
# of `context` are presumably set there -- confirm against the full file.
#
# Visible pieces, in order:
#   * `post` is set to the ANSI reset sequence "\033[0m"; the "--context"
#     option sets `context` to True.
#   * fst(x) / snd(x) return the first / second element of a 2-tuple.
#   * Main loop over the (blank, lu) pairs produced by
#     streamparser.parse_file(sys.stdin, withText=True):
#       - in context mode the blank is echoed first;
#       - `m` is the set of (tag, baseform) pairs taken over every subreading
#         of every reading, keeping only tags that start with "@";
#       - if `m` is non-empty, its tags are concatenated and wrapped in
#         pre/post; context mode prints the wordform followed by that string,
#         otherwise the "/"-joined baseforms followed by that string;
#       - if `m` is empty, the wordform is printed only in context mode.
#   * `m` is a set, so the order of the tags/baseforms in the output is not
#     fixed; both map() calls walk the same unmodified set, so the two orders
#     agree with each other.
# NOTE(review): the original indentation is lost, so it cannot be told from
# this line whether the trailing print("") belongs to the non-context branch
# only or runs for every unit that has "@" tags.
post = "\033[0m" elif arg == "--context": context = True def fst(x): (a, _) = x return a def snd(x): (_, b) = x return b for blank, lu in streamparser.parse_file(sys.stdin, withText=True): if context: print(blank, end="") m = set((t, s.baseform) for r in lu.readings for s in r for t in s.tags if t.startswith("@")) if m: tag = pre + "".join(map(fst, m)) + post if context: print(lu.wordform + tag, end="") else: lemma = "/".join(map(snd, m)) print(lemma + tag, end="") print("") elif context: print(lu.wordform, end="")
def prepros(filename):
    """Open *filename* and return an iterator over its parsed cohorts.

    The file is opened immediately, so a missing or unreadable file still
    raises at call time.  The cohorts come from parse_file() and are produced
    lazily; the file is closed as soon as the iterator is exhausted (or is
    closed / abandoned part-way), instead of staying open for the rest of
    the process.

    NOTE(review): assumes parse_file() returns a lazily consumed iterable of
    cohorts -- confirm against the parser library in use.
    """
    fp = open(filename)

    def _cohorts():
        # Keep the file open exactly as long as cohorts are being read.
        with fp:
            for cohort in parse_file(fp):
                yield cohort

    return _cohorts()