Ejemplo n.º 1
0
LEXICON_FILE = "lexicons/pratt.yaml"
ENDINGS_FILE = "stemming.yaml"

lexicon = Lexicon(LEXICON_FILE)
endings = Endings(ENDINGS_FILE)

if __name__ == "__main__":

    with open(TEST_FILE) as f:
        for test in yaml.load(f):
            lemma = strip_length(test.pop("lemma"))
            test_length = test.pop("test_length", True)
            location = test.pop("location", None)

            for parse, form in test.items():
                stem_info = lexicon.stem_info(lemma, parse, context=location)
                if stem_info is None:
                    print("couldn't get stem info for {} {}".format(
                        lemma, parse))
                    continue
                ending_info = endings.ending_info(form, parse)
                valid_stems = (
                    set(strip_length(info.stem)
                        for info in stem_info) & set(info.stem
                                                     for info in ending_info))

                if len(valid_stems) != 1:
                    print(form, parse, lemma)
                    print("    {}".format(stem_info))
                    print("    {}".format(ending_info))
                    for valid_stem in valid_stems:
Ejemplo n.º 2
0
            if ccat_pos != "V-":
                continue

            if lemma in IGNORE_LIST:
                continue

            if ccat_parse[3] == "N":
                parse = ccat_parse[1:4]
            elif ccat_parse[3] == "P":
                parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
            elif ccat_parse[3] == "I":
                parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5]
            else:
                continue

            stem_info = lexicon.stem_info(lemma, parse, context=row["bcv"])
            if stem_info is None:
                print("couldn't get stem info for {} {}".format(lemma, parse))
                continue
            ending_info = endings.ending_info(form, parse)
            valid_stems = (
                set(strip_length(info.stem.replace("|", "")) for info in stem_info) &
                set(info.stem for info in ending_info))

            if len(valid_stems) != 1:
                print(form, parse, lemma, len(valid_stems))
                print("    {}".format(stem_info))
                print("    {}".format(ending_info))
                for valid_stem in valid_stems:
                    for info in stem_info:
                        if info.stem == valid_stem:
Ejemplo n.º 3
0

lexicon = Lexicon(LEXICON_FILE)
endings = Endings(ENDINGS_FILE)


if __name__ == "__main__":

    with open(TEST_FILE) as f:
        for test in yaml.load(f):
            lemma = strip_length(test.pop("lemma"))
            test_length = test.pop("test_length", True)
            location = test.pop("location", None)

            for parse, form in test.items():
                stem_info = lexicon.stem_info(lemma, parse, context=location)
                if stem_info is None:
                    print("couldn't get stem info for {} {}".format(lemma, parse))
                    continue
                ending_info = endings.ending_info(form, parse)
                valid_stems = (
                    set(strip_length(info.stem) for info in stem_info) &
                    set(info.stem for info in ending_info))

                if len(valid_stems) != 1:
                    print(form, parse, lemma)
                    print("    {}".format(stem_info))
                    print("    {}".format(ending_info))
                    for valid_stem in valid_stems:
                        for info in stem_info:
                            if info.stem == valid_stem:
Ejemplo n.º 4
0
            if ccat_pos != "V-":
                continue

            if lemma in IGNORE_LIST:
                continue

            if ccat_parse[3] == "N":
                parse = ccat_parse[1:4]
            elif ccat_parse[3] == "P":
                parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
            elif ccat_parse[3] == "I":
                parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5]
            else:
                continue

            stem_info = lexicon.stem_info(lemma, parse, context=row["bcv"])
            if stem_info is None:
                print("couldn't get stem info for {} {}".format(lemma, parse))
                continue
            ending_info = endings.ending_info(form, parse)
            valid_stems = (set(
                strip_length(info.stem.replace("|", ""))
                for info in stem_info) & set(info.stem
                                             for info in ending_info))

            if len(valid_stems) != 1:
                print(form, parse, lemma, len(valid_stems))
                print("    {}".format(stem_info))
                print("    {}".format(ending_info))
                for valid_stem in valid_stems:
                    for info in stem_info: