def get_stem_set(self, parse, norm, test_length):
        """Collect the candidate stems of ``norm`` under the rules for ``parse``.

        Each rule has the shape ``s1|s2>s3<s4|s5``.  A rule applies when the
        debreathed form matches ``(.*s1)s3s5$``; the candidate stem is the
        form with that match replaced by group 1 followed by ``s2``, then
        passed through ``strip_accents`` and ``rebreath``.

        Args:
            parse: key into ``stemming_rules``.
            norm: the form to analyse.
            test_length: when falsy, each rule is passed through
                ``strip_length`` before it is split.

        Returns:
            A set of candidate stems (possibly empty), or ``None`` when
            ``parse`` has no entry in ``stemming_rules``.

        Raises:
            Exception: if a ``{"ref": ...}`` entry names a missing rule set.
        """
        form = debreath(norm)

        if parse not in stemming_rules:
            return None

        # Follow {"ref": key} indirections until the actual rules are reached.
        rules = stemming_rules[parse]
        while isinstance(rules, dict) and "ref" in rules:
            target = rules["ref"]
            if target not in stemming_rules:
                raise Exception("ref to {} which doesn't exist".format(target))
            rules = stemming_rules[target]

        stems = set()
        for rule in rules:
            if not test_length:
                rule = strip_length(rule)
            s1, middle, s5 = rule.split("|")
            s2, tail = middle.split(">")
            s3, _s4 = tail.split("<")  # s4 is parsed but not used here
            # Parentheses in the ending parts are literal, not regex groups.
            s3 = s3.replace("(", "\\(").replace(")", "\\)")
            s5 = s5.replace("(", "\\(").replace(")", "\\)")
            pattern = "(.*{}){}{}$".format(s1, s3, s5)
            if re.match(pattern, form):
                restored = re.sub(pattern, r"\1" + s2, form)
                stems.add(rebreath(strip_accents(restored)))

        return stems
def calculate_form(entry, parse):
    """Resolve ``parse`` against ``entry`` and return the resulting form.

    ``parse`` is split on "."; for each step the current object is called and
    the attribute named by the step is fetched from the result.  Steps whose
    first character is 1, 2 or 3 are looked up with a leading underscore.

    Returns:
        A pair ``(form, origin)``: ``form`` is the final call's result with
        "+" removed and ``strip_length`` applied; ``origin`` is the class name
        of the object the final attribute is bound to, plus its
        ``__qualname__``.
    """
    current = entry
    for name in parse.split("."):
        attr = "_" + name if name[0] in "123" else name
        current = getattr(current(), attr)

    form = strip_length(current().replace("+", ""))
    origin = (current.__self__.__class__.__name__, current.__qualname__)
    return form, origin
 def predict_and_test(self, lemma, parse, norm, location, stem_set):
     """Compare the lexicon's predicted stem(s) with ``stem_set`` and tally it.

     Args:
         lemma, parse: passed to ``self.regex_list`` to obtain the prediction.
         norm: the attested form; only used in the report messages.
         location: passed to ``regex_list`` as ``context`` and shown in
             messages.
         stem_set: candidate stems derived from the form.  ``None`` (e.g.
             what ``get_stem_set`` returns for a parse without rules) is
             treated the same as an empty set instead of raising TypeError.

     Outcomes are recorded on ``self.counter``: ``skip`` when nothing could
     be predicted, ``fail`` when there are no candidate stems or none of the
     "/"-separated predictions is among them, ``success`` otherwise.
     """
     predicted = self.regex_list(lemma, parse, context=location)
     if not predicted:
         self.counter.skip("[{}] couldn't predict {} {} {}; got stem_set: {}".format(location, lemma, parse, norm, stem_set))
         return

     # "|" is a marker inside lexicon stems; it is dropped before comparing.
     predicted = predicted.replace("|", "")

     if not stem_set:
         self.counter.fail("[{}] didn't get any match for {} {} {}".format(location, lemma, parse, norm))
         return

     if any(strip_length(s) in stem_set for s in predicted.split("/")):
         self.counter.success()
     else:
         self.counter.fail("got {} for {} {} {} (lexicon has {})".format(", ".join(stem_set), lemma, parse, norm, predicted))
Exemple #4
0
def test_stemming(test_file, lexicon_file):
    """Run every stemming case in ``test_file`` against ``lexicon_file``.

    ``test_file`` is a YAML list of mappings.  Each mapping carries a
    ``lemma``, optional ``test_length`` (default True) and ``location``
    (default ""); every remaining key/value pair is a parse and its form,
    which is handed to ``Stemmer.stem``.  The tally is printed at the end via
    ``stemmer.counter.results()``.
    """
    stemmer = Stemmer(lexicon_file)

    # safe_load: yaml.load() without a Loader is rejected by PyYAML >= 6 and
    # can build arbitrary objects on older versions.  The explicit encoding
    # keeps the Greek text intact on platforms whose default is not UTF-8.
    with open(test_file, encoding="utf-8") as f:
        tests = yaml.safe_load(f) or []  # an empty file yields None

    for test in tests:
        lemma = strip_length(test.pop("lemma"))
        test_length = test.pop("test_length", True)
        location = test.pop("location", "")

        # What is left in the mapping are parse -> form pairs.
        for parse, form in test.items():
            stemmer.stem(location, lemma, parse, form, test_length)

    stemmer.counter.results()
Exemple #5
0
def test_generation(test_file, lexicon_file):
    """Check that the lexicon generates the forms listed in ``test_file``.

    ``test_file`` is a YAML list of mappings.  Each mapping carries a
    ``lemma`` and optional ``location`` (default ""); every remaining
    key/value pair is a parse and the expected form.  For each pair:

    * ``fail`` when nothing is generated, or the expected form is not among
      the "/"-separated alternatives generated;
    * ``success`` when the generated string equals the expected form;
    * ``skip`` when the expected form is only one of several alternatives.

    All comparisons are made after ``strip_length``.
    """
    lexicon = Lexicon(lexicon_file)
    counter = Counter()

    # safe_load: yaml.load() without a Loader is rejected by PyYAML >= 6 and
    # can build arbitrary objects on older versions.  The explicit encoding
    # keeps the Greek text intact on platforms whose default is not UTF-8.
    with open(test_file, encoding="utf-8") as f:
        tests = yaml.safe_load(f) or []  # an empty file yields None

    for test in tests:
        lemma = test.pop("lemma")
        location = test.pop("location", "")
        for parse, form in test.items():
            predicted = lexicon.generate(lemma, parse, context=location)
            if predicted is None:
                counter.fail("didn't know how to work out {} {} {}".format(lemma, parse, form))
            elif strip_length(form) == strip_length(predicted):
                counter.success()
            elif strip_length(form) not in [strip_length(p) for p in predicted.split("/")]:
                counter.fail("{} {} got {} instead of {} in {}".format(lemma, parse, predicted, form, location))
            else:
                # Expected form is one of several generated alternatives.
                counter.skip("{} {} {} {} {}".format(lemma, parse, form, predicted, location))

    counter.results()
Exemple #6
0
    def ending_info(self, form, parse, test_length=False):
        """Return every stem/ending analysis of ``form`` allowed for ``parse``.

        Each rule has the shape ``s1|s2>s3<s4|s5``.  A rule applies when the
        debreathed form matches ``(.*s1)s3s5$``; the stem is the form with
        that match replaced by group 1 followed by ``s2``, then passed through
        ``strip_accents`` and ``rebreath``.

        Args:
            form: the form to analyse.
            parse: key into ``self.endings``.
            test_length: when falsy (the default), each rule is passed through
                ``strip_length`` before it is split.

        Returns:
            A set of ``EndingInfo(stem, (s1, s2, s3, s4, s5))`` (possibly
            empty), where ``s3`` and ``s5`` are the parenthesis-escaped
            values; ``None`` when ``parse`` has no entry in ``self.endings``.

        Raises:
            Exception: if a ``{"ref": ...}`` entry names a missing rule set.
        """
        form = debreath(form)

        if parse not in self.endings:
            return None

        # Follow {"ref": key} indirections until the actual rules are reached.
        rules = self.endings[parse]
        while isinstance(rules, dict) and "ref" in rules:
            target = rules["ref"]
            if target not in self.endings:
                raise Exception("ref to {} which doesn't exist".format(target))
            rules = self.endings[target]

        matches = set()
        for rule in rules:
            if not test_length:
                rule = strip_length(rule)
            s1, middle, s5 = rule.split("|")
            s2, tail = middle.split(">")
            s3, s4 = tail.split("<")
            # Parentheses in the ending parts are literal, not regex groups.
            s3 = s3.replace("(", "\\(").replace(")", "\\)")
            s5 = s5.replace("(", "\\(").replace(")", "\\)")
            pattern = "(.*{}){}{}$".format(s1, s3, s5)

            if not re.match(pattern, form):
                continue

            restored = re.sub(pattern, r"\1" + s2, form)
            stem = rebreath(strip_accents(restored))
            matches.add(EndingInfo(stem, (s1, s2, s3, s4, s5)))

        return matches
Exemple #7
0
from characters import strip_length

from parse import Lexicon, Endings

# Paths of the test cases, the lexicon and the ending rules (pratt set).
TEST_FILE = "tests/pratt.yaml"
LEXICON_FILE = "lexicons/pratt.yaml"
ENDINGS_FILE = "stemming.yaml"

lexicon = Lexicon(LEXICON_FILE)
endings = Endings(ENDINGS_FILE)

if __name__ == "__main__":

    # safe_load: yaml.load() without a Loader is rejected by PyYAML >= 6 and
    # can build arbitrary objects on older versions.  The explicit encoding
    # keeps the Greek text intact on platforms whose default is not UTF-8.
    with open(TEST_FILE, encoding="utf-8") as f:
        for test in yaml.safe_load(f) or []:
            lemma = strip_length(test.pop("lemma"))
            test_length = test.pop("test_length", True)
            location = test.pop("location", None)

            # What is left in the mapping are parse -> form pairs.
            for parse, form in test.items():
                stem_info = lexicon.stem_info(lemma, parse, context=location)
                if stem_info is None:
                    print("couldn't get stem info for {} {}".format(
                        lemma, parse))
                    continue
                ending_info = endings.ending_info(form, parse)
                # Stems the lexicon proposes that also fit the ending rules.
                valid_stems = (
                    {strip_length(info.stem) for info in stem_info}
                    & {info.stem for info in ending_info})

# Paths of the test cases, the lexicon and the ending rules (dik set).
TEST_FILE = "tests/dik.yaml"
LEXICON_FILE = "lexicons/dik.yaml"
ENDINGS_FILE = "stemming.yaml"


lexicon = Lexicon(LEXICON_FILE)
endings = Endings(ENDINGS_FILE)


if __name__ == "__main__":

    # safe_load: yaml.load() without a Loader is rejected by PyYAML >= 6 and
    # can build arbitrary objects on older versions.  The explicit encoding
    # keeps the Greek text intact on platforms whose default is not UTF-8.
    with open(TEST_FILE, encoding="utf-8") as f:
        for test in yaml.safe_load(f) or []:
            lemma = strip_length(test.pop("lemma"))
            test_length = test.pop("test_length", True)
            location = test.pop("location", None)

            # What is left in the mapping are parse -> form pairs.
            for parse, form in test.items():
                stem_info = lexicon.stem_info(lemma, parse, context=location)
                if stem_info is None:
                    print("couldn't get stem info for {} {}".format(lemma, parse))
                    continue
                ending_info = endings.ending_info(form, parse)
                # Stems the lexicon proposes that also fit the ending rules.
                valid_stems = (
                    {strip_length(info.stem) for info in stem_info}
                    & {info.stem for info in ending_info})

                # Report every form that does not resolve to exactly one stem.
                if len(valid_stems) != 1:
                    print(form, parse, lemma)
            # NOTE(review): this excerpt relies on ccat_parse, lemma, form, row,
            # lexicon and endings being bound by code outside the visible lines.
            # Build the parse key from the CCAT parse code; the character at
            # index 3 selects the layout (presumably the mood column of the
            # MorphGNT parse code -- confirm).  Other values are skipped.
            if ccat_parse[3] == "N":
                parse = ccat_parse[1:4]
            elif ccat_parse[3] == "P":
                parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
            elif ccat_parse[3] == "I":
                parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5]
            else:
                continue

            stem_info = lexicon.stem_info(lemma, parse, context=row["bcv"])
            if stem_info is None:
                print("couldn't get stem info for {} {}".format(lemma, parse))
                continue
            ending_info = endings.ending_info(form, parse)
            # Stems proposed by the lexicon (with "|" removed and length
            # stripped) that are also produced by the ending rules.
            valid_stems = (
                set(strip_length(info.stem.replace("|", "")) for info in stem_info) &
                set(info.stem for info in ending_info))

            # Dump diagnostics whenever the form does not resolve to exactly
            # one stem.
            if len(valid_stems) != 1:
                print(form, parse, lemma, len(valid_stems))
                print("    {}".format(stem_info))
                print("    {}".format(ending_info))
                for valid_stem in valid_stems:
                    # NOTE(review): valid_stems holds normalised lexicon stems,
                    # but this compares the raw info.stem, so lexicon entries
                    # containing "|" or length marks may never be printed here
                    # -- confirm whether that is intended.
                    for info in stem_info:
                        if info.stem == valid_stem:
                            print("    {}".format(info))
                    for info in ending_info:
                        if info.stem == valid_stem:
                            print("    {}".format(info))
Exemple #10
0
        # NOTE(review): this excerpt relies on row, ccat_pos, ccat_parse, norm,
        # lexicon and counter being bound by code outside the visible lines.
        lemma = row["lemma"]
        # Only rows whose part-of-speech code is "V-" are checked.
        if ccat_pos != "V-":
            continue

        # Build the parse key from the CCAT parse code; the character at
        # index 3 selects the layout (presumably the mood column of the
        # MorphGNT parse code -- confirm).  Other values are skipped.
        if ccat_parse[3] == "N":
            parse = ccat_parse[1:4]
        elif ccat_parse[3] == "P":
            parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
        elif ccat_parse[3] == "I":
            parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5]
        else:
            continue

        # Compare the generated form(s) with the attested form, ignoring
        # length marks: exact match -> success, attested form among several
        # "/"-separated alternatives -> skip, anything else -> fail.
        predicted = lexicon.generate(lemma, parse)
        if predicted is None:
            counter.fail("didn't know how to work out {} {} {}".format(
                lemma, parse, norm))
        elif strip_length(norm) == strip_length(predicted):
            counter.success()
            continue
        elif strip_length(norm) not in [
                strip_length(p) for p in predicted.split("/")
        ]:
            counter.fail("{} {} got {} instead of {} in {}".format(
                lemma, parse, predicted, norm, row["bcv"]))
        else:
            counter.skip("{} {} {} {} {}".format(lemma, parse, norm, predicted,
                                                 row["bcv"]))

# Print the accumulated tally once all rows have been processed.
counter.results()
            words[lemma]["aorist.mp2.actual"].add(norm)


lexicon = Lexicon("lexicons/morphgnt.yaml")

# For every lemma (in collation order) generate the six parses below and
# record a generated form only in slots that have no ".actual" entry.
for k in sorted(words.keys(), key=collator.sort_key):
    PAN_generated = lexicon.generate(k, "PAN")
    PMN_generated = lexicon.generate(k, "PMN")
    PPN_generated = lexicon.generate(k, "PPN")
    AAN_generated = lexicon.generate(k, "AAN")
    AMN_generated = lexicon.generate(k, "AMN")
    APN_generated = lexicon.generate(k, "APN")

    # (generated form, key that blocks the back-fill, key that receives it).
    # PMN and PPN deliberately share the "present.mp1" slot.
    slots = (
        (PAN_generated, "present.act.actual", "present.act.generated"),
        (PMN_generated, "present.mp1.actual", "present.mp1.generated"),
        (PPN_generated, "present.mp1.actual", "present.mp1.generated"),
        (AAN_generated, "aorist.act.actual", "aorist.act.generated"),
        (AMN_generated, "aorist.mp1.actual", "aorist.mp1.generated"),
        (APN_generated, "aorist.mp2.actual", "aorist.mp2.generated"),
    )
    for generated, actual_key, generated_key in slots:
        if generated and actual_key not in words[k]:
            words[k][generated_key].add(strip_length(generated))
Exemple #12
0
        if ccat_parse[1:4] == "APN":
            words[lemma]["aorist.mp2.actual"].add(norm)

lexicon = Lexicon("lexicons/morphgnt.yaml")

# For every lemma (in collation order) generate the six parses below and
# record a generated form only in slots that have no ".actual" entry.
for k in sorted(words.keys(), key=collator.sort_key):
    PAN_generated = lexicon.generate(k, "PAN")
    PMN_generated = lexicon.generate(k, "PMN")
    PPN_generated = lexicon.generate(k, "PPN")
    AAN_generated = lexicon.generate(k, "AAN")
    AMN_generated = lexicon.generate(k, "AMN")
    APN_generated = lexicon.generate(k, "APN")

    # (generated form, key that blocks the back-fill, key that receives it).
    # PMN and PPN deliberately share the "present.mp1" slot.
    slots = (
        (PAN_generated, "present.act.actual", "present.act.generated"),
        (PMN_generated, "present.mp1.actual", "present.mp1.generated"),
        (PPN_generated, "present.mp1.actual", "present.mp1.generated"),
        (AAN_generated, "aorist.act.actual", "aorist.act.generated"),
        (AMN_generated, "aorist.mp1.actual", "aorist.mp1.generated"),
        (APN_generated, "aorist.mp2.actual", "aorist.mp2.generated"),
    )
    for generated, actual_key, generated_key in slots:
        if generated and actual_key not in words[k]:
            words[k][generated_key].add(strip_length(generated))
# Walk books 1..27, and for every row tagged "V-" compare the form the
# lexicon generates with the attested ``norm`` (length marks ignored).
for book_num in range(1, 28):
    for row in morphgnt_rows(book_num):
        ccat_pos = row["ccat-pos"]
        ccat_parse = row["ccat-parse"]
        norm = row["norm"]
        lemma = row["lemma"]
        if ccat_pos != "V-":
            continue

        # The character at index 3 of the parse code selects how the parse
        # key is assembled; any other value is not checked.
        selector = ccat_parse[3]
        if selector == "N":
            parse = ccat_parse[1:4]
        elif selector == "P":
            parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
        elif selector == "I":
            parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5]
        else:
            continue

        predicted = lexicon.generate(lemma, parse)
        if predicted is None:
            counter.fail("didn't know how to work out {} {} {}".format(lemma, parse, norm))
            continue

        if strip_length(norm) == strip_length(predicted):
            counter.success()
            continue

        # Not an exact match: either the attested form is one of several
        # "/"-separated alternatives (skip) or it is missing entirely (fail).
        if strip_length(norm) in [strip_length(p) for p in predicted.split("/")]:
            counter.skip("{} {} {} {} {}".format(lemma, parse, norm, predicted, row["bcv"]))
        else:
            counter.fail("{} {} got {} instead of {} in {}".format(lemma, parse, predicted, norm, row["bcv"]))

counter.results()
    def generate(self, lemma, parse, allow_form_override=True, context=None):
        """Generate the form(s) of ``lemma`` for ``parse``.

        Args:
            lemma: key into ``self.lexicon``.
            parse: key into ``stemming_rules`` and into the per-lemma
                ``forms`` / ``endings`` overrides.
            allow_form_override: when true, a truthy entry under the lemma's
                ``forms`` mapping for ``parse`` is returned as-is.
            context: forwarded to ``self.regex_list``.

        Returns:
            * the ``forms`` override, when one applies;
            * ``None`` when no stems are available (lemma not in the lexicon
              or ``regex_list`` returned ``None``), when ``parse`` is not in
              ``stemming_rules``, or when a ``ref`` points at a missing rule
              set;
            * otherwise a "/"-joined string of the de-duplicated, rebreathed
              candidates (an empty string if no rule matched any stem).
        """
        answers = []
        stems = None
        accent_override = None
        is_enclitic = False
        ending_override = None

        if lemma in self.lexicon:
            # An explicit form listed in the lexicon wins over generation.
            if allow_form_override:
                answer = self.lexicon[lemma].get("forms", {}).get(parse)
                if answer:
                    return answer

            stems = self.regex_list(lemma, parse, context)

            # Accent settings are keyed by the part of the parse before the
            # first "." and are only consulted for dotted parses.
            if "." in parse:
                accents = self.lexicon[lemma].get("accents", {}).get(parse.split(".")[0])
                if accents == "enclitic":
                    is_enclitic = True
                else:
                    accent_override = accents

            ending_override = self.lexicon[lemma].get("endings", {}).get(parse)

        if stems is None:
            return
        else:
            # regex_list yields "/"-separated alternatives.
            stems = stems.split("/")

        if parse not in stemming_rules:
            return

        for stem in stems:
            stem = debreath(stem)
            # Follow {"ref": key} indirections until the actual rules are
            # reached.
            pairs = stemming_rules[parse]
            while isinstance(pairs, dict) and "ref" in pairs:
                if pairs["ref"] in stemming_rules:
                    pairs = stemming_rules[pairs["ref"]]
                else:
                    # @@@ raise error?
                    return
            # Rules with a non-empty s1+s2 go to base_endings; rules with an
            # empty s1+s2 are kept apart as a fallback.
            base_endings = []
            default = []
            for rule in pairs:
                # Rule shape: s1|s2>s3<s4|s5 (s4 is not used here).
                s1, s234, s5 = rule.split("|")
                s2, s34 = s234.split(">")
                s3, s4 = s34.split("<")

                # The rule applies when the stem ends in the unaccented
                # s1+s2; s2 is then cut off to obtain the base.
                if stem.endswith(strip_accents(s1 + s2)):
                    if s2:
                        base = stem[:-len(s2)]
                    else:
                        base = stem
                else:
                    continue

                # A per-lemma ending override replaces the rule's own ending.
                if ending_override:
                    ending_list = ending_override.split("/")
                else:
                    ending_list = [s3 + s5]

                if s1 + s2:
                    base_endings.append((base, ending_list))
                else:
                    default.append((base, ending_list))

            # only use default if there are no other options
            if len(base_endings) == 0 and len(default) > 0:
                base_endings = default

            for base, ending_list in base_endings:
                for ending in ending_list:
                    # Pick the accentuation helper for this base + ending.
                    # NOTE(review): presumably accent() is truthy when the
                    # ending already carries an accent -- confirm.
                    if accent(ending):
                        answers.append((base + ending).replace("|", ""))
                    elif is_enclitic:
                        answers.append(make_oxytone(base + ending).replace("|", ""))
                    else:
                        # parse[2] == "P" selects the branch that handles the
                        # AAP.* / PAP.* parses below.
                        if parse[2] == "P":
                            if accent_override:
                                answers.append(persistent(base + ending, accent_override))
                            elif parse == "AAP.NSM" and ending == "ων":
                                answers.append(make_oxytone(base + ending).replace("|", ""))
                            elif parse == "AAP.NSM" and ending == "_3+ς":
                                answers.append(make_oxytone(base + ending).replace("|", ""))
                            elif parse == "PAP.NSM" and ending == "_3+ς":
                                answers.append(make_oxytone(base + ending).replace("|", ""))
                            elif parse[0:3] == "AAP" and parse != "AAP.NSM":
                                # calculate NSM
                                # NOTE(review): generate() can return None, in
                                # which case .split below raises
                                # AttributeError -- confirm this cannot happen.
                                nsms = self.generate(lemma, "AAP.NSM", context=context)
                                nsms = nsms.split("/")
                                for nsm in nsms:
                                    if nsm.endswith(("ών", "ούς")):
                                        answers.append(persistent(base + ending, nsm))
                                    else:
                                        answers.append(persistent(base + ending, lemma))
                            elif parse[0:3] == "PAP" and parse != "PAP.NSM":
                                # calculate NSM
                                # NOTE(review): unlike the AAP branch this call
                                # does not forward context, and a None result
                                # would raise AttributeError on .split --
                                # confirm both are intended.
                                nsms = self.generate(lemma, "PAP.NSM").split("/")
                                for nsm in nsms:
                                    nsm = strip_length(nsm)
                                    answers.append(persistent(base + ending, nsm))
                            else:
                                answers.append(recessive(base + ending, default_short=True))
                        elif parse[0:3] in ["AAN", "XAN", "XMN", "XPN"]:
                            answers.append(on_penult(base + ending, default_short=True))
                        elif parse[0:3] == "PAN" and stem.endswith("!"):
                            answers.append(on_penult(base + ending, default_short=True))
                        else:
                            answers.append(recessive(base + ending, default_short=True))

        return "/".join(remove_duplicates(rebreath(w) for w in answers))
Exemple #15
0
            # NOTE(review): this excerpt relies on ccat_parse, lemma, form, row,
            # lexicon and endings being bound by code outside the visible lines.
            # Build the parse key from the CCAT parse code; the character at
            # index 3 selects the layout (presumably the mood column of the
            # MorphGNT parse code -- confirm).  Other values are skipped.
            if ccat_parse[3] == "N":
                parse = ccat_parse[1:4]
            elif ccat_parse[3] == "P":
                parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
            elif ccat_parse[3] == "I":
                parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5]
            else:
                continue

            stem_info = lexicon.stem_info(lemma, parse, context=row["bcv"])
            if stem_info is None:
                print("couldn't get stem info for {} {}".format(lemma, parse))
                continue
            ending_info = endings.ending_info(form, parse)
            # Stems proposed by the lexicon (with "|" removed and length
            # stripped) that are also produced by the ending rules.
            valid_stems = (set(
                strip_length(info.stem.replace("|", ""))
                for info in stem_info) & set(info.stem
                                             for info in ending_info))

            # Dump diagnostics whenever the form does not resolve to exactly
            # one stem.
            if len(valid_stems) != 1:
                print(form, parse, lemma, len(valid_stems))
                print("    {}".format(stem_info))
                print("    {}".format(ending_info))
                for valid_stem in valid_stems:
                    # NOTE(review): valid_stems holds normalised lexicon stems,
                    # but this compares the raw info.stem, so lexicon entries
                    # containing "|" or length marks may never be printed here
                    # -- confirm whether that is intended.
                    for info in stem_info:
                        if info.stem == valid_stem:
                            print("    {}".format(info))
                    for info in ending_info:
                        if info.stem == valid_stem:
                            print("    {}".format(info))