def get_stem_set(self, parse, norm, test_length):
    stem_set = set()
    norm = debreath(norm)

    if parse in stemming_rules:
        pairs = stemming_rules[parse]
        while isinstance(pairs, dict) and "ref" in pairs:
            if pairs["ref"] in stemming_rules:
                pairs = stemming_rules[pairs["ref"]]
            else:
                raise Exception(
                    "ref to {} which doesn't exist".format(pairs["ref"]))
        for entry in pairs:
            if not test_length:
                entry = strip_length(entry)
            s1, s234, s5 = entry.split("|")
            s2, s34 = s234.split(">")
            s3, s4 = s34.split("<")
            s3 = s3.replace("(", "\\(").replace(")", "\\)")
            s5 = s5.replace("(", "\\(").replace(")", "\\)")
            regex_pair = ("(.*{}){}{}$".format(s1, s3, s5), s2)
            if re.match(regex_pair[0], norm):
                stem_set.add(
                    rebreath(
                        strip_accents(
                            re.sub(regex_pair[0], r"\1" + regex_pair[1], norm))))
    else:
        return None

    return stem_set

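# A minimal, hedged illustration of the rule format get_stem_set() parses.
# Each entry is "s1|s2>s3<s4|s5": s1+s2 is stem-final material, s3+s5 is the
# surface ending, and the derived regex rewrites a matching form back to a
# candidate stem. The entry below is invented for illustration; real entries
# live in the stemming-rules data.
import re

entry = "|><|ομεν"                          # hypothetical rule for a -ομεν ending
s1, s234, s5 = entry.split("|")             # s1 = ""
s2, s34 = s234.split(">")                   # s2 = "" (text appended to the stem)
s3, s4 = s34.split("<")                     # s3 = s4 = ""
pattern = "(.*{}){}{}$".format(s1, s3, s5)  # -> "(.*)ομεν$"

# the captured group plus s2 is the candidate stem
assert re.sub(pattern, r"\1" + s2, "λεγομεν") == "λεγ"
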
def calculate_form(entry, parse):
    c = entry
    for step in parse.split("."):
        if step[0] in "123":
            step = "_" + step
        c = getattr(c(), step)
    return strip_length(c().replace("+", "")), (
        c.__self__.__class__.__name__, c.__qualname__)

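# Hedged, self-contained sketch of the nested-paradigm shape that
# calculate_form() walks. These demo classes are invented for illustration;
# the real paradigm classes are defined elsewhere in the project.
class PAI:
    def _1S(self):
        return "λύω+"  # "+" marks a morpheme boundary; removed on return

class DemoEntry:
    def PAI(self):
        return PAI()

# calculate_form(DemoEntry, "PAI.1S") evaluates DemoEntry().PAI()._1S(),
# prefixing "_" because "1S" is not a valid Python identifier, and returns
# the form (with "+" removed and vowel-length marks stripped) together with
# ("PAI", "PAI._1S") identifying the paradigm cell it came from.
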
def predict_and_test(self, lemma, parse, norm, location, stem_set):
    predicted = self.regex_list(lemma, parse, context=location)
    if predicted:
        predicted = predicted.replace("|", "")
        if len(stem_set) > 0:
            if any(strip_length(s) in stem_set for s in predicted.split("/")):
                self.counter.success()
            else:
                self.counter.fail(
                    "got {} for {} {} {} (lexicon has {})".format(
                        ", ".join(stem_set), lemma, parse, norm, predicted))
        else:
            self.counter.fail(
                "[{}] didn't get any match for {} {} {}".format(
                    location, lemma, parse, norm))
    else:
        self.counter.skip(
            "[{}] couldn't predict {} {} {}; got stem_set: {}".format(
                location, lemma, parse, norm, stem_set))

def test_stemming(test_file, lexicon_file):
    stemmer = Stemmer(lexicon_file)
    with open(test_file) as f:
        for test in yaml.safe_load(f):
            lemma = strip_length(test.pop("lemma"))
            test_length = test.pop("test_length", True)
            location = test.pop("location", "")
            for parse, form in test.items():
                stemmer.stem(location, lemma, parse, form, test_length)
    stemmer.counter.results()

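# The test-file shape implied by the pops above (inferred from this code,
# not a normative spec): a YAML list of mappings, each with a "lemma" key,
# optional "test_length" and "location", and every remaining key a parse
# code mapped to the expected form, e.g. (values illustrative):
#
#   - lemma: λύω
#     location: "paradigm 1"
#     PAI.1S: λύω
#     PAI.2S: λύεις
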
def test_generation(test_file, lexicon_file):
    lexicon = Lexicon(lexicon_file)
    counter = Counter()
    with open(test_file) as f:
        for test in yaml.safe_load(f):
            lemma = test.pop("lemma")
            location = test.pop("location", "")
            for parse, form in test.items():
                predicted = lexicon.generate(lemma, parse, context=location)
                if predicted is None:
                    counter.fail("didn't know how to work out {} {} {}".format(
                        lemma, parse, form))
                elif strip_length(form) == strip_length(predicted):
                    counter.success()
                elif strip_length(form) not in [
                        strip_length(p) for p in predicted.split("/")]:
                    counter.fail("{} {} got {} instead of {} in {}".format(
                        lemma, parse, predicted, form, location))
                else:
                    counter.skip("{} {} {} {} {}".format(
                        lemma, parse, form, predicted, location))
    counter.results()

def ending_info(self, form, parse, test_length=False):
    stem_set = set()
    form = debreath(form)

    if parse in self.endings:
        pairs = self.endings[parse]
        while isinstance(pairs, dict) and "ref" in pairs:
            if pairs["ref"] in self.endings:
                pairs = self.endings[pairs["ref"]]
            else:
                raise Exception(
                    "ref to {} which doesn't exist".format(pairs["ref"]))
        for entry in pairs:
            if not test_length:
                entry = strip_length(entry)
            s1, s234, s5 = entry.split("|")
            s2, s34 = s234.split(">")
            s3, s4 = s34.split("<")
            s3 = s3.replace("(", "\\(").replace(")", "\\)")
            s5 = s5.replace("(", "\\(").replace(")", "\\)")
            regex_pair = ("(.*{}){}{}$".format(s1, s3, s5), s2)
            if re.match(regex_pair[0], form):
                stem = rebreath(
                    strip_accents(
                        re.sub(regex_pair[0], r"\1" + regex_pair[1], form)))
                stem_set.add(EndingInfo(stem, (s1, s2, s3, s4, s5)))
    else:
        return None

    return stem_set

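# ending_info() collects EndingInfo records rather than bare stems, so the
# caller can see which rule licensed each candidate. A minimal definition
# consistent with its use here (attribute access via .stem, hashable for
# set membership) would be a namedtuple -- an assumption, since the real
# definition lives elsewhere:
from collections import namedtuple

EndingInfo = namedtuple("EndingInfo", ["stem", "components"])
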
import yaml

from characters import strip_length
from parse import Lexicon, Endings

TEST_FILE = "tests/pratt.yaml"
LEXICON_FILE = "lexicons/pratt.yaml"
ENDINGS_FILE = "stemming.yaml"

lexicon = Lexicon(LEXICON_FILE)
endings = Endings(ENDINGS_FILE)

if __name__ == "__main__":
    with open(TEST_FILE) as f:
        for test in yaml.safe_load(f):
            lemma = strip_length(test.pop("lemma"))
            test_length = test.pop("test_length", True)
            location = test.pop("location", None)
            for parse, form in test.items():
                stem_info = lexicon.stem_info(lemma, parse, context=location)
                if stem_info is None:
                    print("couldn't get stem info for {} {}".format(
                        lemma, parse))
                    continue
                ending_info = endings.ending_info(form, parse)
                valid_stems = (
                    set(strip_length(info.stem) for info in stem_info) &
                    set(info.stem for info in ending_info))
                if len(valid_stems) != 1:
                    print(form, parse, lemma)

TEST_FILE = "tests/dik.yaml" LEXICON_FILE = "lexicons/dik.yaml" ENDINGS_FILE = "stemming.yaml" lexicon = Lexicon(LEXICON_FILE) endings = Endings(ENDINGS_FILE) if __name__ == "__main__": with open(TEST_FILE) as f: for test in yaml.load(f): lemma = strip_length(test.pop("lemma")) test_length = test.pop("test_length", True) location = test.pop("location", None) for parse, form in test.items(): stem_info = lexicon.stem_info(lemma, parse, context=location) if stem_info is None: print("couldn't get stem info for {} {}".format(lemma, parse)) continue ending_info = endings.ending_info(form, parse) valid_stems = ( set(strip_length(info.stem) for info in stem_info) & set(info.stem for info in ending_info)) if len(valid_stems) != 1: print(form, parse, lemma)
if ccat_parse[3] == "N": parse = ccat_parse[1:4] elif ccat_parse[3] == "P": parse = ccat_parse[1:4] + "." + ccat_parse[4:7] elif ccat_parse[3] == "I": parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5] else: continue stem_info = lexicon.stem_info(lemma, parse, context=row["bcv"]) if stem_info is None: print("couldn't get stem info for {} {}".format(lemma, parse)) continue ending_info = endings.ending_info(form, parse) valid_stems = ( set(strip_length(info.stem.replace("|", "")) for info in stem_info) & set(info.stem for info in ending_info)) if len(valid_stems) != 1: print(form, parse, lemma, len(valid_stems)) print(" {}".format(stem_info)) print(" {}".format(ending_info)) for valid_stem in valid_stems: for info in stem_info: if info.stem == valid_stem: print(" {}".format(info)) for info in ending_info: if info.stem == valid_stem: print(" {}".format(info))
lemma = row["lemma"] if ccat_pos != "V-": continue if ccat_parse[3] == "N": parse = ccat_parse[1:4] elif ccat_parse[3] == "P": parse = ccat_parse[1:4] + "." + ccat_parse[4:7] elif ccat_parse[3] == "I": parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5] else: continue predicted = lexicon.generate(lemma, parse) if predicted is None: counter.fail("didn't know how to work out {} {} {}".format( lemma, parse, norm)) elif strip_length(norm) == strip_length(predicted): counter.success() continue elif strip_length(norm) not in [ strip_length(p) for p in predicted.split("/") ]: counter.fail("{} {} got {} instead of {} in {}".format( lemma, parse, predicted, norm, row["bcv"])) else: counter.skip("{} {} {} {} {}".format(lemma, parse, norm, predicted, row["bcv"])) counter.results()
words[lemma]["aorist.mp2.actual"].add(norm) lexicon = Lexicon("lexicons/morphgnt.yaml") for k in sorted(words.keys(), key=collator.sort_key): PAN_generated = lexicon.generate(k, "PAN") PMN_generated = lexicon.generate(k, "PMN") PPN_generated = lexicon.generate(k, "PPN") AAN_generated = lexicon.generate(k, "AAN") AMN_generated = lexicon.generate(k, "AMN") APN_generated = lexicon.generate(k, "APN") if PAN_generated: if "present.act.actual" not in words[k]: words[k]["present.act.generated"].add(strip_length(PAN_generated)) if PMN_generated: if "present.mp1.actual" not in words[k]: words[k]["present.mp1.generated"].add(strip_length(PMN_generated)) if PPN_generated: if "present.mp1.actual" not in words[k]: words[k]["present.mp1.generated"].add(strip_length(PPN_generated)) if AAN_generated: if "aorist.act.actual" not in words[k]: words[k]["aorist.act.generated"].add(strip_length(AAN_generated)) if AMN_generated: if "aorist.mp1.actual" not in words[k]: words[k]["aorist.mp1.generated"].add(strip_length(AMN_generated)) if APN_generated: if "aorist.mp2.actual" not in words[k]: words[k]["aorist.mp2.generated"].add(strip_length(APN_generated))
if ccat_parse[1:4] == "APN": words[lemma]["aorist.mp2.actual"].add(norm) lexicon = Lexicon("lexicons/morphgnt.yaml") for k in sorted(words.keys(), key=collator.sort_key): PAN_generated = lexicon.generate(k, "PAN") PMN_generated = lexicon.generate(k, "PMN") PPN_generated = lexicon.generate(k, "PPN") AAN_generated = lexicon.generate(k, "AAN") AMN_generated = lexicon.generate(k, "AMN") APN_generated = lexicon.generate(k, "APN") if PAN_generated: if "present.act.actual" not in words[k]: words[k]["present.act.generated"].add(strip_length(PAN_generated)) if PMN_generated: if "present.mp1.actual" not in words[k]: words[k]["present.mp1.generated"].add(strip_length(PMN_generated)) if PPN_generated: if "present.mp1.actual" not in words[k]: words[k]["present.mp1.generated"].add(strip_length(PPN_generated)) if AAN_generated: if "aorist.act.actual" not in words[k]: words[k]["aorist.act.generated"].add(strip_length(AAN_generated)) if AMN_generated: if "aorist.mp1.actual" not in words[k]: words[k]["aorist.mp1.generated"].add(strip_length(AMN_generated)) if APN_generated: if "aorist.mp2.actual" not in words[k]: words[k]["aorist.mp2.generated"].add(strip_length(APN_generated))
for book_num in range(1, 28):
    for row in morphgnt_rows(book_num):
        ccat_pos = row["ccat-pos"]
        ccat_parse = row["ccat-parse"]
        norm = row["norm"]
        lemma = row["lemma"]

        if ccat_pos != "V-":
            continue

        if ccat_parse[3] == "N":
            parse = ccat_parse[1:4]
        elif ccat_parse[3] == "P":
            parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
        elif ccat_parse[3] == "I":
            parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5]
        else:
            continue

        predicted = lexicon.generate(lemma, parse)
        if predicted is None:
            counter.fail("didn't know how to work out {} {} {}".format(
                lemma, parse, norm))
        elif strip_length(norm) == strip_length(predicted):
            counter.success()
        elif strip_length(norm) not in [
                strip_length(p) for p in predicted.split("/")]:
            counter.fail("{} {} got {} instead of {} in {}".format(
                lemma, parse, predicted, norm, row["bcv"]))
        else:
            counter.skip("{} {} {} {} {}".format(
                lemma, parse, norm, predicted, row["bcv"]))

counter.results()

def generate(self, lemma, parse, allow_form_override=True, context=None):
    answers = []
    stems = None
    accent_override = None
    is_enclitic = False
    ending_override = None

    if lemma in self.lexicon:
        if allow_form_override:
            answer = self.lexicon[lemma].get("forms", {}).get(parse)
            if answer:
                return answer
        stems = self.regex_list(lemma, parse, context)
        if "." in parse:
            accents = self.lexicon[lemma].get("accents", {}).get(parse.split(".")[0])
            if accents == "enclitic":
                is_enclitic = True
            else:
                accent_override = accents
        ending_override = self.lexicon[lemma].get("endings", {}).get(parse)

    if stems is None:
        return
    else:
        stems = stems.split("/")

    if parse not in stemming_rules:
        return

    for stem in stems:
        stem = debreath(stem)
        pairs = stemming_rules[parse]
        while isinstance(pairs, dict) and "ref" in pairs:
            if pairs["ref"] in stemming_rules:
                pairs = stemming_rules[pairs["ref"]]
            else:
                # @@@ raise error?
                return
        base_endings = []
        default = []
        for rule in pairs:
            s1, s234, s5 = rule.split("|")
            s2, s34 = s234.split(">")
            s3, s4 = s34.split("<")
            if stem.endswith(strip_accents(s1 + s2)):
                if s2:
                    base = stem[:-len(s2)]
                else:
                    base = stem
            else:
                continue
            if ending_override:
                ending_list = ending_override.split("/")
            else:
                ending_list = [s3 + s5]
            if s1 + s2:
                base_endings.append((base, ending_list))
            else:
                default.append((base, ending_list))
        # only use default if there are no other options
        if len(base_endings) == 0 and len(default) > 0:
            base_endings = default
        for base, ending_list in base_endings:
            for ending in ending_list:
                if accent(ending):
                    answers.append((base + ending).replace("|", ""))
                elif is_enclitic:
                    answers.append(make_oxytone(base + ending).replace("|", ""))
                else:
                    if parse[2] == "P":
                        if accent_override:
                            answers.append(persistent(base + ending, accent_override))
                        elif parse == "AAP.NSM" and ending == "ων":
                            answers.append(make_oxytone(base + ending).replace("|", ""))
                        elif parse == "AAP.NSM" and ending == "_3+ς":
                            answers.append(make_oxytone(base + ending).replace("|", ""))
                        elif parse == "PAP.NSM" and ending == "_3+ς":
                            answers.append(make_oxytone(base + ending).replace("|", ""))
                        elif parse[0:3] == "AAP" and parse != "AAP.NSM":
                            # calculate NSM
                            nsms = self.generate(lemma, "AAP.NSM", context=context)
                            nsms = nsms.split("/")
                            for nsm in nsms:
                                if nsm.endswith(("ών", "ούς")):
                                    answers.append(persistent(base + ending, nsm))
                                else:
                                    answers.append(persistent(base + ending, lemma))
                        elif parse[0:3] == "PAP" and parse != "PAP.NSM":
                            # calculate NSM
                            nsms = self.generate(lemma, "PAP.NSM").split("/")
                            for nsm in nsms:
                                nsm = strip_length(nsm)
                                answers.append(persistent(base + ending, nsm))
                        else:
                            answers.append(recessive(base + ending, default_short=True))
                    elif parse[0:3] in ["AAN", "XAN", "XMN", "XPN"]:
                        answers.append(on_penult(base + ending, default_short=True))
                    elif parse[0:3] == "PAN" and stem.endswith("!"):
                        answers.append(on_penult(base + ending, default_short=True))
                    else:
                        answers.append(recessive(base + ending, default_short=True))

    return "/".join(remove_duplicates(rebreath(w) for w in answers))

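# Hedged usage sketch for generate(): output values are illustrative, not
# asserted. None means no stem was found; otherwise alternatives come back
# joined with "/", as the test harnesses above assume.
lexicon = Lexicon("lexicons/morphgnt.yaml")
result = lexicon.generate("λύω", "AAI.3S")
if result:
    for alternative in result.split("/"):
        print(alternative)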
if ccat_parse[3] == "N": parse = ccat_parse[1:4] elif ccat_parse[3] == "P": parse = ccat_parse[1:4] + "." + ccat_parse[4:7] elif ccat_parse[3] == "I": parse = ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5] else: continue stem_info = lexicon.stem_info(lemma, parse, context=row["bcv"]) if stem_info is None: print("couldn't get stem info for {} {}".format(lemma, parse)) continue ending_info = endings.ending_info(form, parse) valid_stems = (set( strip_length(info.stem.replace("|", "")) for info in stem_info) & set(info.stem for info in ending_info)) if len(valid_stems) != 1: print(form, parse, lemma, len(valid_stems)) print(" {}".format(stem_info)) print(" {}".format(ending_info)) for valid_stem in valid_stems: for info in stem_info: if info.stem == valid_stem: print(" {}".format(info)) for info in ending_info: if info.stem == valid_stem: print(" {}".format(info))