def get_stem_set(self, parse, norm, test_length):
    stem_set = set()
    norm = debreath(norm)
    if parse in stemming_rules:
        pairs = stemming_rules[parse]
        # follow "ref" indirections until we reach an actual rule list
        while isinstance(pairs, dict) and "ref" in pairs:
            if pairs["ref"] in stemming_rules:
                pairs = stemming_rules[pairs["ref"]]
            else:
                raise Exception(
                    "ref to {} which doesn't exist".format(pairs["ref"]))
        for entry in pairs:
            if not test_length:
                entry = strip_length(entry)
            # each rule entry has the form "s1|s2>s3<s4|s5"
            s1, s234, s5 = entry.split("|")
            s2, s34 = s234.split(">")
            s3, s4 = s34.split("<")
            # escape parentheses so they match literally in the regex
            s3 = s3.replace("(", "\\(").replace(")", "\\)")
            s5 = s5.replace("(", "\\(").replace(")", "\\)")
            regex_pair = ("(.*{}){}{}$".format(s1, s3, s5), s2)
            if re.match(regex_pair[0], norm):
                stem_set.add(
                    rebreath(
                        strip_accents(
                            re.sub(regex_pair[0],
                                   r"\1" + regex_pair[1], norm))))
    else:
        return None
    return stem_set
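# A minimal worked illustration (not part of the original source) of the
# "s1|s2>s3<s4|s5" rule format consumed above. The entry "|ω>ομεν<|" is a
# hypothetical rule chosen for clarity: it maps a surface form ending in
# -ομεν back to a stem ending in -ω.
#
#     import re
#     entry = "|ω>ομεν<|"
#     s1, s234, s5 = entry.split("|")    # s1 == "", s5 == ""
#     s2, s34 = s234.split(">")          # s2 == "ω"
#     s3, s4 = s34.split("<")            # s3 == "ομεν", s4 == ""
#     pattern = "(.*{}){}{}$".format(s1, s3, s5)   # "(.*)ομεν$"
#     assert re.sub(pattern, r"\1" + s2, "λεγομεν") == "λεγω"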
def ending_info(self, form, parse, test_length=False):
    stem_set = set()
    form = debreath(form)
    if parse in self.endings:
        pairs = self.endings[parse]
        while isinstance(pairs, dict) and "ref" in pairs:
            if pairs["ref"] in self.endings:
                pairs = self.endings[pairs["ref"]]
            else:
                raise Exception(
                    "ref to {} which doesn't exist".format(pairs["ref"]))
        for entry in pairs:
            if not test_length:
                entry = strip_length(entry)
            s1, s234, s5 = entry.split("|")
            s2, s34 = s234.split(">")
            s3, s4 = s34.split("<")
            s3 = s3.replace("(", "\\(").replace(")", "\\)")
            s5 = s5.replace("(", "\\(").replace(")", "\\)")
            regex_pair = ("(.*{}){}{}$".format(s1, s3, s5), s2)
            if re.match(regex_pair[0], form):
                stem = rebreath(
                    strip_accents(
                        re.sub(regex_pair[0],
                               r"\1" + regex_pair[1], form)))
                stem_set.add(EndingInfo(stem, (s1, s2, s3, s4, s5)))
    else:
        return None
    return stem_set
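# EndingInfo is used above but not defined in this excerpt. A sketch of one
# possible definition, assuming it is a simple record pairing the recovered
# stem with the five rule components (the field names are an assumption,
# not taken from the source):
#
#     from collections import namedtuple
#     EndingInfo = namedtuple("EndingInfo", ["stem", "components"])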
from collections import defaultdict

from characters import strip_accents, strip_breathing

CELLS = defaultdict(lambda: defaultdict(set))
GENDER = defaultdict(set)
LEMMAS = defaultdict(set)

with open("nominals.txt") as f:
    for line in f:
        (lemma, mounce1, aspect_voice, gender, mounce2, theme1,
         case_number, norm, theme2, distinguisher,
         explanation) = line.strip().split()
        CELLS[mounce2][case_number + gender].add(
            (strip_breathing(strip_accents(distinguisher)), explanation))
        GENDER[mounce2].add(gender)
        LEMMAS[mounce2 + " " + gender].add(lemma)

# sort Mounce categories ignoring the hyphen (e.g. "n-1a" sorts as "n1a")
for mounce in sorted(CELLS, key=lambda x: x[0] + x[2:]):
    for gender in ["M", "F", "N", "-"]:
        if gender in GENDER[mounce]:
            print("\n\n{} {} ({}):".format(
                mounce, gender, len(LEMMAS[mounce + " " + gender])))
            for case_number in [
                "NS", "GS", "DS", "AS", "VS",
                "NP", "VP", "GP", "DP", "AP",
            ]:
                if case_number + gender in CELLS[mounce]:
                    if len(CELLS[mounce][case_number + gender]) == 1:
                        cell = CELLS[mounce][case_number + gender].pop()
                        # the excerpt was truncated here; the arguments
                        # below are an assumption based on the format
                        # string and the (distinguisher, explanation)
                        # tuple stored in each cell
                        print(" {}: {:10} {{{}}}".format(
                            case_number, cell[0], cell[1]))
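# The nested defaultdict above builds, per Mounce category, a map from each
# case+number+gender cell to the set of (distinguisher, explanation) pairs
# seen for it. A minimal sketch of the same idiom (values invented for
# illustration):
#
#     from collections import defaultdict
#     cells = defaultdict(lambda: defaultdict(set))
#     cells["n-2a"]["NSM"].add(("ος", "second-declension"))
#     assert cells["n-2a"]["NSM"] == {("ος", "second-declension")}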
else:
    lexeme = lexemes[lemma]
    try:
        mounce_cat = lexeme["mounce-morphcat"]
    except KeyError:
        error("{} has no mounce-morphcat".format(lemma))
    if not isinstance(mounce_cat, list):
        mounce_cat = [mounce_cat]
    for cat in mounce_cat:
        mounce_by_lemma[lemma].add(cat)
    new_mounce_cat = map_non_noun_categories(
        mounce_cat, aspect_voice, gender, lemma)
    orig_norm = norm
    norm = decompose_breathing(strip_accents(norm))
    success = False
    for ending_and_class_regex in noun_endings[case_number + gender]:
        try:
            ending, class_regex, explanation = ending_and_class_regex.split()
        except ValueError:
            error("{}\n{} {}".format(
                row["bcv"], case_number + gender, ending_and_class_regex))
        if norm.endswith(ending.replace(".", "")):
            # collect every Mounce category whose class regex matches
            success = set()
            for cat in new_mounce_cat:
                if re.match(class_regex, cat):
                    success.add(cat)
            if success:
                break
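# Illustration (hypothetical values, not from the source data): each
# noun_endings entry is a whitespace-separated triple of an ending, a regex
# over Mounce categories, and an explanation. A form is accepted when its
# ending matches and the lemma's category satisfies the regex.
#
#     import re
#     ending, class_regex, explanation = "ου n-2[ab] gen-sg".split()
#     assert re.match(class_regex, "n-2a")
#     assert not re.match(class_regex, "n-1b")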
def generate(self, lemma, parse, allow_form_override=True, context=None):
    answers = []
    stems = None
    accent_override = None
    is_enclitic = False
    ending_override = None

    if lemma in self.lexicon:
        if allow_form_override:
            answer = self.lexicon[lemma].get("forms", {}).get(parse)
            if answer:
                return answer
        stems = self.regex_list(lemma, parse, context)
        if "." in parse:
            # accent information is keyed by the tense/voice part of the parse
            accents = self.lexicon[lemma].get(
                "accents", {}).get(parse.split(".")[0])
            if accents == "enclitic":
                is_enclitic = True
            else:
                accent_override = accents
        ending_override = self.lexicon[lemma].get("endings", {}).get(parse)

    if stems is None:
        return
    else:
        stems = stems.split("/")

    if parse not in stemming_rules:
        return

    for stem in stems:
        stem = debreath(stem)
        pairs = stemming_rules[parse]
        while isinstance(pairs, dict) and "ref" in pairs:
            if pairs["ref"] in stemming_rules:
                pairs = stemming_rules[pairs["ref"]]
            else:
                # @@@ raise error?
                return
        base_endings = []
        default = []
        for rule in pairs:
            s1, s234, s5 = rule.split("|")
            s2, s34 = s234.split(">")
            s3, s4 = s34.split("<")
            if stem.endswith(strip_accents(s1 + s2)):
                if s2:
                    base = stem[:-len(s2)]
                else:
                    base = stem
            else:
                continue
            if ending_override:
                ending_list = ending_override.split("/")
            else:
                ending_list = [s3 + s5]
            if s1 + s2:
                base_endings.append((base, ending_list))
            else:
                default.append((base, ending_list))
        # only use default if there are no other options
        if len(base_endings) == 0 and len(default) > 0:
            base_endings = default
        for base, ending_list in base_endings:
            for ending in ending_list:
                if accent(ending):
                    # the ending already carries an accent
                    answers.append((base + ending).replace("|", ""))
                elif is_enclitic:
                    answers.append(make_oxytone(base + ending).replace("|", ""))
                else:
                    if parse[2] == "P":  # participles
                        if accent_override:
                            answers.append(
                                persistent(base + ending, accent_override))
                        elif parse == "AAP.NSM" and ending == "ων":
                            answers.append(
                                make_oxytone(base + ending).replace("|", ""))
                        elif parse == "AAP.NSM" and ending == "_3+ς":
                            answers.append(
                                make_oxytone(base + ending).replace("|", ""))
                        elif parse == "PAP.NSM" and ending == "_3+ς":
                            answers.append(
                                make_oxytone(base + ending).replace("|", ""))
                        elif parse[0:3] == "AAP" and parse != "AAP.NSM":
                            # calculate NSM
                            nsms = self.generate(
                                lemma, "AAP.NSM", context=context)
                            nsms = nsms.split("/")
                            for nsm in nsms:
                                if nsm.endswith(("ών", "ούς")):
                                    answers.append(
                                        persistent(base + ending, nsm))
                                else:
                                    answers.append(
                                        persistent(base + ending, lemma))
                        elif parse[0:3] == "PAP" and parse != "PAP.NSM":
                            # calculate NSM
                            nsms = self.generate(lemma, "PAP.NSM").split("/")
                            for nsm in nsms:
                                nsm = strip_length(nsm)
                                answers.append(persistent(base + ending, nsm))
                        else:
                            answers.append(
                                recessive(base + ending, default_short=True))
                    elif parse[0:3] in ["AAN", "XAN", "XMN", "XPN"]:
                        # infinitives accent the penult
                        answers.append(
                            on_penult(base + ending, default_short=True))
                    elif parse[0:3] == "PAN" and stem.endswith("!"):
                        answers.append(
                            on_penult(base + ending, default_short=True))
                    else:
                        answers.append(
                            recessive(base + ending, default_short=True))

    return "/".join(remove_duplicates(rebreath(w) for w in answers))
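# Hypothetical usage sketch (the generator object and the parse code are
# assumptions for illustration, following the dotted parse convention seen
# above). Multiple valid forms come back joined by "/":
#
#     forms = generator.generate("λύω", "PAI.1S")
#     candidates = forms.split("/") if forms else []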
"F": "n-1b", "N": "n-2c", }[gender] elif aspect_voice == "FP": cat = { "M": "n-2a", "F": "n-1b", "N": "n-2c", }[gender] else: assert False, aspect_voice new_mounce_cat.append(cat) orig_norm = norm norm = strip_accents(norm) norm = norm.replace("ἡ", "hη") norm = norm.replace("ὁ", "hο") norm = norm.replace("οὑ", "hου") norm = norm.replace("οἱ", "hοι") norm = norm.replace("αἱ", "hαι") norm = norm.replace("εἱ", "hει") norm = norm.replace("ἁ", "hα") norm = norm.replace("ἑ", "hε") norm = norm.replace("ὡ", "hω") norm = norm.replace("ὑ", "hυ") norm = norm.replace("ᾑ", "hῃ") norm = norm.replace("ᾡ", "hῳ") norm = norm.replace("οὐ", "ου") norm = norm.replace("ὠ", "ω") norm = norm.replace("ὀ", "ο")