def test_lexicon(self):
    """Adding one stem registers it in both lexicon indexes."""
    lex = Lexicon()
    lex.add("FOO", "bar", "foo")
    expected_stems = [("bar", "foo", set())]
    expected_lemma_keys = {("FOO", "bar", ())}
    self.assertEqual(lex.lemma_to_stems["FOO"], expected_stems)
    self.assertEqual(lex.stem_to_lemma_key_regex["foo"], expected_lemma_keys)
def setUp(self):
    """Wire an Inflexion engine to a one-entry lexicon and one rule."""
    word_store = Lexicon()
    word_store.add("FOO", "bar", "foo")
    rule_set = StemmingRuleSet()
    self.rule = rule_set.add("barista", "|o><|llow")
    engine = Inflexion()
    engine.add_lexicon(word_store)
    engine.add_stemming_rule_set(rule_set)
    self.inflexion = engine
def test_find_stems_with_tags_2(self):
    """With no tag filter, find_stems returns stems of every tag."""
    lex = Lexicon()
    for stem, tag in (("faa", "-a"), ("fee", "-b")):
        lex.add("FOO", "bar", stem, {tag})
    self.assertEqual(lex.find_stems("FOO", "barista"), {"faa", "fee"})
def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    """
    Load a lexicon YAML file into a Lexicon plus override tables.

    Each entry's "stems" mapping is keyed by principal-part number,
    translated to a parse-key regex via the table below. Entries with
    no "stems" key are skipped entirely. Every stem is passed through
    *pre_processor* before being added.

    Returns a (lexicon, form_override, accent_override) triple where
    form_override maps (lemma, key) to a fixed surface form and
    accent_override maps lemma to a list of (key_regex, form) pairs.
    """
    lexicon = Lexicon()
    # principal-part number -> regex over parse keys it supplies stems for
    partnum_to_key_regex = {
        "1-": "P", "1-A": "PA", "1-M": "PM", "1+": "I",
        "2-": "F[AM]", "2-A": "FA", "2-M": "FM",
        "3-": "A[AM][NPDSO]", "3+": "A[AM]I", "3+A": "AAI", "3+M": "AMI",
        "4-": "XA", "4+": "YA",
        "5-": "X[MP]", "5+": "Y[MP]",
        "6-": "AP[NPDSO]", "6+": "API",
        "7-": "FP",
    }
    form_override = {}
    accent_override = defaultdict(list)
    with open(lexicon_file) as f:
        # safe_load: lexicon files are plain mappings, and yaml.load()
        # without an explicit Loader is a TypeError on PyYAML >= 6.0
        # (and unsafe on untrusted input besides).
        for lemma, entry in yaml.safe_load(f).items():
            if "stems" not in entry:
                continue
            # (removed a dead `stems = []` that was immediately shadowed
            # by the loop targets below)
            for partnum, stems in sorted(entry["stems"].items()):
                key_regex = partnum_to_key_regex[partnum]
                for stem, tag in split_stem_tags(stems):
                    lexicon.add(lemma, key_regex, pre_processor(stem), tag)
            for key_regex, stems in entry.get("stem_overrides", []):
                if stems is None:
                    continue
                for stem, tag in split_stem_tags(stems):
                    lexicon.add(lemma, key_regex, pre_processor(stem), tag)
            for key, form in entry.get("forms", {}).items():
                form_override[(lemma, key)] = form
            for key_regex, form in entry.get("accents", []):
                accent_override[lemma].append((key_regex, form))
    return lexicon, form_override, accent_override
def test_lexicon(self):
    """A single add() populates the forward and reverse indexes."""
    lexicon = Lexicon()
    lexicon.add("FOO", "bar", "foo")
    self.assertEqual(lexicon.lemma_to_stems["FOO"], [("bar", "foo", set())])
    reverse = lexicon.stem_to_lemma_key_regex["foo"]
    self.assertEqual(reverse, {("FOO", "bar", ())})
def __init__(self, stemming_file, lexicon_file=None, strip_length=False):
    """
    Load stemming rules (and optionally a lexicon with its form and
    accent overrides) and wire both into an Inflexion engine.
    """
    self.ruleset = load_stemming(stemming_file, strip_length)
    if lexicon_file:
        (self.lexicon,
         self.form_override,
         self.accent_override) = load_lexicon(
            lexicon_file, pre_processor=debreath)
    else:
        # no lexicon file given: start with empty tables
        self.lexicon = Lexicon()
        self.form_override = {}
        self.accent_override = defaultdict(list)
    engine = Inflexion()
    engine.add_lexicon(self.lexicon)
    engine.add_stemming_rule_set(self.ruleset)
    self.inflexion = engine
class GreekInflexion:
    """
    Facade tying a stemming rule set and a lexicon (with form/accent
    overrides and segmented lemmas) into an Inflexion engine, with
    helpers for generating, parsing, and printing paradigms.
    """

    def __init__(self, stemming_file, lexicon_file=None, strip_length=False):
        self.ruleset = load_stemming(stemming_file, strip_length)
        if lexicon_file:
            # stems are stored debreathed; rebreath restores breathing
            # marks on the way back out (see find_stems/possible_stems)
            (self.lexicon, self.form_override, self.accent_override,
                self.segmented_lemmas) = load_lexicon(
                    lexicon_file, pre_processor=debreath)
        else:
            # no lexicon file: start with empty tables
            self.lexicon = Lexicon()
            self.form_override = {}
            self.accent_override = defaultdict(list)
            self.segmented_lemmas = {}
        self.inflexion = Inflexion()
        self.inflexion.add_lexicon(self.lexicon)
        self.inflexion.add_stemming_rule_set(self.ruleset)

    def find_stems(self, lemma, key, tags=None):
        # rebreath undoes the debreath applied when the lexicon was loaded
        return self.lexicon.find_stems(
            lemma, key, tags, stem_post_processor=rebreath)

    def generate(self, lemma, key, tags=None):
        """
        Return a dict mapping accented surface forms of (lemma, key) to
        lists of detail dicts. A form override from the lexicon file
        short-circuits generation entirely.
        """
        overrides = self.form_override.get((lemma, key))
        if overrides:
            # a single override may be written as a bare string
            if isinstance(overrides, str):
                overrides = [overrides]
            return {
                override: [{"override": "form"}] for override in overrides
            }
        generated = defaultdict(list)
        for orig_form, details in self.inflexion.generate(
                lemma, key, tags).items():
            for detail in details:
                segmented_lemma = self.segmented_lemmas.get(lemma)
                accent_form, accent_notes = calculate_accent(
                    orig_form, key, lemma, segmented_lemma, detail["stem"],
                    self.inflexion, self.accent_override)
                # keep the pre-accentuation form and any accent notes
                # alongside the engine's own detail fields
                detail.update({"original_form": orig_form})
                detail.update({"accent_notes": accent_notes})
                generated[accent_form].append(detail)
        return generated

    def possible_stems(self, form, key_regex=None):
        # Yield (key, stem) candidates for form, optionally filtered by
        # a regex over the parse key.
        for key, stem in self.ruleset.possible_stems(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                # NOTE(review): bare "h" stems are suppressed here —
                # presumably a degenerate artefact of debreathing;
                # confirm against the rule set.
                if stem != "h":
                    yield key, rebreath(strip_accents(stem))

    def possible_stems2(self, form, key_regex=None):
        # Variant using the rule set's alternative possible_stems2
        # algorithm; note it does NOT filter out "h" stems.
        for key, stem in self.ruleset.possible_stems2(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def parse(self, form):
        # strip accents so parsed stems match the normalised lexicon stems
        return self.inflexion.parse(
            debreath(form), stem_post_processor=strip_accents)

    def conjugate(self, lemma, *TVMs, tags=None):
        """
        Print a YAML-style conjugation table for *lemma*.

        Each TVM is a tense/voice/mood parse prefix; its third character
        selects which cells are generated (I/S/O: six person-numbers;
        D: second/third persons; N: single cell; P: participle
        case/number/gender cells).
        """
        print("-")
        print(" lemma: {}".format(lemma))
        if tags:
            print()
            print(" tags:")
            for tag in tags:
                print(" - {}".format(tag))
        for TVM in TVMs:
            print()
            if TVM[2] in "ISO":
                # six person-number cells
                for PN in ["1S", "2S", "3S", "1P", "2P", "3P"]:
                    parse = TVM + "." + PN
                    form = "/".join(
                        self.generate(lemma, parse, tags=tags).keys())
                    if form:
                        print(" {}: {}".format(parse, form))
            elif TVM[2] == "D":
                # second/third person cells only
                if "." not in TVM:
                    for PN in ["2S", "3S", "2P", "3P"]:
                        parse = TVM + "." + PN
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print(" {}: {}".format(parse, form))
                else:
                    # NOTE(review): `parse` here is left over from a
                    # previous branch/iteration rather than derived from
                    # this TVM — looks like a bug; confirm intent.
                    form = "/".join(
                        self.generate(lemma, parse, tags=tags).keys())
                    if form:
                        print(" {}: {}".format(TVM, form))
            elif TVM[2] == "N":
                # single cell: the TVM itself is the full parse key
                parse = TVM
                form = "/".join(self.generate(lemma, parse, tags=tags).keys())
                if form:
                    print(" {}: {}".format(parse, form))
            elif TVM[2] == "P":
                # participle: nominative (and genitive) singular cells
                # across the three genders
                if TVM.endswith(".N"):
                    for NG in ["SM", "SF", "SN"]:
                        parse = TVM + NG
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print(" {}: {}".format(parse, form))
                else:
                    for CNG in ["NSM", "NSF", "NSN",
                                "GSM", "GSF", "GSN"]:
                        parse = TVM + "." + CNG
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print(" {}: {}".format(parse, form))
        print()
        print()

    def decline(self, lemma, TVM, tags=None):
        """
        Print the full case/number paradigm of a participle TVM across
        all three genders; raises ValueError for non-participle TVMs.
        """
        if TVM[2] != "P":
            raise ValueError
        print("-")
        print(" lemma: {}".format(lemma))
        for G in "MFN":
            print()
            for CN in [
                "NS", "GS", "DS", "AS", "VS", "NP", "VP", "GP", "DP", "AP"
            ]:
                parse = TVM + "." + CN + G
                form = "/".join(self.generate(lemma, parse, tags=tags).keys())
                print(" {}: {}".format(parse, form))
        print()
        print()
def test_find_stems_with_tags_2(self):
    """Stems carrying different tags are all returned when unfiltered."""
    lex = Lexicon()
    lex.add("FOO", "bar", "faa", {"-a"})
    lex.add("FOO", "bar", "fee", {"-b"})
    result = lex.find_stems("FOO", "barista")
    self.assertEqual(result, {"faa", "fee"})
def test_find_stems(self):
    """A stem added under a key regex is found for a matching key."""
    lex = Lexicon()
    lex.add("FOO", "bar", "foo")
    stems = lex.find_stems("FOO", "barista")
    self.assertEqual(stems, {"foo"})
def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    """
    Load a lexicon YAML file into a Lexicon plus override tables.

    Stems listed under an entry's "stems" mapping are keyed by
    principal-part number (or gender letter) and translated to a
    parse-key regex via the table below. Unlike the older loader,
    entries without stems still contribute stem/form/accent overrides.
    Every stem is passed through *pre_processor* before being added.

    Returns a (lexicon, form_override, accent_override) triple where
    form_override maps (lemma, key) to a fixed surface form and
    accent_override maps lemma to a list of (key_regex, form) pairs.
    """
    lexicon = Lexicon()
    # principal-part number / gender -> regex over the parse keys it covers
    partnum_to_key_regex = {
        "1-": "P", "1-A": "PA", "1-M": "PM", "1+": "I",
        "2-": "F[AM]", "2-A": "FA", "2-M": "FM",
        "3-": "A[AM][NPDSO]", "3+": "A[AM]I", "3+A": "AAI", "3+M": "AMI",
        "4-": "XA", "4+": "YA",
        "5-": "X[MP]", "5+": "Y[MP]",
        "6-": "AP[NPDSO]", "6+": "API",
        "7-": "FP",
        "8-": "Z[MP]",
        "M": "..M", "F": "..F", "N": "..N",
    }
    form_override = {}
    accent_override = defaultdict(list)
    with open(lexicon_file) as f:
        # safe_load: lexicon files are plain mappings, and yaml.load()
        # without an explicit Loader is a TypeError on PyYAML >= 6.0
        # (and unsafe on untrusted input besides).
        for lemma, entry in yaml.safe_load(f).items():
            if "stems" in entry:
                # (removed a dead `stems = []` that was immediately
                # shadowed by the loop target below)
                # "stems" may be present but null in the YAML, hence
                # the extra falsiness check
                for partnum, stems in sorted(
                        (entry["stems"] if entry.get("stems") else {}).items()):
                    key_regex = partnum_to_key_regex[partnum]
                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(lemma, key_regex, pre_processor(stem), tag)
            for key_regex, stems in entry.get("stem_overrides", []):
                if stems is None:
                    continue
                for stem, tag in split_stem_tags(stems):
                    lexicon.add(lemma, key_regex, pre_processor(stem), tag)
            for key, form in entry.get("forms", {}).items():
                form_override[(lemma, key)] = form
            for key_regex, form in entry.get("accents", []):
                accent_override[lemma].append((key_regex, form))
    return lexicon, form_override, accent_override
""" encapsulates details of file format used for stemming.yaml, and lexicon YAMLs, providing functions for loading the file and populating StemmingRuleSet or Lexicon (with form and accent overrides) """ from collections import defaultdict import yaml from greek_accentuation.characters import strip_length as do_strip_length from inflexion.lexicon import Lexicon from inflexion.stemming import StemmingRuleSet class RefDoesNotExistException(Exception): pass def load_stemming(stemming_file, strip_length=False): ruleset = StemmingRuleSet() with open(stemming_file) as f: stemming_dict = yaml.load(f) for key, rules in stemming_dict.items(): while isinstance(rules, dict) and "ref" in rules: if rules["ref"] in stemming_dict:
class GreekInflexion:
    """
    Facade tying a stemming rule set and a lexicon (with form and
    accent overrides) into an Inflexion engine, with helpers for
    generating, parsing, and printing paradigms.
    """

    def __init__(self, stemming_file, lexicon_file=None, strip_length=False):
        self.ruleset = load_stemming(stemming_file, strip_length)
        if lexicon_file:
            # stems are stored debreathed; rebreath restores breathing
            # marks on the way back out (see find_stems/possible_stems)
            self.lexicon, self.form_override, self.accent_override = \
                load_lexicon(lexicon_file, pre_processor=debreath)
        else:
            # no lexicon file: start with empty tables
            self.lexicon = Lexicon()
            self.form_override = {}
            self.accent_override = defaultdict(list)
        self.inflexion = Inflexion()
        self.inflexion.add_lexicon(self.lexicon)
        self.inflexion.add_stemming_rule_set(self.ruleset)

    def find_stems(self, lemma, key, tags=None):
        # rebreath undoes the debreath applied when the lexicon was loaded
        return self.lexicon.find_stems(
            lemma, key, tags, stem_post_processor=rebreath)

    def generate(self, lemma, key, tags=None):
        """
        Return a dict mapping accented surface forms of (lemma, key) to
        lists of detail dicts. A form override from the lexicon file
        short-circuits generation entirely.
        """
        overrides = self.form_override.get((lemma, key))
        if overrides:
            # a single override may be written as a bare string
            if isinstance(overrides, str):
                overrides = [overrides]
            return {
                override: [{"override": "form"}]
                for override in overrides
            }
        generated = defaultdict(list)
        for orig_form, details in self.inflexion.generate(
                lemma, key, tags).items():
            for detail in details:
                accent_form = calculate_accent(
                    orig_form, key, lemma, detail["stem"],
                    self.inflexion, self.accent_override)
                # keep the pre-accentuation form alongside the engine's
                # own detail fields
                detail.update({"original_form": orig_form})
                generated[accent_form].append(detail)
        return generated

    def possible_stems(self, form, key_regex=None):
        # Yield (key, stem) candidates for form, optionally filtered by
        # a regex over the parse key.
        for key, stem in self.ruleset.possible_stems(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def possible_stems2(self, form, key_regex=None):
        # Variant using the rule set's alternative possible_stems2
        # algorithm.
        for key, stem in self.ruleset.possible_stems2(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def parse(self, form):
        # strip accents so parsed stems match the normalised lexicon stems
        return self.inflexion.parse(
            debreath(form), stem_post_processor=strip_accents)

    def conjugate(self, lemma, *TVMs, tags=None):
        """
        Print a YAML-style conjugation table for *lemma*.

        Each TVM is a tense/voice/mood parse prefix; its third character
        selects which cells are generated (I/S/O: six person-numbers;
        D: second/third persons; N: single cell; P: participle
        case/number/gender cells).
        """
        print("-")
        print(" lemma: {}".format(lemma))
        if tags:
            print()
            print(" tags:")
            for tag in tags:
                print(" - {}".format(tag))
        for TVM in TVMs:
            print()
            if TVM[2] in "ISO":
                # six person-number cells
                for PN in ["1S", "2S", "3S", "1P", "2P", "3P"]:
                    parse = TVM + "." + PN
                    form = "/".join(
                        self.generate(lemma, parse, tags=tags).keys())
                    if form:
                        print(" {}: {}".format(parse, form))
            elif TVM[2] == "D":
                # second/third person cells only
                if "." not in TVM:
                    for PN in ["2S", "3S", "2P", "3P"]:
                        parse = TVM + "." + PN
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print(" {}: {}".format(parse, form))
                else:
                    # NOTE(review): `parse` here is left over from a
                    # previous branch/iteration rather than derived from
                    # this TVM — looks like a bug; confirm intent.
                    form = "/".join(
                        self.generate(lemma, parse, tags=tags).keys())
                    if form:
                        print(" {}: {}".format(TVM, form))
            elif TVM[2] == "N":
                # single cell: the TVM itself is the full parse key
                parse = TVM
                form = "/".join(
                    self.generate(lemma, parse, tags=tags).keys())
                if form:
                    print(" {}: {}".format(parse, form))
            elif TVM[2] == "P":
                # participle: nominative (and genitive) singular cells
                # across the three genders
                if TVM.endswith(".N"):
                    for NG in ["SM", "SF", "SN"]:
                        parse = TVM + NG
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print(" {}: {}".format(parse, form))
                else:
                    for CNG in ["NSM", "NSF", "NSN",
                                "GSM", "GSF", "GSN"]:
                        parse = TVM + "." + CNG
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print(" {}: {}".format(parse, form))
        print()
        print()

    def decline(self, lemma, TVM, tags=None):
        """
        Print the full case/number paradigm of a participle TVM across
        all three genders; raises ValueError for non-participle TVMs.
        """
        if TVM[2] != "P":
            raise ValueError
        print("-")
        print(" lemma: {}".format(lemma))
        for G in "MFN":
            print()
            for CN in [
                "NS", "GS", "DS", "AS", "VS", "NP", "VP", "GP", "DP", "AP"
            ]:
                parse = TVM + "." + CN + G
                form = "/".join(
                    self.generate(lemma, parse, tags=tags).keys())
                print(" {}: {}".format(parse, form))
        print()
        print()