Example #1
 def test_lexicon(self):
     lexicon = Lexicon()
     lexicon.add("FOO", "bar", "foo")
     self.assertEqual(lexicon.lemma_to_stems["FOO"],
                      [("bar", "foo", set())])
     self.assertEqual(lexicon.stem_to_lemma_key_regex["foo"],
                      {("FOO", "bar", ())})
Example #2
 def setUp(self):
     lexicon = Lexicon()
     lexicon.add("FOO", "bar", "foo")
     rules = StemmingRuleSet()
     self.rule = rules.add("barista", "|o><|llow")
     self.inflexion = Inflexion()
     self.inflexion.add_lexicon(lexicon)
     self.inflexion.add_stemming_rule_set(rules)
Example #3
 def test_find_stems_with_tags_2(self):
     lexicon = Lexicon()
     lexicon.add("FOO", "bar", "faa", {'-a'})
     lexicon.add("FOO", "bar", "fee", {'-b'})
     self.assertEqual(
         lexicon.find_stems("FOO", "barista"),
         {"faa", "fee"}
     )
Example #4
import yaml
from collections import defaultdict

# Lexicon and split_stem_tags are provided by the surrounding project.


def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    lexicon = Lexicon()

    # stem-section labels ("1-", "2-A", ...) mapped to regexes over the
    # morphological keys under which each stem is stored
    partnum_to_key_regex = {
        "1-": "P",
        "1-A": "PA",
        "1-M": "PM",
        "1+": "I",
        "2-": "F[AM]",
        "2-A": "FA",
        "2-M": "FM",
        "3-": "A[AM][NPDSO]",
        "3+": "A[AM]I",
        "3+A": "AAI",
        "3+M": "AMI",
        "4-": "XA",
        "4+": "YA",
        "5-": "X[MP]",
        "5+": "Y[MP]",
        "6-": "AP[NPDSO]",
        "6+": "API",
        "7-": "FP",
    }

    form_override = {}                   # (lemma, key) -> exact form override
    accent_override = defaultdict(list)  # lemma -> [(key regex, form), ...]

    with open(lexicon_file) as f:

        # safe_load sidesteps the Loader requirement of newer PyYAML releases
        for lemma, entry in yaml.safe_load(f).items():

            if "stems" not in entry:
                continue

            for partnum, stems in sorted(entry["stems"].items()):

                key_regex = partnum_to_key_regex[partnum]

                for stem, tag in split_stem_tags(stems):
                    lexicon.add(lemma, key_regex, pre_processor(stem), tag)

            for key_regex, stems in entry.get("stem_overrides", []):

                if stems is None:
                    continue

                for stem, tag in split_stem_tags(stems):
                    lexicon.add(lemma, key_regex, pre_processor(stem), tag)

            for key, form in entry.get("forms", {}).items():
                form_override[(lemma, key)] = form

            for key_regex, form in entry.get("accents", []):
                accent_override[lemma].append((key_regex, form))

    return lexicon, form_override, accent_override
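Reading the function above, the lexicon file maps each lemma to an entry with optional "stems", "stem_overrides", "forms" and "accents" sections. A minimal usage sketch follows; the file name, lemma, keys and values are illustrative, and the exact stem/tag syntax inside each stems value is whatever split_stem_tags accepts.

# Hypothetical lexicon.yaml (structure inferred from load_lexicon above):
#
#     FOO:
#       stems:
#         "1-": foo
#       forms:
#         PAI.1S: fooo
#       accents:
#         - [PAI, fóo]

lexicon, form_override, accent_override = load_lexicon(
    "lexicon.yaml",
    pre_processor=str.lower,  # e.g. normalise stems before they are stored
)

print(lexicon.find_stems("FOO", "PAI.1S"))   # the "1-" stem is stored under key regex "P"
print(form_override.get(("FOO", "PAI.1S")))  # "fooo"
print(accent_override["FOO"])                # [("PAI", "fóo")]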
Example #5
 def test_lexicon(self):
     lexicon = Lexicon()
     lexicon.add("FOO", "bar", "foo")
     self.assertEqual(
         lexicon.lemma_to_stems["FOO"],
         [("bar", "foo", set())]
     )
     self.assertEqual(
         lexicon.stem_to_lemma_key_regex["foo"],
         {("FOO", "bar", ())}
     )
Example #6
 def test_find_stems_with_tags_2(self):
     lexicon = Lexicon()
     lexicon.add("FOO", "bar", "faa", {'-a'})
     lexicon.add("FOO", "bar", "fee", {'-b'})
     self.assertEqual(lexicon.find_stems("FOO", "barista"), {"faa", "fee"})
Example #7
 def test_find_stems(self):
     lexicon = Lexicon()
     lexicon.add("FOO", "bar", "foo")
     self.assertEqual(lexicon.find_stems("FOO", "barista"), {"foo"})
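Taken together, the tests above suggest that the second argument to add is treated as a regular expression over keys: the pattern "bar" matches the key "barista" passed to find_stems. A small sketch under that assumption, with an illustrative pattern:

lexicon = Lexicon()
lexicon.add("FOO", "bar.*a", "foo")          # any regex can serve as the key pattern
print(lexicon.find_stems("FOO", "barista"))  # {"foo"}, if the pattern matches the key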
Example #8
import yaml
from collections import defaultdict


def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    lexicon = Lexicon()

    partnum_to_key_regex = {
        "1-": "P",
        "1-A": "PA",
        "1-M": "PM",
        "1+": "I",
        "2-": "F[AM]",
        "2-A": "FA",
        "2-M": "FM",
        "3-": "A[AM][NPDSO]",
        "3+": "A[AM]I",
        "3+A": "AAI",
        "3+M": "AMI",
        "4-": "XA",
        "4+": "YA",
        "5-": "X[MP]",
        "5+": "Y[MP]",
        "6-": "AP[NPDSO]",
        "6+": "API",
        "7-": "FP",
        "8-": "Z[MP]",
        "M": "..M",
        "F": "..F",
        "N": "..N",
    }

    form_override = {}
    accent_override = defaultdict(list)

    with open(lexicon_file) as f:

        for lemma, entry in yaml.safe_load(f).items():

            if "stems" in entry:

                for partnum, stems in sorted((entry["stems"] or {}).items()):

                    key_regex = partnum_to_key_regex[partnum]

                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(lemma, key_regex, pre_processor(stem), tag)

                for key_regex, stems in entry.get("stem_overrides", []):

                    if stems is None:
                        continue

                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(lemma, key_regex, pre_processor(stem), tag)

            for key, form in entry.get("forms", {}).items():
                form_override[(lemma, key)] = form

            for key_regex, form in entry.get("accents", []):
                accent_override[lemma].append((key_regex, form))

    return lexicon, form_override, accent_override
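Both versions of load_lexicon return the same three structures: the populated Lexicon, a dict of exact form overrides keyed by (lemma, key), and a per-lemma list of (key regex, form) accent overrides. A minimal sketch of how a caller might consult the override tables; the helper name and the re.match semantics are assumptions, not part of the library:

import re


def overridden_form(lemma, key, form_override, accent_override, default):
    # An exact (lemma, key) form override wins outright.
    if (lemma, key) in form_override:
        return form_override[(lemma, key)]
    # Otherwise the first accent override whose key regex matches the key applies.
    for key_regex, form in accent_override.get(lemma, []):
        if re.match(key_regex, key):
            return form
    return default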