Esempio n. 1
0
 def test_lexicon(self):
     """A single add() shows up in both directions of the lexicon mapping."""
     lex = Lexicon()
     lex.add("FOO", "bar", "foo")
     expected_stems = [("bar", "foo", set())]
     expected_reverse = {("FOO", "bar", ())}
     self.assertEqual(lex.lemma_to_stems["FOO"], expected_stems)
     self.assertEqual(lex.stem_to_lemma_key_regex["foo"], expected_reverse)
Esempio n. 2
0
 def setUp(self):
     """Wire a one-entry lexicon and a one-rule stemming set into Inflexion."""
     lex = Lexicon()
     lex.add("FOO", "bar", "foo")
     ruleset = StemmingRuleSet()
     self.rule = ruleset.add("barista", "|o><|llow")
     self.inflexion = Inflexion()
     self.inflexion.add_lexicon(lex)
     self.inflexion.add_stemming_rule_set(ruleset)
Esempio n. 3
0
 def setUp(self):
     """Prepare an Inflexion backed by a tiny lexicon and stemming rule set."""
     word_list = Lexicon()
     word_list.add("FOO", "bar", "foo")
     stemming_rules = StemmingRuleSet()
     self.rule = stemming_rules.add("barista", "|o><|llow")
     engine = Inflexion()
     engine.add_lexicon(word_list)
     engine.add_stemming_rule_set(stemming_rules)
     self.inflexion = engine
Esempio n. 4
0
 def test_find_stems_with_tags_2(self):
     """find_stems returns every tagged stem registered under a key."""
     lex = Lexicon()
     for stem, tag in (("faa", "-a"), ("fee", "-b")):
         lex.add("FOO", "bar", stem, {tag})
     self.assertEqual(lex.find_stems("FOO", "barista"), {"faa", "fee"})
Esempio n. 5
0
def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    """Load a YAML lexicon file into a Lexicon plus override tables.

    Each entry may carry:
      - "stems": principal-part number -> stem spec (possibly tagged),
        translated to a key regex via ``partnum_to_key_regex``;
      - "stem_overrides": list of (key_regex, stems) pairs;
      - "forms": per-key full-form overrides;
      - "accents": per-key-regex accent overrides.

    *pre_processor* is applied to every stem before it enters the lexicon.

    Returns ``(lexicon, form_override, accent_override)``.
    """
    lexicon = Lexicon()

    # principal-part number in the YAML -> regex over inflexion keys
    partnum_to_key_regex = {
        "1-": "P",
        "1-A": "PA",
        "1-M": "PM",
        "1+": "I",
        "2-": "F[AM]",
        "2-A": "FA",
        "2-M": "FM",
        "3-": "A[AM][NPDSO]",
        "3+": "A[AM]I",
        "3+A": "AAI",
        "3+M": "AMI",
        "4-": "XA",
        "4+": "YA",
        "5-": "X[MP]",
        "5+": "Y[MP]",
        "6-": "AP[NPDSO]",
        "6+": "API",
        "7-": "FP",
    }

    form_override = {}
    accent_override = defaultdict(list)

    with open(lexicon_file) as f:

        # NOTE(review): yaml.load with no explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if lexicon files may
        # be untrusted.
        for lemma, entry in yaml.load(f).items():

            if "stems" in entry:

                for partnum, stems in sorted(entry["stems"].items()):

                    key_regex = partnum_to_key_regex[partnum]

                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(lemma, key_regex, pre_processor(stem), tag)

                for key_regex, stems in entry.get("stem_overrides", []):

                    if stems is None:
                        continue

                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(lemma, key_regex, pre_processor(stem), tag)

            # BUG FIX: previously a bare `continue` on entries without
            # "stems" also skipped their "forms" and "accents" overrides;
            # those are independent of stems and must always be collected.
            for key, form in entry.get("forms", {}).items():
                form_override[(lemma, key)] = form

            for key_regex, form in entry.get("accents", []):
                accent_override[lemma].append((key_regex, form))

    return lexicon, form_override, accent_override
Esempio n. 6
0
 def test_lexicon(self):
     """Adding one stem populates forward and reverse lookup tables."""
     lex = Lexicon()
     lex.add("FOO", "bar", "foo")
     self.assertEqual(lex.lemma_to_stems["FOO"], [("bar", "foo", set())])
     self.assertEqual(lex.stem_to_lemma_key_regex["foo"], {("FOO", "bar", ())})
    def __init__(self, stemming_file, lexicon_file=None, strip_length=False):
        """Build the inflexion engine from a stemming-rule file.

        When *lexicon_file* is given, the lexicon and its form/accent
        overrides are loaded from it; otherwise empty ones are used.
        """
        self.ruleset = load_stemming(stemming_file, strip_length)

        if not lexicon_file:
            self.lexicon = Lexicon()
            self.form_override = {}
            self.accent_override = defaultdict(list)
        else:
            (self.lexicon,
             self.form_override,
             self.accent_override) = load_lexicon(
                 lexicon_file, pre_processor=debreath)

        self.inflexion = Inflexion()
        self.inflexion.add_lexicon(self.lexicon)
        self.inflexion.add_stemming_rule_set(self.ruleset)
Esempio n. 8
0
    def __init__(self, stemming_file, lexicon_file=None, strip_length=False):
        """Initialise rule set, lexicon (with overrides) and the engine."""
        self.ruleset = load_stemming(stemming_file, strip_length)

        if lexicon_file:
            loaded = load_lexicon(lexicon_file, pre_processor=debreath)
            self.lexicon, self.form_override, self.accent_override = loaded
        else:
            # no lexicon file: start from an empty lexicon and no overrides
            self.lexicon = Lexicon()
            self.form_override = {}
            self.accent_override = defaultdict(list)

        self.inflexion = Inflexion()
        self.inflexion.add_lexicon(self.lexicon)
        self.inflexion.add_stemming_rule_set(self.ruleset)
class GreekInflexion:
    """High-level Greek inflexion engine.

    Ties a StemmingRuleSet (from *stemming_file*) to a Lexicon with form,
    accent and segmented-lemma data (from *lexicon_file*, if given), and
    wraps the core Inflexion object with breathing- and accent-aware
    generation, parsing and stem-search helpers.
    """

    def __init__(self, stemming_file, lexicon_file=None, strip_length=False):

        self.ruleset = load_stemming(stemming_file, strip_length)

        if lexicon_file:
            (self.lexicon, self.form_override, self.accent_override,
             self.segmented_lemmas) = load_lexicon(lexicon_file,
                                                   pre_processor=debreath)
        else:
            # no lexicon file: empty lexicon and no overrides
            self.lexicon = Lexicon()
            self.form_override = {}
            self.accent_override = defaultdict(list)
            self.segmented_lemmas = {}

        self.inflexion = Inflexion()
        self.inflexion.add_lexicon(self.lexicon)
        self.inflexion.add_stemming_rule_set(self.ruleset)

    def find_stems(self, lemma, key, tags=None):
        """Return stems for *lemma* under *key*, with breathing restored."""
        return self.lexicon.find_stems(lemma,
                                       key,
                                       tags,
                                       stem_post_processor=rebreath)

    def generate(self, lemma, key, tags=None):
        """Generate accented surface forms for *lemma* under parse *key*.

        A (lemma, key) entry in ``form_override`` short-circuits generation
        entirely. Otherwise returns a dict mapping each accented form to a
        list of detail dicts (including the original unaccented form and
        any accentuation notes).
        """
        overrides = self.form_override.get((lemma, key))
        if overrides:
            if isinstance(overrides, str):
                overrides = [overrides]
            return {override: [{"override": "form"}] for override in overrides}
        generated = defaultdict(list)
        for orig_form, details in self.inflexion.generate(lemma, key,
                                                          tags).items():
            for detail in details:
                segmented_lemma = self.segmented_lemmas.get(lemma)
                accent_form, accent_notes = calculate_accent(
                    orig_form, key, lemma, segmented_lemma, detail["stem"],
                    self.inflexion, self.accent_override)
                detail.update({"original_form": orig_form})
                detail.update({"accent_notes": accent_notes})
                generated[accent_form].append(detail)
        return generated

    def possible_stems(self, form, key_regex=None):
        """Yield (key, stem) candidates for *form*, optionally filtered by
        *key_regex*; the bare stem "h" is discarded as noise."""
        for key, stem in self.ruleset.possible_stems(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                if stem != "h":
                    yield key, rebreath(strip_accents(stem))

    def possible_stems2(self, form, key_regex=None):
        """Like possible_stems, but using the rule set's possible_stems2
        and without the "h" filter."""
        for key, stem in self.ruleset.possible_stems2(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def parse(self, form):
        """Parse surface *form*, normalising breathing and accents."""
        return self.inflexion.parse(debreath(form),
                                    stem_post_processor=strip_accents)

    def conjugate(self, lemma, *TVMs, tags=None):
        """Print a YAML-ish conjugation table for each tense-voice-mood.

        The third character of each TVM selects the paradigm: indicative/
        subjunctive/optative, imperative (D), infinitive (N) or
        participle (P).
        """
        print("-")
        print("    lemma: {}".format(lemma))
        if tags:
            print()
            print("    tags:")
            for tag in tags:
                print("      - {}".format(tag))
        for TVM in TVMs:
            print()
            if TVM[2] in "ISO":
                for PN in ["1S", "2S", "3S", "1P", "2P", "3P"]:
                    parse = TVM + "." + PN
                    form = "/".join(
                        self.generate(lemma, parse, tags=tags).keys())
                    if form:
                        print("    {}: {}".format(parse, form))
            elif TVM[2] == "D":
                if "." not in TVM:
                    for PN in ["2S", "3S", "2P", "3P"]:
                        parse = TVM + "." + PN
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print("    {}: {}".format(parse, form))
                else:
                    # BUG FIX: this branch previously generated with a
                    # stale (or undefined) `parse` from an earlier
                    # iteration; the fully-specified TVM IS the parse here.
                    form = "/".join(
                        self.generate(lemma, TVM, tags=tags).keys())
                    if form:
                        print("    {}: {}".format(TVM, form))
            elif TVM[2] == "N":
                parse = TVM
                form = "/".join(self.generate(lemma, parse, tags=tags).keys())
                if form:
                    print("    {}: {}".format(parse, form))
            elif TVM[2] == "P":
                if TVM.endswith(".N"):
                    for NG in ["SM", "SF", "SN"]:
                        parse = TVM + NG
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print("    {}: {}".format(parse, form))
                else:
                    for CNG in ["NSM", "NSF", "NSN", "GSM", "GSF", "GSN"]:
                        parse = TVM + "." + CNG
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print("    {}: {}".format(parse, form))
        print()
        print()

    def decline(self, lemma, TVM, tags=None):
        """Print the full case/number/gender paradigm of a participle."""

        if TVM[2] != "P":
            raise ValueError("decline() requires a participle (TVM[2] == 'P')")

        print("-")

        print("    lemma: {}".format(lemma))

        for G in "MFN":
            print()
            for CN in [
                    "NS", "GS", "DS", "AS", "VS", "NP", "VP", "GP", "DP", "AP"
            ]:
                parse = TVM + "." + CN + G
                form = "/".join(self.generate(lemma, parse, tags=tags).keys())
                print("    {}: {}".format(parse, form))

        print()
        print()
Esempio n. 10
0
 def test_find_stems_with_tags_2(self):
     """Both tagged stems under the same key are returned by find_stems."""
     lex = Lexicon()
     lex.add("FOO", "bar", "faa", {'-a'})
     lex.add("FOO", "bar", "fee", {'-b'})
     found = lex.find_stems("FOO", "barista")
     self.assertEqual(found, {"faa", "fee"})
Esempio n. 11
0
 def test_find_stems(self):
     """A registered stem is found via a key matching its regex."""
     lex = Lexicon()
     lex.add("FOO", "bar", "foo")
     found = lex.find_stems("FOO", "barista")
     self.assertEqual(found, {"foo"})
Esempio n. 12
0
def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    """Load a YAML lexicon file into a Lexicon plus override tables.

    Each entry may carry:
      - "stems": principal-part number -> stem spec (possibly tagged),
        translated to a key regex via ``partnum_to_key_regex``;
      - "stem_overrides": list of (key_regex, stems) pairs;
      - "forms": per-key full-form overrides;
      - "accents": per-key-regex accent overrides.

    *pre_processor* is applied to every stem before it enters the lexicon.

    Returns ``(lexicon, form_override, accent_override)``.
    """
    lexicon = Lexicon()

    # principal-part number in the YAML -> regex over inflexion keys
    partnum_to_key_regex = {
        "1-": "P",
        "1-A": "PA",
        "1-M": "PM",
        "1+": "I",
        "2-": "F[AM]",
        "2-A": "FA",
        "2-M": "FM",
        "3-": "A[AM][NPDSO]",
        "3+": "A[AM]I",
        "3+A": "AAI",
        "3+M": "AMI",
        "4-": "XA",
        "4+": "YA",
        "5-": "X[MP]",
        "5+": "Y[MP]",
        "6-": "AP[NPDSO]",
        "6+": "API",
        "7-": "FP",
        "8-": "Z[MP]",
        "M": "..M",
        "F": "..F",
        "N": "..N",
    }

    form_override = {}
    accent_override = defaultdict(list)

    with open(lexicon_file) as f:

        # NOTE(review): yaml.load with no explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if lexicon files may
        # be untrusted.
        for lemma, entry in yaml.load(f).items():

            if "stems" in entry:

                # "stems" may be present but null; treat that as empty
                for partnum, stems in sorted((entry["stems"] or {}).items()):

                    key_regex = partnum_to_key_regex[partnum]

                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(lemma, key_regex, pre_processor(stem), tag)

                for key_regex, stems in entry.get("stem_overrides", []):

                    if stems is None:
                        continue

                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(lemma, key_regex, pre_processor(stem), tag)

            # form/accent overrides apply even to entries without stems
            for key, form in entry.get("forms", {}).items():
                form_override[(lemma, key)] = form

            for key_regex, form in entry.get("accents", []):
                accent_override[lemma].append((key_regex, form))

    return lexicon, form_override, accent_override
Esempio n. 13
0
"""
encapsulates details of file format used for stemming.yaml, and lexicon YAMLs,
providing functions for loading the file and populating StemmingRuleSet or
Lexicon (with form and accent overrides)
"""

from collections import defaultdict

import yaml

from greek_accentuation.characters import strip_length as do_strip_length

from inflexion.lexicon import Lexicon
from inflexion.stemming import StemmingRuleSet


class RefDoesNotExistException(Exception):
    # NOTE(review): presumably raised by load_stemming when a rule's "ref"
    # names an entry missing from the stemming dict; the raise site is not
    # visible in this chunk -- confirm.
    pass


def load_stemming(stemming_file, strip_length=False):
    ruleset = StemmingRuleSet()

    with open(stemming_file) as f:
        stemming_dict = yaml.load(f)

    for key, rules in stemming_dict.items():

        while isinstance(rules, dict) and "ref" in rules:
            if rules["ref"] in stemming_dict:
Esempio n. 14
0
class GreekInflexion:
    """High-level Greek inflexion engine.

    Ties a StemmingRuleSet (from *stemming_file*) to a Lexicon with form
    and accent overrides (from *lexicon_file*, if given), and wraps the
    core Inflexion object with breathing- and accent-aware generation,
    parsing and stem-search helpers.
    """

    def __init__(self, stemming_file, lexicon_file=None, strip_length=False):

        self.ruleset = load_stemming(stemming_file, strip_length)

        if lexicon_file:
            self.lexicon, self.form_override, self.accent_override = \
                load_lexicon(lexicon_file, pre_processor=debreath)
        else:
            # no lexicon file: empty lexicon and no overrides
            self.lexicon = Lexicon()
            self.form_override = {}
            self.accent_override = defaultdict(list)

        self.inflexion = Inflexion()
        self.inflexion.add_lexicon(self.lexicon)
        self.inflexion.add_stemming_rule_set(self.ruleset)

    def find_stems(self, lemma, key, tags=None):
        """Return stems for *lemma* under *key*, with breathing restored."""
        return self.lexicon.find_stems(
            lemma, key, tags, stem_post_processor=rebreath)

    def generate(self, lemma, key, tags=None):
        """Generate accented surface forms for *lemma* under parse *key*.

        A (lemma, key) entry in ``form_override`` short-circuits generation
        entirely. Otherwise returns a dict mapping each accented form to a
        list of detail dicts (including the original unaccented form).
        """
        overrides = self.form_override.get((lemma, key))
        if overrides:
            if isinstance(overrides, str):
                overrides = [overrides]
            return {
                override: [{"override": "form"}]
                for override in overrides
            }
        generated = defaultdict(list)
        for orig_form, details in self.inflexion.generate(
                lemma, key, tags).items():
            for detail in details:
                accent_form = calculate_accent(
                    orig_form, key, lemma, detail["stem"],
                    self.inflexion, self.accent_override)
                detail.update({"original_form": orig_form})
                generated[accent_form].append(detail)
        return generated

    def possible_stems(self, form, key_regex=None):
        """Yield (key, stem) candidates for *form*, optionally filtered
        by *key_regex*."""
        for key, stem in self.ruleset.possible_stems(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def possible_stems2(self, form, key_regex=None):
        """Like possible_stems, but via the rule set's possible_stems2."""
        for key, stem in self.ruleset.possible_stems2(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def parse(self, form):
        """Parse surface *form*, normalising breathing and accents."""
        return self.inflexion.parse(
            debreath(form), stem_post_processor=strip_accents)

    def conjugate(self, lemma, *TVMs, tags=None):
        """Print a YAML-ish conjugation table for each tense-voice-mood.

        The third character of each TVM selects the paradigm: indicative/
        subjunctive/optative, imperative (D), infinitive (N) or
        participle (P).
        """
        print("-")
        print("    lemma: {}".format(lemma))
        if tags:
            print()
            print("    tags:")
            for tag in tags:
                print("      - {}".format(tag))
        for TVM in TVMs:
            print()
            if TVM[2] in "ISO":
                for PN in ["1S", "2S", "3S", "1P", "2P", "3P"]:
                    parse = TVM + "." + PN
                    form = "/".join(
                        self.generate(lemma, parse, tags=tags).keys())
                    if form:
                        print("    {}: {}".format(parse, form))
            elif TVM[2] == "D":
                if "." not in TVM:
                    for PN in ["2S", "3S", "2P", "3P"]:
                        parse = TVM + "." + PN
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print("    {}: {}".format(parse, form))
                else:
                    # BUG FIX: this branch previously generated with a
                    # stale (or undefined) `parse` from an earlier
                    # iteration; the fully-specified TVM IS the parse here.
                    form = "/".join(
                        self.generate(lemma, TVM, tags=tags).keys())
                    if form:
                        print("    {}: {}".format(TVM, form))
            elif TVM[2] == "N":
                parse = TVM
                form = "/".join(
                    self.generate(lemma, parse, tags=tags).keys())
                if form:
                    print("    {}: {}".format(parse, form))
            elif TVM[2] == "P":
                if TVM.endswith(".N"):
                    for NG in ["SM", "SF", "SN"]:
                        parse = TVM + NG
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print("    {}: {}".format(parse, form))
                else:
                    for CNG in ["NSM", "NSF", "NSN", "GSM", "GSF", "GSN"]:
                        parse = TVM + "." + CNG
                        form = "/".join(
                            self.generate(lemma, parse, tags=tags).keys())
                        if form:
                            print("    {}: {}".format(parse, form))
        print()
        print()

    def decline(self, lemma, TVM, tags=None):
        """Print the full case/number/gender paradigm of a participle."""

        if TVM[2] != "P":
            raise ValueError("decline() requires a participle (TVM[2] == 'P')")

        print("-")

        print("    lemma: {}".format(lemma))

        for G in "MFN":
            print()
            for CN in [
                "NS", "GS", "DS", "AS", "VS",
                "NP", "VP", "GP", "DP", "AP"
            ]:
                parse = TVM + "." + CN + G
                form = "/".join(
                    self.generate(lemma, parse, tags=tags).keys())
                print("    {}: {}".format(parse, form))

        print()
        print()
Esempio n. 15
0
 def test_find_stems(self):
     """find_stems locates the stem registered under a matching key."""
     lex = Lexicon()
     lex.add("FOO", "bar", "foo")
     self.assertEqual({"foo"}, lex.find_stems("FOO", "barista"))