Exemple #1
0
 def test_find_stems_with_tags_1(self):
     # Two stems are registered under the same lemma/key, one tagged "+a"
     # and the other tagged "-a".
     lex = Lexicon()
     for stem, tag in (("faa", "+a"), ("fee", "-a")):
         lex.add("FOO", "bar", stem, {tag})

     # With no tags supplied, only the "-a" stem is expected back.
     without_tags = lex.find_stems("FOO", "barista")
     self.assertEqual(without_tags, {"fee"})

     # Supplying the tag "a" is expected to select the "+a" stem instead.
     with_tag_a = lex.find_stems("FOO", "barista", {"a"})
     self.assertEqual(with_tag_a, {"faa"})
class GreekInflexion:
    """Greek-specific front end over an ``Inflexion`` engine.

    Bundles a stemming rule set and a lexicon, routes input forms through
    ``debreath`` / ``rebreath`` / ``strip_accents``, honours per-(lemma, key)
    form overrides and runs ``calculate_accent`` over generated forms.
    """

    def __init__(self, stemming_file, lexicon_file=None, strip_length=False):
        """Load the stemming rules and, optionally, a lexicon.

        Args:
            stemming_file: Handed to ``load_stemming``.
            lexicon_file: When truthy, handed to ``load_lexicon`` (with
                ``debreath`` as pre-processor), which supplies the lexicon,
                the form and accent override tables and the segmented
                lemmas. Otherwise empty defaults are used for all four.
            strip_length: Forwarded to ``load_stemming``.
        """
        self.ruleset = load_stemming(stemming_file, strip_length)

        if lexicon_file:
            (self.lexicon, self.form_override, self.accent_override,
             self.segmented_lemmas) = load_lexicon(lexicon_file,
                                                   pre_processor=debreath)
        else:
            self.lexicon = Lexicon()
            self.form_override = {}
            self.accent_override = defaultdict(list)
            self.segmented_lemmas = {}

        self.inflexion = Inflexion()
        self.inflexion.add_lexicon(self.lexicon)
        self.inflexion.add_stemming_rule_set(self.ruleset)

    def find_stems(self, lemma, key, tags=None):
        """Look up stems in the lexicon, post-processing them with ``rebreath``."""
        return self.lexicon.find_stems(lemma,
                                       key,
                                       tags,
                                       stem_post_processor=rebreath)

    def generate(self, lemma, key, tags=None):
        """Return the forms of *lemma* for parse *key*.

        Returns:
            A mapping from form to a list of detail dicts. If a form
            override is registered for ``(lemma, key)`` it wins outright and
            each overriding form maps to ``[{"override": "form"}]``.
            Otherwise every form produced by the engine is passed through
            ``calculate_accent``; the result is keyed by the accented form
            and each detail dict gains ``"original_form"`` and
            ``"accent_notes"`` entries.
        """
        overrides = self.form_override.get((lemma, key))
        if overrides:
            # A single override may be stored as a bare string.
            if isinstance(overrides, str):
                overrides = [overrides]
            return {override: [{"override": "form"}] for override in overrides}

        # Depends only on the lemma, so look it up once rather than per detail.
        segmented_lemma = self.segmented_lemmas.get(lemma)
        generated = defaultdict(list)
        engine_forms = self.inflexion.generate(lemma, key, tags)
        for orig_form, details in engine_forms.items():
            for detail in details:
                accent_form, accent_notes = calculate_accent(
                    orig_form, key, lemma, segmented_lemma, detail["stem"],
                    self.inflexion, self.accent_override)
                detail["original_form"] = orig_form
                detail["accent_notes"] = accent_notes
                generated[accent_form].append(detail)
        return generated

    def possible_stems(self, form, key_regex=None):
        """Yield ``(key, stem)`` pairs the rule set proposes for *form*.

        Args:
            form: Surface form; it is passed through ``debreath`` first.
            key_regex: Optional pattern; only keys it matches (``re.match``)
                are yielded.
        """
        for key, stem in self.ruleset.possible_stems(debreath(form)):
            if key_regex is not None and not re.match(key_regex, key):
                continue
            # A stem consisting solely of "h" is dropped.
            # NOTE(review): presumably a bare breathing marker with no real
            # stem left - confirm against debreath().
            if stem != "h":
                yield key, rebreath(strip_accents(stem))

    def possible_stems2(self, form, key_regex=None):
        """Like ``possible_stems`` but backed by ``ruleset.possible_stems2``.

        Unlike ``possible_stems`` this variant does not filter out "h" stems.
        """
        for key, stem in self.ruleset.possible_stems2(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def parse(self, form):
        """Parse *form* (after ``debreath``) with accent-stripped stems."""
        return self.inflexion.parse(debreath(form),
                                    stem_post_processor=strip_accents)

    def _print_forms(self, lemma, parse, tags):
        """Print one ``parse: form/form`` line; print nothing if no forms."""
        form = "/".join(self.generate(lemma, parse, tags=tags).keys())
        if form:
            print("    {}: {}".format(parse, form))

    def conjugate(self, lemma, *TVMs, tags=None):
        """Print a YAML-like paradigm of *lemma* for each code in *TVMs*.

        The third character of each code selects which parse keys are
        generated (presumably it is the mood letter - confirm):

        * ``I``/``S``/``O``: six person/number endings.
        * ``D``: four endings (2S, 3S, 2P, 3P), or the code itself when it
          already contains a ``.``.
        * ``N``: the code itself.
        * ``P``: singular M/F/N appended directly when the code ends in
          ``.N``, otherwise nominative and genitive singular M/F/N.

        Codes with any other third character print only a blank line.
        Parses that generate no form are skipped.

        Args:
            lemma: Lemma to conjugate.
            *TVMs: Codes at least three characters long.
            tags: Optional tags, echoed in the output and forwarded to
                ``generate``.
        """
        print("-")
        print("    lemma: {}".format(lemma))
        if tags:
            print()
            print("    tags:")
            for tag in tags:
                print("      - {}".format(tag))
        for TVM in TVMs:
            print()
            mood = TVM[2]
            if mood in "ISO":
                parses = [
                    TVM + "." + PN
                    for PN in ["1S", "2S", "3S", "1P", "2P", "3P"]
                ]
            elif mood == "D":
                if "." not in TVM:
                    parses = [
                        TVM + "." + PN for PN in ["2S", "3S", "2P", "3P"]
                    ]
                else:
                    # The code is already a complete parse key. The previous
                    # implementation generated from a `parse` variable that
                    # was unbound (or stale from an earlier code) here.
                    parses = [TVM]
            elif mood == "N":
                parses = [TVM]
            elif mood == "P":
                if TVM.endswith(".N"):
                    parses = [TVM + NG for NG in ["SM", "SF", "SN"]]
                else:
                    parses = [
                        TVM + "." + CNG
                        for CNG in ["NSM", "NSF", "NSN", "GSM", "GSF", "GSN"]
                    ]
            else:
                parses = []
            for parse in parses:
                self._print_forms(lemma, parse, tags)
        print()
        print()

    def decline(self, lemma, TVM, tags=None):
        """Print every case/number form of *lemma* for each gender.

        Unlike ``conjugate``, a line is printed for every parse even when
        no form is generated.

        Args:
            lemma: Lemma to decline.
            TVM: Code whose third character must be ``"P"``.
            tags: Optional tags forwarded to ``generate``.

        Raises:
            ValueError: If the third character of *TVM* is not ``"P"``.
        """
        if TVM[2] != "P":
            raise ValueError(
                "decline() needs a code whose third character is 'P', "
                "got {!r}".format(TVM))

        print("-")

        print("    lemma: {}".format(lemma))

        case_numbers = [
            "NS", "GS", "DS", "AS", "VS", "NP", "VP", "GP", "DP", "AP"
        ]
        for G in "MFN":
            print()
            for CN in case_numbers:
                parse = TVM + "." + CN + G
                form = "/".join(self.generate(lemma, parse, tags=tags).keys())
                print("    {}: {}".format(parse, form))

        print()
        print()
Exemple #3
0
 def test_find_stems_with_tags_2(self):
     # Two stems under the same lemma/key, each carrying a different
     # negative tag ("-a" and "-b").
     lex = Lexicon()
     for stem, tags in [("faa", {"-a"}), ("fee", {"-b"})]:
         lex.add("FOO", "bar", stem, tags)

     # With no tags supplied, both stems are expected back.
     found = lex.find_stems("FOO", "barista")
     self.assertEqual(found, {"faa", "fee"})
Exemple #4
0
 def test_find_stems(self):
     # A stem stored under key "bar" should be found when querying with
     # the longer key "barista".
     lex = Lexicon()
     lex.add("FOO", "bar", "foo")
     expected = {"foo"}
     actual = lex.find_stems("FOO", "barista")
     self.assertEqual(actual, expected)
class GreekInflexion:
    """Greek-specific front end over an ``Inflexion`` engine.

    Bundles a stemming rule set and a lexicon, routes input forms through
    ``debreath`` / ``rebreath`` / ``strip_accents``, honours per-(lemma, key)
    form overrides and runs ``calculate_accent`` over generated forms.
    """

    def __init__(self, stemming_file, lexicon_file=None, strip_length=False):
        """Load the stemming rules and, optionally, a lexicon.

        Args:
            stemming_file: Handed to ``load_stemming``.
            lexicon_file: When truthy, handed to ``load_lexicon`` (with
                ``debreath`` as pre-processor), which supplies the lexicon
                and the form and accent override tables. Otherwise empty
                defaults are used for all three.
            strip_length: Forwarded to ``load_stemming``.
        """
        self.ruleset = load_stemming(stemming_file, strip_length)

        if lexicon_file:
            self.lexicon, self.form_override, self.accent_override = \
                load_lexicon(lexicon_file, pre_processor=debreath)
        else:
            self.lexicon = Lexicon()
            self.form_override = {}
            self.accent_override = defaultdict(list)

        self.inflexion = Inflexion()
        self.inflexion.add_lexicon(self.lexicon)
        self.inflexion.add_stemming_rule_set(self.ruleset)

    def find_stems(self, lemma, key, tags=None):
        """Look up stems in the lexicon, post-processing them with ``rebreath``."""
        return self.lexicon.find_stems(
            lemma, key, tags, stem_post_processor=rebreath)

    def generate(self, lemma, key, tags=None):
        """Return the forms of *lemma* for parse *key*.

        Returns:
            A mapping from form to a list of detail dicts. If a form
            override is registered for ``(lemma, key)`` it wins outright and
            each overriding form maps to ``[{"override": "form"}]``.
            Otherwise every form produced by the engine is passed through
            ``calculate_accent``; the result is keyed by the accented form
            and each detail dict gains an ``"original_form"`` entry.
        """
        overrides = self.form_override.get((lemma, key))
        if overrides:
            # A single override may be stored as a bare string.
            if isinstance(overrides, str):
                overrides = [overrides]
            return {
                override: [{"override": "form"}]
                for override in overrides
            }

        generated = defaultdict(list)
        engine_forms = self.inflexion.generate(lemma, key, tags)
        for orig_form, details in engine_forms.items():
            for detail in details:
                accent_form = calculate_accent(
                    orig_form, key, lemma, detail["stem"],
                    self.inflexion, self.accent_override)
                detail["original_form"] = orig_form
                generated[accent_form].append(detail)
        return generated

    def possible_stems(self, form, key_regex=None):
        """Yield ``(key, stem)`` pairs the rule set proposes for *form*.

        Args:
            form: Surface form; it is passed through ``debreath`` first.
            key_regex: Optional pattern; only keys it matches (``re.match``)
                are yielded.
        """
        for key, stem in self.ruleset.possible_stems(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def possible_stems2(self, form, key_regex=None):
        """Like ``possible_stems`` but backed by ``ruleset.possible_stems2``."""
        for key, stem in self.ruleset.possible_stems2(debreath(form)):
            if key_regex is None or re.match(key_regex, key):
                yield key, rebreath(strip_accents(stem))

    def parse(self, form):
        """Parse *form* (after ``debreath``) with accent-stripped stems."""
        return self.inflexion.parse(
            debreath(form), stem_post_processor=strip_accents)

    def _print_forms(self, lemma, parse, tags):
        """Print one ``parse: form/form`` line; print nothing if no forms."""
        form = "/".join(self.generate(lemma, parse, tags=tags).keys())
        if form:
            print("    {}: {}".format(parse, form))

    def conjugate(self, lemma, *TVMs, tags=None):
        """Print a YAML-like paradigm of *lemma* for each code in *TVMs*.

        The third character of each code selects which parse keys are
        generated (presumably it is the mood letter - confirm):

        * ``I``/``S``/``O``: six person/number endings.
        * ``D``: four endings (2S, 3S, 2P, 3P), or the code itself when it
          already contains a ``.``.
        * ``N``: the code itself.
        * ``P``: singular M/F/N appended directly when the code ends in
          ``.N``, otherwise nominative and genitive singular M/F/N.

        Codes with any other third character print only a blank line.
        Parses that generate no form are skipped.

        Args:
            lemma: Lemma to conjugate.
            *TVMs: Codes at least three characters long.
            tags: Optional tags, echoed in the output and forwarded to
                ``generate``.
        """
        print("-")
        print("    lemma: {}".format(lemma))
        if tags:
            print()
            print("    tags:")
            for tag in tags:
                print("      - {}".format(tag))
        for TVM in TVMs:
            print()
            mood = TVM[2]
            if mood in "ISO":
                parses = [
                    TVM + "." + PN
                    for PN in ["1S", "2S", "3S", "1P", "2P", "3P"]
                ]
            elif mood == "D":
                if "." not in TVM:
                    parses = [
                        TVM + "." + PN for PN in ["2S", "3S", "2P", "3P"]
                    ]
                else:
                    # The code is already a complete parse key. The previous
                    # implementation generated from a `parse` variable that
                    # was unbound (or stale from an earlier code) here.
                    parses = [TVM]
            elif mood == "N":
                parses = [TVM]
            elif mood == "P":
                if TVM.endswith(".N"):
                    parses = [TVM + NG for NG in ["SM", "SF", "SN"]]
                else:
                    parses = [
                        TVM + "." + CNG
                        for CNG in ["NSM", "NSF", "NSN", "GSM", "GSF", "GSN"]
                    ]
            else:
                parses = []
            for parse in parses:
                self._print_forms(lemma, parse, tags)
        print()
        print()

    def decline(self, lemma, TVM, tags=None):
        """Print every case/number form of *lemma* for each gender.

        Unlike ``conjugate``, a line is printed for every parse even when
        no form is generated.

        Args:
            lemma: Lemma to decline.
            TVM: Code whose third character must be ``"P"``.
            tags: Optional tags forwarded to ``generate``.

        Raises:
            ValueError: If the third character of *TVM* is not ``"P"``.
        """
        if TVM[2] != "P":
            raise ValueError(
                "decline() needs a code whose third character is 'P', "
                "got {!r}".format(TVM))

        print("-")

        print("    lemma: {}".format(lemma))

        case_numbers = [
            "NS", "GS", "DS", "AS", "VS",
            "NP", "VP", "GP", "DP", "AP"
        ]
        for G in "MFN":
            print()
            for CN in case_numbers:
                parse = TVM + "." + CN + G
                form = "/".join(
                    self.generate(lemma, parse, tags=tags).keys())
                print("    {}: {}".format(parse, form))

        print()
        print()
Exemple #6
0
 def test_find_stems(self):
     # Register one stem under key "bar", then query with the longer key
     # "barista"; the stored stem is expected to come back.
     lex = Lexicon()
     lex.add("FOO", "bar", "foo")
     result = lex.find_stems("FOO", "barista")
     self.assertEqual(result, {"foo"})