def test_lookups_to_from_disk():
    """Round-trip a Lookups object through to_disk/from_disk and verify
    that both tables and their contents survive serialization."""
    original = Lookups()
    original.add_table("table1", {"foo": "bar", "hello": "world"})
    original.add_table("table2", {"a": 1, "b": 2, "c": 3})
    with make_tempdir() as tmpdir:
        original.to_disk(tmpdir)
        restored = Lookups()
        restored.from_disk(tmpdir)
        # Both tables must come back by name.
        assert len(restored) == 2
        assert "table1" in restored
        assert "table2" in restored
        # Spot-check contents of each restored table.
        first = restored.get_table("table1")
        assert len(first) == 2
        assert first["foo"] == "bar"
        second = restored.get_table("table2")
        assert len(second) == 3
        assert second["b"] == 2
def test_lookups_to_from_bytes():
    """Round-trip a Lookups object through to_bytes/from_bytes and verify
    the deserialized copy matches, including byte-level re-serialization."""
    original = Lookups()
    original.add_table("table1", {"foo": "bar", "hello": "world"})
    original.add_table("table2", {"a": 1, "b": 2, "c": 3})
    serialized = original.to_bytes()

    restored = Lookups()
    restored.from_bytes(serialized)

    # Table names survive the round trip.
    assert len(restored) == 2
    assert "table1" in restored
    assert "table2" in restored

    # Spot-check the restored table contents.
    first = restored.get_table("table1")
    assert len(first) == 2
    assert first["foo"] == "bar"
    second = restored.get_table("table2")
    assert len(second) == 3
    assert second["b"] == 2

    # Re-serializing must reproduce the exact same bytes.
    assert restored.to_bytes() == serialized
def test_lookups_api_basic():
    """Exercise the basic Lookups/Table API via get()/set() accessors.

    Renamed from ``test_lookups_api``: this file defines a second, fuller
    ``test_lookups_api`` later on, and two same-named module-level test
    functions mean the later definition shadows this one — pytest would
    silently never collect or run this test.
    """
    table_name = "test"
    data = {"foo": "bar", "hello": "world"}
    lookups = Lookups()
    lookups.add_table(table_name, data)
    assert table_name in lookups
    assert lookups.has_table(table_name)
    table = lookups.get_table(table_name)
    assert table.name == table_name
    assert len(table) == 2
    assert table.get("hello") == "world"
    # Mutations through one handle are visible when the table is re-fetched.
    table.set("a", "b")
    assert table.get("a") == "b"
    table = lookups.get_table(table_name)
    assert len(table) == 3
    # Fetching an unknown table raises KeyError.
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
def test_lookups_api():
    """Full Lookups/Table API test: add, query, mutate, reject duplicates,
    and remove a table."""
    name = "test"
    lookups = Lookups()
    lookups.add_table(name, {"foo": "bar", "hello": "world"})

    # The table is registered and reachable by name.
    assert len(lookups) == 1
    assert name in lookups
    assert lookups.has_table(name)

    handle = lookups.get_table(name)
    assert handle.name == name
    assert len(handle) == 2
    assert handle["hello"] == "world"

    # Writes through one handle are visible on a freshly fetched handle.
    handle["a"] = "b"
    assert handle["a"] == "b"
    handle = lookups.get_table(name)
    assert len(handle) == 3

    # Unknown table name -> KeyError.
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
    # Re-adding an existing table name -> ValueError.
    with pytest.raises(ValueError):
        lookups.add_table(name)

    # Removing returns the table and unregisters it.
    removed = lookups.remove_table(name)
    assert removed.name == name
    assert len(lookups) == 0
    assert name not in lookups
    with pytest.raises(KeyError):
        lookups.get_table(name)
class MorphologizerLemmatizer(Pipe): """Pipeline component that assigns morphological features and lemmas to Docs. The actual morphological analysis is done by libvoikko. """ compound_re = re.compile(r"\+(\w+)(?:\(\+?[\w=]+\))?") minen_re = re.compile(r"\b(\w+)\[Tn4\]mi") ny_re = re.compile(r"\[X\]\[\w+\]\[Ny\](\w+)") roman_numeral_structure_re = re.compile(r"=j+|=q+") voikko_cases = { "nimento": "Case=Nom", "omanto": "Case=Gen", "kohdanto": "Case=Acc", "olento": "Case=Ess", "osanto": "Case=Par", "tulento": "Case=Tra", "sisaolento": "Case=Ine", "sisaeronto": "Case=Ela", "sisatulento": "Case=Ill", "ulkoolento": "Case=Ade", "ulkoeronto": "Case=Abl", "ulkotulento": "Case=All", "vajanto": "Case=Abe", "seuranto": "Case=Com", "keinonto": "Case=Ins", "kerrontosti": "Case=Nom" # Should never occur. "kerrontosti" # should only appear on ADVs, which # don't have cases. } voikko_classes_by_pos = { ADJ: frozenset(["laatusana", "nimisana_laatusana"]), ADP: frozenset(["nimisana", "seikkasana", "suhdesana"]), ADV: frozenset(["seikkasana"]), AUX: frozenset(["teonsana", "kieltosana"]), CCONJ: frozenset(["sidesana"]), INTJ: frozenset(["huudahdussana"]), NOUN: frozenset(["nimisana", "nimisana_laatusana", "lyhenne"]), NUM: frozenset(["lukusana"]), PRON: frozenset(["asemosana", "nimisana", "nimisana_laatusana"]), PROPN: frozenset(["nimi", "etunimi", "sukunimi", "paikannimi"]), SCONJ: frozenset(["sidesana"]), VERB: frozenset([]), # Would be "teonsana" but # MINEN-infinitives are treated as noun. 
# See _analysis_has_compatible_pos() SYM: frozenset([]), X: frozenset([]) } affix_to_sijamuoto = { "n": "omanto", "na": "olento", "nä": "olento", "a": "osanto", "ä": "osanto", "ta": "osanto", "tä": "osanto", "ksi": "tulento", "ssa": "sisaolento", "ssä": "sisaolento", "sta": "sisaeronto", "stä": "sisaeronto", "han": "sisatulento", "hin": "sisatulento", "hun": "sisatulento", "seen": "sisatulento", "siin": "sisatulento", "lla": "ulkoolento", "llä": "ulkoolento", "lta": "ulkoeronto", "ltä": "ulkoeronto", "lle": "ulkotulento", "tta": "vajanto", "ttä": "vajanto", } possessive_suffixes = { "1s": ["ni"], "2s": ["si"], "1p": ["mme"], "2p": ["nne"], "3": ["nsa", "nsä", "an", "en", "in" "on", "un", "yn", "än", "ön"], } voikko_degree = { "positive": "Degree=Pos", "comparative": "Degree=Cmp", "superlative": "Degree=Sup" } voikko_mood = { "A-infinitive": "InfForm=1", "E-infinitive": "InfForm=2", "MA-infinitive": "InfForm=3", "indicative": "Mood=Ind", "conditional": "Mood=Cnd", "potential": "Mood=Pot", "imperative": "Mood=Imp" } voikko_part_form = { "past_active": "PartForm=Past", "past_passive": "PartForm=Past", "present_active": "PartForm=Pres", "present_passive": "PartForm=Pres", "agent": "PartForm=Agt" } voikko_tense = { "present_active": "Tense=Pres", "present_passive": "Tense=Pres", "present_simple": "Tense=Pres", "past_active": "Tense=Past", "past_passive": "Tense=Past", "past_imperfective": "Tense=Past" } pron_types = { "minä": "Prs", "sinä": "Prs", "hän": "Prs", "me": "Prs", "te": "Prs", "he": "Prs", "tämä": "Dem", "tuo": "Dem", "se": "Dem", "nämä": "Dem", "nuo": "Dem", "ne": "Dem", # The relative "mikä" will be handled as a special case # separately so here we label all occurences of "mikä" as # interrogative. "mikä": "Int", "kuka": "Int", "ken": "Int", # ketä "kumpi": "Int", "millainen": "Int", "kuinka": "Int", "miksi": "Int", # The relative "joka" will be handled else where. Here "joka" # is Voikko's lemmatization of jotakin, jollekin, jostakin, ... 
"joka": "Ind", "kaikki": "Ind", "jokainen": "Ind", "koko": "Ind", "harva": "Ind", "muutama": "Ind", "jokunen": "Ind", "yksi": "Ind", "ainoa": "Ind", "eräs": "Ind", "muuan": "Ind", "joku": "Ind", "jokin": "Ind", "kukin": "Ind", "moni": "Ind", "usea": "Ind", "molempi": "Ind", "kumpikin": "Ind", "kumpikaan": "Ind", "jompikumpi": "Ind", "sama": "Ind", "muu": "Ind", "kukaan": "Ind", "mikään": "Ind", "toinen": "Rcp" } pron_persons = { "minä": "1", "sinä": "2", "hän": "3", "me": "1", "te": "2", "he": "3" } infinite_moods = frozenset( ["A-infinitive", "E-infinitive", "MA-infinitive", "MAINEN-infinitive"]) def __init__( self, vocab: Vocab, name: str = "morphologizer", *, overwrite_lemma: bool = False, ) -> None: super().__init__() self.name = name self.vocab = vocab self.voikko = libvoikko.Voikko("fi") self.lookups = Lookups() self.overwrite_lemma = overwrite_lemma self.aux_labels = [vocab.strings.add(x) for x in ["aux", "aux:pass"]] self.cop_labels = [vocab.strings.add(x) for x in ["cop", "cop:own"]] self.nsubj_labels = [ vocab.strings.add(x) for x in ["nsubj", "nsubj:cop"] ] self.ccomp_labels = [ vocab.strings.add(x) for x in ["csubj", "csubj:cop", "xcomp", "xcomp:ds"] ] self.relcl_labels = [ vocab.strings.add(x) for x in ["acl:relcl", "ccomp"] ] self.foreign_tag = vocab.strings.add('Foreign') def __call__(self, doc: Doc) -> Doc: error_handler = self.get_error_handler() try: for token in doc: if token.pos in (PUNCT, SPACE): if self.overwrite_lemma or token.lemma == 0: token.lemma = token.orth else: analysis = self._analyze(token) morph = self.voikko_morph(token, analysis) if morph: token.set_morph(morph) if self.overwrite_lemma or token.lemma == 0: token.lemma_ = self.lemmatize(token, analysis) return doc except Exception as e: error_handler(self.name, self, [doc], e) def initialize( self, get_examples: Optional[Callable[[], Iterable[Example]]] = None, *, nlp: Optional[Language] = None, lookups: Optional[Lookups] = None, ): """Initialize the morphologizer and load in 
data. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. nlp (Language): The current nlp object the component is part of. lookups (Lookups): The lookups object containing the (optional) tables such as "lemma_exc" and "morphologizer_exc". Defaults to None. """ if lookups is None: lookups = load_lookups(lang=self.vocab.lang, tables=["lemma_exc", "morphologizer_exc"]) self.lookups = lookups def voikko_morph(self, token: Token, analysis: dict) -> Optional[str]: # Run Voikko's analysis and convert the result to morph # features. exc_table = self.lookups.get_table("morphologizer_exc", {}).get(token.pos) if exc_table is not None: exc = exc_table.get(token.orth_.lower()) if exc: return exc # Pre-compute some frequent morphs to avoid code duplication. # (Functions are not an option because the function call # overhead is too high.) # Clitic morph_clitic = None if "FOCUS" in analysis: focus = analysis["FOCUS"] if focus == "kin": morph_clitic = "Clitic=Kin" elif focus == "kaan": morph_clitic = "Clitic=Kaan" elif focus == "ka": morph_clitic = "Clitic=Ka" elif "KYSYMYSLIITE" in analysis: morph_clitic = "Clitic=Ko" morph_number = None morph_number_psor = None morph_person_psor = None if token.pos in (ADJ, ADP, ADV, AUX, NOUN, NUM, PRON, PROPN, VERB): # Number if "NUMBER" in analysis: number = analysis["NUMBER"] if number == "singular": morph_number = "Number=Sing" elif number == "plural": morph_number = "Number=Plur" # Number[psor] and Person[psor] if "POSSESSIVE" in analysis: possessive = analysis["POSSESSIVE"] if possessive == "1s": morph_number_psor = "Number[psor]=Sing" morph_person_psor = "Person[psor]=1" elif possessive == "1p": morph_number_psor = "Number[psor]=Plur" morph_person_psor = "Person[psor]=1" elif possessive == "3": morph_person_psor = "Person[psor]=3" # Set morphs per POS morphology = [] if token.pos in (ADJ, NOUN, PROPN): # Abbr if "CLASS" in analysis and analysis["CLASS"] == 
"lyhenne": morphology.append("Abbr=Yes") # Case if "SIJAMUOTO" in analysis: morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]]) # Clitic if morph_clitic is not None: morphology.append(morph_clitic) # Degree if token.pos == ADJ and "COMPARISON" in analysis: morphology.append(self.voikko_degree[analysis["COMPARISON"]]) # Number if morph_number is not None: morphology.append(morph_number) # Number[psor] if morph_number_psor is not None: morphology.append(morph_number_psor) # NumType if token.pos == ADJ and "NUMTYPE" in analysis: morphology.append(f'NumType={analysis["NUMTYPE"]}') # Person[psor] if morph_person_psor is not None: morphology.append(morph_person_psor) elif token.pos in (AUX, VERB): vclass = analysis.get("CLASS") # Abbr if vclass == "lyhenne": morphology.append("Abbr=Yes") # Case if "SIJAMUOTO" in analysis: morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]]) # Clitic if morph_clitic is not None: morphology.append(morph_clitic) # Connegative if "CONNEGATIVE" in analysis: morphology.append("Connegative=Yes") # Degree if "COMPARISON" in analysis: morphology.append(self.voikko_degree[analysis["COMPARISON"]]) # InfForm and Mood # These are mutually exclusive and both are based on MOOD mood = None if "MOOD" in analysis: mood = analysis["MOOD"] morph_inf_form_or_mood = self.voikko_mood.get(mood) if morph_inf_form_or_mood is not None: morphology.append(morph_inf_form_or_mood) # Number if morph_number is not None: morphology.append(morph_number) # Number[psor] if morph_number_psor is not None: morphology.append(morph_number_psor) # PartForm participle = None if "PARTICIPLE" in analysis: participle = analysis["PARTICIPLE"] morph_part_form = self.voikko_part_form.get(participle) if morph_part_form: morphology.append(morph_part_form) # Person person = None if "PERSON" in analysis: person = analysis["PERSON"] if person in ("0", "1", "2", "3"): morphology.append(f"Person={person}") # Person[psor] if morph_person_psor is not None: 
morphology.append(morph_person_psor) # Polarity if vclass == "kieltosana": morphology.append("Polarity=Neg") # Tense if "TENSE" in analysis: morphology.append(self.voikko_tense[analysis["TENSE"]]) # VerbForm if mood in self.infinite_moods: morphology.append("VerbForm=Inf") elif participle is not None: morphology.append("VerbForm=Part") else: morphology.append("VerbForm=Fin") # Voice if person in ("0", "1", "2", "3"): morphology.append("Voice=Act") elif person == "4": morphology.append("Voice=Pass") elif "VOICE" in analysis: morphology.append(f"Voice={analysis['VOICE']}") elif participle == "past_passive": morphology.append("Voice=Pass") elif participle in ("present_active", "past_active", "present_passive"): morphology.append("Voice=Act") elif token.pos == ADV: # Abbr if "CLASS" in analysis and analysis["CLASS"] == "lyhenne": morphology.append("Abbr=Yes") # Clitic if morph_clitic is not None: morphology.append(morph_clitic) # Degree if "COMPARISON" in analysis: degree = analysis["COMPARISON"] if degree in ("comparative", "superlative"): morphology.append(self.voikko_degree[degree]) # Number[psor] if morph_number_psor is not None: morphology.append(morph_number_psor) # Person[psor] if morph_person_psor is not None: morphology.append(morph_person_psor) elif token.pos == PRON: # Case if "SIJAMUOTO" in analysis: morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]]) # Clitic if morph_clitic is not None: morphology.append(morph_clitic) # Degree if "COMPARISON" in analysis: morphology.append(self.voikko_degree[analysis["COMPARISON"]]) # Number if morph_number is not None: morphology.append(morph_number) # Number[psor] if morph_number_psor is not None: morphology.append(morph_number_psor) # Person if "PERSON" in analysis: person = analysis["PERSON"] if person in ("0", "1", "2", "3"): morphology.append(f"Person={person}") # Person[psor] if morph_person_psor is not None: morphology.append(morph_person_psor) # PronType if "PRONTYPE" in analysis: 
morphology.append(f"PronType={analysis['PRONTYPE']}") # Reflex if "BASEFORM" in analysis and analysis["BASEFORM"] == "itse": morphology.append("Reflex=Yes") elif token.pos in (CCONJ, SCONJ): # Clitic if morph_clitic is not None: morphology.append(morph_clitic) elif token.pos == NUM: # Abbr if "CLASS" in analysis and analysis["CLASS"] == "lyhenne": morphology.append("Abbr=Yes") # Case if "SIJAMUOTO" in analysis: morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]]) # Clitic if morph_clitic is not None: morphology.append(morph_clitic) # Number if morph_number is not None: morphology.append(morph_number) # NumType if "NUMTYPE" in analysis: morphology.append(f'NumType={analysis["NUMTYPE"]}') elif token.pos == ADP: # AdpType if "ADPTYPE" in analysis: morphology.append(f"AdpType={analysis['ADPTYPE']}") # Clitic if morph_clitic is not None: morphology.append(morph_clitic) # Number[psor] if morph_number_psor is not None: morphology.append(morph_number_psor) # Person[psor] if morph_person_psor is not None: morphology.append(morph_person_psor) elif token.pos == SYM: # Case if "SIJAMUOTO" in analysis: morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]]) elif token.pos == X: # Foreign if token.tag == self.foreign_tag: morphology.append('Foreign=Yes') return "|".join(morphology) if morphology else None def lemmatize(self, token: Token, analysis: dict) -> str: cached_lower = None exc_table = self.lookups.get_table("lemma_exc", {}).get(token.pos) if exc_table is not None: cached_lower = token.orth_.lower() exc = exc_table.get(cached_lower) if exc: return exc # Some exceptions to Voikko's lemmatization algorithm to # better match UD lemmas if token.pos in (AUX, VERB) and "PARTICIPLE" in analysis: return self._participle_lemma(analysis) elif token.pos == NOUN and analysis.get("MOOD") == "MINEN-infinitive": return self._minen_noun_lemma(analysis) elif token.pos in (NOUN, NUM, PROPN) and (colon_i := token.orth_.find(":")) > 0: # Lemma of inflected abbreviations: BBC:n, 
EU:ssa return token.orth_[:colon_i] elif token.pos == ADV: cached_lower = cached_lower or token.orth_.lower() return self._adv_lemma(analysis, cached_lower)