def r(): """ Provide a module-scoped Greynir instance as a test fixture """ from reynir import Greynir r = Greynir() yield r # Do teardown here r.__class__.cleanup()
def parse_text_file(file_handle, affix_lemma=1, id_prefix=None, start_index=1, **options):
    """ Parse contiguous text into reynir simple trees in bracket format """
    text = file_handle.read()
    r = Greynir(**options)
    dd = r.parse(text)
    for idx, sent in enumerate(dd["sentences"]):
        nltk_tree = reynir_sentence_to_annotree(sent)
        id_prefix = "" if id_prefix is None else id_prefix
        id_str = "{}.{}".format(id_prefix, idx)
        meta_node = AnnoTree(
            "META",
            [
                AnnoTree("ID-CORPUS", [id_str]),
                AnnoTree("ID-LOCAL", [id_str]),
                AnnoTree("URL", ["greynir.is"]),
                AnnoTree("COMMENT", [""]),
            ],
        )
        meta_tree = AnnoTree("", [meta_node, nltk_tree])
        yield meta_tree
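
# Hypothetical invocation of parse_text_file above; the input file name and
# the id_prefix value are assumptions made for illustration only.
with open("article.txt", "r", encoding="utf-8") as fh:
    for tree in parse_text_file(fh, id_prefix="greynir_corpus"):
        print(tree)  # one bracketed annotation tree per parsed sentence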
def lemmas(q: str, all_lemmas: bool = False) -> Response:
    """ Lemmatization API. """
    if not q:
        return _err("Missing query parameter")
    if len(q) > _MAX_LEMMAS_TXT_LEN:
        return _err(f"Param exceeds max length ({_MAX_LEMMAS_TXT_LEN} chars)")

    # Lazy-load Greynir engine
    global greynir
    if greynir is None:
        greynir = Greynir()

    resp: Dict[str, Any] = dict(q=q)
    try:
        lem: List[Any] = []
        for m in greynir.lemmatize(q, all_lemmas=all_lemmas):
            # TODO: postprocess in some way?
            lem.append(m)
        resp["err"] = False
        resp["lemmas"] = lem
    except Exception as e:
        return _err(f"An error occurred while lemmatizing the text: '{e}'")

    return JSONResponse(content=resp)
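
# A sketch of calling the endpoint above from a client, assuming the route
# is mounted at /lemmas on a locally running server (the URL, port, and
# query text are assumptions):
import requests

res = requests.get(
    "http://localhost:8000/lemmas",
    params={"q": "Hestarnir hlupu yfir túnið", "all_lemmas": "false"},
)
print(res.json())  # {"q": ..., "err": false, "lemmas": [...]}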
def lemmatize(self, sent: Iterable[Tok]) -> Iterable[LemmaTuple]:
    """ Lemmatize a sentence (list of tokens), returning an iterable
        of (lemma, category) tuples """
    if self._g is None:
        # Initialize parser singleton
        self.__class__._g = Greynir()
    # Attempt to parse the sentence
    assert self._g is not None
    s = self._g.parse_tokens(sent)
    if s.tree is None:
        # Unable to parse: fall back to simple lemmatizer
        yield from super().lemmatize(sent)
    else:
        # Successfully parsed: obtain the (lemma, category) tuples
        # from the terminals of the parse tree
        yield from s.tree.lemmas_and_cats
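
# Illustrative use of the lemmatize() method above, assuming the enclosing
# class is named GreynirLemmatizer (a hypothetical name) and accepts the
# token stream produced by the tokenizer package:
from tokenizer import tokenize

toks = list(tokenize("Kötturinn elti músina."))
for lemma, cat in GreynirLemmatizer().lemmatize(toks):
    print(lemma, cat)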
assert db.cast_to_accusative("Kattarhestur") == "Kattarhest" assert db.cast_to_dative("Kattarhestur") == "Kattarhesti" assert db.cast_to_genitive("Kattarhestur") == "Kattarhests" f = lambda mm: [m for m in mm if "2" not in m.beyging] assert db.cast_to_accusative("fjórir", meaning_filter_func=f) == "fjóra" assert db.cast_to_dative("fjórir", meaning_filter_func=f) == "fjórum" assert db.cast_to_genitive("fjórir", meaning_filter_func=f) == "fjögurra" assert db.cast_to_accusative("Suður-Afríka") == "Suður-Afríku" assert db.cast_to_dative("Suður-Afríka") == "Suður-Afríku" assert db.cast_to_genitive("Suður-Afríka") == "Suður-Afríku" assert db.cast_to_accusative("Vestur-Þýskaland") == "Vestur-Þýskaland" assert db.cast_to_dative("Vestur-Þýskaland") == "Vestur-Þýskalandi" assert db.cast_to_genitive("Vestur-Þýskaland") == "Vestur-Þýskalands" f = lambda mm: sorted(mm, key=lambda m: "2" in m.beyging or "3" in m.beyging) assert db.cast_to_accusative("Kópavogur", meaning_filter_func=f) == "Kópavog" assert db.cast_to_dative("Kópavogur", meaning_filter_func=f) == "Kópavogi" assert db.cast_to_genitive("Kópavogur", meaning_filter_func=f) == "Kópavogs" if __name__ == "__main__": # When invoked as a main module, do a verbose test from reynir import Greynir r = Greynir() test_cases(r) test_casting() r.__class__.cleanup()
class Lemmatizer:
    SPLIT_WC = 100
    g = None  # Placeholder for Greynir instance
    ib = None  # Placeholder for IceBERT model instance
    device = "cpu"

    def __init__(self, use_icebert: bool = True) -> None:
        self.g = Greynir()
        if use_icebert:
            self.ib = IcebertModel.pos_from_settings()

    def lemmatize_sentence(self, sentence: _Sentence) -> TokLem:
        if sentence.tree is not None:
            return self.g_lemmatize(sentence)
        a_lemmas = []
        a_tokens = []
        tokens = sentence.tokens
        # Split words to avoid hitting the 512 token limit in IceBERT.
        # Consider making this smarter if dealing with a lot of long sentences.
        for i in range(0, len(tokens), self.SPLIT_WC):
            p_lemmas, p_tokens = self.ib_lemmatize(tokens[i : i + self.SPLIT_WC])
            a_lemmas += p_lemmas
            a_tokens += p_tokens
        return a_lemmas, a_tokens

    def lemmatize(self, text: str) -> List[Tuple[List[str], List[str], str]]:
        lemmatized = []
        parsed = self.parse(text)
        for sentence in parsed:
            lemmas, tokens = self.lemmatize_sentence(sentence)
            lemmatized.append((list(lemmas), tokens, sentence.tidy_text))
        return lemmatized

    def lemmatize_pretty(self, text: str) -> None:
        lemmatized_data = self.lemmatize(text)
        for lemmatized, tokens, _ in lemmatized_data:
            print("---")
            print("\t".join("{:10}".format(v) for v in tokens))
            print("\t".join("{:10}".format(v.lower()) for v in lemmatized))

    def g_lemmatize(self, g_sentence: _Sentence) -> TokLem:
        tokens = [t.txt for t in g_sentence.tokens]
        if g_sentence.tree is None:
            return tokens, tokens
        return g_sentence.tree.lemmas, tokens

    def ib_lemmatize(self, g_tokens: TokenList) -> TokLem:
        tokens = [t.txt for t in g_tokens]
        sent = " ".join(tokens)
        if self.ib is None:
            raise ValueError("Lemmatizer needs to be instantiated with use_icebert.")
        ifds = self.ib.predict_to_idf(sent, device=self.device)
        lemmas = []
        for idx, tok in enumerate(g_tokens):
            cands = tok.val
            if isinstance(cands, (int, float)):
                # Number
                lemmas.append(tok.txt)
                continue
            if cands and len(cands) > 1 and isinstance(cands[0], (int, float)):
                # Punctuation
                lemmas.append(tok.txt)
                continue
            if not cands:
                lemmas.append(tok.txt)
                continue
            lemm_cands = set(c.stofn for c in cands if hasattr(c, "stofn"))
            if len(lemm_cands) == 1:
                # Only one candidate, we use that one
                lemmas.append(lemm_cands.pop())
                continue
            found = False
            for c in cands:
                if hasattr(c, "name"):
                    lemmas.append(c.name)
                    found = True
                    break
                if isinstance(c[0], int):
                    lemmas.append(tok.txt)
                    found = True
                    break
                try:
                    ifd = IFD_Tagset(
                        k=tok.kind,
                        c=c.ordfl,
                        t=c.ordfl,
                        f=c.fl,
                        txt=tok.txt,
                        s=c.stofn,
                        b=c.beyging,
                    )
                except:  # noqa
                    lemmas.append(tok.txt)
                    found = True
                    break
                try:
                    str_ifd = str(ifd)
                except TypeError:
                    # Some oddity in ifdtagger
                    str_ifd = ""
                if str_ifd == ifds[idx]:
                    lemmas.append(c.stofn)
                    found = True
                    break
            if not found:
                lemmas.append(tok.txt)
        return lemmas, tokens

    def parse(self, text: str) -> List[_Sentence]:
        if self.g is None:
            raise ValueError("Greynir needs to be instantiated.")
        text = text.replace("\n", " ").replace("  ", " ")
        return self.g.parse(text)["sentences"]  # type: ignore
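
# Illustrative use of the Lemmatizer class above; the sample text is an
# assumption. With use_icebert=False the IceBERT model is never loaded, so
# only sentences that Greynir parses successfully are lemmatized via
# g_lemmatize(); an unparsable sentence would raise in ib_lemmatize().
lm = Lemmatizer(use_icebert=False)
lm.lemmatize_pretty("Stúlkan keypti fallega kjóla í gær.")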