Example #1
def ifd_tag_person(txt, p):
    # Render an IFD tag for a person-name token: gender and case come from
    # the person record p, the canonical name is the stem, the surface text is x
    i = IFD_Tagset(k="PERSON",
                   c="person",
                   g=p.gender,
                   x=txt,
                   s=p.name,
                   t="person_" + p.gender + "_" + p.case)
    return str(i)
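
The person record p only needs gender, case and name attributes, so the function can be exercised with any lightweight stand-in. A minimal sketch; the reynir.ifdtagger import path is an assumption (GreynirPackage), not something the example shows:

    # Hypothetical driver for Example #1; SimpleNamespace stands in for
    # whatever person record the caller normally passes as p
    from types import SimpleNamespace
    from reynir.ifdtagger import IFD_Tagset  # assumed import path

    p = SimpleNamespace(gender="kvk", case="nf", name="Anna")
    print(ifd_tag_person("Anna", p))  # prints the rendered IFD tag string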
Example #2
def ifd_tag(kind, txt, m):
    # Map a BÍN meaning m onto IFD tagset keywords: ordfl is the word
    # category, fl the word class, stofn the stem and beyging the inflection
    i = IFD_Tagset(k=TOK.descr[kind],
                   c=m.ordfl,
                   t=m.ordfl,
                   f=m.fl,
                   x=txt,
                   s=m.stofn,
                   b=m.beyging)
    return str(i)
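
Read together, Examples #1 and #2 suggest the keyword convention used throughout: k is the token kind, c the word category, f the word class, x the surface text, s the stem or lemma, b the inflection, g the gender and t a type string. In Example #2 these come straight from the BÍN meaning m, whose ordfl, fl, stofn and beyging fields hold exactly those values.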
Example #3
def tag_stream(sentence_stream):
    """ Generator for tag stream from a token stream """
    # Note: n (the tagger's n-gram order) and self.lemma_cnt (a lemma/tag
    # frequency counter) are free variables here, captured from the
    # enclosing method's scope in the original source
    for sent in sentence_stream:
        if not sent:
            continue
        # For each sentence, start and end with n - 1 empty strings
        for _ in range(n - 1):
            yield ""
        for t in sent:
            tag = None
            # Skip punctuation
            if t.get("k", TOK.WORD) != TOK.PUNCTUATION:
                canonicalize_token(t)
                tag = str(IFD_Tagset(t))
                if tag:
                    self.lemma_cnt[t["x"]][tag] += 1
            if tag:
                yield tag
        for _ in range(n - 1):
            yield ""
Example #4
def tag_stream(sentence_stream: Iterable[Iterable[TokenDict]]) -> Iterator[str]:
    """ Generator for tag stream from a token stream """
    # As in Example #3, n and self.lemma_cnt are captured from the
    # enclosing method's scope in the original source
    for sent in sentence_stream:
        if not sent:
            continue
        # For each sentence, start and end with n - 1 empty strings
        for _ in range(n - 1):
            yield ""
        for t in sent:
            tag = None
            # Skip punctuation
            if t.get("k", TOK.WORD) != TOK.PUNCTUATION:
                ct = canonicalize_token(t)
                tag = str(IFD_Tagset(ct))
                if tag:
                    self.lemma_cnt[ct.get("x", "")][tag] += 1
            if tag:
                yield tag
        for _ in range(n - 1):
            yield ""
Example #5
def ifd_taglist_entity(txt):
    i = IFD_Tagset(c="entity", x=txt)
    return [(str(i), 1.0)]
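
Because only the category and the surface text are passed, the entity tagset carries no inflectional features, and the function always returns a single (tag, probability) pair with probability 1.0:

    # Hypothetical call, given the definitions above
    (tag, prob), = ifd_taglist_entity("Microsoft")
    assert prob == 1.0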
Example #6
    def ib_lemmatize(self, g_tokens: TokenList) -> TokLem:
        """ Lemmatize a token list by matching IceBERT-predicted IFD tags
            against each token's BÍN candidate meanings """
        tokens = [t.txt for t in g_tokens]
        sent = " ".join(tokens)
        if self.ib is None:
            raise ValueError("Lemmatizer needs to be instantiated with use_icebert.")
        ifds = self.ib.predict_to_idf(sent, device=self.device)
        lemmas = []

        for idx, tok in enumerate(g_tokens):
            cands = tok.val
            if isinstance(cands, (int, float)):
                # Number: the surface text is its own lemma
                lemmas.append(tok.txt)
                continue
            if cands and len(cands) > 1 and isinstance(cands[0], (int, float)):
                # Punctuation: use the surface text
                lemmas.append(tok.txt)
                continue
            if not cands:
                # No candidate meanings: fall back to the surface text
                lemmas.append(tok.txt)
                continue

            lemm_cands = {c.stofn for c in cands if hasattr(c, "stofn")}
            if len(lemm_cands) == 1:
                # Only one candidate lemma; use it
                lemmas.append(lemm_cands.pop())
                continue

            found = False
            for c in cands:
                if hasattr(c, "name"):
                    # Person/entity candidate: its name is the lemma
                    lemmas.append(c.name)
                    found = True
                    break
                if isinstance(c[0], int):
                    # Tuple-valued candidate: keep the surface text
                    lemmas.append(tok.txt)
                    found = True
                    break
                try:
                    ifd = IFD_Tagset(
                        k=tok.kind,
                        c=c.ordfl,
                        t=c.ordfl,
                        f=c.fl,
                        x=tok.txt,  # keyword x for the surface text, as in the other examples
                        s=c.stofn,
                        b=c.beyging,
                    )
                except Exception:
                    # Tagset construction can fail for unusual meanings
                    lemmas.append(tok.txt)
                    found = True
                    break
                try:
                    str_ifd = str(ifd)
                except TypeError:
                    # Some oddity in ifdtagger
                    str_ifd = ""
                if str_ifd == ifds[idx]:
                    # Candidate's tag matches the predicted IFD tag
                    lemmas.append(c.stofn)
                    found = True
                    break
            if not found:
                lemmas.append(tok.txt)

        return lemmas, tokens
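
Example #6 treats lemmatization as tag matching: IceBERT predicts one IFD tag per token (self.ib.predict_to_idf), each BÍN candidate meaning is rendered into an IFD tag via IFD_Tagset, and the first candidate whose rendered tag equals the prediction supplies the lemma (its stofn field). Every failure mode (no candidates, a tagset construction error, no matching tag) falls back to the surface text, so the function always returns exactly one lemma per token.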