Example 1
def _decode_bio(
    text: str,
    tokens: List[str],
    mask: List[int],
    labels: List[str],
) -> Doc:
    """Create `Doc` from transformers output. Only support BIO label scheme."""
    assert len(labels) == len(tokens)
    doc = Doc(text)

    # get Ent
    ents: List[Ent] = []
    cur_ent: Optional[Ent] = None
    tokens = _norm_tokens(tokens, mask)
    for span_lists, label in zip(textspan.get_original_spans(tokens, text),
                                 labels):
        if not span_lists:
            # special tokens should hit here
            continue
        l = span_lists[0][0]
        r = span_lists[-1][1]
        if label.startswith("I-") and cur_ent and cur_ent.label == label[2:]:
            # expand previous entity
            cur_ent.end_char = r
        elif label.startswith("I-") or label.startswith("B-"):
            # new entity
            if cur_ent:
                ents.append(cur_ent)
            cur_ent = Ent(l, r, doc, label=label[2:])
    if cur_ent:
        ents.append(cur_ent)
    doc.ents = ents
    return doc
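A minimal sketch of how `_decode_bio` might be exercised. The input values below are illustrative only (not from the original project), and the mask convention (1 marks a real token) is an assumption, since `_norm_tokens` is not shown in this snippet.

# Illustrative inputs (hypothetical); one BIO label per transformer token.
text = "John lives in Berlin"
tokens = ["John", "lives", "in", "Berlin"]
mask = [1, 1, 1, 1]                      # assumed: 1 marks a real (non-special) token
labels = ["B-PER", "O", "O", "B-LOC"]

doc = _decode_bio(text, tokens, mask, labels)
for ent in doc.ents:
    # Each Ent carries character offsets into the original text.
    print(ent.label, text[ent.start_char:ent.end_char])
# Expected under these assumptions: "PER John" and "LOC Berlin"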
Example 2
def test_get_doc_char_pos(tokens: List[str], i: int, expected: Optional[str]):
    doc = Doc.from_words(tokens)
    token = token_from_char_pos(doc, i)
    if expected is None:
        assert token is None
    else:
        assert token is not None
        assert token.text == expected
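The parametrization for this test is not part of the snippet. A plausible, purely hypothetical decorator would pair a word list, a character index into the concatenated text, and the expected surface string, with None for out-of-range positions:

# Hypothetical cases; indices refer to positions in the concatenated text "foobar".
@pytest.mark.parametrize(
    "tokens,i,expected",
    [
        (["foo", "bar"], 0, "foo"),    # char 0 ('f') falls inside "foo"
        (["foo", "bar"], 3, "bar"),    # char 3 ('b') falls inside "bar"
        (["foo", "bar"], 100, None),   # past the end of the text: no token
    ],
)
def test_get_doc_char_pos(tokens: List[str], i: int, expected: Optional[str]):
    ...  # body as in the example above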
Example 3
def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
    words = [x.surface + x.space for x in dtokens]
    doc = Doc.from_words(words)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        token.lemma_ = dtoken.lemma
        self.set_juman_fstring(token, dtoken.fstring)
    return doc
Example 4
def __call__(self, text: str) -> Doc:
    dtokens = self.detailed_tokens(text)
    words = [x.surface + x.space for x in dtokens]
    doc = Doc.from_words(words)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        token.lemma_ = dtoken.lemma if dtoken.lemma != "*" else token.text
        self.set_mecab_fstring(token, dtoken.fstring)
    return doc
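Examples 3 and 4 share one conversion pattern: join each detailed token's surface form and trailing space into words, build a `Doc`, then copy per-token attributes across. The `ShortUnitWord` records they consume are not shown; a plausible shape, inferred only from the attributes accessed above, could look like this:

from typing import NamedTuple

class ShortUnitWord(NamedTuple):  # assumed shape; only these fields appear in the examples
    surface: str  # token text as emitted by the tokenizer
    space: str    # trailing whitespace ("" or " ") re-attached to the token
    pos: str      # part-of-speech tag, copied to token.tag_
    lemma: str    # lemma, copied to token.lemma_ ("*" means "none" in the MeCab case)
    fstring: str  # raw feature string, stored via set_juman_fstring / set_mecab_fstring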
Example 5
def __call__(self, text: str) -> Doc:
    pieces: List[str] = self.tokenizer.EncodeAsPieces(text)  # type: ignore
    if pieces and pieces[0] == self.SPACE_CHAR:
        _tokens = pieces[1:]
    else:
        _tokens = pieces
    tokens = [token.replace(self.SPACE_CHAR, " ") for token in _tokens]
    doc = Doc.from_words(tokens)
    self.set_spm_pieces(doc, pieces)
    return doc
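SentencePiece marks word boundaries with the meta symbol "▁" (U+2581). The example above drops a leading standalone marker piece and converts the remaining markers back to spaces before building the `Doc`. A self-contained sketch of that normalization step, assuming `self.SPACE_CHAR` is "▁":

from typing import List

SPACE_CHAR = "\u2581"  # SentencePiece boundary marker "▁" (assumed value of self.SPACE_CHAR)

def pieces_to_words(pieces: List[str]) -> List[str]:
    # Drop a leading bare boundary piece, then map markers back to plain spaces.
    if pieces and pieces[0] == SPACE_CHAR:
        pieces = pieces[1:]
    return [p.replace(SPACE_CHAR, " ") for p in pieces]

# e.g. ["▁", "Hello", "▁world"] -> ["Hello", " world"]  (concatenates back to "Hello world")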
Example 6
def test_get_doc_char_pos_hyp(data: Tuple[List[str], int]):
    tokens, i = data
    doc = Doc.from_words(tokens)
    ret = token_from_char_pos(doc, i)
    expected = _simple_get_doc_char_pos(doc, i)
    assert ret is expected
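Here `_simple_get_doc_char_pos` serves as a brute-force oracle for the property-based test; its definition is not included in the snippet. A linear scan over token offsets would fill that role (this is a sketch of the expected behavior, not the library's implementation):

def _simple_get_doc_char_pos_sketch(doc: Doc, i: int) -> Optional[Token]:
    # Brute-force reference: return the token whose [start_char, end_char)
    # interval contains character index i, or None if no token covers i.
    for token in doc.tokens or []:
        if token.start_char <= i < token.end_char:
            return token
    return None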
Example 7
def test_doc(words: List[str]):
    doc = Doc.from_words(words)
    assert doc.text == "".join(words)
    assert doc.tokens is not None
    for word, token in zip(words, doc.tokens):
        assert word == doc.text[token.start_char:token.end_char]
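The assertions above pin down two properties of `Doc.from_words`: the document text is the plain concatenation of the words, and each token's character span slices back to its word. A small worked example of the offsets those properties imply, assuming tokens are laid out contiguously in input order:

words = ["Tokyo", " ", "tower"]
doc = Doc.from_words(words)
assert doc.text == "Tokyo tower"
# Contiguous layout implies these character spans:
#   "Tokyo" -> (0, 5),  " " -> (5, 6),  "tower" -> (6, 11)
assert [(t.start_char, t.end_char) for t in doc.tokens] == [(0, 5), (5, 6), (6, 11)]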
Example 8
@given(st.lists(st.text()))
def test_doc_hyp(words: List[str]):
    doc = Doc.from_words(words)
    assert doc.text == "".join(words)
    assert doc.tokens is not None
    for word, token in zip(words, doc.tokens):
        assert word == doc.text[token.start_char:token.end_char]


doc = Doc("foo")
span = Span(0, 0, doc)
token = Token(0, 0, doc)
ent = Ent(0, 0, doc)


@pytest.mark.parametrize(
    "obj,ty,ok",
    [
        (doc, DocProto, True),
        (token, TokenProto, True),
        (token, SpanProto, True),
        (span, SpanProto, True),
        (ent, EntProto, True),
        (ent, SpanProto, True),
        (span, EntProto, False),