def _decode_bio(
    text: str,
    tokens: List[str],
    mask: List[int],
    labels: List[str],
) -> Doc:
    """Create `Doc` from transformers output. Only supports the BIO label scheme."""
    assert len(labels) == len(tokens)
    doc = Doc(text)

    # get Ent
    ents: List[Ent] = []
    cur_ent: Optional[Ent] = None
    tokens = _norm_tokens(tokens, mask)
    for span_lists, label in zip(textspan.get_original_spans(tokens, text), labels):
        if not span_lists:
            # special tokens should hit here
            continue
        l = span_lists[0][0]
        r = span_lists[-1][1]
        if label.startswith("I-") and cur_ent and cur_ent.label == label[2:]:
            # expand previous entity
            cur_ent.end_char = r
        elif label.startswith("I-") or label.startswith("B-"):
            # new entity
            if cur_ent:
                ents.append(cur_ent)
            cur_ent = Ent(l, r, doc, label=label[2:])
    if cur_ent:
        ents.append(cur_ent)
    doc.ents = ents
    return doc
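# Illustrative sketch (not part of the original module): how _decode_bio might be
# called on the output of a transformers token-classification head. The tokens,
# mask and labels below are hypothetical, and the exact mask semantics depend on
# _norm_tokens, which is defined elsewhere.
#
#   doc = _decode_bio(
#       "John lives in New York",
#       tokens=["[CLS]", "John", "lives", "in", "New", "York", "[SEP]"],
#       mask=[0, 1, 1, 1, 1, 1, 0],
#       labels=["O", "B-PER", "O", "O", "B-LOC", "I-LOC", "O"],
#   )
#   # Consecutive B-LOC / I-LOC spans are merged, so doc.ents would hold
#   # a PER entity for "John" and a LOC entity for "New York".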
def test_get_doc_char_pos(tokens: List[str], i: int, expected: Optional[str]):
    doc = Doc.from_words(tokens)
    token = token_from_char_pos(doc, i)
    if expected is None:
        assert token is None
    else:
        assert token is not None
        assert token.text == expected
def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
    words = [x.surface + x.space for x in dtokens]
    doc = Doc.from_words(words)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        token.lemma_ = dtoken.lemma
        self.set_juman_fstring(token, dtoken.fstring)
    return doc
def __call__(self, text: str) -> Doc:
    dtokens = self.detailed_tokens(text)
    words = [x.surface + x.space for x in dtokens]
    doc = Doc.from_words(words)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        token.lemma_ = dtoken.lemma if dtoken.lemma != "*" else token.text
        self.set_mecab_fstring(token, dtoken.fstring)
    return doc
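# Note (illustrative, values are hypothetical): MeCab outputs "*" for feature
# fields it cannot fill, so unknown words keep their surface form as the lemma.
#
#   dtoken.lemma == "走る"  ->  token.lemma_ == "走る"
#   dtoken.lemma == "*"     ->  token.lemma_ == token.text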
def __call__(self, text: str) -> Doc:
    pieces: List[str] = self.tokenizer.EncodeAsPieces(text)  # type: ignore
    if pieces and pieces[0] == self.SPACE_CHAR:
        _tokens = pieces[1:]
    else:
        _tokens = pieces
    tokens = [token.replace(self.SPACE_CHAR, " ") for token in _tokens]
    doc = Doc.from_words(tokens)
    self.set_spm_pieces(doc, pieces)
    return doc
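# Illustrative example (values are hypothetical): SentencePiece marks word-initial
# pieces with the meta symbol "▁" (SPACE_CHAR). A lone leading "▁" piece is
# dropped, and remaining markers are mapped back to spaces so that
# "".join(tokens) reconstructs the original text.
#
#   pieces  = ["▁", "Hello", "▁world"]   # self.tokenizer.EncodeAsPieces(text)
#   _tokens = ["Hello", "▁world"]        # leading SPACE_CHAR piece removed
#   tokens  = ["Hello", " world"]        # "▁" replaced by " "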
def test_get_doc_char_pos_hyp(data: Tuple[List[str], int]):
    tokens, i = data
    doc = Doc.from_words(tokens)
    ret = token_from_char_pos(doc, i)
    expected = _simple_get_doc_char_pos(doc, i)
    assert ret is expected
def test_doc(words: List[str]):
    doc = Doc.from_words(words)
    assert doc.text == "".join(words)
    assert doc.tokens is not None
    for word, token in zip(words, doc.tokens):
        assert word == doc.text[token.start_char:token.end_char]
@given(st.lists(st.text()))
def test_doc_hyp(words: List[str]):
    doc = Doc.from_words(words)
    assert doc.text == "".join(words)
    assert doc.tokens is not None
    for word, token in zip(words, doc.tokens):
        assert word == doc.text[token.start_char:token.end_char]


doc = Doc("foo")
span = Span(0, 0, doc)
token = Token(0, 0, doc)
ent = Ent(0, 0, doc)


@pytest.mark.parametrize(
    "obj,ty,ok",
    [
        (doc, DocProto, True),
        (token, TokenProto, True),
        (token, SpanProto, True),
        (span, SpanProto, True),
        (ent, EntProto, True),
        (ent, SpanProto, True),
        (span, EntProto, False),