def test_get_doc_char_pos(tokens: List[str], i: int, expected: Optional[str]):
    doc = Doc.from_words(tokens)
    token = token_from_char_pos(doc, i)
    if expected is None:
        assert token is None
    else:
        assert token is not None
        assert token.text == expected

def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
    words = [x.surface + x.space for x in dtokens]
    doc = Doc.from_words(words)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        token.lemma_ = dtoken.lemma
        self.set_juman_fstring(token, dtoken.fstring)
    return doc

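# For reference: the dtokens consumed above expose `surface`, `pos`, `lemma`,
# `fstring`, and `space` attributes. A minimal sketch of such a container is
# given below; the actual ShortUnitWord definition in the repository may differ.
from typing import NamedTuple


class ShortUnitWordSketch(NamedTuple):
    surface: str  # surface form of the morpheme
    pos: str      # part-of-speech tag assigned by the analyzer
    lemma: str    # dictionary (base) form
    fstring: str  # raw feature string from the analyzer
    space: str    # trailing whitespace ("" or " ") so surfaces concatenate back to the text
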
def __call__(self, text: str) -> Doc:
    dtokens = self.detailed_tokens(text)
    words = [x.surface + x.space for x in dtokens]
    doc = Doc.from_words(words)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        # MeCab reports "*" when no lemma is available; fall back to the surface form.
        token.lemma_ = dtoken.lemma if dtoken.lemma != "*" else token.text
        self.set_mecab_fstring(token, dtoken.fstring)
    return doc

def __call__(self, text: str) -> Doc:
    pieces: List[str] = self.tokenizer.EncodeAsPieces(text)  # type: ignore
    # SentencePiece may emit a standalone meta-space piece first; drop it so the
    # reconstructed doc text does not start with a spurious space.
    if pieces and pieces[0] == self.SPACE_CHAR:
        _tokens = pieces[1:]
    else:
        _tokens = pieces
    tokens = [token.replace(self.SPACE_CHAR, " ") for token in _tokens]
    doc = Doc.from_words(tokens)
    self.set_spm_pieces(doc, pieces)
    return doc

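# A minimal usage sketch for the SentencePiece-backed tokenizer above. The
# constructor shown here is an assumption for illustration; only the __call__
# behaviour is taken from the code above (pieces are mapped back to plain-text
# tokens and the raw pieces are stored on the resulting Doc).
#
#     tokenizer = SentencePieceTokenizer(model_path="path/to/spiece.model")  # hypothetical constructor
#     doc = tokenizer("Hello world")
#     print([t.text for t in doc], doc.text)
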
def test_get_doc_char_pos_hyp(data: Tuple[List[str], int]):
    tokens, i = data
    doc = Doc.from_words(tokens)
    ret = token_from_char_pos(doc, i)
    expected = _simple_get_doc_char_pos(doc, i)
    assert ret is expected

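# The property-based test above compares `token_from_char_pos` against a naive
# reference. A linear-scan sketch of such a reference is given below, assuming
# tokens expose `start_char`/`end_char` as in `test_doc`; the helper actually
# used by the test suite may differ.
def _simple_get_doc_char_pos_sketch(doc: Doc, i: int):
    # Return the token whose [start_char, end_char) span covers index i, else None.
    for token in doc:
        if token.start_char <= i < token.end_char:
            return token
    return None
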
def test_doc(words: List[str]):
    doc = Doc.from_words(words)
    assert doc.text == "".join(words)
    assert doc.tokens is not None
    for word, token in zip(words, doc.tokens):
        assert word == doc.text[token.start_char:token.end_char]