Example #1
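A parametrized test for token_from_char_pos: it builds a Doc from a list of words, looks up the token covering character offset i, and checks the token's text against the expected value (None when no token covers the offset).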
def test_get_doc_char_pos(tokens: List[str], i: int, expected: Optional[str]):
    doc = Doc.from_words(tokens)
    token = token_from_char_pos(doc, i)
    if expected is None:
        assert token is None
    else:
        assert token is not None
        assert token.text == expected
Example #2
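Converting Juman tokenizer output into a Doc: each ShortUnitWord contributes its surface form plus trailing whitespace, and the POS tag, lemma, and Juman feature string are copied onto the corresponding tokens.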
def _dtokens_to_doc(self, dtokens: List[ShortUnitWord]) -> Doc:
    # Concatenate each surface form with its trailing whitespace so the
    # reconstructed text keeps the original character offsets.
    words = [x.surface + x.space for x in dtokens]
    doc = Doc.from_words(words)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        token.lemma_ = dtoken.lemma
        self.set_juman_fstring(token, dtoken.fstring)
    return doc
Example #3
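The MeCab counterpart of the conversion above, with one difference: MeCab reports "*" when it has no lemma, so the surface text is used as the lemma in that case.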
def __call__(self, text: str) -> Doc:
    dtokens = self.detailed_tokens(text)
    words = [x.surface + x.space for x in dtokens]
    doc = Doc.from_words(words)
    for token, dtoken in zip(doc, dtokens):
        token.tag_ = dtoken.pos
        # MeCab uses "*" when no lemma is available; fall back to the
        # surface text in that case.
        token.lemma_ = dtoken.lemma if dtoken.lemma != "*" else token.text
        self.set_mecab_fstring(token, dtoken.fstring)
    return doc
Example #4
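A SentencePiece-based tokenizer: EncodeAsPieces returns subword pieces whose word boundaries carry a meta symbol (SPACE_CHAR, conventionally "▁"). A piece that is only the boundary marker is dropped from the front, the remaining markers are mapped back to plain spaces, and the original pieces are stored on the Doc.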
def __call__(self, text: str) -> Doc:
    pieces: List[str] = self.tokenizer.EncodeAsPieces(text)  # type: ignore
    # Drop a leading piece that is only the boundary marker, then map the
    # remaining markers back to plain spaces.
    if pieces and pieces[0] == self.SPACE_CHAR:
        _tokens = pieces[1:]
    else:
        _tokens = pieces
    tokens = [token.replace(self.SPACE_CHAR, " ") for token in _tokens]
    doc = Doc.from_words(tokens)
    self.set_spm_pieces(doc, pieces)
    return doc
Example #5
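A property-based test that cross-checks token_from_char_pos against a straightforward reference implementation on generated inputs.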
def test_get_doc_char_pos_hyp(data: Tuple[List[str], int]):
    tokens, i = data
    doc = Doc.from_words(tokens)
    ret = token_from_char_pos(doc, i)
    expected = _simple_get_doc_char_pos(doc, i)
    assert ret is expected
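The reference helper is not shown in the source. A minimal sketch of what _simple_get_doc_char_pos might look like, assuming tokens expose the start_char/end_char offsets used in Example #6:

def _simple_get_doc_char_pos(doc, i):
    # Hypothetical reference: linear scan over the tokens, returning the
    # token whose [start_char, end_char) span contains offset i, else None.
    for token in doc.tokens:
        if token.start_char <= i < token.end_char:
            return token
    return None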
Example #6
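A round-trip test for Doc.from_words: the document text is the concatenation of the input words, and each token's start_char/end_char offsets slice the original word back out of the text.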
def test_doc(words: List[str]):
    doc = Doc.from_words(words)
    assert doc.text == "".join(words)
    assert doc.tokens is not None
    for word, token in zip(words, doc.tokens):
        assert word == doc.text[token.start_char:token.end_char]