Example #1
    def tokenize(self, text: str):
        tokens = []

        if self.with_postag:
            response = self.kytea.getTagsToString(text)

            # FIXME The following dirty workaround is required to
            #       process inputs that themselves contain a <whitespace>
            #       character (e.g. "私 は猫")
            response = response.replace("\\ ",
                                        "<SPACE>").replace("  ", " <SPACE>")

            for elem in response.split(" ")[:-1]:
                # FIXME If the input contains the character "/",
                #       KyTea outputs e.g. "//補助記号/・",
                #       which breaks the naive elem.split("/")
                pron, postag, surface = map(lambda e: e[::-1],
                                            elem[::-1].split("/", maxsplit=2))
                surface = surface.replace("<SPACE>", " ")
                tokens.append(Token(surface=surface, postag=postag, pron=pron))

        else:
            for surface in list(self.kytea.getWS(text)):
                tokens.append(Token(surface=surface))

        return tokens
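The reversed split above is effectively a split from the right, so a literal "/" in a token stays attached to the surface form instead of shifting the POS and pronunciation fields. A minimal standalone sketch of the same trick, using the string from the FIXME comment above:

elem = "//補助記号/・"  # KyTea output when the input contains "/"

# A naive elem.split("/") yields ['', '', '補助記号', '・'] and breaks the
# three-way unpacking. Reversing, splitting at most twice, and reversing
# back is equivalent to splitting from the right:
pron, postag, surface = map(lambda e: e[::-1], elem[::-1].split("/", maxsplit=2))
assert (surface, postag, pron) == ("/", "補助記号", "・")

# str.rsplit gives the same result directly:
assert tuple(elem.rsplit("/", maxsplit=2)) == ("/", "補助記号", "・")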
Example #2
    def tokenize(self, text: str) -> List[Token]:
        return_result = []
        parse_result = self.janome.tokenize(text)

        if self.with_postag:
            for morph in parse_result:
                surface = morph.surface
                postag, postag2, postag3, postag4 = morph.part_of_speech.split(",")
                inflection = morph.infl_type
                conjugation = morph.infl_form
                base_form = morph.base_form
                yomi = morph.reading
                pron = morph.phonetic

                token = Token(
                    surface=surface,
                    postag=postag,
                    postag2=postag2,
                    postag3=postag3,
                    postag4=postag4,
                    inflection=inflection,
                    conjugation=conjugation,
                    base_form=base_form,
                    yomi=yomi,
                    pron=pron)
                return_result.append(token)

        else:
            for morph in parse_result:
                return_result.append(Token(surface=morph.surface))

        return return_result
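The four-way unpacking above relies on Janome exposing the POS information as one comma-separated string with exactly four fields. A minimal sketch of the attributes used above for a single morpheme (the printed values are illustrative and depend on the dictionary):

from janome.tokenizer import Tokenizer

janome_tokenizer = Tokenizer()
morph = list(janome_tokenizer.tokenize("猫"))[0]

# part_of_speech is a single string such as "名詞,一般,*,*",
# so splitting on "," yields the four POS levels used above.
print(morph.surface)                     # 猫
print(morph.part_of_speech.split(","))   # e.g. ['名詞', '一般', '*', '*']
print(morph.infl_type, morph.infl_form)  # e.g. * *
print(morph.base_form, morph.reading, morph.phonetic)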
Example #3
    def tokenize(self, text: str):
        """Tokenize."""
        result = []
        for token in self.tokenizer.tokenize(text, self.mode):
            surface = token.surface()
            if self.with_postag:
                postag, postag2, postag3, postag4, \
                    inflection, conjugation = token.part_of_speech()
                base_form = token.dictionary_form()
                normalized_form = token.normalized_form()
                yomi = token.reading_form()
                result.append(Token(
                    surface=surface,
                    postag=postag,
                    postag2=postag2,
                    postag3=postag3,
                    postag4=postag4,
                    inflection=inflection,
                    conjugation=conjugation,
                    base_form=base_form,
                    normalized_form=normalized_form,
                    yomi=yomi,
                ))

            else:
                result.append(Token(
                    surface=surface
                ))
        return result
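Here the six-way unpacking assumes SudachiPy's part_of_speech() returns the four POS levels plus the inflection type and conjugation form. A minimal sketch of the same calls against one morpheme, assuming SudachiPy and a dictionary package such as sudachidict_core are installed (printed values are illustrative):

from sudachipy import dictionary, tokenizer

sudachi = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.A
morpheme = sudachi.tokenize("医薬品", mode)[0]

# part_of_speech() has six elements; the last two feed
# inflection and conjugation above.
postag, postag2, postag3, postag4, inflection, conjugation = morpheme.part_of_speech()
print(postag, morpheme.dictionary_form(),
      morpheme.normalized_form(), morpheme.reading_form())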
Example #4
def test_postagging_with_mecab():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("natto-py is not installed.")

    expect = [Token(**kwargs) for kwargs in mecab_tokens_list]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #5
def test_postagging_with_kytea():
    """Test KyTea tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="kytea", with_postag=True)
    except ImportError:
        pytest.skip("KyTea is not installed.")

    expect = [Token(**kwargs) for kwargs in kytea_tokens_list]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #6
def test_postagging_with_janome():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="janome", with_postag=True)
    except ImportError:
        pytest.skip("Janome is not installed.")

    expect = [Token(**kwargs) for kwargs in janome_tokens_list]
    result = tokenizer.tokenize("すもももももももものうち")
    assert expect == result
Example #7
def test_word_tokenize_with_janome():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="janome", with_postag=True)
    except ImportError:
        pytest.skip("skip janome")

    expect = [Token(**kwargs) for kwargs in janome_tokens_list]
    result = tokenizer.tokenize(SENTENCE3)
    assert expect == result
Example #8
def test_word_tokenize_with_kytea():
    """Test KyTea tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="kytea", with_postag=True)
    except ImportError:
        pytest.skip("skip kytea")

    expect = [Token(**kwargs) for kwargs in kytea_tokens_list]
    result = tokenizer.tokenize(SENTENCE1)
    assert expect == result
Example #9
def test_word_tokenize_with_mecab():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("skip mecab")

    expect = [Token(**kwargs) for kwargs in mecab_tokens_list]
    result = tokenizer.tokenize(SENTENCE1)
    assert expect == result
Example #10
def test_word_tokenize_with_kytea():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("Mykytea is not installed.")

    tokenizer = WordTokenizer(tokenizer="KyTea")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で あ る".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #11
def test_word_tokenize_with_janome():
    try:
        import janome
        del janome
    except ImportError:
        pytest.skip("janome is not installed")

    tokenizer = WordTokenizer(tokenizer="Janome")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #12
def test_word_tokenize_with_mecab_whitespace():
    try:
        import natto
        del natto
    except ImportError:
        pytest.skip("natto-py is not installed")

    tokenizer = WordTokenizer(tokenizer="MeCab")
    expect = [Token(surface=w) for w in "吾輩 は   で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は である")
    assert expect == result
Example #13
def test_word_tokenize_with_kytea_using_custom_model():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("KyTea is not installed.")

    tokenizer = WordTokenizer(tokenizer="KyTea", model_path="data/model.knm")
    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #14
def test_word_tokenize_with_sudachi_mode_a():
    try:
        import sudachipy
        del sudachipy
    except ImportError:
        pytest.skip("sudachipy is not installed")

    tokenizer = WordTokenizer(tokenizer="Sudachi", mode="A")
    expect = [Token(surface=w) for w in "医薬 品 安全 管理 責任 者".split(" ")]
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert expect == result
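Mode "A" asks Sudachi for its shortest split units, which is why 医薬品安全管理責任者 comes back as six tokens here. For comparison, a hedged sketch of the coarser modes, assuming the wrapper accepts "B" and "C" the same way it accepts "A"; the expected surfaces follow the example in the Sudachi documentation and are not asserted anywhere in this test suite:

tokenizer_b = WordTokenizer(tokenizer="Sudachi", mode="B")
print([t.surface for t in tokenizer_b.tokenize("医薬品安全管理責任者")])
# per the Sudachi docs: ['医薬品', '安全', '管理', '責任者']

tokenizer_c = WordTokenizer(tokenizer="Sudachi", mode="C")
print([t.surface for t in tokenizer_c.tokenize("医薬品安全管理責任者")])
# per the Sudachi docs: ['医薬品安全管理責任者']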
Example #15
def test_word_tokenize_with_sudachi_mode_a():
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi",
                                  mode="A",
                                  with_postag=True)
    except ImportError:
        pytest.skip("skip sudachi")

    expect = [Token(**kwargs) for kwargs in sudachi_tokens_list]
    result = tokenizer.tokenize(SENTENCE2)
    assert expect == result
Example #16
def test_postagging_with_sudachi_mode_a():
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi",
                                  mode="A",
                                  with_postag=True)
    except ImportError:
        pytest.skip("SudachiPy is not installed.")

    expect = [Token(**kwargs) for kwargs in sudachi_tokens_list]
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert expect == result
Example #17
def test_word_tokenize_with_sentencepiece():
    try:
        import sentencepiece
        del sentencepiece
    except ImportError:
        pytest.skip("Sentencepiece is not installed.")

    tokenizer = WordTokenizer(tokenizer="Sentencepiece",
                              model_path="data/model.spm")
    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #18
def test_token_with_postag2():
    token = Token(
        surface="大崎",
        postag="名詞",
        postag2="固有名詞,人名,姓",
        inflection="*",
        conjugation="*",
        base_form="大崎",
        yomi="オオサキ",
        pron="オーサキ")

    truth = "名詞,固有名詞,人名,姓,*,*,大崎,オオサキ,オーサキ"
    assert token.feature == truth
Example #19
    def tokenize(self, text: str) -> List[Token]:
        """Tokenize"""
        return_result = []
        parse_result = self.mecab.parse(text).rstrip(" ")
        if self.with_postag:
            for elem in parse_result.split("\n")[:-1]:
                (
                    surface,
                    postag,
                    postag2,
                    postag3,
                    postag4,
                    inflection,
                    conjugation,
                    base_form,
                    yomi,
                    pron,
                ) = self.parse_feature(elem)

                token = Token(
                    surface=surface,
                    postag=postag,
                    postag2=postag2,
                    postag3=postag3,
                    postag4=postag4,
                    inflection=inflection,
                    conjugation=conjugation,
                    base_form=base_form,
                    yomi=yomi,
                    pron=pron,
                )
                return_result.append(token)
        else:
            for surface in parse_result.split(" "):
                return_result.append(Token(surface=surface))

        return return_result
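parse_feature is not shown in this example. With MeCab's default IPADIC output format, each line is the surface form, a tab, and then "POS,POS2,POS3,POS4,inflection,conjugation,base_form,reading,pronunciation", so one hedged way to implement it (a sketch, not necessarily the library's actual code; unknown words can omit trailing fields, hence the padding) is:

    def parse_feature(self, elem: str):
        """Split one MeCab output line into the surface plus nine feature fields."""
        surface, feature = elem.split("\t")
        fields = feature.split(",")
        fields += ["*"] * (9 - len(fields))  # pad short feature lists
        return [surface] + fields[:9]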
Example #20
def test_kytea_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("skip s3 test because of missing boto3")

    try:
        tokenizer = WordTokenizer(
            tokenizer="KyTea", model_path="s3://konoha-demo/kytea/model.knm")
    except ImportError:
        pytest.skip("skip kytea")

    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #21
def test_sentencepiece_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("skip s3 test because of missing boto3")

    try:
        tokenizer = WordTokenizer(
            tokenizer="SentencePiece",
            model_path="s3://konoha-demo/sentencepiece/model.spm")
    except ImportError:
        pytest.skip("skip sentencepiece")

    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #22
def test_word_tokenize_with_whitespace():
    tokenizer = WordTokenizer(tokenizer="Whitespace")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩 は 猫 で ある")
    assert expect == result
Example #23
    def tokenize(self, text: str):
        result = []
        for subword in self.tokenizer.EncodeAsPieces(text):
            token = Token(surface=subword)
            result.append(token)
        return result
Example #24
    def tokenize(self, text: str):
        return [Token(surface=surface) for surface in text.split(" ")]
Example #25
def test_token_without_feature():
    token = Token(surface="大崎")
    assert "大崎" == token.surface
    assert "" == token.feature
Example #26
    def tokenize(self, text: str):
        return [Token(surface=char) for char in list(text)]
Example #27
def test_word_tokenize_with_character():
    tokenizer = WordTokenizer(tokenizer="Character")
    expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #28
def test_token_with_postag():
    token = Token(surface="大崎", postag="名詞")
    assert "大崎" == token.surface
    assert "名詞" == token.feature