Example #2
def test_postagging_with_janome():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="janome", with_postag=True)
    except ImportError:
        pytest.skip("Janome is not installed.")

    expect = [Token(**kwargs) for kwargs in janome_tokens_list]
    result = tokenizer.tokenize("すもももももももものうち")
    assert expect == result
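The janome_tokens_list, mecab_tokens_list, kytea_tokens_list, and sudachi_tokens_list fixtures are defined elsewhere in the test modules and are not reproduced on this page. As a rough sketch of their shape (field names assumed from the Token attributes used in Example #18 below; the values are illustrative, not the real fixture data):

# Hypothetical fixture shape: one dict of Token keyword arguments per
# expected token. The real fixtures also carry the remaining fields of the
# backend's morphological analysis.
janome_tokens_list = [
    {"surface": "すもも", "postag": "名詞"},
    {"surface": "も", "postag": "助詞"},
    # ...
]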
Example #3
def test_postagging_with_mecab():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("natto-py is not installed.")

    expect = [Token(**kwargs) for kwargs in mecab_tokens_list]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #4
def test_word_tokenize_with_janome():
    """Test Janome tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="janome", with_postag=True)
    except ImportError:
        pytest.skip("skip janome")

    expect = [Token(**kwargs) for kwargs in janome_tokens_list]
    result = tokenizer.tokenize(SENTENCE3)
    assert expect == result
Example #5
def test_word_tokenize_with_kytea():
    """Test KyTea tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="kytea", with_postag=True)
    except ImportError:
        pytest.skip("skip kytea")

    expect = [Token(**kwargs) for kwargs in kytea_tokens_list]
    result = tokenizer.tokenize(SENTENCE1)
    assert expect == result
Example #6
def test_postagging_with_kytea():
    """Test KyTea tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="kytea", with_postag=True)
    except ImportError:
        pytest.skip("KyTea is not installed.")

    expect = [Token(**kwargs) for kwargs in kytea_tokens_list]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #7
def test_word_tokenize_with_mecab():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("skip mecab")

    expect = [Token(**kwargs) for kwargs in mecab_tokens_list]
    result = tokenizer.tokenize(SENTENCE1)
    assert expect == result
Example #8
def test_word_tokenize_with_kytea():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("Mykytea is not installed.")

    tokenizer = WordTokenizer(tokenizer="KyTea")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で あ る".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #9
def test_word_tokenize_with_kytea_using_custom_model():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("KyTea is not installed.")

    tokenizer = WordTokenizer(tokenizer="KyTea", model_path="data/model.knm")
    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #10
def test_word_tokenize_with_mecab_whitespace():
    try:
        import natto
        del natto
    except ImportError:
        pytest.skip("natto-py is not installed")

    tokenizer = WordTokenizer(tokenizer="MeCab")
    expect = [Token(surface=w) for w in "吾輩 は   で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は である")
    assert expect == result
Example #11
def test_word_tokenize_with_janome():
    try:
        import janome
        del janome
    except ImportError:
        pytest.skip("janome is not installed")

    tokenizer = WordTokenizer(tokenizer="Janome")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #12
def test_word_tokenize_with_sudachi_mode_a():
    try:
        import sudachipy
        del sudachipy
    except ImportError:
        pytest.skip("sudachipy is not installed")

    tokenizer = WordTokenizer(tokenizer="Sudachi", mode="A")
    expect = [Token(surface=w) for w in "医薬 品 安全 管理 責任 者".split(" ")]
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert expect == result
Example #13
def test_word_tokenize_with_sentencepiece():
    try:
        import sentencepiece
        del sentencepiece
    except ImportError:
        pytest.skip("Sentencepiece is not installed.")

    tokenizer = WordTokenizer(tokenizer="Sentencepiece",
                              model_path="data/model.spm")
    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #14
def test_word_tokenize_with_sudachi_mode_a():
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi",
                                  mode="A",
                                  with_postag=True)
    except ImportError:
        pytest.skip("skip sudachi")

    expect = [Token(**kwargs) for kwargs in sudachi_tokens_list]
    result = tokenizer.tokenize(SENTENCE2)
    assert expect == result
Example #15
def test_postagging_with_sudachi_mode_a():
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi",
                                  mode="A",
                                  with_postag=True)
    except ImportError:
        pytest.skip("SudachiPy is not installed.")

    expect = [Token(**kwargs) for kwargs in sudachi_tokens_list]
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert expect == result
Example #16
def test_kytea_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("skip s3 test because of missing boto3")

    try:
        tokenizer = WordTokenizer(
            tokenizer="KyTea", model_path="s3://konoha-demo/kytea/model.knm")
    except ImportError:
        pytest.skip("skip kytea")

    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #17
def test_sentencepiece_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("skip s3 test because of missing boto3")

    try:
        tokenizer = WordTokenizer(
            tokenizer="SentencePiece",
            model_path="s3://konoha-demo/sentencepiece/model.spm")
    except ImportError:
        pytest.skip("skip sentencepiece")

    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #18
class KonohaTokenizer(Tokenizer):
    """An integration for AllenNLP.
    """
    def __init__(
        self,
        tokenizer_name: str = 'mecab',
        with_postag: bool = False,
        user_dictionary_path: Optional[str] = None,
        system_dictionary_path: Optional[str] = None,
        model_path: Optional[str] = None,
        mode: Optional[str] = None,
        dictionary_format: Optional[str] = None,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
    ) -> None:
        self._tokenizer = WordTokenizer(
            tokenizer=tokenizer_name,
            with_postag=with_postag,
            user_dictionary_path=user_dictionary_path,
            system_dictionary_path=system_dictionary_path,
            model_path=model_path,
            mode=mode,
            dictionary_format=dictionary_format,
        )
        self._start_tokens = start_tokens or []
        # Reversed so that tokenize() can insert each start token at
        # position 0 and still emit them in the order they were given.
        self._start_tokens.reverse()

        self._end_tokens = end_tokens or []

    @overrides
    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        return [self.tokenize(text) for text in texts]

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        konoha_tokens = self._tokenizer.tokenize(text)
        tokens = [
            Token(
                text=token.surface,
                lemma_=token.base_form,
                pos_=token.postag,
            ) for token in konoha_tokens
        ]

        for start_token in self._start_tokens:
            tokens.insert(0, Token(start_token, 0))

        for end_token in self._end_tokens:
            tokens.append(Token(end_token, -1))

        return tokens
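A minimal usage sketch of the KonohaTokenizer class above, assuming konoha, AllenNLP, and a MeCab backend are installed; the start/end token strings are illustrative placeholders, not values required by the library.

# Illustrative sketch: konoha's WordTokenizer wrapped behind the AllenNLP
# Tokenizer interface defined above.
tokenizer = KonohaTokenizer(
    tokenizer_name="mecab",
    with_postag=True,
    start_tokens=["@@START@@"],
    end_tokens=["@@END@@"],
)
tokens = tokenizer.tokenize("吾輩は猫である")
# start_tokens are reversed in __init__ and re-inserted at position 0 in
# tokenize(), so they come out here in the order given above.
print([token.text for token in tokens])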
Example #19
def test_word_tokenize_with_character():
    tokenizer = WordTokenizer(tokenizer="Character")
    expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
Example #20
def test_word_tokenizer():
    tokenizer1 = WordTokenizer(tokenizer="Whitespace")
    tokenizer2 = WordTokenizer(tokenizer="whitespace")
    assert tokenizer1.tokenizer.name == tokenizer2.tokenizer.name
Example #21
def test_word_tokenize_with_whitespace():
    tokenizer = WordTokenizer(tokenizer="Whitespace")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩 は 猫 で ある")
    assert expect == result