Example #1
    def tokenize(self, text: str) -> List[Token]:
        tokens = []

        if self._with_postag:
            response = self._tokenizer.getTagsToString(text)

            # FIXME The following dirty workaround is required to
            #       process inputs which themselves include a
            #       whitespace character (e.g. "私 は猫")
            response = response.replace("\\ ",
                                        "<SPACE>").replace("  ", " <SPACE>")

            for elem in response.split(" ")[:-1]:
                # FIXME If the input contains the character "/",
                #       KyTea outputs "//補助記号/・",
                #       which breaks a naive elem.split("/")
                pron, postag, surface = map(lambda e: e[::-1],
                                            elem[::-1].split("/", maxsplit=2))
                surface = surface.replace("<SPACE>", " ")
                tokens.append(Token(surface=surface, postag=postag, pron=pron))

        else:
            for surface in list(self._tokenizer.getWS(text)):
                tokens.append(Token(surface=surface))

        return tokens
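
The reverse-split trick above can be checked in isolation. A minimal standalone sketch, assuming KyTea's "surface/postag/pron" element format: reversing the string before splitting makes the last two slashes act as the separators, so a "/" inside the surface survives.

def split_kytea_elem(elem: str):
    # Reverse, split off the last two "/"-separated fields, then reverse back.
    pron, postag, surface = (e[::-1] for e in elem[::-1].split("/", maxsplit=2))
    return surface, postag, pron

print(split_kytea_elem("猫/名詞/ねこ"))   # ('猫', '名詞', 'ねこ')
print(split_kytea_elem("//補助記号/・"))  # ('/', '補助記号', '・')
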
Example #2
    def tokenize(self, text: str) -> List[Token]:
        return_result = []
        parse_result = self._tokenizer.tokenize(text)

        if self._with_postag:
            for morph in parse_result:
                surface = morph.surface
                postag, postag2, postag3, postag4 = morph.part_of_speech.split(
                    ",")
                inflection = morph.infl_type
                conjugation = morph.infl_form
                base_form = morph.base_form
                yomi = morph.reading
                pron = morph.phonetic

                token = Token(
                    surface=surface,
                    postag=postag,
                    postag2=postag2,
                    postag3=postag3,
                    postag4=postag4,
                    inflection=inflection,
                    conjugation=conjugation,
                    base_form=base_form,
                    yomi=yomi,
                    pron=pron,
                )
                return_result.append(token)

        else:
            for morph in parse_result:
                return_result.append(Token(surface=morph.surface))

        return return_result
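
The morph attributes used above (surface, part_of_speech, infl_type, infl_form, base_form, reading, phonetic) match janome's Token API. A minimal standalone sketch under that assumption:

from janome.tokenizer import Tokenizer

tokenizer = Tokenizer()
for morph in tokenizer.tokenize("すもももももももものうち"):
    # part_of_speech is an IPADIC-style string with four comma-separated levels.
    postag, postag2, postag3, postag4 = morph.part_of_speech.split(",")
    print(morph.surface, postag, morph.base_form, morph.reading, morph.phonetic)
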
Example #3
    def tokenize(self, text: str) -> List[Token]:
        """Tokenize."""
        result = []
        for token in self._tokenizer.tokenize(text, self._mode):
            surface = token.surface()
            if self._with_postag:
                (
                    postag,
                    postag2,
                    postag3,
                    postag4,
                    inflection,
                    conjugation,
                ) = token.part_of_speech()
                base_form = token.dictionary_form()
                normalized_form = token.normalized_form()
                yomi = token.reading_form()
                result.append(
                    Token(
                        surface=surface,
                        postag=postag,
                        postag2=postag2,
                        postag3=postag3,
                        postag4=postag4,
                        inflection=inflection,
                        conjugation=conjugation,
                        base_form=base_form,
                        normalized_form=normalized_form,
                        yomi=yomi,
                    ))

            else:
                result.append(Token(surface=surface))
        return result
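
The calls token.surface(), token.part_of_speech(), token.dictionary_form(), token.normalized_form() and token.reading_form() correspond to SudachiPy's morpheme API. A rough standalone sketch under that assumption (dictionary setup may differ):

from sudachipy import dictionary, tokenizer

tokenizer_obj = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C
for token in tokenizer_obj.tokenize("外国人参政権", mode):
    # part_of_speech() yields six fields: four POS levels plus
    # inflection type and conjugation form, as unpacked above.
    postag, postag2, postag3, postag4, inflection, conjugation = token.part_of_speech()
    print(token.surface(), postag, token.dictionary_form(), token.reading_form())
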
Example #4
    def tokenize(self, text: str) -> List[Token]:
        response = self._tokenizer.tagging(text)
        tokens = [
            Token(surface=surface, postag=postag)
            for (surface, postag) in zip(response.words, response.postags)
        ]
        return tokens
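
A tagging() call that returns an object with aligned .words and .postags lists matches nagisa's interface. A short sketch under that assumption:

import nagisa

response = nagisa.tagging("Pythonで簡単に使えるツールです")
print(response.words)    # surfaces
print(response.postags)  # one POS tag per surface, aligned by index
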
Example #5
def parse_feature_for_unidic(elem) -> Token:
    """
    UniDic format: https://unidic.ninjal.ac.jp/faq
    """
    surface, feature_line = elem.split("\t")
    features = feature_line.split(",")

    postag = features[0] or None
    postag2 = features[1] or None
    postag3 = features[2] or None
    postag4 = features[3] or None
    inflection = features[4] or None
    conjugation = features[5] or None

    if len(features) >= 10:
        yomi = features[6] or None
        base_form = features[7] or None
        pron = features[9] or None
    else:
        yomi = None
        base_form = None
        pron = None

    return Token(
        surface=surface,
        postag=postag,
        postag2=postag2,
        postag3=postag3,
        postag4=postag4,
        inflection=inflection,
        conjugation=conjugation,
        base_form=base_form,
        yomi=yomi,
        pron=pron,
    )
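
A quick sanity check with a hypothetical UniDic-style line (tab between surface and features; the concrete values are made up for illustration, and Token is assumed to expose its constructor arguments as attributes):

line = "東京\t名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,東京,トーキョー"
token = parse_feature_for_unidic(line)
# features[6], features[7] and features[9] land in yomi, base_form and pron:
print(token.surface, token.postag)              # 東京 名詞
print(token.yomi, token.base_form, token.pron)  # トウキョウ 東京 トーキョー
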
Example #6
def parse_feature_for_ipadic(elem) -> Token:
    surface, feature = elem.split("\t")
    (
        postag,
        postag2,
        postag3,
        postag4,
        inflection,
        conjugation,
        base_form,
        *other,
    ) = feature.split(",")

    # Words not in the dictionary lack the reading and pronunciation fields
    if len(other) == 2:
        yomi, pron = other
    else:
        yomi, pron = None, None

    return Token(
        surface=surface,
        postag=postag,
        postag2=postag2,
        postag3=postag3,
        postag4=postag4,
        inflection=inflection,
        conjugation=conjugation,
        base_form=base_form,
        yomi=yomi,
        pron=pron,
    )
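
For reference, a couple of hypothetical lines in MeCab's "surface<TAB>features" shape (again assuming attribute access on Token): a dictionary word carries nine IPADIC feature fields, so *other holds yomi and pron, while an out-of-dictionary word carries only seven:

known = "すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ"
unknown = "グーグル\t名詞,固有名詞,組織,*,*,*,*"
print(parse_feature_for_ipadic(known).pron)    # スモモ
print(parse_feature_for_ipadic(unknown).pron)  # None
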
Example #7
def test_batch_tokenize_with_character(raw_texts: List[str],
                                       tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
Example #8
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
Example #9
def test_batch_tokenize_with_character(raw_texts: List[str],
                                       tokenizer_params: Dict):
    if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python3.6")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
Example #10
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    if "AWS_ACCESS_KEY_ID" not in os.environ and tokenizer_params[
            "system_dictionary_path"].startswith("s3://"):
        pytest.skip("AWS credentials not found.")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
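
tokenizer_params presumably comes from a pytest fixture or parametrisation; its exact contents are not shown in this listing. A purely hypothetical shape, limited to the keys these tests actually read ("tokenizer" and "system_dictionary_path"):

tokenizer_params_candidates = [
    {"tokenizer": "mecab"},
    {"tokenizer": "kytea"},
    {"tokenizer": "mecab", "system_dictionary_path": "s3://bucket/path/to/dic"},
]
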
Example #11
    def tokenize(self, text: str) -> List[Token]:
        """Tokenize"""
        return_result = []
        parse_result = self._tokenizer.parse(text).rstrip(" ")
        if self._with_postag:
            for elem in parse_result.split("\n")[:-1]:
                return_result.append(self._parse_feature(elem))
        else:
            for surface in parse_result.split(" "):
                return_result.append(Token(surface=surface))

        return return_result
Example #12
def test_token_with_postag2():
    token = Token(
        surface="大崎",
        postag="名詞",
        postag2="固有名詞,人名,姓",
        inflection="*",
        conjugation="*",
        base_form="大崎",
        yomi="オオサキ",
        pron="オーサキ",
    )

    truth = "名詞,固有名詞,人名,姓,*,*,大崎,オオサキ,オーサキ"
    assert token.feature == truth
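
Together with the postag-only and no-feature tests further down in this listing, this shows that Token.feature joins whichever feature fields were given with commas. A rough sketch of equivalent behaviour (not the library's actual implementation):

from dataclasses import dataclass
from typing import Optional

@dataclass
class TokenSketch:
    surface: str
    postag: Optional[str] = None
    postag2: Optional[str] = None
    postag3: Optional[str] = None
    postag4: Optional[str] = None
    inflection: Optional[str] = None
    conjugation: Optional[str] = None
    base_form: Optional[str] = None
    yomi: Optional[str] = None
    pron: Optional[str] = None

    @property
    def feature(self) -> str:
        fields = [self.postag, self.postag2, self.postag3, self.postag4,
                  self.inflection, self.conjugation, self.base_form,
                  self.yomi, self.pron]
        return ",".join(field for field in fields if field is not None)
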
Example #13
    def tokenize(self, text: str) -> List[Token]:
        """Tokenize input text"""

        if isinstance(self._endpoint, str):
            endpoint = self.get_endpoint("/api/v1/tokenize")
            payload = dict(self.payload, text=text)
            token_params = self._tokenize_with_remote_host(
                endpoint=endpoint, payload=payload, headers=self.headers)
            return [
                Token.from_dict(token_param) for token_param in token_params
            ]

        else:
            assert self._tokenizer is not None
            return self._tokenizer.tokenize(text)
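
In rough terms the remote branch presumably boils down to an HTTP POST followed by Token.from_dict on each returned token dictionary. A heavily hedged sketch; the host, payload shape and response key are assumptions, since _tokenize_with_remote_host is not shown here:

import requests

response = requests.post(
    "http://localhost:8000/api/v1/tokenize",        # hypothetical host + the path built above
    json={"tokenizer": "mecab", "text": "私は猫"},  # payload shape is an assumption
)
token_params = response.json()["tokens"]            # response key is an assumption
tokens = [Token.from_dict(token_param) for token_param in token_params]
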
Example #14
    def tokenize(self, text: str) -> List[Token]:
        result = []
        for token in self._tokenizer.tokenize(text, self._mode):
            (postag, postag2, postag3, postag4, inflection, conjugation) = token.part_of_speech()
            result.append(
                Token(
                    surface=token.surface(),
                    postag=postag,
                    postag2=postag2,
                    postag3=postag3,
                    postag4=postag4,
                    inflection=inflection,
                    conjugation=conjugation,
                    base_form=token.dictionary_form(),
                    normalized_form=token.normalized_form(),
                    yomi=token.reading_form(),
                )
            )
        return result
Example #15
    def tokenize(self, text: str) -> List[Token]:
        return_result = []
        for morph in self._tokenizer.tokenize(text):
            postag, postag2, postag3, postag4 = morph.part_of_speech.split(",")
            token = Token(
                surface=morph.surface,
                postag=postag,
                postag2=postag2,
                postag3=postag3,
                postag4=postag4,
                inflection=morph.infl_type,
                conjugation=morph.infl_form,
                base_form=morph.base_form,
                yomi=morph.reading,
                pron=morph.phonetic,
            )
            return_result.append(token)

        return return_result
Example #16
    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        """Tokenize input texts"""

        if isinstance(self._endpoint, str):
            endpoint = self.get_endpoint("/api/v1/batch_tokenize")
            payload = dict(self.payload, texts=texts)
            token_params_list = self._batch_tokenize_with_remote_host(
                endpoint=endpoint,
                payload=payload,
                headers=self.headers,
            )

            tokens_list: List[List[Token]] = []
            for tokens in token_params_list:
                tokens_list.append(
                    [Token.from_dict(token) for token in tokens])

            return tokens_list

        else:
            assert self._tokenizer is not None
            return [self._tokenizer.tokenize(text) for text in texts]
Example #17
def test_token_with_postag():
    token = Token(surface="大崎", postag="名詞")
    assert "大崎" == token.surface
    assert "名詞" == token.feature
Example #18
    def tokenize(self, text: str) -> List[Token]:
        result = []
        for subword in self._tokenizer.EncodeAsPieces(text):
            token = Token(surface=subword)
            result.append(token)
        return result
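
EncodeAsPieces is the sentencepiece API, so self._tokenizer here is presumably a SentencePieceProcessor loaded from a trained model. A short sketch under that assumption (the model path is a placeholder):

import sentencepiece as spm

processor = spm.SentencePieceProcessor()
processor.Load("/path/to/model.spm")  # placeholder model path
for subword in processor.EncodeAsPieces("自然言語処理"):
    print(subword)
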
Example #19
    def tokenize(self, text: str) -> List[Token]:
        return [Token(surface=surface) for surface in text.split(" ")]
Example #20
    def tokenize(_, text: str):
        return [Token(surface=char) for char in list(text)]
Example #21
def test_token_without_feature():
    token = Token(surface="大崎")
    assert "大崎" == token.surface
    assert "" == token.feature