def tokenize(self,
                 sentence,
                 normalized=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=None):
        # type: (str, bool, bool, bool, bool, Callable[[str], str]) -> Union[List[str], TokenizedSenetence]
        """* What you can do
        - Call mecab tokenizer, and return tokenized objects

        """
        ### decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            normalized_sentence = normalize_text(sentence,
                                                 dictionary_mode='neologd')
            normalized_sentence = normalized_sentence.replace(u' ', u'')
        elif func_normalizer is None:
            normalized_sentence = normalize_text(sentence)
            normalized_sentence = normalized_sentence.replace(u' ', u'')
        else:
            normalized_sentence = func_normalizer(sentence)

        assert isinstance(sentence, string_types)
        tokenized_objects = []

        # Do not delete this variable. Keeping "encoded_text" alive prevents the
        # encoded buffer from being garbage-collected while MeCab still references it.
        encoded_text = normalized_sentence.encode('utf-8')

        node = self.mecabObj.parseToNode(encoded_text)
        node = node.next
        while node.next is not None:

            word_surface = node.surface.decode('utf-8')

            tuple_pos, word_stem = self.__feature_parser(
                node.feature.decode('utf-8'), word_surface)

            tokenized_obj = TokenizedResult(node_obj=node,
                                            tuple_pos=tuple_pos,
                                            word_stem=word_stem,
                                            word_surface=word_surface,
                                            is_feature=is_feature,
                                            is_surface=is_surface)
            tokenized_objects.append(tokenized_obj)
            node = node.next

        tokenized_sentence = TokenizedSenetence(
            sentence=sentence, tokenized_objects=tokenized_objects)

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
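
# Usage sketch for the variant above. Hypothetical: the enclosing class (here
# called "MecabWrapper") and its "dictType" constructor argument are
# assumptions; only the tokenize() method itself appears in this snippet.
tokenizer = MecabWrapper(dictType='neologd')
# Default return_list=False yields a TokenizedSenetence object.
tokenized_sentence = tokenizer.tokenize(u'テスト文です。')
print(tokenized_sentence.convert_list_object())
# A caller-supplied normalizer replaces the built-in normalize_text() call.
tokenizer.tokenize(u'テスト文です。', func_normalizer=lambda s: s.strip())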
    def tokenize(self, sentence, normalize=True, is_feature=False, return_list=True):
        """This method returns tokenized result.
        If return_list==True(default), this method returns list whose element is tuple consisted with word_stem and POS.
        If return_list==False, this method returns TokenizedSenetence object.

        :param sentence: input sentence. str
        :param normalize: boolean flag to make string normalization before tokenization
        :param is_feature:
        :param is_surface:
        :param return_list:
        :return:
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, str)
        if normalize:
            normalized_sentence = text_preprocess.normalize_text(sentence, dictionary_mode="ipadic")
        else:
            normalized_sentence = sentence

        result = self.__list_tags(self.kytea.getTags(normalized_sentence))

        token_objects = [
            self.__extract_morphological_information(kytea_tags_tuple=kytea_tags, is_feature=is_feature)
            for kytea_tags in result
        ]

        tokenized_sentence = TokenizedSenetence(sentence=sentence, tokenized_objects=token_objects)

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
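
# Usage sketch for the Kytea variant above. Hypothetical: the enclosing class
# (here called "KyteaWrapper") is an assumption; only tokenize() is shown.
tokenizer = KyteaWrapper()
# return_list=True (the default) yields (word_stem, POS) tuples per the docstring.
for word_stem, pos in tokenizer.tokenize(u'テスト文です。'):
    print(word_stem, pos)
# normalize=False skips the ipadic-mode normalize_text() preprocessing.
tokenizer.tokenize(u'テスト文です。', normalize=False)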
    def tokenize(self, sentence: str,
                 is_feature: bool = False, is_surface: bool = False,
                 return_list: bool = True) -> Union[TokenizedSenetence, List[ContentsTypes]]:
        """
        :param sentence: input sentence (str)
        :param is_feature: if True, include feature information in each token
        :param is_surface: if True, use the surface form instead of the word stem
        :param return_list: if True, return a list; otherwise a TokenizedSenetence
        :return: list [tuple (str, str)] or TokenizedSenetence
        """
        assert isinstance(sentence, str)

        normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
        normalized_sentence = normalized_sentence.replace(' ', '')
        # Kept for parity with the Python 2 code path, where the encoded bytes
        # must stay alive while MeCab parses them; no encoding is needed here.
        encoded_text = normalized_sentence

        parsed_result = self.mecabObj.parse(encoded_text)
        tokenized_objects = self.__postprocess_analyzed_result(
            string_mecab_parsed_result=parsed_result,
            is_feature=is_feature,
            is_surface=is_surface
        )
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects
        ) # type: TokenizedSenetence

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
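
# Usage sketch for the variant above. Hypothetical: the enclosing class (here
# called "MecabWrapper") and its "dictType" constructor argument are
# assumptions. Note this variant has no normalize flag; it always runs
# normalize_text() with self._dictType.
tokenizer = MecabWrapper(dictType='ipadic')
tokens = tokenizer.tokenize('テスト文です。')  # return_list=True by default
sentence_obj = tokenizer.tokenize('テスト文です。', return_list=False)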
    def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False, return_list=True):
        # type: (str, bool, bool, bool, bool)->Union[List[str], TokenizedSenetence]
        """This method returns tokenized result.
        If return_list==True(default), this method returns list whose element is tuple consisted with word_stem and POS.
        If return_list==False, this method returns TokenizedSenetence object.

        :param sentence: input sentence. unicode
        :param normalize: boolean flag to make string normalization before tokenization
        :param is_feature:
        :param is_surface:
        :param return_list:
        :return:
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, string_types)
        if normalize:
            normalized_sentence = text_preprocess.normalize_text(sentence, dictionary_mode="ipadic")
        else:
            normalized_sentence = sentence

        result = self.juman.analysis(normalized_sentence)
        token_objects = [
            self.__extract_morphological_information(
                mrph_object=morph_object, is_surface=is_surface, is_feature=is_feature
            )
            for morph_object in result
        ]

        tokenized_sentence = TokenizedSenetence(sentence=sentence, tokenized_objects=token_objects)

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
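
# Usage sketch for the Juman variant above. Hypothetical: the enclosing class
# (here called "JumanWrapper") is an assumption; only tokenize() is shown.
tokenizer = JumanWrapper()
# normalize=False skips the ipadic-mode normalize_text() preprocessing.
tokens = tokenizer.tokenize(u'テスト文です。', normalize=False)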
    def tokenize(
        self,
        sentence: str,
        normalize: bool = True,
        is_feature: bool = False,
        is_surface: bool = False,
        return_list: bool = False,
        func_normalizer: Optional[Callable[[str], str]] = None
    ) -> Union[TokenizedSenetence, List[ContentsTypes]]:
        """* What you can do
        -
        """
        assert isinstance(sentence, str)
        ### decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            normalized_sentence = normalize_text(sentence,
                                                 dictionary_mode='neologd')
            normalized_sentence = normalized_sentence.replace(' ', '')
        elif func_normalizer is None:
            normalized_sentence = normalize_text(sentence)
            normalized_sentence = normalized_sentence.replace(' ', '')
        else:
            normalized_sentence = func_normalizer(sentence)

        # Kept for parity with the Python 2 code path, where the encoded bytes
        # must stay alive while MeCab parses them; no encoding is needed here.
        encoded_text = normalized_sentence

        parsed_result = self.mecabObj.parse(encoded_text)
        tokenized_objects = self.__postprocess_analyzed_result(
            string_mecab_parsed_result=parsed_result,
            is_feature=is_feature,
            is_surface=is_surface)
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects)  # type: TokenizedSenetence

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
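
# Usage sketch for the annotated variant above. Hypothetical class name
# "MecabWrapper". return_list defaults to False here, so a TokenizedSenetence
# object comes back unless a plain list is requested.
tokenizer = MecabWrapper(dictType='neologd')
sentence_obj = tokenizer.tokenize('テスト文です。')
tokens = tokenizer.tokenize('テスト文です。', return_list=True)
# Passing func_normalizer bypasses both built-in normalization branches.
tokenizer.tokenize('テスト文です。', func_normalizer=lambda s: s)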
    def tokenize(self, sentence, is_feature=False, is_surface=False, return_list=True):
        # type: (str, bool, bool, bool) -> Union[List[str], TokenizedSenetence]
        """
        :param sentence: input sentence (str)
        :param is_feature: if True, include feature information in each token
        :param is_surface: if True, use the surface form instead of the word stem
        :param return_list: if True, return a list; otherwise a TokenizedSenetence
        :return: list [tuple (str, str)] or TokenizedSenetence
        """

        assert isinstance(sentence, string_types)
        tokenized_objects = []

        normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)

        # Do not delete this variable. Keeping "encoded_text" alive prevents the
        # encoded buffer from being garbage-collected while MeCab still references it.
        encoded_text = normalized_sentence.encode('utf-8')

        node = self.mecabObj.parseToNode(encoded_text)
        node = node.next
        while node.next is not None:

            word_surface = node.surface.decode('utf-8')

            tuple_pos, word_stem = self.__feature_parser(node.feature.decode('utf-8'), word_surface)

            tokenized_obj = TokenizedResult(
                node_obj=node,
                tuple_pos=tuple_pos,
                word_stem=word_stem,
                word_surface=word_surface,
                is_feature=is_feature,
                is_surface=is_surface
            )
            tokenized_objects.append(tokenized_obj)
            node = node.next

        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects
        )

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
    def tokenize(self, sentence: str, normalize: bool = True,
                 is_feature: bool = False,
                 is_surface: bool = False,
                 return_list: bool = True) -> Union[List[ContentsTypes], TokenizedSenetence]:
        """
        :param sentence: input sentence (str)
        :param normalize: if True, normalize the string before tokenization
        :param is_feature: if True, include feature information in each token
        :param is_surface: if True, use the surface form instead of the word stem
        :param return_list: if True, return a list; otherwise a TokenizedSenetence
        :return: list [tuple (str, str)] or TokenizedSenetence
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, str)
        if normalize:
            normalized_sentence = text_preprocess.normalize_text(sentence, dictionary_mode='ipadic')
        else:
            normalized_sentence = sentence

        result = self.juman.analysis(normalized_sentence)
        token_objects = [
            self.__extract_morphological_information(
                mrph_object=morph_object,
                is_surface=is_surface,
                is_feature=is_feature
            )
            for morph_object in result]

        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=token_objects)

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
    def tokenize(self, sentence,
                 normalized=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=normalize_text):
        """* What you can do
        - Call mecab tokenizer, and return tokenized objects

        """
        # type: (text_type, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence]
        if six.PY2 and isinstance(sentence, str):
            sentence = sentence.decode(self.string_encoding)
        else:
            pass

        ### decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            normalized_sentence = neologdn.normalize(sentence)
        elif func_normalizer == normalize_text:
            normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
        elif func_normalizer is None:
            normalized_sentence = sentence
        else:
            normalized_sentence = func_normalizer(sentence)

        # Do not delete this variable. Keeping "encoded_text" alive prevents the
        # encoded buffer from being garbage-collected while MeCab still references it.
        if six.PY2:
            encoded_text = normalized_sentence.encode(self.string_encoding)
        else:
            encoded_text = normalized_sentence

        if six.PY2:
            tokenized_objects = []
            node = self.mecabObj.parseToNode(encoded_text)
            node = node.next
            while node.next is not None:
                word_surface = node.surface.decode(self.string_encoding)

                tuple_pos, word_stem = self.__feature_parser(node.feature.decode(self.string_encoding), word_surface)

                tokenized_obj = TokenizedResult(
                    node_obj=node,
                    tuple_pos=tuple_pos,
                    word_stem=word_stem,
                    word_surface=word_surface,
                    is_feature=is_feature,
                    is_surface=is_surface
                )
                tokenized_objects.append(tokenized_obj)
                node = node.next

            tokenized_sentence = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=tokenized_objects)
        else:
            parsed_result = self.mecabObj.parse(encoded_text)
            tokenized_objects = self.__postprocess_analyzed_result(
                string_mecab_parsed_result=parsed_result,
                is_feature=is_feature,
                is_surface=is_surface
            )
            tokenized_sentence = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=tokenized_objects
            )  # type: TokenizedSenetence

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
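
# Usage sketch for the Python 2/3 compatible variant above. Hypothetical:
# class name "MecabWrapper"; "string_encoding" is assumed to be set in the
# constructor (e.g. 'utf-8'). The normalization branches map to call sites as:
tokenizer = MecabWrapper(dictType='neologd')
tokenizer.tokenize(u'テスト文です。')  # default func_normalizer: normalize_text(dictionary_mode=dictType)
tokenizer.tokenize(u'テスト文です。', func_normalizer=None)  # neologdn.normalize when dictType == 'neologd', otherwise no normalization
tokenizer.tokenize(u'テスト文です。', func_normalizer=lambda s: s.strip())  # any custom Callable[[str], str]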