def tokenize(self,
                 sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str], text_type]) -> Union[List[text_type], TokenizedSenetence]
        """This method returns the tokenized result.
        If return_list is True, it returns a list whose elements are tuples of (word_stem, POS).
        If return_list is False (the default), it returns a TokenizedSenetence object.
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, text_type)
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence
        result = self.call_juman_interface(normalized_sentence)

        token_objects = [
            self.__extract_morphological_information(mrph_object=morph_object,
                                                     is_surface=is_surface,
                                                     is_feature=is_feature)
            for morph_object in result
        ]

        if return_list:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence, tokenized_objects=token_objects)
            return tokenized_objects.convert_list_object()
        else:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence, tokenized_objects=token_objects)

            return tokenized_objects
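# Illustrative usage sketch (not from the source): how the two return modes
# differ. The wrapper class name `JumanWrapper` and its import path are
# assumptions here; adjust them to your install.
#
# from JapaneseTokenizer import JumanWrapper
# tokenizer = JumanWrapper()
# tokens = tokenizer.tokenize('これはテストです。', return_list=True)  # plain list
# sentence_obj = tokenizer.tokenize('これはテストです。')              # TokenizedSenetence (default)
# tokens_again = sentence_obj.convert_list_object()                    # same list, built later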
    def tokenize(self, sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (str, bool, bool, bool, bool, Callable[[str], str]) -> Union[TokenizedSenetence, List[str]]
        """* What you can do
        -
        """
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence

        ml_token_object = self.call_juman_interface(normalized_sentence)

        token_objects = [
            juman_utils.extract_morphological_information(
                mrph_object=morph_object,
                is_surface=is_surface,
                is_feature=is_feature
            )
            for morph_object in ml_token_object]

        if return_list:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=token_objects)
            return tokenized_objects.convert_list_object()
        else:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=token_objects)
            return tokenized_objects
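# Hedged sketch: func_normalizer accepts any str -> str callable, so the
# default normalizer can be swapped out. NFKC folding via unicodedata is
# shown as one plausible replacement, not the library's default.
import unicodedata

def nfkc_normalizer(text):
    # type: (str) -> str
    """Fold full-width/half-width variants before tokenization."""
    return unicodedata.normalize('NFKC', text)

# tokenizer.tokenize(sentence, func_normalizer=nfkc_normalizer)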
    def tokenize(self,
                 sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str], str]) -> Union[List[str], TokenizedSenetence]
        """This method returns the tokenized result.
        If return_list is True, it returns a list whose elements are tuples of (word_stem, POS).
        If return_list is False (the default), it returns a TokenizedSenetence object.
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, text_type)
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence
        if six.PY2:
            normalized_sentence = normalized_sentence.encode('utf-8')

        result = self.__list_tags(self.kytea.getTags(normalized_sentence))

        token_objects = [
            self.__extract_morphological_information(
                kytea_tags_tuple=kytea_tags, is_feature=is_feature)
            for kytea_tags in result
        ]

        if return_list:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence, tokenized_objects=token_objects)
            return tokenized_objects.convert_list_object()
        else:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence, tokenized_objects=token_objects)

            return tokenized_objects
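# Standalone sketch of the PY2 guard used above: under Python 2 the Kytea
# binding receives a UTF-8 byte string, while under Python 3 it gets the
# unicode str directly. Only the `six` package is assumed.
import six

def to_kytea_input(text):
    """Return `text` in the representation the Kytea binding expects."""
    if six.PY2:
        return text.encode('utf-8')  # byte string for the PY2 binding
    return text                      # PY3 accepts str as-is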
    def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False, return_list=True):
        # type: (str, bool, bool, bool, bool) -> Union[List[str], TokenizedSenetence]
        """This method returns the tokenized result.
        If return_list is True (the default), it returns a list whose elements are tuples of (word_stem, POS).
        If return_list is False, it returns a TokenizedSenetence object.

        :param sentence: input sentence, as unicode
        :param normalize: boolean flag to run string normalization before tokenization
        :param is_feature: boolean flag to attach morphological feature information to each token
        :param is_surface: boolean flag to use the surface form instead of the word stem
        :param return_list: boolean flag to return a plain list instead of a TokenizedSenetence
        :return: List[str] if return_list is True, else TokenizedSenetence
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, string_types)
        if normalize:
            normalized_sentence = text_preprocess.normalize_text(sentence, dictionary_mode="ipadic")
        else:
            normalized_sentence = sentence

        result = self.juman.analysis(normalized_sentence)
        token_objects = [
            self.__extract_morphological_information(
                mrph_object=morph_object, is_surface=is_surface, is_feature=is_feature
            )
            for morph_object in result
        ]

        if return_list:
            tokenized_objects = TokenizedSenetence(sentence=sentence, tokenized_objects=token_objects)
            return tokenized_objects.convert_list_object()
        else:
            tokenized_objects = TokenizedSenetence(sentence=sentence, tokenized_objects=token_objects)

            return tokenized_objects
    def tokenize(self, sentence, normalize=True, is_feature=False, return_list=True):
        """This method returns tokenized result.
        If return_list==True(default), this method returns list whose element is tuple consisted with word_stem and POS.
        If return_list==False, this method returns TokenizedSenetence object.

        :param sentence: input sentence. str
        :param normalize: boolean flag to make string normalization before tokenization
        :param is_feature:
        :param is_surface:
        :param return_list:
        :return:
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, str)
        if normalize:
            normalized_sentence = text_preprocess.normalize_text(sentence, dictionary_mode="ipadic")
        else:
            normalized_sentence = sentence

        result = self.__list_tags(self.kytea.getTags(normalized_sentence))

        token_objects = [
            self.__extract_morphological_information(kytea_tags_tuple=kytea_tags, is_feature=is_feature)
            for kytea_tags in result
        ]

        if return_list:
            tokenized_objects = TokenizedSenetence(sentence=sentence, tokenized_objects=token_objects)
            return tokenized_objects.convert_list_object()
        else:
            tokenized_objects = TokenizedSenetence(sentence=sentence, tokenized_objects=token_objects)

            return tokenized_objects
    def filter(self,
               parsed_sentence: TokenizedSenetence,
               pos_condition: List[Tuple[str, ...]] = None,
               stopwords: List[str] = None) -> FilteredObject:
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))

        return parsed_sentence.filter(pos_condition, stopwords)
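# Illustrative usage of filter() (not from the source). pos_condition is a
# list of POS tuples to keep and stopwords a list of tokens to drop; the tag
# strings depend on the analyzer's tag set, and the assumption that
# FilteredObject exposes convert_list_object() mirrors TokenizedSenetence.
#
# filtered = wrapper.filter(
#     parsed_sentence=tokenized,            # a TokenizedSenetence
#     pos_condition=[('名詞',), ('動詞',)],  # keep nouns and verbs
#     stopwords=['これ', 'それ'])            # drop these tokens
# filtered_list = filtered.convert_list_object()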
    def tokenize(
        self,
        sentence: str,
        normalize: bool = True,
        is_feature: bool = False,
        is_surface: bool = False,
        return_list: bool = False,
        func_normalizer: Callable[[str], str] = None
    ) -> Union[TokenizedSenetence, List[ContentsTypes]]:
        """* What you can do
        -
        """
        assert isinstance(sentence, str)
        ### decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            normalized_sentence = normalize_text(sentence,
                                                 dictionary_mode='neologd')
            normalized_sentence = normalized_sentence.replace(' ', '')
        elif func_normalizer is None:
            normalized_sentence = normalize_text(sentence)
            normalized_sentence = normalized_sentence.replace(' ', '')
        else:
            normalized_sentence = func_normalizer(sentence)

        # Do not delete this variable. Keeping the extra reference in encoded_text
        # prevents the string from being garbage-collected while MeCab still holds a pointer to it.
        encoded_text = normalized_sentence

        parsed_result = self.mecabObj.parse(encoded_text)
        tokenized_objects = self.__postprocess_analyzed_result(
            string_mecab_parsed_result=parsed_result,
            is_feature=is_feature,
            is_surface=is_surface)
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects)  # type: TokenizedSenetence

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
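# Illustrative call (the wrapper instance name `mecab_wrapper` is an
# assumption): passing func_normalizer explicitly bypasses the
# dictionary-dependent default normalization above.
#
# def keep_as_is(s):
#     return s  # identity normalizer: skip NFKC folding and space removal
#
# tokens = mecab_wrapper.tokenize('渋谷ヒカリエ', func_normalizer=keep_as_is,
#                                 return_list=True)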
    def tokenize(self,
                 sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (str, bool, bool, bool, bool, Callable[[str], str]) -> Union[TokenizedSenetence, List[str]]
        """
        :param sentence:
        :param ins_mecab:
        :param list_stopword:
        :param list_pos_candidate:
        :return:  list [tuple (unicode, unicode)]
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, str)
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence

        result = self.call_juman_interface(normalized_sentence)
        token_objects = [
            juman_utils.extract_morphological_information(
                mrph_object=morph_object,
                is_surface=is_surface,
                is_feature=is_feature) for morph_object in result
        ]

        if return_list:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence, tokenized_objects=token_objects)
            return tokenized_objects.convert_list_object()
        else:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence, tokenized_objects=token_objects)

            return tokenized_objects
def tokenize(self, sentence: str, normalize: bool = True,
                 is_feature: bool = False,
                 is_surface: bool = False,
                 return_list: bool = True) -> Union[List[ContentsTypes], TokenizedSenetence]:
        """
        :param sentence:
        :param ins_mecab:
        :param list_stopword:
        :param list_pos_candidate:
        :return:  list [tuple (unicode, unicode)]
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, str)
        if normalize:
            normalized_sentence = text_preprocess.normalize_text(sentence, dictionary_mode='ipadic')
        else:
            normalized_sentence = sentence

        result = self.juman.analysis(normalized_sentence)
        token_objects = [
            self.__extract_morphological_information(
                mrph_object=morph_object,
                is_surface=is_surface,
                is_feature=is_feature
            )
            for morph_object in result]

        if return_list:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=token_objects
            )
            return tokenized_objects.convert_list_object()
        else:
            tokenized_objects = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=token_objects)

            return tokenized_objects
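# Note in code form: unlike most variants above, return_list defaults to True
# in this version, so a bare call already yields a list. Hedged usage (the
# wrapper instance name is an assumption):
#
# tokens = wrapper.tokenize('東京へ行く')                   # list, by default here
# obj = wrapper.tokenize('東京へ行く', return_list=False)   # TokenizedSenetence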
def tokenize(self, sentence,
                 normalized=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str], str]) -> Union[List[str], TokenizedSenetence]
        """* What you can do
        - Call the MeCab tokenizer and return tokenized objects.
        """
        if six.PY2 and isinstance(sentence, str):
            sentence = sentence.decode(self.string_encoding)

        ### decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            normalized_sentence = neologdn.normalize(sentence)
        elif func_normalizer is normalize_text:
            normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
        elif func_normalizer is None:
            normalized_sentence = sentence
        else:
            normalized_sentence = func_normalizer(sentence)

        # Do not delete this variable. Keeping the extra reference in encoded_text
        # prevents the string from being garbage-collected while MeCab still holds a pointer to it.
        if six.PY2:
            encoded_text = normalized_sentence.encode(self.string_encoding)
        else:
            encoded_text = normalized_sentence

        if six.PY2:
            tokenized_objects = []
            node = self.mecabObj.parseToNode(encoded_text)
            node = node.next
            while node.next is not None:
                word_surface = node.surface.decode(self.string_encoding)

                tuple_pos, word_stem = self.__feature_parser(node.feature.decode(self.string_encoding), word_surface)

                tokenized_obj = TokenizedResult(
                    node_obj=node,
                    tuple_pos=tuple_pos,
                    word_stem=word_stem,
                    word_surface=word_surface,
                    is_feature=is_feature,
                    is_surface=is_surface
                )
                tokenized_objects.append(tokenized_obj)
                node = node.next

            tokenized_sentence = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=tokenized_objects)
        else:
            parsed_result = self.mecabObj.parse(encoded_text)
            tokenized_objects = self.__postprocess_analyzed_result(
                string_mecab_parsed_result=parsed_result,
                is_feature=is_feature,
                is_surface=is_surface
            )
            tokenized_sentence = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=tokenized_objects
            )  # type: TokenizedSenetence

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence
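# Minimal standalone sketch of the parseToNode loop used in the PY2 branch
# above, written for Python 3 / mecab-python3. Dictionary and feature layout
# depend on your MeCab install; only the standard Tagger/parseToNode API is
# assumed.
import MeCab

tagger = MeCab.Tagger()
node = tagger.parseToNode('すもももももももものうち')
node = node.next                        # skip the BOS node, as above
while node.next is not None:            # stop before the EOS node
    features = node.feature.split(',')  # CSV feature string -> fields
    print(node.surface, features[0])    # features[0] is the coarse POS tag
    node = node.next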