Example #1
0
    def __extract_morphological_information(self, mrph_object, is_feature,
                                            is_surface):
        """Build a TokenizedResult from a pyknp Morpheme.

        Surface form, base form and the (coarse, fine) POS pair are read
        straight off the morpheme; conjugation and semantic attributes are
        collected into the misc_info dictionary.
        """
        assert isinstance(mrph_object, pyknp.Morpheme)
        assert isinstance(is_feature, bool)
        assert isinstance(is_surface, bool)

        extra_fields = {
            'katuyou1': mrph_object.katuyou1,
            'katuyou2': mrph_object.katuyou2,
            'imis': mrph_object.imis,
            'repname': mrph_object.repname
        }

        return TokenizedResult(
            node_obj=None,
            tuple_pos=(mrph_object.hinsi, mrph_object.bunrui),
            word_stem=mrph_object.genkei,
            word_surface=mrph_object.midasi,
            is_feature=is_feature,
            is_surface=is_surface,
            misc_info=extra_fields)
Example #2
0
    def __result_parser(self, analyzed_line, is_feature, is_surface):
        # type: (text_type,bool,bool)->TokenizedResult
        """Split one analyzer output line into its surface and feature
        parts and wrap them in a TokenizedResult.
        """
        assert isinstance(analyzed_line, str)
        assert isinstance(is_feature, bool)
        assert isinstance(is_surface, bool)

        # the analyzer emits "<surface>\t<feature-string>"; split once only
        surface_form, feature_string = analyzed_line.split('\t', 1)
        pos_tuple, stem = self.__feature_parser(feature_string, surface_form)
        return TokenizedResult(
            node_obj=None,
            analyzed_line=analyzed_line,
            tuple_pos=pos_tuple,
            word_stem=stem,
            word_surface=surface_form,
            is_feature=is_feature,
            is_surface=is_surface)
Example #3
0
    def __extract_morphological_information(self, kytea_tags_tuple,
                                            is_feature):
        # type: (Tuple[text_type,List[Any]], bool) -> TokenizedResult
        """Convert one kytea analysis tuple into a TokenizedResult.

        kytea_tags_tuple[0] is the surface string; kytea_tags_tuple[1]
        holds the POS and reading candidate lists, each candidate being a
        (label, score) pair.
        """
        assert isinstance(kytea_tags_tuple, tuple)
        assert isinstance(is_feature, bool)

        surface_form = self.__check_char_set(kytea_tags_tuple[0])
        # NOTE: kytea does NOT show word stem. Put blank string instead.
        # (On Py2 the blank must be a unicode object, hence the decode.)
        word_stem = ''.decode('utf-8') if six.PY2 else ''

        pos_candidates = kytea_tags_tuple[1][0]
        pos_label = self.__check_char_set(pos_candidates[0][0])
        pos_confidence = float(pos_candidates[0][1])

        reading_candidates = kytea_tags_tuple[1][1]
        reading = self.__check_char_set(reading_candidates[0][0])
        reading_confidence = float(reading_candidates[0][1])

        extra_info = {
            'pos_score': pos_confidence,
            'pos': pos_label,
            'yomi': reading,
            'yomi_score': reading_confidence
        }

        return TokenizedResult(
            node_obj=None,
            tuple_pos=(pos_label, ),
            word_stem=word_stem,
            word_surface=surface_form,
            is_feature=is_feature,
            is_surface=True,
            misc_info=extra_info)
Example #4
0
    def __extract_morphological_information(self, kytea_tags_tuple,
                                            is_feature):
        """Convert one kytea analysis tuple into a TokenizedResult.

        kytea_tags_tuple[0] is the surface string; kytea_tags_tuple[1]
        holds the POS and reading candidate lists, each candidate being a
        (label, score) pair.
        """
        assert isinstance(kytea_tags_tuple, tuple)
        assert isinstance(is_feature, bool)

        surface_form = self.__check_char_set(kytea_tags_tuple[0])
        # kytea does not provide a word stem; use an empty string
        stem = ''

        pos_candidates = kytea_tags_tuple[1][0]
        pos_label = self.__check_char_set(pos_candidates[0][0])
        pos_confidence = float(pos_candidates[0][1])

        reading_candidates = kytea_tags_tuple[1][1]
        reading = self.__check_char_set(reading_candidates[0][0])
        reading_confidence = float(reading_candidates[0][1])

        extra_info = {
            'pos_score': pos_confidence,
            'pos': pos_label,
            'yomi': reading,
            'yomi_score': reading_confidence
        }

        return TokenizedResult(
            analyzed_line=None,
            tuple_pos=(pos_label, ),
            word_stem=stem,
            word_surface=surface_form,
            is_feature=is_feature,
            is_surface=True,
            misc_info=extra_info,
            node_obj=None)
    def tokenize(self, sentence,
                 normalized=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence]
        """* What you can do
        - Call mecab tokenizer, and return tokenized objects

        """
        # NOTE(review): the `normalized` flag is never read in this body;
        # normalization is actually controlled by `func_normalizer` — confirm
        # whether the parameter is kept only for interface compatibility.
        if six.PY2 and isinstance(sentence, str):
            sentence = sentence.decode(self.string_encoding)
        else:
            pass

        ### decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            normalized_sentence = neologdn.normalize(sentence)
        elif func_normalizer == normalize_text:
            normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
        elif func_normalizer is None:
            normalized_sentence = sentence
        else:
            normalized_sentence = func_normalizer(sentence)

        # don't delete this variable. The variable "encoded_text" protects sentence from deleting
        if six.PY2:
            encoded_text = normalized_sentence.encode(self.string_encoding)
        else:
            encoded_text = normalized_sentence

        if six.PY2:
            tokenized_objects = []
            node = self.mecabObj.parseToNode(encoded_text)
            # skip the leading BOS (beginning-of-sentence) node
            node = node.next
            # `node.next is not None` stops before the trailing EOS node
            while node.next is not None:
                word_surface = node.surface.decode(self.string_encoding)

                tuple_pos, word_stem = self.__feature_parser(node.feature.decode(self.string_encoding), word_surface)

                tokenized_obj = TokenizedResult(
                    node_obj=node,
                    tuple_pos=tuple_pos,
                    word_stem=word_stem,
                    word_surface=word_surface,
                    is_feature=is_feature,
                    is_surface=is_surface
                )
                tokenized_objects.append(tokenized_obj)
                node = node.next

            tokenized_sentence = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=tokenized_objects)
        else:
            # Py3 path: parse to a string and post-process it line by line
            parsed_result = self.mecabObj.parse(encoded_text)
            tokenized_objects = self.__postprocess_analyzed_result(
                string_mecab_parsed_result=parsed_result,
                is_feature=is_feature,
                is_surface=is_surface
            )
            tokenized_sentence = TokenizedSenetence(
                sentence=sentence,
                tokenized_objects=tokenized_objects
            )  # type: TokenizedSenetence
        # return_list flattens the result into a plain list of strings
        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence