def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=text_preprocess.normalize_text):
    # type: (text_type, bool, bool, bool, bool, Callable[[str], text_type]) -> Union[List[text_type], TokenizedSenetence]
    """This method returns the tokenized result.
    If return_list is True, it returns the plain list produced by
    TokenizedSenetence.convert_list_object().
    If return_list is False (the default), it returns a TokenizedSenetence object.
    """
    assert isinstance(normalize, bool)
    assert isinstance(sentence, text_type)
    if normalize:
        normalized_sentence = func_normalizer(sentence)
    else:
        normalized_sentence = sentence
    # run the Juman analyzer and wrap each morpheme in a token object
    result = self.call_juman_interface(normalized_sentence)
    token_objects = [
        self.__extract_morphological_information(
            mrph_object=morph_object,
            is_surface=is_surface,
            is_feature=is_feature)
        for morph_object in result
    ]
    tokenized_objects = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=token_objects)
    if return_list:
        return tokenized_objects.convert_list_object()
    else:
        return tokenized_objects
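
# A minimal usage sketch for the Juman-based variant above. The wrapper class
# name `JumanWrapper` and its zero-argument constructor are assumptions about
# the surrounding module, not guaranteed by this method alone.
#
#     tokenizer = JumanWrapper()
#     # return_list=True -> plain list via convert_list_object()
#     print(tokenizer.tokenize(u'これはテストです。', return_list=True))
#     # return_list=False (default) -> TokenizedSenetence object
#     sentence_obj = tokenizer.tokenize(u'これはテストです。')
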
def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=text_preprocess.normalize_text):
    # type: (text_type, bool, bool, bool, bool, Callable[[str], str]) -> Union[List[str], TokenizedSenetence]
    """This method returns the tokenized result.
    If return_list is True, it returns the plain list produced by
    TokenizedSenetence.convert_list_object().
    If return_list is False (the default), it returns a TokenizedSenetence object.
    """
    assert isinstance(normalize, bool)
    assert isinstance(sentence, text_type)
    if normalize:
        normalized_sentence = func_normalizer(sentence)
    else:
        normalized_sentence = sentence
    # KyTea's Python binding expects bytes under Python 2
    if six.PY2:
        normalized_sentence = normalized_sentence.encode('utf-8')
    result = self.__list_tags(self.kytea.getTags(normalized_sentence))
    token_objects = [
        self.__extract_morphological_information(
            kytea_tags_tuple=kytea_tags,
            is_feature=is_feature)
        for kytea_tags in result
    ]
    tokenized_objects = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=token_objects)
    if return_list:
        return tokenized_objects.convert_list_object()
    else:
        return tokenized_objects
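
# A usage sketch for the KyTea-based variant above. The wrapper class name
# `KyteaWrapper` is an assumption; the call pattern mirrors the Juman variant.
#
#     tokenizer = KyteaWrapper()
#     # KyTea splits the sentence and attaches its own tag tuples per token
#     print(tokenizer.tokenize(u'これはテストです。', return_list=True))
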
def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=text_preprocess.normalize_text):
    # type: (str, bool, bool, bool, bool, Callable[[str], str]) -> Union[TokenizedSenetence, List[str]]
    """* What you can do
    - Tokenize the input sentence via the Juman(++) interface.
    - Return a TokenizedSenetence object, or a plain list when return_list is True.
    """
    if normalize:
        normalized_sentence = func_normalizer(sentence)
    else:
        normalized_sentence = sentence
    ml_token_object = self.call_juman_interface(normalized_sentence)
    token_objects = [
        juman_utils.extract_morphological_information(
            mrph_object=morph_object,
            is_surface=is_surface,
            is_feature=is_feature)
        for morph_object in ml_token_object]
    tokenized_objects = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=token_objects)
    if return_list:
        return tokenized_objects.convert_list_object()
    else:
        return tokenized_objects
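
# A usage sketch for the variant above, which delegates morpheme extraction to
# juman_utils. The class name `JumanppWrapper` is an assumption based on the
# Juman-style interface, not stated in this method.
#
#     tokenizer = JumanppWrapper()
#     # normalize=False skips func_normalizer and analyzes the raw sentence
#     raw_result = tokenizer.tokenize(u'テヘランは猫が好きです。', normalize=False)
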
def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=text_preprocess.normalize_text):
    # type: (str, bool, bool, bool, bool, Callable[[str], str]) -> Union[TokenizedSenetence, List[str]]
    """
    :param sentence: the input sentence to tokenize
    :param normalize: if True, apply func_normalizer before tokenizing
    :param is_feature: if True, keep the analyzer's feature information on each token
    :param is_surface: if True, use the surface form instead of the word stem
    :param return_list: if True, return a plain list; otherwise a TokenizedSenetence
    :param func_normalizer: callable used to normalize the sentence
    :return: TokenizedSenetence, or a plain list when return_list is True
    """
    assert isinstance(normalize, bool)
    assert isinstance(sentence, str)
    if normalize:
        normalized_sentence = func_normalizer(sentence)
    else:
        normalized_sentence = sentence
    result = self.call_juman_interface(normalized_sentence)
    token_objects = [
        juman_utils.extract_morphological_information(
            mrph_object=morph_object,
            is_surface=is_surface,
            is_feature=is_feature)
        for morph_object in result
    ]
    tokenized_objects = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=token_objects)
    if return_list:
        return tokenized_objects.convert_list_object()
    else:
        return tokenized_objects
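
# Sketch: overriding the normalizer on the variant above. Any Callable[[str], str]
# works; the identity lambda below simply disables text normalization while
# keeping the normalize=True code path.
#
#     tokens = tokenizer.tokenize(u'ＡＢＣ株式会社を退職した。',
#                                 func_normalizer=lambda s: s,
#                                 return_list=True)
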
def tokenize(self,
             sentence: str,
             normalize: bool = True,
             is_feature: bool = False,
             is_surface: bool = False,
             return_list: bool = False,
             func_normalizer: Callable[[str], str] = None
             ) -> Union[TokenizedSenetence, List[ContentsTypes]]:
    """* What you can do
    - Tokenize the input sentence with MeCab.
    - Return a TokenizedSenetence object, or a plain list when return_list is True.
    """
    assert isinstance(sentence, str)
    ### decide the normalization function depending on dictType
    if not normalize:
        normalized_sentence = sentence
    elif func_normalizer is None and self._dictType == 'neologd':
        normalized_sentence = normalize_text(sentence, dictionary_mode='neologd')
        normalized_sentence = normalized_sentence.replace(' ', '')
    elif func_normalizer is None:
        normalized_sentence = normalize_text(sentence)
        normalized_sentence = normalized_sentence.replace(' ', '')
    else:
        normalized_sentence = func_normalizer(sentence)
    # Don't delete this variable: the alias keeps a reference to the string
    # alive, so it is not garbage-collected while MeCab is parsing it.
    encoded_text = normalized_sentence
    parsed_result = self.mecabObj.parse(encoded_text)
    tokenized_objects = self.__postprocess_analyzed_result(
        string_mecab_parsed_result=parsed_result,
        is_feature=is_feature,
        is_surface=is_surface)
    tokenized_sentence = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=tokenized_objects)  # type: TokenizedSenetence
    if return_list:
        return tokenized_sentence.convert_list_object()
    else:
        return tokenized_sentence
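
# Sketch: the MeCab-backed variant above selects its default normalizer from
# self._dictType. The constructor keyword `dictType='neologd'` is an assumption
# inferred from that attribute name, not confirmed by this method.
#
#     mecab = MecabWrapper(dictType='neologd')
#     print(mecab.tokenize('すもももももももものうち', return_list=True))
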
def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=normalize_text):
    # type: (text_type, bool, bool, bool, bool, Callable[[str], str]) -> Union[List[str], TokenizedSenetence]
    """* What you can do
    - Call the MeCab tokenizer and return tokenized objects.
    """
    if six.PY2 and isinstance(sentence, str):
        sentence = sentence.decode(self.string_encoding)
    ### decide the normalization function depending on dictType
    if not normalize:
        normalized_sentence = sentence
    elif func_normalizer is None and self._dictType == 'neologd':
        normalized_sentence = neologdn.normalize(sentence)
    elif func_normalizer == normalize_text:
        normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
    elif func_normalizer is None:
        normalized_sentence = sentence
    else:
        normalized_sentence = func_normalizer(sentence)
    # Don't delete this variable: "encoded_text" keeps a reference to the string
    # alive, so it is not garbage-collected while MeCab is parsing it.
    if six.PY2:
        encoded_text = normalized_sentence.encode(self.string_encoding)
    else:
        encoded_text = normalized_sentence
    if six.PY2:
        # Python 2: walk the node chain manually, skipping the BOS node
        tokenized_objects = []
        node = self.mecabObj.parseToNode(encoded_text)
        node = node.next
        while node.next is not None:
            word_surface = node.surface.decode(self.string_encoding)
            tuple_pos, word_stem = self.__feature_parser(
                node.feature.decode(self.string_encoding), word_surface)
            tokenized_obj = TokenizedResult(
                node_obj=node,
                tuple_pos=tuple_pos,
                word_stem=word_stem,
                word_surface=word_surface,
                is_feature=is_feature,
                is_surface=is_surface)
            tokenized_objects.append(tokenized_obj)
            node = node.next
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects)
    else:
        # Python 3: parse to a string and post-process it
        parsed_result = self.mecabObj.parse(encoded_text)
        tokenized_objects = self.__postprocess_analyzed_result(
            string_mecab_parsed_result=parsed_result,
            is_feature=is_feature,
            is_surface=is_surface)
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects)  # type: TokenizedSenetence
    if return_list:
        return tokenized_sentence.convert_list_object()
    else:
        return tokenized_sentence
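
# Sketch: the output flags on the Python 2/3 variant above. is_surface switches
# the tokens to surface forms, is_feature keeps the POS feature details. The
# constructor keyword `dictType='ipadic'` is an assumption for illustration.
#
#     mecab = MecabWrapper(dictType='ipadic')
#     surface_tokens = mecab.tokenize(u'走った', is_surface=True, return_list=True)
#     featured = mecab.tokenize(u'走った', is_feature=True)  # TokenizedSenetence
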