def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=text_preprocess.normalize_text):
    # type: (text_type, bool, bool, bool, bool, Callable[[str], text_type]) -> Union[List[text_type], TokenizedSenetence]
    """This method returns the tokenized result.
    If return_list is True, it returns the plain list produced by
    TokenizedSenetence.convert_list_object().
    If return_list is False (the default), it returns a TokenizedSenetence object.
    """
    assert isinstance(normalize, bool)
    assert isinstance(sentence, text_type)
    if normalize:
        normalized_sentence = func_normalizer(sentence)
    else:
        normalized_sentence = sentence
    # run the Juman analyzer and wrap each morpheme in a token object
    result = self.call_juman_interface(normalized_sentence)
    token_objects = [
        self.__extract_morphological_information(
            mrph_object=morph_object,
            is_surface=is_surface,
            is_feature=is_feature)
        for morph_object in result
    ]
    tokenized_objects = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=token_objects)
    if return_list:
        return tokenized_objects.convert_list_object()
    else:
        return tokenized_objects
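
# A minimal usage sketch for the Juman-based variant above. The wrapper class
# name `JumanWrapper` and its zero-argument constructor are assumptions about
# the surrounding module, not guaranteed by this method alone.
#
#     tokenizer = JumanWrapper()
#     # return_list=True -> plain list via convert_list_object()
#     print(tokenizer.tokenize(u'これはテストです。', return_list=True))
#     # return_list=False (default) -> TokenizedSenetence object
#     sentence_obj = tokenizer.tokenize(u'これはテストです。')
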
def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=text_preprocess.normalize_text):
    # type: (text_type, bool, bool, bool, bool, Callable[[str], str]) -> Union[List[str], TokenizedSenetence]
    """This method returns the tokenized result.
    If return_list is True, it returns the plain list produced by
    TokenizedSenetence.convert_list_object().
    If return_list is False (the default), it returns a TokenizedSenetence object.
    """
    assert isinstance(normalize, bool)
    assert isinstance(sentence, text_type)
    if normalize:
        normalized_sentence = func_normalizer(sentence)
    else:
        normalized_sentence = sentence
    # KyTea's Python binding expects bytes under Python 2
    if six.PY2:
        normalized_sentence = normalized_sentence.encode('utf-8')
    result = self.__list_tags(self.kytea.getTags(normalized_sentence))
    token_objects = [
        self.__extract_morphological_information(
            kytea_tags_tuple=kytea_tags,
            is_feature=is_feature)
        for kytea_tags in result
    ]
    tokenized_objects = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=token_objects)
    if return_list:
        return tokenized_objects.convert_list_object()
    else:
        return tokenized_objects
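
# A usage sketch for the KyTea-based variant above. The wrapper class name
# `KyteaWrapper` is an assumption; the call pattern mirrors the Juman variant.
#
#     tokenizer = KyteaWrapper()
#     # KyTea splits the sentence and attaches its own tag tuples per token
#     print(tokenizer.tokenize(u'これはテストです。', return_list=True))
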
def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=text_preprocess.normalize_text):
    # type: (str, bool, bool, bool, bool, Callable[[str], str]) -> Union[TokenizedSenetence, List[str]]
    """* What you can do
    - Tokenize the input sentence via the Juman(++) interface.
    - Return a TokenizedSenetence object, or a plain list when return_list is True.
    """
    if normalize:
        normalized_sentence = func_normalizer(sentence)
    else:
        normalized_sentence = sentence
    ml_token_object = self.call_juman_interface(normalized_sentence)
    token_objects = [
        juman_utils.extract_morphological_information(
            mrph_object=morph_object,
            is_surface=is_surface,
            is_feature=is_feature)
        for morph_object in ml_token_object]
    tokenized_objects = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=token_objects)
    if return_list:
        return tokenized_objects.convert_list_object()
    else:
        return tokenized_objects
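
# A usage sketch for the variant above, which delegates morpheme extraction to
# juman_utils. The class name `JumanppWrapper` is an assumption based on the
# Juman-style interface, not stated in this method.
#
#     tokenizer = JumanppWrapper()
#     # normalize=False skips func_normalizer and analyzes the raw sentence
#     raw_result = tokenizer.tokenize(u'テヘランは猫が好きです。', normalize=False)
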
def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=text_preprocess.normalize_text):
    # type: (str, bool, bool, bool, bool, Callable[[str], str]) -> Union[TokenizedSenetence, List[str]]
    """
    :param sentence: the input sentence to tokenize
    :param normalize: if True, apply func_normalizer before tokenizing
    :param is_feature: if True, keep the analyzer's feature information on each token
    :param is_surface: if True, use the surface form instead of the word stem
    :param return_list: if True, return a plain list; otherwise a TokenizedSenetence
    :param func_normalizer: callable used to normalize the sentence
    :return: TokenizedSenetence, or a plain list when return_list is True
    """
    assert isinstance(normalize, bool)
    assert isinstance(sentence, str)
    if normalize:
        normalized_sentence = func_normalizer(sentence)
    else:
        normalized_sentence = sentence
    result = self.call_juman_interface(normalized_sentence)
    token_objects = [
        juman_utils.extract_morphological_information(
            mrph_object=morph_object,
            is_surface=is_surface,
            is_feature=is_feature)
        for morph_object in result
    ]
    tokenized_objects = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=token_objects)
    if return_list:
        return tokenized_objects.convert_list_object()
    else:
        return tokenized_objects
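
# Sketch: overriding the normalizer on the variant above. Any Callable[[str], str]
# works; the identity lambda below simply disables text normalization while
# keeping the normalize=True code path.
#
#     tokens = tokenizer.tokenize(u'ＡＢＣ株式会社を退職した。',
#                                 func_normalizer=lambda s: s,
#                                 return_list=True)
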
def tokenize(self,
             sentence: str,
             normalize: bool = True,
             is_feature: bool = False,
             is_surface: bool = False,
             return_list: bool = False,
             func_normalizer: Callable[[str], str] = None
             ) -> Union[TokenizedSenetence, List[ContentsTypes]]:
    """* What you can do
    - Tokenize the input sentence with MeCab.
    - Return a TokenizedSenetence object, or a plain list when return_list is True.
    """
    assert isinstance(sentence, str)
    ### decide the normalization function depending on dictType
    if not normalize:
        normalized_sentence = sentence
    elif func_normalizer is None and self._dictType == 'neologd':
        normalized_sentence = normalize_text(sentence, dictionary_mode='neologd')
        normalized_sentence = normalized_sentence.replace(' ', '')
    elif func_normalizer is None:
        normalized_sentence = normalize_text(sentence)
        normalized_sentence = normalized_sentence.replace(' ', '')
    else:
        normalized_sentence = func_normalizer(sentence)
    # Don't delete this variable: the alias keeps a reference to the string
    # alive, so it is not garbage-collected while MeCab is parsing it.
    encoded_text = normalized_sentence
    parsed_result = self.mecabObj.parse(encoded_text)
    tokenized_objects = self.__postprocess_analyzed_result(
        string_mecab_parsed_result=parsed_result,
        is_feature=is_feature,
        is_surface=is_surface)
    tokenized_sentence = TokenizedSenetence(
        sentence=sentence,
        tokenized_objects=tokenized_objects)  # type: TokenizedSenetence
    if return_list:
        return tokenized_sentence.convert_list_object()
    else:
        return tokenized_sentence
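
# Sketch: the MeCab-backed variant above selects its default normalizer from
# self._dictType. The constructor keyword `dictType='neologd'` is an assumption
# inferred from that attribute name, not confirmed by this method.
#
#     mecab = MecabWrapper(dictType='neologd')
#     print(mecab.tokenize('すもももももももものうち', return_list=True))
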
def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False,
             return_list=False, func_normalizer=normalize_text):
    # type: (text_type, bool, bool, bool, bool, Callable[[str], str]) -> Union[List[str], TokenizedSenetence]
    """* What you can do
    - Call the MeCab tokenizer and return tokenized objects.
    """
    if six.PY2 and isinstance(sentence, str):
        sentence = sentence.decode(self.string_encoding)
    ### decide the normalization function depending on dictType
    if not normalize:
        normalized_sentence = sentence
    elif func_normalizer is None and self._dictType == 'neologd':
        normalized_sentence = neologdn.normalize(sentence)
    elif func_normalizer == normalize_text:
        normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
    elif func_normalizer is None:
        normalized_sentence = sentence
    else:
        normalized_sentence = func_normalizer(sentence)
    # Don't delete this variable: "encoded_text" keeps a reference to the string
    # alive, so it is not garbage-collected while MeCab is parsing it.
    if six.PY2:
        encoded_text = normalized_sentence.encode(self.string_encoding)
    else:
        encoded_text = normalized_sentence
    if six.PY2:
        # Python 2: walk the node chain manually, skipping the BOS node
        tokenized_objects = []
        node = self.mecabObj.parseToNode(encoded_text)
        node = node.next
        while node.next is not None:
            word_surface = node.surface.decode(self.string_encoding)
            tuple_pos, word_stem = self.__feature_parser(
                node.feature.decode(self.string_encoding), word_surface)
            tokenized_obj = TokenizedResult(
                node_obj=node,
                tuple_pos=tuple_pos,
                word_stem=word_stem,
                word_surface=word_surface,
                is_feature=is_feature,
                is_surface=is_surface)
            tokenized_objects.append(tokenized_obj)
            node = node.next
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects)
    else:
        # Python 3: parse to a string and post-process it
        parsed_result = self.mecabObj.parse(encoded_text)
        tokenized_objects = self.__postprocess_analyzed_result(
            string_mecab_parsed_result=parsed_result,
            is_feature=is_feature,
            is_surface=is_surface)
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects)  # type: TokenizedSenetence
    if return_list:
        return tokenized_sentence.convert_list_object()
    else:
        return tokenized_sentence
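
# Sketch: the output flags on the Python 2/3 variant above. is_surface switches
# the tokens to surface forms, is_feature keeps the POS feature details. The
# constructor keyword `dictType='ipadic'` is an assumption for illustration.
#
#     mecab = MecabWrapper(dictType='ipadic')
#     surface_tokens = mecab.tokenize(u'走った', is_surface=True, return_list=True)
#     featured = mecab.tokenize(u'走った', is_feature=True)  # TokenizedSenetence
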