def test_filter_pos(self):
    """Check that ``filter`` with a POS condition keeps only 名詞 tokens.

    Tokenizes a fixed Japanese sentence, filters the result down to nouns,
    and asserts the types and POS of every surviving token.
    """
    print("Filtering Test. POS condition is only 名詞")
    input_text = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
    wrapper = JumanWrapper(command=self.path_to_juman_command)
    tokenized = wrapper.tokenize(sentence=input_text,
                                 return_list=False,
                                 is_feature=True)
    # Keep only tokens whose top-level POS tag is 名詞 (noun).
    filtered = wrapper.filter(parsed_sentence=tokenized,
                              pos_condition=[("名詞",)])
    assert isinstance(filtered, FilteredObject)
    for token in filtered.tokenized_objects:
        assert isinstance(token, TokenizedResult)
        print(
            "word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
                token.word_surface,
                token.word_stem,
                " ".join(token.tuple_pos),
                token.misc_info
            )
        )
        assert isinstance(token.word_surface, str)
        assert isinstance(token.word_stem, str)
        assert isinstance(token.tuple_pos, tuple)
        assert isinstance(token.misc_info, dict)
        assert token.tuple_pos[0] == "名詞"
    print("-" * 30)
    # The list form is a sequence of (stem, pos-tuple) pairs.
    for pair in filtered.convert_list_object():
        assert isinstance(pair, tuple)
        stem, pos_tuple = pair[0], pair[1]
        assert isinstance(stem, str)
        assert isinstance(pos_tuple, tuple)
        print("word_stem:{} word_pos:{}".format(stem, " ".join(pos_tuple)))
def test_tokenize(self):
    """Check the structure of ``juman_wrapper.tokenize`` output.

    Asserts the wrapper returns a ``TokenizedSenetence`` whose token objects
    expose surface form, stem, POS tuple and misc info with the right types,
    and that ``convert_list_object`` yields (stem, pos-tuple) pairs.
    """
    logger.debug('Tokenize Test')
    input_text = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
    wrapper = JumanWrapper(command=self.path_to_juman_command)
    result = wrapper.tokenize(sentence=input_text,
                              return_list=False,
                              is_feature=True)
    assert isinstance(result, TokenizedSenetence)
    for token in result.tokenized_objects:
        assert isinstance(token, TokenizedResult)
        logger.debug(
            "word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
                token.word_surface,
                token.word_stem,
                ' '.join(token.tuple_pos),
                token.misc_info))
        assert isinstance(token.word_surface, str)
        assert isinstance(token.word_stem, str)
        assert isinstance(token.tuple_pos, tuple)
        assert isinstance(token.misc_info, dict)
    pairs = result.convert_list_object()
    assert isinstance(pairs, list)
    logger.debug('-' * 30)
    for pair in pairs:
        assert isinstance(pair, tuple)
        stem, pos_tuple = pair[0], pair[1]
        assert isinstance(stem, str)
        assert isinstance(pos_tuple, tuple)
        logger.debug('word_stem:{} word_pos:{}'.format(stem,
                                                       ' '.join(pos_tuple)))
def test_tokenize(self):
    """Check the structure of ``juman_wrapper.tokenize`` output.

    Asserts the wrapper returns a ``TokenizedSenetence`` of well-typed token
    objects and that ``convert_list_object`` yields (stem, pos-tuple) pairs.
    """
    print("Tokenize Test")
    input_text = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
    wrapper = JumanWrapper(command=self.path_to_juman_command)
    result = wrapper.tokenize(sentence=input_text,
                              return_list=False,
                              is_feature=True)
    assert isinstance(result, TokenizedSenetence)
    for token in result.tokenized_objects:
        assert isinstance(token, TokenizedResult)
        print(
            "word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
                token.word_surface,
                token.word_stem,
                " ".join(token.tuple_pos),
                token.misc_info
            )
        )
        assert isinstance(token.word_surface, str)
        assert isinstance(token.word_stem, str)
        assert isinstance(token.tuple_pos, tuple)
        assert isinstance(token.misc_info, dict)
    pairs = result.convert_list_object()
    assert isinstance(pairs, list)
    print("-" * 30)
    for pair in pairs:
        assert isinstance(pair, tuple)
        stem, pos_tuple = pair[0], pair[1]
        assert isinstance(stem, str)
        assert isinstance(pos_tuple, tuple)
        print("word_stem:{} word_pos:{}".format(stem, " ".join(pos_tuple)))
def test_stopwords(self):
    """Test stopword removal: filtered output must contain no stopword stems."""
    stopword = ['AV', '女優']
    logger.debug('Stopwords Filtering Test. Stopwords is {}'.format(
        ','.join(stopword)))
    input_text = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
    wrapper = JumanWrapper(command=self.path_to_juman_command)
    tokenized = wrapper.tokenize(sentence=input_text,
                                 return_list=False,
                                 is_feature=True)
    filtered = wrapper.filter(parsed_sentence=tokenized, stopwords=stopword)
    no_stopword_found = True
    for pair in filtered.convert_list_object():
        assert isinstance(pair, tuple)
        stem, pos_tuple = pair[0], pair[1]
        assert isinstance(stem, str)
        assert isinstance(pos_tuple, tuple)
        logger.debug('word_stem:{} word_pos:{}'.format(stem,
                                                       ' '.join(pos_tuple)))
        # A stopword surviving the filter means the filter is broken.
        if stem in stopword:
            no_stopword_found = False
    assert no_stopword_found
def test_stopwords(self):
    """Test stopword removal: filtered output must contain no stopword stems."""
    stopword = ["AV", "女優"]
    print("Stopwords Filtering Test. Stopwords is {}".format(",".join(stopword)))
    input_text = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
    wrapper = JumanWrapper(command=self.path_to_juman_command)
    tokenized = wrapper.tokenize(sentence=input_text,
                                 return_list=False,
                                 is_feature=True)
    filtered = wrapper.filter(parsed_sentence=tokenized, stopwords=stopword)
    no_stopword_found = True
    for pair in filtered.convert_list_object():
        assert isinstance(pair, tuple)
        stem, pos_tuple = pair[0], pair[1]
        assert isinstance(stem, str)
        assert isinstance(pos_tuple, tuple)
        print("word_stem:{} word_pos:{}".format(stem, " ".join(pos_tuple)))
        # A stopword surviving the filter means the filter is broken.
        if stem in stopword:
            no_stopword_found = False
    assert no_stopword_found
def test_juman_severmode(self):
    """* What you can do
    - Runs the juman server-mode tokenize test.
    - Skips (with a warning) when no juman server is listening on
      localhost:32000, instead of failing.
    """
    logger.debug('Tokenize test with server mode')
    test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
    HOST = 'localhost'
    PORT = 32000
    # Probe the server first so the test is skipped, not failed, when the
    # juman server is down.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((HOST, PORT))
    # FIX: the original bare ``except:`` also swallowed SystemExit and
    # KeyboardInterrupt; socket failures are all OSError subclasses.
    except OSError:
        logger.warning(
            "SKip server mode test because server is not working.")
    else:
        juman_wrapper = JumanWrapper(command=self.path_to_juman_command,
                                     server=HOST,
                                     port=PORT)
        token_objects = juman_wrapper.tokenize(sentence=test_sentence,
                                               return_list=False,
                                               is_feature=True)
        assert isinstance(token_objects, TokenizedSenetence)
        test_sentence = "ペルシア語(ペルシアご、ペルシア語: فارسی, پارسی; Fārsī, Pārsī)は、イランを中心とする中東地域で話される言語。"
        juman_wrapper = JumanWrapper(command=self.path_to_juman_command,
                                     server=HOST,
                                     port=PORT)
        list_token = juman_wrapper.tokenize(sentence=test_sentence,
                                            return_list=True,
                                            is_feature=True)
        assert isinstance(list_token, list)
    finally:
        # FIX: the original leaked the probe socket when connect() raised;
        # close it on every path.
        s.close()
def test_filter_pos(self):
    """Check that ``filter`` with a POS condition keeps only 名詞 tokens.

    Tokenizes a fixed Japanese sentence, filters the result down to nouns,
    and asserts the types and POS of every surviving token.
    """
    logger.debug('Filtering Test. POS condition is only 名詞')
    input_text = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
    wrapper = JumanWrapper(command=self.path_to_juman_command)
    tokenized = wrapper.tokenize(sentence=input_text,
                                 return_list=False,
                                 is_feature=True)
    # Keep only tokens whose top-level POS tag is 名詞 (noun).
    filtered = wrapper.filter(parsed_sentence=tokenized,
                              pos_condition=[('名詞', )])
    assert isinstance(filtered, FilteredObject)
    for token in filtered.tokenized_objects:
        assert isinstance(token, TokenizedResult)
        logger.debug(
            "word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
                token.word_surface,
                token.word_stem,
                ' '.join(token.tuple_pos),
                token.misc_info))
        assert isinstance(token.word_surface, str)
        assert isinstance(token.word_stem, str)
        assert isinstance(token.tuple_pos, tuple)
        assert isinstance(token.misc_info, dict)
        assert token.tuple_pos[0] == '名詞'
    logger.debug('-' * 30)
    # The list form is a sequence of (stem, pos-tuple) pairs.
    for pair in filtered.convert_list_object():
        assert isinstance(pair, tuple)
        stem, pos_tuple = pair[0], pair[1]
        assert isinstance(stem, str)
        assert isinstance(pos_tuple, tuple)
        logger.debug('word_stem:{} word_pos:{}'.format(stem,
                                                       ' '.join(pos_tuple)))
def test_juman_severmode(self):
    """* What you can do
    - Runs the juman server-mode tokenize test.
    - Skips (with a warning) when no juman server is listening on
      localhost:32000, instead of erroring out.
    """
    logger.debug('Tokenize test with server mode')
    test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
    host = 'localhost'
    port = 32000
    # FIX: the original connected unconditionally and crashed with a
    # ConnectionRefusedError when no juman server was running; probe the
    # server first and skip the test when it is unavailable, matching the
    # guarded server-mode test elsewhere in this class.
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.connect((host, port))
    except OSError:
        logger.warning(
            "Skip server mode test because server is not working.")
        return
    finally:
        probe.close()
    juman_wrapper = JumanWrapper(command=self.path_to_juman_command,
                                 server=host,
                                 port=port)
    token_objects = juman_wrapper.tokenize(sentence=test_sentence,
                                           return_list=False,
                                           is_feature=True)
    assert isinstance(token_objects, TokenizedSenetence)
    test_sentence = "ペルシア語(ペルシアご、ペルシア語: فارسی, پارسی; Fārsī, Pārsī)は、イランを中心とする中東地域で話される言語。"
    juman_wrapper = JumanWrapper(command=self.path_to_juman_command,
                                 server=host,
                                 port=port)
    list_token = juman_wrapper.tokenize(sentence=test_sentence,
                                        return_list=True,
                                        is_feature=True)
    assert isinstance(list_token, list)