def basic_example_juman_2x():
    """Demonstrate ``JumanWrapper`` on a ``unicode`` sentence (Python 2.x).

    The demo runs three steps in order:

    1. tokenize with a default wrapper and print stem, surface and POS of
       every token;
    2. tokenize again through a wrapper configured with ``server`` and
       ``port`` (a JUMAN server must already be listening there);
    3. tokenize once more, keep only tokens matching the noun POS condition
       and print the resulting list.
    """
    # input is `unicode` type(in python2x)
    text = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    wrapper = JumanWrapper()
    tokenized_sentence = wrapper.tokenize(
        sentence=text,
        normalize=True,
        return_list=False,
    )
    assert isinstance(tokenized_sentence, TokenizedSenetence)

    print('-' * 30)
    print(u'Juman Demo')
    line_template = u'word_stem:{}, word_surafce:{}, pos:{}'
    for token in tokenized_sentence.tokenized_objects:
        assert isinstance(token, TokenizedResult)
        print(line_template.format(
            token.word_stem, token.word_surface, token.tuple_pos))

    ### You can call juman with server mode. You must start JUMAN as server mode beforehand ###
    # The same local is rebound on purpose so the first wrapper is released here.
    wrapper = JumanWrapper(server='localhost', port=32000)
    server_tokens = wrapper.tokenize(sentence=text, return_list=True)
    assert isinstance(server_tokens, list)

    # filtering is same as mecab
    noun_condition = [(u'名詞', )]
    sentence_for_filter = JumanWrapper().tokenize(text, return_list=False)
    noun_sentence = sentence_for_filter.filter(pos_condition=noun_condition)
    filtered_result = noun_sentence.convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
class JumanTokenizer:
    """Adapter that returns ``JumanWrapper`` tokens as plain dictionaries."""

    def __init__(self):
        # One default-configured wrapper is created here and reused by
        # every ``tokenize`` call.
        self.tokenizer = JumanWrapper()

    def tokenize(self, text):
        """Tokenize *text* and describe each token as a dictionary.

        Args:
            text: The sentence handed to the underlying wrapper's
                ``tokenize`` method.

        Returns:
            A list with one dict per token, in the order found in the
            wrapper result's ``tokenized_objects``. Each dict has the keys
            ``analyzed_line``, ``word_surface``, ``word_stem``, ``pos``
            (the token's ``tuple_pos`` converted to a list) and
            ``misc_info``.
        """
        records = []
        for token in self.tokenizer.tokenize(text).tokenized_objects:
            records.append({
                'analyzed_line': token.analyzed_line,
                'word_surface': token.word_surface,
                'word_stem': token.word_stem,
                'pos': list(token.tuple_pos),
                'misc_info': token.misc_info,
            })
        return records
def basic_example_juman_3x():
    """Demonstrate ``JumanWrapper`` on a ``str`` sentence (Python 3.x).

    The demo first tokenizes with a default wrapper and prints the list
    form of the result plus stem, surface and POS of every token.

    It then probes ``localhost:32000`` with a plain TCP connection. If the
    probe fails, an info message is logged and the server-mode part is
    skipped. Otherwise the sentence is tokenized through a server-mode
    wrapper, and a noun-filtered result is printed.

    Only the connection probe is guarded: assertion failures and other
    errors raised while using the server-mode wrapper propagate to the
    caller instead of being reported as "server is not running".
    """
    # input is `str` type(in python3x)
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    juman_wrapper = JumanWrapper()
    tokenized_objects = juman_wrapper.tokenize(sentence=sentence,
                                               normalize=True,
                                               return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print('Juman Demo')
    print(tokenized_objects.convert_list_object())
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        print('word_stem:{}, word_surafce:{}, pos:{}'.format(
            token_object.word_stem,
            token_object.word_surface,
            token_object.tuple_pos))

    ### You can call juman with server mode. You must start JUMAN as server mode beforehand ###
    HOST = 'localhost'
    PORT = 32000
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Keep the guarded region to the probe itself so that unrelated
        # failures further down are never mistaken for a missing server.
        s.connect((HOST, PORT))
    except socket.error:
        logger.info(msg='Juman server is not running. Skip it.')
        return
    finally:
        # Runs on both paths, so the probe socket is released even when
        # the connection attempt fails.
        s.close()

    juman_wrapper = JumanWrapper(server=HOST, port=PORT)
    tokens_list = juman_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokens_list, list)

    # filtering is same as mecab
    filtered_result = JumanWrapper(
        server=HOST, port=PORT).tokenize(
            sentence, return_list=False).filter(
                pos_condition=[('名詞', )]).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
def basic_example_juman_2x():
    """Demonstrate ``JumanWrapper`` on a ``unicode`` sentence (Python 2.x).

    Tokenizes a sentence that mixes Japanese with Arabic-script, Latin and
    IPA characters, then prints stem, surface and POS of every token.
    """
    # input is `unicode` type(in python2x)
    text = u'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    wrapper = JumanWrapper()
    tokenized_sentence = wrapper.tokenize(sentence=text, normalize=True, return_list=False)
    assert isinstance(tokenized_sentence, TokenizedSenetence)

    print('-'*30)
    print(u'Juman Demo')
    line_template = u'word_stem:{}, word_surafce:{}, pos:{}'
    for token in tokenized_sentence.tokenized_objects:
        assert isinstance(token, TokenizedResult)
        print(line_template.format(token.word_stem, token.word_surface, token.tuple_pos))