def basic_example_3x():
    """Demonstrate tokenizing and filtering with MecabWrapper under Python 3,
    where a plain ``str`` can be passed directly."""
    # --------------------------------------------------------
    # TOKENIZE
    # --------------------------------------------------------
    sentence = 'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Directory that contains the `mecab-config` command; check with
    # `which mecab-config`. '/usr/local/bin' is the library default.
    path_mecab_config = '/usr/local/bin'

    # Dictionary choices: "neologd", "all", "ipadic", "user", "".
    # "" and "ipadic" are equivalent.
    dict_type = ""
    wrapper = MecabWrapper(dictType=dict_type)

    # return_list=True -> result is a plain list of tuples.
    tokens = wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokens, list)

    # return_list=False -> result is a TokenizedSenetence object.
    tokens = wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokens, TokenizedSenetence)

    # --------------------------------------------------------
    # FILTERING: drop tokens via stopwords or POS conditions.
    # --------------------------------------------------------
    stop_words = ['テヘラン']
    assert isinstance(tokens, TokenizedSenetence)
    # filter() returns a FilteredObject.
    filtered = wrapper.filter(parsed_sentence=tokens, stopwords=stop_words)
    assert isinstance(filtered, FilteredObject)

    # POS condition: list of tuples in the ChaSen (IPA) tag set, see
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [('名詞', '固有名詞'), ('動詞', '自立')]
    filtered = wrapper.filter(parsed_sentence=tokens, pos_condition=pos_condition)
    assert isinstance(filtered, FilteredObject)

    # Chain expression: init-instance -> tokenize -> filter -> plain list.
    chained_result = MecabWrapper(
        dictType=dict_type,
        path_mecab_config=path_mecab_config).tokenize(sentence).filter(
        pos_condition=pos_condition).convert_list_object()
    assert isinstance(chained_result, list)
    print(chained_result)
def basic_example():
    """Demonstrate tokenize/filter with MecabWrapper.

    Input must be ``unicode`` under Python 2; under Python 3 a plain
    ``str`` is fine.
    """
    # --------------------------------------------------------
    # TOKENIZE
    # --------------------------------------------------------
    sentence = u'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # osType is "generic" or "centos"; MeCab uses a different system
    # command on CentOS, so pass "centos" when running there.
    os_type = "generic"

    # Dictionary choices: "neologd", "all", "ipadic", "user", "".
    # "" and "ipadic" are equivalent.
    dict_type = ""
    wrapper = MecabWrapper(dictType=dict_type, osType=os_type)

    # Default call returns a list of tuples.
    token_result = wrapper.tokenize(sentence=sentence)
    assert isinstance(token_result, list)

    # return_list=False returns a TokenizedSenetence object instead.
    token_result = wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(token_result, TokenizedSenetence)

    # --------------------------------------------------------
    # FILTERING: by stopword list or by POS conditions.
    # --------------------------------------------------------
    stop_words = [u'テヘラン']
    assert isinstance(token_result, TokenizedSenetence)
    # filter() returns a FilteredObject.
    filtered_result = wrapper.filter(
        parsed_sentence=token_result,
        stopwords=stop_words
    )
    assert isinstance(filtered_result, FilteredObject)

    # POS condition: list of tuples in the ChaSen (IPA) tag set, see
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_result = wrapper.filter(
        parsed_sentence=token_result,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_result, FilteredObject)
def basic_example_3x():
    """Tokenize and filter a Japanese sentence with MecabWrapper (Python 3).

    NOTE(review): a function with this same name is defined earlier in this
    module; this later definition shadows the earlier one — confirm which
    is intended to survive.
    """
    # ========================================================
    # TOKENIZE
    # ========================================================
    sentence = 'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Path where the `mecab-config` command exists; check with
    # `which mecab-config`. '/usr/local/bin' is the library default.
    path_mecab_config = '/usr/local/bin'

    # Dictionary choices: "neologd", "all", "ipadic", "user", "".
    # "ipadic" and "" are equivalent.
    dictType = ""
    # Fix: path_mecab_config was assigned but never used. Pass it through
    # (it equals the documented default, so behavior is unchanged), which
    # also matches the other examples in this file.
    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # tokenize: default call returns a list of tuples.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence)
    assert isinstance(tokenized_obj, list)

    # return_list=False returns a TokenizedSenetence object instead.
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING: by stopword list or by POS conditions.
    # ========================================================
    stopwords = ['テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # filter() returns a FilteredObject.
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)

    # POS condition: list of tuples in the ChaSen (IPA) tag set, see
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [('名詞', '固有名詞'), ('動詞', '自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
def basic_example_mecab_2x():
    """Mecab demo: tokenize, filter, and print results.

    Input must be ``unicode`` under Python 2.
    """
    def show_tokens(filtered):
        # Print stem, surface form, and POS tuple of each surviving token.
        for token in filtered.tokenized_objects:
            assert isinstance(token, TokenizedResult)
            print(u'word_stem:{}, word_surafce:{}, pos:{}'.format(
                token.word_stem, token.word_surface, token.tuple_pos))

    # --------------------------------------------------------
    # TOKENIZE
    # --------------------------------------------------------
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Dictionary choices: "neologd", "all", "ipadic", "user", "", None.
    # "ipadic" and "" are equivalent.
    dict_name = "neologd"
    tokenizer = MecabWrapper(dictType=dict_name)

    # return_list=True -> plain list of tuples.
    parsed = tokenizer.tokenize(sentence=sentence, return_list=True)
    assert isinstance(parsed, list)

    # return_list=False -> TokenizedSenetence object.
    parsed = tokenizer.tokenize(sentence=sentence, return_list=False)
    assert isinstance(parsed, TokenizedSenetence)

    # --------------------------------------------------------
    # FILTERING: by stopword list.
    # --------------------------------------------------------
    stop_words = [u'テヘラン']
    assert isinstance(parsed, TokenizedSenetence)
    # filter() returns a FilteredObject.
    survivors = tokenizer.filter(parsed_sentence=parsed, stopwords=stop_words)
    assert isinstance(survivors, FilteredObject)
    #
    print('-' * 30)
    print(u'Mecab Demo')
    show_tokens(survivors)

    # POS condition: list of tuples in the ChaSen (IPA) tag set, see
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    survivors = tokenizer.filter(parsed_sentence=parsed, pos_condition=pos_condition)
    assert isinstance(survivors, FilteredObject)
    print('-' * 30)
    print(u'Mecab Filtering Demo')
    show_tokens(survivors)

    # Chain expression: init-instance -> tokenize -> filter -> plain list.
    chained = MecabWrapper(
        dictType=dict_name).tokenize(sentence).filter(
        pos_condition=pos_condition).convert_list_object()
    assert isinstance(chained, list)
    print(chained)
# --- configuration ----------------------------------------------------------
path_model_file = '../bin/entity_vector/entity_vector.model.bin'
dict_type = 'neologd'
path_mecab_config = '/usr/local/bin/'
# Keep only nouns (any subtype) when filtering the tokenized sentence.
pos_condition = [('名詞', )]
mysql_username = '******'
mysql_hostname = 'localhost'
mysql_password = '******'
mysql_db_name = 'wikipedia'
# ------------------------------------------------------------
# Fix: the entity-vector model was previously loaded TWICE — once
# positionally and once into an unused `model_object` local. Load it a
# single time with the cache enabled and reuse that object below.
entity_linking_model = load_entity_model(path_entity_model=path_model_file, is_use_cache=True)  # type: Word2Vec
mecab_tokenizer = MecabWrapper(dict_type, path_mecab_config=path_mecab_config)
mysql_connector = initialize_pymysql_connector(hostname=mysql_hostname,
                                               user_name=mysql_username,
                                               password=mysql_password,
                                               dbname=mysql_db_name)
input_sentence = "かつてはイルモア、WCMといったプライベーターがオリジナルマシンで参戦していたほか、カワサキがワークス・チームを送り込んでいたが、2016年現在出場しているのはヤマハ、ホンダ、スズキ、ドゥカティ、アプリリアの5メーカーと、ワークスマシンの貸与等を受けられるサテライトチームとなっている。"
# Tokenize, keep nouns only, and flatten to a plain token list.
filtered_nouns = mecab_tokenizer.filter(
    parsed_sentence=mecab_tokenizer.tokenize(sentence=input_sentence, return_list=False),
    pos_condition=pos_condition).convert_list_object()
# Run entity linking of the noun tokens against the wikipedia dump DB;
# results come back sorted by sequence score (is_sort_object=True).
sequence_score_objects = predict_japanese_wiki_names_with_wikidump(
    input_tokens=filtered_nouns,
    wikipedia_db_connector=mysql_connector,
    entity_vector_model=entity_linking_model,
    is_use_cache=True,
    is_sort_object=True)
for rank, sequence_obj in enumerate(sequence_score_objects):
    print('Rank-{} with score={}'.format(rank, sequence_obj.sequence_score))
    print(sequence_obj.get_tokens())
    print('-' * 30)
def basic_example_mecab_2x():
    """Mecab tokenize/filter demo (Python 2 style: ``unicode`` input).

    NOTE(review): a function with this same name appears earlier in the
    file; the later definition shadows the earlier one — confirm which
    is intended.
    """
    def dump(filtered):
        # Show stem, surface form, and POS tuple for each surviving token.
        for item in filtered.tokenized_objects:
            assert isinstance(item, TokenizedResult)
            print(u'word_stem:{}, word_surafce:{}, pos:{}'.format(
                item.word_stem, item.word_surface, item.tuple_pos))

    # --------------------------------------------------------
    # TOKENIZE
    # --------------------------------------------------------
    sentence = u'テヘラン(ペルシア語: تهران ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Directory holding the `mecab-config` command; check with
    # `which mecab-config`. '/usr/local/bin' is the library default.
    config_dir = '/usr/local/bin'
    # Dictionary choices: "neologd", "all", "ipadic", "user", "".
    # "ipadic" and "" are equivalent.
    dict_name = "neologd"
    tokenizer = MecabWrapper(dictType=dict_name, path_mecab_config=config_dir)

    # Default call returns a list of tuples.
    parsed = tokenizer.tokenize(sentence=sentence)
    assert isinstance(parsed, list)
    # return_list=False returns a TokenizedSenetence object instead.
    parsed = tokenizer.tokenize(sentence=sentence, return_list=False)
    assert isinstance(parsed, TokenizedSenetence)

    # --------------------------------------------------------
    # FILTERING: by stopword list.
    # --------------------------------------------------------
    stop_words = [u'テヘラン']
    assert isinstance(parsed, TokenizedSenetence)
    # filter() returns a FilteredObject.
    survivors = tokenizer.filter(
        parsed_sentence=parsed,
        stopwords=stop_words
    )
    assert isinstance(survivors, FilteredObject)
    #
    print('-'*30)
    print(u'Mecab Demo')
    dump(survivors)

    # POS condition: list of tuples in the ChaSen (IPA) tag set, see
    # http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'),
                     (u'動詞', u'自立')]
    survivors = tokenizer.filter(
        parsed_sentence=parsed,
        pos_condition=pos_condition
    )
    assert isinstance(survivors, FilteredObject)
    print('-'*30)
    print(u'Mecab Filtering Demo')
    dump(survivors)