Example 1
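# The examples on this page assume the JapaneseTokenizer package. The import
# paths below are a best guess from the package layout and may differ by
# version; adjust them for your install:
# from JapaneseTokenizer import MecabWrapper
# from JapaneseTokenizer.datamodels import TokenizedSenetence, TokenizedResult, FilteredObject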
def basic_example_3x():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # In Python 3.x, `str` is already unicode, so no special handling of the input is needed
    sentence = 'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Create a MecabWrapper object.
    # path_mecab_config is the directory that contains the `mecab-config` command;
    # you can check it with `which mecab-config`.
    # The default value is '/usr/local/bin'.
    path_mecab_config = '/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # Tokenize the sentence. The returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)
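
    # A minimal sketch of consuming the list form: each element is a token
    # tuple, so you can iterate and print directly (the exact tuple layout
    # may vary by library version)
    for token_tuple in tokenized_obj:
        print(token_tuple)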

    # If you pass return_list=False, the returned object is the "TokenizedSenetence" class (the class name is spelled this way in the library itself)
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence,
                                           return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = ['テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        stopwords=stopwords)
    assert isinstance(filtered_obj, FilteredObject)

    # The POS condition is a list of tuples.
    # POS tags follow the ChaSen part-of-speech tag set (IPA tag set); see http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [('名詞', '固有名詞'), ('動詞', '自立')]
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        pos_condition=pos_condition)
    assert isinstance(filtered_obj, FilteredObject)
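    # A coarser POS condition also works: a 1-tuple such as ('名詞',) matches
    # every noun regardless of subcategory (the entity-linking snippet further
    # down uses exactly this form). A minimal sketch:
    all_noun_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        pos_condition=[('名詞',)])
    assert isinstance(all_noun_obj, FilteredObject)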
    ### You can chain the calls: init instance -> tokenize -> filter -> convert to list ###
    filtered_result = MecabWrapper(
        dictType=dictType,
        path_mecab_config=path_mecab_config).tokenize(sentence).filter(
            pos_condition=pos_condition).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
Example 2
def basic_example():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # The input is of `unicode` type (in Python 2.x).
    # In Python 3.x, `str` is already unicode, so no special handling is needed
    sentence = u'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Create a MecabWrapper object.
    # osType is "generic" or "centos"; MeCab uses different system commands on CentOS.
    # If you are running on CentOS, set "centos".
    osType = "generic"

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType, osType=osType)

    # Tokenize the sentence. The returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # If you pass return_list=False, the returned object is the "TokenizedSenetence" class
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)

    # The POS condition is a list of tuples.
    # POS tags follow the ChaSen part-of-speech tag set (IPA tag set); see http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
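Example 3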
def basic_example_3x():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # In Python 3.x, `str` is already unicode, so no special handling of the input is needed
    sentence = 'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Create a MecabWrapper object.
    # path_mecab_config is the directory that contains the `mecab-config` command;
    # you can check it with `which mecab-config`.
    # The default value is '/usr/local/bin'.
    path_mecab_config = '/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # Tokenize the sentence. The returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # If you pass return_list=False, the returned object is the "TokenizedSenetence" class
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = ['テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)

    # The POS condition is a list of tuples.
    # POS tags follow the ChaSen part-of-speech tag set (IPA tag set); see http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [('名詞', '固有名詞'), ('動詞', '自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
Example 4
def basic_example_mecab_2x():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # The input is of `unicode` type (in Python 2.x)
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Create a MecabWrapper object.
    # you can choose from "neologd", "all", "ipadic", "user", "", None
    # "ipadic" and "" are equivalent
    dictType = "neologd"

    mecab_wrapper = MecabWrapper(dictType=dictType)

    # Tokenize the sentence. The returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # If you pass return_list=False, the returned object is the "TokenizedSenetence" class
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence,
                                           return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        stopwords=stopwords)
    assert isinstance(filtered_obj, FilteredObject)
    print('-' * 30)
    print(u'Mecab Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem, token_obj.word_surface, token_obj.tuple_pos))

    # The POS condition is a list of tuples.
    # POS tags follow the ChaSen part-of-speech tag set (IPA tag set); see http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        pos_condition=pos_condition)
    assert isinstance(filtered_obj, FilteredObject)
    print('-' * 30)
    print(u'Mecab Filtering Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem, token_obj.word_surface, token_obj.tuple_pos))
    ### You can chain the calls: init instance -> tokenize -> filter -> convert to list ###
    filtered_result = MecabWrapper(
        dictType=dictType).tokenize(sentence).filter(
            pos_condition=pos_condition).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
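Example 5

# This standalone snippet links extracted nouns to Japanese Wikipedia entities
# using a word2vec entity-vector model and a local MySQL copy of Wikipedia.
# A note on imports: load_entity_model, initialize_pymysql_connector and
# predict_japanese_wiki_names_with_wikidump appear to come from the author's
# word2vec_wikification_py package, and Word2Vec from gensim.models; the
# import paths below are assumptions and may differ by version:
# from word2vec_wikification_py import load_entity_model, initialize_pymysql_connector, predict_japanese_wiki_names_with_wikidump
# from gensim.models import Word2Vec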
path_model_file = '../bin/entity_vector/entity_vector.model.bin'
dict_type = 'neologd'
path_mecab_config = '/usr/local/bin/'
pos_condition = [('名詞', )]
mysql_username = '******'
mysql_hostname = 'localhost'
mysql_password = '******'
mysql_db_name = 'wikipedia'
# ------------------------------------------------------------
entity_linking_model = load_entity_model(path_entity_model=path_model_file, is_use_cache=True)  # type: Word2Vec
mecab_tokenizer = MecabWrapper(dict_type, path_mecab_config=path_mecab_config)
mysql_connector = initialize_pymysql_connector(hostname=mysql_hostname,
                                               user_name=mysql_username,
                                               password=mysql_password,
                                               dbname=mysql_db_name)

input_sentence = "かつてはイルモア、WCMといったプライベーターがオリジナルマシンで参戦していたほか、カワサキがワークス・チームを送り込んでいたが、2016年現在出場しているのはヤマハ、ホンダ、スズキ、ドゥカティ、アプリリアの5メーカーと、ワークスマシンの貸与等を受けられるサテライトチームとなっている。"
filtered_nouns = mecab_tokenizer.filter(
    parsed_sentence=mecab_tokenizer.tokenize(sentence=input_sentence, return_list=False),
    pos_condition=pos_condition).convert_list_object()

sequence_score_objects = predict_japanese_wiki_names_with_wikidump(input_tokens=filtered_nouns,
                                                                   wikipedia_db_connector=mysql_connector,
                                                                   entity_vector_model=entity_linking_model,
                                                                   is_use_cache=True,
                                                                   is_sort_object=True)
for rank, sequence_obj in enumerate(sequence_score_objects):
    print('Rank-{} with score={}'.format(rank, sequence_obj.sequence_score))
    print(sequence_obj.get_tokens())
    print('-'*30)
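Example 6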
def basic_example_mecab_2x():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # The input is of `unicode` type (in Python 2.x)
    sentence = u'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # Create a MecabWrapper object.
    # path_mecab_config is the directory that contains the `mecab-config` command;
    # you can check it with `which mecab-config`.
    # The default value is '/usr/local/bin'.
    path_mecab_config = '/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = "neologd"

    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # Tokenize the sentence. The returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # If you pass return_list=False, the returned object is the "TokenizedSenetence" class
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)
    print('-'*30)
    print(u'Mecab Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem,
            token_obj.word_surface,
            token_obj.tuple_pos))

    # The POS condition is a list of tuples.
    # POS tags follow the ChaSen part-of-speech tag set (IPA tag set); see http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
    print('-'*30)
    print(u'Mecab Filtering Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem,
            token_obj.word_surface,
            token_obj.tuple_pos))