Example #1
def __func_tokenizer(text: str,
                     tokenizer_obj: MecabWrapper,
                     pos_condition: List[Tuple[str, ...]] = None,
                     is_surface: bool = False) -> List[str]:
    """* What you can do
    - This is the base tokenizer function.
    - Use it with functools.partial to bind everything except `text` (a usage sketch follows this function).
    """
    if pos_condition is None:
        return tokenizer_obj.tokenize(sentence=text, is_surface=is_surface).convert_list_object()
    else:
        return tokenizer_obj.tokenize(sentence=text, is_surface=is_surface).filter(pos_condition).convert_list_object()
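A minimal usage sketch (not part of the original example), assuming MecabWrapper is importable and the neologd dictionary is installed; the wrapper instance and POS condition below are illustrative:

from functools import partial

tokenizer_obj = MecabWrapper(dictType='neologd')
# bind the fixed arguments so the resulting callable only needs the text
noun_tokenizer = partial(__func_tokenizer,
                         tokenizer_obj=tokenizer_obj,
                         pos_condition=[('名詞',)],
                         is_surface=False)
tokens = noun_tokenizer('イランの首都はテヘランである。')  # -> List[str]
assert isinstance(tokens, list)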
Example #2
def basic_example():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # In Python 2.x the input must be `unicode`; in Python 3.x a plain str is fine
    sentence = u'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # osType is "generic" or "centos"; MeCab uses a different system command on CentOS.
    # If you're running this on CentOS, set "centos"
    osType = "generic"

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType, osType=osType)

    # tokenize the sentence; the returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)

    # the POS condition is a list of tuples
    # You can use any POS from the "ChaSen 品詞体系 (IPA品詞体系)" (ChaSen/IPA POS tagset) table on this page: http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
def basic_example_3x():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # In Python 3.x a plain str is fine (no u'' prefix needed)
    sentence = 'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # path_mecab_config is the directory where the `mecab-config` command lives; you can check it with `which mecab-config`
    # the default value is '/usr/local/bin'
    path_mecab_config='/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # tokenize the sentence; the returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = ['テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)

    # the POS condition is a list of tuples
    # You can use any POS from the "ChaSen 品詞体系 (IPA品詞体系)" (ChaSen/IPA POS tagset) table on this page: http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [('名詞', '固有名詞'), ('動詞', '自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
Example #4
def advanced_example_3x():
    # ========================================================
    # USE YOUR OWN DICTIONARY
    # with your own dictionary, you can force MeCab to treat certain words as a single token
    # ========================================================
    # make your own "user dictionary" as a CSV file (a sample row format is sketched after this example)
    # To learn more about this file, see this page (Japanese only): https://mecab.googlecode.com/svn/trunk/mecab/doc/dic.html
    example_user_dict = "userdict.csv"

    # set dictType='user' or dictType='all'
    # set pathUserDictCsv
    mecab_wrapper = MecabWrapper(dictType='user',
                                 pathUserDictCsv=example_user_dict)
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    tokenized_obj = mecab_wrapper.tokenize(sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)
    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        if token_obj.word_stem == 'ペルシア語':
            print(token_obj.word_stem)
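For reference, an IPAdic-style user-dictionary CSV row has the fields "surface,left-context-id,right-context-id,cost" followed by the POS features, base form, reading, and pronunciation. The row below is a hypothetical sketch of what userdict.csv could contain (the empty context IDs and the cost of 5000 are illustrative, not taken from the original example):

テヘラン州,,,5000,名詞,固有名詞,地域,一般,*,*,テヘラン州,テヘランシュウ,テヘランシュウ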
Example #5
def basic_example_3x():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # In Python 3.x a plain str is fine (no u'' prefix needed)
    sentence = 'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # path_mecab_config is the directory where the `mecab-config` command lives; you can check it with `which mecab-config`
    # the default value is '/usr/local/bin'
    path_mecab_config = '/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = ""

    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # tokenize the sentence; the returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence,
                                           return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = ['テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        stopwords=stopwords)
    assert isinstance(filtered_obj, FilteredObject)

    # the POS condition is a list of tuples
    # You can use any POS from the "ChaSen 品詞体系 (IPA品詞体系)" (ChaSen/IPA POS tagset) table on this page: http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [('名詞', '固有名詞'), ('動詞', '自立')]
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        pos_condition=pos_condition)
    assert isinstance(filtered_obj, FilteredObject)
    ### You can write a chained expression: init instance -> tokenize -> filter -> list ###
    filtered_result = MecabWrapper(
        dictType=dictType,
        path_mecab_config=path_mecab_config).tokenize(sentence).filter(
            pos_condition=pos_condition).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
def advanced_example_3x():
    # ========================================================
    # USE YOUR OWN DICTIONARY
    # with your own dictionary, you can force MeCab to treat certain words as a single token
    # ========================================================
    # make your own "user dictionary" as a CSV file
    # To learn more about this file, see this page (Japanese only): https://mecab.googlecode.com/svn/trunk/mecab/doc/dic.html
    example_user_dict = "userdict.csv"

    # set dictType='user' or dictType='all'
    # set pathUserDictCsv
    mecab_wrapper = MecabWrapper(
        dictType='user',
        pathUserDictCsv=example_user_dict
    )
    sentence = 'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    tokenized_obj = mecab_wrapper.tokenize(sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)
    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        if token_obj.word_stem == 'ペルシア語':
            print(token_obj.word_stem)
Example #7
class MecabTokenizer:
    def __init__(self, dict_type=None):
        if dict_type:
            self.tokenizer = MecabWrapper(dictType=dict_type)
        else:
            self.tokenizer = MecabWrapper(dictType=None)

    def tokenize(self, text):
        tokenized_objects = self.tokenizer.tokenize(text).tokenized_objects
        return [
            dict(analyzed_line=obj.analyzed_line,
                 word_surface=obj.word_surface,
                 word_stem=obj.word_stem,
                 pos=list(obj.tuple_pos),
                 misc_info=obj.misc_info) for obj in tokenized_objects
        ]
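A brief usage sketch (hypothetical, not from the original project), assuming MeCab and a default dictionary are installed and that tokenize() returns a TokenizedSenetence here:

tokenizer = MecabTokenizer()
token_dicts = tokenizer.tokenize('イランの首都はテヘランである。')
# each element is a dict with analyzed_line, word_surface, word_stem, pos, and misc_info keys
assert isinstance(token_dicts, list) and all(isinstance(d, dict) for d in token_dicts)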
Example #9
def basic_example_mecab_2x():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # In Python 2.x the input must be `unicode` type
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # you can choose from "neologd", "all", "ipadic", "user", "", None
    # "ipadic" and "" are equivalent
    dictType = "neologd"

    mecab_wrapper = MecabWrapper(dictType=dictType)

    # tokenize the sentence; the returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence,
                                           return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        stopwords=stopwords)
    assert isinstance(filtered_obj, FilteredObject)
    #
    print('-' * 30)
    print(u'Mecab Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem, token_obj.word_surface, token_obj.tuple_pos))

    # the POS condition is a list of tuples
    # You can use any POS from the "ChaSen 品詞体系 (IPA品詞体系)" (ChaSen/IPA POS tagset) table on this page: http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(parsed_sentence=tokenized_obj,
                                        pos_condition=pos_condition)
    assert isinstance(filtered_obj, FilteredObject)
    print('-' * 30)
    print(u'Mecab Filtering Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem, token_obj.word_surface, token_obj.tuple_pos))
    ### You can write a chained expression: init instance -> tokenize -> filter -> list ###
    filtered_result = MecabWrapper(
        dictType=dictType).tokenize(sentence).filter(
            pos_condition=pos_condition).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
"""In this example, you see how to get wikipedia-liked information from Japanese sentence
"""

# ------------------------------------------------------------
# PARAMETERS
path_model_file = '../bin/entity_vector/entity_vector.model.bin'
dict_type = 'neologd'
path_mecab_config = '/usr/local/bin/'
pos_condition = [('名詞', )]
mysql_username = '******'
mysql_hostname = 'localhost'
mysql_password = '******'
mysql_db_name = 'wikipedia'
# ------------------------------------------------------------
entity_linking_model = load_entity_model(path_model_file)
mecab_tokenizer = MecabWrapper(dict_type, path_mecab_config=path_mecab_config)
model_object = load_entity_model(path_entity_model=path_model_file, is_use_cache=True)  # type: Word2Vec
mysql_connector = initialize_pymysql_connector(hostname=mysql_hostname,
                                               user_name=mysql_username,
                                               password=mysql_password,
                                               dbname=mysql_db_name)

input_sentence = "かつてはイルモア、WCMといったプライベーターがオリジナルマシンで参戦していたほか、カワサキがワークス・チームを送り込んでいたが、2016年現在出場しているのはヤマハ、ホンダ、スズキ、ドゥカティ、アプリリアの5メーカーと、ワークスマシンの貸与等を受けられるサテライトチームとなっている。"
filtered_nouns = mecab_tokenizer.filter(
    parsed_sentence=mecab_tokenizer.tokenize(sentence=input_sentence, return_list=False),
    pos_condition=pos_condition).convert_list_object()

sequence_score_ojects = predict_japanese_wiki_names_with_wikidump(input_tokens=filtered_nouns,
                                                                  wikipedia_db_connector=mysql_connector,
                                                                  entity_vector_model=entity_linking_model,
                                                                  is_use_cache=True,
Example #11

PATH_TRAINING_TEXT = './wikipedia_data/wikipedia-full.json'
PATH_TEST_TEXT = './wikipedia_data/wikipedia-evaluation-full.json'
PATH_ENTITY_VECTOR = './entity_vector/entity_vector.model.bin'
PATH_SAVE_TARINED_MODEL = './trained_auto_encoder.h5'
POS_CONDITION = [('名詞',), ('動詞', '自立'), ('形容詞', '自立'), ('副詞',), ('助動詞',), ('連体詞',)]

## check that the input files exist ##
if not os.path.exists(PATH_TRAINING_TEXT):
    raise FileNotFoundError(PATH_TRAINING_TEXT)
if not os.path.exists(PATH_ENTITY_VECTOR):
    raise FileNotFoundError(PATH_ENTITY_VECTOR)

## initialize the tokenizer function ##
tokenizer_obj = MecabWrapper(dictType='neologd')
get_token = partial(__func_tokenizer,
                    tokenizer_obj=tokenizer_obj,
                    pos_condition=POS_CONDITION,
                    is_surface=False)
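# A minimal check (not part of the original script): the partial now only needs the text
# and returns the POS-filtered token list for it; the sample sentence is illustrative.
sample_tokens = get_token(text='カワサキがワークス・チームを送り込んでいた。')
assert isinstance(sample_tokens, list)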

## load word embedding ##
# newer gensim versions expose KeyedVectors; fall back to the old Word2Vec API otherwise
try:
    embedding_model = KeyedVectors.load_word2vec_format(PATH_ENTITY_VECTOR,
                                                        **{'binary': True, 'unicode_errors': 'ignore'})
except:
    embedding_model = Word2Vec.load_word2vec_format(PATH_ENTITY_VECTOR,
                                                    **{'binary': True, 'unicode_errors': 'ignore'})


## make training data ##
def basic_example_mecab_2x():
    # ========================================================
    # TOKENIZE
    # ========================================================

    # In Python 2.x the input must be `unicode` type
    sentence = u'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make a MecabWrapper object
    # path_mecab_config is the directory where the `mecab-config` command lives; you can check it with `which mecab-config`
    # the default value is '/usr/local/bin'
    path_mecab_config='/usr/local/bin'

    # you can choose from "neologd", "all", "ipadic", "user", ""
    # "ipadic" and "" are equivalent
    dictType = "neologd"

    mecab_wrapper = MecabWrapper(dictType=dictType, path_mecab_config=path_mecab_config)

    # tokenize the sentence; the returned object is a list of tuples
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence)
    assert isinstance(tokenized_obj, list)

    # The returned object is a "TokenizedSenetence" instance if you pass return_list=False
    tokenized_obj = mecab_wrapper.tokenize(sentence=sentence, return_list=False)
    assert isinstance(tokenized_obj, TokenizedSenetence)

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions

    # stopwords is a list object
    stopwords = [u'テヘラン']
    assert isinstance(tokenized_obj, TokenizedSenetence)
    # returned object is "FilteredObject" class
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        stopwords=stopwords
    )
    assert isinstance(filtered_obj, FilteredObject)
    #
    print('-'*30)
    print(u'Mecab Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem,
            token_obj.word_surface,
            token_obj.tuple_pos))

    # the POS condition is a list of tuples
    # You can use any POS from the "ChaSen 品詞体系 (IPA品詞体系)" (ChaSen/IPA POS tagset) table on this page: http://www.unixuser.org/~euske/doc/postag/#chasen
    pos_condition = [(u'名詞', u'固有名詞'), (u'動詞', u'自立')]
    filtered_obj = mecab_wrapper.filter(
        parsed_sentence=tokenized_obj,
        pos_condition=pos_condition
    )
    assert isinstance(filtered_obj, FilteredObject)
    print('-'*30)
    print(u'Mecab Filtering Demo')
    for token_obj in filtered_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_obj.word_stem,
            token_obj.word_surface,
            token_obj.tuple_pos))