Example no. 1
def add_auth_feature_bigram(auth_):
    # location of this author's tokenized (wakati) texts
    wakati_path = my_path.project_path()+'src/wakati/{}/'.format(auth_)
    # the author's list of modern-orthography works
    booklist = new_booklist[auth_]

    # per-work Counters of POS bigram counts
    bi_num = make_hinsi_bigram(wakati_path, booklist)
    # convert the counts to rates
    bi_per = num_to_per(bi_num)

    # load the existing feature dictionary
    save_path = my_path.project_path()+'src/feature/bigram_per.txt'
    with open(save_path, mode="rb") as f:
        res = pickle.load(f)

    # add this author's entry
    res[auth_] = bi_per

    # save the updated dictionary
    with open(save_path, mode="wb") as f:
        pickle.dump(res, f)
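A minimal usage sketch, assuming init_feature_location() (Example no. 3) has already seeded the pickle files and that the keys of new_booklist are author identifiers:

for auth in new_booklist:
    add_auth_feature_bigram(auth)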
Example no. 2
def main(data_opt_):
    # report the selected feature set first
    if data_opt_ == 'hinsi':
        opt = 'POS occurrence rates'
    elif data_opt_ == 'bigram':
        opt = 'POS 2-gram occurrence rates'
    else:
        raise ValueError('unknown data_opt_: {}'.format(data_opt_))
    print('Features used: '+opt)

    # build the author-pair list so that every pairing is run
    key_pairs = clf_pair.name_pair()
    # dict for collecting the results
    all_result = {}
    # run every pair in key_pairs
    for pair in key_pairs:
        # classify
        result, confusion = clf_main.classify(data_opt_, pair, 5, True, 'forest', 'up')
        # store the result in all_result under the key 'authorkey1-authorkey2'
        all_result['-'.join(pair)] = {'res': result, 'con': confusion}

    # dump all_result with pickle
    with open(my_path.project_path()+'classification/result/res_{}_raw.txt'.format(data_opt_), mode="wb") as f:
        pickle.dump(all_result, f)

    # write a text summary of the results
    write_res_view(all_result, data_opt_, 5)
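A minimal usage sketch; the entry-point guard is an assumption, and 'hinsi' is simply one of the two accepted options:

if __name__ == '__main__':
    main('hinsi')  # or main('bigram')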
Example no. 3
def init_feature_location():
    # create the feature directory and seed both pickle files with empty dicts
    # (assumes the directory does not exist yet)
    feature_path = my_path.project_path()+'src/feature/'
    os.mkdir(feature_path)
    with open(feature_path+'hinsi_per.txt', mode="wb") as f:
        pickle.dump({}, f)
    with open(feature_path+'bigram_per.txt', mode="wb") as f:
        pickle.dump({}, f)

def save_file_(data_, author):
    # data_[0] is the work title; data_[1] and data_[2] are the token
    # and POS sequences (make_hinsi_num reads file line 1 as POS tags)
    name = data_[0]
    data_[1] = ' '.join(data_[1])
    data_[2] = ' '.join(data_[2])
    body = '\n'.join(data_[1:])
    with open(my_path.project_path()+'src/wakati/{0}/wakati_{1}.txt'.format(author, name), mode='w') as f:
        f.write(body)
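A hedged call sketch showing the argument shape save_file_ appears to expect (title, token list, POS list); every value here is hypothetical, and the author's src/wakati/ subdirectory must already exist (see auth_wakati_mkdir in Example no. 8):

save_file_(['some_title', ['吾輩', 'は', '猫'], ['名詞', '助詞', '名詞']], 'some_author')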
Example no. 5
def add_auth_feature_hinsi(auth_):
    # location of the tokenized (wakati) texts
    wakati_path = my_path.project_path()+'src/wakati/{}/'.format(auth_)
    # the author's list of modern-orthography works
    booklist = new_booklist[auth_]

    # per-work Counters of POS counts
    fnum_auth = make_hinsi_num(wakati_path, booklist)
    # convert fnum_auth to POS rates
    fper_auth = num_to_per(fnum_auth)

    # load the existing data from the destination file
    save_path = my_path.project_path()+'src/feature/hinsi_per.txt'
    with open(save_path, mode="rb") as f:
        res = pickle.load(f)

    # add the new entry to the dict
    res[auth_] = fper_auth
    # save
    with open(save_path, mode="wb") as f:
        pickle.dump(res, f)
Example no. 6
def write_res_view(source_, data_opt_, num=None):
    # lines to be written to the file
    write_list = []

    # iterate over all author pairs
    for k, v in source_.items():
        # split the key into author keys and convert them to full names
        authors = k.split('-')
        authors = [full_name.fullname_dict[i] for i in authors]
        # prepare the author and score lines
        authors_print = 'Authors: '+','.join(authors)
        score_print = 'score: {:.5f}'.format(v['res']['s'])
        # lines for the feature importances
        imp_print = ['Feature importances']
        # counter so the loop can stop after num entries
        count = 0
        # iterate over the importances in descending order
        for key, val in sorted(v['res']['i'].items(), key=lambda x: x[1], reverse=True):
            # append to imp_print
            imp_print.append(key+': {:.5f}'.format(val))
            # stop once count reaches num
            count += 1
            if count == num:
                break

        conf_print = []
        confusion_header = ['', 'p_0', 'p_1']
        # get the numbers for the confusion matrices
        out_confusion = clf_output.gen_confusion(v['con'])
        # iterate over all confusion matrices
        for conf in out_confusion:
            this_print = []
            # compute the error rate within each predicted class
            this_print.append('t_1 / p_0 : {:.5f}'.format(conf[1][1] / (conf[0][1] + conf[1][1])))
            this_print.append('t_0 / p_1 : {:.5f}'.format(conf[0][2] / (conf[0][2] + conf[1][2])))
            this_print.append('--------------------------')
            # append this round's numbers to conf_print
            conf_print.append('\n'.join(this_print))

        # append everything prepared so far to write_list
        write_list.append(authors_print)
        write_list.append(score_print)
        write_list.append('\n'.join(imp_print))
        write_list.append('\n'.join(conf_print))

    # save under a file name derived from the feature option
    with open(my_path.project_path()+'classification/result/res_{}_view.txt'.format(data_opt_), mode="w") as f:
        f.write('\n'.join(write_list))
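A hedged sketch of the input shape write_res_view expects, inferred from the accesses above; all values are illustrative, not taken from the original:

source_ = {
    'auth1-auth2': {
        'res': {'s': 0.85, 'i': {'名詞': 0.31, '助詞': 0.22}},  # score and feature importances
        'con': None,  # raw confusion data, consumed by clf_output.gen_confusion
    },
}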
Example no. 7
import pickle, sys, os
from collections import Counter
import full_name, my_path

with open(my_path.project_path()+'src/booklist/new_booklist.txt', mode="rb") as f:
    new_booklist = pickle.load(f)

# build per-work Counters of POS counts
def make_hinsi_num(wakati_path_, booklist_):
    r_list = []
    for book in booklist_:
        # the second line of each wakati file holds the POS tags
        with open(wakati_path_+'wakati_'+book+'.txt', mode="r") as f:
            hinsi = f.read().split('\n')[1].split()
        h_counter = Counter(hinsi)
        r_list.append(h_counter)
    return r_list

# build per-work Counters of POS bigrams
def make_hinsi_bigram(wakati_path_, booklist_):
    # list to return
    r_list = []
    # iterate over booklist_, the list of modern-orthography works
    for book in booklist_:
        # file name for this work
        bookpath = 'wakati_{}.txt'.format(book)
        # open it and pull out the POS list (second line of the file)
        with open(wakati_path_+bookpath, mode="r") as f:
            hinsi = f.read().split('\n')[1].split()
        # list of bigrams
        hinsi_bigram = []
        # iterate with index and value, pairing each tag with its successor
        for i, h in enumerate(hinsi[:-1]):
            hinsi_bigram.append(h+'-'+hinsi[i+1])
        r_list.append(Counter(hinsi_bigram))
    return r_list
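A minimal self-contained demonstration of the same bigram construction on an in-memory tag list (the sample tags are arbitrary):

from collections import Counter
hinsi = ['名詞', '助詞', '動詞', '名詞', '助詞']
bigrams = [hinsi[i]+'-'+hinsi[i+1] for i in range(len(hinsi)-1)]
print(Counter(bigrams))  # Counter({'名詞-助詞': 2, '助詞-動詞': 1, '動詞-名詞': 1})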
Example no. 8
    '感動詞-接続詞',
    '感動詞-指示詞',
    '感動詞-接頭辞',
    '感動詞-連体詞',
    '感動詞-感動詞',
]

#-----------------------------------

# load the source data
'''
Read these with copy.deepcopy when using them;
using them as-is lets overwrites accumulate and inflate the data.
'''

with open(my_path.project_path() +
          'classification/source/source_hinsi_per.txt',
          mode="rb") as f:
    source_data_hinsi = pickle.load(f)

with open(my_path.project_path() +
          'classification/source/source_hinsi_bigram.txt',
          mode="rb") as f:
    source_data_bigram = pickle.load(f)

#-----------------------------------


# extract the data for the two specified authors
# input: source_data_hinsi/bigram, [auth, auth]
# output: data (both authors' feature vectors), label (author labels for data; the first author is 0, the second 1)
def auth_wakati_mkdir(auth_):
    # create the destination directory in advance
    save_path = my_path.project_path() + 'src/wakati/{0}'.format(auth_)
    os.mkdir(save_path)
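The comment block above auth_wakati_mkdir documents a two-author selector whose body is not included in this snippet. A hedged sketch consistent with that description, assuming each source_[auth] entry is a list of per-work feature vectors; the function name and internals are hypothetical:

def gen_data(source_, pair_):
    data, label = [], []
    for idx, auth in enumerate(pair_):
        for features in source_[auth]:
            data.append(features)
            label.append(idx)  # first author -> 0, second -> 1
    return data, label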
Example no. 10
import pickle, my_path

# feature data
source_path = my_path.project_path() + 'src/feature/'
# destination for the classification source data
dump_path = my_path.project_path() + 'classification/source/'

with open(source_path + 'hinsi_per.txt', mode="rb") as f:
    source_hinsi_dict = pickle.load(f)

with open(source_path + 'bigram_per.txt', mode="rb") as f:
    source_bigram_dict = pickle.load(f)

# POS tag labels (kept in Japanese: they must match the tags in the wakati files)
hinsi_label = [
    '名詞', '接尾辞', '助詞', '動詞', '特殊', '副詞', '形容詞', '判定詞', '未定義語', '助動詞', '接続詞',
    '指示詞', '接頭辞', '連体詞', '感動詞'
]

# POS bigram labels, 'tag1-tag2', matching the keys built by make_hinsi_bigram
bigram_label = [
    '名詞-名詞',
    '名詞-接尾辞',
    '名詞-助詞',
    '名詞-動詞',
    '名詞-特殊',
    '名詞-副詞',
    '名詞-形容詞',
    '名詞-判定詞',
    '名詞-未定義語',
    '名詞-助動詞',
    '名詞-接続詞',
    '名詞-指示詞',