def add_auth_feature_bigram(auth_):
    # Location of the segmented (wakati) texts for this author
    wakati_path = my_path.project_path() + 'src/wakati/{}/'.format(auth_)
    # The author's list of works in modern script/kana
    booklist = new_booklist[auth_]
    # Counters of POS-bigram counts per work
    bi_num = make_hinsi_bigram(wakati_path, booklist)
    # Convert counts to rates
    bi_per = num_to_per(bi_num)
    # Load the existing data from the destination file
    save_path = my_path.project_path() + 'src/feature/bigram_per.txt'
    with open(save_path, mode="rb") as f:
        res = pickle.load(f)
    # Add this author's data to the dict
    res[auth_] = bi_per
    # Save
    with open(save_path, mode="wb") as f:
        pickle.dump(res, f)
def main(data_opt_):
    # Print the feature type first
    if data_opt_ == 'hinsi':
        opt = '品詞出現率'
    elif data_opt_ == 'bigram':
        opt = '品詞2-gram出現率'
    print('使用特徴量:' + opt)
    # Build the list of author pairs with name_pair() so every author is covered
    key_pairs = clf_pair.name_pair()
    # Dict for storing the results
    all_result = {}
    # Run classification for every pair in key_pairs
    for pair in key_pairs:
        # classify
        result, confusion = clf_main.classify(data_opt_, pair, 5, True, 'forest', 'up')
        # Store the result in all_result under the key 'authorkey1-authorkey2'
        all_result['-'.join(pair)] = {'res': result, 'con': confusion}
    # Dump all_result with pickle
    with open(my_path.project_path() + 'classification/result/res_{}_raw.txt'.format(data_opt_), mode="wb") as f:
        pickle.dump(all_result, f)
    # Write a text summary of the results to a file
    write_res_view(all_result, data_opt_, 5)
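# A minimal sketch of how main() might be driven for both feature types.
# The __main__ guard is an assumption; the repo's actual entry point is not shown here.
if __name__ == '__main__':
    for data_opt in ('hinsi', 'bigram'):
        main(data_opt)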
def init_feature_location():
    # Create the feature directory and seed both feature files with empty dicts
    feature_path = my_path.project_path() + 'src/feature/'
    os.mkdir(feature_path)
    with open(feature_path + 'hinsi_per.txt', mode="wb") as f:
        pickle.dump({}, f)
    with open(feature_path + 'bigram_per.txt', mode="wb") as f:
        pickle.dump({}, f)
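# Usage sketch: add_auth_feature_hinsi/bigram open the feature pickles with
# mode="rb" before writing, so init_feature_location() must run once first.
# Looping over new_booklist's keys as the author keys is an assumption based
# on how add_auth_feature_hinsi indexes it.
init_feature_location()
for auth in new_booklist:
    add_auth_feature_hinsi(auth)
    add_auth_feature_bigram(auth)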
def save_file_(list_, author):
    # list_ holds [work name, token list, POS list]; the written file keeps
    # tokens on line 0 and POS tags on line 1
    name = list_[0]
    list_[1] = ' '.join(list_[1])
    list_[2] = ' '.join(list_[2])
    body = '\n'.join(list_[1:])
    with open(my_path.project_path() + 'src/wakati/{0}/wakati_{1}.txt'.format(author, name), mode='w') as f:
        f.write(body)
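# Invented example call; the author key and work data are hypothetical, and the
# real triple comes from the morphological analysis step. Line 0 of the written
# file holds the tokens and line 1 the POS tags, which make_hinsi_num reads back.
work = ['neko',
        ['吾輩', 'は', '猫', 'で', 'ある'],          # tokens -> file line 0
        ['名詞', '助詞', '名詞', '判定詞', '動詞']]  # POS tags -> file line 1
auth_wakati_mkdir('soseki')   # create src/wakati/soseki/ once per author
save_file_(work, 'soseki')    # writes src/wakati/soseki/wakati_neko.txt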
def add_auth_feature_hinsi(auth_):
    # Location of the segmented (wakati) texts
    wakati_path = my_path.project_path() + 'src/wakati/{}/'.format(auth_)
    # The author's list of works in modern script/kana
    booklist = new_booklist[auth_]
    # Counters of counts per POS
    fnum_auth = make_hinsi_num(wakati_path, booklist)
    # Convert fnum_auth to POS rates
    fper_auth = num_to_per(fnum_auth)
    # Load the existing data from the destination file
    save_path = my_path.project_path() + 'src/feature/hinsi_per.txt'
    with open(save_path, mode="rb") as f:
        res = pickle.load(f)
    # Add this author's data to the dict
    res[auth_] = fper_auth
    # Save
    with open(save_path, mode="wb") as f:
        pickle.dump(res, f)
def write_res_view(source_, data_opt_, num=None):
    # Lines to be written to the file
    write_list = []
    # Loop over every author pair
    for k, v in source_.items():
        # Split the key into author keys and convert them to full names
        authors = k.split('-')
        authors = [full_name.fullname_dict[i] for i in authors]
        # Prepare the authors and the accuracy for writing
        authors_print = '著者:' + ','.join(authors)
        score_print = 'score: {:.5f}'.format(v['res']['s'])
        # Lines for the feature importances
        imp_print = ['特徴量の重要度']
        # Counter so the loop can stop after num entries
        count = 0
        # Loop over the importances in descending order
        for key, val in sorted(v['res']['i'].items(), key=lambda x: x[1], reverse=True):
            # Append to imp_print
            imp_print.append(key + ': {:.5f}'.format(val))
            # Stop once count reaches num
            count += 1
            if count == num:
                break
        conf_print = []
        confusion_header = ['', 'p_0', 'p_1']  # header row (unused here)
        # Get the numbers for the confusion matrices
        out_confusion = clf_output.gen_confusion(v['con'])
        # Loop over all confusion matrices
        for conf in out_confusion:
            this_print = []
            # Compute the error rates of the predictions
            this_print.append('t_1 / p_0 : {:.5f}'.format(conf[1][1] / (conf[0][1] + conf[1][1])))
            this_print.append('t_0 / p_1 : {:.5f}'.format(conf[0][2] / (conf[0][2] + conf[1][2])))
            this_print.append('--------------------------')
            # Append this run's numbers to conf_print
            conf_print.append('\n'.join(this_print))
        # Append everything prepared so far to write_list
        write_list.append(authors_print)
        write_list.append(score_print)
        write_list.append('\n'.join(imp_print))
        write_list.append('\n'.join(conf_print))
    # Save under the given file name
    with open(my_path.project_path() + 'classification/result/res_{}_view.txt'.format(data_opt_), mode="w") as f:
        f.write('\n'.join(write_list))
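# Shape of the dict this function consumes, inferred from the accesses above
# ('s' = score, 'i' = importance dict, 'con' = raw confusion data). The author
# keys and numbers are invented; 'con' is whatever clf_main.classify returned.
example_result = {
    'key1-key2': {
        'res': {'s': 0.91234,
                'i': {'名詞': 0.21, '助詞': 0.17}},
        'con': confusion_data,   # hypothetical placeholder
    },
}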
import pickle, sys, os
from collections import Counter
import full_name, my_path

with open(my_path.project_path() + 'src/booklist/new_booklist.txt', mode="rb") as f:
    new_booklist = pickle.load(f)

# Build a Counter of POS counts per work
def make_hinsi_num(wakati_path_, booklist_):
    r_list = []
    for book in booklist_:
        with open(wakati_path_ + 'wakati_' + book + '.txt', mode="r") as f:
            hinsi = f.read().split('\n')[1].split()
        h_counter = Counter(hinsi)
        r_list.append(h_counter)
    return r_list

# Build a Counter of POS n-grams (bigrams) per work
def make_hinsi_bigram(wakati_path_, booklist_):
    # Return value
    r_list = []
    # Loop over the list of modern script/kana works booklist_
    for book in booklist_:
        # Path of this work
        bookpath = 'wakati_{}.txt'.format(book)
        # Open it and pull out the POS list
        with open(wakati_path_ + bookpath, mode="r") as f:
            hinsi = f.read().split('\n')[1].split()
        # List of bigrams
        hinsi_bigram = []
        # Loop with index and value to pair each POS tag with its successor
        for i, h in enumerate(hinsi[:-1]):
            hinsi_bigram.append(h + '-' + hinsi[i + 1])
        r_list.append(Counter(hinsi_bigram))
    return r_list
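# Toy check of the bigram construction on an in-memory POS sequence; no files
# are involved and the tag sequence is invented. The 'A-B' join format matches
# bigram_label below.
from collections import Counter
hinsi = ['名詞', '助詞', '動詞', '助詞', '動詞']
bigrams = [hinsi[i] + '-' + hinsi[i + 1] for i in range(len(hinsi) - 1)]
print(Counter(bigrams))
# Counter({'助詞-動詞': 2, '名詞-助詞': 1, '動詞-助詞': 1})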
    '感動詞-接続詞', '感動詞-指示詞', '感動詞-接頭辞', '感動詞-連体詞',
    '感動詞-感動詞',
]

#-----------------------------------
# Load the sources
'''
Read these with copy.deepcopy when using them;
used directly, they get overwritten and the amount of data grows.
'''
with open(my_path.project_path() + 'classification/source/source_hinsi_per.txt', mode="rb") as f:
    source_data_hinsi = pickle.load(f)
with open(my_path.project_path() + 'classification/source/source_hinsi_bigram.txt', mode="rb") as f:
    source_data_bigram = pickle.load(f)

#-----------------------------------
# Extract the data for the two specified authors
# Input:  source_data_hinsi/bigram, [auth, auth]
# Output: data (feature data for the two authors),
#         label (author labels for data; the former is 0, the latter is 1)
def auth_wakati_mkdir(auth_):
    # Create the destination directory ahead of time
    save_path = my_path.project_path() + 'src/wakati/{0}'.format(auth_)
    os.mkdir(save_path)
import pickle, my_path

# Feature data
source_path = my_path.project_path() + 'src/feature/'
# Destination for the classification data
dump_path = my_path.project_path() + 'classification/source/'

with open(source_path + 'hinsi_per.txt', mode="rb") as f:
    source_hinsi_dict = pickle.load(f)
with open(source_path + 'bigram_per.txt', mode="rb") as f:
    source_bigram_dict = pickle.load(f)

hinsi_label = [
    '名詞', '接尾辞', '助詞', '動詞', '特殊', '副詞', '形容詞', '判定詞',
    '未定義語', '助動詞', '接続詞', '指示詞', '接頭辞', '連体詞', '感動詞',
]
bigram_label = [
    '名詞-名詞', '名詞-接尾辞', '名詞-助詞', '名詞-動詞', '名詞-特殊',
    '名詞-副詞', '名詞-形容詞', '名詞-判定詞', '名詞-未定義語', '名詞-助動詞',
    '名詞-接続詞', '名詞-指示詞',