def load_data(binary_data_path):
    """Load a binary-dataset dict that was saved as a Python dict literal."""
    start_time = time.time()
    # NOTE: the file is evaluated as Python source; ast.literal_eval would be a
    # safer drop-in if the file only ever contains a plain dict literal.
    binary_data_dic = eval(read_file(binary_data_path))
    end_time = time.time()
    print('Data loading time: {}s'.format(end_time - start_time))
    return binary_data_dic
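# Usage sketch (hypothetical): the path and the expected file content -- a dict
# literal such as {'A81': [...], 'B08': [...]} -- are assumptions for
# illustration, not paths used elsewhere in this module.
def _example_load_data():
    binary_data = load_data('data/binary_dataset.txt')  # assumed path
    print('categories:', len(binary_data))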
def con_to_levn(con, ori_datas_path, le_file_path, save_path, le_n, title_n):
    '''
    param ori_datas_path: txt file indexed by id, title, keywords, abstract,
                          CLC number (中图分类号), and parent document
    param le_file_path: hierarchy (level directory) file
    param save_path: where the labelled dataset is stored
    param le_n: hierarchy level to extract
    param title_n: how many times the title is repeated in the extracted text
    output: labelled category folders; each folder is one level-n category
            containing multiple text samples.
    '''
    train_path = save_path + 'train/'
    test_path = save_path + 'test/'
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)
    start2 = time.time()
    # Read the data
    # con = read_datas(ori_datas_path)
    levs = list(con['中图分类号'])
    # Read the hierarchy file
    read_Ch_cate(save_path, le_file_path, le_n)
    le_n_names = read_file(save_path + 'level_' + str(le_n) + '.txt',
                           'utf-8').split(',')
    # Extract the level-n data (stopword_path is a module-level global)
    select_txt(le_n, con, le_n_names, levs, train_path, test_path, stopword_path)
    end2 = time.time()
    run_select_time = round(end2 - start2, 3)
    print('Dataset generation time: ' + str(run_select_time) + 's')
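# Usage sketch for the CLC pipeline. read_datas is assumed to return a pandas
# DataFrame with the columns named in the docstring; every path and the level
# depth below are placeholders.
def _example_con_to_levn():
    con = read_datas('data/ori_datas.txt')  # assumed helper from this repo
    con_to_levn(con,
                'data/ori_datas.txt',   # ori_datas_path
                'data/hierarchy.txt',   # le_file_path
                'data/clc_dataset/',    # save_path
                3,                      # le_n: hierarchy depth
                1)                      # title_n: title repetitions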
def statistics_corpus(data_path):
    print('=' * 40)
    print('I-2. Computing corpus statistics...')
    level_dic = {}
    level1_count = 0
    all_doc_count = 0
    for level in os.listdir(data_path):
        if '.txt' in level or level.startswith('.'):
            continue
        temp_count = 0
        le1_list = os.listdir(data_path + level + '/')
        if len(le1_list):
            level_dic[level] = []
        else:
            # Drop empty category folders
            os.rmdir(data_path + level + '/')
        for file in le1_list:
            if 'count' in file:
                con = read_file(data_path + level + '/' + file, 'utf-8').split('\n')
                level1_count += len(con) - 1
                level_dic[level].append(len(con) - 1)
                for i in con[:-1]:
                    num = i.split('-->')[1]
                    temp_count += int(num)
                level_dic[level].append(temp_count)
                all_doc_count += temp_count
                print(str(len(con) - 1) + '/' + str(temp_count))
    write_data(data_path + 'statistics.txt',
               'level-1\tlevel-2 class count\tdocument count\n', 'single')
    write_data(data_path + 'statistics.txt', level_dic.items())
    write_data(data_path + 'statistics.txt',
               '\nTotal level-2 classes: %d' % level1_count +
               '\nTotal documents: %d' % all_doc_count, 'single')
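# Usage sketch: statistics_corpus walks a labelled-corpus directory produced by
# the pipeline above and writes statistics.txt next to the category folders.
# The path is a placeholder.
def _example_statistics_corpus():
    statistics_corpus('data/clc_dataset/train/')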
def merge_level3(file_path, fp):
    # Concatenate every level-3 data file (skipping count files) into fp
    for level in os.listdir(file_path):
        if '.txt' in level:
            continue
        for file in os.listdir(file_path + level + '/'):
            if 'count' in file:
                continue
            con = read_file(file_path + level + '/' + file, 'utf-8')
            fp.write(con)
# NOTE: second definition of con_to_levn (multi-taxonomy variant). In a single
# module it shadows the CLC-only variant above; both are kept for reference.
def con_to_levn(con, ori_datas_path, le_path_list, save_path, class_info, title_n):
    '''
    param ori_datas_path: txt file indexed by id, title, keywords, abstract,
                          CLC number, and parent document
    param le_path_list: list of hierarchy (level directory) files
    param save_path: where the labelled dataset is stored
    param class_info: maps each taxonomy name to the level depth to extract
    param title_n: how many times the title is repeated in the extracted text
    output: labelled category folders; each folder is one level-n category
            containing multiple text samples.
    '''
    train_path = save_path + 'train/'
    test_path = save_path + 'test/'
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)
    # Read the data
    # con = read_datas(ori_datas_path)
    levs = []  # BUG FIX: was commented out, but levs.append() is used below
    levs_area = list(con['区域分类'])
    levs_industry = list(con['行业分类'])
    levs_subject = list(con['学科分类'])
    for i in range(len(levs_subject)):
        levs_subject[i] = str(levs_subject[i])
    levs_china_class = list(con['中图分类'])
    levs.append(levs_area)
    levs.append(levs_industry)
    levs.append(levs_subject)
    levs.append(levs_china_class)
    classes = list(class_info.keys())
    lens = list(class_info.values())
    for i in range(len(le_path_list)):
        # Read the hierarchy file
        start2 = time.time()
        read_Ch_cate(save_path, le_path_list[i], classes[i], lens[i])
        le_n_names = read_file(
            save_path + 'level_' + classes[i] + '_' + str(lens[i]) + '.txt',
            'utf-8').split(',')
        # Extract the level-n data
        select_txt(classes[i], con, le_n_names, levs[i],
                   train_path + classes[i], test_path, stopword_path, lens[i])
        end2 = time.time()
        run_select_time = round(end2 - start2, 3)
        print('Dataset generation time: ' + str(run_select_time) + 's')
def train_word2vec(train_path, model_save_path):
    # Each line of the training file is assumed to be one whitespace-tokenised
    # document; Word2Vec expects a sequence of token lists, not raw strings.
    data = [line.split() for line in read_file(train_path, 'utf-8').split('\n')]
    start = time.time()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # sg=0: CBOW; sg=1: skip-gram. (gensim>=4 renames size/iter to
    # vector_size/epochs.)
    model = Word2Vec(data, sg=0, hs=1, size=200, window=5, min_count=5, iter=50)
    model.save(model_save_path + 'w2v_iter_20_model')
    end = time.time()
    print('Training time: %ss' % str(end - start))  # BUG FIX: was start - end
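# Usage sketch: the training file is assumed to hold one whitespace-tokenised
# document per line. Reloading uses the filename hard-coded in train_word2vec;
# the query token is only illustrative and must be in-vocabulary.
def _example_train_word2vec():
    train_word2vec('data/clc_dataset/train/level_3_train.txt', 'models/')
    model = Word2Vec.load('models/w2v_iter_20_model')
    print(model.wv.most_similar('计算机', topn=5))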
def get_level3_name(file_path, save_path):
    level3_dic = {}
    t = open(save_path, 'a', encoding='utf-8')
    for level in os.listdir(file_path):
        if '.txt' in level:
            continue
        level3_dic[level] = []
        for file in os.listdir(file_path + level + '/'):
            if 'count' in file:
                con = read_file(file_path + level + '/' + file, 'utf-8').split('\n')
                for i in con[:-2]:
                    level3 = i.split('-->')[0]
                    t.write(level3 + '\n')
                    print(level3)
                    level3_dic[level].append(level3)
    t.close()  # BUG FIX: the output file was never closed
    print(level3_dic)
def count_level1(file_path):
    level_dic = {}
    level1_count = 0
    for level in os.listdir(file_path):
        if '.txt' in level:
            continue
        temp_count = 0
        level_dic[level] = []
        for file in os.listdir(file_path + level + '/'):
            if 'count' in file:
                con = read_file(file_path + level + '/' + file, 'utf-8').split('\n')
                level1_count += len(con) - 2
                level_dic[level].append(len(con) - 2)
                for i in con[:-2]:
                    num = i.split('-->')[1]
                    temp_count += int(num[:-1])  # strip the trailing comma
                level_dic[level].append(temp_count)
                print(str(len(con) - 2) + '/' + str(temp_count))
    print(str(level_dic))
    print(level1_count)
def div_train_test(file_path, train_path, test_path):
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)
    lev1 = os.listdir(file_path)
    for l1 in lev1:
        if '.txt' in l1:
            continue
        if len(os.listdir(file_path + l1 + '/')) == 0:
            continue
        file = file_path + l1 + '/' + l1 + '_train.txt'
        count_file = file_path + l1 + '/' + l1 + '_train_count.txt'
        save_train_path = train_path + l1 + '/'
        save_test_path = test_path + l1 + '/'
        if not os.path.exists(save_train_path):
            os.makedirs(save_train_path)
        if not os.path.exists(save_test_path):
            os.makedirs(save_test_path)
        save_train_file = save_train_path + l1 + '_train.txt'
        save_test_file = save_test_path + l1 + '_test.txt'
        save_tr_count = save_train_path + l1 + '_train_count.txt'
        save_te_count = save_test_path + l1 + '_test_count.txt'
        train_fp = open(save_train_file, 'w', encoding='utf-8')
        train_count_fp = open(save_tr_count, 'w', encoding='utf-8')
        test_fp = open(save_test_file, 'w', encoding='utf-8')
        test_count_fp = open(save_te_count, 'w', encoding='utf-8')
        con = read_file(file, 'utf-8').split('\n')
        count_con = read_file(count_file, 'utf-8').split('\n')
        lev3_test_dic = {}
        # Count each level-3 class to fix the split sizes
        for l3 in count_con[:-2]:
            le3_count = l3.split('-->')
            num = le3_count[1].replace(',', '')
            test_num = int(num) / 5  # 4:1 train/test split
            train_num = int(int(num) - int(test_num))
            lev3_test_dic[le3_count[0]] = int(test_num)
            train_count_fp.write(le3_count[0] + '-->' + str(train_num) + '\n')
            test_count_fp.write(le3_count[0] + '-->' + str(int(test_num)) + '\n')
        # print(lev3_test_dic)
        train_count_fp.write(str(count_con[-2]))
        test_count_fp.write(str(count_con[-2]))
        temp_test_count = 0
        temp_label = list(lev3_test_dic.keys())[0]
        for act in con[:-1]:
            row = act.split(',')
            label = row[0].split('__')[-1]
            test_count = lev3_test_dic[label]
            if label == temp_label:
                # The first test_count samples of each class go to the test set.
                # BUG FIX: split('\n') removed the newlines, so write them back.
                if temp_test_count <= test_count:
                    test_fp.write(act + '\n')
                else:
                    train_fp.write(act + '\n')
                temp_label = label
                temp_test_count += 1
            else:
                temp_test_count = 1
                train_fp.write(act + '\n')
                temp_label = label
        # BUG FIX: close the handles so the files are flushed
        train_fp.close()
        train_count_fp.close()
        test_fp.close()
        test_count_fp.close()
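# Usage sketch: div_train_test re-splits the per-level-1 *_train.txt files 4:1,
# driven by the *_train_count.txt bookkeeping written by select_txt. All paths
# are placeholders.
def _example_div_train_test():
    div_train_test('data/clc_dataset/all/',
                   'data/clc_dataset/train/',
                   'data/clc_dataset/test/')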
def select_txt(le_n, con, le_n_names, levs, train_path, test_path, stopword_path):
    print('Extracting data; generating train and test datasets...')
    stopword = read_file(stopword_path, 'utf').split('\n')
    # Temporary data buffers
    train_list = []
    test_list = []
    # Number of level-3 classes under the current level-1 category
    le3_count_cate = 0
    # Current level-1 category marker
    temp_cate = le_n_names[0][0]
    for i in le_n_names:  # list of level-n category codes
        # Per-category sample counters
        count_train_3 = 0
        count_test_3 = 0
        # Temporary buffers for this category
        test_1 = []
        train_1 = []
        test_3 = []
        train_3 = []
        # Level-1 name for the training file set
        le1_name = i[0]
        len_num = 1
        if '/' in i:
            i = i.replace(r'/', ' ')
        le_3_train_path = train_path + i[0] + '/'
        le_3_test_path = test_path + i[0] + '/'
        if not os.path.exists(le_3_train_path):
            os.makedirs(le_3_train_path)
        if not os.path.exists(le_3_test_path):
            os.makedirs(le_3_test_path)
        for m in enumerate(levs[:]):  # (index, CLC codes) of the raw data
            m_list = []
            try:
                if ';' in m[1]:  # a record can carry several codes
                    m_list = m[1].split(';')
                else:
                    m_list.append(m[1])
            except:
                continue
            for p in m_list:
                if len(p) == 0:
                    continue
                elif i[0] == p[0] and i in p:  # code matches the level-n category
                    index = m[0]  # row index of the record to extract
                    item = con.loc[index]
                    title = item['标题']       # title
                    content = item['摘要']     # abstract, used as the body text
                    key_word = item['关键词']  # keywords
                    content = title + ' ' + content + ' ' + key_word
                    try:
                        if len_num % 4 == 0:  # every 4th record goes to the test set
                            test_1.append(deal_datas(i[0], content, stopword))
                            test_3.append(deal_datas(i, content, stopword))
                            count_test_3 += 1
                        else:
                            train_1.append(deal_datas(i[0], content, stopword))
                            train_3.append(deal_datas(i, content, stopword))
                            count_train_3 += 1
                    except:
                        print('Error while extracting category %s!' % i)
                    else:
                        len_num += 1  # advance the 1-in-4 counter on success
        if count_train_3 >= 100:  # keep the level-3 training set only if large enough
            save_file(le_3_train_path + i[0] + '_train_count.txt',
                      i + '-->' + str(count_train_3) + ',', 'a')
            random.shuffle(train_3)
            write_datas(le_3_train_path + i[0] + '_train.txt', train_3)
            train_list.append(train_1)
            if temp_cate == i[0]:
                le3_count_cate += 1
            else:
                save_file(train_path + temp_cate + '/' + temp_cate + '_train_count.txt',
                          'class count' + '-->' + str(le3_count_cate) + ',', 'a')
                le3_count_cate = 1
                temp_cate = i[0]
        if count_test_3 > 35:  # keep the level-3 test set only if large enough
            save_file(le_3_test_path + i[0] + '_test_count.txt',
                      i + '-->' + str(count_test_3) + ',', 'a')
            random.shuffle(test_3)
            write_datas(le_3_test_path + i[0] + '_test.txt', test_3)
            test_list.append(test_1)
    # Shuffle so that samples of the same class are not clustered
    for l1 in test_list:
        random.shuffle(l1)
        write_datas(test_path + 'level_' + str(le_n)[0] + '_test.txt', l1)  # str() tolerates an int le_n
    for l2 in train_list:
        random.shuffle(l2)
        write_datas(train_path + 'level_' + str(le_n)[0] + '_train.txt', l2)
    save_file(train_path + temp_cate + '/' + temp_cate + '_train_count.txt',
              'class count' + '-->' + str(le3_count_cate) + ',', 'a')
# NOTE: second definition of select_txt (multi-taxonomy variant, prefix match
# on the first three characters of the code). It shadows the CLC variant above.
def select_txt(le_n, con, le_n_names, levs, train_path, test_path, stopword_path, lens):
    print('Extracting data; generating train and test datasets...')
    # Number of retained categories
    cate_count = 0
    # Temporary data buffers
    train_list = []
    test_list = []
    stopword = read_file(stopword_path, 'utf').split('\n')
    for i in le_n_names:  # list of level-n category codes
        # Per-category sample counters
        count_train = 0
        count_test = 0
        # Temporary buffers for this category
        test = []
        train = []
        # le1_name = i[0]
        len_num = 1
        if '/' in i:
            i = i.replace(r'/', ' ')
        for j in enumerate(levs[:]):  # (index, category codes) of the raw data
            j_list = []
            try:
                if ';' in j[1]:  # a record can carry several codes
                    j_list = j[1].split(';')
                else:
                    j_list.append(j[1])
            except:
                continue
            for p in j_list:
                p = str(p)
                if len(p) == 0:
                    continue
                # elif i[0] == p[0] and i in p:  # old exact-containment match
                elif len(i) >= 3 and len(p) >= 3 and i[:3] == p[:3]:  # 3-char prefix match
                    index = j[0]  # row index of the record to extract
                    item = con.loc[index]
                    # title = item['标题']
                    content = item['ContentText']  # body text
                    # key_word = item['关键词']
                    # content = title + ' ' + content + ' ' + key_word
                    try:
                        # The test split is currently disabled:
                        # if len_num % 8 == 0:
                        #     test.append(deal_datas(i, content, stopword))
                        #     count_test += 1
                        # else:
                        train.append(deal_datas(i, content, stopword))
                        count_train += 1
                    except:
                        print('Error while extracting category %s!' % i)
                    else:
                        len_num += 1
        if count_train >= 20:
            # BUG FIX: lens may be an int, so cast before concatenating
            save_file(train_path + 'train_' + str(lens) + '_count.txt',
                      i + '-->' + str(count_train) + ',', 'a')
            train_list.append(train)
            cate_count += 1
        # if count_test >= 0:
        #     save_file(test_path + 'test_' + str(lens) + '_count.txt',
        #               i + '-->' + str(count_test) + ',', 'a')
        #     test_list.append(test)
    # Shuffle so that samples of the same class are not clustered
    for l1 in test_list:
        random.shuffle(l1)
        write_datas(test_path + 'level_' + str(lens) + '_test.txt', l1)
    for l2 in train_list:
        random.shuffle(l2)
        write_datas(train_path + 'level_' + str(lens) + '_train.txt', l2)
    save_file(train_path + 'train_' + str(lens) + '_count.txt',
              'class count: ' + str(cate_count) + ',', 'a')
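# Usage sketch for the multi-taxonomy pipeline (second con_to_levn/select_txt
# pair). class_info is assumed to map a taxonomy name to the code-prefix depth,
# mirroring how con_to_levn unpacks it; read_datas and all paths are assumed.
def _example_multi_taxonomy():
    con = read_datas('data/gov_docs.txt')  # needs the four taxonomy columns used above
    le_path_list = ['data/area.txt', 'data/industry.txt',
                    'data/subject.txt', 'data/china.txt']
    class_info = {'area': 3, 'industry': 3, 'subject': 3, 'china': 3}
    con_to_levn(con, 'data/gov_docs.txt', le_path_list,
                'data/multi_dataset/', class_info, 1)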
def train_merge_classifier(algorithm, train_merge_path, w2v_model,
                           model_save_path, result_save_path):
    # Generate the merge (stacking) sub-dataset
    print('..Generating the merge training set')
    train_start = time.time()
    train_merge_data = eval(read_file(train_merge_path))
    cate_list = list(train_merge_data.keys())
    class_number = len(cate_list)
    # Build the merge training set
    train_merge_dic = get_merge_dataset(class_number, cate_list, train_merge_data)
    # Result save path
    merge_result_save_path = result_save_path + 'merge_result/'
    if not os.path.exists(merge_result_save_path):
        os.makedirs(merge_result_save_path)
    # Merge-model save path
    merge_model_path = model_save_path + 'merge_model/'
    if not os.path.exists(merge_model_path):
        os.makedirs(merge_model_path)
    # Train the merge classifiers.
    # Loading the binary-classifier vocabulary map (json.loads is faster than eval):
    # model_name_map = eval(read_file(result_save_path + algorithm + '_model_name_map.txt'))
    # model_name_map = json.loads(read_file(result_save_path + algorithm + '_model_name_map_json.txt'))
    # Get the sklearn classifier
    clf = get_model(algorithm)
    # Load the w2v model
    # w2v_model = load_w2v_model(w2v_model_path)
    # Load the binary models
    model_dic = load_binary_model(algorithm, class_number, cate_list, model_save_path)
    print('..Training')
    sum_score = 0.0
    all_less_str, all_result_str = '', ''
    for k in range(class_number):
        start_time = time.time()
        cur_cate = cate_list[k]
        item = train_merge_dic[cur_cate][0]  # [['text1', 'text2'], ['R', '-R']]
        con, labels = item[0], item[1]
        merge_size = len(labels)
        lb_list = []
        text_pro = []
        for label in labels:
            if label == cur_cate:
                lb_list.append(1)
            else:
                lb_list.append(0)
        pro_matrix = np.array([])  # BUG FIX: np.array([], []) is invalid
        for model in model_dic[cur_cate]:
            # clf = load_model(model)
            binary_model = model[1]
            # voc = model_name_map[model[0]]
            train_w2v = get_train_vec(con, w2v_model)
            # vectorizer = TfidfVectorizer(vocabulary=voc)
            # tdm = vectorizer.fit_transform(con)
            pred = binary_model.predict_proba(train_w2v)
            # pred = clf.predict(tdm)
            for i in range(len(pred)):
                text_pro.append(pred[i][1])
        # One column of probabilities per binary model (NUMBER is a module global)
        pro_matrix = np.array(text_pro).reshape((NUMBER, merge_size)).T
        '''
        # Earlier per-text TF-IDF version, kept for reference:
        lb_list = []
        all_text_pro = []
        for text, label in zip(con, labels):
            text_pro = []
            if label == cur_cate:
                lb_list.append(1)
            else:
                lb_list.append(0)
            for model in model_dic[cur_cate]:
                print(model)
                print(text)
                clf = load_model(model)
                voc = model_name_map[model]
                vectorizer = TfidfVectorizer(vocabulary=voc)
                tdm = vectorizer.fit_transform([text])
                pred = clf.predict_proba(tdm)
                # pred = clf.predict(tdm)
                print(pred)
                text_pro.append(pred[0][1])
                print(pred[0][1])
                break
            all_text_pro.append(text_pro)
            # print(text_pro)
            break
        # print(len(all_text_pro))
        # print(all_text_pro[:5])
        '''
        # Training. BUG FIX: fit on the training split rather than on all data,
        # otherwise the accuracy below is measured on already-seen samples.
        x_train, x_test, y_train, y_test = train_test_split(pro_matrix, lb_list,
                                                            test_size=0.3)
        classifier = clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        score = metrics.accuracy_score(y_test, pred)
        if score <= 0.85:
            all_less_str += cur_cate + ':' + str(score) + '\n'
        sum_score += score
        result_str = '%s merge classifier accuracy : %f\n' % (cur_cate, round(score, 3))
        all_result_str += result_str
        print(result_str)
        save_file_lines(merge_result_save_path + algorithm + '_test.txt', result_str, 'a')
        model_full_path = merge_model_path + cur_cate + '_' + algorithm + '_merge.model'
        save_model(classifier, model_full_path)
        end_time = time.time()
        print('one run time {}\n'.format(end_time - start_time))
    avg_score = sum_score / class_number
    avg_score_str = algorithm + ' merge classifier avg accuracy %f' % avg_score
    print(avg_score_str)
    train_end = time.time()
    train_run_time = round(train_end - train_start, 4)
    merge_run_time_str = 'merge train time: %f' % (train_run_time)
    print(merge_run_time_str)
    save_file_lines(merge_result_save_path + algorithm + '_score_less0.85.txt',
                    all_less_str, 'w')
    # save_file_lines(merge_result_save_path + algorithm + '_test.txt', all_result_str, 'w')
    save_file_lines(merge_result_save_path + algorithm + '_test.txt',
                    avg_score_str + '\n' + merge_run_time_str, 'a')
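# Usage sketch of the two-stage training pipeline: NUMBER binary classifiers
# per category first, then one merge (stacking) classifier per category over
# their predicted probabilities. The 'svm' key and all paths are assumptions.
def _example_train_pipeline():
    w2v_model = load_w2v_model('models/w2v_iter_20_model')  # assumed helper from this repo
    train_binary_classifier('svm', 'data/train_binary.txt', 'models/', 'results/')
    train_merge_classifier('svm', 'data/train_merge.txt', w2v_model, 'models/', 'results/')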
def train_binary_classifier(algorithm, train_path, model_save_path, result_save_path):
    # Load the data and get the category list
    train_dic = json.loads(read_file(train_path))
    cate_list = list(train_dic.keys())
    class_number = len(cate_list)
    train_start = time.time()
    result_save_full_path = result_save_path + 'binary_result/'
    if not os.path.exists(result_save_full_path):
        os.makedirs(result_save_full_path)
    # Get the sklearn classifier
    clf = get_model(algorithm)
    print('..Training the classifiers')
    all_score = 0.0
    results, less_score_cate = '', ''
    model_name_map = {}  # maps model files to training vocabularies (currently unused)
    for cate in cate_list:
        sum_score = 0.0
        if not os.path.exists(model_save_path + cate + '/'):
            os.makedirs(model_save_path + cate + '/')
        for i in range(NUMBER):
            train, label = train_dic[cate][i][0], train_dic[cate][i][1]
            x_train, x_test, y_train, y_test = train_test_split(train, label,
                                                                test_size=0.2)
            classifier = clf.fit(x_train, y_train)
            pred = clf.predict(x_test)
            score = metrics.accuracy_score(y_test, pred)
            sum_score += score
            # Save the model
            model_full_path = (model_save_path + cate + '/' + algorithm + '_' +
                               cate + '_' + str(i + 1) + '.model')
            save_model(classifier, model_full_path)
            # model_name_map[model_full_path] = train_dic[cate][i][2]
        avg_score = sum_score / NUMBER
        all_score += sum_score
        if avg_score <= 0.85:
            save_file_lines(result_save_full_path + algorithm + '_score_less0.85.txt',
                            str(cate) + '\n', 'a')
        # BUG FIX: report the average over the NUMBER runs, not the last score
        result_str = "%s avg-accuracy: %0.3f " % (cate, avg_score)
        print(result_str)
        results = result_str + '\n'
        save_file_lines(result_save_full_path + algorithm + '_test.txt', results, 'a')
        # break
    all_avg = all_score / (NUMBER * class_number)
    all_result_str = "%s all avg-accuracy: %0.3f \n" % (algorithm, all_avg)
    print(all_result_str)
    train_end = time.time()
    train_run_time = round(train_end - train_start, 4)
    run_time_str = algorithm + ' train time: %f' % (train_run_time)
    print(run_time_str)
    save_file_lines(result_save_full_path + algorithm + '_test.txt',
                    all_result_str + '\n' + run_time_str, 'a')
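# Data-format sketch (inferred from how train_binary_classifier indexes
# train_dic, so an assumption): the JSON file maps each category to NUMBER
# sub-datasets, each a [feature_rows, labels] pair. The tiny shapes below are
# purely illustrative; only one of the NUMBER entries is shown.
def _example_binary_train_file():
    demo = {'A81': [[[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]],
                     [1, 0, 1, 0]]]}
    with open('data/train_binary_demo.txt', 'w', encoding='utf-8') as f:
        json.dump(demo, f)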
def test(algorithm, test_final_path, result_save_path, model_save_path,
         w2v_model, skip_word_save_path, fasttext_train):
    # Read the test set
    test_final_data = eval(read_file(test_final_path))
    cate_list = list(test_final_data.keys())
    class_number = len(cate_list)
    test_start = time.time()
    # Loading the binary-classifier vocabulary map (json.loads is faster than eval):
    # model_name_map = eval(read_file(result_save_path + algorithm + '_model_name_map_json.txt'))
    # model_name_map = json.loads(read_file(result_save_path + algorithm + '_model_name_map_json.txt'))
    all_len, all_right = 0, 0
    result_path = result_save_path + 'final_test' + '/'
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    record_path = result_path + 'records_way_le1_le2_w2v/'
    if not os.path.exists(record_path):
        os.makedirs(record_path)
    test_result_path = result_path + algorithm + '_test_way_le1_le2_w2v.txt'
    # Build test texts and labels
    test_data_dic = get_test_dataset(class_number, cate_list, test_final_data)
    # Load the models
    load_model_time = time.time()
    # fastText models for level 1 and level 2
    le1_model = 'level_1/level_1_fasttext_classifier_big_big.model'
    le1_fasttext_model = fasttext_train.load_fasttext(le1_model)
    le2_model = 'level_2/level_2_fasttext_classifier_big_big.model'
    le2_fasttext_model = fasttext_train.load_fasttext(le2_model)
    # clf = get_model(algorithm)
    print('..Loading the binary classifiers')
    model_dic = load_binary_model(algorithm, class_number, cate_list, model_save_path)
    load_end_time = time.time()
    print('Model loading time: {}'.format(load_end_time - load_model_time))
    # Load the merge models
    model_merge_dic = load_merge_model(algorithm, model_save_path + 'merge_model/')
    # Load the w2v model / knowledge base (both disabled)
    # w2v_model = load_w2v_model(w2v_model_path)
    # kb_dic = get_KB_dic(skip_word_save_path)
    for cate, cons in test_data_dic.items():
        test_one_time = time.time()
        cur_cate = cate
        print('cur cate %s' % cur_cate)
        right = 0
        texts, labels = cons[0], cons[1]
        test_size = len(labels)
        all_len += test_size
        # Level-1 / level-2 predictions
        level_1_pre_result = fasttext_train.test_model(texts, le1_fasttext_model)
        level_2_pre_result = fasttext_train.test_model(texts, le2_fasttext_model)
        level_1_pre_labels_list, level_2_pre_labels_list = [], []
        for le1 in level_1_pre_result:
            label_list_le1 = le1[0][:-1].split('__')
            level_1_pre_labels_list.append(label_list_le1[2])
        for le2 in level_2_pre_result:
            label_list_le2 = le2[0][:-1].split('__')
            level_2_pre_labels_list.append(label_list_le2[2])
        # print(level_2_pre_labels_list)
        # Knowledge-base predictions (disabled)
        # text_list = [text.split() for text in texts]
        # kb_labels_list = get_level_3_from_KB(kb_dic, text_list)
        # Per-text records
        text_pre_results = {}
        for i in range(test_size):
            text, label = texts[i], labels[i]
            text_pre_results[label + '\t' + text] = []
            # text_kb_label = kb_labels_list[i]
            text_le2_label = []
            text_le1_label = level_1_pre_labels_list[i]
            text_le2_label = level_2_pre_labels_list[i]
            le_flag = 0
            if text_le2_label[0] == text_le1_label:
                le_flag = True
            # if label not in text_kb_label:
            #     continue  # skip when the KB misses the true label; way_5 did not skip
            pre_result_dic = {}
            for bin_cate, models in model_dic.items():
                skip_flag = 0
                # if bin_cate not in text_kb_label:
                #     continue
                if not le_flag and bin_cate[0] in text_le1_label:
                    skip_flag = 1
                if le_flag and bin_cate[:2] in text_le2_label:
                    skip_flag = 2
                # if bin_cate[0] in text_le1_label:
                #     skip_flag = 3
                if skip_flag:
                    text_pro = []
                    pre_result_dic[bin_cate] = []
                    for model in models:
                        # clf = load_model(model)
                        clf = model[1]
                        train_w2v = get_train_vec([text], w2v_model, skip_word_save_path)
                        # voc = model_name_map[model[0]]
                        # vectorizer = TfidfVectorizer(vocabulary=voc)
                        # tdm = vectorizer.fit_transform([text])
                        pred = clf.predict_proba(train_w2v)
                        text_pro.append(pred[0][1])
                    for c, md in model_merge_dic.items():
                        if bin_cate in md:
                            # print('load merge model %s' % mer_md)
                            merge_model = load_model(md)
                            pre = merge_model.predict_proba([text_pro])
                            pre_result_dic[bin_cate].append(pre[0][1])
            sort_pre_tuple = sorted(pre_result_dic.items(), key=lambda d: d[1],
                                    reverse=True)
            sort_cate_len = len(sort_pre_tuple)
            pre_cate_list = []
            for pre_cate in sort_pre_tuple:
                pre_cate_list.append(pre_cate[0])
            pre_len = len(pre_cate_list)
            # Count a hit if the true label is in the top 3 predictions
            if pre_len >= 3:
                if label in pre_cate_list[:3]:
                    right += 1
                    all_right += 1
            elif label in pre_cate_list:
                right += 1
                all_right += 1
            text_pre_results[label + '\t' + text].append(
                [text_le1_label, text_le2_label, pre_cate_list[:10]])
        acc = right / test_size
        print('acc %f' % acc)
        # way_1 (no fastText) reached 0.21 accuracy in ~6h; adding fastText
        # raised it to 0.23 (way_3, way_4)
        save_file_lines(record_path + cur_cate + '_svm_result_record_way_le1_le2_w2v.txt',
                        text_pre_results, 'w')
        '''
        # Earlier TF-IDF batch version, kept for reference:
        right = 0
        # doc_dic = {}
        doc_dic = {i: [] for i in range(test_size)}
        for bin_cate, models in model_dic.items():
            if bin_cate[0] not in level_2_pre_labels_list:
                continue
            text_pro = []
            pro_matrix = np.array([], [])
            # print('load binary model %s' % bin_cate)
            for model in models:
                # clf = load_model(model)
                clf = model[1]
                voc = model_name_map[model[0]]
                vectorizer = TfidfVectorizer(vocabulary=voc)
                tdm = vectorizer.fit_transform(texts)
                pred = clf.predict_proba(tdm)
                for i in range(len(pred)):
                    text_pro.append(pred[i][1])
            pro_matrix = np.array(text_pro).reshape((NUMBER, test_size)).T
            temp_pro = []
            for c, md in model_merge_dic.items():
                if bin_cate in md:
                    # print('load merge model %s' % mer_md)
                    merge_model = load_model(md)
                    pre = merge_model.predict_proba(pro_matrix)
                    for j in range(len(pre)):
                        doc_dic[j].append([pre[j][1], c])
        # print(doc_dic)
        # Without KB or hierarchical filtering
        for doc, pro_list in doc_dic.items():
            pro_sort = sorted(pro_list, key=lambda d: d[0], reverse=True)
            # Take the top-3 predicted categories (about 0.40+ accuracy)
            pre_cate = [pro_sort[0][1], pro_sort[1][1], pro_sort[2][1]]
            if cur_cate in pre_cate:  # old method: pro_sort[0][1] == cur_cate
                right += 1
                all_right += 1
        # Sort the results and intersect with the knowledge base
        for doc, pro_list in doc_dic.items():
            sort_pre_tuple = sorted(pro_list, key=lambda d: d[0], reverse=True)
            sort_cate_len = len(sort_pre_tuple)
            pre_cate_list = []
            for pre_cate in sort_pre_tuple:
                pre_cate_list.append(pre_cate[1])
            if len(pre_cate_list) >= 3:
                if label in pre_cate_list[:3]:
                    right += 1
                    all_right += 1
            elif label in pre_cate_list:
                right += 1
                all_right += 1
            # Intersect with the knowledge-base predictions
            kb_pre_cate = []
            for cate in pre_cate_list:
                if cate in pro_list[0]:
                    kb_pre_cate.append(cate)
            kb_pre_len = len(kb_pre_cate)
            if kb_pre_len >= 3:
                if cur_cate in kb_pre_cate[:3]:
                    right += 1
                    all_right += 1
            elif cur_cate in kb_pre_cate:
                right += 1
                all_right += 1
        acc = right / test_size
        print('acc %f' % acc)
        '''
        # if acc <= 0.4:
        #     save_file_lines(result_path + algorithm + '_less_0.2_way_0.txt',
        #                     cur_cate + ': ' + str(acc) + '\n', 'a')
        test_one_end_time = time.time()
        one_run_time = round(test_one_end_time - test_one_time, 4)
        print('test one cate time:%f\n' % one_run_time)
        save_file_lines(test_result_path,
                        cur_cate + ' dataset accuracy :%f' % acc + '\n', 'a')
    # BUG FIX: pooled over all documents this is the micro average, as the
    # saved result line below already says
    print('micro acc %f' % (all_right / all_len))
    test_end = time.time()
    test_run_time = round(test_end - test_start, 4)
    print('merge test time: %f' % (test_run_time))
    save_file_lines(test_result_path,
                    'using ' + algorithm + ' micro acc %f' % (all_right / all_len) + '\n', 'a')
    save_file_lines(test_result_path, 'merge test time: %f' % (test_run_time), 'a')
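# Usage sketch for the w2v/fastText test stage. fasttext_train is assumed to be
# a sibling module exposing load_fasttext/test_model as used above; every path
# is a placeholder. Because a second test() definition follows below, this call
# only resolves to the variant above if the two live in separate modules.
def _example_final_test():
    import fasttext_train                                   # assumed sibling module
    w2v_model = load_w2v_model('models/w2v_iter_20_model')  # assumed helper from this repo
    test('svm', 'data/test_final.txt', 'results/', 'models/',
         w2v_model, 'data/skip_words.txt', fasttext_train)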
# NOTE: second definition of test() (earlier TF-IDF variant, taking the parsed
# test dict instead of a file path). In a single module it shadows the
# w2v/fastText variant above; both are kept for reference.
def test(algorithm, test_final_data, result_save_path, model_save_path):
    cate_list = list(test_final_data.keys())
    class_number = len(cate_list)
    test_start = time.time()
    # Load the models
    model_dic = {}
    model_merge_dic = {}
    for j in range(class_number):
        cur_cate = cate_list[j]
        model_dic[cur_cate] = []
        model_path = model_save_path + cur_cate + '/'
        models = os.listdir(model_path)
        for model in models:
            if algorithm in model:  # e.g. svm
                model_full_path = model_path + model
                model_dic[cur_cate].append(model_full_path)
    for model in os.listdir(model_save_path):
        if algorithm in model:
            cate = model.split('_')[0]
            model_merge_dic[cate] = load_model(model_save_path + model)
    model_name_map = eval(
        read_file(result_save_path + algorithm + '_model_name_map.txt'))
    all_right = 0
    all_len = 0
    result_path = result_save_path + 'final_test' + '/'
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    test_result_path = result_path + algorithm + '_test.txt'
    test_datas = {}
    error_cate = {}
    classify_cate = {}
    # Build test texts and labels
    for i in range(class_number):
        cur_cate = cate_list[i]
        contents, labels = get_dataset(test_final_data[cur_cate], cur_cate)
        test_datas[cur_cate] = [contents, labels]
    # Test loop
    for cate, cons in test_datas.items():
        test_one_time = time.time()
        cur_cate = cate
        print('cur cate %s' % cur_cate)
        texts, labels = cons[0], cons[1]
        right = 0
        doc_dic = {}
        error_cate[cate] = []
        classify_cate[cate] = []
        test_size = len(labels)
        all_len += test_size
        doc_dic = {i: [] for i in range(test_size)}
        for bin_cate, models in list(model_dic.items()):
            # if not (bin_cate == 'A81' or bin_cate == 'B08' or bin_cate == 'D80'):
            #     continue
            text_pro = []
            pro_matrix = np.array([])  # BUG FIX: np.array([], []) is invalid
            # print('load binary model %s' % bin_cate)
            for model in models:
                clf = load_model(model)
                voc = model_name_map[model]
                vectorizer = TfidfVectorizer(vocabulary=voc)
                tdm = vectorizer.fit_transform(texts)
                pred = clf.predict_proba(tdm)
                for i in range(len(pred)):
                    text_pro.append(pred[i][1])
            pro_matrix = np.array(text_pro).reshape((NUMBER, test_size)).T
            temp_pro = []
            for c, md in model_merge_dic.items():
                if bin_cate == c:
                    pre = md.predict_proba(pro_matrix)
                    for j in range(len(pre)):
                        doc_dic[j].append([pre[j][1], c])
        # print(doc_dic)
        for doc, pro_list in doc_dic.items():
            pro_sort = sorted(pro_list, key=lambda d: d[0], reverse=True)
            # Take the top-3 predicted categories (about 0.40+ accuracy)
            pre_cate = [pro_sort[0][1], pro_sort[1][1], pro_sort[2][1]]
            if cur_cate in pre_cate:  # old method: pro_sort[0][1] == cur_cate
                right += 1
                all_right += 1
            else:
                error_cate[cate].append(pre_cate)
            classify_cate[cate].append(pre_cate)
            doc_dic[doc] = pro_sort[0][1]
        # print(doc_dic[2])
        acc = right / test_size
        print('acc %f' % acc)
        test_one_end_time = time.time()
        one_run_time = round(test_one_end_time - test_one_time, 4)
        print('test one cate time:%f\n' % one_run_time)
        save_file_lines(test_result_path,
                        cur_cate + ' dataset accuracy :%f' % acc + '\n', 'a')
        save_file_lines(result_save_path + 'final_test/' + algorithm + '_all.txt',
                        '\n' + cate + '\n' + str(classify_cate[cate]), 'a')
        save_file_lines(result_save_path + 'final_test/' + algorithm + '_error.txt',
                        '\n' + cate + '\n' + str(error_cate[cate]), 'a')
        break  # debug leftover: only the first category is evaluated
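# Usage sketch for the TF-IDF variant: unlike the w2v version, it takes the
# already-parsed test dict rather than a file path. Paths are placeholders.
def _example_test_tfidf_variant():
    test_final_data = eval(read_file('data/test_final.txt'))
    test('svm', test_final_data, 'results/', 'models/')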