def convert_w2v(binary_data_path, w2v_model, binary_w2v_data_path, skip_save_path=''):
    print('..building word2vec text representation')
    start_time = time.time()
    # Load the data
    binary_data_dic = load_data(binary_data_path)
    # Load the w2v model
    # w2v_model = load_w2v_model(w2v_model_path)
    # load_model_time = time.time()
    # print('loading the w2v model took: {}s'.format(load_model_time - start_time))
    for cate, data in binary_data_dic.items():
        print('cur_cate: ' + cate)
        for i in range(NUMBER):
            self_con, self_label = binary_data_dic[cate][i][0], binary_data_dic[cate][i][1]
            train_w2v = get_train_vec(self_con, w2v_model, skip_save_path)
            binary_data_dic[cate][i] = [train_w2v, self_label]
    doc_w2v_json = json.dumps(binary_data_dic, cls=MyEncoder)  # faster than the previous approach
    save_file(binary_w2v_data_path, doc_w2v_json, 'w')
    end_time = time.time()
    print('text representation took: {}s'.format(end_time - start_time))
    return binary_data_dic
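# MyEncoder is used above but not defined in this module. The sketch below is an
# assumption, not the project's actual implementation: a json.JSONEncoder subclass
# that converts numpy arrays/scalars (as typically produced by get_train_vec) into
# plain Python types so json.dumps can serialise them.
import json
import numpy as np

class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        # Assumed behaviour: turn numpy containers/scalars into JSON-friendly types.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super(MyEncoder, self).default(obj)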
def train_classifier(self):
    # self.load_train_dataset()
    # One of the best parameter settings found experimentally
    start_time = time.time()
    classifier = ff.train_supervised(
        self.data_path, lr=0.1, loss='hs', wordNgrams=2,
        epoch=300)  # epoch=20 -> 0.91; epoch=50 -> 0.93
    # Save the model; accuracy: all: 0.91, all_2: 0.93
    classifier.save_model(
        self.model_save_path + 'level_2_fasttext_classifier_big_big.model')
    classifier.get_labels()  # list the labels
    # Test the model
    # print('loading fasttext model--{}'.format('level_1_fasttext_classifier_big_test.model'))
    # classifier = ff.load_model(self.model_save_path+'level_1_fasttext_classifier_big_test.model')
    test_result = classifier.test(self.test_save_path + 'test_big.txt')
    result_str = 'test precision:{}\n'.format(test_result)
    print(result_str)
    end_time = time.time()
    load_time = round(end_time - start_time, 3)
    train_time_str = 'train and test model time %fs' % load_time
    print(train_time_str)
    save_file(self.result_save_path + 'fasttext_result_big.txt',
              result_str + train_time_str + '\n', 'a')
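# A minimal usage sketch (not part of the original pipeline): load the saved fastText
# model and predict the label of one pre-segmented document. It assumes `ff` is the
# fasttext package (`import fasttext as ff`); the model path and example text below
# are hypothetical.
import fasttext as ff

model = ff.load_model('model/level_2_fasttext_classifier_big_big.model')
labels, probs = model.predict('word1 word2 word3')  # input must already be word-segmented
print(labels[0], probs[0])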
def select_datas(ori_datas, ch_of_corpus, level3_ch, lev_n, target_path, stopword):
    print('=' * 40)
    print('I-1. Extracting data, building training and test datasets...')
    # Total number of level-3 categories
    all_cate_count = 0
    # List holding all data
    all_datas = []
    # Number of level-3 categories under one level-1 category
    le3_cate_count = 0
    # Set the extraction flags
    ch_of_corpus_flag = []
    for ccn in ch_of_corpus:
        if ';' in ccn:
            item_list = ccn.split(';')
            ch_of_corpus_flag.append([item_list, [True] * len(item_list)])
        else:
            ch_of_corpus_flag.append([ccn, [True]])
    # Outer loop over the level-n category table, inner loop over the corpus class numbers
    for ch in level3_ch:
        # Number of records of this level-3 category under its level-1 category
        le3_data_count = 0
        # Data of the level-3 categories under one level-1 category (could also be level 2)
        le3_datas = []
        # Directory for the level-3 category data
        le3_datas_path = target_path + ch[0] + '/'
        if not os.path.exists(le3_datas_path):
            os.makedirs(le3_datas_path)
        for idx, ccn in enumerate(ch_of_corpus[:]):
            # Class-number list of the original record
            item_list = []
            if ';' in ccn:
                # The record has more than one class number
                item_list = ccn.split(';')
            else:
                item_list.append(ccn)
            for p in item_list:
                if not len(p):
                    continue
                if True not in ch_of_corpus_flag[idx][1]:
                    # Already extracted, skip
                    continue
                elif ch[0] == p[0] and ch in p:
                    # Extract records whose level-n category exists in the corpus
                    row = ori_datas.loc[idx]
                    title = row['标题']        # title
                    abstract = row['摘要']     # abstract
                    key_word = row['关键词']   # keywords
                    content = title + ' ' + abstract + ' ' + key_word
                    if len(content) < 10:
                        continue
                    # Mark as extracted
                    p_index = item_list.index(p)
                    ch_of_corpus_flag[idx][1][p_index] = False
                    con_seg = deal_data(content, stopword)
                    le3_datas.append('__label__' + ch + ', ' + ' '.join(con_seg))
                    le3_data_count += 1
        if le3_data_count >= 500:
            # Save the level-3 category data
            save_file(le3_datas_path + ch[0] + '_data_count.txt',
                      ch + '-->' + str(le3_data_count) + '\n', 'a')
            random.shuffle(le3_datas)
            write_data(le3_datas_path + ch[0] + '_data.txt', le3_datas)
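# deal_data is called above but defined elsewhere in the project. The sketch below is
# an assumption of the usual cleaning step for Chinese text: jieba word segmentation
# followed by stopword filtering, returning the list of remaining tokens.
import jieba

def deal_data(content, stopword):
    # Assumed behaviour: segment the raw text and drop stopwords / blank tokens.
    words = jieba.lcut(content)
    return [w for w in words if w.strip() and w not in stopword]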
def deal_datas(cut_path, save_path):
    # count_dic and contents are assumed to be module-level variables built earlier
    cate_counts = len(count_dic)
    for i, j in count_dic.items():
        save_file(save_path + 'train_count.txt', i + ':' + str(j) + ',', 'a')
    save_file(save_path + 'train_count.txt',
              '--------number of categories:' + str(cate_counts), 'a')
    random.shuffle(contents)
    write_datas(save_path + 'level_3_train.txt', contents)
def read_Ch_cate(save_path, le_file_path, class_info, len_info):
    print('Reading Chinese Library Classification numbers...')
    # Only build the category file if no .txt file has been generated yet
    # (the original check was `'.txt' not in os.listdir(save_path)`, which tested for
    # a file literally named '.txt')
    if not any(f.endswith('.txt') for f in os.listdir(save_path)):
        infos = pd.read_excel(le_file_path)
        # Extract the level-n class numbers
        le_n_datas = infos.loc[infos['层级'] == int(len_info)]
        le_n_id = list(le_n_datas['类号ID'])
        le_select = []
        for i in le_n_id:
            if i not in le_select:
                # i = re.sub(r'[\r\n\t]', '', i)
                le_select.append(str(i))
        # Save the level-n category table
        save_file(save_path + 'level_' + class_info + '_' + str(len_info) + '.txt',
                  ','.join(le_select), 'w')
def get_dataset(original_data_path, datalevel_info_file, w2v_model_path, dataset_w2v_data_path):
    dataset_dic = {}
    dataset_label_dic = {}
    # Load the word vectors
    # w2v_model = None
    w2v_model = load_w2v_model(w2v_model_path)
    # Read the level-2 category information of the dataset
    infos = read_file_lines(datalevel_info_file)
    level2_list = eval(infos[0])
    level2_dic = eval(infos[1])
    # Initialise the dataset dictionaries
    for le2 in level2_list:
        if le2 not in dataset_dic:
            dataset_dic[le2] = []
            dataset_label_dic[le2] = []
    # Read the dataset
    data_list, data_label_list = [], []
    contents = read_file_lines(original_data_path + 'le3_data.txt')
    for line in contents:
        # Split only on the first comma so the content is kept intact
        line_list = line.split(',', 1)
        label = line_list[0].replace('__label__', '')
        content = line_list[1]
        if label[:3] in level2_list:
            dataset_dic[label[:3]].append(content)
            dataset_label_dic[label[:3]].append(label)
        elif label[:2] in level2_list:
            dataset_dic[label[:2]].append(content)
            dataset_label_dic[label[:2]].append(label)
    # Save the intermediate file
    dataset_w2v_dic = {}
    key_list = dataset_dic.keys()
    for key in key_list:
        # if not len(dataset_dic[key]):
        #     print(key)
        if key not in dataset_w2v_dic:
            dataset_w2v_dic[key] = []
        dataset_w2v_dic[key] = get_train_vec(dataset_dic[key], w2v_model)
    doc_w2v_json = json.dumps(dataset_w2v_dic, cls=MyEncoder)
    save_file(dataset_w2v_data_path, doc_w2v_json, 'w')
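# get_train_vec is called above but defined elsewhere in the project. The sketch
# below is an assumption of the common approach: represent each document by the mean
# of the word2vec vectors of its tokens, with a zero vector for documents whose words
# are all out of vocabulary. It assumes w2v_model is a gensim KeyedVectors object.
import numpy as np

def get_train_vec(doc_list, w2v_model, skip_save_path=''):
    vectors = []
    dim = w2v_model.vector_size
    for doc in doc_list:
        words = doc.split()
        # Average the vectors of the in-vocabulary words of this document
        word_vecs = [w2v_model[w] for w in words if w in w2v_model]
        vectors.append(np.mean(word_vecs, axis=0) if word_vecs else np.zeros(dim))
    return np.array(vectors)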
def stat_le1_error(records_path, le1_save_path):
    if not os.path.exists(le1_save_path):
        os.makedirs(le1_save_path)
    le1_error_dic = {}
    records_file_list = os.listdir(records_path)
    for file in records_file_list:
        if not file[0] in le1_error_dic:
            # [correct count, error count] per level-1 category
            le1_error_dic[file[0]] = [0, 0]
        record_list = read_file_lines(records_path + file)
        for line in record_list:
            if 'le1 result' in line:
                line_list = line.split(':')
                le1 = line_list[1].strip()
                if le1 != file[0]:
                    le1_error_dic[file[0]][1] += 1
                else:
                    le1_error_dic[file[0]][0] += 1
    print(le1_error_dic)
    test_text_count, error_text_count = 0, 0
    rate_of_error_dic = {}
    for i, j in le1_error_dic.items():
        rate_of_error_dic[i] = '{:.3}'.format(j[1] / j[0] * 100) + '%'
        test_text_count += j[0]
        error_text_count += j[1]
    save_file(le1_save_path + 'le1_error_rate.txt',
              'Level-1 misclassified document statistics\n\n', 'w')
    save_file(le1_save_path + 'le1_error_rate.txt',
              str(le1_error_dic) + '\n' + str(rate_of_error_dic) + '\n\n', 'a')
    le1_error_rate = error_text_count / test_text_count
    save_file(le1_save_path + 'le1_error_rate.txt',
              'total test documents: {}'.format(test_text_count) + '\n' +
              'total errors: {}'.format(error_text_count) + '\n' +
              'error ratio: {:.4}'.format(le1_error_rate), 'a')
def stat_le3_acc(le1_path, result_path, save_path, threshold):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    le1_list = read_file_lines(le1_path)
    results = read_file_lines(result_path)
    save_str = ''
    count = 0
    # Per level-1 category, count how many level-3 categories fall below the threshold
    le3_acc_result_dic = {i.strip(): 0 for i in le1_list}
    print(le3_acc_result_dic)
    for line in results[:-2]:
        line_list = line.split(':')
        cate = line_list[0].split()[0]
        acc = line_list[1].strip()
        if float(acc) <= threshold:
            save_str += cate + ':' + str(acc) + '\n'
            le3_acc_result_dic[cate[0]] += 1
            count += 1
    print('less ' + str(threshold) + ' category counts:' + str(count) + '\n')
    # save_file(save_path+'less_'+str(threshold)+'.txt', save_str, 'w')
    # save_file(save_path+'le3_acc_result.txt', 'less '+str(threshold)+' category counts:'+str(count)+'\n', 'a')
    save_file(save_path + 'le3_acc_distribute_result.txt',
              'less ' + str(threshold) + ':\n' + str(le3_acc_result_dic) + '\n', 'a')
def select_txt(le_n, con, le_n_names, levs, train_path, test_path, stopword_path):
    print('Extracting data, building training and test datasets...')
    stopword = read_file(stopword_path, 'utf').split('\n')
    # Number of classification categories
    count_train_1 = 0
    count_test_1 = 0
    # Temporary data holders
    train_list = []
    test_list = []
    # Number of level-3 categories
    le3_count_cate = 0
    # Current category marker
    temp_cate = le_n_names[0][0]
    for i in le_n_names:  # level-n category table
        # Per-category record counts
        count_train_3 = 0
        count_test_3 = 0
        # Temporary data holders
        test_1 = []
        train_1 = []
        test_3 = []
        train_3 = []
        # Level-1 category name for the training files
        le1_name = i[0]
        len_num = 1
        if '/' in i:
            i = i.replace(r'/', ' ')
        le_3_train_path = train_path + i[0] + '/'
        le_3_test_path = test_path + i[0] + '/'
        if not os.path.exists(le_3_train_path):
            os.makedirs(le_3_train_path)
        if not os.path.exists(le_3_test_path):
            os.makedirs(le_3_test_path)
        for m in enumerate(levs[:]):  # class-number list of the original data
            m_list = []
            try:
                if ';' in m[1]:
                    # The record has more than one class number
                    m_list = m[1].split(';')
                else:
                    m_list.append(m[1])
            except:
                continue
            for p in m_list:
                # print(p[0])
                if len(p) == 0:
                    continue
                elif i[0] == p[0] and i in p:
                    # The record matches the level-n category, so extract it
                    index = m[0]                 # index of the record to extract
                    item = con.loc[index]
                    title = item['标题']          # title
                    content = item['摘要']        # abstract, used as the text body
                    key_word = item['关键词']     # keywords
                    content = title + ' ' + content + ' ' + key_word
                    try:
                        # Roughly every fourth record goes into the test set
                        if len_num % 4 == 0:
                            test_1.append(deal_datas(i[0], content, stopword))
                            test_3.append(deal_datas(i, content, stopword))
                            count_test_3 += 1
                        else:
                            train_1.append(deal_datas(i[0], content, stopword))
                            train_3.append(deal_datas(i, content, stopword))
                            count_train_3 += 1
                    except:
                        print('Error while extracting data of category %s!' % i)
                    else:
                        len_num += 1
        if count_train_3 >= 100:
            # Save the level-3 training data of this category
            save_file(le_3_train_path + i[0] + '_train_count.txt',
                      i + '-->' + str(count_train_3) + ',', 'a')
            random.shuffle(train_3)
            write_datas(le_3_train_path + i[0] + '_train.txt', train_3)
            train_list.append(train_1)
            if temp_cate == i[0]:
                le3_count_cate += 1
            else:
                save_file(train_path + temp_cate + '/' + temp_cate + '_train_count.txt',
                          'number of categories' + '-->' + str(le3_count_cate) + ',', 'a')
                le3_count_cate = 1
                temp_cate = i[0]
        if count_test_3 > 35:
            # Save the level-3 test data of this category
            save_file(le_3_test_path + i[0] + '_test_count.txt',
                      i + '-->' + str(count_test_3) + ',', 'a')
            random.shuffle(test_3)
            write_datas(le_3_test_path + i[0] + '_test.txt', test_3)
            test_list.append(test_1)
    # Shuffle so that records of the same category are spread out
    for l1 in test_list:
        random.shuffle(l1)
        write_datas(test_path + 'level_' + le_n[0] + '_test.txt', l1)
    for l2 in train_list:
        random.shuffle(l2)
        write_datas(train_path + 'level_' + le_n[0] + '_train.txt', l2)
    save_file(train_path + temp_cate + '/' + temp_cate + '_train_count.txt',
              'number of categories' + '-->' + str(le3_count_cate) + ',', 'a')
def save_pre_file(file_path, file_con):
    save_file(file_path, file_con, 'w')
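# save_file, write_datas, and read_file_lines are used throughout this module but
# defined elsewhere in the project. The sketch below is an assumption of what these
# file helpers likely look like; adjust to the actual implementations.
def save_file(file_path, file_con, mode):
    # Write a string to file_path; mode is 'w' (overwrite) or 'a' (append).
    with open(file_path, mode, encoding='utf-8') as f:
        f.write(file_con)

def write_datas(file_path, data_list):
    # Append a list of records to file_path, one record per line.
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write('\n'.join(data_list) + '\n')

def read_file_lines(file_path):
    # Read a file and return its non-empty, stripped lines.
    with open(file_path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]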
def select_txt(le_n, con, le_n_names, levs, train_path, test_path, stopword_path, lens):
    print('Extracting data, building training and test datasets...')
    # Number of classification categories
    cate_count = 0
    # Temporary data holders
    train_list = []
    test_list = []
    stopword = read_file(stopword_path, 'utf').split('\n')
    for i in le_n_names:  # level-n category table
        # Per-category record counts
        count_train = 0
        count_test = 0
        # Temporary data holders
        test = []
        train = []
        # Level-1 category name for the training files
        # le1_name = i[0]
        len_num = 1
        if '/' in i:
            i = i.replace(r'/', ' ')
        for j in enumerate(levs[:]):  # class-number list of the original data
            j_list = []
            try:
                if ';' in j[1]:
                    # The record has more than one class number
                    j_list = j[1].split(';')
                else:
                    j_list.append(j[1])
            except:
                continue
            for p in j_list:
                p = str(p)
                # print(p[0])
                if len(p) == 0:
                    continue
                # elif i[0] == p[0] and i in p:  # the record matches the level-n category
                elif len(i) >= 3 and len(p) >= 3 and i[:3] == p[:3]:
                    # The first three characters of the class numbers match, so extract the record
                    index = j[0]                      # index of the record to extract
                    item = con.loc[index]
                    # title = item['标题']            # title
                    content = item['ContentText']     # abstract, used as the text body
                    # key_word = item['关键词']       # keywords
                    # content = title + ' ' + content + ' ' + key_word
                    try:
                        # if len_num % 8 == 0:
                        #     test.append(deal_datas(i, content, stopword))
                        #     count_test += 1
                        # else:
                        train.append(deal_datas(i, content, stopword))
                        count_train += 1
                    except:
                        print('Error while extracting data of category %s!' % i)
                    else:
                        len_num += 1
        if count_train >= 20:
            save_file(train_path + 'train_' + lens + '_count.txt',
                      i + '-->' + str(count_train) + ',', 'a')
            train_list.append(train)
            cate_count += 1
        # if count_test >= 0:
        #     save_file(test_path + 'test_' + lens + '_count.txt', i + '-->' + str(count_test) + ',', 'a')
        #     test_list.append(test)
    # Shuffle so that records of the same category are not clustered together
    for l1 in test_list:
        random.shuffle(l1)
        write_datas(test_path + 'level_' + lens + '_test.txt', l1)
    for l2 in train_list:
        random.shuffle(l2)
        write_datas(train_path + 'level_' + lens + '_train.txt', l2)
    save_file(train_path + 'train_' + lens + '_count.txt',
              'number of categories: ' + str(cate_count) + ',', 'a')