def analysis_test():
    # Dump each test question next to its predicted answer (predict.json) for manual inspection.
    corpus = read_corpus()
    data = pd.read_csv(args.train_data, sep='\t')
    data = [{
        'docid': docid,
        'question': question,
        'answer': answer,
        'text': corpus[docid]
    } for _id, docid, question, answer in zip(data['id'], data['docid'],
                                              data['question'], data['answer'])]
    train_question = []
    question2id = {}
    for i in data:
        answer, question, text, docid = i['answer'], i['question'], i['text'], i['docid']
        question2id[question] = docid
        train_question.append(question)

    data = pd.read_csv('./result/0.6881.csv', sep='\t')
    predict_data = [{
        'id': qid,
        'answer': answer,
        'docid': docid
    } for qid, docid, answer in zip(data['id'], data['docid'], data['answer'])]

    data = pd.read_csv(args.test_data, sep='\t')
    id2question = {}
    for qid, question in zip(data['id'], data['question']):
        id2question[qid] = question

    corpus = read_corpus()
    import json
    with open('predict.json', 'w', encoding='utf-8') as f:
        for i in predict_data:
            ques = id2question[i['id']]
            # similar_q, score = para_recall(ques, train_question, max_para_num=5)
            # print(similar_q[:3])
            print('ques:', ques)
            # print('ans:', i['answer'])
            # print('doc:', corpus.get(i['docid'], 'none'))
            # print()
            f.write(
                json.dumps({
                    'question': ques,
                    'predict': i['answer']
                }, ensure_ascii=False) + '\n')
def top_para():
    # Compute the answer retention rate after extracting key sentences from the single gold document.
    correct, total = 0, 0
    data = pd.read_csv(args.train_data, sep='\t')
    data = [{
        'id': _id,
        'docid': docid,
        'question': question,
        'answer': answer
    } for _id, docid, question, answer in zip(data['id'], data['docid'],
                                              data['question'], data['answer'])]
    content_data = read_corpus()
    for step, i in enumerate(data):
        answer, question, docid = i['answer'], i['question'], i['docid']
        text = content_data[docid]
        if answer in text:
            if len(text) > 0:
                text = re.split('[\n\t]', text)
                # paras, _ = para_recall(question, text, max_para_num=30, sort=False)  # upper bound 0.969
                # retention vs. truncation length: 3072:9919, 2048:9715, 1536:0.9361, 1024:0.8578, 512:0.5854
                text = '。'.join(text)[:512]
            if answer in text:
                correct += 1
            total += 1
        if step % 200 == 0 and total:
            print(float(correct) / total)
    print(float(correct) / total)
    print(correct, total)
def make_test_data(self, topk_path, pkl_name):
    # Earlier variant (redefined below): attach the BM25 top-15 recalled documents to each
    # training sample, forcing the gold document into the candidate list.
    not_recall = 0
    print('load test data in {}'.format(topk_path))
    with open('./data/similarity/id2index.pkl', 'rb') as f:
        id2index = pickle.load(f)
    with open(topk_path, 'rb') as f:
        topk = pickle.load(f)
    print('4500:', len(topk))
    print('top:', len(topk[list(topk.keys())[0]]))
    data = []
    with open(args.split_train_data, 'r', encoding='utf-8') as f:
        for step, line in enumerate(tqdm(f.readlines())):
            sample = json.loads(line)
            sample['top'] = topk[sample['id']][:15]
            if id2index[sample['docid']] not in sample['top']:
                sample['top'][0] = id2index[sample['docid']]  # force 100% recall
                not_recall += 1
            data.append(sample)
    print('not_recall: ', not_recall)
    dev_data = []
    content_data = read_corpus()
    index_content_data = [None] * len(content_data)
    for docid, text in content_data.items():
        index_content_data[id2index[docid]] = text
    print('len(index_content_data):', len(index_content_data))
    for i in data:
        i['recall_paras'] = [index_content_data[index] for index in i['top']]
        dev_data.append(i)
        del i['top']
    with open(pkl_name, 'w', encoding="utf-8") as fout:
        for feature in dev_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
def make_dev_data(topk_path, pkl_name):
    print('load test data in {}'.format(topk_path))
    with open('./data/similarity/id2index.pkl', 'rb') as f:
        id2index = pickle.load(f)
    with open(topk_path, 'rb') as f:
        topk = pickle.load(f)
    print('500:', len(topk))
    print('top:', len(topk[list(topk.keys())[0]]))
    data = []
    with open(args.split_dev_data, 'r', encoding='utf-8') as f:
        for step, line in enumerate(tqdm(f.readlines())):
            sample = json.loads(line)
            sample['top'] = topk[sample['id']]
            data.append(sample)
    dev_data = []
    content_data = read_corpus()
    index_content_data = [None] * len(content_data)
    for docid, text in content_data.items():
        index_content_data[id2index[docid]] = text
    # index_content_data = get_all_subdoc()
    print('len(index_content_data):', len(index_content_data))
    for i in data:
        i['recall_paras'] = [index_content_data[index] for index in i['top']]
        i['top'] = [str(n) for n in i['top']]
        dev_data.append(i)
    with open(pkl_name, 'w', encoding="utf-8") as fout:
        for feature in dev_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
def make_test_data(topk_path, pkl_name):
    print('load test data in {}'.format(topk_path))
    with open('./data/similarity/id2index.pkl', 'rb') as f:
        id2index = pickle.load(f)
    with open(topk_path, 'rb') as f:
        topk = pickle.load(f)
    print('1643:', len(topk))
    print('top:', len(topk[list(topk.keys())[0]]))
    data = pd.read_csv(args.test_data, sep='\t')
    data = [{
        'qid': qid,
        'question': question
    } for qid, question in zip(data['id'], data['question'])]
    test_data = []
    content_data = read_corpus()
    index_content_data = [None] * len(content_data)
    print('len(index_content_data):', len(index_content_data))
    for docid, text in content_data.items():
        index_content_data[id2index[docid]] = text
    for i in data:
        i['recall_paras'] = [
            index_content_data[index] for index in topk[i['qid']]
        ]
        test_data.append(i)
    with open(pkl_name, 'w', encoding="utf-8") as fout:
        for feature in test_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
def make_test_data(self, topk_path, pkl_name):
    # Later variant: build samples from the split train data plus pseudo-labelled test
    # samples taken from vote.csv, each with its BM25 top-15 recalled documents attached.
    not_recall = 0
    print('load test data in {}'.format(topk_path))
    with open('./data/similarity/id2index.pkl', 'rb') as f:
        id2index = pickle.load(f)
    with open(topk_path, 'rb') as f:
        topk = pickle.load(f)
    with open('./data/similarity/bm25_test_2gram_top30.dict', 'rb') as f:
        test_topk = pickle.load(f)
    topk.update(test_topk)
    print('4500_1643:', len(topk))
    print('top:', len(topk[list(topk.keys())[0]]))
    data = []
    with open(args.split_train_data, 'r', encoding='utf-8') as f:
        for step, line in enumerate(tqdm(f.readlines())):
            sample = json.loads(line)
            sample['top'] = topk[sample['id']][:15]
            if id2index[sample['docid']] not in sample['top']:
                sample['top'][0] = id2index[sample['docid']]  # force 100% recall
                not_recall += 1
            data.append(sample)
    test_data = pd.read_csv(args.test_data, sep='\t')
    test_question = {}
    for qid, question in zip(test_data['id'], test_data['question']):
        test_question[qid] = question
    test_data = pd.read_csv('vote.csv', sep='\t')
    for qid, docid, answer in zip(test_data['id'], test_data['docid'],
                                  test_data['answer']):
        sample = {}
        sample['top'] = topk[qid][:15]
        sample['text'] = 'none'
        sample['answer'] = answer
        sample['docid'] = -1
        sample['question'] = test_question[qid]
        data.append(sample)
    print('not_recall: ', not_recall)
    dev_data = []
    content_data = read_corpus()
    index_content_data = [None] * len(content_data)
    for docid, text in content_data.items():
        index_content_data[id2index[docid]] = text
    print('len(index_content_data):', len(index_content_data))
    for i in data:
        i['recall_paras'] = [
            index_content_data[index] for index in i['top']
        ]
        dev_data.append(i)
        del i['top']
    with open(pkl_name, 'w', encoding="utf-8") as fout:
        for feature in dev_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
def use_bm25(mode, data_path, seg, save_path, bm25Model, stopwords, topk=5):
    corpus = read_corpus()
    data = pd.read_csv(args.train_data, sep='\t')
    data = [{
        'docid': docid,
        'question': question,
        'answer': answer,
        'text': corpus[docid]
    } for _id, docid, question, answer in zip(data['id'], data['docid'],
                                              data['question'], data['answer'])]
    train_question = []
    question2id = {}
    for i in data:
        answer, question, text, docid = i['answer'], i['question'], i['text'], i['docid']
        question2id[question] = docid
        train_question.append(question)
    if mode == 'dev_eval':
        nCoV_BM25.dev_eval([train_question, question2id],
                           dev_data=data_path,
                           seg=seg,
                           save_path=save_path,
                           bm25Model=bm25Model,
                           stopwords=stopwords,
                           topk=topk)
    elif mode == 'test_eval':
        nCoV_BM25.test_eval([train_question, question2id],
                            test_data=data_path,
                            seg=seg,
                            save_path=save_path,
                            bm25Model=bm25Model,
                            stopwords=stopwords,
                            topk=topk)
    elif mode == 'train_eval':
        id2index, index2id = get_index_id_trans(
            os.path.join(save_path, 'id2index.pkl'),
            os.path.join(save_path, 'index2id.pkl'))
        ret_tmp = nCoV_BM25.train_eval(
            from_train=[train_question, question2id],
            train_data=data_path,
            seg=seg,
            bm25Model=bm25Model,
            stopwords=stopwords,
            id2index=id2index)
        tmp = []
        print('k=', my_bm25.PARAM_K1)
        for i in range(0, 20, 1):
            tmp.append(ret_tmp[i] / 500)
            print(i + 1, ret_tmp[i] / 500)
        print(tmp)
    elif mode == 'train_model':
        nCoV_BM25.train('./data/similarity/context.json',
                        seg=seg,
                        save_path=save_path,
                        stopwords=stopwords)
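# Illustrative only: a minimal sketch of how use_bm25 might be driven end to end.
# The pickle file name, the empty stopword set, and seg=None below are assumptions
# made for the example, not paths or values confirmed by this repo.
def _example_bm25_pipeline():
    save_path = './data/similarity/'
    with open(os.path.join(save_path, 'bm25Model.pkl'), 'rb') as f:  # hypothetical file name
        bm25Model = pickle.load(f)
    stopwords = set()  # substitute the repo's real stopword list here
    # Retrieve the top-30 documents for every test question and write the
    # result under save_path via nCoV_BM25.test_eval.
    use_bm25('test_eval',
             data_path=args.test_data,
             seg=None,  # segmenter/tokenizer expected by nCoV_BM25
             save_path=save_path,
             bm25Model=bm25Model,
             stopwords=stopwords,
             topk=30)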
def unk():
    # Count training samples whose answer is missing from the gold document (no-ans)
    # or occurs in it more than once (multi-ans).
    a, b = 0, 0
    corpus = read_corpus()
    # Debug probe left over from development; it would stop the function here.
    # print(corpus['72a3eb8cada539ab9583e5ba0652b04b'])
    # exit()
    data = pd.read_csv(args.train_data, sep='\t')
    data = [{
        'docid': docid,
        'question': question,
        'answer': answer,
        'text': corpus[docid]
    } for _id, docid, question, answer in zip(data['id'], data['docid'],
                                              data['question'], data['answer'])]
    test_data = pd.read_csv(args.test_data, sep='\t')
    test_data = [{
        'qid': qid,
        'question': question
    } for qid, question in zip(test_data['id'], test_data['question'])]
    test_question = [i['question'] for i in test_data]
    train_question = []
    question2id = {}
    for i in tqdm(data):
        answer, question, text, docid = i['answer'], i['question'], i['text'], i['docid']
        question2id[question] = docid
        train_question.append(question)
        # print(text)
        # print(question + ' : ' + answer)
        # print()
        if len(text.split(answer)) > 2:
            # print(text)
            # print(len(text.split(answer)), answer)
            a += 1
            # print()
        if len(text.split(answer)) < 2:
            b += 1
            print(text)
            print(answer)
            print()
    print('no-ans:', b)
    print('multi-ans:', a)
def noise():
    # Measure answer retention after noise filtering and truncation to 3072 characters,
    # and report the average filtered document length.
    from find_noise import DataFilter, cut_sent
    data_filter = DataFilter()
    correct, total = 0, 0
    data = pd.read_csv(args.train_data, sep='\t')
    data = [{
        'id': _id,
        'docid': docid,
        'question': question,
        'answer': answer
    } for _id, docid, question, answer in zip(data['id'], data['docid'],
                                              data['question'], data['answer'])]
    content_data = read_corpus()
    length = 0
    for step, i in enumerate(data):
        answer, question, docid = i['answer'], i['question'], i['docid']
        text = content_data[docid]
        if answer in text:
            filter_paras = []
            for p in cut_sent(text):
                if True:  # filtering disabled: keep every sentence as-is; flip to False to filter noise
                    filter_paras.append(p)
                else:
                    if p not in data_filter.noise_paras:
                        filter_p = p
                        temp = []
                        for tem in filter_p.split(' '):
                            if tem not in data_filter.noise_words:
                                temp.append(tem)
                        filter_paras.append(' '.join(temp))
                    # else:
                    #     print(p)
            text = ''.join(filter_paras)
            text = text[:2048 + 1024]
            length += len(text)
            if answer in text:
                correct += 1
            total += 1
        if step % 200 == 0 and total:
            print(float(correct) / total)
    print(length / step)
def calculate_corpus():
    # Count how many corpus documents contain an answer from the training set,
    # i.e. how much of the corpus the training data actually covers.
    data = pd.read_csv(args.train_data, sep='\t')
    data = [{
        'docid': docid,
        'question': question,
        'answer': answer
    } for _id, docid, question, answer in zip(data['id'], data['docid'],
                                              data['question'], data['answer'])]
    corpus = read_corpus()
    all_corpus = list(corpus.values())
    a = 0
    num = []
    for i in tqdm(data):
        question, answer = i['question'], i['answer']
        for idx, n in enumerate(all_corpus):
            if answer in n:
                num.append(idx)
                a += 1
    print(a / len(data))
    print(len(set(num)))
    print(len(all_corpus))
def topk_para():
    # Compute the answer retention rate after extracting key sentences from the
    # BM25 recalled documents (top-5 per question).
    with open('./data/similarity/id2index.pkl', 'rb') as f:
        id2index = pickle.load(f)
    with open('./data/similarity/bm25_train_top300.pkl', 'rb') as f:
        topk = pickle.load(f)
    correct, total = 0, 0
    data = pd.read_csv(args.train_data, sep='\t')
    # Debug probe left over from development:
    # del data['id'], data['docid']
    # print(data)
    # exit()
    data = [{
        'id': _id,
        'docid': docid,
        'question': question,
        'answer': answer
    } for _id, docid, question, answer in zip(data['id'], data['docid'],
                                              data['question'], data['answer'])]
    content_data = read_corpus()
    index_content_data = [None] * len(content_data)
    for docid, text in content_data.items():
        index_content_data[id2index[docid]] = text
    correct, total = 0, 0
    for step, i in enumerate(data):
        answer, question, docid, top = i['answer'], i['question'], i['docid'], topk[step]
        text = content_data[docid]
        top = top[:5]
        candidate_text = [index_content_data[index] for index in top]
        merge_para = []
        main_para = []
        for num, text in enumerate(candidate_text):
            # Split each recalled document into sentences, then group them into
            # overlapping 3-sentence windows.
            text = text.split('。')
            if num > 5:
                if len(text) > 2:
                    merge_para.extend([
                        text[l - 1] + '。' + text[l] + '。' + text[l + 1]
                        for l in range(1, len(text) - 1, 2)
                    ])
                else:
                    merge_para.extend(text)
            else:
                main_para.extend([
                    text[l - 1] + '。' + text[l] + '。' + text[l + 1]
                    for l in range(1, len(text) - 1, 2)
                ])
        # paras, _ = para_recall(question, merge_para, max_para_num=1, sort=False)
        main_para, _ = para_recall(question, main_para, max_para_num=20000, sort=False)
        temp = '。'.join(main_para).split('。')
        # print(len('。'.join(main_para)))
        # print(len('。'.join(paras)))
        a = []
        for t in temp:
            if t not in a:
                a.append(t)
        # print(len(''.join(main_para + paras)))
        if answer in '。'.join(main_para):
            correct += 1
        total += 1
        if step % 200 == 0:
            print(float(correct) / total)
    print(correct, total)
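# Side note: a standalone toy illustration of the overlapping 3-sentence windows
# built in topk_para above (the example text and function name are made up for
# illustration only).
def _example_sentence_windows():
    text = '句子一。句子二。句子三。句子四。句子五'
    sents = text.split('。')
    windows = [
        sents[l - 1] + '。' + sents[l] + '。' + sents[l + 1]
        for l in range(1, len(sents) - 1, 2)
    ]
    # -> ['句子一。句子二。句子三', '句子三。句子四。句子五']
    print(windows)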