def process_data(self, file_name, pkl_name):  # entry point for preprocessing
    process_data = []
    ans = []
    a = 0  # number of doc spans in which the reference answer could not be located
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in tqdm(f.readlines()):
            sample = json.loads(line)
            docid, question, answer, text = sample['docid'], sample['question'], sample['answer'], sample['text']
            recall_paras = sample['recall_paras']
            pre_sample = {'question': question, 'docs': [], 'answers': answer}

            recall_paras = [normalized(n) for n in recall_paras]
            text = normalized(text)
            answer = normalized(answer)
            ans.append(answer)
            # recall_paras = [text]

            # Build overlapping doc spans from the top-5 recalled paragraphs.
            doc_stride = []
            for text in recall_paras[:5]:
                max_c_len = 512 - len(encode_pieces(self.sp_model, question)) - 5
                # text = '凰'.join([x for x in text.split(' ') if len(x) != 0])
                # answer = '凰'.join([x for x in answer.split(' ') if len(x) != 0])
                doc_stride.extend(self.get_doc_strides(text, max_c_len=max_c_len, ds=256))
            doc_stride = list(set(doc_stride))

            for ds_id, doc_span in enumerate(doc_stride):
                doc_span_token = ['<cls>'] + encode_pieces(self.sp_model, question) + ['<sep>'] + \
                                 encode_pieces(self.sp_model, doc_span)
                ref_ans_token = encode_pieces(self.sp_model, answer)
                start, end = self.find_answer(doc_span_token, ref_ans_token)

                if start[0] == 0:
                    # The answer was not found in this span.
                    ans_dict = {'is_impossible': True, 'answers': [[0, 0, 0, 0]]}
                    doc = {'content': doc_span_token, 'ans_dict': ans_dict}
                    pre_sample['docs'].append(doc)
                    a += 1
                else:
                    ans_dict = {'is_impossible': False, 'answers': [[0, 0, answer, 0]],
                                'muti_ans': [start, end]}
                    doc = {'content': doc_span_token, 'ans_dict': ans_dict}
                    pre_sample['docs'].append(doc)

            process_data.append(pre_sample)

    print('no-ans:', a)
    print('len(ans):', len(set(ans)))
    print('len(process_data): ', len(process_data))

    with open(pkl_name, 'w', encoding="utf-8") as fout:
        for feature in process_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
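# `process_data` (and the inference code further below) relies on a sliding-window
# `get_doc_strides` helper that is not shown in this section. A minimal sketch of the
# assumed behaviour follows: slide a window of at most `max_c_len` SentencePiece tokens
# over the passage with stride `ds`, returning each window re-joined as text. The name
# `get_doc_strides_sketch` and the exact overlap handling are assumptions, not the
# original implementation.
def get_doc_strides_sketch(sp_model, content, max_c_len=450, ds=256):
    tokens = encode_pieces(sp_model, content)
    spans = []
    start = 0
    while start < len(tokens):
        window = tokens[start:start + max_c_len]
        spans.append(''.join(window))  # each span is re-tokenized by the caller
        if start + max_c_len >= len(tokens):
            break
        start += ds
    return spans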
def train(json_path, seg, save_path, stopwords):
    all_question_gram = get_all_question_token()
    print('===== loading data =====')
    with open(json_path, 'r') as f:
        corpus = []
        g = f.read().split('\n')
        for data_piece in tqdm(g):
            if not data_piece.strip():  # skip blank lines (e.g. a trailing newline)
                continue
            context = normalized(json.loads(data_piece)['context'])
            # word segmentation
            item_str = seg(context)
            # remove stopwords
            doc = []
            for word in item_str:
                if word not in stopwords:
                    doc.append(word)
            # 2-grams
            ngram = [''.join([doc[x], doc[x + 1]]) for x in range(len(doc) - 1)]
            ngram = set(ngram)
            # keep only the 2-grams that also appear in some question
            ngram = list(ngram & set(all_question_gram))
            # ngram = [x for x in ngram if x in all_question_gram]
            doc.extend(ngram)
            corpus.append(doc)

    dictionary = corpora.Dictionary(corpus)
    print('document dictionary length: {}'.format(len(dictionary)))

    # corpus is the full document collection
    bm25Model = BM25(corpus)
    with open(os.path.join(save_path, 'bm25_2gram.Model'), 'wb') as f:
        pickle.dump(bm25Model, f)
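# A minimal usage sketch for the BM25 training step above. The segmenter, stopword file,
# and paths are placeholders (jieba is one possible tokenizer; the original project may
# wire this up differently), so this is an illustration rather than the project's driver.
def _train_bm25_example():
    import jieba

    with open('data/stopwords.txt', 'r', encoding='utf-8') as f:  # hypothetical path
        stopwords = set(line.strip() for line in f)

    train(json_path='data/context.json',  # one JSON object per line with a 'context' field
          seg=lambda text: list(jieba.cut(text)),
          save_path='data/similarity',
          stopwords=stopwords)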
def context_eval(seg, tfidf_model, sparse_result):
    id2index, index2id = get_index_id_trans(
        os.path.join(SAVE_PATH, 'id2index.pkl'),
        os.path.join(SAVE_PATH, 'index2id.pkl'))
    print('===== loading data =====')
    acc = 0
    total = 0
    with open(os.path.join(SAVE_PATH, 'context.json'), 'r') as f:
        g = f.read().split('\n')
        for data_piece in tqdm(g):
            if not data_piece.strip():  # skip blank lines
                continue
            context = normalized(json.loads(data_piece)['context'])
            res = TFIDF.predict(context, seg, tfidf_model, sparse_result)
            total += 1
            # res contains corpus indices, so map the gold doc id to its index
            # (the original code looked this up in index2id, which maps the other way)
            if id2index[json.loads(data_piece)['id']] in res:
                acc += 1
    print("total: {}, acc: {}, rate: {}".format(total, acc, acc / total))
    return acc / total
def predict(from_train, query, seg, bm25Model, stopwords):
    train_question, question2id = from_train
    context = normalized(query)
    item_str = seg(context)
    # 2-grams over the segmented query
    ngram = [''.join([item_str[x], item_str[x + 1]]) for x in range(len(item_str) - 1)]
    item_str.extend(ngram)
    # remove stopwords
    doc = []
    for word in item_str:
        if word not in stopwords:
            doc.append(word)

    avg_idf = nCoV_BM25.get_avg_idf(bm25Model)  # kept for older gensim BM25 signatures; unused below
    scores = bm25Model.get_scores(doc)
    scores = np.array(scores)

    # fak = find_from_train(query, train_question, question2id)
    # scores[fak] = (scores[fak]+1)*10

    # indices of the top-300 documents, highest score first
    return list(reversed(list(np.argsort(scores))))[:300]
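# The retrieval `predict` above returns corpus *indices* (positions in the BM25 corpus),
# while the reader works with document ids. A small sketch of the assumed index -> id
# mapping step, reusing the index2id pickle loaded in `context_eval`; the helper name
# and the pickle layout are assumptions, not part of the original code.
def indices_to_docids(indices, index2id_path=os.path.join(SAVE_PATH, 'index2id.pkl')):
    with open(index2id_path, 'rb') as f:
        index2id = pickle.load(f)
    return [index2id[i] for i in indices]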
def evaluate(model, one_or_more='one', path=args.split_dev_data):
    with open('./data/similarity/id2index.pkl', 'rb') as f:
        id2index = pickle.load(f)
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load('data/spiece.model')
    rouge_L = RougeL(beta=1)
    device = torch.device("cuda", 0)
    tokenizer = XLNetTokenizer()
    model = model.to(device)

    predict_data = []
    hit_rate = {
        'docid_correct': 0,
        'ans_in_doc_correct': 0,
        'total': 0,
        'docid_score': 0,
        'ans_in_doc_score': 0,
        'wrong_score': 0
    }
    look_data = []

    with torch.no_grad():
        model.eval()
        with open(path, 'r', encoding='utf-8') as f:
            for step, line in enumerate(tqdm(f.readlines())):
                sample = json.loads(line)
                docid, question, answer, text, recall_paras = \
                    sample['docid'], sample['question'], sample['answer'], sample['text'], sample['recall_paras']
                top = [int(t) for t in sample['top']]

                recall_paras = [normalized(n) for n in recall_paras]
                ori_text = normalized(text)
                answer = normalized(answer)

                all_best_score = -999
                pre_ans = ''
                doc_index = top[0]
                doc_head_len = len(encode_pieces(sp_model, question))

                if one_or_more == 'more':
                    texts = recall_paras[:15]
                else:
                    texts = [text]

                for doc_num, text in enumerate(texts):
                    # protect spaces inside the passage with a rare placeholder character
                    text = '凰'.join([x for x in text.split(' ') if len(x) != 0])
                    max_c_len = 512 - len(encode_pieces(sp_model, question)) - 5
                    doc_strides = get_doc_strides(sp_model, content=text, max_c_len=max_c_len, ds=256)

                    for ds_id, doc_span in enumerate(doc_strides):
                        tokens = ['<cls>'] + encode_pieces(sp_model, question) + ['<sep>'] + \
                                 encode_pieces(sp_model, doc_span)
                        ques_len = len(encode_pieces(sp_model, question))
                        input_ids = tokenizer.convert_tokens_to_ids(tokens)
                        tokentype_ids = [1] * len(input_ids)
                        tokentype_ids[:ques_len + 1] = [0] * (ques_len + 1)
                        assert len(tokentype_ids) == len(input_ids)

                        tokentype_ids = torch.tensor(tokentype_ids).unsqueeze(0).to(device)
                        input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)

                        start, end, verify_gate, cls_logit = model(input_ids, token_type_ids=tokentype_ids)
                        # cls_logit = torch.nn.Softmax(dim=-1)(cls_logit)
                        # cls_logit = cls_logit.cpu().squeeze().tolist()
                        start, end = start.cpu().squeeze().tolist(), end.cpu().squeeze().tolist()
                        # verify_gate = torch.nn.Softmax(dim=-1)(verify_gate)
                        verify_gate = verify_gate.cpu().squeeze().tolist()
                        verify_gate = [i[1] for i in verify_gate]

                        is_ans = (start[0] + end[0])        # "no-answer" evidence at the <cls> position
                        start_g = sorted(start)[-5:][0]     # 5th-largest start logit, used as a threshold
                        end_g = sorted(end)[-5:][0]         # 5th-largest end logit

                        for s, s_prob in enumerate(start[1:-2]):
                            # skip the question prefix
                            if s < doc_head_len - 2:
                                continue
                            if s_prob < start_g:
                                continue
                            for e, e_prob in enumerate(end[s + 1:s + 1 + 280]):
                                if e_prob < end_g:
                                    continue
                                v_score = np.min(verify_gate[s + 1:e + s + 2])
                                h_score = (s_prob + e_prob) - is_ans + v_score
                                # h_score = h_score * cls_logit[1]
                                if doc_num > 14 and h_score > 0:  # only relevant if more than 15 passages are scored
                                    h_score *= 0.7
                                if h_score > all_best_score:
                                    here_ans = ''.join(tokens[s + 1:e + s + 2])
                                    all_best_score = h_score
                                    pre_ans = here_ans
                                    doc_index = top[doc_num]

                pre_ans = pre_ans.replace('凰', ' ')
                sc = rouge_L.get_rouge_L(answer, pre_ans)

                if doc_index == id2index[docid]:
                    hit_rate['docid_correct'] += 1
                    hit_rate['ans_in_doc_correct'] += 1
                    hit_rate['docid_score'] += sc
                    hit_rate['ans_in_doc_score'] += sc
                elif answer in recall_paras[top.index(doc_index)]:
                    hit_rate['ans_in_doc_correct'] += 1
                    hit_rate['ans_in_doc_score'] += sc
                else:
                    is_docid_in_top15 = id2index[docid] in top[:15]
                    look_data.append({
                        'predict_ans': pre_ans,
                        'real_ans': answer,
                        'top15_docs': recall_paras[:15],
                        'model_choose_doc': recall_paras[top.index(doc_index)],
                        'real_doc': ori_text,
                        'is_docid_in_top15': is_docid_in_top15,
                        'question': question,
                        'rouge-L': sc
                    })
                    hit_rate['wrong_score'] += sc

                hit_rate['total'] += 1
                predict_data.append({'pre': pre_ans, 'rel': answer})

                if step % 20 == -1:  # always False: the periodic progress print below is disabled
                    print('docid_hit_rate:{}\n'
                          'ans_in_doc_hit_rate:{}\n'
                          'docid_score:{}\n'
                          'ans_in_doc_score:{}\n'
                          'wrong_score:{}\n'.format(
                              float(hit_rate['docid_correct']) / (hit_rate['total']),
                              float(hit_rate['ans_in_doc_correct']) / (hit_rate['total']),
                              float(hit_rate['docid_score']) / (hit_rate['total']),
                              float(hit_rate['ans_in_doc_score']) / (hit_rate['total']),
                              float(hit_rate['wrong_score']) / (hit_rate['total'])))

    score = 0.0
    for n in predict_data:
        score += rouge_L.get_rouge_L(n['pre'], n['rel'])

    print('docid_hit_rate:{}\n'
          'ans_in_doc_hit_rate:{}\n'
          'docid_score:{}\n'
          'ans_in_doc_score:{}\n'
          'wrong_score:{}\n'.format(
              float(hit_rate['docid_correct']) / (hit_rate['total']),
              float(hit_rate['ans_in_doc_correct']) / (hit_rate['total']),
              float(hit_rate['docid_score']) / (hit_rate['total']),
              float(hit_rate['ans_in_doc_score']) / (hit_rate['total']),
              float(hit_rate['wrong_score']) / (hit_rate['total'])))
    print('rouge_L : ', score / len(predict_data))

    with open('look_data.json', 'w', encoding="utf-8") as fout:
        for feature in look_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')

    return score / len(predict_data)
def predict(model, paths):
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load('data/spiece.model')
    device = torch.device("cuda", 1)
    tokenizer = XLNetTokenizer()
    model = model.to(device)

    predict_data = []
    with torch.no_grad():
        model.eval()
        for path in paths:
            with open(path, 'r', encoding='utf-8') as f:
                for step, line in enumerate(tqdm(f.readlines())):
                    sample = json.loads(line)
                    qid, question, recall_paras = \
                        sample['qid'], sample['question'], sample['recall_paras']
                    recall_paras = [normalized(d) for d in recall_paras]
                    # note: evaluate() uses len(encode_pieces(sp_model, question)) here;
                    # the character length is kept as in the original
                    max_c_len = 512 - len(question) - 5

                    all_best_score = -999
                    pre_ans = ''
                    doc_head_len = len(encode_pieces(sp_model, question))

                    for text in recall_paras[:15]:
                        # protect spaces inside the passage with a rare placeholder character
                        text = '凰'.join([x for x in text.split(' ') if len(x) != 0])
                        doc_strides = get_doc_strides(sp_model, content=text, max_c_len=max_c_len, ds=256)

                        for ds_id, doc_span in enumerate(doc_strides):
                            tokens = ['<cls>'] + encode_pieces(sp_model, question) + ['<sep>'] + \
                                     encode_pieces(sp_model, doc_span)
                            ques_len = len(encode_pieces(sp_model, question))
                            input_ids = tokenizer.convert_tokens_to_ids(tokens)
                            tokentype_ids = [1] * len(input_ids)
                            tokentype_ids[:ques_len + 1] = [0] * (ques_len + 1)
                            assert len(tokentype_ids) == len(input_ids)

                            input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
                            tokentype_ids = torch.tensor(tokentype_ids).unsqueeze(0).to(device)

                            start, end, verify_gate, cls_logit = model(input_ids, token_type_ids=tokentype_ids)
                            is_ans = (start.cpu().squeeze().tolist()[0] + end.cpu().squeeze().tolist()[0])
                            # cls_logit = torch.nn.Softmax(dim=-1)(cls_logit)
                            # cls_logit = cls_logit.cpu().squeeze().tolist()
                            start, end = start.cpu().squeeze().tolist(), end.cpu().squeeze().tolist()
                            # verify_gate = torch.nn.Softmax(dim=-1)(verify_gate)
                            verify_gate = verify_gate.cpu().squeeze().tolist()
                            verify_gate = [i[1] for i in verify_gate]

                            start_g = sorted(start)[-5:][0]  # 5th-largest start logit, used as a threshold
                            end_g = sorted(end)[-5:][0]      # 5th-largest end logit

                            for s, s_prob in enumerate(start[1:-2]):
                                # skip the question prefix
                                if s < doc_head_len - 2:
                                    continue
                                if s_prob < start_g:
                                    continue
                                for e, e_prob in enumerate(end[s + 1:s + 1 + 280]):
                                    if e_prob < end_g:
                                        continue
                                    v_score = np.min(verify_gate[s + 1:e + s + 2])
                                    h_score = (s_prob + e_prob) - is_ans + v_score  # + 0.002*sim
                                    # h_score = h_score * cls_logit[1]
                                    if h_score > all_best_score:
                                        here_ans = ''.join(tokens[s + 1:e + s + 2])
                                        all_best_score = h_score
                                        pre_ans = here_ans

                    pre_ans = pre_ans.replace('凰', ' ')
                    predict_data.append({
                        'pre': pre_ans,
                        'qid': qid,
                        'question': question
                    })

    with open('result.json', 'w', encoding="utf-8") as fout:
        for feature in predict_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
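# The span-selection rule shared by `evaluate` and `predict` above, restated as a
# standalone function purely for readability: start/end logits are thresholded at their
# 5th-largest value, position 0 (<cls>) provides a no-answer score, and the verifier head
# contributes the minimum gate value over the candidate span. The function name and
# interface are ours; the scoring itself mirrors the loops above.
def score_best_span(tokens, start, end, verify_gate, doc_head_len, max_ans_len=280):
    is_ans = start[0] + end[0]            # no-answer evidence at the <cls> position
    start_g = sorted(start)[-5:][0]       # 5th-largest start logit
    end_g = sorted(end)[-5:][0]           # 5th-largest end logit
    best_score, best_ans = -999, ''
    for s, s_prob in enumerate(start[1:-2]):
        if s < doc_head_len - 2 or s_prob < start_g:   # skip the question prefix and weak starts
            continue
        for e, e_prob in enumerate(end[s + 1:s + 1 + max_ans_len]):
            if e_prob < end_g:
                continue
            v_score = np.min(verify_gate[s + 1:e + s + 2])
            h_score = (s_prob + e_prob) - is_ans + v_score
            if h_score > best_score:
                best_score, best_ans = h_score, ''.join(tokens[s + 1:e + s + 2])
    return best_score, best_ans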