def process_data(self, file_name, pkl_name):
    """Convert a JSON-lines QA file into per-question training samples.

    Entry point of the preprocessing step. Every line of ``file_name`` is
    parsed as a JSON object providing ``question``, ``answer`` and
    ``recall_paras``. The first five recalled paragraphs are normalized and
    cut into overlapping spans with ``self.get_doc_strides``; every distinct
    span is tokenized together with the question and labelled with
    ``self.find_answer``. One JSON object per question is then written to
    ``pkl_name`` (despite the parameter name the output is JSON text, not a
    pickle). Three summary counters are printed before writing.

    Args:
        file_name: Path of the UTF-8 JSON-lines input file.
        pkl_name: Path of the UTF-8 JSON-lines output file.
    """
    samples = []
    normalized_answers = []
    no_answer_count = 0

    with open(file_name, 'r', encoding='utf-8') as f:
        for line in tqdm(f.readlines()):
            sample = json.loads(line)
            question = sample['question']
            raw_answer = sample['answer']
            # The stored 'answers' field keeps the answer as read from the
            # file; the normalized form is used for matching below.
            pre_sample = {'question': question, 'docs': [], 'answers': raw_answer}

            recall_paras = [normalized(p) for p in sample['recall_paras'][:5]]
            answer = normalized(raw_answer)
            normalized_answers.append(answer)

            # Question and answer tokens are the same for every span of this
            # sample, so encode them once instead of once per span.
            question_tokens = encode_pieces(self.sp_model, question)
            answer_tokens = encode_pieces(self.sp_model, answer)
            # Presumably 512 is the model's maximum sequence length and 5
            # leaves room for special tokens - TODO confirm.
            max_c_len = 512 - len(question_tokens) - 5

            spans = []
            for para in recall_paras:
                spans.extend(
                    self.get_doc_strides(para, max_c_len=max_c_len, ds=256))
            # De-duplicate while keeping first-seen order. A set() would make
            # the order of 'docs' depend on string-hash randomization and
            # therefore differ from run to run.
            spans = list(dict.fromkeys(spans))

            for doc_span in spans:
                doc_span_token = (['<cls>'] + question_tokens + ['<sep>']
                                  + encode_pieces(self.sp_model, doc_span))
                # find_answer is not visible here; hand it a fresh list in
                # case it modifies its arguments.
                start, end = self.find_answer(doc_span_token, list(answer_tokens))
                # Presumably a first start index of 0 (the '<cls>' slot)
                # means the answer was not found - confirm in find_answer.
                if start[0] == 0:
                    ans_dict = {'is_impossible': True, 'answers': [[0, 0, 0, 0]]}
                    no_answer_count += 1
                else:
                    ans_dict = {'is_impossible': False,
                                'answers': [[0, 0, answer, 0]],
                                'muti_ans': [start, end]}
                pre_sample['docs'].append(
                    {'content': doc_span_token, 'ans_dict': ans_dict})

            samples.append(pre_sample)

    print('no-ans:', no_answer_count)
    print('len(ans):', len(set(normalized_answers)))
    print('len(process_data): ', len(samples))

    with open(pkl_name, 'w', encoding="utf-8") as fout:
        for feature in samples:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
def get_doc_strides(self, content, max_c_len=args.max_c_len, ds=256):
    """Cut ``content`` into overlapping token windows, returned as strings.

    ``content`` is tokenized with ``encode_pieces(self.sp_model, ...)``. A
    window of up to ``max_c_len`` tokens is taken every ``ds`` tokens and
    its pieces are concatenated back into one string.

    Args:
        content: Text to split.
        max_c_len: Maximum number of tokens per window. The default is read
            from ``args.max_c_len`` once, when the function is defined.
        ds: Number of tokens between the starts of consecutive windows.

    Returns:
        A list of window strings. When the whole content fits into
        ``max_c_len`` tokens only the first window is returned (an empty
        list if there are no tokens); otherwise at most 15 windows.

    Raises:
        ValueError: If ``ds`` is not positive, because the window would
            never advance.
    """
    if ds <= 0:
        raise ValueError('ds must be positive, got {!r}'.format(ds))

    c_tokens = encode_pieces(self.sp_model, content)
    all_strides = [
        ''.join(c_tokens[begin:begin + max_c_len])
        for begin in range(0, len(c_tokens), ds)
    ]

    if len(c_tokens) <= max_c_len:
        return all_strides[:1]

    # Drop a trailing window whose text is already contained in the previous
    # one. The length guard matters when only a single window exists even
    # though the content is longer than max_c_len (ds >= token count):
    # indexing [-2] would otherwise raise IndexError.
    if len(all_strides) > 1 and all_strides[-1] in all_strides[-2]:
        all_strides = all_strides[:-1]
    return all_strides[:15]
def train(json_path, seg, save_path):
    """Fit a TF-IDF model on a JSON-lines corpus and pickle the results.

    Every non-blank line of ``json_path`` is parsed as a JSON object and its
    ``context`` field is tokenized with ``encode_pieces`` using the
    module-level ``sp_model``. The pieces of each context are joined with
    spaces and the resulting documents are used to fit a
    ``TfidfVectorizer``.

    Two files are written into ``save_path``:
        * ``tfidf_model.pkl`` - the fitted vectorizer.
        * ``document.pkl`` - the list of space-joined documents.

    The document-term matrix is not persisted (the original author noted
    that it could not be pickled); it can be rebuilt with
    ``tfidf_model.transform(document)``.

    Args:
        json_path: Path of the UTF-8 JSON-lines corpus.
        seg: Unused; kept so existing callers keep working.
        save_path: Directory that receives the two pickle files.
    """
    print('===== loading data =====')
    with open(json_path, 'r', encoding='utf-8') as f:
        # Skip blank lines: a file ending in a newline yields an empty last
        # element, which json.loads would reject.
        lines = [piece for piece in f.read().split('\n') if piece.strip()]

    real_documents = []
    for data_piece in tqdm(lines):
        context = json.loads(data_piece)['context']
        real_documents.append(encode_pieces(sp_model, context))

    tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",
                                       max_df=0.7,
                                       min_df=1 / 3000)
    document = [' '.join(sent) for sent in real_documents]
    tfidf_model = tfidf_vectorizer.fit(document)

    with open(os.path.join(save_path, 'tfidf_model.pkl'), 'wb') as f:
        pickle.dump(tfidf_model, f)
    with open(os.path.join(save_path, 'document.pkl'), 'wb') as f:
        pickle.dump(document, f)
def predict(query, seg, tfidf_model, sparse_result):
    """Rank corpus documents by their TF-IDF score against ``query``.

    The query is tokenized with ``encode_pieces`` using the module-level
    ``sp_model``, joined with spaces and transformed by ``tfidf_model``.
    The query vector is multiplied with the transposed document matrix to
    obtain one score per document.

    Args:
        query: Query text.
        seg: Unused.
        tfidf_model: Object whose ``transform`` vectorizes a list of
            documents (presumably the fitted ``TfidfVectorizer``).
        sparse_result: Document-term matrix with one row per document.

    Returns:
        A list of at most 300 document indices, highest score first.
    """
    pieces = encode_pieces(sp_model, query)
    query_vec = tfidf_model.transform([' '.join(pieces)]).todense()
    # Assuming todense() yields an np.matrix, ``*`` is a matrix product and
    # the result is a single row holding one score per document.
    similarity = query_vec * sparse_result.T
    scores = np.array(similarity)[0]
    ascending = np.argsort(scores)
    return list(ascending[::-1][:300])
def _pick_best_span(tokens, start, end, verify_gate, doc_head_len,
                    best_score, best_answer):
    """Score the candidate answer spans of one window and update the best.

    Args:
        tokens: Tokens of the window ('<cls>', question, '<sep>', span).
        start: Per-token start scores as a list of floats.
        end: Per-token end scores as a list of floats.
        verify_gate: Per-token verification scores as a list of floats.
        doc_head_len: Number of question tokens.
        best_score: Best score found so far for the current question.
        best_answer: Answer text belonging to ``best_score``.

    Returns:
        The ``(best_score, best_answer)`` pair after this window.
    """
    # Score at position 0 ('<cls>'), subtracted from every candidate.
    no_answer_score = start[0] + end[0]
    # Only positions scoring at least as high as the fifth-best are tried.
    start_floor = sorted(start)[-5:][0]
    end_floor = sorted(end)[-5:][0]

    # ``s`` enumerates start[1:-2], so the token index is ``s + 1``.
    for s, s_prob in enumerate(start[1:-2]):
        # NOTE(review): this admits token indices from doc_head_len - 1 on,
        # i.e. the last question tokens and '<sep>' - confirm the offset.
        if s < doc_head_len - 2 or s_prob < start_floor:
            continue
        # Ends are searched in a window of 280 tokens beginning at the start.
        for e, e_prob in enumerate(end[s + 1:s + 1 + 280]):
            if e_prob < end_floor:
                continue
            v_score = np.min(verify_gate[s + 1:e + s + 2])
            h_score = (s_prob + e_prob) - no_answer_score + v_score
            if h_score > best_score:
                best_score = h_score
                best_answer = ''.join(tokens[s + 1:e + s + 2])
    return best_score, best_answer


def predict(model, paths):
    """Run answer-span inference and write one prediction per question.

    Every line of every file in ``paths`` is parsed as a JSON object with
    ``qid``, ``question`` and ``recall_paras``. The first 15 recalled
    paragraphs are normalized, split into windows with ``get_doc_strides``
    and fed to ``model`` together with the question. The best-scoring span
    over all windows becomes the predicted answer. Predictions are written
    to ``result.json`` in the working directory, one JSON object per line
    with the keys ``pre``, ``qid`` and ``question``.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=...)``
            that returns four values; the first two are used as per-token
            start/end scores and the third as per-token pairs whose second
            element is used as a verification score.
        paths: Iterable of UTF-8 JSON-lines input files.
    """
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load('data/spiece.model')
    # NOTE: the CUDA device with index 1 is hard-coded.
    device = torch.device("cuda", 1)
    tokenizer = XLNetTokenizer()
    model = model.to(device)
    predict_data = []

    with torch.no_grad():
        model.eval()
        for path in paths:
            with open(path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            for line in tqdm(lines):
                sample = json.loads(line)
                qid = sample['qid']
                question = sample['question']
                recall_paras = [normalized(d) for d in sample['recall_paras']]

                # NOTE(review): this budget uses the character length of the
                # question, while the input below is built from its tokens -
                # confirm that this is intended.
                max_c_len = 512 - len(question) - 5
                # The question is fixed per sample: tokenize it once instead
                # of once (or more) per window.
                question_tokens = encode_pieces(sp_model, question)
                doc_head_len = len(question_tokens)

                all_best_score = -999
                pre_ans = ''
                for text in recall_paras[:15]:
                    # Runs of spaces are replaced by the placeholder '凰' so
                    # they can be restored in the answer (presumably because
                    # tokenization does not preserve them).
                    text = '凰'.join(x for x in text.split(' ') if x)
                    doc_strides = get_doc_strides(
                        sp_model, content=text, max_c_len=max_c_len, ds=256)
                    for doc_span in doc_strides:
                        tokens = (['<cls>'] + question_tokens + ['<sep>']
                                  + encode_pieces(sp_model, doc_span))
                        input_ids = tokenizer.convert_tokens_to_ids(tokens)
                        # Segment 0 for '<cls>' + question, 1 for the rest.
                        tokentype_ids = [1] * len(input_ids)
                        tokentype_ids[:doc_head_len + 1] = [0] * (doc_head_len + 1)
                        assert len(tokentype_ids) == len(input_ids)

                        input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
                        tokentype_ids = torch.tensor(
                            tokentype_ids).unsqueeze(0).to(device)
                        start, end, verify_gate, _ = model(
                            input_ids, token_type_ids=tokentype_ids)

                        # Move each output to the host once.
                        start = start.cpu().squeeze().tolist()
                        end = end.cpu().squeeze().tolist()
                        verify_gate = [
                            pair[1]
                            for pair in verify_gate.cpu().squeeze().tolist()
                        ]

                        all_best_score, pre_ans = _pick_best_span(
                            tokens, start, end, verify_gate, doc_head_len,
                            all_best_score, pre_ans)

                predict_data.append({
                    'pre': pre_ans.replace('凰', ' '),
                    'qid': qid,
                    'question': question
                })

    with open('result.json', 'w', encoding="utf-8") as fout:
        for feature in predict_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')