def testMAP(topn=2):
    """Evaluate saved pyramid-model predictions with MAP / precision / recall.

    Loads the pickled per-query score tensors and the matching gold labels,
    takes the top-``topn`` scoring candidate indices per query, and prints
    MAP, precision and recall averaged over all queries.

    :param topn: number of top-ranked candidates to keep per query
        (default 2, matching the previously hard-coded value).
    """
    result_dic = pickleload("./modelsave/pyramidModel0_predict.pkl", "./modelsave/result_dic.pkl")
    true_label_dic = pickleload("./modelsave/pyramidModel0_true.pkl", "./modelsave/true_label_dic.pkl")
    keys = result_dic.keys()
    MAPS = 0
    precisions = 0
    recalls = 0
    for key in keys:
        # concatenate the per-batch score tensors for this query
        out = torch.cat(result_dic[key], dim=0)
        print(out)
        print(true_label_dic[key])
        # scores are stacked along dim 0; topk picks candidate row indices
        predict_index = torch.topk(out, topn, dim=0)[1].squeeze(1).data.numpy()
        print("预测标签:", predict_index)
        print("-------------------------------------")
        precision, recall, MAP = cal_MAP(true_label_dic[key], predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall
    print(len(keys))
    MAPS /= len(keys)
    precisions /= len(keys)
    recalls /= len(keys)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
def testRetrievalModelResult(topn=5):
    """Evaluate a ranking file produced by the ArcII retrieval model.

    Reads "./result/predict.test.arcii_ranking.txt", groups the ranked
    candidate ids by query name, keeps the top-``topn`` candidates per
    query, and reports averaged MAP/precision/recall plus BLEU-1 and ROUGE
    of the selected candidate citations against the reference target text
    of the dev split (last fifth of random_train_data).

    :param topn: number of top-ranked candidates kept per query.
    """
    with open("./result/predict.test.arcii_ranking.txt") as fp:
        lines = fp.readlines()
    result_lis = {}
    last_name = ""
    for line in lines:
        results = line.replace("\n", "").split(" ")
        if last_name != results[0]:
            # new query starts; the file is assumed grouped by query name
            last_name = results[0]
            result_lis[last_name] = []
        # candidate id is the trailing "_<idx>" of the third column
        result_lis[last_name].append(int(results[2].split("_")[-1]))
    # dict preserves insertion order, so this follows file (= dev) order;
    # only values are needed, so iterate .values() rather than .items()
    predict_indexs = list(result_lis.values())
    MAPS = 0
    precisions = 0
    recalls = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    bleus = 0
    rouges = 0
    for id in range(len(dev_data)):
        target = dev_data[id]["target_tokens"]
        citations = dev_data[id]['citations_tokens']
        true_label = []
        predict_index = predict_indexs[id][0:topn]
        for i in range(len(citations)):
            citation = citations[i]
            if citation['label'] == 1:
                true_label.append(i)
        bleu = 0
        rouge = 0
        for predict in predict_index:
            # print(predict)
            alternative_citation = citations[predict]["target_tokens"]
            # skip degenerate (very short) texts that would distort BLEU/ROUGE
            if len(target.strip().split(" ")) < 5 or len(alternative_citation.strip().split(" ")) < 5:
                continue
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
        print(bleu)
        print("------------------------")
        bleus += bleu / len(predict_index)
        rouges += rouge / len(predict_index)
        precision, recall, MAP = cal_MAP(true_label, predict_index)
        precisions += precision
        recalls += recall
        MAPS += MAP
    # NOTE(review): MAP/P/R average over the number of queries found in the
    # ranking file while BLEU/ROUGE average over len(dev_data); these are
    # assumed equal — confirm the file covers exactly the dev split.
    MAPS /= len(predict_indexs)
    precisions /= len(predict_indexs)
    recalls /= len(predict_indexs)
    bleus /= len(dev_data)
    rouges /= len(dev_data)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
    print("bleu", topn, ":", bleus)
    print("rouge:", rouges)
def Bm25_similar():
    '''
    Evaluate BM25 retrieval on the dev split (last fifth of the data).

    Each record carries a citing context ("up_source_tokens",
    "down_source_tokens", "target_tokens") and a list of candidate
    citations, where label == 1 marks a true reference.  For every record
    the candidates are ranked by BM25 similarity against the upstream
    context and the top-5 are scored with cal_MAP; averaged
    MAP / precision / recall are printed at the end.
    :return:
    '''
    datas = pickleload("../data2/random_train_data.pkl", "../data2/random_train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    MAPS = 0
    precisions = 0
    recalls = 0
    for sample in tqdm(datas):
        up_tokens = process(sample["up_source_tokens"]).split(" ")
        down_tokens = process(sample["down_source_tokens"]).split(" ")
        tgt_text = process(sample["target_tokens"])
        # only the upstream context forms the BM25 query (downstream tokens
        # are computed but deliberately left out, as before)
        query_tokens = up_tokens  # + down_tokens
        candidates = sample["citations_tokens"]
        cand_content = dict()
        cand_target = dict()
        gold_indices = []
        # print(len(candidates))
        for pos, cand in enumerate(candidates):
            cand_up = process(cand["up_source_tokens"]).split(" ")
            cand_down = process(cand["down_source_tokens"]).split(" ")
            cand_tgt = process(cand["target_tokens"])
            # candidate document = upstream + target + downstream tokens
            cand_content[str(pos)] = cand_up + cand_tgt.split(" ") + cand_down
            cand_target[str(pos)] = cand_tgt
            if cand['label'] == 1:
                gold_indices.append(pos)
        ranked = getBm25TopSimilar(query_tokens, cand_content, num=5)
        precision, recall, MAP = cal_MAP(gold_indices, ranked)
        MAPS += MAP
        precisions += precision
        recalls += recall
    MAPS /= len(datas)
    precisions /= len(datas)
    recalls /= len(datas)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
def all_doubletrainKey(args):
    """Train the AllClassifyGetKeyWords ranker on high/low citation pairs.

    Loads paired training data (four shards concatenated) plus the dev
    split, builds a Transformer encoder warm-started from
    "TransformModel0.pkl", then optimizes a pairwise margin loss that also
    pulls four keyword-selection scores towards 7.  At every epoch boundary
    the model is scored on the dev set with MAP/P/R and the best-MAP
    checkpoint is saved as "max"+args.modelName.

    :param args: configuration namespace (optim, learning_rate, batch_size,
        num_epoches, context_limit, d_model, modelName, loadmodel,
        loadmodelName, init_weight_decay, ...).
    """
    data = pickleload(
        '../Retrieval/train_data/small_pairs_random_train_data.pkl',
        "small_pairs_random_train_data")
    dev_data = pickleload("../data2/random_train_data.pkl", "dev_data")
    # training set = four shards of pair data; dev = last fifth of raw data
    train_data = data[0] + data[1] + data[2] + data[3]
    dev_data = dev_data[len(dev_data) * 4 // 5:len(dev_data)]
    batch = Batch(args)
    # source_embedding = pickleload(args.source_emb_mat_pkl, "source_emb_mat_pkl")
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)
    train_batches = batch.double_train_batch(train_data, args.context_limit,
                                             args.num_epoches, args.batch_size)
    log_msg = "输入词空间大小:%d" % (input_vec)  # vocabulary size
    logger.info(log_msg)
    print(log_msg)
    transform = Transformer(args, input_vec)
    if torch.cuda.is_available():
        transform = transform.cuda()
    # warm-start the encoder from a pre-trained Transformer checkpoint
    transform.load_state_dict(
        torch.load("./modelsave/" + "TransformModel0.pkl"))
    model = AllClassifyGetKeyWords(args, transform)
    model = model.cuda()
    if args.loadmodel == True:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))
    # for param in model.parameters():
    #     param.data.uniform_(-0.08, 0.08)
    #     param.data.uniform_(-0.08, 0.08)
    parameters_trainable = list(
        filter(lambda p: p.requires_grad, model.parameters()))
    if args.optim == "Adadelta":
        optimizer = torch.optim.Adadelta(parameters_trainable,
                                         lr=args.learning_rate,
                                         weight_decay=args.init_weight_decay)
    elif args.optim == "Adam":
        optimizer = torch.optim.Adam(parameters_trainable,
                                     lr=args.learning_rate,
                                     weight_decay=args.init_weight_decay)
    elif args.optim == "SGD":
        optimizer = torch.optim.SGD(parameters_trainable,
                                    lr=args.learning_rate,
                                    weight_decay=args.init_weight_decay)
    # NOTE(review): duplicate of the args.loadmodel load above — redundant
    if args.loadmodel == True:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))
    # log hyper-parameters (optimizer, learning rate, hidden size, model name)
    log_msg = "优化函数:%s \n 学习率:%s \n 隐藏层:%s\n 保存模型名称:%s \n" % (
        args.optim, args.learning_rate, args.d_model, args.modelName)
    # print("dropout:", args.dropout)
    logger.info(log_msg)
    print(log_msg)
    set_epoch = 1
    pbar = tqdm(total=len(train_data) * args.num_epoches // args.batch_size + 1)

    def loss_func(high_out, low_out, seleout11, seleout12, seleout21,
                  seleout22):
        # Pairwise hinge term: wants high_out to beat low_out by margin 1.
        # The four squared terms pull each selection score towards 7
        # (the meaning of the constant 7 is not visible here — confirm).
        ones = torch.ones(high_out.size(0), 1).cuda()
        ones1 = 7 * torch.ones(high_out.size(0), 1).cuda()
        loss = torch.mean(ones - high_out + low_out) + torch.mean((ones1 - seleout11)*(ones1 - seleout11)) + torch.mean((ones1 - seleout12)*(ones1 - seleout12)) + \
               torch.mean((ones1 - seleout21)*(ones1 - seleout21)) + torch.mean((ones1 - seleout22)*(ones1 - seleout22))
        # second return value is the bare ranking term, logged separately
        return F.relu(loss), torch.mean(ones - high_out + low_out)

    print_loss_total = 0
    old_accu = 0  # best dev MAP seen so far
    print_loss_total2 = 0
    for train_step, (train_batch, epoch) in enumerate(train_batches):
        pbar.update(1)
        high_context_idxs = train_batch['high_cit_context_idxs']
        high_seg_ids = train_batch['high_seg_indexs']
        low_context_idxs = train_batch['low_cit_context_idxs']
        low_seg_ids = train_batch['low_seg_indexs']
        high_source_context_idxs = train_batch['high_source_context_idxs']
        high_source_seg_indexs = train_batch['high_source_seg_indexs']
        low_source_context_idxs = train_batch['low_source_context_idxs']
        low_source_seg_indexs = train_batch['low_source_seg_indexs']
        # `function` maps a token id to a mask value — presumably a padding
        # mask; confirm against its definition elsewhere in the project
        high_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in high_context_idxs.data.numpy()
            ], dtype=np.float)).cuda()
        low_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in low_context_idxs.data.numpy()
            ], dtype=np.float)).cuda()
        high_source_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in high_source_context_idxs.data.numpy()
            ], dtype=np.float)).cuda()
        low_source_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in low_source_context_idxs.data.numpy()
            ], dtype=np.float)).cuda()
        high_context_idxs = Variable(high_context_idxs).cuda()
        high_seg_ids = Variable(high_seg_ids).cuda()
        low_context_idxs = Variable(low_context_idxs).cuda()
        low_seg_ids = Variable(low_seg_ids).cuda()
        high_source_context_idxs = Variable(high_source_context_idxs).cuda()
        high_source_seg_indexs = Variable(high_source_seg_indexs).cuda()
        low_source_context_idxs = Variable(low_source_context_idxs).cuda()
        low_source_seg_indexs = Variable(low_source_seg_indexs).cuda()
        # two forward passes: the preferred (high) pair and the low pair
        out1, seleout11, seleout12 = model.forward(high_context_idxs,
                                                   high_seg_ids,
                                                   high_context_mask,
                                                   high_source_context_idxs,
                                                   high_source_seg_indexs,
                                                   high_source_context_mask)
        out2, seleout21, seleout22 = model.forward(low_context_idxs,
                                                   low_seg_ids,
                                                   low_context_mask,
                                                   low_source_context_idxs,
                                                   low_source_seg_indexs,
                                                   low_source_context_mask)
        # Get loss
        optimizer.zero_grad()
        #out1:batch * num_target * word_vec
        #out2:batch * 2
        loss, loss2 = loss_func(out1, out2, seleout11, seleout12, seleout21,
                                seleout22)
        # Backward propagation
        loss.backward()
        optimizer.step()
        loss_value = loss.data.item()
        print_loss_total += loss_value
        print_loss_total2 += loss2.data.item()
        del out1, out2
        if train_step % 100 == 0:
            # running averages over the last 100 steps (the first report,
            # at step 0, averages a single batch)
            log_msg = 'Epoch: %d, Train_step %d loss1: %.4f, loss2:%.4f' % (
                epoch, train_step, print_loss_total / 100,
                print_loss_total2 / 100)
            logger.debug(log_msg)
            print(log_msg)
            print_loss_total = 0
            print_loss_total2 = 0
        if epoch == set_epoch:
            # epoch boundary reached: evaluate on the dev split
            set_epoch += 1
            dev_batches = batch.dev_batch(dev_data, args.context_limit)
            result_dic = {}      # query id -> list of score tensors
            true_label_dic = {}  # query id -> gold labels
            for dev_step, dev_batch in enumerate(dev_batches):
                context_idxs = dev_batch['context_idxs']
                source_context_idxs = dev_batch['source_context_idxs']
                seg_indexs = dev_batch['seg_indexs']
                source_seg_indexs = dev_batch['source_seg_indexs']
                ref_labels = dev_batch['ref_labels']
                id = dev_batch['id']
                context_mask = torch.Tensor(
                    np.array([
                        list(map(function, xx))
                        for xx in context_idxs.data.numpy()
                    ], dtype=np.float)).cuda()
                source_context_mask = torch.Tensor(
                    np.array([
                        list(map(function, xx))
                        for xx in source_context_idxs.data.numpy()
                    ], dtype=np.float)).cuda()
                context_idxs = Variable(context_idxs).cuda()
                seg_indexs = Variable(seg_indexs).cuda()
                source_context_idxs = Variable(source_context_idxs).cuda()
                source_seg_indexs = Variable(source_seg_indexs).cuda()
                out, seleout1, seleout2 = model.forward(
                    context_idxs, seg_indexs, context_mask,
                    source_context_idxs, source_seg_indexs,
                    source_context_mask)
                # Get loss
                # accumulate scores per query id (gold labels stored once)
                if id not in result_dic:
                    result_dic[id] = []
                    result_dic[id].append(out.cpu().data)
                    true_label_dic[id] = ref_labels
                else:
                    result_dic[id].append(out.cpu().data)
                del out
            picklesave(result_dic, "./modelsave/all_dev_result_dic22.pkl",
                       "./modelsave/result_dic.pkl")
            picklesave(true_label_dic,
                       "./modelsave/all_dev_true_label_dic22.pkl",
                       "./modelsave/true_label_dic.pkl")
            keys = result_dic.keys()
            MAPS = 0
            precisions = 0
            recalls = 0
            for key in keys:
                out = torch.cat(result_dic[key], dim=0)
                predict_index = torch.topk(
                    out, 2, dim=0)[1].squeeze(1).data.numpy()
                # print("预测标签:",predict_index)
                precision, recall, MAP = cal_MAP(true_label_dic[key],
                                                 predict_index)
                MAPS += MAP
                precisions += precision
                recalls += recall
            # NOTE(review): averages divide by len(dev_data), not by the
            # number of distinct ids — assumed equal; confirm
            MAPS /= len(dev_data)
            precisions /= len(dev_data)
            recalls /= len(dev_data)
            all_loss = MAPS
            if all_loss > old_accu:
                # new best dev MAP: keep a "max" checkpoint
                old_accu = all_loss
                torch.save(model.state_dict(),
                           "./modelsave/max" + args.modelName)
                best_epoch = epoch
            # NOTE(review): best_epoch is referenced below but only assigned
            # when MAP improves — NameError if the first epoch's MAP is 0
            # else:
            #     args.learning_rate = args.learning_rate / 2.0
            #     if args.learning_rate <= 1e-6:
            #         args.learning_rate = 1e-6
            #     if args.optim == "Adadelta":
            #         optimizer = torch.optim.Adadelta(parameters_trainable, lr=args.learning_rate,
            #                                          weight_decay=args.init_weight_decay)
            #     elif args.optim == "Adam":
            #         optimizer = torch.optim.Adam(parameters_trainable, lr=args.learning_rate,
            #                                      weight_decay=args.init_weight_decay)
            #     elif args.optim == "SGD":
            #         optimizer = torch.optim.SGD(parameters_trainable, lr=args.learning_rate,
            #                                     weight_decay=args.init_weight_decay)
            log_msg = '\n验证集的MAP为: %.4f P为: %.4f R为: %.4f\n 取得最小loss的epoch为:%d' % (
                all_loss, precisions, recalls, best_epoch)
            logger.info(log_msg)
            print(log_msg)
            # save the model after every epoch (live checkpoint)
            # NOTE(review): the same checkpoint is saved twice in a row
            torch.save(model.state_dict(), "./modelsave/" + args.modelName)
            torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    pbar.close()
def test(args):
    """Evaluate a retrieval model (MatchPyramid/LstmMatch/Decomposable/
    Inference/ESIM/ArcII) on the dev split.

    Loads the last fifth of random_train_data, scores each query's
    candidate citations, takes the top-3 predictions, and reports averaged
    MAP/P/R plus BLEU-1 and ROUGE of the selected candidate citations
    against the reference target text.

    :param args: configuration namespace; args.model selects the
        architecture and args.loadmodelName the checkpoint to load.
    """
    args.dropout = 0  # disable dropout for evaluation
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data)*4//5:len(data)]
    # dev_data = data[2000: 4000]
    batch = Batch(args)
    word2index = pickleload(args.word2index_pkl, 'word2index')
    input_vec = len(word2index)
    source_embedding = pickleload("./word2vec/glove_300.pkl", "glove_300.pkl")
    source_embedding = np.array(source_embedding, dtype=np.float32)
    dev_batches = batch.dev_batch(dev_data, args.context_limit,
                                  args.citation_limit)
    log_msg = "输入词空间大小:%d" %(input_vec)  # vocabulary size
    logger.info(log_msg)
    print(log_msg)
    # pick the architecture by name
    if args.model == "MatchPyramid":
        model = MatchPyramid(args, input_vec, source_embedding)
    elif args.model == "LstmMatch":
        model = LstmMatch(args, input_vec, source_embedding)
    elif args.model == "Decomposable":
        model = Decomposable(args, input_vec, source_embedding)
    elif args.model == "Inference":
        model = Inference(args, input_vec, source_embedding)
    elif args.model == "ESIM":
        model = ESIM(args, input_vec, source_embedding)
    elif args.model == "ArcII":
        model = ArcII(args, input_vec, source_embedding)
    if args.loadmodel ==True:
        model.load_state_dict(torch.load("./modelsave/"+ args.loadmodelName))
    if torch.cuda.is_available():
        model = model.cuda()
    # log which checkpoint is being evaluated
    log_msg = "模型名称:%s \n"%( args.loadmodelName)
    logger.info(log_msg)
    print(log_msg)
    pbar2 = tqdm(total=len(dev_data))
    MAPS = 0
    precisions = 0
    recalls = 0
    blues = 0
    rouges = 0
    for dev_step, dev_batch in enumerate(dev_batches):
        pbar2.update(1)
        context_idxs = dev_batch['context_idxs']
        cit_context_idxs = dev_batch['cit_context_idxs']
        ref_labels = dev_batch['ref_labels']
        target = dev_batch["targets"]
        citations = dev_batch['citations']
        # `function` maps a token id to a mask value — presumably a padding
        # mask; confirm against its definition elsewhere in the project
        context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in context_idxs.data.numpy()],
                     dtype=np.float)).cuda()
        cit_context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in cit_context_idxs.data.numpy()],
                     dtype=np.float)).cuda()
        context_idxs = Variable(context_idxs).cuda()
        cit_context_idxs = Variable(cit_context_idxs).cuda()
        out = model.forward(context_idxs, cit_context_idxs, context_mask,
                            cit_context_mask)
        # Get loss
        # print("真实值:",out)
        # print("真实标签:",ref_labels)
        topn = 3
        # indices of the top-3 scoring candidates for this query
        predict_index = torch.topk(out,topn, dim=0)[1].squeeze(1).data.cpu().numpy()
        bleu = 0
        rouge = 0
        for index in predict_index:
            alternative_citation = citations[index]["target_tokens"]
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            # print("候选citation:", alternative_citation)
        bleu = bleu / topn
        rouge = rouge / topn
        blues += bleu
        rouges += rouge
        # print("预测标签:",predict_index)
        precision, recall, MAP = cal_MAP(ref_labels, predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall
    MAPS /= len(dev_data)
    precisions /= len(dev_data)
    recalls /= len(dev_data)
    blues /= len(dev_data)
    rouges /= len(dev_data)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
    print("bleu", topn, ":", blues)
    print("rouge:", rouges)
    pbar2.close()
def findSimilar():
    '''
    Rank each query's candidate citations by an IDF-weighted similarity
    (getSVMScore) between the query's upstream context and each candidate's
    concatenated context, keep the top 3, and print averaged MAP/P/R over
    the dev split (last fifth of random_train_data).

    Top-3 indices are taken with a stable argsort, so tied scores yield
    three distinct candidate indices (ties resolve to the earliest index
    first) instead of repeating the first occurrence.
    :return:
    '''
    datas = pickleload("./data2/random_train_data.pkl",
                       "./data2/random_train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    idf_dic = pickleload("./data2/idf.pkl", "idf.pkl")
    # datas = datas[0:10]
    print(len(idf_dic))
    print(len(datas))
    count = 0
    MAPS = 0
    precisions = 0
    recalls = 0
    for data in tqdm(datas):
        up_source_tokens = process(data["up_source_tokens"])
        down_source_tokens = process(data["down_source_tokens"])
        target = process(data["target_tokens"])
        # score every candidate citation against the upstream context
        citations = data["citations_tokens"]
        scores = []
        count += 1
        ref_lis = []
        for index in range(len(citations)):
            if citations[index]['label'] == 1:
                ref_lis.append(index)
            ciation = citations[index]
            cit_up_source_tokens = process(ciation["up_source_tokens"])
            cit_down_source_tokens = process(ciation["down_source_tokens"])
            cit_target = process(ciation["target_tokens"])
            score = getSVMScore(
                idf_dic, up_source_tokens,
                cit_up_source_tokens + " " + cit_target + " " + cit_down_source_tokens)
            scores.append(score)
        # print("scores:",scores)
        # top-3 candidate indices by score; stable sort keeps the earliest
        # index first on ties, and fewer than 3 candidates no longer raises
        pre_lis = sorted(range(len(scores)), key=scores.__getitem__,
                         reverse=True)[:3]
        # print("ref_lis",ref_lis)
        # print("pre_lis",pre_lis)
        precision, recall, MAP = cal_MAP(ref_lis, pre_lis)
        MAPS += MAP
        precisions += precision
        recalls += recall
    MAPS /= len(datas)
    precisions /= len(datas)
    recalls /= len(datas)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
def test(args):
    """Evaluate the Transformer-based Classify model on the dev split.

    Scores every candidate per query, pickles the raw per-query scores and
    gold labels to ./modelsave/, then reports averaged MAP/P/R using the
    top-2 predictions per query.

    :param args: configuration namespace (context_limit, dropout,
        loadmodelName, ...).
    """
    args.dropout = 0.0  # disable dropout for evaluation
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    # dev_data = data[2000: 4000]
    batch = Batch(args)
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)
    dev_batches = batch.dev_batch(dev_data, args.context_limit)
    log_msg = "输入词空间大小:%d" % (input_vec)  # vocabulary size
    logger.info(log_msg)
    print(log_msg)
    transform = Transformer(args, input_vec)
    # transform.load_state_dict(torch.load("./modelsave/" + "TransformModel0.pkl"))
    if torch.cuda.is_available():
        transform = transform.cuda()
    # model = Classify(args, transform)
    model = Classify(args, transform)
    # the checkpoint name is hard-coded and loaded unconditionally
    # (the args.loadmodel guard is commented out)
    #if args.loadmodel ==True:
    model.load_state_dict(torch.load("./modelsave/" + "maxclassifyModel2.pkl"))
    if torch.cuda.is_available():
        model = model.cuda()
    # log the model name being evaluated
    log_msg = "模型名称:%s \n" % (args.loadmodelName)
    logger.info(log_msg)
    print(log_msg)
    result_dic = {}      # query id -> list of score tensors (one per batch)
    true_label_dic = {}  # query id -> gold label list
    for dev_step, dev_batch in enumerate(dev_batches):
        context_idxs = dev_batch['context_idxs']
        seg_indexs = dev_batch['seg_indexs']
        cit_targets = dev_batch['cit_targets']
        target = dev_batch['targets']
        ref_labels = dev_batch['ref_labels']
        id = dev_batch['id']
        print(id)
        # `function` maps a token id to a mask value — presumably a padding
        # mask; confirm against its definition elsewhere in the project
        context_mask = torch.Tensor(
            np.array(
                [list(map(function, xx)) for xx in context_idxs.data.numpy()],
                dtype=np.float)).cuda()
        context_idxs = Variable(context_idxs).cuda()
        seg_indexs = Variable(seg_indexs).cuda()
        out = model.forward(context_idxs, seg_indexs, context_mask)
        # Get loss
        # accumulate scores per query id (gold labels stored once)
        if id not in result_dic:
            result_dic[id] = []
            result_dic[id].append(out.cpu().data)
            true_label_dic[id] = ref_labels
        else:
            result_dic[id].append(out.cpu().data)
        del out
    picklesave(result_dic, "./modelsave/classifyModel2_predict.pkl",
               "./modelsave/result_dic.pkl")
    picklesave(true_label_dic, "./modelsave/classifyModel2_true.pkl",
               "./modelsave/true_label_dic.pkl")
    keys = result_dic.keys()
    MAPS = 0
    precisions = 0
    recalls = 0
    for key in keys:
        out = torch.cat(result_dic[key], dim=0)
        predict_index = torch.topk(out, 2, dim=0)[1].squeeze(1).data.numpy()
        # print("预测标签:",predict_index)
        precision, recall, MAP = cal_MAP(true_label_dic[key], predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall
    # NOTE(review): divides by len(dev_data) rather than len(keys) —
    # assumed equal (one id per dev example); confirm
    MAPS /= len(dev_data)
    precisions /= len(dev_data)
    recalls /= len(dev_data)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))