Code example #1
def testMAP():
    result_dic = pickleload("./modelsave/pyramidModel0_predict.pkl",
                            "./modelsave/result_dic.pkl")
    true_label_dic = pickleload("./modelsave/pyramidModel0_true.pkl",
                                "./modelsave/true_label_dic.pkl")
    keys = result_dic.keys()
    MAPS = 0
    precisions = 0
    recalls = 0
    for key in keys:
        out = torch.cat(result_dic[key], dim=0)
        print(out)
        print(true_label_dic[key])
        predict_index = torch.topk(out, 2, dim=0)[1].squeeze(1).data.numpy()
        print("预测标签:", predict_index)
        print("-------------------------------------")
        precision, recall, MAP = cal_MAP(true_label_dic[key], predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall
    print(len(keys))
    MAPS /= len(keys)
    precisions /= len(keys)
    recalls /= len(keys)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
Code example #2
def testRetrievalModelResult(topn=5):
    with open("./result/predict.test.arcii_ranking.txt") as fp:
        lines = fp.readlines()
    result_lis = {}
    last_name = ""
    for line in lines:
        results = line.replace("\n", "").split("\t")
        if last_name != results[0]:
            last_name = results[0]
            result_lis[last_name] = []
        result_lis[last_name].append(int(results[2].split("_")[-1]))
    predict_indexs = [value for key, value in result_lis.items()]
    MAPS = 0
    precisions = 0
    recalls = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    bleus = 0
    rouges = 0
    for id in range(len(dev_data)):
        target = dev_data[id]["target_tokens"]
        citations = dev_data[id]['citations_tokens']
        true_label = []
        predict_index = predict_indexs[id][0:topn]
        for i in range(len(citations)):
            citation = citations[i]
            if citation['label'] == 1:
                true_label.append(i)
        bleu = 0
        rouge = 0
        for predict in predict_index:
            # print(predict)
            alternative_citation = citations[predict]["target_tokens"]
            if len(target.strip().split(" ")) < 5 or len(alternative_citation.strip().split(" ")) < 5:
                continue
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            print(bleu)
        print("------------------------")
        bleus += bleu/len(predict_index)
        rouges += rouge/len(predict_index)
        precision, recall, MAP = cal_MAP(true_label, predict_index)
        precisions += precision
        recalls += recall
        MAPS += MAP
    MAPS /= len(predict_indexs)
    precisions /= len(predict_indexs)
    recalls /= len(predict_indexs)
    bleus /= len(dev_data)
    rouges /= len(dev_data)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
    print("bleu", topn, ":", bleus)
    print("rouge:", rouges)
Code example #3
def Bm25_similar():
    '''
    Expected data layout:
    [
            {
             "citStr": "",             # cited author and year
             "context": "",            # the full citation passage
             "up_source_tokens": "",
             "down_source_tokens": "",
             "target_tokens": "",
             "citations": [
                          {
                              "up_source_tokens":
                              "down_source_tokens":
                              "target_tokens":
                          }
                           ...
                          ]
            }
            ......

        ]
    :return:
    '''
    datas = pickleload("../data2/random_train_data.pkl", "../data2/random_train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    MAPS = 0
    precisions = 0
    recalls = 0
    for data in tqdm(datas):
        target_up_content = process(data["up_source_tokens"]).split(" ")
        target_down_content = process(data["down_source_tokens"]).split(" ")
        target_content = process(data["target_tokens"])
        content_tokens = target_up_content  # + target_down_content
        citations = data["citations_tokens"]
        citation_content_dict = dict()
        citation_target_dict = dict()
        index = 0
        # print(len(citations))
        ref_lis = []
        for citation in citations:
            sel_up_content = process(citation["up_source_tokens"]).split(" ")
            sel_down_content = process(citation["down_source_tokens"]).split(" ")
            sel_target = process(citation["target_tokens"])
            citation_content_tokens = sel_up_content + sel_target.split(" ") + sel_down_content
            citation_content_dict[str(index)] = citation_content_tokens
            citation_target_dict[str(index)] = sel_target
            if citation['label'] == 1:
                ref_lis.append(index)
            index += 1

        pre_lis = getBm25TopSimilar(content_tokens, citation_content_dict, num=5)

        precision, recall, MAP = cal_MAP(ref_lis, pre_lis)
        MAPS += MAP
        precisions += precision
        recalls += recall

    MAPS /= len(datas)
    precisions /= len(datas)
    recalls /= len(datas)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
Code example #4
def all_doubletrainKey(args):
    data = pickleload(
        '../Retrieval/train_data/small_pairs_random_train_data.pkl',
        "small_pairs_random_train_data")
    dev_data = pickleload("../data2/random_train_data.pkl", "dev_data")
    train_data = data[0] + data[1] + data[2] + data[3]
    dev_data = dev_data[len(dev_data) * 4 // 5:len(dev_data)]

    batch = Batch(args)
    # source_embedding = pickleload(args.source_emb_mat_pkl, "source_emb_mat_pkl")
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)

    train_batches = batch.double_train_batch(train_data, args.context_limit,
                                             args.num_epoches, args.batch_size)

    log_msg = "输入词空间大小:%d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)

    transform = Transformer(args, input_vec)

    if torch.cuda.is_available():
        transform = transform.cuda()

    transform.load_state_dict(
        torch.load("./modelsave/" + "TransformModel0.pkl"))

    model = AllClassifyGetKeyWords(args, transform)

    model = model.cuda()
    if args.loadmodel:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))
    # for param in model.parameters():
    #     param.data.uniform_(-0.08, 0.08)

    parameters_trainable = list(
        filter(lambda p: p.requires_grad, model.parameters()))

    if args.optim == "Adadelta":
        optimizer = torch.optim.Adadelta(parameters_trainable,
                                         lr=args.learning_rate,
                                         weight_decay=args.init_weight_decay)
    elif args.optim == "Adam":
        optimizer = torch.optim.Adam(parameters_trainable,
                                     lr=args.learning_rate,
                                     weight_decay=args.init_weight_decay)
    elif args.optim == "SGD":
        optimizer = torch.optim.SGD(parameters_trainable,
                                    lr=args.learning_rate,
                                    weight_decay=args.init_weight_decay)

    # Print parameters (the model was already loaded above if requested):
    log_msg = "优化函数:%s \n 学习率:%s \n 隐藏层:%s\n 保存模型名称:%s \n" % (
        args.optim, args.learning_rate, args.d_model, args.modelName)
    # print("dropout:", args.dropout)
    logger.info(log_msg)
    print(log_msg)

    set_epoch = 1
    pbar = tqdm(total=len(train_data) * args.num_epoches // args.batch_size +
                1)

    def loss_func(high_out, low_out, seleout11, seleout12, seleout21,
                  seleout22):
        # Margin ranking loss: the high-ranked score should exceed the
        # low-ranked score by a margin of 1; the squared terms pull each
        # selector output toward the constant 7.
        ones = torch.ones(high_out.size(0), 1).cuda()
        ones1 = 7 * torch.ones(high_out.size(0), 1).cuda()
        loss = torch.mean(ones - high_out + low_out) + \
               torch.mean((ones1 - seleout11) * (ones1 - seleout11)) + \
               torch.mean((ones1 - seleout12) * (ones1 - seleout12)) + \
               torch.mean((ones1 - seleout21) * (ones1 - seleout21)) + \
               torch.mean((ones1 - seleout22) * (ones1 - seleout22))
        return F.relu(loss), torch.mean(ones - high_out + low_out)

    print_loss_total = 0
    old_accu = 0
    best_epoch = 0  # initialized so the validation log is safe even if MAP never improves
    print_loss_total2 = 0
    for train_step, (train_batch, epoch) in enumerate(train_batches):
        pbar.update(1)
        high_context_idxs = train_batch['high_cit_context_idxs']
        high_seg_ids = train_batch['high_seg_indexs']
        low_context_idxs = train_batch['low_cit_context_idxs']
        low_seg_ids = train_batch['low_seg_indexs']
        high_source_context_idxs = train_batch['high_source_context_idxs']
        high_source_seg_indexs = train_batch['high_source_seg_indexs']
        low_source_context_idxs = train_batch['low_source_context_idxs']
        low_source_seg_indexs = train_batch['low_source_seg_indexs']

        # Build 0/1 padding masks (np.float was removed in NumPy 1.24; an
        # explicit float32 dtype is used instead).
        high_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in high_context_idxs.data.numpy()
            ],
                     dtype=np.float32)).cuda()
        low_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in low_context_idxs.data.numpy()
            ],
                     dtype=np.float32)).cuda()
        high_source_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in high_source_context_idxs.data.numpy()
            ],
                     dtype=np.float32)).cuda()
        low_source_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in low_source_context_idxs.data.numpy()
            ],
                     dtype=np.float32)).cuda()

        high_context_idxs = Variable(high_context_idxs).cuda()
        high_seg_ids = Variable(high_seg_ids).cuda()
        low_context_idxs = Variable(low_context_idxs).cuda()
        low_seg_ids = Variable(low_seg_ids).cuda()
        high_source_context_idxs = Variable(high_source_context_idxs).cuda()
        high_source_seg_indexs = Variable(high_source_seg_indexs).cuda()
        low_source_context_idxs = Variable(low_source_context_idxs).cuda()
        low_source_seg_indexs = Variable(low_source_seg_indexs).cuda()

        out1, seleout11, seleout12 = model.forward(high_context_idxs,
                                                   high_seg_ids,
                                                   high_context_mask,
                                                   high_source_context_idxs,
                                                   high_source_seg_indexs,
                                                   high_source_context_mask)
        out2, seleout21, seleout22 = model.forward(low_context_idxs,
                                                   low_seg_ids,
                                                   low_context_mask,
                                                   low_source_context_idxs,
                                                   low_source_seg_indexs,
                                                   low_source_context_mask)
        # Get loss
        optimizer.zero_grad()
        #out1:batch * num_target * word_vec
        #out2:batch * 2
        loss, loss2 = loss_func(out1, out2, seleout11, seleout12, seleout21,
                                seleout22)
        # Backward propagation
        loss.backward()
        optimizer.step()
        loss_value = loss.data.item()
        print_loss_total += loss_value
        print_loss_total2 += loss2.data.item()
        del out1, out2
        if train_step % 100 == 0:
            log_msg = 'Epoch: %d, Train_step %d  loss1: %.4f, loss2:%.4f' % (
                epoch, train_step, print_loss_total / 100,
                print_loss_total2 / 100)
            logger.debug(log_msg)
            print(log_msg)
            print_loss_total = 0
            print_loss_total2 = 0
        if epoch == set_epoch:
            set_epoch += 1
            dev_batches = batch.dev_batch(dev_data, args.context_limit)
            result_dic = {}
            true_label_dic = {}
            for dev_step, dev_batch in enumerate(dev_batches):
                context_idxs = dev_batch['context_idxs']
                source_context_idxs = dev_batch['source_context_idxs']
                seg_indexs = dev_batch['seg_indexs']
                source_seg_indexs = dev_batch['source_seg_indexs']
                ref_labels = dev_batch['ref_labels']
                id = dev_batch['id']

                context_mask = torch.Tensor(
                    np.array([
                        list(map(function, xx))
                        for xx in context_idxs.data.numpy()
                    ],
                             dtype=np.float32)).cuda()
                source_context_mask = torch.Tensor(
                    np.array([
                        list(map(function, xx))
                        for xx in source_context_idxs.data.numpy()
                    ],
                             dtype=np.float32)).cuda()

                context_idxs = Variable(context_idxs).cuda()
                seg_indexs = Variable(seg_indexs).cuda()
                source_context_idxs = Variable(source_context_idxs).cuda()
                source_seg_indexs = Variable(source_seg_indexs).cuda()
                out, seleout1, seleout2 = model.forward(
                    context_idxs, seg_indexs, context_mask,
                    source_context_idxs, source_seg_indexs,
                    source_context_mask)
                # Get loss
                if id not in result_dic:
                    result_dic[id] = []
                    result_dic[id].append(out.cpu().data)
                    true_label_dic[id] = ref_labels
                else:
                    result_dic[id].append(out.cpu().data)
                del out
            picklesave(result_dic, "./modelsave/all_dev_result_dic22.pkl",
                       "./modelsave/result_dic.pkl")
            picklesave(true_label_dic,
                       "./modelsave/all_dev_true_label_dic22.pkl",
                       "./modelsave/true_label_dic.pkl")
            keys = result_dic.keys()
            MAPS = 0
            precisions = 0
            recalls = 0
            for key in keys:
                out = torch.cat(result_dic[key], dim=0)
                predict_index = torch.topk(out, 2,
                                           dim=0)[1].squeeze(1).data.numpy()
                # print("预测标签:",predict_index)
                precision, recall, MAP = cal_MAP(true_label_dic[key],
                                                 predict_index)
                MAPS += MAP
                precisions += precision
                recalls += recall

            MAPS /= len(dev_data)
            precisions /= len(dev_data)
            recalls /= len(dev_data)
            all_loss = MAPS
            if all_loss > old_accu:
                old_accu = all_loss
                torch.save(model.state_dict(),
                           "./modelsave/max" + args.modelName)
                best_epoch = epoch
            # else:
            #     args.learning_rate = args.learning_rate / 2.0
            #     if args.learning_rate <= 1e-6:
            #         args.learning_rate = 1e-6
            #     if args.optim == "Adadelta":
            #         optimizer = torch.optim.Adadelta(parameters_trainable, lr=args.learning_rate,
            #                                          weight_decay=args.init_weight_decay)
            #     elif args.optim == "Adam":
            #         optimizer = torch.optim.Adam(parameters_trainable, lr=args.learning_rate,
            #                                      weight_decay=args.init_weight_decay)
            #     elif args.optim == "SGD":
            #         optimizer = torch.optim.SGD(parameters_trainable, lr=args.learning_rate,
            #                                     weight_decay=args.init_weight_decay)
            log_msg = '\nValidation MAP: %.4f  P: %.4f  R: %.4f\n Best epoch so far: %d' % (
                all_loss, precisions, recalls, best_epoch)
            logger.info(log_msg)
            print(log_msg)
            # Save the model after every epoch
            torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    pbar.close()
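The mask-building code in example #4 (and in the examples below) maps a helper named function over every token id. A plausible definition, assuming id 0 is the padding index, is a simple 0/1 indicator; the original definition is not shown in these examples.

# Hypothetical padding-mask helper: 1.0 for real tokens, 0.0 for padding,
# assuming token id 0 is reserved for padding.
def function(token_id):
    return 1.0 if token_id != 0 else 0.0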
Code example #5
def test(args):
    args.dropout = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data)*4//5:len(data)]
    # dev_data = data[2000: 4000]

    batch = Batch(args)
    word2index = pickleload(args.word2index_pkl, 'word2index')
    input_vec = len(word2index)
    source_embedding = pickleload("./word2vec/glove_300.pkl", "glove_300.pkl")
    source_embedding = np.array(source_embedding, dtype=np.float32)

    dev_batches = batch.dev_batch(dev_data, args.context_limit, args.citation_limit)

    log_msg = "输入词空间大小:%d" %(input_vec)
    logger.info(log_msg)
    print(log_msg)

    if args.model == "MatchPyramid":
        model = MatchPyramid(args, input_vec, source_embedding)
    elif args.model == "LstmMatch":
        model = LstmMatch(args, input_vec, source_embedding)
    elif args.model == "Decomposable":
        model = Decomposable(args, input_vec, source_embedding)
    elif args.model == "Inference":
        model = Inference(args, input_vec, source_embedding)
    elif args.model == "ESIM":
        model = ESIM(args, input_vec, source_embedding)
    elif args.model == "ArcII":
        model = ArcII(args, input_vec, source_embedding)

    if args.loadmodel:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))

    if torch.cuda.is_available():
        model = model.cuda()

    # Print parameters:
    log_msg = "Model name: %s \n" % (args.loadmodelName)
    logger.info(log_msg)
    print(log_msg)

    pbar2 = tqdm(total=len(dev_data))
    MAPS = 0
    precisions = 0
    recalls = 0
    bleus = 0
    rouges = 0
    for dev_step, dev_batch in enumerate(dev_batches):
        pbar2.update(1)
        context_idxs = dev_batch['context_idxs']
        cit_context_idxs = dev_batch['cit_context_idxs']
        ref_labels = dev_batch['ref_labels']
        target = dev_batch["targets"]
        citations = dev_batch['citations']
        context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        cit_context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in cit_context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()

        context_idxs = Variable(context_idxs).cuda()
        cit_context_idxs = Variable(cit_context_idxs).cuda()

        out = model.forward(context_idxs, cit_context_idxs, context_mask, cit_context_mask)
        # Get loss
        # print("真实值:",out)
        # print("真实标签:",ref_labels)
        topn = 3
        predict_index = torch.topk(out,topn, dim=0)[1].squeeze(1).data.cpu().numpy()

        bleu = 0
        rouge = 0

        for index in predict_index:
            alternative_citation = citations[index]["target_tokens"]
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            # print("候选citation:", alternative_citation)
        bleu = bleu / topn
        rouge = rouge / topn
        bleus += bleu
        rouges += rouge

        # print("预测标签:",predict_index)
        precision, recall, MAP = cal_MAP(ref_labels, predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall

    MAPS /= len(dev_data)
    precisions /= len(dev_data)
    recalls /= len(dev_data)
    bleus /= len(dev_data)
    rouges /= len(dev_data)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
    print("bleu", topn, ":", blues)
    print("rouge:", rouges)
    pbar2.close()
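test_rouge is likewise undefined in these listings. A minimal sketch using the rouge package's ROUGE-L F-score, assuming both arguments are plain strings; the original may compute a different ROUGE variant.

from rouge import Rouge

_rouge = Rouge()

# Hypothetical sketch of test_rouge(candidate, reference) returning the
# ROUGE-L F-score, with a guard for empty inputs.
def test_rouge(candidate, reference):
    if not candidate.strip() or not reference.strip():
        return 0.0
    return _rouge.get_scores(candidate, reference)[0]["rouge-l"]["f"]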
Code example #6
def findSimilar():
    '''
    Expected data layout:
    [
            {
             "citStr": "",      # cited author and year
             "context": "",     # the full citation passage
             "up_source": "",
             "down_source": "",
             "target": ""
             "citations": [
                          citation0,
                          citation1,
                           ...
                          ]
            }
            ......

        ]
    Find similar citations.
    :return:
    '''
    datas = pickleload("./data2/random_train_data.pkl",
                       "./data2/random_train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    idf_dic = pickleload("./data2/idf.pkl", "idf.pkl")
    # datas = datas[0:10]
    print(len(idf_dic))
    print(len(datas))
    count = 0

    MAPS = 0
    precisions = 0
    recalls = 0
    for data in tqdm(datas):
        up_source_tokens = process(data["up_source_tokens"])
        down_source_tokens = process(data["down_source_tokens"])
        target = process(data["target_tokens"])

        # score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        count += 1
        ref_lis = []
        for index in range(len(citations)):
            if citations[index]['label'] == 1:
                ref_lis.append(index)
            citation = citations[index]
            cit_up_source_tokens = process(citation["up_source_tokens"])
            cit_down_source_tokens = process(citation["down_source_tokens"])
            cit_target = process(citation["target_tokens"])
            score = getSVMScore(
                idf_dic, up_source_tokens, cit_up_source_tokens + " " +
                cit_target + " " + cit_down_source_tokens)
            scores.append(score)
        # print("scores:",scores)
        # Rank candidates by score via argsort; using scores.index() on a
        # sorted copy would repeat the same index for tied scores.
        pre_lis = sorted(range(len(scores)), key=lambda i: scores[i],
                         reverse=True)[:3]
        # print("原文:",up_source_tokens + " "+ down_source_tokens)
        # print("候选:",citations[pre_lis[0]]["up_source_tokens"])
        # print("候选:",citations[pre_lis[0]]["target_tokens"])
        # print("候选:",citations[pre_lis[0]]["down_source_tokens"])
        # print("ref_lis",ref_lis)
        # print("pre_lis",pre_lis)
        precision, recall, MAP = cal_MAP(ref_lis, pre_lis)
        # print("precision:", precision)
        # print("recall:", recall)
        # print("MAP:", MAP)
        # print("-----------------------------------------------")
        MAPS += MAP
        precisions += precision
        recalls += recall

    MAPS /= len(datas)
    precisions /= len(datas)
    recalls /= len(datas)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
Code example #7
def test(args):
    args.dropout = 0.0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    # dev_data = data[2000: 4000]

    batch = Batch(args)
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)

    dev_batches = batch.dev_batch(dev_data, args.context_limit)

    log_msg = "输入词空间大小:%d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)

    transform = Transformer(args, input_vec)
    # transform.load_state_dict(torch.load("./modelsave/" + "TransformModel0.pkl"))
    if torch.cuda.is_available():
        transform = transform.cuda()

    model = Classify(args, transform)

    #if args.loadmodel ==True:
    model.load_state_dict(torch.load("./modelsave/" + "maxclassifyModel2.pkl"))

    if torch.cuda.is_available():
        model = model.cuda()

    # Print parameters:
    log_msg = "Model name: %s \n" % (args.loadmodelName)
    logger.info(log_msg)
    print(log_msg)

    result_dic = {}
    true_label_dic = {}
    for dev_step, dev_batch in enumerate(dev_batches):
        context_idxs = dev_batch['context_idxs']
        seg_indexs = dev_batch['seg_indexs']
        cit_targets = dev_batch['cit_targets']
        target = dev_batch['targets']
        ref_labels = dev_batch['ref_labels']
        id = dev_batch['id']
        print(id)
        context_mask = torch.Tensor(
            np.array(
                [list(map(function, xx)) for xx in context_idxs.data.numpy()],
                dtype=np.float32)).cuda()

        context_idxs = Variable(context_idxs).cuda()
        seg_indexs = Variable(seg_indexs).cuda()
        out = model.forward(context_idxs, seg_indexs, context_mask)
        # Get loss
        if id not in result_dic:
            result_dic[id] = []
            result_dic[id].append(out.cpu().data)
            true_label_dic[id] = ref_labels
        else:
            result_dic[id].append(out.cpu().data)
        del out
    picklesave(result_dic, "./modelsave/classifyModel2_predict.pkl",
               "./modelsave/result_dic.pkl")
    picklesave(true_label_dic, "./modelsave/classifyModel2_true.pkl",
               "./modelsave/true_label_dic.pkl")
    keys = result_dic.keys()
    MAPS = 0
    precisions = 0
    recalls = 0
    for key in keys:
        out = torch.cat(result_dic[key], dim=0)
        predict_index = torch.topk(out, 2, dim=0)[1].squeeze(1).data.numpy()
        # print("预测标签:",predict_index)
        precision, recall, MAP = cal_MAP(true_label_dic[key], predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall

    MAPS /= len(dev_data)
    precisions /= len(dev_data)
    recalls /= len(dev_data)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))