def map_eval(eval_file, token_length, tokenizer, device, model, label_list):
    model.eval()
    datasets, labels = get_datasets(eval_file)
    total_batches = 0
    total_avp = 0.0
    total_mrr = 0.0
    for k, dataset in tqdm(datasets.items(), desc="Eval datasets"):
        examples = []
        for i, data in enumerate(dataset):
            examples.append(InputExample(i, data[0], data[1], '0'))
        eval_features = convert_examples_to_features(examples, label_list,
                                                     token_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long).to(device)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long).to(device)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long).to(device)
        x_input_ids = torch.tensor([f.input_ids_x for f in eval_features],
                                   dtype=torch.long).to(device)
        x_input_mask = torch.tensor([f.input_mask_x for f in eval_features],
                                    dtype=torch.long).to(device)
        x_segment_ids = torch.tensor([f.segment_ids_x for f in eval_features],
                                     dtype=torch.long).to(device)
        y_input_ids = torch.tensor([f.input_ids_y for f in eval_features],
                                   dtype=torch.long).to(device)
        y_input_mask = torch.tensor([f.input_mask_y for f in eval_features],
                                    dtype=torch.long).to(device)
        y_segment_ids = torch.tensor([f.segment_ids_y for f in eval_features],
                                     dtype=torch.long).to(device)
        with torch.no_grad():
            logits = model(x_input_ids, x_input_mask, x_segment_ids,
                           y_input_ids, y_input_mask, y_segment_ids,
                           all_input_ids, all_segment_ids, all_input_mask)
        # Probability of the positive class serves as the relevance score.
        score = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
        label = np.array(list(map(int, labels[k])))
        total_avp += mean_average_precision(label, score)
        total_mrr += mean_reciprocal_rank(label, score)
        total_batches += 1
    mAP = total_avp / total_batches
    mRR = total_mrr / total_batches
    logger.info("map is : {}, mrr is : {}".format(mAP, mRR))
    data = {'map': mAP, 'mrr': mRR}
    with open('./result.json', 'w', encoding='utf-8') as f:
        json.dump(data, f)
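
# --- Hedged sketch: the (labels, scores) ranking metrics assumed by map_eval.
# These are NOT the repository's implementations, only a minimal sketch of the
# standard per-query definitions, assuming `labels` is a 0/1 relevance array
# and `scores` the model's relevance scores for one candidate list.
import numpy as np

def mean_average_precision(labels, scores):
    order = np.argsort(scores)[::-1]              # best-scoring candidate first
    rel = np.asarray(labels)[order]
    if rel.sum() == 0:
        return 0.0
    precision_at_k = np.cumsum(rel) / (np.arange(len(rel)) + 1)
    # AP = mean of precision@k taken at each relevant position.
    return float((precision_at_k * rel).sum() / rel.sum())

def mean_reciprocal_rank(labels, scores):
    order = np.argsort(scores)[::-1]
    rel = np.asarray(labels)[order]
    hits = np.nonzero(rel)[0]
    return float(1.0 / (hits[0] + 1)) if hits.size else 0.0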
def evaluate(args, model, tokenizer, processor, eval_dataset,
             matched_questions_indexs, prefix=""):
    results = {}
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    # Eval!
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    epoch_iterator = tqdm(eval_dataloader, desc="Eval_Iteration",
                          disable=args.local_rank not in [-1, 0])
    eval_original_dataset, _ = load_and_cache_examples(
        args, args.task_name, tokenizer, processor, data_type='eval_original')
    eval_original_sampler = SequentialSampler(eval_original_dataset)
    eval_original_dataloader = DataLoader(eval_original_dataset,
                                          sampler=eval_original_sampler,
                                          batch_size=args.eval_batch_size)
    original_iterator = tqdm(eval_original_dataloader, desc="Original_Iteration",
                             disable=args.local_rank not in [-1, 0])
    original_embeddings = []
    eval_question_embeddings = []
    # Embed the candidate (original) questions: mean-pool token-level outputs
    # into one vector per question.
    for step, batch in enumerate(original_iterator):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            original_inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            original_outputs = model(**original_inputs)[0].mean(1)
            original_embeddings.append(original_outputs)
    original_embeddings = torch.cat(
        [embed.cpu().data for embed in original_embeddings]).numpy()
    # Embed the evaluation questions the same way.
    for step, batch in enumerate(epoch_iterator):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            eval_question_inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            eval_questions_outputs = model(**eval_question_inputs)[0].mean(1)
            eval_question_embeddings.append(eval_questions_outputs)
    eval_question_embeddings = torch.cat(
        [o.cpu().data for o in eval_question_embeddings]).numpy()
    scores = cosine_similarity(eval_question_embeddings, original_embeddings)
    sorted_indices = scores.argsort()[:, ::-1]
    mrr = mean_reciprocal_rank(matched_questions_indexs == sorted_indices)
    map_score = mean_average_precision(matched_questions_indexs == sorted_indices)
    print("mean reciprocal rank: {}".format(mrr))
    print("mean average precision: {}".format(map_score))
    results['mrr'] = mrr
    results['map'] = map_score
    output_eval_file = os.path.join(args.output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key, value in results.items():
            logger.info("  %s = %s", key, str(value))
            writer.write("%s = %s\n" % (key, value))
    return results
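
# --- Hedged sketch: boolean-matrix variants of the metrics used above.
# Here mean_reciprocal_rank / mean_average_precision receive
# `matched_questions_indexs == sorted_indices`, i.e. one binary relevance row
# per query, already in ranked order. A minimal sketch in the style of the
# widely circulated rank_metrics helpers (not necessarily the repo's code):
import numpy as np

def mean_reciprocal_rank(rs):
    # rs: iterable of binary relevance vectors, one per query, ranked best-first.
    first_hits = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1.0 / (h[0] + 1) if h.size else 0.0 for h in first_hits])

def average_precision(r):
    r = np.asarray(r) != 0
    # precision@k evaluated at each relevant position k.
    out = [np.mean(r[:k + 1]) for k in range(r.size) if r[k]]
    return np.mean(out) if out else 0.0

def mean_average_precision(rs):
    return np.mean([average_precision(r) for r in rs])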
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file",
                        default='../data/right_samples.csv',
                        type=str,
                        required=False,
                        help="training file")
    parser.add_argument("--evaluate_file",
                        default='../data/eval_touzi.xlsx',
                        type=str,
                        required=False,
                        help="evaluation file")
    parser.add_argument("--do_evaluate",
                        action="store_true",
                        help="Whether to run evaluate.")
    # NOTE: default=True together with store_true means this flag is always True.
    parser.add_argument("--do_predict",
                        default=True,
                        action="store_true",
                        help="Whether to run predict.")
    parser.add_argument("--batch_size",
                        default=16,
                        type=int,
                        required=False,
                        help="batch size for train and eval")
    args = parser.parse_args()

    if not os.path.exists("embeddings.pkl"):
        train_df = pd.read_csv(args.train_file, sep='\t')
        candidate_title = train_df['best_title'].tolist()
        candidate_reply = train_df["reply"].tolist()
        titles = []
        for title in tqdm(candidate_title, desc='Segmenting candidate questions'):
            titles.append(seg.cut(title))
        embeddings = []
        for i in trange(0, len(titles), 16, desc='Computing ELMo sentence embeddings'):
            mini_embeddings = e.sents2elmo(titles[i:min(len(titles), i + 16)])
            for mini_embedding in mini_embeddings:
                # Sentence vector: average over the word vectors.
                embeddings.append(np.mean(mini_embedding, axis=0))
            if i == 0:
                print(len(embeddings))
                print(embeddings[0].shape)
        print("Candidate question embeddings computed; saving...")
        with open("embeddings.pkl", 'wb') as fout:
            pickle.dump([candidate_title, candidate_reply, embeddings], fout)
    else:
        with open("embeddings.pkl", 'rb') as fin:
            candidate_title, candidate_reply, embeddings = pickle.load(fin)

    if args.do_evaluate:
        # Column names ('问题' = question, '匹配问题' = matched question) are the
        # spreadsheet's own headers; the sheet name '投资知道' is kept as-is.
        evaluate_df = pd.read_excel(args.evaluate_file, '投资知道')
        evaluate_df = evaluate_df[['问题', '匹配问题']]
        evaluate_df = evaluate_df[evaluate_df['问题'].notna()]
        evaluate_df = evaluate_df[evaluate_df['匹配问题'].notna()]
        questions = evaluate_df['问题'].tolist()
        matched_questions = evaluate_df['匹配问题'].tolist()
        matched_questions_indexs = []
        for q in matched_questions:
            flag = False
            for i, _q in enumerate(candidate_title):
                if q == _q:
                    matched_questions_indexs.append([i])
                    flag = True
                    break
            if not flag:
                matched_questions_indexs.append([-1])
        matched_questions_indexs = np.asarray(matched_questions_indexs)

        questions = [seg.cut(question.strip()) for question in questions]
        question_embedding = [np.mean(emb, 0) for emb in e.sents2elmo(questions)]
        scores = cosine_similarity(question_embedding, embeddings)
        sorted_indices = scores.argsort()[:, ::-1]
        mrr = mean_reciprocal_rank(sorted_indices == matched_questions_indexs)
        map_score = mean_average_precision(sorted_indices == matched_questions_indexs)
        logger.info("mean reciprocal rank: {}".format(mrr))
        logger.info("mean average precision: {}".format(map_score))

    if args.do_predict:
        while True:
            title = input("What is your question?\n")
            if len(str(title).strip()) == 0:
                continue
            title = [seg.cut(str(title).strip())]
            title_embedding = np.mean(e.sents2elmo(title)[0], 0).reshape(1, -1)
            scores = cosine_similarity(title_embedding, embeddings)[0]
            top5_indices = scores.argsort()[-5:][::-1]
            for index in top5_indices:
                print("Possible answer. Reference question: " + candidate_title[index]
                      + "\tAnswer: " + candidate_reply[index]
                      + "\tScore: " + str(scores[index]))
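
# --- Worked example (hypothetical numbers): how the comparison above builds a
# relevance matrix. `sorted_indices` has shape (n_queries, n_candidates) and
# `matched_questions_indexs` has shape (n_queries, 1), so `==` broadcasts
# row-wise, yielding at most one True per row (none when the match index is -1).
import numpy as np

sorted_indices = np.array([[2, 0, 1],
                           [1, 2, 0]])
matched_questions_indexs = np.array([[0], [-1]])
print(sorted_indices == matched_questions_indexs)
# [[False  True False]    -> query 0: correct candidate at rank 2 (MRR term 1/2)
#  [False False False]]   -> query 1: never matched (MRR term 0)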
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default="../data/right_samples.csv",
        type=str,
        required=False,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--evaluate_dir",
        default="../data/eval_touzi.xlsx",
        type=str,
        required=False,
        help="The evaluate data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        required=False,
        help="Model type selected in the list",
    )
    parser.add_argument(
        "--model_name_or_path",
        default='D:\\NLP\\my-wholes-models\\chinese_wwm_pytorch\\',
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list",
    )
    parser.add_argument(
        "--output_type",
        default="avg",
        type=str,
        required=False,
        choices=["pooled", "avg"],
        help="the type of output vector to use",
    )
    parser.add_argument(
        "--task_name",
        default="faq",
        type=str,
        required=False,
        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
    )
    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    # NOTE: default=True together with store_true means this flag is always True.
    parser.add_argument("--do_predict",
                        default=True,
                        action="store_true",
                        help="Whether to run prediction.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=True,
                        action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps",
                        type=int,
                        default=1000,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=5000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_cache",
                        action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        filename="BERT.log",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    set_seed(args)

    args.task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASS[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else "./cache")
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else "./cache")
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else "./cache")
    model = model.to(device)
    # Build (or load) mean-pooled embeddings for all candidate questions.
    if not os.path.exists("embeddings.pkl"):
        eval_dataset, candidate_title, candidate_reply = load_examples(
            args, tokenizer)
        outputs = evaluate(args, model, eval_dataset)
        candidate_embeddings = torch.cat([o.cpu().data for o in outputs]).numpy()
        torch.save([candidate_title, candidate_reply, candidate_embeddings],
                   "embeddings.pkl")
    else:
        candidate_title, candidate_reply, candidate_embeddings = torch.load(
            "embeddings.pkl")

    if args.do_eval:
        # Column names ('问题' = question, '匹配问题' = matched question) are the
        # spreadsheet's own headers; the sheet name '投资知道' is kept as-is.
        evaluate_df = pd.read_excel(args.evaluate_dir, '投资知道')
        evaluate_df = evaluate_df[['问题', '匹配问题']]
        evaluate_df = evaluate_df[evaluate_df['问题'].notna()]
        evaluate_df = evaluate_df[evaluate_df['匹配问题'].notna()]
        questions = evaluate_df['问题'].tolist()
        matched_questions = evaluate_df['匹配问题'].tolist()
        matched_questions_indexs = []
        # Find the candidate index of each matched question.
        for q in matched_questions:
            flag = False
            for i, _q in enumerate(candidate_title):
                if q == _q:
                    matched_questions_indexs.append([i])
                    flag = True
                    break
            if not flag:
                matched_questions_indexs.append([-1])
        matched_questions_indexs = np.asarray(matched_questions_indexs)

        examples = [
            InputExample(guid='eva_' + str(idx), text_a=title, text_b=None, label=1)
            for idx, title in enumerate(questions)
        ]
        dataset = from_examples2dataset(args, examples, tokenizer)
        outputs = evaluate(args, model, dataset)
        question_embedding = torch.cat([o.cpu().data for o in outputs]).numpy()

        scores = cosine_similarity(question_embedding, candidate_embeddings)
        sorted_indices = scores.argsort()[:, ::-1]
        mrr = mean_reciprocal_rank(matched_questions_indexs == sorted_indices)
        map_score = mean_average_precision(matched_questions_indexs == sorted_indices)
        print("mean reciprocal rank: {}".format(mrr))
        print("mean average precision: {}".format(map_score))
        logger.info("====" * 100)
        logger.info("mean reciprocal rank: {}".format(mrr))
        logger.info("mean average precision: {}".format(map_score))
        logger.info("====" * 100)

    if args.do_predict:
        while True:
            question = input("What is your question?\n")
            if len(str(question)) == 0:
                continue
            examples = [
                InputExample(guid=0, text_a=question, text_b=None, label=1)
            ]
            dataset = from_examples2dataset(args, examples, tokenizer)
            outputs = evaluate(args, model, dataset)
            question_embedding = torch.cat([o.cpu().data for o in outputs]).numpy()
            scores = cosine_similarity(question_embedding, candidate_embeddings)[0]
            top5 = scores.argsort()[-5:][::-1]
            for index in top5:
                print("Possible answer. Reference question: {}, answer: {}, score: {}".format(
                    candidate_title[index], candidate_reply[index],
                    str(scores[index])))
            print()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="training file")
    parser.add_argument("--batch_size",
                        default=64,
                        type=int,
                        required=False,
                        help="batch size for train and eval")
    args = parser.parse_args()

    # Load the dataset and cache the candidate embeddings.
    if not os.path.exists(args.train_file + "_embeddings.pkl"):
        train_df = pd.read_csv(args.train_file)
        candidates = train_df[train_df["is_best"] == 1][["title", "reply"]]
        candidate_title = candidates["title"].tolist()
        candidate_reply = candidates["reply"].tolist()
        titles = [seg.cut(title) for title in candidates["title"]]
        # List of numpy arrays, each of shape [seq_len, 1024].
        embeddings = e.sents2elmo(titles)
        # Average over words: a list of 1024-dimensional sentence vectors.
        candidate_embeddings = [np.mean(embedding, 0) for embedding in embeddings]
        with open(args.train_file + "_embeddings.pkl", "wb") as fout:
            pickle.dump([candidate_title, candidate_reply, candidate_embeddings],
                        fout)
    else:
        with open(args.train_file + "_embeddings.pkl", "rb") as fin:
            candidate_title, candidate_reply, candidate_embeddings = pickle.load(fin)

    df = pd.read_excel("../../dataset/faq/验证数据.xlsx", "投资知道")
    df = df[["问题", "匹配问题"]]
    df = df[df["匹配问题"].notna()]
    df = df[df["问题"].notna()]
    questions = df["问题"].tolist()
    matched_questions = df["匹配问题"].tolist()
    matched_questions_index = []
    for q in matched_questions:
        flg = False
        for i, _q in enumerate(candidate_title):
            if q == _q:
                matched_questions_index.append([i])
                flg = True
                break
        if not flg:
            matched_questions_index.append([-1])
    matched_questions_index = np.asarray(matched_questions_index)

    questions = [seg.cut(q.strip()) for q in questions]
    # ELMo embeddings for the new (evaluation) questions.
    question_embedding = [np.mean(emb, 0) for emb in e.sents2elmo(questions)]
    # Cosine similarity between each new question and every candidate.
    scores = cosine_similarity(question_embedding, candidate_embeddings)
    # Rank candidates by score, best first.
    sorted_indices = scores.argsort()[:, ::-1]
    # Use the ranking to compute mean reciprocal rank.
    mrr = mean_reciprocal_rank(sorted_indices == matched_questions_index)
    print("mean reciprocal rank: {}".format(mrr))
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
    )
    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps",
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_cache",
                        action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) if not os.path.exists(args.data_dir + "_bert_embeddings.pkl"): logger.info("Training/evaluation parameters %s", args) eval_dataset, candidate_title, candidate_reply = load_examples( args, args.task_name, tokenizer) sequence_outputs = evaluate(args, model, eval_dataset) candidate_embeddings = torch.cat([o for o in sequence_outputs]).numpy() with open(args.data_dir + "_bert_embeddings.pkl", "wb") as fout: pickle.dump( [candidate_title, candidate_reply, candidate_embeddings], fout) # 18677 * 768 else: with open(args.data_dir + "_bert_embeddings.pkl", "rb") as fin: candidate_title, candidate_reply, candidate_embeddings = pickle.load( fin) # code.interact(local=locals()) # load dataset # if not os.path.exists("embeddings.pkl"): # # code.interact(local=locals()) # candidate_embeddings = [np.mean(embedding, 0) for embedding in embeddings] # with open("embeddings.pkl", "wb") as fout: # pickle.dump([candidate_title, candidate_reply, embeddings], fout) # else: # with open("embeddings.pkl", "rb") as fin: # candidate_title, candidate_reply, embeddings = pickle.load(fin) df = pd.read_excel("../../dataset/faq/验证数据.xlsx", "投资知道") df = df[["问题", "匹配问题"]] df = df[df["匹配问题"].notna()] df = df[df["问题"].notna()] questions = df["问题"].tolist() matched_questions = df["匹配问题"].tolist() matched_questions_index = [] for q in matched_questions: flg = False for i, _q in enumerate(candidate_title): if q == _q: matched_questions_index.append([i]) flg = True break if flg == False: matched_questions_index.append([-1]) matched_questions_index = np.asarray(matched_questions_index) examples = [ InputExample(guid=0, text_a=title, text_b=None, label=1) for title in questions ] features = convert_examples_to_features( examples, tokenizer, label_list=[1], output_mode="classification", max_length=args.max_seq_length, pad_on_left=bool( args.model_type in ["xlnet"]), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, ) # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids) sequence_outputs = evaluate(args, model, dataset) # code.interact(local=locals()) question_embedding = torch.cat([o for o in sequence_outputs]).numpy() scores = cosine_similarity(question_embedding, candidate_embeddings) sorted_indices = scores.argsort()[:, ::-1] #[-5:][::-1] # code.interact(local=locals()) mmr = mean_reciprocal_rank(sorted_indices == matched_questions_index) print("mean reciprocal rank: {}".format(mmr))