def evalute(args, model, tokenizer, prefix=""):
    eval_task_names = ("span_detection", )
    eval_dataset, examples, features = load_and_cache_examples(
        args, eval_task_names, tokenizer, evaluate=True, output_examples=True)

    args.eval_batch_size = 1
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader,
                      desc="Evaluating",
                      position=0,
                      leave=True,
                      ncols=100):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "cls_index": batch[4],
                "p_mask": batch[5],
                "task": 2,
            }
            feature_indices = batch[3]
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits = output[0]
            start_top_index = output[1]
            end_logits = output[2]
            end_top_index = output[3]
            cls_logits = output[4]

            result = SpanDetectionResult(
                unique_id,
                start_logits,
                end_logits,
                start_top_index=start_top_index,
                end_top_index=end_top_index,
                cls_logits=cls_logits,
                top_n=model.config.start_n_top,
            )
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(eval_dataset))

    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions{}.json".format(prefix))
    output_best_file = os.path.join(
        args.output_dir, "best_predictions{}.json".format(prefix))

    start_n_top = model.config.start_n_top
    end_n_top = model.config.end_n_top

    predictions = compute_predictions_log_probs(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.min_answer_length,
        output_prediction_file,
        output_nbest_file,
        start_n_top,
        end_n_top,
        tokenizer,
        args.verbose_logging,
    )

    results = span_detection_evaluate(examples, predictions, output_best_file)
    return results
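# Note: `to_list` is used throughout the evaluation loops above and below but
# is not defined in this listing. A minimal sketch of the helper as it is
# commonly written in the Hugging Face SQuAD examples (an assumption about
# this repository's actual implementation):
def to_list(tensor):
    """Detach a tensor, move it to the CPU, and convert it to a Python list."""
    return tensor.detach().cpu().tolist()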
def evaluate(args, model, tokenizer, prefix="dev", step=0):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          set_type=prefix,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update({
                        "langs": (torch.ones(batch[0].shape, dtype=torch.int64) *
                                  args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    if prefix == 'test':
        with open(os.path.join(args.output_dir, args.test_prob_file), 'wb') as f:
            pickle.dump(all_results, f)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}_{}.json".format(prefix, step))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}_{}.json".format(prefix, step))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}_{}.json".format(prefix, step))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer)

    if prefix == 'dev':
        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        return results
    else:
        return None
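# A sketch of the imports the `evaluate` function above relies on. The SQuAD
# helpers (SquadResult, squad_evaluate, compute_predictions_logits,
# compute_predictions_log_probs) come from `transformers`; the project-specific
# pieces (load_and_cache_examples and the span_detection helpers used elsewhere
# in this file) are assumed to be defined in this repository, so the exact
# layout here is an assumption rather than the repository's actual import list.
import logging
import os
import pickle
import timeit

import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm
from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs, compute_predictions_logits, squad_evaluate)
from transformers.data.processors.squad import SquadResult

logger = logging.getLogger(__name__)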
def evalute(args, model, tokenizer, prefix=""):
    eval_task_names = ("span_detection", )
    eval_dataset = load_and_cache_examples(args,
                                           eval_task_names,
                                           tokenizer,
                                           evaluate=True)

    args.eval_batch_size = args.train_batch_size
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    device = 'cpu'
    #model.to(device)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    all_examples = []
    all_features = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader,
                      desc="Evaluating",
                      position=0,
                      leave=True,
                      ncols=100):
        model.eval()
        input_ids, attention_mask, token_type_ids, cls_index, p_mask = [
            t.squeeze(0).to(args.device) for t in batch[0:5]
        ]

        with torch.no_grad():
            inputs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "cls_index": cls_index,
                "p_mask": p_mask,
                "task": 2,
            }
            example_index = batch[5]
            unique_id = batch[6]
            outputs = model(**inputs)

        description_text, context_text, span_text, start_position_character = [
            t[0] for t in batch[-5:-1]
        ]
        example = SpanDetectionExample(
            description_text=description_text,
            context_text=context_text,
            span_text=span_text,
            start_position_character=start_position_character,
            unique_id=unique_id,
        )
        feature = span_detection_convert_example_to_features(
            example,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False,
            example_index=example_index,
            unique_id=unique_id,
        )

        start_logits = outputs[0]
        start_top_index = outputs[1]
        end_logits = outputs[2]
        end_top_index = outputs[3]
        cls_logits = outputs[4]

        result = SpanDetectionResult(
            unique_id,
            start_logits,
            end_logits,
            start_top_index=start_top_index,
            end_top_index=end_top_index,
            cls_logits=cls_logits,
            top_n=model.config.start_n_top,
        )
        all_results.append(result)
        all_examples.append(example)
        all_features.append(feature)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(eval_dataset))

    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions{}.json".format(prefix))
    output_best_file = os.path.join(
        args.output_dir, "best_predictions{}.json".format(prefix))

    start_n_top = model.config.start_n_top
    end_n_top = model.config.end_n_top

    predictions = compute_predictions_log_probs(
        all_examples,
        all_features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.min_answer_length,
        output_prediction_file,
        output_nbest_file,
        start_n_top,
        end_n_top,
        tokenizer,
        args.verbose_logging,
    )

    results = span_detection_evaluate(all_examples, predictions,
                                      output_best_file)
    return results
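# `SpanDetectionResult` is constructed in both span-detection loops above but
# is not defined in this listing. A minimal sketch of a compatible container,
# modeled on transformers' SquadResult with the extra `top_n` field; this is an
# assumption, and the repository's own class may differ:
class SpanDetectionResult:
    """Raw model outputs for a single span-detection feature."""

    def __init__(self, unique_id, start_logits, end_logits,
                 start_top_index=None, end_top_index=None, cls_logits=None,
                 top_n=None):
        self.unique_id = unique_id
        self.start_logits = start_logits
        self.end_logits = end_logits
        self.start_top_index = start_top_index
        self.end_top_index = end_top_index
        self.cls_logits = cls_logits
        self.top_n = top_n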