def evaluate(args, model, tokenizer, global_step=None):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(global_step))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in progress_bar(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            # These model types don't use token type (segment) ids
            if args.model_type in ["xlm", "roberta", "distilbert", "distilkobert", "xlm-roberta"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs.values()]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(global_step))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(global_step))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(global_step))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Write the evaluation result to a file
    output_dir = os.path.join(args.output_dir, 'eval')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_eval_file = os.path.join(
        output_dir,
        "eval_result_{}_{}.txt".format(list(filter(None, args.model_name_or_path.split("/"))).pop(), global_step),
    )

    with open(output_eval_file, "w", encoding='utf-8') as f:
        official_eval_results = eval_during_train(args, step=global_step)
        # Log each official metric and write it to the eval file (the file handle was
        # otherwise unused; this mirrors the identical write loop at the end of main())
        for key in sorted(official_eval_results.keys()):
            logger.info("  %s = %s", key, str(official_eval_results[key]))
            f.write(" {} = {}\n".format(key, str(official_eval_results[key])))
    results.update(official_eval_results)

    return results
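# `to_list` and `progress_bar` are used by evaluate() but not defined in this section.
# A minimal sketch of `to_list`, assuming it matches the helper from the standard
# transformers SQuAD example; skip this if the helper is already defined elsewhere in the file.
def to_list(tensor):
    # Detach from the autograd graph, move to CPU, and convert to a plain Python list
    return tensor.detach().cpu().tolist()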
def main(cli_args):
    # Read args from the JSON config file
    with open(os.path.join(cli_args.config_dir, cli_args.task, cli_args.config_file)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be larger than the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    init_logger()
    set_seed(args)

    logging.getLogger("transformers.data.metrics.squad_metrics").setLevel(logging.WARN)  # Reduce noisy metrics logs

    # Load pretrained model and tokenizer
    config = CONFIG_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path,
    )
    tokenizer = TOKENIZER_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path,
        do_lower_case=args.do_lower_case,
    )
    model = MODEL_FOR_QUESTION_ANSWERING[args.model_type].from_pretrained(
        args.model_name_or_path,
        config=config,
    )

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    model.to(args.device)

    # Log again now that output_dir and device have been resolved
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c)
            for c in sorted(glob.glob(args.output_dir + "/**/" + "pytorch_model.bin", recursive=True))
        )
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(logging.WARN)  # Reduce model loading logs
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1]
            model = MODEL_FOR_QUESTION_ANSWERING[args.model_type].from_pretrained(checkpoint)
            model.to(args.device)
            # evaluate() takes `global_step`, not `prefix`
            result = evaluate(args, model, tokenizer, global_step=global_step)
            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

        with open("eval_result.txt", "w", encoding='utf-8') as f:
            official_eval_results = eval_during_train(args)
            for key in sorted(official_eval_results.keys()):
                logger.info("  %s = %s", key, str(official_eval_results[key]))
                f.write(" {} = {}\n".format(key, str(official_eval_results[key])))
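# Sketch of a CLI entry point, assuming the script is run directly. The flag names are
# inferred from the attributes main() reads off cli_args (config_dir, task, config_file);
# the default for --config_dir is an assumption, not taken from the original.
if __name__ == "__main__":
    import argparse  # assumed available; move to the top-level imports if preferred

    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--task", type=str, required=True, help="Task name (sub-directory of config_dir)")
    cli_parser.add_argument("--config_dir", type=str, default="config", help="Directory containing the JSON configs")
    cli_parser.add_argument("--config_file", type=str, required=True, help="JSON file with training/evaluation args")

    cli_args = cli_parser.parse_args()
    main(cli_args)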