def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            # to_list: see the helper sketch after this function
            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
            # "simpler" models only use two.
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer,
        )

    if output_null_log_odds_file:
        import json

        with open(output_null_log_odds_file, "r") as fin:
            no_answer_probs = json.load(fin)
    else:
        no_answer_probs = None

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions, no_answer_probs)

    output_eval_file = os.path.join(args.output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))

    return results
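# The snippets in this file call a small helper `to_list` without defining it. In the
# upstream Hugging Face `run_squad.py` it is defined as below; this sketch assumes the
# same behavior: detach a tensor, move it to CPU, and convert it to a plain Python list.
def to_list(tensor):
    return tensor.detach().cpu().tolist()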
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            if args.model_type != "distilbert":
                # XLM doesn't use segment_ids
                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]

            example_indices = batch[3]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
            # "simpler" models only use two.
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    # Create args.output_dir if it does not exist yet
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    # Since n_gpu = 1 here anyway, eval_batch_size equals the per-GPU batch size
    eval_batch_size = args.per_gpu_eval_batch_size

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
            # "simpler" models only use two.
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples, features, all_results,
        args.n_best_size, args.max_answer_length, args.do_lower_case,
        output_prediction_file, output_nbest_file, output_null_log_odds_file,
        args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
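# A minimal, hypothetical driver for the variant above. It assumes the model and
# tokenizer come from transformers' Auto classes and that load_and_cache_examples
# picks up its data paths from further fields on `args`; every value below is a
# placeholder, not the original authors' configuration.
from argparse import Namespace

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

args = Namespace(
    output_dir="./out", local_rank=-1, n_gpu=1, per_gpu_eval_batch_size=8,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    version_2_with_negative=False, n_best_size=20, max_answer_length=30,
    do_lower_case=True, verbose_logging=False, null_score_diff_threshold=0.0,
)
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased").to(args.device)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# results = evaluate(args, model, tokenizer)  # also needs the data-loading fields on args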
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples = %d" % len(dataset))
    print("  Batch size = %d" % args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # Colored progress bar; assumes module-level `import sys` and `from colorama import Fore`
    eval_pbar = tqdm(
        total=len(dataset), position=0, leave=True, file=sys.stdout,
        bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET),
    )

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            feature_indices = batch[3]
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

        eval_pbar.update(batch[0].size(0))  # hiepnh
    eval_pbar.close()  # hiepnh

    evalTime = timeit.default_timer() - start_time
    print("  Evaluation done in total %f secs (%f sec per example)" % (evalTime, evalTime / len(dataset)))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples, features, all_results,
        args.n_best_size, args.max_answer_length, args.do_lower_case,
        output_prediction_file, output_nbest_file, output_null_log_odds_file,
        args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def test_mrc(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2],
            }
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            # mrc_forward returns (start_logits, end_logits, classification_logits, emb)
            outputs = model.mrc_forward(**inputs, output_embedding=True)
            start_logits, end_logits, classification_logits, emb = outputs

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits)
            else:
                # discard the trailing classification logits / embedding outputs
                start_logits, end_logits, *_ = output
                result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, config, model, tokenizer, prefix="", global_step=0):
    dataset, examples, features = load_and_cache_examples(args, config, tokenizer, evaluate=True, output_examples=True)

    config['eval']['eval_batch_size'] = config.eval.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=config.eval.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", config.eval.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if config.model.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if config.model.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * config.input.lang_id).to(args.device)}
                    )

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
            # "simpler" models only use two.
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(config.output.log_dir, f"predictions_{prefix}.json")
    output_nbest_file = os.path.join(config.output.log_dir, f"nbest_{config.model.n_best_size}_predictions_{prefix}.json")
    if config.input.version_2_with_negative:
        output_null_log_odds_file = os.path.join(config.output.log_dir, f"null_odds_{prefix}.json")
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if config.model.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            config.model.n_best_size, config.model.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top,
            config.input.version_2_with_negative, tokenizer, config.output.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            config.model.n_best_size, config.model.max_answer_length, config.model.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            config.output.verbose_logging, config.input.version_2_with_negative,
            config.model.null_score_diff_threshold, tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Save eval results to an output file as well
    if prefix == "-1":
        # evaluated at the end of training: store directly in log_dir
        output_eval_file = os.path.join(config.output.log_dir, "eval_results.tsv")
    else:
        # there is a 'prefix' subfolder
        output_eval_file = os.path.join(config.output.log_dir, prefix, "eval_results.tsv")

    if not os.path.exists(output_eval_file):
        # file does not exist yet: write the header first
        with open(output_eval_file, "a") as writer:
            writer.write("global_step\t" + "\t".join(results.keys()) + "\n")

    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
        line = [str(global_step)] + [str(r) for r in results.values()]
        writer.write("\t".join(line) + "\n")

    return results
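# The config-driven variant above mixes item access (config['eval'][...] = ...) with
# attribute access (config.eval.per_gpu_eval_batch_size). That works with dict-like
# config objects such as addict.Dict (or an omegaconf DictConfig); a minimal sketch of
# the structure it assumes, with placeholder values:
from addict import Dict

config = Dict({
    "eval": {"per_gpu_eval_batch_size": 8},
    "model": {"model_type": "bert", "n_best_size": 20, "max_answer_length": 30,
              "do_lower_case": True, "null_score_diff_threshold": 0.0},
    "input": {"version_2_with_negative": True, "lang_id": 0},
    "output": {"log_dir": "./logs", "verbose_logging": False},
})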
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            # example_indices says which entries of `features` this batch covers
            # (a long context is split across several features)
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            # tensor.item() only works for a tensor containing a single element
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
            # "simpler" models only use two.
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                # print("size of start_logits of {0} is {1}".format(unique_id, len(start_logits)))
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging,
        )
    else:
        # print('******* length of examples is {0} *******'.format(len(examples)))
        # print(len(examples)); print(len(features)); print(len(all_results))
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer,
            real_pred=args.do_real_pred,
        )

    # Compute the F1 and exact scores. Skip this step for real prediction (no ground
    # truth); the prediction files were already written above.
    if not args.do_real_pred:
        results = squad_evaluate(examples, predictions)
        # print('ok, {0}'.format(results))
        return results
    else:
        return None
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    if args.use_jit_trace:
        enable_tracing()

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    is_eval_traced = False

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # Habana doesn't support Long (int64) tensors, so convert the input tensors to int32
        if args.use_habana:
            batch[0] = batch[0].to(dtype=torch.int32)
            batch[1] = batch[1].to(dtype=torch.int32)
            batch[2] = batch[2].to(dtype=torch.int32)
        # see the compute_position_ids sketch after this function
        position_ids_cpu = compute_position_ids(batch[0])

        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        position_ids = position_ids_cpu.to(args.device)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "position_ids": position_ids,
            }
            tensor_dummy = torch.zeros(1).to(args.device)

            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )

            # Trace the model once on the first batch, then reuse the traced module
            if args.use_jit_trace and not is_eval_traced:
                model_trace = torch.jit.trace(
                    model,
                    (batch[0], batch[1], batch[2], position_ids,
                     tensor_dummy, tensor_dummy, tensor_dummy,
                     tensor_dummy, tensor_dummy, tensor_dummy),
                    check_trace=False,
                )
                is_eval_traced = True
                model_trace.eval()

            if args.use_jit_trace:
                outputs = model_trace(
                    batch[0], batch[1], batch[2], position_ids,
                    tensor_dummy, tensor_dummy, tensor_dummy,
                    tensor_dummy, tensor_dummy, tensor_dummy,
                )
            else:
                outputs = model(**inputs)

        feature_indices = feature_indices.to("cpu")
        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
            # "simpler" models only use two.
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
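# `compute_position_ids` is not defined in this snippet. A plausible implementation,
# assuming standard BERT-style absolute positions (0 .. seq_len-1 for every row) and
# the int32 constraint mentioned above, might look like this; the real helper in the
# original codebase may differ.
import torch

def compute_position_ids(input_ids):
    seq_len = input_ids.size(1)
    # one row of [0, 1, ..., seq_len-1] per sequence in the batch
    return torch.arange(seq_len, dtype=torch.int32).unsqueeze(0).expand(input_ids.size(0), -1)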
def evaluate(args, model, tokenizer, prefix="", global_step=None):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in progress_bar(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "distilkobert", "xlm-roberta"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
            # "simpler" models only use two.
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples, features, all_results,
        args.n_best_size, args.max_answer_length, args.do_lower_case,
        output_prediction_file, output_nbest_file, output_null_log_odds_file,
        args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Write the evaluation result to a file
    output_dir = os.path.join(args.output_dir, 'eval')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_eval_file = os.path.join(
        output_dir,
        "eval_result_{}_{}.txt".format(list(filter(None, args.model_name_or_path.split("/"))).pop(), global_step),
    )

    logger.info("***** Official Eval results *****")
    with open(output_eval_file, "w", encoding='utf-8') as f:
        official_eval_results = eval_during_train(args)
        for key in sorted(official_eval_results.keys()):
            logger.info("  %s = %s", key, str(official_eval_results[key]))
            f.write(" {} = {}\n".format(key, str(official_eval_results[key])))

    return results
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    dataset_cached = "./dataset_cached"
    if not os.path.exists(dataset_cached):
        os.makedirs(dataset_cached)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Calibrate on roughly 5% of the dataset
    calibration_iteration = int((len(dataset) * 0.05 + args.eval_batch_size - 1) / args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils

        model = mkldnn_utils.to_mkldnn(model)
        print(model)

    all_results = []
    evalTime = 0
    nb_eval_steps = 0
    perf = 0  # avoids a NameError if the warmup threshold is never reached

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        if calibration and nb_eval_steps >= calibration_iteration:
            break

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
            if args.model_type != 'distilbert':
                # XLM doesn't use segment_ids
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            if nb_eval_steps >= args.warmup:
                start_time = timeit.default_timer()
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
            # "simpler" models only use two.
            if len(output) >= 5:
                start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                result = SquadResult(
                    unique_id, start_logits, end_logits,
                    start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits)
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

        if nb_eval_steps >= args.warmup:
            evalTime += (timeit.default_timer() - start_time)
        nb_eval_steps += 1

    if nb_eval_steps >= args.warmup:
        perf = (len(eval_dataloader) - args.warmup) * args.eval_batch_size / evalTime
        logger.info("Evaluation done in total %f secs (%f samples/sec)", evalTime, perf)
    else:
        logger.info("***** no performance measured; please check the dataset length and warmup number *****")

    # Compute predictions
    output_prediction_file = os.path.join(dataset_cached, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(dataset_cached, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    elif not calibration:
        predictions = compute_predictions_logits(
            examples, features, all_results,
            args.n_best_size, args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores (calibration runs return no scores).
    results = None
    if not calibration:
        results = squad_evaluate(examples, predictions)
    return results, perf
def run_prediction(question_texts, context_text):
    """Setup function to compute predictions"""
    # Relies on module-level model, tokenizer, device and the decoding globals
    # (n_best_size, max_answer_length, do_lower_case, null_score_diff_threshold).
    examples = []
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=4,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples, features, all_results,
        n_best_size, max_answer_length, do_lower_case,
        output_prediction_file, output_nbest_file, output_null_log_odds_file,
        False,  # verbose_logging
        True,   # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions


# context = "New Zealand (Māori: Aotearoa) is a sovereign island country in the southwestern Pacific Ocean. It has a total land area of 268,000 square kilometres (103,500 sq mi), and a population of 4.9 million. New Zealand's capital city is Wellington, and its most populous city is Auckland."
# questions = ["How many people live in New Zealand?",
#              "What's the largest city?"]
#
# # Run method
# predictions = run_prediction(questions, context)
#
# # Print results
# for key in predictions.keys():
#     print(predictions[key])
def evaluate(args, model_path1, model1, model2, model3, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, model_path1, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model1, torch.nn.DataParallel):
        model1 = torch.nn.DataParallel(model1)
    if args.n_gpu > 1 and not isinstance(model2, torch.nn.DataParallel):
        model2 = torch.nn.DataParallel(model2)
    if args.n_gpu > 1 and not isinstance(model3, torch.nn.DataParallel):
        model3 = torch.nn.DataParallel(model3)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model1.eval()
        model2.eval()
        model3.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            feature_indices = batch[3]
            outputs1 = model1(**inputs)
            outputs2 = model2(**inputs)
            outputs3 = model3(**inputs)
            # print("outputs1", outputs1)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output1 = [to_list(o[i]) for o in outputs1]
            output2 = [to_list(o[i]) for o in outputs2]
            output3 = [to_list(o[i]) for o in outputs3]

            start_logits1, end_logits1 = output1
            start_logits2, end_logits2 = output2
            start_logits3, end_logits3 = output3

            # TODO homework: ensemble the three models with weights 0.4, 0.2, 0.4 to compute
            # start_logits and end_logits. The logits here are plain Python lists (the output
            # of to_list), so they must be combined elementwise.
            weight = [0.4, 0.2, 0.4]
            start_logits = [
                weight[0] * s1 + weight[1] * s2 + weight[2] * s3
                for s1, s2, s3 in zip(start_logits1, start_logits2, start_logits3)
            ]
            end_logits = [
                weight[0] * e1 + weight[1] * e2 + weight[2] * e3
                for e1, e2, e3 in zip(end_logits1, end_logits2, end_logits3)
            ]

            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples, features, all_results,
        args.n_best_size, args.max_answer_length, args.do_lower_case,
        output_prediction_file, output_nbest_file, output_null_log_odds_file,
        args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
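# The weighted ensemble above combines plain Python lists (the output of to_list), hence
# the elementwise zip. The same combination as a standalone helper with numpy, shown only
# as an illustrative sketch (`ensemble_logits` is not part of the original code):
import numpy as np

def ensemble_logits(logits_per_model, weights=(0.4, 0.2, 0.4)):
    # logits_per_model: one equal-length list of logits per model
    stacked = np.stack([np.asarray(l) for l in logits_per_model])
    return np.average(stacked, axis=0, weights=weights).tolist()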
def evaluate(args, model, tokenizer, prefix=""):
    languages = args.language.split(',')
    all_languages_results = {}
    processor = MLQAProcessor()

    for split, lang in itertools.product(["dev", "test"], languages):
        print("evaluating on {0} {1}".format(split, lang))
        dataset, examples, features = load_and_cache_examples(
            args, tokenizer, language=lang, split=split, output_examples=True)

        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu evaluate
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_results = []
        start_time = timeit.default_timer()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }
                if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                    del inputs["token_type_ids"]

                example_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if args.model_type in ["xlnet", "xlm"]:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                    # for lang_id-sensitive xlm models
                    if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                        inputs.update(
                            {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                        )

                outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other
                # "simpler" models only use two.
                if len(output) >= 5:
                    start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                    result = SquadResult(
                        unique_id, start_logits, end_logits,
                        start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                    )
                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        evalTime = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

        # Compute predictions
        output_prediction_file = os.path.join(args.output_dir, "{}.prediction".format(lang))
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}_{}_{}.json".format(prefix, split, lang))
        if args.version_2_with_negative:
            output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}_{}_{}.json".format(prefix, split, lang))
        else:
            output_null_log_odds_file = None

        # XLNet and XLM use a more complex post-processing procedure
        if args.model_type in ["xlnet", "xlm"]:
            start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
            end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                examples, features, all_results,
                args.n_best_size, args.max_answer_length,
                output_prediction_file, output_nbest_file, output_null_log_odds_file,
                start_n_top, end_n_top,
                args.version_2_with_negative, tokenizer, args.verbose_logging,
            )
        else:
            predictions = compute_predictions_logits(
                examples, features, all_results,
                args.n_best_size, args.max_answer_length, args.do_lower_case,
                output_prediction_file, output_nbest_file, output_null_log_odds_file,
                args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold,
                tokenizer,
                map_to_origin=not (args.model_type == "xlmr" and lang == 'zh'),
            )

        # Compute the F1 and exact scores with the official MLQA script.
        # results = squad_evaluate(examples, predictions)
        results = evaluate_with_path(
            processor.get_dataset_path(args.data_dir, split, lang), output_prediction_file, lang)
        all_languages_results["{0}_{1}".format(split, lang)] = results

    for split in ["dev", "test"]:
        all_languages_results["{0}_avg".format(split)] = average_dic(
            [value for key, value in all_languages_results.items() if split in key])

    return all_languages_results
def run_prediction(model, tokenizer, device, output_dir, filename, question_texts, context_text):
    """Setup function to compute predictions"""
    examples = []
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    output_prediction_file = os.path.join(output_dir, filename + "_predictions.json")
    output_nbest_file = os.path.join(output_dir, filename + "_nbest_predictions.json")
    output_null_log_odds_file = os.path.join(output_dir, filename + "_null_predictions.json")

    predictions = compute_predictions_logits(
        examples, features, all_results,
        n_best_size, max_answer_length, do_lower_case,
        output_prediction_file, output_nbest_file, output_null_log_odds_file,
        False,  # verbose_logging
        True,   # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions
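# Hypothetical usage of the variant above, mirroring the commented example after the
# first run_prediction; it additionally assumes the module-level decoding globals
# (n_best_size, max_answer_length, do_lower_case, null_score_diff_threshold) are set.
# predictions = run_prediction(
#     model, tokenizer, device, output_dir="./out", filename="demo",
#     question_texts=["Who wrote Hamlet?"],
#     context_text="Hamlet is a tragedy written by William Shakespeare.",
# )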
def predict(
    self,
    query: Union[List[str], str],
    context: Union[List[str], str],
    n_best_size: int = 5,
    mini_batch_size: int = 32,
    max_answer_length: int = 10,
    do_lower_case: bool = False,
    version_2_with_negative: bool = False,
    verbose_logging: bool = False,
    null_score_diff_threshold: float = 0.0,
    max_seq_length: int = 512,
    doc_stride: int = 128,
    max_query_length: int = 64,
    **kwargs,
) -> Tuple[Tuple[str, List[OrderedDict]], Tuple[OrderedDict, OrderedDict]]:
    """
    Predict method for running inference with the pre-trained question answering model

    * **query** - String or list of strings specifying the ordered questions corresponding to `context`
    * **context** - String or list of strings specifying the ordered contexts corresponding to `query`
    * **n_best_size** - Number of top results you want returned
    * **mini_batch_size** - Mini batch size
    * **max_answer_length** - Maximum token length for returned answers
    * **do_lower_case** - Set to `True` if using uncased QA models
    * **version_2_with_negative** - Set to `True` if using a QA model trained on SQuAD 2.0
    * **verbose_logging** - Set to `True` if you want verbose prediction logging
    * **null_score_diff_threshold** - Threshold for predicting null (no answer) with a SQuAD 2.0 model.
      Default is 0.0; raise this if you want fewer null answers
    * **max_seq_length** - Maximum context token length. Check the model config for the maximum
      sequence length the model was trained with
    * **doc_stride** - Number of token strides to take when splitting the context into chunks of size `max_seq_length`
    * **max_query_length** - Maximum token length for queries
    * ****kwargs** (optional) - Optional arguments for the Transformers model (mostly for saving evaluations)
    """
    # Make string input consistent as a list
    if isinstance(query, str):
        query = [query]
        context = [context]
    assert len(query) == len(context)

    examples = self._mini_squad_processor(query=query, context=context)
    features, dataset = squad_convert_examples_to_features(
        examples,
        self.tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    all_results = []
    with torch.no_grad():
        dataloader = DataLoader(dataset, batch_size=mini_batch_size)
        for batch in tqdm(dataloader, desc="Predicting answer"):
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]

            if isinstance(
                self.model,
                (
                    XLMForQuestionAnswering,
                    RobertaForQuestionAnswering,
                    DistilBertForQuestionAnswering,
                    CamembertForQuestionAnswering,
                ),
            ):
                del inputs["token_type_ids"]

            # XLNet and XLM use more arguments for their predictions
            if isinstance(self.model, self.xmodel_instances):
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(self.model, "config") and hasattr(self.model.config, "lang2id"):
                    # Set language id as 0 for now
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * 0).to(self.device)}
                    )

            outputs = self.model(**inputs)

            # Iterate through and produce `SquadResult`s
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [self.to_list(output[i]) for output in outputs]
                if isinstance(self.model, self.xmodel_instances):
                    # Models like the ones in `self.xmodel_instances` use 5 arguments for their predictions
                    start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
                    result = SquadResult(
                        unique_id, start_logits, end_logits,
                        start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits,
                    )
                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    if isinstance(self.model, self.xmodel_instances):
        start_n_top = (
            self.model.config.start_n_top if hasattr(self.model, "config")
            else self.model.module.config.start_n_top
        )
        end_n_top = (
            self.model.config.end_n_top if hasattr(self.model, "config")
            else self.model.module.config.end_n_top
        )
        answers, n_best = compute_predictions_log_probs(
            examples, features, all_results,
            n_best_size, max_answer_length,
            start_n_top, end_n_top,
            version_2_with_negative, self.tokenizer, verbose_logging,
            **kwargs,
        )
    else:
        answers, n_best = compute_predictions_logits(
            examples, features, all_results,
            n_best_size, max_answer_length, do_lower_case,
            verbose_logging, version_2_with_negative, null_score_diff_threshold,
            self.tokenizer,
            **kwargs,
        )

    return answers, n_best
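# Hypothetical call of the predict() method above. The owning class is not shown here;
# `qa` stands in for an instance with self.model, self.tokenizer, self.device and
# self.xmodel_instances already set up.
# answers, n_best = qa.predict(
#     query="What is the capital of New Zealand?",
#     context="New Zealand's capital city is Wellington, and its most populous city is Auckland.",
#     n_best_size=5,
# )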
def evaluate(self, dataset, examples, features, prefix=""):
    eval_batch_size = 8
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        self._model.eval()
        batch = tuple(t.to(self._device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            if self._model_name in ['xlnet']:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
            print("Coding: inputs ", inputs)
            outputs = self._model(**inputs)

        # the feature is needed to map each result back to its example
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            print("Coding: unique_id ", unique_id)

            # only XLNet-style five-output heads are handled here
            output = [self._to_list(output[i]) for output in outputs]
            start_logits, start_top_index, end_logits, end_top_index, cls_logits = output[:5]
            result = SquadResult(
                unique_id, start_logits, end_logits,
                start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits)
            all_results.append(result)

    # Compute predictions
    output_dir = os.getcwd()
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))

    version_2_with_negative = True
    if version_2_with_negative:
        output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    n_best_size = 20
    max_answer_length = 30
    verbose_logging = True

    if self._model_name in ['xlnet']:
        start_n_top = self._model.config.start_n_top
        end_n_top = self._model.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results,
            n_best_size, max_answer_length,
            output_prediction_file, output_nbest_file, output_null_log_odds_file,
            start_n_top, end_n_top,
            version_2_with_negative, self._tokenizer, verbose_logging)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
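# A hedged sketch of producing the (dataset, examples, features) triple that the method
# above expects, using the standard transformers SQuAD processors; the paths and the
# `qa_wrapper` instance are placeholders.
from transformers.data.processors.squad import SquadV2Processor, squad_convert_examples_to_features

processor = SquadV2Processor()
examples = processor.get_dev_examples("./data", filename="dev-v2.0.json")
features, dataset = squad_convert_examples_to_features(
    examples=examples, tokenizer=tokenizer, max_seq_length=384,
    doc_stride=128, max_query_length=64, is_training=False, return_dataset="pt",
)
# results = qa_wrapper.evaluate(dataset, examples, features, prefix="demo")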