Example No. 1
    def compute_predictions(self, eval_results, prefix, examples, features):
        output_prediction_file = os.path.join(
            self.output_path, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            self.output_path, "nbest_predictions_{}.json".format(prefix))
        if self.version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                self.output_path, "null_odds_{}.json".format(prefix))
        else:
            output_null_log_odds_file = None

        if self.model_type in ['xlnet', 'xlm']:
            # XLNet uses a more complex post-processing procedure
            write_predictions_extended(
                examples, features, eval_results, self.n_best_size,
                self.max_answer_length, output_prediction_file,
                output_nbest_file, output_null_log_odds_file,
                self.model.config.start_n_top, self.model.config.end_n_top,
                self.version_2_with_negative, self.tokenizer, False)
        else:
            write_predictions(
                examples, features, eval_results, self.n_best_size,
                self.max_answer_length, self.do_lower_case, output_prediction_file,
                output_nbest_file, output_null_log_odds_file, False,
                self.version_2_with_negative, self.null_score_diff_threshold)
        return output_prediction_file, output_null_log_odds_file
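For reference, the eval_results consumed above are the lightweight result containers built during inference (the same RawResult / RawResultExtended objects used in the later examples). A minimal, self-contained sketch of how they are typically declared; only the field names are taken from the calls in these examples, the declaration itself is an assumption:

import collections

# Plausible declarations of the result containers used throughout these examples.
RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])
RawResultExtended = collections.namedtuple(
    "RawResultExtended",
    ["unique_id", "start_top_log_probs", "start_top_index",
     "end_top_log_probs", "end_top_index", "cls_logits"])

# The kind of object compute_predictions() expects in eval_results (values are illustrative):
eval_results = [RawResult(unique_id=1000000001,
                          start_logits=[0.1, 2.3, -1.0],
                          end_logits=[-0.5, 1.7, 0.2])]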
Example No. 2
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and (args.local_rank in [-1, 0] or
                                                args.no_distributed_training):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if (
        args.local_rank == -1
        or args.no_distributed_training) else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type == 'xlm' else
                batch[2]  # XLM doesn't use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.predict_file,
            model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case,
                          output_prediction_file, output_nbest_file,
                          output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative,
                          args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)

    return results
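The batching above relies on the dataset tensors lining up with the positional indices read in the loop (batch[0] = input_ids, batch[1] = attention_mask, batch[2] = token_type_ids, batch[3] = example indices). A small self-contained sketch with dummy shapes; the shapes and values are assumptions for illustration only:

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

# Dummy stand-in for the cached SQuAD features; only the tensor order mirrors the code above.
dummy = TensorDataset(torch.zeros(4, 8, dtype=torch.long),   # batch[0]: input_ids
                      torch.ones(4, 8, dtype=torch.long),    # batch[1]: attention_mask
                      torch.zeros(4, 8, dtype=torch.long),   # batch[2]: token_type_ids
                      torch.arange(4))                       # batch[3]: example indices
loader = DataLoader(dummy, sampler=SequentialSampler(dummy), batch_size=2)
for batch in loader:
    print([tuple(t.shape) for t in batch])   # [(2, 8), (2, 8), (2, 8), (2,)]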
Example No. 3
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            if args.model_type in ["xlnet", "xlm"]:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]),
                )
            else:
                result = RawResult(
                    unique_id=unique_id,
                    start_logits=to_list(outputs[0][i]),
                    end_logits=to_list(outputs[1][i]),
                )

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.predict_file,
            model.config.start_n_top,
            model.config.end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        write_predictions(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
        )

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(
        data_file=args.predict_file,
        pred_file=output_prediction_file,
        na_prob_file=output_null_log_odds_file,
    )
    results = evaluate_on_squad(evaluate_options)
    return results
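One detail specific to this variant is the langs tensor added for lang_id-sensitive XLM models: one language id per token, with the same shape as input_ids. A minimal standalone sketch; the concrete shapes and the lang_id value are assumptions:

import torch

input_ids = torch.zeros(2, 6, dtype=torch.long)            # stand-in for batch[0]
lang_id = 0                                                # stand-in for args.lang_id
langs = torch.ones(input_ids.shape, dtype=torch.int64) * lang_id
print(langs.shape, langs.dtype)                            # torch.Size([2, 6]) torch.int64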
Example No. 4
def evaluate(args, model, tokenizer, prefix=""):
    global_rank = -1 if args.local_rank == -1 else torch.distributed.get_rank()
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    if args.preprocess_only:
        return

    write_dir = args.output_dir if args.write_dir is None else args.write_dir
    if not os.path.exists(write_dir) and global_rank in [-1, 0]:
        os.makedirs(write_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)  # if global_rank == -1 else DistributedSampler(dataset)  # No distributed eval to eval on full dev set
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': None if args.model_type in ['xlm', 'roberta'] else batch[2]  # XLM and RoBERTa don't use segment_ids
                      }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4],
                               'p_mask':    batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id            = unique_id,
                                           start_top_log_probs  = to_list(outputs[0][i]),
                                           start_top_index      = to_list(outputs[1][i]),
                                           end_top_log_probs    = to_list(outputs[2][i]),
                                           end_top_index        = to_list(outputs[3][i]),
                                           cls_logits           = to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id    = unique_id,
                                   start_logits = to_list(outputs[0][i]),
                                   end_logits   = to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(write_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(write_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(write_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions_extended(
                        examples, features, all_results, args.n_best_size,
                        args.max_answer_length, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.predict_file,
                        model.config.start_n_top, model.config.end_n_top,
                        args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions(
                        examples, features, all_results, args.n_best_size,
                        args.max_answer_length, args.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                        args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    def filter_keys(qid_to_val, task_name):
        task_name = task_name.lower()
        assert task_name in {'hotpot', 'squad'}, 'task_name {} not implemented.'.format(task_name)
        return {qid: val for qid, val in qid_to_val.items() if len(qid.split('.')) == (2 if task_name == 'hotpot' else 1)}

    if len(filter_keys(all_predictions, 'squad')) == 0:
        results = {}  # No SQuAD data in evaluation set
    else:
        squad_output_prediction_file = os.path.join(write_dir, "squad_predictions_{}.json".format(prefix))
        with open(squad_output_prediction_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_predictions, 'squad'), indent=2))
        squad_output_nbest_file = os.path.join(write_dir, "squad_nbest_predictions_{}.json".format(prefix))
        with open(squad_output_nbest_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_nbest_predictions, 'squad'), indent=2))
        if args.version_2_with_negative:
            squad_output_null_log_odds_file = os.path.join(write_dir, "squad_null_odds_{}.json".format(prefix))
            with open(squad_output_null_log_odds_file, 'w') as writer:
                writer.write(json.dumps(filter_keys(all_null_odds, 'squad'), indent=2))
        else:
            squad_output_null_log_odds_file = None
        predict_file_parts = args.predict_file.split('/')
        squad_predict_file = '/'.join(predict_file_parts[:-2] + ['squad', predict_file_parts[-1]])
        evaluate_options = EVAL_OPTS(data_file=squad_predict_file,
                                     pred_file=squad_output_prediction_file,
                                     na_prob_file=squad_output_null_log_odds_file)
        results = evaluate_on_squad(evaluate_options)

    # Check if HotpotQA answer file exists to do HotpotQA evaluation
    hotpot_answer_file_parts = args.predict_file.split('/')
    hotpot_answer_file_parts[-2] = 'hotpot-orig'
    hotpot_answer_file = '/'.join(hotpot_answer_file_parts)
    if (not args.no_answer_file) and (not os.path.exists(hotpot_answer_file)):
        with open(os.path.join(write_dir, "squad_results_{}.json".format(prefix)), "w") as writer:
            writer.write(json.dumps(results, indent=2, sort_keys=True))
        return results

    # Evaluate with official HotpotQA script
    nbest_predictions = filter_keys(all_nbest_predictions, 'hotpot')
    null_odds = filter_keys(all_null_odds, 'hotpot')

    qids = {single_hop_qid.split('.')[0] for single_hop_qid in nbest_predictions.keys()}
    pred_answers_and_sps = {'answer': {}, 'sp': {}}
    globally_normed_pred_answers_and_sps = {'answer': {}, 'sp': {}}
    pred_infos = {}
    globally_normed_pred_infos = {}
    max_num_paragraphs = 10
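    # For each original HotpotQA question, scan its single-hop sub-questions
    # ("<qid>.<paragraph_no>", up to max_num_paragraphs of them): keep the paragraph
    # with the lowest null odds as the answer source, and (for non-XLNet/XLM models)
    # also track the candidate with the best start+end logit sum minus that
    # paragraph's null odds as a globally normalized alternative.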
    for qid in qids:
        # Find paragraph with answer prediction
        min_null_odds = float('inf')
        max_logit_sum = float('-inf')
        best_single_hop_qid = None
        for paragraph_no in range(max_num_paragraphs):
            single_hop_qid = qid + '.' + str(paragraph_no)
            if (single_hop_qid in null_odds) and (null_odds[single_hop_qid] < min_null_odds):
                best_single_hop_qid = single_hop_qid
                min_null_odds = null_odds[single_hop_qid]
            if single_hop_qid in nbest_predictions:
                for nbest_prediction in nbest_predictions[single_hop_qid]:
                    if (len(nbest_prediction['text']) > 0) and (args.model_type not in ['xlnet', 'xlm']):
                        logit_sum = nbest_prediction['start_logit'] + nbest_prediction['end_logit'] - null_odds[single_hop_qid]
                        if logit_sum > max_logit_sum:
                            globally_normed_pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                            globally_normed_pred_infos[qid] = nbest_prediction
                            max_logit_sum = logit_sum

        # Find/store answer and supporting fact
        pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        globally_normed_pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        for nbest_prediction in nbest_predictions[best_single_hop_qid]:
            if len(nbest_prediction['text']) > 0:
                pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                pred_infos[qid] = nbest_prediction
                break
        assert qid in pred_answers_and_sps['answer'], 'Error: No predicted answer found.'
        # assert qid in globally_normed_pred_answers_and_sps['answer'], 'Error: No globally normed predicted answer found.'

    hotpot_output_prediction_file = os.path.join(write_dir, "hotpot_predictions_{}.json".format(prefix))
    with open(hotpot_output_prediction_file, "w") as writer:
        writer.write(json.dumps(pred_answers_and_sps, indent=2))
    hotpot_results = evaluate_on_hotpot(hotpot_output_prediction_file, hotpot_answer_file) if not args.no_answer_file else {}
    with open(os.path.join(write_dir, "hotpot_predictions_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(pred_infos, indent=2))

    hotpot_output_prediction_gn_file = os.path.join(write_dir, "hotpot_predictions_gn_{}.json".format(prefix))
    with open(hotpot_output_prediction_gn_file, "w") as writer:
        writer.write(json.dumps(globally_normed_pred_answers_and_sps, indent=2))
    hotpot_gn_results = evaluate_on_hotpot(hotpot_output_prediction_gn_file, hotpot_answer_file) \
        if ((not args.no_answer_file) and (args.model_type not in ['xlnet', 'xlm'])) else {}
    with open(os.path.join(write_dir, "hotpot_predictions_gn_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(globally_normed_pred_infos, indent=2))

    hotpot_results = {k: v * 100. for k, v in hotpot_results.items()}
    hotpot_gn_results = {'gn_' + k: v * 100. for k, v in hotpot_gn_results.items()}
    results = {'squad_' + k: v for k, v in results.items()}
    results.update(hotpot_results)
    results.update(hotpot_gn_results)
    with open(os.path.join(write_dir, "hotpot_results_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(results, indent=2, sort_keys=True))
    return results
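The SQuAD/HotpotQA split above hinges entirely on a qid naming convention: single-hop HotpotQA predictions are keyed "<qid>.<paragraph_no>", while SQuAD predictions use the bare qid with no dot. A toy, self-contained illustration of the same filter (the qids and answers below are made up):

def filter_keys(qid_to_val, task_name):
    task_name = task_name.lower()
    assert task_name in {'hotpot', 'squad'}, 'task_name {} not implemented.'.format(task_name)
    return {qid: val for qid, val in qid_to_val.items()
            if len(qid.split('.')) == (2 if task_name == 'hotpot' else 1)}

preds = {'abc123.0': 'Paris', 'abc123.1': '', 'def456': 'in 1997'}
print(filter_keys(preds, 'hotpot'))   # {'abc123.0': 'Paris', 'abc123.1': ''}
print(filter_keys(preds, 'squad'))    # {'def456': 'in 1997'}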
Example No. 5
def predict(args, model, tokenizer, prefix="test"):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    pred_dataloader = DataLoader(dataset,
                                 batch_size=args.pred_batch_size,
                                 shuffle=False)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.pred_batch_size)
    all_results = []
    for batch in tqdm(pred_dataloader, desc="Predicting"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type == 'xlm' else
                batch[2]  # XLM doesn't use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            pred_feature = features[example_index.item()]
            unique_id = int(pred_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.save_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.predict_file,
            model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case,
                          output_prediction_file, output_nbest_file,
                          output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative,
                          args.null_score_diff_threshold)

    output_pred_file = os.path.join(args.save_dir, 'submit.csv')
    convert_json_to_csv(output_nbest_file, output_pred_file,
                        args.max_answer_length)
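The last step converts the n-best JSON into a submission CSV via convert_json_to_csv, whose implementation is not shown. A hypothetical sketch of what such a helper might do, assuming the usual n-best format of {qid: [{"text": ..., "probability": ...}, ...]} produced by write_predictions; this is not the original helper:

import csv
import json

def nbest_json_to_csv(nbest_file, csv_file, max_answer_length):
    # Hypothetical stand-in for convert_json_to_csv: take the top candidate per qid,
    # truncate it to max_answer_length characters, and write an id/answer CSV.
    with open(nbest_file) as f:
        nbest = json.load(f)
    with open(csv_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "answer"])
        for qid, candidates in nbest.items():
            best = candidates[0]["text"][:max_answer_length] if candidates else ""
            writer.writerow([qid, best])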
Example No. 6
    def evaluate(self, dataset, examples, features, model, tokenizer, prefix=1):
        eval_batch_size = 8  # default; originally args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset) # if args.local_rank == -1 else DistributedSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)
        s_pred_file = os.path.join(os.getcwd(), "predict.json")

        all_results = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(self._device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2],  
                          'cls_index':      batch[4],
                          'p_mask':         batch[5]
                         }
                example_indices = batch[3]
                outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id=unique_id,
                                           start_top_log_probs=self._to_list(outputs[0][i]),
                                           start_top_index=self._to_list(outputs[1][i]),
                                           end_top_log_probs=self._to_list(outputs[2][i]),
                                           end_top_index=self._to_list(outputs[3][i]),
                                           cls_logits=self._to_list(outputs[4][i]))
                all_results.append(result)

        # Compute predictions
        output_dir = os.getcwd()
        output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
        version_2_with_negative = True
        if version_2_with_negative:
            output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))
        else:
            output_null_log_odds_file = None

        # XLNet uses a more complex post-processing procedure
        n_best_size = 20
        max_answer_length = 30
        verbose_logging = True
        predict_file = "predict.json"
        write_predictions_extended(examples, features, all_results, n_best_size,
                                    max_answer_length, output_prediction_file,
                                    output_nbest_file, output_null_log_odds_file, predict_file,
                                    model.config.start_n_top, model.config.end_n_top,
                                    version_2_with_negative, tokenizer, verbose_logging)

        # Evaluate with the official SQuAD script
        evaluate_options = EVAL_OPTS(data_file=s_pred_file,
                                     pred_file=output_prediction_file,
                                     na_prob_file=output_null_log_odds_file)
        results = evaluate_on_squad(evaluate_options)
        # print("Coding: final result ", results)
        return results
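self._to_list is not shown in this example; in these evaluation scripts it is commonly a thin tensor-to-list conversion. A minimal sketch of such a helper (a presumed implementation, not copied from this class):

import torch

def to_list(tensor):
    # Presumed helper: detach from the graph, move to CPU, convert to nested Python lists.
    return tensor.detach().cpu().tolist()

print(to_list(torch.tensor([[1.5, 2.5]])))   # [[1.5, 2.5]]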