Example #1
    def compute_predictions(self, eval_results, prefix, examples, features):
        output_prediction_file = os.path.join(
            self.output_path, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            self.output_path, "nbest_predictions_{}.json".format(prefix))
        if self.version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                self.output_path, "null_odds_{}.json".format(prefix))
        else:
            output_null_log_odds_file = None

        if self.model_type in ['xlnet', 'xlm']:
            # XLNet uses a more complex post-processing procedure
            write_predictions_extended(
                examples, features, eval_results, self.n_best_size,
                self.max_answer_length, output_prediction_file,
                output_nbest_file, output_null_log_odds_file,
                self.model.config.start_n_top, self.model.config.end_n_top,
                self.version_2_with_negative, self.tokenizer, False)
        else:
            write_predictions(
                examples, features, eval_results, self.n_best_size,
                self.max_answer_length, self.do_lower_case, output_prediction_file,
                output_nbest_file, output_null_log_odds_file, False,
                self.version_2_with_negative, self.null_score_diff_threshold)
        return output_prediction_file, output_null_log_odds_file
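A minimal sketch of wiring this method into a scoring step. The names evaluator, eval_results, examples, features, and dev_file are placeholders, and EVAL_OPTS / evaluate_on_squad are the official-script helpers used by the other examples on this page.

# Hypothetical driver: collect eval_results from an inference loop first.
prediction_file, null_odds_file = evaluator.compute_predictions(
    eval_results, "dev", examples, features)
results = evaluate_on_squad(EVAL_OPTS(data_file=dev_file,
                                      pred_file=prediction_file,
                                      na_prob_file=null_odds_file))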
Example #2
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and (args.local_rank in [-1, 0] or
                                                args.no_distributed_training):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if (
        args.local_rank == -1
        or args.no_distributed_training) else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type == 'xlm' else
                batch[2]  # XLM doesn't use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.predict_file,
            model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case,
                          output_prediction_file, output_nbest_file,
                          output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative,
                          args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)

    return results
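The loops above assume a couple of small helpers from the legacy transformers SQuAD example utilities. A sketch of what they typically look like (the namedtuple fields mirror how the results are built above; the to_list body is an assumption):

import collections

# Result container for BERT-style models: raw start/end logits per feature.
RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

# Extended container for XLNet/XLM-style models with top-k start/end scores.
RawResultExtended = collections.namedtuple(
    "RawResultExtended",
    ["unique_id", "start_top_log_probs", "start_top_index",
     "end_top_log_probs", "end_top_index", "cls_logits"])

def to_list(tensor):
    # Detach, move to CPU, and convert a tensor to a plain Python list.
    return tensor.detach().cpu().tolist()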
Example #3
            'attention_mask': batch[1],
            'token_type_ids': batch[2]
        }
        example_indices = batch[3]

        outputs = model(**inputs)

    for i, example_index in enumerate(example_indices):
        eval_feature = features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        result = RawResult(unique_id=unique_id,
                           start_logits=to_list(outputs[0][i]),
                           end_logits=to_list(outputs[1][i]))
        all_results.append(result)

# Compute predictions
output_prediction_file = "predictions_.json"
output_nbest_file = "nbest_predictions_.json"
output_null_log_odds_file = None  # no null-odds file for SQuAD 1.1-style evaluation

write_predictions(examples, features, all_results, 20, max_answer_length,
                  do_lower_case, output_prediction_file, output_nbest_file,
                  output_null_log_odds_file, False, False,
                  null_score_diff_threshold)

# Evaluate with the official SQuAD script
evaluate_options = EVAL_OPTS(data_file=dev_file,
                             pred_file=output_prediction_file,
                             na_prob_file=output_null_log_odds_file)
results = evaluate_on_squad(evaluate_options)
print(results)
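This fragment leans on the legacy SQuAD example utilities; a sketch of the imports such a script typically needs (the utils_squad / utils_squad_evaluate module names follow the old transformers example layout and are assumptions here):

import torch
from torch.utils.data import DataLoader, SequentialSampler

# Legacy helpers shipped alongside run_squad.py (assumed module layout).
from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions)
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad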
Example #4
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            if args.model_type in ["xlnet", "xlm"]:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]),
                )
            else:
                result = RawResult(
                    unique_id=unique_id,
                    start_logits=to_list(outputs[0][i]),
                    end_logits=to_list(outputs[1][i]),
                )

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.predict_file,
            model.config.start_n_top,
            model.config.end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        write_predictions(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
        )

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(
        data_file=args.predict_file,
        pred_file=output_prediction_file,
        na_prob_file=output_null_log_odds_file,
    )
    results = evaluate_on_squad(evaluate_options)
    return results
Example #5
def evaluate(args, model, tokenizer, prefix=""):
    global_rank = -1 if args.local_rank == -1 else torch.distributed.get_rank()
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    if args.preprocess_only:
        return

    write_dir = args.output_dir if args.write_dir is None else args.write_dir
    if not os.path.exists(write_dir) and global_rank in [-1, 0]:
        os.makedirs(write_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)  # if global_rank == -1 else DistributedSampler(dataset)  # No distributed eval to eval on full dev set
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': None if args.model_type in ['xlm', 'roberta'] else batch[2]  # XLM and RoBERTa don't use segment_ids
                      }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4],
                               'p_mask':    batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id            = unique_id,
                                           start_top_log_probs  = to_list(outputs[0][i]),
                                           start_top_index      = to_list(outputs[1][i]),
                                           end_top_log_probs    = to_list(outputs[2][i]),
                                           end_top_index        = to_list(outputs[3][i]),
                                           cls_logits           = to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id    = unique_id,
                                   start_logits = to_list(outputs[0][i]),
                                   end_logits   = to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(write_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(write_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(write_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions_extended(
                        examples, features, all_results, args.n_best_size,
                        args.max_answer_length, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.predict_file,
                        model.config.start_n_top, model.config.end_n_top,
                        args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions(
                        examples, features, all_results, args.n_best_size,
                        args.max_answer_length, args.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                        args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    def filter_keys(qid_to_val, task_name):
        task_name = task_name.lower()
        assert task_name in {'hotpot', 'squad'}, 'task_name {} not implemented.'.format(task_name)
        return {qid: val for qid, val in qid_to_val.items() if len(qid.split('.')) == (2 if task_name == 'hotpot' else 1)}

    if len(filter_keys(all_predictions, 'squad')) == 0:
        results = {}  # No SQuAD data in evaluation set
    else:
        squad_output_prediction_file = os.path.join(write_dir, "squad_predictions_{}.json".format(prefix))
        with open(squad_output_prediction_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_predictions, 'squad'), indent=2))
        squad_output_nbest_file = os.path.join(write_dir, "squad_nbest_predictions_{}.json".format(prefix))
        with open(squad_output_nbest_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_nbest_predictions, 'squad'), indent=2))
        if args.version_2_with_negative:
            squad_output_null_log_odds_file = os.path.join(write_dir, "squad_null_odds_{}.json".format(prefix))
            with open(squad_output_null_log_odds_file, 'w') as writer:
                writer.write(json.dumps(filter_keys(all_null_odds, 'squad'), indent=2))
        else:
            squad_output_null_log_odds_file = None
        predict_file_parts = args.predict_file.split('/')
        squad_predict_file = '/'.join(predict_file_parts[:-2] + ['squad', predict_file_parts[-1]])
        evaluate_options = EVAL_OPTS(data_file=squad_predict_file,
                                     pred_file=squad_output_prediction_file,
                                     na_prob_file=squad_output_null_log_odds_file)
        results = evaluate_on_squad(evaluate_options)

    # Check if HotpotQA answer file exists to do HotpotQA evaluation
    hotpot_answer_file_parts = args.predict_file.split('/')
    hotpot_answer_file_parts[-2] = 'hotpot-orig'
    hotpot_answer_file = '/'.join(hotpot_answer_file_parts)
    if (not args.no_answer_file) and (not os.path.exists(hotpot_answer_file)):
        with open(os.path.join(write_dir, "squad_results_{}.json".format(prefix)), "w") as writer:
            writer.write(json.dumps(results, indent=2, sort_keys=True))
        return results

    # Evaluate with official HotpotQA script
    nbest_predictions = filter_keys(all_nbest_predictions, 'hotpot')
    null_odds = filter_keys(all_null_odds, 'hotpot')

    qids = {single_hop_qid.split('.')[0] for single_hop_qid in nbest_predictions.keys()}
    pred_answers_and_sps = {'answer': {}, 'sp': {}}
    globally_normed_pred_answers_and_sps = {'answer': {}, 'sp': {}}
    pred_infos = {}
    globally_normed_pred_infos = {}
    max_num_paragraphs = 10
    for qid in qids:
        # Find paragraph with answer prediction
        min_null_odds = float('inf')
        max_logit_sum = float('-inf')
        best_single_hop_qid = None
        for paragraph_no in range(max_num_paragraphs):
            single_hop_qid = qid + '.' + str(paragraph_no)
            if (single_hop_qid in null_odds) and (null_odds[single_hop_qid] < min_null_odds):
                best_single_hop_qid = single_hop_qid
                min_null_odds = null_odds[single_hop_qid]
            if single_hop_qid in nbest_predictions:
                for nbest_prediction in nbest_predictions[single_hop_qid]:
                    if (len(nbest_prediction['text']) > 0) and (args.model_type not in ['xlnet', 'xlm']):
                        logit_sum = nbest_prediction['start_logit'] + nbest_prediction['end_logit'] - null_odds[single_hop_qid]
                        if logit_sum > max_logit_sum:
                            globally_normed_pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                            globally_normed_pred_infos[qid] = nbest_prediction
                            max_logit_sum = logit_sum

        # Find/store answer and supporting fact
        pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        globally_normed_pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        for nbest_prediction in nbest_predictions[best_single_hop_qid]:
            if len(nbest_prediction['text']) > 0:
                pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                pred_infos[qid] = nbest_prediction
                break
        assert qid in pred_answers_and_sps['answer'], 'Error: No predicted answer found.'
        # assert qid in globally_normed_pred_answers_and_sps['answer'], 'Error: No globally normed predicted answer found.'

    hotpot_output_prediction_file = os.path.join(write_dir, "hotpot_predictions_{}.json".format(prefix))
    with open(hotpot_output_prediction_file, "w") as writer:
        writer.write(json.dumps(pred_answers_and_sps, indent=2))
    hotpot_results = evaluate_on_hotpot(hotpot_output_prediction_file, hotpot_answer_file) if not args.no_answer_file else {}
    with open(os.path.join(write_dir, "hotpot_predictions_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(pred_infos, indent=2))

    hotpot_output_prediction_gn_file = os.path.join(write_dir, "hotpot_predictions_gn_{}.json".format(prefix))
    with open(hotpot_output_prediction_gn_file, "w") as writer:
        writer.write(json.dumps(globally_normed_pred_answers_and_sps, indent=2))
    hotpot_gn_results = evaluate_on_hotpot(hotpot_output_prediction_gn_file, hotpot_answer_file) \
        if ((not args.no_answer_file) and (args.model_type not in ['xlnet', 'xlm'])) else {}
    with open(os.path.join(write_dir, "hotpot_predictions_gn_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(globally_normed_pred_infos, indent=2))

    hotpot_results = {k: v * 100. for k, v in hotpot_results.items()}
    hotpot_gn_results = {'gn_' + k: v * 100. for k, v in hotpot_gn_results.items()}
    results = {'squad_' + k: v for k, v in results.items()}
    results.update(hotpot_results)
    results.update(hotpot_gn_results)
    with open(os.path.join(write_dir, "hotpot_results_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(results, indent=2, sort_keys=True))
    return results
Example #6
def do_prediction(model_dir):
    # 1. Load a trained model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
    model = BertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # 2. Load and pre-process the test set

    dev_file = "data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384

    eval_examples = read_squad_examples(input_file=dev_file, is_training=False, version_2_with_negative=False)

    tokenizer = BertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=max_seq_length,
                doc_stride=128,
                max_query_length=64,
                is_training=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=predict_batch_size)

    # 3. Run inference on the test set

    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader):

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():       
            batch_start_logits, batch_end_logits = model(input_ids, input_mask, segment_ids)
                
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))

    output_prediction_file = os.path.join(model_dir, "predictions_sfu.json")
    output_nbest_file = os.path.join(model_dir, "nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(model_dir, "null_odds_sfu.json")

    preds = write_predictions(eval_examples, eval_features, all_results, 20,
                          30, True, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, True,
                          False, 0.0)
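A minimal entry point for the routine above; the checkpoint directory path is a placeholder.

if __name__ == "__main__":
    # Directory containing the fine-tuned weights, config, and tokenizer files.
    do_prediction("models/bert-squad-finetuned")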
Example #7
def evaluate(args, model, tokenizer, checkpoint_id=None, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    if args.eval_data_subset > 0:
        dataset = Subset(dataset, list(range(min(args.eval_data_subset, len(dataset)))))

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids':  batch[2],
                      }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id    = unique_id,
                               start_logits = to_list(outputs[0][i]),
                               end_logits   = to_list(outputs[1][i]))
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    output_nbest_file = os.path.join(args.output_dir,
                                     "nbest_predictions_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    else:
        output_null_log_odds_file = None

    write_predictions(examples, features, all_results, args.n_best_size,
                    args.max_answer_length, args.do_lower_case, output_prediction_file,
                    output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                    args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)

    output_eval_file = os.path.join(args.output_dir,
                                    "eval_results_{}.txt".format(checkpoint_id) if checkpoint_id is not None
                                    else "eval_results.txt"
                                    )
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))

    return results
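A hedged sketch of how an evaluate() like this is typically driven from a main() that loops over saved checkpoints. The args fields (output_dir, device, do_lower_case, eval_all_checkpoints) and the BERT model/tokenizer classes are assumptions drawn from the surrounding examples.

import glob
import os

from transformers import WEIGHTS_NAME, BertForQuestionAnswering, BertTokenizer

# Evaluate the final model, or every saved checkpoint if requested.
checkpoints = [args.output_dir]
if getattr(args, "eval_all_checkpoints", False):
    checkpoints = sorted(
        os.path.dirname(path)
        for path in glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True))

tokenizer = BertTokenizer.from_pretrained(args.output_dir,
                                          do_lower_case=args.do_lower_case)
for checkpoint in checkpoints:
    checkpoint_id = checkpoint.split("-")[-1] if len(checkpoints) > 1 else None
    model = BertForQuestionAnswering.from_pretrained(checkpoint)
    model.to(args.device)
    results = evaluate(args, model, tokenizer, checkpoint_id=checkpoint_id)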
Example #8
def predict(args, model, tokenizer, prefix="test"):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    pred_dataloader = DataLoader(dataset,
                                 batch_size=args.pred_batch_size,
                                 shuffle=False)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.pred_batch_size)
    all_results = []
    for batch in tqdm(pred_dataloader, desc="Predicting"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type == 'xlm' else
                batch[2]  # XLM doesn't use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            pred_feature = features[example_index.item()]
            unique_id = int(pred_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.save_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.predict_file,
            model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case,
                          output_prediction_file, output_nbest_file,
                          output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative,
                          args.null_score_diff_threshold)

    output_pred_file = os.path.join(args.save_dir, 'submit.csv')
    convert_json_to_csv(output_nbest_file, output_pred_file,
                        args.max_answer_length)
Example #9
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(
        dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    write_predictions(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
    )

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    return results