Example #1
File: squad.py Project: yf1291/nlp4
    def compute_predictions_logits(self, all_results, prefix=""):
        output_prediction_file = os.path.join(self.args.save_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(self.args.save_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(self.args.save_dir,
                                                 "null_odds.json")

        predictions = compute_predictions_logits(
            self.examples,
            self.features,
            all_results,
            self.args.n_best_size,
            self.args.max_answer_length,
            True,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            False,
            False,
            0.0,
            self.tokenizer,
        )
        results = squad_evaluate(self.examples, predictions)
        return results
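The bare positional literals in the call above (`True`, `False`, `False`, `0.0`) are easy to misread. Below is a hedged sketch of the same call written with keyword names, assuming the `compute_predictions_logits` signature shipped in recent transformers releases (worth checking against the installed version):

# Keyword-argument form of the call in Example #1 (sketch only; parameter names
# follow recent transformers releases and may differ in older versions).
predictions = compute_predictions_logits(
    self.examples,
    self.features,
    all_results,
    n_best_size=self.args.n_best_size,
    max_answer_length=self.args.max_answer_length,
    do_lower_case=True,
    output_prediction_file=output_prediction_file,
    output_nbest_file=output_nbest_file,
    output_null_log_odds_file=output_null_log_odds_file,
    verbose_logging=False,
    version_2_with_negative=False,
    null_score_diff_threshold=0.0,
    tokenizer=self.tokenizer,
)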
Example #2
def evaluate(model, tokenizer):
    # Evaluate
    dataset, examples, features = load_and_cache_examples(tokenizer, is_training=False)
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=8)

    # Eval!
    print("***** Running evaluation *****")
    print("  Num examples = ", len(dataset))

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            output = [output[i].detach().cpu().tolist() for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    print("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size = 20,
        max_answer_length = 30,
        do_lower_case=False,
        output_prediction_file="predictions.json",
        output_nbest_file="nbest_predictions.json",
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=False,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,  # recent transformers releases also require the tokenizer here
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    return results
Example #3
    def aggregate_reader_metrics(examples, reader_predictions, reader_metrics):
        '''
        Scores and aggregates reader metrics 

        '''

        reader_eval = dict(squad_evaluate(examples, reader_predictions))
        reader_took = np.mean(reader_metrics)
        reader_eval['Average Prediction Duration'] = reader_took

        return reader_eval
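For reference, `squad_evaluate` returns an ordered dict of aggregate metrics; with answerable-only data the headline keys are typically "exact", "f1", and "total" (key names follow the transformers SQuAD metrics module and should be verified against the installed version). A minimal consumption sketch under that assumption:

# Minimal usage sketch; `examples` and `reader_predictions` are assumed to be
# available, and the key names are taken from recent transformers releases.
reader_eval = dict(squad_evaluate(examples, reader_predictions))
print("EM = %.2f, F1 = %.2f over %d questions"
      % (reader_eval["exact"], reader_eval["f1"], reader_eval["total"]))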
Example #4
    def evaluate_full_dataset(self, data_loader: DataLoader):
        all_results = []

        for batch in data_loader:
            inputs = {
                "input_ids": batch[0].cuda(),
                "attention_mask": batch[1].cuda(),
                "token_type_ids": batch[2].cuda(),
            }
            feature_indices = batch[3]
            outputs = self.model(**inputs)
            for i, feature_index in enumerate(feature_indices):
                eval_feature = self.validation_features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [
                    output[i].detach().cpu().tolist() for output in outputs
                ]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

        output_prediction_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

        task = self.context.get_data_config().get("task")
        if task == "SQuAD1.1":
            version_2_with_negative = False
        elif task == "SQuAD2.0":
            version_2_with_negative = True
        else:
            raise NameError(f"Incompatible dataset '{task}' detected")

        # TODO: Make verbose logging configurable
        verbose_logging = False
        predictions = compute_predictions_logits(
            self.validation_examples,
            self.validation_features,
            all_results,
            self.context.get_hparam("n_best_size"),
            self.context.get_hparam("max_answer_length"),
            self.context.get_hparam("do_lower_case"),
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            verbose_logging,
            version_2_with_negative,
            self.context.get_hparam("null_score_diff_threshold"),
            self.tokenizer,
        )
        results = squad_evaluate(self.validation_examples, predictions)
        return results
Example #5
    def evaluate(self, model, dataset, examples, features):

        eval_batch_size, eval_dataloader = self.get_dataloader_sampler(dataset)

        # multi-gpu evaluate
        if self.args_dict[N_GPU] > 1 and not isinstance(
                model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(
            self.global_step))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args_dict[eval_batch_size])

        all_results = []
        start_time = timeit.default_timer()
        model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.args_dict[DEVICE]) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
                example_indices = batch[3]
                outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        eval_time = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    eval_time, eval_time / len(dataset))

        # Compute predictions
        predictions = self.calcuate_predictions(all_results, examples,
                                                features)

        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        return results, eval_time
Example #6
def qa_evaluate(lang, test_set, model_type, loader, bert_model, learner, save_dir):
    all_results, loss, uids = [], [], []
    examples = test_set.examples
    features = test_set.features
    for batch in loader:
        with torch.no_grad():
            input_ids, attention_mask, token_type_ids, labels, unique_ids = (
                batch[0],
                batch[1],
                batch[2],
                batch[3],
                batch[4],
            )
            bert_output = bert_model(input_ids, attention_mask, token_type_ids)
            outputs = learner(bert_output, labels=labels, attention_mask=attention_mask)
            loss.append(outputs.loss.mean().item())

        for i, uid in enumerate(unique_ids):
            unique_id = int(uid.item())
            start_logits = outputs.start_logits[i].detach().cpu().tolist()
            end_logits = outputs.end_logits[i].detach().cpu().tolist()
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)
            uids.append(unique_id)

    save_dir = os.path.join(save_dir, "result")
    os.makedirs(save_dir, exist_ok=True)
    output_prediction_file = os.path.join(save_dir, f"{lang}.predictions")
    output_nbest_file = os.path.join(save_dir, f"{lang}.nbest_predictions")
    features = [f for f in features if f.unique_id in uids]
    qas_ids = list(dict.fromkeys([f.qas_id for f in features]))
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=False,
        output_prediction_file=output_prediction_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=None,
        verbose_logging=True,
        version_2_with_negative=False,
        null_score_diff_threshold=-np.inf,
        tokenizer=AutoTokenizer.from_pretrained(model_type),
    )
    results = squad_evaluate(test_set.get_by_ids(qas_ids), predictions)
    return torch.tensor(loss), dict(results)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--predict_file",
        default="data/squad/dev-v2.0.json",
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json",
    )
    parser.add_argument("--predict_tag_file", default="data/squad/dev-v2.0_tag", type=str)
    parser.add_argument("--prediction_file", type=str)
    args = parser.parse_args()

    eval_examples = read_squad_examples(
        input_file=args.predict_file, input_tag_file=args.predict_tag_file, is_training=False,
    )
    with open(args.prediction_file) as f:
        result = json.load(f)
    print(json.dumps(dict(squad_evaluate(eval_examples, result)), indent=4))
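Example #7 depends on a project-specific `read_squad_examples` that also consumes a tag file. When only the raw SQuAD JSON and a saved predictions file are at hand, roughly the same offline scoring can be done with the stock transformers processor; a sketch, assuming a recent transformers release (module paths may differ across versions) and an illustrative data layout:

import json
import os

from transformers.data.metrics.squad_metrics import squad_evaluate
from transformers.data.processors.squad import SquadV2Processor

# Offline scoring sketch: read the dev examples with the stock processor and
# score an existing predictions.json against them (paths are illustrative).
data_dir = "data/squad"
examples = SquadV2Processor().get_dev_examples(data_dir, filename="dev-v2.0.json")
with open(os.path.join(data_dir, "predictions.json")) as f:
    predictions = json.load(f)
print(json.dumps(dict(squad_evaluate(examples, predictions)), indent=4))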
Example #8
    def evaluate_full_dataset(self, data_loader: DataLoader):
        all_results = []
        for batch in data_loader:
            inputs = {
                "input_ids": batch[0].cuda(),
                "attention_mask": batch[1].cuda(),
                "token_type_ids": batch[2].cuda(),
            }
            feature_indices = batch[3]
            outputs = self.model(**inputs)
            for i, feature_index in enumerate(feature_indices):
                eval_feature = self.validation_features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [
                    output[i].detach().cpu().tolist() for output in outputs
                ]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

        output_prediction_file = None
        output_nbest_file = None
        output_null_log_odds_file = None
        predictions = compute_predictions_logits(
            self.validation_examples,
            self.validation_features,
            all_results,
            self.context.get_hparam("n_best_size"),
            self.context.get_hparam("max_answer_length"),
            True,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            True,
            False,
            self.context.get_hparam("null_score_diff_threshold"),
            self.tokenizer,
        )
        results = squad_evaluate(self.validation_examples, predictions)
        return results
Example #9
    async def accuracy(self, sources: Sources):
        if not os.path.isfile(
                os.path.join(self.parent.config.output_dir,
                             "pytorch_model.bin")):
            raise ModelNotTrained("Train model before assessing for accuracy.")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.output_dir,
            do_lower_case=self.parent.config.do_lower_case,
        )
        eval_examples = await self._preprocess_data(sources)
        features, dataset = squad_convert_examples_to_features(
            examples=eval_examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.parent.config.max_seq_length,
            doc_stride=self.parent.config.doc_stride,
            max_query_length=self.parent.config.max_query_length,
            is_training=False,
            return_dataset="pt",
        )

        results = {}
        if self.parent.config.local_rank in [-1, 0]:
            logger.info(
                "Loading checkpoints saved during training for evaluation")
            self.model = AutoModelForQuestionAnswering.from_pretrained(
                self.parent.config.output_dir)
            self.model.to(self.parent.config.device)

            # Evaluate
            predictions = await self._custom_accuracy(eval_examples, features,
                                                      dataset)
            results = squad_evaluate(eval_examples, predictions)

        logger.info("Results: {}".format(results))

        # return results
        return Accuracy(results["f1"])
Example #10
    def compute_predictions_logits(self, all_results, prefix=""):
        output_prediction_file = os.path.join(
            self.args.save_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            self.args.save_dir, "nbest_predictions_{}.json".format(prefix))
        output_null_log_odds_file = os.path.join(
            self.args.save_dir, "null_odds_{}.json".format(prefix))
        predictions = compute_predictions_logits(
            self.examples,
            self.features,
            all_results,
            20,
            30,
            True,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            False,
            True,
            0.0,
            self.tokenizer,
        )
        results = squad_evaluate(self.examples, predictions)
        return results
Example #11
def evaluate(args, model, tokenizer, prefix="", global_step=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in progress_bar(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "distilkobert",
                    "xlm-roberta"
            ]:
                del inputs["token_type_ids"]

            # In the case of the reforbert model
            if args.model_type in ["reforbert"]:
                del inputs["attention_mask"]
            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    # Write the evaluation result to a file
    output_dir = os.path.join(args.output_dir, 'eval')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_eval_file = os.path.join(
        output_dir, "eval_result_{}_{}.txt".format(
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            global_step))

    logger.info("***** Official Eval results *****")
    with open(output_eval_file, "w", encoding='utf-8') as f:
        official_eval_results = eval_during_train(args)
        for key in sorted(official_eval_results.keys()):
            logger.info("  %s = %s", key, str(official_eval_results[key]))
            f.write(" {} = {}\n".format(key, str(official_eval_results[key])))
    return results
Example #12
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        seq_lens = torch.sum((batch[0] != 0).to(torch.int32), dim=1).numpy()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            # inputs = {
            #     "input_ids": batch[0],
            #     "attention_mask": batch[1].half() if args.data_type == 'fp16' else batch[1],
            #     "token_type_ids": batch[2],
            # }
            inputs = [
                batch[0],
                batch[1].half() if args.data_type == 'fp16' else batch[1],
                batch[2]
            ]

            example_indices = batch[3]

            # outputs = model(**inputs)
            outputs = model(*inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits[:seq_lens[i]],
                                     end_logits[:seq_lens[i]])

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
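Example #12 trims each example's start/end logits to its unpadded length before building SquadResult, so the n-best search never considers padded positions. A short sketch of the same idea, assuming pad token id 0 as in the snippet above; `start_logits_batch` and `end_logits_batch` are placeholder names for the batched model outputs:

# Sketch: derive per-example unpadded lengths from the input ids (pad id 0 is
# assumed, matching the snippet above) and trim the logits accordingly.
seq_lens = (batch[0] != 0).sum(dim=1).tolist()
trimmed_start = [logits[:n] for logits, n in zip(start_logits_batch, seq_lens)]
trimmed_end = [logits[:n] for logits, n in zip(end_logits_batch, seq_lens)]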
Example #13
def evaluate(args, model, tokenizer, device, prefix=""):
    eval_dataset, examples, features = data.load_and_cache_examples(
        args.validation,
        tokenizer,
        args,
        evaluate=True,
        output_examples=True,
    )
    eval_dataloader = data.get_dataloader(eval_dataset,
                                          args.per_gpu_eval_batch_size,
                                          evaluate=True)

    all_results = []
    start_time = timeit.default_timer()
    eval_batches = 0

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        eval_batches += 1

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime,
                evalTime / (eval_batches * args.per_gpu_eval_batch_size))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_data_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_data_dir, "nbest_predictions_{}.json".format(prefix))

    if args.has_unanswerable:
        output_null_log_odds_file = os.path.join(
            args.output_data_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = squad_metrics.compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_len,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.has_unanswerable,
            tokenizer,
            logger.level < logging.INFO,
        )
    else:
        predictions = squad_metrics.compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_len,
            args.uncased_model,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            logger.level < logging.INFO,
            args.has_unanswerable,
            args.null_score_diff_thresh,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_metrics.squad_evaluate(examples, predictions)
    return results
Example #14
def QA_evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = squad_load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    #eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, shuffle=False)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "squad_predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_squad_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "squad_null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None
    
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example #15
    def adv_evaluate(self, prefix: str, args, tokenizer, dataset, examples,
                     features) -> torch.Tensor:
        """Performs PGD attack on each example in the evaluation dataset, recording aggregate metrics

        Parameters
        ----------
        prefix : str
            The prefix string used to tag this evaluation run in the logs
        args :
            The attack/evaluation arguments (e.g. K, sigma, eps, eta)

        tokenizer : 
            The tokenizer used to preprocess the data.

        dataset : List(torch.utils.data.TensorDataset)
            The evaluation dataset

        examples : List(torch.utils.data.TensorDataset)
            The examples in the evaluation dataset

        features : List(torch.utils.data.TensorDataset)
            SQuAD-like features corresponding to the evaluation dataset

        Returns
        -------
        torch.Tensor
            The evaluation metrics (Exact Match (EM) and F1-score)
        """

        if not os.path.exists(
                self.args.output_dir) and self.args.local_rank in [-1, 0]:
            os.makedirs(self.args.output_dir)

        # TODO Add batch attacks for eval
        eval_batch_size = max(1, self.args.n_gpu)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=eval_batch_size)

        # multi-gpu evaluate
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", eval_batch_size)

        all_results = []
        start_time = timeit.default_timer()

        if self.params.model_type == 'bert':
            _embed_layer = self.model.bert.get_input_embeddings()
        elif self.params.model_type == 'distilbert':
            _embed_layer = self.model.distilbert.get_input_embeddings()
        elif self.params.model_type == 'albert':
            _embed_layer = self.model.albert.get_input_embeddings()

        self.model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.args.device) for t in batch)
            adv_outputs = []  # (k_iter, batch_size)
            # Set static embedding layer
            _delta = None
            for i_iter in range(args.K):
                input_embedding = torch.stack(
                    [_embed_layer(x) for x in batch[0]])
                if _delta is None:  # a plain truthiness test on a tensor would raise here
                    m = torch.distributions.multivariate_normal.MultivariateNormal(
                        torch.zeros(768),
                        torch.eye(768) * (args.sigma**2))
                    _sample = m.sample((args.max_seq_length, ))
                    _delta = torch.tensor(_sample,
                                          requires_grad=True,
                                          device=self.args.device)

                adv_input_embedding = input_embedding + _delta
                inputs = {
                    "input_ids": None,
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "start_positions": batch[3],
                    "end_positions": batch[4],
                    "inputs_embeds": adv_input_embedding,
                }

                if self.params.model_type in ["xlm", "roberta", "distilbert"]:
                    del inputs["token_type_ids"]

                intermed_adv_outputs = self.model(**inputs)

                adv_loss = intermed_adv_outputs[0]
                adv_loss.backward()

                # Calculate g_adv and update delta
                g_adv = _delta.grad.data.detach()
                _delta = self._adv_sgn_attack(_delta, args.eps, args.eta,
                                              'inf')
                del g_adv
            _delta.grad.zero_()

            # TODO: Check inf/NaN. How should we proceed with eval if NaNs?

            with torch.no_grad():
                # Generate adversarial loss with perturbed inputs against predicted logits
                inputs = {
                    "input_ids": None,
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "inputs_embeds": input_embedding + _delta
                }

                if self.params.model_type in ["xlm", "roberta", "distilbert"]:
                    del inputs["token_type_ids"]

                adv_outputs = self.model(**inputs)

                example_indices = batch[5]

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                adv_output = [
                    tensor_to_list(output[i]) for output in adv_outputs
                ]

                start_logits, end_logits = adv_output
                result = SquadResult(unique_id, start_logits, end_logits)
                example_id = example_index.item()
                all_results.append(result)

        eval_time = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    eval_time, eval_time / len(dataset))

        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            None,
            None,
            None,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )
        alum_results = squad_evaluate(examples, predictions)
        print('===alum_results: ', alum_results)
        return alum_results
Example #16
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

            if args.model_type != 'distilbert':
                # XLM doesn't use segment_ids
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer)  # the tokenizer argument is required by recent transformers releases

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example #17
def evaluate_ensemble(args, checkpoints, tokenizer, model_class, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate: no single `model` exists at this point (the checkpoints
    # are loaded into model_list below), so DataParallel wrapping is skipped here.

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    model_list = []    
    for ckpt in checkpoints:
        logger.info("Evaluate the following fine_tuned_model: %s", ckpt)
        model_list.append(model_class.from_pretrained(ckpt))

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                raise NotImplementedError
                # inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # # for lang_id-sensitive xlm models
                # if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                #     inputs.update(
                #         {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                #     )

        outputs_list = []
        for model in model_list:
            model.to(args.device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs)
            outputs_list.append(outputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            start_logits_list, end_logits_list = [], []
            for outputs in outputs_list:
                output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
                if len(output) >= 5:
                    raise NotImplementedError
                    # start_logits = output[0]
                    # start_top_index = output[1]
                    # end_logits = output[2]
                    # end_top_index = output[3]
                    # cls_logits = output[4]

                    # result = SquadResult(
                    #     unique_id,
                    #     start_logits,
                    #     end_logits,
                    #     start_top_index=start_top_index,
                    #     end_top_index=end_top_index,
                    #     cls_logits=cls_logits,
                    # )

                else:
                    start_logits, end_logits = output
                    start_logits_list.append(start_logits)
                    end_logits_list.append(end_logits)
                    
            if args.model_type in ["xlnet", "xlm"]:
                raise NotImplementedError
            else:
                start_logits_list = np.array(start_logits_list)
                end_logits_list = np.array(end_logits_list)
                #Ensembling method (eg max/avg/etc)
                start_logits = list(start_logits_list.mean(axis=0))
                end_logits = list(end_logits_list.mean(axis=0))
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        raise NotImplementedError
        # start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        # end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example #18
def ensemble_vote(args,
                  save_dir='',
                  save_log_path=None,
                  prefix='',
                  predict_prob_mode='add'):
    examples, all_model_features, all_model_results, tokenizers = load_saved_examples(
        args, evaluate=True)

    if save_dir and not os.path.exists(save_dir) and args.local_rank in [-1, 0]:
        os.makedirs(save_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    # eval_sampler = SequentialSampler(dataset)
    # eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate is not needed here: this function only re-scores
    # predictions that were already computed per model, so no model is loaded.

    # Eval!
    logger.info(f"***** Running ensemble {prefix}*****")
    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    # We do pure voting now, not taking new inputs
    # start_time = timeit.default_timer()
    # evalTime = timeit.default_timer() - start_time
    # logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    all_predictions = []
    all_probs = []
    logger.info(f'predict_prob_mode: {predict_prob_mode}')
    for model_idx in tqdm(range(len(tokenizers)), desc="Predicting"):
        features = all_model_features[model_idx]
        all_results = all_model_results[model_idx]
        tokenizer = tokenizers[model_idx]

        predictions, probs = hack.compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
            prob_mode=predict_prob_mode)
        all_predictions.append(predictions)
        all_probs.append(probs)
        # continue

    # num of predictions
    num_of_predictions = len(all_predictions[0])
    logger.info(f'Number of predictions {num_of_predictions}')

    final_predictions = collections.OrderedDict()
    output_result = collections.OrderedDict()
    # Grid Search
    if args.do_grid_search:
        grid_search_results = collections.OrderedDict()
        grid_search_predictions = collections.OrderedDict()
        for weights in product(np.arange(6), repeat=len(all_probs)):
            if not any(weights):  # skip the all-zero weight vector
                continue
            for qas_id in all_predictions[0].keys():
                probs = np.array([d_prob[qas_id] for d_prob in all_probs])
                for i, w in enumerate(weights):
                    probs[i] *= w

                idx = np.argmax(probs)
                final_predictions[qas_id] = all_predictions[idx][qas_id]
            """
            logger.info('Model individual results')
            for i in range(len(tokenizers)):
                results = squad_evaluate(examples, all_predictions[i])
                logger.info(results)
            """
            # Compute the F1 and exact scores.
            logger.info(f'Weights: {weights}')
            logger.info('Ensemble results')
            final_results = squad_evaluate(examples, final_predictions)
            logger.info(final_results)

            if len(grid_search_results) == 0:
                best_weights = weights
                grid_search_results = final_results
                grid_search_predictions = final_predictions
            else:
                if grid_search_results['exact'] + grid_search_results[
                        'f1'] < final_results['exact'] + final_results['f1']:
                    best_weights = weights
                    grid_search_results = final_results
                    grid_search_predictions = final_predictions
        # save log to file
        logger.info(f'Best Weights: {best_weights}')
        output_result[best_weights] = grid_search_results
        util.save_json_file(os.path.join(save_dir, 'eval_results.json'),
                            output_result)

        # save prediction to file
        # TODO save grid search best
        util.save_json_file(os.path.join(save_dir, 'predictions_.json'),
                            grid_search_predictions)
        util.convert_submission_format_and_save(
            save_dir,
            prediction_file_path=os.path.join(save_dir, 'predictions_.json'))

        return grid_search_results
    else:
        for qas_id in all_predictions[0].keys():
            probs = np.array([d_prob[qas_id] for d_prob in all_probs])

            idx = np.argmax(probs)
            final_predictions[qas_id] = all_predictions[idx][qas_id]

        logger.info('Model individual results')
        for i in range(len(tokenizers)):
            results = squad_evaluate(examples, all_predictions[i])
            logger.info(results)
        # Compute the F1 and exact scores.
        logger.info('Ensemble results')
        final_results = squad_evaluate(examples, final_predictions)
        logger.info(final_results)

        # save log to file
        util.save_json_file(os.path.join(save_dir, 'eval_results.json'),
                            final_results)

        util.save_json_file(os.path.join(save_dir, 'predictions_.json'),
                            final_predictions)
        util.convert_submission_format_and_save(
            save_dir,
            prediction_file_path=os.path.join(save_dir, 'predictions_.json'))

        return final_results
Example #19
def evaluate(args: Args,
             model,
             tokenizer,
             dataset,
             examples,
             features,
             suffix="",
             return_raw=False):
    if args.no_cuda is None:
        args.no_cuda = not _is_gpu_available()
    if args.predictions_folder:
        assert args.eval_file, "Need name of the eval file to save predictions!"
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = 0 if args.no_cuda else torch.cuda.device_count()

    eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu)

    # Note that DistributedSampler samples randomly

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)
    model.to(device)
    # multi-gpu evaluate
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    click.echo(
        f"Generating predictions for model {click.style(args.model_path, fg='blue')}, "
        f"running on {click.style(str(device), fg='green')}")
    click.echo("  Num examples = %d" % len(dataset))
    click.echo("  Batch size = %d" % eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(device)
                    })
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    eval_time = timeit.default_timer() - start_time
    logger.info(
        f"Evaluation done in total {eval_time} secs ({eval_time / len(dataset)} sec per example)"
    )
    eval_file = args.eval_file
    predictions_folder = args.predictions_folder
    v2 = args.v2
    if predictions_folder:
        out_file = get_output_predictions_file_name(eval_file,
                                                    predictions_folder, suffix)
        logger.info(f"Saving predictions in {out_file}")

        # Compute predictions
        file_name = os.path.basename(out_file)
        output_prediction_file = os.path.join(predictions_folder, file_name)
        # output_nbest_file = os.path.join(predictions_folder, f"nbest-{file_name}")
        output_nbest_file = None

        if v2:
            output_null_log_odds_file = os.path.join(predictions_folder,
                                                     f"null-odds-{file_name}")
        else:
            output_null_log_odds_file = None
    else:
        logger.info("Not saving predictions...")
        output_prediction_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.v2,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.v2,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    # results = squad_evaluate(examples, predictions)
    # return results
    if return_raw:
        return predictions
    else:
        return squad_evaluate(examples, predictions)
Ejemplo n.º 20
0
    def evaluate_bert(self, model):
        """After the completion of each training epoch, measure the model's performance
        on our validation set.
        """
        # Put the model into the evaluation mode. The dropout layers are disabled during
        # the test time.
        model.eval()
        datasets = self.textData.datasets['dev']
        features = datasets['features']
        eval_sampler = SequentialSampler(datasets['dataset'])
        dev_dataloader = DataLoader(datasets['dataset'],
                                    sampler=eval_sampler,
                                    batch_size=args['batchSize'])
        n_iters = len(datasets['dataset'])

        # Tracking variables
        val_accuracy = []
        val_loss = []
        prefix = "pp"
        output_prediction_file = os.path.join(
            args['rootDir'], "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            args['rootDir'], "nbest_predictions_{}.json".format(prefix))
        output_null_log_odds_file = os.path.join(
            args['rootDir'], "null_odds_{}.json".format(prefix))
        all_results = []
        # For each batch in our validation set...
        for batch in dev_dataloader:
            batch = tuple(t.to(args['device']) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                # "start_positions": batch[3],
                # "end_positions": batch[4],
            }
            # Compute logits
            with torch.no_grad():
                start_logits, end_logits = model.predict(inputs)

            feature_indices = batch[3]
            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)
                # output = [to_list(output[i]) for output in outputs]
                # start_logits, end_logits = output
                result = SquadResult(unique_id, self.to_list(start_logits[i]),
                                     self.to_list(end_logits[i]))

                all_results.append(result)

            # val_loss.append(loss.item())

            # print(preds, batch.label)

            # Calculate the accuracy rate
            # accuracy = (preds.cpu() == torch.LongTensor(batch.label)).numpy().mean() * 100
            # val_accuracy.append(accuracy)

        predictions = compute_predictions_logits(
            datasets['examples'],
            datasets['features'],
            all_results,
            20,
            30,
            True,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            True,
            True,
            0.0,
            self.textData.tokenizer,
        )
        results = squad_evaluate(datasets['examples'], predictions)
        # print(results)
        # Compute the average accuracy and loss over the validation set.
        # val_loss = np.mean(val_loss)
        # val_accuracy = np.mean(val_accuracy)

        return -1, results
Ejemplo n.º 21
0
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             save_dir='',
             save_log_path=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(save_dir) and args.local_rank in [-1, 0]:
        os.makedirs(save_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # y_cls_correct = 0
    # y_cls_incorrect = 0
    y_cls_tp, y_cls_tn, y_cls_fp, y_cls_fn = 0, 0, 0, 0
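    # The loop below fills these confusion-matrix counters for the answerability
    # classifier head: tp/fn on answerable questions, tn/fp on unanswerable ones.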
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]
            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            is_impossible = eval_feature.is_impossible

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits, logits_cls, prob_cls = output

                prob_cls = np.asarray(prob_cls, dtype=float)
                predict_cls = np.argmax(prob_cls)

                if predict_cls == int(not is_impossible):
                    if is_impossible:
                        y_cls_tn += 1
                    else:
                        y_cls_tp += 1
                else:
                    if is_impossible:
                        y_cls_fp += 1
                    else:
                        y_cls_fn += 1
                result = SquadResult(unique_id, start_logits, end_logits)
                # Add cls prediction
                if args.force_cls_pred:
                    result.prob_cls = prob_cls

            all_results.append(result)

    # print(y_cls_correct, y_cls_incorrect)
    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    if args.force_cls_pred:
        example_index_to_features = collections.defaultdict(list)
        for feature in features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        n_force = 0
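        # Average the classifier's no-answer probability over every feature of an
        # example; if the mean is high enough, force an empty ("no answer") prediction.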
        for example_index, example in enumerate(examples):
            eval_features = example_index_to_features[example_index]
            prob = []
            for eval_feature in eval_features:
                eval_result = unique_id_to_result[eval_feature.unique_id]
                prob.append(eval_result.prob_cls[0])

            if np.mean(prob) >= 0.8:
                predictions[example.qas_id] = ""
                n_force += 1

        print("\n")
        print("num of force prediction:", n_force)
    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    cls_accuracy = (y_cls_tn + y_cls_tp) / (y_cls_tn + y_cls_tp + y_cls_fn +
                                            y_cls_fp)
    cls_no_ans_accuracy = y_cls_tn / (y_cls_tn + y_cls_fp)
    cls_has_ans_accuracy = y_cls_tp / (y_cls_tp + y_cls_fn)
    # Add CLS accuracy to result
    results.update({
        'cls_accuracy': cls_accuracy,
        'cls_no_ans_accuracy': cls_no_ans_accuracy,
        'cls_has_ans_accuracy': cls_has_ans_accuracy
    })
    # save log to file
    if save_log_path:
        util.save_json_file(save_log_path, results)

    return results
Ejemplo n.º 22
0
def train(EXP: str, MODEL_NAME: str, DELTA: float, WEIGHT_DECAY: float, DEVICE: str) -> float:
    EPOCHS            = 3
    BATCH_SIZE        = 13
    SAMPLES           = 10
    FREEZE            = True
    LOGS              = "logs"
    DOC_STRIDE        = 128
    MAX_SEQ_LENGTH    = 384
    MAX_QUERY_LENGTH  = 64
    MAX_ANSWER_LENGTH = 30
    N_BEST_SIZE       = 20
    NULL_SCORE_THRESH = 0.0
    LOWER_CASE        = True
    THREADS           = 4
    LOADER_OPTIONS    = { "num_workers": 10, "pin_memory": True }
    LR                = 5e-5
    ADAM_EPSILON      = 1e-8
    N_WARMUP_STEPS    = 0
    MAX_GRAD_NORM     = 1
    DATA_DIR          = os.path.join("./dataset/squadv1")

    dumper = Dumper(f'dumps/dump_{EXP}_{MODEL_NAME}_{DELTA}.dump')

    os.makedirs(LOGS, exist_ok=True)
    writer_name = f"bayeformers_bert_squad.{EXP}"
    writer_path = os.path.join(LOGS, writer_name)
    writer_suff = f".DELTA_{DELTA}.WEIGHT_DECAY_{WEIGHT_DECAY}"
    writer      = SummaryWriter(writer_path + writer_suff)

    o_model, tokenizer = setup_model(MODEL_NAME, LOWER_CASE)
    o_model = torch.nn.DataParallel(o_model, device_ids=[0, 1, 2, 3])
    o_model.to(DEVICE)

    squadv1 = {
        "max_seq_length"  : MAX_SEQ_LENGTH,
        "doc_stride"      : DOC_STRIDE,
        "max_query_length": MAX_QUERY_LENGTH,
        "threads"         : THREADS
    }
    
    train_dataset, train_examples, train_features = setup_squadv1_dataset(DATA_DIR, tokenizer=tokenizer, test=False, **squadv1)
    test_dataset,  test_examples,  test_features  = setup_squadv1_dataset(DATA_DIR, tokenizer=tokenizer, test=True,  **squadv1)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  **LOADER_OPTIONS)
    test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, **LOADER_OPTIONS)

    no_decay_keys   = ["bias", "LayerNorm.weight"]
    decay           = [param for name, param in o_model.named_parameters() if not any(nd in name for nd in no_decay_keys)]
    no_decay        = [param for name, param in o_model.named_parameters() if     any(nd in name for nd in no_decay_keys)]
    params_decay    = { "params": decay,    "weight_decay": WEIGHT_DECAY }
    params_no_decay = { "params": no_decay, "weight_decay": 0.0 }
    parameters      = [params_decay, params_no_decay]

    criterion = nn.CrossEntropyLoss().to(DEVICE)
    optim     = AdamW(parameters, lr=LR, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS)

    # =========================== FREQUENTIST ==================================
    
    report = Report()
    with dumper("frequentist_train"):
        for epoch in tqdm(range(EPOCHS), desc="Epoch"):

            # ============================ TRAIN ======================================
            o_model.train()
            report.reset()

            with dumper("epoch", epoch):
                pbar = tqdm(train_loader, desc="Train")
                for inputs in pbar:
                    inputs = setup_inputs(inputs, MODEL_NAME, o_model)
                    inputs = dic2cuda(inputs, DEVICE)
                    
                    start_positions = inputs["start_positions"]
                    end_positions   = inputs["end_positions"]

                    optim.zero_grad()
                    
                    outputs      = o_model(**inputs)
                    start_logits = outputs[1]
                    end_logits   = outputs[2]
                    
                    ignored_idx            = start_logits.size(1)
                    start_logits           = start_logits.clamp_(0, ignored_idx)
                    end_logits             = end_logits.clamp_(0, ignored_idx)
                    criterion.ignore_index = ignored_idx

                    with dumper():
                        dumper['start_positions'] = start_positions
                        dumper['end_positions']   = end_positions
                        dumper['start_logits']    = start_logits
                        dumper['end_logits']      = end_logits

                    start_loss = criterion(start_logits, start_positions)
                    end_loss   = criterion(  end_logits,   end_positions)
                    start_acc  = (torch.argmax(start_logits, dim=1) == start_positions).float().sum()
                    end_acc    = (torch.argmax(  end_logits, dim=1) ==   end_positions).float().sum()

                    loss = 0.5 * (start_loss + end_loss)
                    acc  = 0.5 * (start_acc  + end_acc)

                    loss.backward()
                    nn.utils.clip_grad_norm_(o_model.parameters(), MAX_GRAD_NORM)
                    optim.step()

                    report.total += loss.item()      / len(train_loader)
                    report.acc   += acc.item() * 100 / len(train_dataset)

                    pbar.set_postfix(total=report.total, acc=report.acc)

            scheduler.step()
            writer.add_scalar("train_nll", report.total, epoch)
            writer.add_scalar("train_acc", report.acc,   epoch)

        # ============================ TEST =======================================
        o_model.eval()
        report.reset()
        
        with dumper.section("frequentist_test"):
            with torch.no_grad():
                results = []
                pbar    = tqdm(test_loader, desc="Test")
                for inputs in pbar:
                    inputs          = setup_inputs(inputs, MODEL_NAME, o_model, True)
                    inputs          = dic2cuda(inputs, DEVICE)
                    feature_indices = inputs["feature_indices"]
                    
                    del inputs["feature_indices"]
                    outputs = o_model(**inputs)

                    for i, feature_idx in enumerate(feature_indices):
                        eval_feature             = test_features[feature_idx.item()]
                        unique_id                = int(eval_feature.unique_id)
                        output                   = [to_list(output[i]) for output in outputs]
                        start_logits, end_logits = output
                        result                   = SquadResult(unique_id, start_logits, end_logits)
                        results.append(result)
                        
                        with dumper():
                            dumper['unique_id']     = unique_id
                            dumper['start_logits']  = start_logits
                            dumper['end_logits']    = end_logits

                predictions = compute_predictions_logits(
                    test_examples, test_features, results,
                    N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE,
                    os.path.join(LOGS, f"preds.frequentist.test.{writer_name + writer_suff}.json"),
                    os.path.join(LOGS, f"nbestpreds.frequentist.test.{writer_name + writer_suff}.json"),
                    None, True, False, NULL_SCORE_THRESH, tokenizer,
                )

            results      = squad_evaluate(test_examples, predictions)
            report.em    = results["exact"]
            report.f1    = results["f1"]
            report.total = results["total"]
            
            print(f'em={report.em}, f1={report.f1}, total={report.total}')
            writer.add_scalar("test_em",    report.em,    epoch)
            writer.add_scalar("test_f1",    report.f1,    epoch)
            writer.add_scalar("test_total", report.total, epoch)

    # ============================ EVALUATION ====================================
    b_model = to_bayesian(o_model, delta=DELTA, freeze=FREEZE)
    b_model = b_model.to(DEVICE)

    b_model.eval()
    report.reset()

    with dumper("bayesian_eval_before_train"):
        with torch.no_grad():
            results = []
            pbar    = tqdm(test_loader, desc="Bayesian Eval")
            for inputs in pbar:
                inputs          = setup_inputs(inputs, MODEL_NAME, o_model, True)
                inputs          = dic2cuda(inputs, DEVICE)
                feature_indices = inputs["feature_indices"]
                B               = inputs["input_ids"].size(0)

                del inputs["feature_indices"]
                samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE)
                _, _, start_logits, end_logits, log_prior, log_variational_posterior = samples
                
                start_logits_list = start_logits.tolist()
                end_logits_list   = end_logits.tolist()

                for i, feature_idx in enumerate(feature_indices):
                    eval_feature = test_features[feature_idx.item()]
                    unique_id    = int(eval_feature.unique_id)
                    result       = SquadResult(unique_id, start_logits_list[i], end_logits_list[i])
                    results.append(result)

                    with dumper():
                        dumper['unique_id']     = unique_id
                        dumper['start_logits']  = start_logits_list[i]
                        dumper['end_logits']    = end_logits_list[i]

            predictions = compute_predictions_logits(
                test_examples, test_features, results,
                N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE,
                os.path.join(LOGS, f"preds.bayesian.eval.{writer_name + writer_suff}.json"),
                os.path.join(LOGS, f"nbestpreds.bayesian.eval.{writer_name + writer_suff}.json"),
                None, True, False, NULL_SCORE_THRESH, tokenizer,
            )

            results      = squad_evaluate(test_examples, predictions)
            report.em    = results["exact"]
            report.f1    = results["f1"]
            report.total = results["total"]
            
            print(f'em={report.em}, f1={report.f1}, total={report.total}')
            writer.add_scalar("bayesian_eval_em",    report.em,    epoch)
            writer.add_scalar("bayesian_eval_f1",    report.f1,    epoch)
            writer.add_scalar("bayesian_eval_total", report.total, epoch)

    # ============================ BAYESIAN ======================================

    no_decay_keys   = ["bias", "LayerNorm.weight"]
    decay           = [param for name, param in b_model.named_parameters() if not any(nd in name for nd in no_decay_keys)]
    no_decay        = [param for name, param in b_model.named_parameters() if     any(nd in name for nd in no_decay_keys)]
    params_decay    = { "params": decay,    "weight_decay": WEIGHT_DECAY }
    params_no_decay = { "params": no_decay, "weight_decay": 0.0 }
    parameters      = [params_decay, params_no_decay]

    criterion = nn.CrossEntropyLoss().to(DEVICE)
    optim     = AdamW(parameters, lr=LR, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS)

    with dumper("bayesian_train"):
        for epoch in tqdm(range(EPOCHS), desc="Bayesian Epoch"):
            with dumper("epoch", epoch):
                # ============================ TRAIN ======================================
                b_model.train()
                report.reset()
                
                pbar = tqdm(train_loader, desc="Bayesian Train")
                for inputs in pbar:
                    inputs = setup_inputs(inputs, MODEL_NAME, o_model)
                    inputs = dic2cuda(inputs, DEVICE)

                    start_positions = inputs["start_positions"]
                    end_positions   = inputs["end_positions"]
                    B               = inputs["input_ids"].size(0)

                    optim.zero_grad()

                    samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE)
                    raw_start_logits, raw_end_logits, start_logits, end_logits, log_prior, log_variational_posterior = samples
                    
                    ignored_idx            = start_logits.size(1)
                    start_logits           = start_logits.clamp_(0, ignored_idx)
                    end_logits             =   end_logits.clamp_(0, ignored_idx)
                    criterion.ignore_index = ignored_idx

                    with dumper():
                        dumper['start_positions']           = start_positions
                        dumper['end_positions']             = end_positions
                        dumper['start_logits']              = start_logits
                        dumper['end_logits']                = end_logits
                        dumper['log_prior']                 = log_prior
                        dumper['log_variational_posterior'] = log_variational_posterior

                    start_loss    = criterion(start_logits, start_positions)
                    end_loss      = criterion(  end_logits,   end_positions)
                    start_acc     = (torch.argmax(start_logits, dim=1) == start_positions).float().sum()
                    end_acc       = (torch.argmax(  end_logits, dim=1) ==   end_positions).float().sum()
                    start_acc_std = np.std([(torch.argmax(start_logits.clamp(0, ignored_idx), dim=1) == start_positions).float().sum().item() for start_logits in raw_start_logits])
                    end_acc_std   = np.std([(torch.argmax(  end_logits.clamp(0, ignored_idx), dim=1) ==   end_positions).float().sum().item() for   end_logits in raw_end_logits])

                    nll     = 0.5 * (start_loss    + end_loss)
                    acc     = 0.5 * (start_acc     + end_acc)
                    acc_std = 0.5 * (start_acc_std + end_acc_std)
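                    # Bayes-by-backprop objective: KL term (log q minus log prior) scaled
                    # down by the number of minibatches, plus the span NLL.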
                    loss    = (log_variational_posterior - log_prior) / len(train_loader) + nll

                    loss.backward()
                    nn.utils.clip_grad_norm_(b_model.parameters(), MAX_GRAD_NORM)
                    optim.step()

                    report.total                     += loss.item()                      / len(train_loader)
                    report.nll                       += nll.item()                       / len(train_loader)
                    report.log_prior                 += log_prior.item()                 / len(train_loader)
                    report.log_variational_posterior += log_variational_posterior.item() / len(train_loader)
                    report.acc                       += acc.item() * 100                 / len(train_dataset)
                    report.acc_std                   += acc_std                          / len(train_loader)

                    pbar.set_postfix(
                        total=report.total,
                        nll=report.nll,
                        log_prior=report.log_prior,
                        log_variational_posterior=report.log_variational_posterior,
                        acc=report.acc,
                        acc_std=report.acc_std,
                    )

                scheduler.step()
                writer.add_scalar("bayesian_train_nll",     report.nll,     epoch)
                writer.add_scalar("bayesian_train_acc",     report.acc,     epoch)
                writer.add_scalar("bayesian_train_acc_std", report.acc_std, epoch)

    # ============================ TEST =======================================
    b_model.eval()
    report.reset()
    
    with dumper("bayesian_test_after_train"):
        with torch.no_grad():
            results = []
            pbar    = tqdm(test_loader, desc="Bayesian Test")
            for inputs in pbar:
                inputs          = setup_inputs(inputs, MODEL_NAME, o_model, True)
                inputs          = dic2cuda(inputs, DEVICE)
                feature_indices = inputs["feature_indices"]
                B               = inputs["input_ids"].size(0)

                del inputs["feature_indices"]
                samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE)
                _, _, start_logits, end_logits, log_prior, log_variational_posterior = samples
                
                start_logits_list   = start_logits.tolist()
                end_logits_list     = end_logits.tolist()

                for i, feature_idx in enumerate(feature_indices):
                    eval_feature = test_features[feature_idx.item()]
                    unique_id    = int(eval_feature.unique_id)
                    result       = SquadResult(unique_id, start_logits_list[i], end_logits_list[i])
                    results.append(result)

                    with dumper():
                        dumper['unique_id']     = unique_id
                        dumper['start_logits']  = start_logits_list[i]
                        dumper['end_logits']    = end_logits_list[i]

            predictions = compute_predictions_logits(
                test_examples, test_features, results,
                N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE,
                os.path.join(LOGS, f"preds.bayesian.test.{writer_name + writer_suff}.json"),
                os.path.join(LOGS, f"nbestpreds.bayesian.test.{writer_name + writer_suff}.json"),
                None, True, False, NULL_SCORE_THRESH, tokenizer,
            )

            results      = squad_evaluate(test_examples, predictions)
            report.em    = results["exact"]
            report.f1    = results["f1"]
            report.total = results["total"]
            
            print(f'em={report.em}, f1={report.f1}, total={report.total}')
            writer.add_scalar("bayesian_test_em",    report.em,    epoch)
            writer.add_scalar("bayesian_test_f1",    report.f1,    epoch)
            writer.add_scalar("bayesian_test_total", report.total, epoch)

    # ============================ SAVE =======================================

    torch.save({
        "weight_decay": WEIGHT_DECAY,
        "delta"       : DELTA,
        "model"       : b_model.state_dict(),
        "em"          : report.em,
        "f1"          : report.f1,
        "total"       : report.total,
    }, f"{writer_path + writer_suff}.pth")

    return report.acc
Ejemplo n.º 23
0
    def evaluate(self, prefix: str, args, tokenizer, dataset, examples,
                 features) -> dict:
        """Performs evaluation on the dataset

        Parameters
        ----------
        prefix : str
            Label used to tag this evaluation run in log messages

        args :
            Evaluation settings (n_best_size, max_answer_length, do_lower_case, ...)

        tokenizer :
            The tokenizer used to preprocess the data.

        dataset : torch.utils.data.TensorDataset
            The evaluation dataset

        examples : List[SquadExample]
            The examples in the evaluation dataset

        features : List[SquadFeatures]
            SQuAD-like features corresponding to the evaluation dataset

        Returns
        -------
        dict
            The evaluation metrics (Exact Match (EM) and F1-score)
        """
        if not os.path.exists(
                self.args.output_dir) and self.args.local_rank in [-1, 0]:
            os.makedirs(self.args.output_dir)

        eval_batch_size = self.args.per_device_eval_batch_size * max(
            1, self.args.n_gpu)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=eval_batch_size)

        # multi-gpu evaluate
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", eval_batch_size)

        all_results = []
        start_time = timeit.default_timer()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):

            self.model.eval()
            batch = tuple(t.to(self.args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.params.model_type in ["xlm", "roberta", "distilbert"]:
                    del inputs["token_type_ids"]

                outputs = self.model(**inputs)

                example_indices = batch[5]

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [tensor_to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        eval_time = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    eval_time, eval_time / len(dataset))

        # Compute predictions
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            None,
            None,
            None,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        return results
Ejemplo n.º 24
0
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    eval_dataset, features, examples = load_and_cache_examples(
        args, tokenizer, labels, pad_token_label_id, mode=mode
    )

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = (
        SequentialSampler(eval_dataset)
        if args.local_rank == -1
        else DistributedSampler(eval_dataset)
    )
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
    )

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    all_results = []
    start_time = timeit.default_timer()
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            inputs["bbox"] = batch[5]
            inputs["token_type_ids"] = batch[6]
            outputs = model(**inputs)
            example_indices = batch[7]
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)
    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        20, 
        30, 
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        True,
        True,
        0.0,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Ejemplo n.º 25
0
def evaluate(args, model_path1, model1, model2, model3, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          model_path1,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model1, torch.nn.DataParallel):
        model1 = torch.nn.DataParallel(model1)

    if args.n_gpu > 1 and not isinstance(model2, torch.nn.DataParallel):
        model2 = torch.nn.DataParallel(model2)

    if args.n_gpu > 1 and not isinstance(model3, torch.nn.DataParallel):
        model3 = torch.nn.DataParallel(model3)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model1.eval()
        model2.eval()
        model3.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            feature_indices = batch[3]

            outputs1 = model1(**inputs)
            outputs2 = model2(**inputs)
            outputs3 = model3(**inputs)
            # print("outputs1", outputs1)
        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output1 = [to_list(output1[i]) for output1 in outputs1]
            # print("output1", output1)
            # print("len(output1)", len(output1[0]))
            output2 = [to_list(output2[i]) for output2 in outputs2]
            output3 = [to_list(output3[i]) for output3 in outputs3]

            start_logits1, end_logits1 = output1
            start_logits2, end_logits2 = output2
            start_logits3, end_logits3 = output3

            # Ensemble strategy 1: weighted sum of the three models' logits
            # (a standalone sketch of all three strategies follows this example)
            weights = [0.4, 0.2, 0.4]
            start_logits = [
                weights[0] * log1 + weights[1] * log2 + weights[2] * log3
                for log1, log2, log3 in zip(start_logits1, start_logits2,
                                            start_logits3)
            ]
            end_logits = [
                weights[0] * log1 + weights[1] * log2 + weights[2] * log3 for
                log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3)
            ]
            # # Ensemble strategy 2: arithmetic mean of the three models' logits
            # start_logits = [
            #     (log1 + log2 + log3)/3
            #     for log1, log2, log3 in zip(start_logits1, start_logits2, start_logits3)
            # ]
            # end_logits = [
            #     (log1 + log2 + log3) / 3
            #     for log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3)
            # ]
            # # Ensemble strategy 3: element-wise max of the three models' logits
            # start_logits = [
            #     max(log1, log2, log3)
            #     for log1, log2, log3 in zip(start_logits1, start_logits2, start_logits3)
            # ]
            # end_logits = [
            #     max(log1, log2, log3)
            #     for log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3)
            # ]

            # print("start_logits1", start_logits1[0])
            # print("start_logits2", start_logits2[0])
            # print("start_logits3", start_logits3[0])
            # print("start_logits", start_logits[0])
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(
        "  Evaluation done in total %f secs (%f sec per example)",
        evalTime,
        evalTime / len(dataset),
    )

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
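
The commented-out branches above sketch three ways to combine per-position span logits from several models (weighted sum, arithmetic mean, element-wise max). Below is a minimal, self-contained helper in the same spirit; the function name, signature, and default weights are illustrative only and are not part of any example in this listing.

from typing import List, Optional


def ensemble_span_logits(per_model_logits: List[List[float]],
                         weights: Optional[List[float]] = None,
                         strategy: str = "weighted_sum") -> List[float]:
    """Combine span logits from several models for a single feature.

    per_model_logits: one list of logits per model, all of equal length.
    strategy: "weighted_sum", "mean", or "max", mirroring the three variants above.
    """
    n_models = len(per_model_logits)
    if weights is None:
        weights = [1.0 / n_models] * n_models
    combined = []
    for position_logits in zip(*per_model_logits):
        if strategy == "weighted_sum":
            combined.append(sum(w * l for w, l in zip(weights, position_logits)))
        elif strategy == "mean":
            combined.append(sum(position_logits) / n_models)
        elif strategy == "max":
            combined.append(max(position_logits))
        else:
            raise ValueError(f"Unknown ensemble strategy: {strategy}")
    return combined


# e.g. start_logits = ensemble_span_logits(
#          [start_logits1, start_logits2, start_logits3], weights=[0.4, 0.2, 0.4])
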
Ejemplo n.º 26
0
def get_evaluation_metrics(
    model,
    tokenizer,
    data_dir: str,
    filename: str,
    per_gpu_batch_size: int = 32,
    num_batches: int = None,
    disable_tqdm: bool = False,
) -> Dict[str, "Number"]:
    """
    Return an OrderedDict in the format:
    {
    'exact': 0.8169797018445212,
    'f1': 4.4469722448269335,
    'total': 11873,
    'HasAns_exact': 0.15182186234817813,
    'HasAns_f1': 7.422216845956518,
    'HasAns_total': 5928,
    'NoAns_exact': 1.4802354920100924,
    'NoAns_f1': 1.4802354920100924,
    'NoAns_total': 5945,
    'best_exact': 50.07159100480081,
    'best_exact_thresh': 0.0,
    'best_f1': 50.0772059855695,
    'best_f1_thresh': 0.0
    }
    """
    # These are not used in inference, only for scoring in `compute_predictions_logits()`.
    processor = SquadV2Processor()
    examples: List[SquadExample] = processor.get_dev_examples(
        data_dir, filename=filename)
    features: List[SquadFeatures] = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=filename,
        per_gpu_batch_size=per_gpu_batch_size,
        shard=False,
        shuffle=False,
        drop_remainder=False,
        return_raw_features=True,
    )

    # Here we get the dataset instead of just the features, with return_raw_features=False.
    dataset: tf.data.Dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=filename,
        per_gpu_batch_size=per_gpu_batch_size,
        shard=False,
        shuffle=False,
        drop_remainder=False,
        return_raw_features=False,
    )
    results: List[SquadResult] = get_squad_results(
        model=model,
        dataset=dataset,
        features=features,
        per_gpu_batch_size=per_gpu_batch_size,
        num_batches=num_batches,
        disable_tqdm=disable_tqdm,
    )

    write_prediction_files = False
    if write_prediction_files:
        output_predictions_file = f"/fsx/{args.checkpoint}_predictions.json"
        output_nbest_file = f"/fsx/{args.checkpoint}_nbest_predictions.json"
        output_null_log_odds_file = f"/fsx/{args.checkpoint}_null_odds.json"
    else:
        output_predictions_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file=output_predictions_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=output_null_log_odds_file,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )

    results: collections.OrderedDict = squad_evaluate(examples, predictions)
    return results
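
A minimal usage sketch for get_evaluation_metrics as defined above. The model, tokenizer, and file names are placeholders; the metric keys ('exact', 'f1', 'best_f1', 'best_f1_thresh') come from the dictionary format documented in the docstring.

# Hypothetical usage; substitute a real model, tokenizer, and SQuAD v2 dev file.
metrics = get_evaluation_metrics(
    model=model,
    tokenizer=tokenizer,
    data_dir="squad_data",
    filename="dev-v2.0.json",
    per_gpu_batch_size=32,
)
print("EM = %.2f, F1 = %.2f" % (metrics["exact"], metrics["f1"]))
# For SQuAD v2, best_f1 / best_f1_thresh report the F1 at the best-scoring no-answer threshold.
print("best F1 = %.2f at threshold %.2f" % (metrics["best_f1"], metrics["best_f1_thresh"]))
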
Ejemplo n.º 27
0
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    dataset_cached = "./dataset_cached"
    if not os.path.exists(dataset_cached):
        os.makedirs(dataset_cached)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Use roughly 5% of the dataset for calibration.
    calibration_iteration = int(
        (len(dataset) * 0.05 + args.eval_batch_size - 1) /
        args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    print("  Batch size = %d" % args.eval_batch_size)

    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils
        model = mkldnn_utils.to_mkldnn(model)
        print(model)

    all_results = []
    evalTime = 0
    nb_eval_steps = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        if calibration and nb_eval_steps >= calibration_iteration:
            break

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

            if args.model_type != 'distilbert':
                # XLM doesn't use segment_ids
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            if nb_eval_steps >= args.warmup:
                start_time = timeit.default_timer()
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

        if nb_eval_steps >= args.warmup:
            evalTime += (timeit.default_timer() - start_time)

        nb_eval_steps += 1

        if args.iter > 0 and nb_eval_steps >= (args.warmup + args.iter):
            break

    # Report throughput (and latency for batch size 1) over post-warmup iterations only.
    if nb_eval_steps >= args.warmup:
        perf = (nb_eval_steps - args.warmup) * args.eval_batch_size / evalTime
        if args.eval_batch_size == 1:
            print('Latency: %.3f ms' % (evalTime /
                                        (nb_eval_steps - args.warmup) * 1000))
        print("Evaluation done in total %f secs (Throughput: %f samples/sec)" %
              (evalTime, perf))
    else:
        # Nothing was timed, so there is no throughput to report.
        perf = 0.0
        logger.info(
            "***** No performance numbers: check the dataset length and warmup count *****"
        )

    # Compute predictions
    output_prediction_file = os.path.join(dataset_cached,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        dataset_cached, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    elif not calibration and args.iter == 0:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer)

    # Compute the F1 and exact scores.
    if not calibration and args.iter == 0:
        results = squad_evaluate(examples, predictions)
        bert_task_acc_keys = ['best_f1', 'f1', 'mcc', 'spearmanr', 'acc']
        for key in bert_task_acc_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
    else:
        results = None
    return results, perf
Ejemplo n.º 28
0
def evaluate(args, model, tokenizer, prefix="", global_step=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info(f"***** Running evaluation {prefix} *****")
    logger.info(f"  Num examples = {len(dataset)}")
    logger.info(f"  Batch size = {args.eval_batch_size}")

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Eval"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(
        f"  Evaluation done in total {evalTime} secs ({evalTime / len(dataset)} sec per example)"
    )

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          f"predictions_{prefix}.json")
    output_nbest_file = os.path.join(args.output_dir,
                                     f"nbest_predictions_{prefix}.json")

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 f"null_odds_{prefix}.json")
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        False,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    # Write the evaluation result to a file
    output_dir = os.path.join(args.output_dir, "eval")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_eval_file = os.path.join(output_dir,
                                    f"eval_result_{global_step}.txt")

    logger.info("***** Official Eval results *****")
    with open(output_eval_file, "w", encoding="utf-8") as f:
        official_eval_results = eval_during_train(args)
        for key in sorted(official_eval_results.keys()):
            logger.info(f"  {key} = {official_eval_results[key]}")
            f.write(f" {key} = {official_eval_results[key]}\n")
    return results
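A minimal sketch of how this evaluate() could be driven. The Namespace fields listed are exactly the attributes the function body reads; load_and_cache_examples() and eval_during_train() will need additional, project-specific fields, and every value below is an illustrative assumption rather than a recommended setting:

from argparse import Namespace

import torch

args = Namespace(
    output_dir="out",                      # prediction and eval files are written here
    eval_batch_size=8,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    n_best_size=20,
    max_answer_length=30,
    verbose_logging=False,
    version_2_with_negative=False,         # set True for SQuAD v2-style data with unanswerable questions
    null_score_diff_threshold=0.0,
)
# results = evaluate(args, model, tokenizer, prefix="dev", global_step=0)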
Example No. 29
0
def evaluate(args, model, tokenizer, prefix="", adapter_names=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "adapter_names": adapter_names,
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) return five prediction tensors, while the
            # other "simpler" models return only two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
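Examples 28 and 29 both call a to_list() helper that is not defined in the snippets. In the Hugging Face run_squad.py example script this is conventionally a one-line utility like the sketch below; treat it as an assumption about these particular projects, although it is consistent with how the logits are consumed above:

def to_list(tensor):
    # Detach from the autograd graph, move to CPU, and convert to a plain
    # Python list so the logits can be consumed by compute_predictions_logits().
    return tensor.detach().cpu().tolist()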
Example No. 30
0
    def test_epoch_end(self, outputs):
        example_indices = torch.cat([x["example_indices"] for x in outputs
                                     ]).detach().cpu().tolist()
        start_logits = torch.cat([x["start_logits"]
                                  for x in outputs]).detach().cpu().tolist()
        end_logits = torch.cat([x["end_logits"]
                                for x in outputs]).detach().cpu().tolist()

        if "cls_logits" in list(outputs[0].keys()):
            start_top_index = torch.cat([
                x["start_top_index"] for x in outputs
            ]).detach().cpu().tolist()
            end_top_index = torch.cat([x["end_top_index"] for x in outputs
                                       ]).detach().cpu().tolist()
            cls_logits = torch.cat([x["cls_logits"]
                                    for x in outputs]).detach().cpu().tolist()

        examples = self.trainer.datamodule.test_examples
        features = self.trainer.datamodule.test_features

        from transformers.data.processors.squad import SquadResult

        all_results = []
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index]
            unique_id = int(eval_feature.unique_id)

            # Some models (XLNet, XLM) return five prediction tensors, while the
            # other "simpler" models return only two.
            if "cls_logits" in list(outputs[0].keys()):
                result = SquadResult(
                    unique_id,
                    start_logits[i],
                    end_logits[i],
                    start_top_index=start_top_index[i],
                    end_top_index=end_top_index[i],
                    cls_logits=cls_logits[i],
                )

            else:
                result = SquadResult(unique_id, start_logits[i], end_logits[i])

            all_results.append(result)

        # Compute predictions
        output_prediction_file = os.path.join(
            self.trainer.checkpoint_callback.dirpath, "predictions_eval.json")
        output_nbest_file = os.path.join(
            self.trainer.checkpoint_callback.dirpath,
            "nbest_predictions_eval.json")

        if self.version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                self.trainer.checkpoint_callback.dirpath,
                "null_odds_eval.json")
        else:
            output_null_log_odds_file = None

        # XLNet and XLM use a more complex post-processing procedure
        if self.hparams.model_type in ["xlnet", "xlm"]:
            start_n_top = self.model.config.start_n_top if hasattr(
                self.model, "config") else self.model.module.config.start_n_top
            end_n_top = self.model.config.end_n_top if hasattr(
                self.model, "config") else self.model.module.config.end_n_top

            from transformers.data.metrics.squad_metrics import compute_predictions_log_probs
            predictions = compute_predictions_log_probs(
                examples,
                features,
                all_results,
                self.hparams.n_best_size,
                self.hparams.max_answer_length,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                start_n_top,
                end_n_top,
                self.version_2_with_negative,
                self.trainer.datamodule.tokenizer,
                False  # we do not want verbose logging here
            )
        else:
            from transformers.data.metrics.squad_metrics import compute_predictions_logits
            predictions = compute_predictions_logits(
                examples,
                features,
                all_results,
                self.hparams.n_best_size,
                self.hparams.max_answer_length,
                self.hparams.do_lower_case,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                False,  # we do not want verbose logging here
                self.version_2_with_negative,
                self.hparams.null_score_diff_threshold,
                self.trainer.datamodule.tokenizer)

        # Compute the F1 and exact scores.
        from transformers.data.metrics.squad_metrics import squad_evaluate
        results = squad_evaluate(examples, predictions)
        return results
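For the test_epoch_end() above to work, each test_step() of the same LightningModule must return a dict whose keys match the ones the hook concatenates ("example_indices", "start_logits", "end_logits", plus "start_top_index", "end_top_index", and "cls_logits" for the XLNet/XLM branch). The sketch below covers only the two-logit case and assumes tuple-style model outputs whose first two entries are the start and end logits, as in the evaluation loops above; it is an illustration, not the original project's code:

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, example_indices = batch[:4]
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)
        # First two outputs are the start/end logits for "simple" QA heads.
        start_logits, end_logits = outputs[0], outputs[1]
        # The keys must match those read back in test_epoch_end().
        return {
            "example_indices": example_indices,
            "start_logits": start_logits,
            "end_logits": end_logits,
        }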