    def after_pred(self):
        "Generate SquadResults"
        for i, example_index in enumerate(self.example_indices):
            eval_feature = self.features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [self.pred[output][i] for output in self.pred]
            output = apply(Self.numpy(), to_detach(output))

            if isinstance(self.learn.model, self.xmodel_instances):
                # Some models like the ones in `self.xmodel_instances` use 5 arguments for their predictions
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                self.learn.pred = SquadResult(unique_id,
                                              start_logits,
                                              end_logits,
                                              start_top_index=start_top_index,
                                              end_top_index=end_top_index,
                                              cls_logits=cls_logits)
            else:
                start_logits, end_logits = output
                self.learn.pred = SquadResult(unique_id, start_logits,
                                              end_logits)
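# A minimal, hedged sketch (not part of the callback above): the two shapes of SquadResult
# built in `after_pred`, assuming the SquadResult class from transformers.data.processors.squad.
# Plain span-extraction models yield only start/end logits; XLNet/XLM-style models add
# top-k indices and a cls logit.
from transformers.data.processors.squad import SquadResult

plain_result = SquadResult(unique_id=0, start_logits=[0.1, 0.9], end_logits=[0.8, 0.2])
xlnet_like_result = SquadResult(
    unique_id=1,
    start_logits=[0.1, 0.9],
    end_logits=[0.8, 0.2],
    start_top_index=[1, 0],
    end_top_index=[0, 1],
    cls_logits=0.3,
)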
def evaluate(model, tokenizer):
    # Evaluate
    dataset, examples, features = load_and_cache_examples(tokenizer, is_training=False)
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=8)

    # Eval!
    print("***** Running evaluation *****")
    print("  Num examples = ", len(dataset))

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            output = [output[i].detach().cpu().tolist() for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    print("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size = 20,
        max_answer_length = 30,
        do_lower_case=False,
        output_prediction_file="predictions.json",
        output_nbest_file="nbest_predictions.json",
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=False,
        null_score_diff_threshold=0.0,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    return results
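# Hedged usage sketch for evaluate() above. squad_evaluate returns a dict-like object whose
# keys, as used by later snippets on this page, include "exact", "f1" and "total".
# `model` and `tokenizer` are assumed to be a fine-tuned QA model and its matching tokenizer.
results = evaluate(model, tokenizer)
print("EM = %.2f, F1 = %.2f over %d examples" % (results["exact"], results["f1"], results["total"]))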
Example #3
    def evaluate_full_dataset(self, data_loader: DataLoader):
        all_results = []

        for batch in data_loader:
            inputs = {
                "input_ids": batch[0].cuda(),
                "attention_mask": batch[1].cuda(),
                "token_type_ids": batch[2].cuda(),
            }
            feature_indices = batch[3]
            outputs = self.model(**inputs)
            for i, feature_index in enumerate(feature_indices):
                eval_feature = self.validation_features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [
                    output[i].detach().cpu().tolist() for output in outputs
                ]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

        output_prediction_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

        task = self.context.get_data_config().get("task")
        if task == "SQuAD1.1":
            version_2_with_negative = False
        elif task == "SQuAD2.0":
            version_2_with_negative = True
        else:
            raise NameError(f"Incompatible dataset '{task}' detected")

        # TODO: Make verbose logging configurable
        verbose_logging = False
        predictions = compute_predictions_logits(
            self.validation_examples,
            self.validation_features,
            all_results,
            self.context.get_hparam("n_best_size"),
            self.context.get_hparam("max_answer_length"),
            self.context.get_hparam("do_lower_case"),
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            verbose_logging,
            version_2_with_negative,
            self.context.get_hparam("null_score_diff_threshold"),
            self.tokenizer,
        )
        results = squad_evaluate(self.validation_examples, predictions)
        return results
Example #4
    def test_step(self, batch, batch_nb):
        # input_ids, attention_mask, token_type_ids, start_positions, end_positions = batch

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }

        all_results = []

        outputs = self.forward(**inputs)

        example_indices = batch[3]

        examples = self.test_examples()
        features = self.test_features()

        batch_features = []

        batch_examples = [
            examples[example_index.item()]
            for i, example_index in enumerate(example_indices)
        ]

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            # print("result", result.unique_id)
            all_results.append(result)

            batch_features.append(eval_feature)

        predictions = compute_predictions_logits(examples,
                                                 batch_features,
                                                 all_results,
                                                 do_lower_case=True,
                                                 version_2_with_negative=True,
                                                 tokenizer=self.tokenizer)

        answers_data, predictions = get_metrics_input(batch_examples,
                                                      predictions)

        return {
            **self.calculate_metrics(predictions, answers_data, stage='test')
        }
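# The `to_list` helper used in the snippet above (and in several snippets below) is not shown;
# it is typically the small utility from Hugging Face's run_squad.py, reproduced here as an
# assumption:
def to_list(tensor):
    # Move the tensor off the computation graph and the GPU, then convert to a plain list.
    return tensor.detach().cpu().tolist()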
Example #5
    def evaluate(self, model, dataset, examples, features):

        eval_batch_size, eval_dataloader = self.get_dataloader_sampler(dataset)

        # multi-gpu evaluate
        if self.args_dict[N_GPU] > 1 and not isinstance(
                model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(
            self.global_step))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args_dict[eval_batch_size])

        all_results = []
        start_time = timeit.default_timer()
        model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.args_dict[DEVICE]) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
                example_indices = batch[3]
                outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        eval_time = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    eval_time, eval_time / len(dataset))

        # Compute predictions
        predictions = self.calcuate_predictions(all_results, examples,
                                                features)

        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        return results, eval_time
Example #6
def qa_evaluate(lang, test_set, model_type, loader, bert_model, learner, save_dir):
    all_results, loss, uids = [], [], []
    examples = test_set.examples
    features = test_set.features
    for batch in loader:
        with torch.no_grad():
            input_ids, attention_mask, token_type_ids, labels, unique_ids = (
                batch[0],
                batch[1],
                batch[2],
                batch[3],
                batch[4],
            )
            bert_output = bert_model(input_ids, attention_mask, token_type_ids)
            outputs = learner(bert_output, labels=labels, attention_mask=attention_mask)
            loss.append(outputs.loss.mean().item())

        for i, uid in enumerate(unique_ids):
            unique_id = int(uid.item())
            start_logits = outputs.start_logits[i].detach().cpu().tolist()
            end_logits = outputs.end_logits[i].detach().cpu().tolist()
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)
            uids.append(unique_id)

    save_dir = os.path.join(save_dir, "result")
    os.makedirs(save_dir, exist_ok=True)
    output_prediction_file = os.path.join(save_dir, f"{lang}.predictions")
    output_nbest_file = os.path.join(save_dir, f"{lang}.nbest_predictions")
    features = [f for f in features if f.unique_id in uids]
    qas_ids = list(dict.fromkeys([f.qas_id for f in features]))
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=False,
        output_prediction_file=output_prediction_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=None,
        verbose_logging=True,
        version_2_with_negative=False,
        null_score_diff_threshold=-np.inf,
        tokenizer=AutoTokenizer.from_pretrained(model_type),
    )
    results = squad_evaluate(test_set.get_by_ids(qas_ids), predictions)
    return torch.tensor(loss), dict(results)
Example #7
def get_squad_results(
    model,
    dataset: tf.data.Dataset,
    features: List[SquadFeatures],
    per_gpu_batch_size: int,
    num_batches: int,
    disable_tqdm: bool,
) -> List[SquadResult]:
    results = []

    total_steps = math.ceil(len(features) / per_gpu_batch_size)
    pbar = tqdm.tqdm(total=total_steps, disable=disable_tqdm)
    pbar.set_description(f"Evaluating with batch size {per_gpu_batch_size}")

    if num_batches:
        dataset = dataset.take(num_batches)

    for step, batch in enumerate(dataset):
        input_dict = {
            "input_ids": batch[0]["input_ids"],
            "attention_mask": batch[0]["attention_mask"],
            "token_type_ids": batch[0]["token_type_ids"],
        }
        outputs = model(input_dict, training=False)
        start_logits, end_logits = outputs[0], outputs[1]

        per_gpu_batch_size = len(batch[1]["start_positions"])
        for i in range(per_gpu_batch_size):
            feature_index = batch[0]["feature_index"][i].numpy().item()
            unique_id = int(features[feature_index].unique_id)
            result = SquadResult(
                unique_id=unique_id,
                start_logits=start_logits[i].numpy().tolist(),
                end_logits=end_logits[i].numpy().tolist(),
            )
            results.append(result)

        pbar.update(1)
    pbar.close()

    return results
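# Hedged follow-up sketch (not from the snippet above): the List[SquadResult] returned by
# get_squad_results is normally post-processed with compute_predictions_logits, mirroring the
# PyTorch examples on this page. All argument values below are illustrative defaults, and
# `examples`/`tokenizer` are assumed to match the features passed to get_squad_results.
from transformers.data.metrics.squad_metrics import compute_predictions_logits

predictions = compute_predictions_logits(
    examples,            # List[SquadExample] matching `features`
    features,            # the same SquadFeatures passed to get_squad_results
    results,             # output of get_squad_results
    n_best_size=20,
    max_answer_length=30,
    do_lower_case=False,
    output_prediction_file="predictions.json",
    output_nbest_file="nbest_predictions.json",
    output_null_log_odds_file=None,
    verbose_logging=False,
    version_2_with_negative=False,
    null_score_diff_threshold=0.0,
    tokenizer=tokenizer,
)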
Example #8
    def evaluate_full_dataset(self, data_loader: DataLoader):
        all_results = []
        for batch in data_loader:
            inputs = {
                "input_ids": batch[0].cuda(),
                "attention_mask": batch[1].cuda(),
                "token_type_ids": batch[2].cuda(),
            }
            feature_indices = batch[3]
            outputs = self.model(**inputs)
            for i, feature_index in enumerate(feature_indices):
                eval_feature = self.validation_features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [
                    output[i].detach().cpu().tolist() for output in outputs
                ]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

        output_prediction_file = None
        output_nbest_file = None
        output_null_log_odds_file = None
        predictions = compute_predictions_logits(
            self.validation_examples,
            self.validation_features,
            all_results,
            self.context.get_hparam("n_best_size"),
            self.context.get_hparam("max_answer_length"),
            True,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            True,
            False,
            self.context.get_hparam("null_score_diff_threshold"),
            self.tokenizer,
        )
        results = squad_evaluate(self.validation_examples, predictions)
        return results
Example #9
    def find_answer(self,
                    question,
                    context,
                    n_best_size=20,
                    max_answer_length=30,
                    full_sentence=False):
        # heavily inspired by "https://github.com/huggingface/transformers/blob/v2.3.0/examples/run_squad.py#L212-L317"
        example_id = '55555'
        example = SquadExample(example_id, question, context, None, None, None)

        features, dataset = squad_convert_examples_to_features(
            [example],
            self.tokenizer,
            self.max_seq_length,
            self.doc_stride,
            self.max_query_length,
            False,
            return_dataset='pt')

        sampler = SequentialSampler(dataset)
        dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)

        all_results = []
        for batch in dataloader:
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.model_type in {"xlm", "roberta", "distilbert"}:
                    del inputs["token_type_ids"]

                example_index = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.model_type in {"xlnet", "xlm"}:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

                outputs = self.model(**inputs)
                output = [o.detach().cpu().tolist() for o in outputs]

                unique_id = int(features[example_index].unique_id)

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    squad_result = SquadResult(
                        unique_id,
                        start_logits[0],
                        end_logits[0],
                        start_top_index=start_top_index[0],
                        end_top_index=end_top_index[0],
                        cls_logits=cls_logits[0],
                    )

                else:
                    start_logits, end_logits = output
                    squad_result = SquadResult(unique_id, start_logits[0],
                                               end_logits[0])

                all_results.append(squad_result)

        # XLNet and XLM use a more complex post-processing procedure
        if self.model_type in {"xlnet", "xlm"}:
            if hasattr(self.model, "config"):
                start_n_top = self.model.config.start_n_top
                end_n_top = self.model.config.end_n_top
            else:
                start_n_top = self.model.module.config.start_n_top
                end_n_top = self.model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                [example],
                features,
                all_results,
                n_best_size,
                max_answer_length,
                '/tmp/pred.out',
                '/tmp/nbest.out',
                '/tmp/null.out',
                start_n_top,
                end_n_top,
                self.version_2_with_negative,
                self.tokenizer,
                self.verbose,
            )
        else:
            predictions = compute_predictions_logits(
                [example],
                features,
                all_results,
                n_best_size,
                max_answer_length,
                self.do_lower_case,
                '/tmp/pred.out',
                '/tmp/nbest.out',
                '/tmp/null.out',
                self.verbose,
                self.version_2_with_negative,
                self.null_score_diff_threshold,
            )

        prediction = predictions[example_id]

        logger.debug(f'found prediction: "{prediction}"')

        # empty prediction indicates unknown answer
        if not prediction:
            logger.debug('empty prediction')
            return None

        if full_sentence:
            doc = self.nlp(context)
            for sent in doc.sents:
                if prediction in sent.text:
                    prediction = sent.text
                    break

        return prediction
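# Hedged usage sketch for find_answer() above; `reader` stands for whatever class defines the
# method (the class itself is not shown in the snippet), so treat the object as hypothetical.
answer = reader.find_answer(
    "Who wrote the report?",
    "The report was written by Jane Doe in 2019.",
    full_sentence=True,  # expand the answer span to the containing sentence via self.nlp
)
print(answer)  # None is returned when the model predicts an empty (unknown) answer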
Example #10
def QA_evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = squad_load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "squad_predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_squad_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "squad_null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None
    
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example #11
def evaluate(args, model, tokenizer, device, prefix=""):
    eval_dataset, examples, features = data.load_and_cache_examples(
        args.validation,
        tokenizer,
        args,
        evaluate=True,
        output_examples=True,
    )
    eval_dataloader = data.get_dataloader(eval_dataset,
                                          args.per_gpu_eval_batch_size,
                                          evaluate=True)

    all_results = []
    start_time = timeit.default_timer()
    eval_batches = 0

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        eval_batches += 1

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime,
                evalTime / (eval_batches * args.per_gpu_eval_batch_size))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_data_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_data_dir, "nbest_predictions_{}.json".format(prefix))

    if args.has_unanswerable:
        output_null_log_odds_file = os.path.join(
            args.output_data_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = squad_metrics.compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_len,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.has_unanswerable,
            tokenizer,
            logger.level < logging.INFO,
        )
    else:
        predictions = squad_metrics.compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_len,
            args.uncased_model,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            logger.level < logging.INFO,
            args.has_unanswerable,
            args.null_score_diff_thresh,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_metrics.squad_evaluate(examples, predictions)
    return results
Example #12
    def evaluate(self, prefix: str, args, tokenizer, dataset, examples,
                 features) -> dict:
        """Performs evaluation on the dataset

        Parameters
        ----------
        prefix : str
            Label used to tag logs and outputs for this evaluation run

        args :
            Evaluation arguments (n_best_size, max_answer_length, etc.)

        tokenizer :
            The tokenizer used to preprocess the data.

        dataset : torch.utils.data.TensorDataset
            The evaluation dataset

        examples : List[SquadExample]
            The examples in the evaluation dataset

        features : List[SquadFeatures]
            SQuAD-like features corresponding to the evaluation dataset

        Returns
        -------
        dict
            The evaluation metrics (Exact Match (EM) and F1 score) from squad_evaluate
        """
        if not os.path.exists(
                self.args.output_dir) and self.args.local_rank in [-1, 0]:
            os.makedirs(self.args.output_dir)

        eval_batch_size = self.args.per_device_eval_batch_size * max(
            1, self.args.n_gpu)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=eval_batch_size)

        # multi-gpu evaluate
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", eval_batch_size)

        all_results = []
        start_time = timeit.default_timer()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):

            self.model.eval()
            batch = tuple(t.to(self.args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.params.model_type in ["xlm", "roberta", "distilbert"]:
                    del inputs["token_type_ids"]

                example_indices = batch[3]

                outputs = self.model(**inputs)


            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [tensor_to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        eval_time = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    eval_time, eval_time / len(dataset))

        # Compute predictions
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            None,
            None,
            None,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        return results
Example #13
def evaluate(args, model, tokenizer, prefix="", global_step=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info(f"***** Running evaluation {prefix} *****")
    logger.info(f"  Num examples = {len(dataset)}")
    logger.info(f"  Batch size = {args.eval_batch_size}")

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Eval"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(
        f"  Evaluation done in total {evalTime} secs ({evalTime / len(dataset)} sec per example)"
    )

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          f"predictions_{prefix}.json")
    output_nbest_file = os.path.join(args.output_dir,
                                     f"nbest_predictions_{prefix}.json")

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 f"null_odds_{prefix}.json")
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        False,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    # Write the evaluation result to a file
    output_dir = os.path.join(args.output_dir, "eval")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_eval_file = os.path.join(output_dir,
                                    f"eval_result_{global_step}.txt")

    logger.info("***** Official Eval results *****")
    with open(output_eval_file, "w", encoding="utf-8") as f:
        official_eval_results = eval_during_train(args)
        for key in sorted(official_eval_results.keys()):
            logger.info(f"  {key} = {official_eval_results[key]}")
            f.write(f" {key} = {official_eval_results[key]}\n")
    return results
Example #14
def train(EXP: str, MODEL_NAME: str, DELTA: float, WEIGHT_DECAY: float, DEVICE: str) -> float:
    EPOCHS            = 3
    BATCH_SIZE        = 13
    SAMPLES           = 10
    FREEZE            = True
    LOGS              = "logs"
    DOC_STRIDE        = 128
    MAX_SEQ_LENGTH    = 384
    MAX_QUERY_LENGTH  = 64
    MAX_ANSWER_LENGTH = 30
    N_BEST_SIZE       = 20
    NULL_SCORE_THRESH = 0.0
    LOWER_CASE        = True
    THREADS           = 4
    LOADER_OPTIONS    = { "num_workers": 10, "pin_memory": True }
    LR                = 5e-5
    ADAM_EPSILON      = 1e-8
    N_WARMUP_STEPS    = 0
    MAX_GRAD_NORM     = 1
    DATA_DIR          = os.path.join("./dataset/squadv1")

    dumper = Dumper(f'dumps/dump_{EXP}_{MODEL_NAME}_{DELTA}.dump')

    os.makedirs(LOGS, exist_ok=True)
    writer_name = f"bayeformers_bert_squad.{EXP}"
    writer_path = os.path.join(LOGS, writer_name)
    writer_suff = f".DELTA_{DELTA}.WEIGHT_DECAY_{WEIGHT_DECAY}"
    writer      = SummaryWriter(writer_path + writer_suff)

    o_model, tokenizer = setup_model(MODEL_NAME, LOWER_CASE)
    o_model = torch.nn.DataParallel(o_model, device_ids=[0, 1, 2, 3])
    o_model.to(DEVICE)

    squadv1 = {
        "max_seq_length"  : MAX_SEQ_LENGTH,
        "doc_stride"      : DOC_STRIDE,
        "max_query_length": MAX_QUERY_LENGTH,
        "threads"         : THREADS
    }
    
    train_dataset, train_examples, train_features = setup_squadv1_dataset(DATA_DIR, tokenizer=tokenizer, test=False, **squadv1)
    test_dataset,  test_examples,  test_features  = setup_squadv1_dataset(DATA_DIR, tokenizer=tokenizer, test=True,  **squadv1)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  **LOADER_OPTIONS)
    test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, **LOADER_OPTIONS)

    decay           = [param for name, param in o_model.named_parameters() if name     in ["bias", "LayerNorm.weight"]]
    no_decay        = [param for name, param in o_model.named_parameters() if name not in ["bias", "LayerNorm.weight"]]
    params_decay    = { "params": decay,    "weight_decay": WEIGHT_DECAY }
    params_no_decay = { "params": no_decay, "weight_decay": 0.0 }
    parameters      = [params_decay, params_no_decay]

    criterion = nn.CrossEntropyLoss().to(DEVICE)
    optim     = AdamW(parameters, lr=LR, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS)

    # =========================== FREQUENTIST ==================================
    
    report = Report()
    with dumper("frequentist_train"):
        for epoch in tqdm(range(EPOCHS), desc="Epoch"):

            # ============================ TRAIN ======================================
            o_model.train()
            report.reset()

            with dumper("epoch", epoch):
                pbar = tqdm(train_loader, desc="Train")
                for inputs in pbar:
                    inputs = setup_inputs(inputs, MODEL_NAME, o_model)
                    inputs = dic2cuda(inputs, DEVICE)
                    
                    start_positions = inputs["start_positions"]
                    end_positions   = inputs["end_positions"]

                    optim.zero_grad()
                    
                    outputs      = o_model(**inputs)
                    start_logits = outputs[1]
                    end_logits   = outputs[2]
                    
                    ignored_idx            = start_logits.size(1)
                    start_logits           = start_logits.clamp_(0, ignored_idx)
                    end_logits             = end_logits.clamp_(0, ignored_idx)
                    criterion.ignore_index = ignored_idx

                    with dumper():
                        dumper['start_positions'] = start_positions
                        dumper['end_positions']   = end_positions
                        dumper['start_logits']    = start_logits
                        dumper['end_logits']      = end_logits

                    start_loss = criterion(start_logits, start_positions)
                    end_loss   = criterion(  end_logits,   end_positions)
                    start_acc  = (torch.argmax(start_logits, dim=1) == start_positions).float().sum()
                    end_acc    = (torch.argmax(  end_logits, dim=1) ==   end_positions).float().sum()

                    loss = 0.5 * (start_loss + end_loss)
                    acc  = 0.5 * (start_acc  + end_acc)

                    loss.backward()
                    nn.utils.clip_grad_norm_(o_model.parameters(), MAX_GRAD_NORM)
                    optim.step()

                    report.total += loss.item()      / len(train_loader)
                    report.acc   += acc.item() * 100 / len(train_dataset)

                    pbar.set_postfix(total=report.total, acc=report.acc)

            scheduler.step()
            writer.add_scalar("train_nll", report.total, epoch)
            writer.add_scalar("train_acc", report.acc,   epoch)

        # ============================ TEST =======================================
        o_model.eval()
        report.reset()
        
        with dumper.section("frequentist_test"):
            with torch.no_grad():
                results = []
                pbar    = tqdm(test_loader, desc="Test")
                for inputs in pbar:
                    inputs          = setup_inputs(inputs, MODEL_NAME, o_model, True)
                    inputs          = dic2cuda(inputs, DEVICE)
                    feature_indices = inputs["feature_indices"]
                    
                    del inputs["feature_indices"]
                    outputs = o_model(**inputs)

                    for i, feature_idx in enumerate(feature_indices):
                        eval_feature             = test_features[feature_idx.item()]
                        unique_id                = int(eval_feature.unique_id)
                        output                   = [to_list(output[i]) for output in outputs]
                        start_logits, end_logits = output
                        result                   = SquadResult(unique_id, start_logits, end_logits)
                        results.append(result)
                        
                        with dumper():
                            dumper['unique_id']     = unique_id
                            dumper['start_logits']  = start_logits
                            dumper['end_logits']    = end_logits

                predictions = compute_predictions_logits(
                    test_examples, test_features, results,
                    N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE,
                    os.path.join(LOGS, f"preds.frequentist.test.{writer_name + writer_suff}.json"),
                    os.path.join(LOGS, f"nbestpreds.frequentist.test.{writer_name + writer_suff}.json"),
                    None, True, False, NULL_SCORE_THRESH, tokenizer,
                )

            results      = squad_evaluate(test_examples, predictions)
            report.em    = results["exact"]
            report.f1    = results["f1"]
            report.total = results["total"]
            
            print(f'em={report.em}, f1={report.f1}, total={report.total}')
            writer.add_scalar("test_em",    report.em,    epoch)
            writer.add_scalar("test_f1",    report.f1,    epoch)
            writer.add_scalar("test_total", report.total, epoch)

    # ============================ EVALUATION ====================================
    b_model = to_bayesian(o_model, delta=DELTA, freeze=FREEZE)
    b_model = b_model.to(DEVICE)

    b_model.eval()
    report.reset()

    with dumper("bayesian_eval_before_train"):
        with torch.no_grad():
            results = []
            pbar    = tqdm(test_loader, desc="Bayesian Eval")
            for inputs in pbar:
                inputs          = setup_inputs(inputs, MODEL_NAME, o_model, True)
                inputs          = dic2cuda(inputs, DEVICE)
                feature_indices = inputs["feature_indices"]
                B               = inputs["input_ids"].size(0)

                del inputs["feature_indices"]
                samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE)
                _, _, start_logits, end_logits, log_prior, log_variational_posterior = samples
                
                start_logits_list = start_logits.tolist()
                end_logits_list   = end_logits.tolist()

                for i, feature_idx in enumerate(feature_indices):
                    eval_feature = test_features[feature_idx.item()]
                    unique_id    = int(eval_feature.unique_id)
                    result       = SquadResult(unique_id, start_logits_list[i], end_logits_list[i])
                    results.append(result)

                    with dumper():
                        dumper['unique_id']     = unique_id
                        dumper['start_logits']  = start_logits_list[i]
                        dumper['end_logits']    = end_logits_list[i]

            predictions = compute_predictions_logits(
                test_examples, test_features, results,
                N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE,
                os.path.join(LOGS, f"preds.bayesian.eval.{writer_name + writer_suff}.json"),
                os.path.join(LOGS, f"nbestpreds.bayesian.eval.{writer_name + writer_suff}.json"),
                None, True, False, NULL_SCORE_THRESH, tokenizer,
            )

            results      = squad_evaluate(test_examples, predictions)
            report.em    = results["exact"]
            report.f1    = results["f1"]
            report.total = results["total"]
            
            print(f'em={report.em}, f1={report.f1}, total={report.total}')
            writer.add_scalar("bayesian_eval_em",    report.em,    epoch)
            writer.add_scalar("bayesian_eval_f1",    report.f1,    epoch)
            writer.add_scalar("bayesian_eval_total", report.total, epoch)

    # ============================ BAYESIAN ======================================

    decay           = [param for name, param in b_model.named_parameters() if name     in ["bias", "LayerNorm.weight"]]
    no_decay        = [param for name, param in b_model.named_parameters() if name not in ["bias", "LayerNorm.weight"]]
    params_decay    = { "params": decay,    "weight_decay": WEIGHT_DECAY }
    params_no_decay = { "params": no_decay, "weight_decay": 0.0 }
    parameters      = [params_decay, params_no_decay]

    criterion = nn.CrossEntropyLoss().to(DEVICE)
    optim     = AdamW(parameters, lr=LR, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optim, N_WARMUP_STEPS, EPOCHS)

    with dumper("bayesian_train"):
        for epoch in tqdm(range(EPOCHS), desc="Bayesian Epoch"):
            with dumper("epoch", epoch):
                # ============================ TRAIN ======================================
                b_model.train()
                report.reset()
                
                pbar = tqdm(train_loader, desc="Bayesian Train")
                for inputs in pbar:
                    inputs = setup_inputs(inputs, MODEL_NAME, o_model)
                    inputs = dic2cuda(inputs, DEVICE)

                    start_positions = inputs["start_positions"]
                    end_positions   = inputs["end_positions"]
                    B               = inputs["input_ids"].size(0)

                    optim.zero_grad()

                    samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE)
                    raw_start_logits, raw_end_logits, start_logits, end_logits, log_prior, log_variational_posterior = samples
                    
                    ignored_idx            = start_logits.size(1)
                    start_logits           = start_logits.clamp_(0, ignored_idx)
                    end_logits             =   end_logits.clamp_(0, ignored_idx)
                    criterion.ignore_index = ignored_idx

                    with dumper():
                        dumper['start_positions']           = start_positions
                        dumper['end_positions']             = end_positions
                        dumper['start_logits']              = start_logits
                        dumper['end_logits']                = end_logits
                        dumper['log_prior']                 = log_prior
                        dumper['log_variational_posterior'] = log_variational_posterior

                    start_loss    = criterion(start_logits, start_positions)
                    end_loss      = criterion(  end_logits,   end_positions)
                    start_acc     = (torch.argmax(start_logits, dim=1) == start_positions).float().sum()
                    end_acc       = (torch.argmax(  end_logits, dim=1) ==   end_positions).float().sum()
                    start_acc_std = np.std([(torch.argmax(start_logits.clamp(0, ignored_idx), dim=1) == start_positions).float().sum().item() for start_logits in raw_start_logits])
                    end_acc_std   = np.std([(torch.argmax(  end_logits.clamp(0, ignored_idx), dim=1) ==   end_positions).float().sum().item() for   end_logits in raw_end_logits])

                    nll     = 0.5 * (start_loss    + end_loss)
                    acc     = 0.5 * (start_acc     + end_acc)
                    acc_std = 0.5 * (start_acc_std + end_acc_std)
                    loss    = (log_variational_posterior - log_prior) / len(train_loader) + nll

                    loss.backward()
                    nn.utils.clip_grad_norm_(b_model.parameters(), MAX_GRAD_NORM)
                    optim.step()

                    report.total                     += loss.item()                      / len(train_loader)
                    report.nll                       += nll.item()                       / len(train_loader)
                    report.log_prior                 += log_prior.item()                 / len(train_loader)
                    report.log_variational_posterior += log_variational_posterior.item() / len(train_loader)
                    report.acc                       += acc.item() * 100                 / len(train_dataset)
                    report.acc_std                   += acc_std                          / len(train_loader)

                    pbar.set_postfix(
                        total=report.total,
                        nll=report.nll,
                        log_prior=report.log_prior,
                        log_variational_posterior=report.log_variational_posterior,
                        acc=report.acc,
                        acc_std=report.acc_std,
                    )

                scheduler.step()
                writer.add_scalar("bayesian_train_nll",     report.nll,     epoch)
                writer.add_scalar("bayesian_train_acc",     report.acc,     epoch)
                writer.add_scalar("bayesian_train_acc_std", report.acc_std, epoch)

    # ============================ TEST =======================================
    b_model.eval()
    report.reset()
    
    with dumper("bayesian_test_after_train"):
        with torch.no_grad():
            results = []
            pbar    = tqdm(test_loader, desc="Bayesian Test")
            for inputs in pbar:
                inputs          = setup_inputs(inputs, MODEL_NAME, o_model, True)
                inputs          = dic2cuda(inputs, DEVICE)
                feature_indices = inputs["feature_indices"]
                B               = inputs["input_ids"].size(0)

                del inputs["feature_indices"]
                samples = sample_bayesian(b_model, inputs, SAMPLES, B, MAX_SEQ_LENGTH, DEVICE)
                _, _, start_logits, end_logits, log_prior, log_variational_posterior = samples
                
                start_logits_list   = start_logits.tolist()
                end_logits_list     = end_logits.tolist()

                for i, feature_idx in enumerate(feature_indices):
                    eval_feature = test_features[feature_idx.item()]
                    unique_id    = int(eval_feature.unique_id)
                    result       = SquadResult(unique_id, start_logits_list[i], end_logits_list[i])
                    results.append(result)

                    with dumper():
                        dumper['unique_id']     = unique_id
                        dumper['start_logits']  = start_logits_list[i]
                        dumper['end_logits']    = end_logits_list[i]

            predictions = compute_predictions_logits(
                test_examples, test_features, results,
                N_BEST_SIZE, MAX_ANSWER_LENGTH, LOWER_CASE,
                os.path.join(LOGS, f"preds.bayesian.test.{writer_name + writer_suff}.json"),
                os.path.join(LOGS, f"nbestpreds.bayesian.test.{writer_name + writer_suff}.json"),
                None, True, False, NULL_SCORE_THRESH, tokenizer,
            )

            results      = squad_evaluate(test_examples, predictions)
            report.em    = results["exact"]
            report.f1    = results["f1"]
            report.total = results["total"]
            
            print(f'em={report.em}, f1={report.f1}, total={report.total}')
            writer.add_scalar("bayesian_test_em",    report.em,    epoch)
            writer.add_scalar("bayesian_test_f1",    report.f1,    epoch)
            writer.add_scalar("bayesian_test_total", report.total, epoch)

    # ============================ SAVE =======================================

    torch.save({
        "weight_decay": WEIGHT_DECAY,
        "delta"       : DELTA,
        "model"       : b_model.state_dict(),
        "em"          : report.em,
        "f1"          : report.f1,
        "total"       : report.total,
    }, f"{writer_path + writer_suff}.pth")

    return report.acc
Example #15
def process_one_question(features,
                         dataset,
                         model,
                         tokenizer,
                         examples,
                         device,
                         use_ir_score=False,
                         mu=0.0,
                         ir_scores=None):
    all_results = []
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=12)

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output
            if (use_ir_score):
                ir_scores_seq = np.ones(
                    len(start_logits)) * ir_scores[eval_feature.example_index]
                start_logits = list(
                    np.array(start_logits) * (1 - mu) + mu * ir_scores_seq)
                end_logits = list(
                    np.array(end_logits) * (1 - mu) + mu * ir_scores_seq)
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    prefix = ""
    output_dir = "./tmp_dir"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    output_prediction_file = os.path.join(
        output_dir, curr_date_str + "_predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        output_dir,
        curr_date_str + "_nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(
        output_dir, curr_date_str + "_null_odds_{}.json".format(prefix))

    compute_predictions_logits_all(
        examples,
        features,
        all_results,
        20,  # 20 args.n_best_size,
        384,  # args.max_answer_length,
        True,  # args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # args.verbose_logging,
        False,  # args.version_2_with_negative,
        0.0,  # args.null_score_diff_threshold,
        tokenizer,
    )

    predictions = json.load(
        open(
            os.path.join(
                output_dir,
                curr_date_str + "_nbest_predictions_{}.json".format(prefix)),
            'r'))
    return all_results, predictions
Example #16
    async def _custom_accuracy(self, examples, features, dataset, prefix=""):

        if not os.path.exists(self.parent.config.output_dir
                              ) and self.parent.config.local_rank in [-1, 0]:
            os.makedirs(self.parent.config.output_dir)

        self.parent.config.eval_batch_size = (
            self.parent.config.per_gpu_eval_batch_size *
            max(1, self.parent.config.n_gpu))

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(
            dataset,
            sampler=eval_sampler,
            batch_size=self.parent.config.eval_batch_size,
        )

        # multi-gpu evaluate
        if self.parent.config.n_gpu > 1 and not isinstance(
                self.model, torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.parent.config.eval_batch_size)

        all_results = []
        start_time = timeit.default_timer()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            self.model.eval()
            batch = tuple(t.to(self.parent.config.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.parent.config.model_type in [
                        "xlm",
                        "roberta",
                        "distilbert",
                        "camembert",
                ]:
                    del inputs["token_type_ids"]

                feature_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.parent.config.model_type in ["xlnet", "xlm"]:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                    # for lang_id-sensitive xlm models
                    if hasattr(self.model, "config") and hasattr(
                            self.model.config, "lang2id"):
                        inputs.update({
                            "langs":
                            (torch.ones(batch[0].shape, dtype=torch.int64) *
                             self.parent.config.lang_id).to(
                                 self.parent.config.device)
                        })

                outputs = self.model(**inputs)

            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [self.to_list(output[i]) for output in outputs]

                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    result = SquadResult(
                        unique_id,
                        start_logits,
                        end_logits,
                        start_top_index=start_top_index,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits,
                    )
                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        evalTime = timeit.default_timer() - start_time
        logger.info(
            "  Evaluation done in total %f secs (%f sec per example)",
            evalTime,
            evalTime / len(dataset),
        )

        # Compute predictions
        output_prediction_file = os.path.join(
            self.parent.config.output_dir,
            "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            self.parent.config.output_dir,
            "nbest_predictions_{}.json".format(prefix),
        )

        # XLNet and XLM use a more complex post-processing procedure
        if self.parent.config.model_type in ["xlnet", "xlm"]:
            start_n_top = (self.model.config.start_n_top if hasattr(
                self.model, "config") else
                           self.model.module.config.start_n_top)
            end_n_top = (self.model.config.end_n_top if hasattr(
                self.model, "config") else self.model.module.config.end_n_top)

            predictions = compute_predictions_log_probs(
                examples,
                features,
                all_results,
                self.parent.config.n_best_size,
                self.parent.config.max_answer_length,
                output_prediction_file,
                output_nbest_file,
                None,
                start_n_top,
                end_n_top,
                False,
                self.tokenizer,
                True,
            )
        else:
            predictions = compute_predictions_logits(
                examples,
                features,
                all_results,
                self.parent.config.n_best_size,
                self.parent.config.max_answer_length,
                self.parent.config.do_lower_case,
                output_prediction_file,
                output_nbest_file,
                None,
                True,
                False,
                self.parent.config.null_score_diff_threshold,
                self.tokenizer,
            )

        return predictions
Exemple #17
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

            if args.model_type != 'distilbert':
                # XLM doesn't use segment_ids
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
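Several of these snippets call a to_list helper without showing its definition. In HuggingFace's run_squad.py it is a one-liner that detaches a tensor, moves it to the CPU, and converts it to a plain Python list, roughly:

def to_list(tensor):
    return tensor.detach().cpu().tolist()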
Exemple #18
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples = %d", len(dataset))
    print("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    eval_pbar = tqdm(total=len(dataset),
                     position=0,
                     leave=True,
                     file=sys.stdout,
                     bar_format="{l_bar}%s{bar}%s{r_bar}" %
                     (Fore.GREEN, Fore.RESET))
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            feature_indices = batch[3]
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)
        eval_pbar.update(batch[0].size(0))  # hiepnh
    eval_pbar.close()  # hiepnh

    evalTime = timeit.default_timer() - start_time
    print("  Evaluation done in total %f secs (%f sec per example)", evalTime,
          evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Exemple #19
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        seq_lens = torch.sum((batch[0] != 0).to(torch.int32), dim=1).numpy()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            # inputs = {
            #     "input_ids": batch[0],
            #     "attention_mask": batch[1].half() if args.data_type == 'fp16' else batch[1],
            #     "token_type_ids": batch[2],
            # }
            inputs = [
                batch[0],
                batch[1].half() if args.data_type == 'fp16' else batch[1],
                batch[2]
            ]

            example_indices = batch[3]

            # outputs = model(**inputs)
            outputs = model(*inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits[:seq_lens[i]],
                                     end_logits[:seq_lens[i]])

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
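This variant trims each feature's logits back to its real (unpadded) length before building the SquadResult. A small sketch of that trimming, assuming zero-padded input_ids as in the snippet above:

import torch

input_ids = torch.tensor([[101, 2054, 2003, 102, 0, 0]])               # one zero-padded sequence
seq_lens = torch.sum((input_ids != 0).to(torch.int32), dim=1).numpy()  # array([4])

start_logits = [0.2, 1.7, -0.4, 0.9, 0.0, 0.0]                         # toy per-token logits
trimmed = start_logits[:seq_lens[0]]                                   # keep only the 4 real tokens
print(trimmed)                                                         # [0.2, 1.7, -0.4, 0.9]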
Exemple #20
    def predict(self, question: Question,
                contexts: List[Context]) -> List[Answer]:
        examples = craft_squad_examples(question, contexts)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.args["max_seq_length"],
            doc_stride=self.args["doc_stride"],
            max_query_length=self.args["max_query_length"],
            is_training=False,
            return_dataset="pt",
            threads=self.args["threads"],
            tqdm_enabled=self.args["tqdm_enabled"])

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=32)

        all_results = []

        for batch in eval_dataloader:
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }
                feature_indices = batch[3]
                outputs = self.model(**inputs)

            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        answers, _ = compute_predictions_logits(
            all_examples=examples,
            all_features=features,
            all_results=all_results,
            n_best_size=self.args["n_best_size"],
            max_answer_length=self.args["max_answer_length"],
            do_lower_case=self.args["do_lower_case"],
            output_prediction_file=self.args["output_prediction_file"],
            output_nbest_file=self.args["output_nbest_file"],
            output_null_log_odds_file=self.args["output_null_log_odds_file"],
            verbose_logging=self.args["verbose_logging"],
            version_2_with_negative=self.args["version_2_with_negative"],
            null_score_diff_threshold=self.args["null_score_diff_threshold"],
            tokenizer=self.tokenizer,
            language=question.language)

        all_answers = []
        for idx, ans in enumerate(answers):
            all_answers.append(
                Answer(text=answers[ans][0],
                       score=answers[ans][1],
                       ctx_score=contexts[idx].score,
                       language=question.language))
        return all_answers
Exemple #21
def run_prediction_multi(question_texts, context_texts):
    """
    Modified from run_squad.py to only produce predicted answer given the question and context.
    This  function will produce multiple answers by splitting the context into paragraphs

    Input: 
        1. List of questions
        2. List of Context
    Output: 
        1. Predicted answer
    """
    examples = []

    for i, question_text in enumerate(question_texts):
        for j, context_text in enumerate(context_texts):
            example = SquadExample(
                qas_id=str(i) + str(j),
                question_text=question_text,
                context_text=context_text,
                answer_text=None,
                start_position_character=None,
                title="Predict",
                is_impossible=False,
                answers=None,
            )

            examples.append(example)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions
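A hypothetical usage sketch for run_prediction_multi, assuming the module-level model, tokenizer, device and the decoding hyper-parameters it reads (n_best_size, max_answer_length, do_lower_case, null_score_diff_threshold) have already been set up:

questions = ["Who wrote Hamlet?"]
contexts = [
    "Hamlet is a tragedy written by William Shakespeare.",
    "Macbeth is another tragedy attributed to Shakespeare.",
]

predictions = run_prediction_multi(questions, contexts)
for qas_id, answer in predictions.items():   # qas_id strings look like "00", "01", ...
    print(qas_id, "->", answer)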
Exemple #22
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             save_dir='',
             save_log_path=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if save_dir and not os.path.exists(save_dir) and args.local_rank in [-1, 0]:
        os.makedirs(save_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # y_cls_correct = 0
    # y_cls_incorrect = 0
    y_cls_tp, y_cls_tn, y_cls_fp, y_cls_fn = 0, 0, 0, 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]
            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            is_impossible = eval_feature.is_impossible

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits, logits_cls, prob_cls = output

                prob_cls = np.asarray(prob_cls, dtype=np.float64)
                predict_cls = np.argmax(prob_cls)

                if predict_cls == int(not is_impossible):
                    if is_impossible:
                        y_cls_tn += 1
                    else:
                        y_cls_tp += 1
                else:
                    if is_impossible:
                        y_cls_fp += 1
                    else:
                        y_cls_fn += 1
                result = SquadResult(unique_id, start_logits, end_logits)
                # Add cls prediction
                if args.force_cls_pred:
                    result.prob_cls = prob_cls

            all_results.append(result)

    # print(y_cls_correct, y_cls_incorrect)
    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    if args.force_cls_pred:
        example_index_to_features = collections.defaultdict(list)
        for feature in features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        n_force = 0
        for example_index, example in enumerate(examples):
            eval_features = example_index_to_features[example_index]
            prob = []
            for eval_feature in eval_features:
                eval_result = unique_id_to_result[eval_feature.unique_id]
                prob.append(eval_result.prob_cls[0])

            if np.mean(prob) >= 0.8:
                predictions[example.qas_id] = ""
                n_force += 1

        print("\n")
        print("num of force prediction:", n_force)
    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    cls_accuracy = (y_cls_tn + y_cls_tp) / (y_cls_tn + y_cls_tp + y_cls_fn +
                                            y_cls_fp)
    cls_no_ans_accuracy = y_cls_tn / (y_cls_tn + y_cls_fp)
    cls_has_ans_accuracy = y_cls_tp / (y_cls_tp + y_cls_fn)
    # Add CLS accuracy to result
    results.update({
        'cls_accuracy': cls_accuracy,
        'cls_no_ans_accuracy': cls_no_ans_accuracy,
        'cls_has_ans_accuracy': cls_has_ans_accuracy
    })
    # save log to file
    if save_log_path:
        util.save_json_file(save_log_path, results)

    return results
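The accuracy block above will raise ZeroDivisionError if the eval set contains only answerable or only unanswerable features. A hedged helper (not part of the original code) that computes the same three metrics with guarded denominators could look like this:

def cls_metrics(tp, tn, fp, fn):
    """Answerability-classifier metrics with empty-denominator guards."""
    total = tp + tn + fp + fn
    return {
        "cls_accuracy": (tp + tn) / total if total else 0.0,
        "cls_no_ans_accuracy": tn / (tn + fp) if (tn + fp) else 0.0,
        "cls_has_ans_accuracy": tp / (tp + fn) if (tp + fn) else 0.0,
    }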
Exemple #23
def evaluate(args, model, tokenizer, prefix="", global_step=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in progress_bar(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "distilkobert",
                    "xlm-roberta"
            ]:
                del inputs["token_type_ids"]

            # Special case for the reforbert model type
            if args.model_type in ["reforbert"]:
                del inputs["attention_mask"]
            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    # Write the evaluation result to a file
    output_dir = os.path.join(args.output_dir, 'eval')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_eval_file = os.path.join(
        output_dir, "eval_result_{}_{}.txt".format(
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            global_step))

    logger.info("***** Official Eval results *****")
    with open(output_eval_file, "w", encoding='utf-8') as f:
        official_eval_results = eval_during_train(args)
        for key in sorted(official_eval_results.keys()):
            logger.info("  %s = %s", key, str(official_eval_results[key]))
            f.write(" {} = {}\n".format(key, str(official_eval_results[key])))
    return results
Exemple #24
    def predict(self, id_, question, paragraph_texts, paragraph_scores):

        # dataset, examples, features = load_and_cache_examples(self.args, self.tokenizer, evaluate=True, output_examples=True)

        # processor = SquadV2Processor() if self.args.version_2_with_negative else SquadV1Processor()
        # todo convert to single query examples
        examples = create_inference_examples(question,
                                             paragraph_texts,
                                             paragraph_scores,
                                             chinese=self.args.chinese,
                                             tokenizer=self.tokenizer)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.args.max_seq_length,
            doc_stride=self.args.doc_stride,
            max_query_length=self.args.max_query_length,
            is_training=False,
            return_dataset="pt",
            threads=self.args.threads,
            tqdm_enabled=False)

        # if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        #     os.makedirs(args.output_dir)

        self.args.eval_batch_size = self.args.per_gpu_eval_batch_size * max(
            1, self.args.n_gpu)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=self.args.eval_batch_size)

        # multi-gpu evaluate
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        # logger.info("***** Running evaluation {} *****".format(prefix))
        # logger.info("  Num examples = %d", len(dataset))
        # logger.info("  Batch size = %d", args.eval_batch_size)

        all_results = []
        # start_time = timeit.default_timer()

        for batch in eval_dataloader:
            self.model.eval()
            batch = tuple(t.to(self.args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                # if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                #     del inputs["token_type_ids"]

                feature_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                # if args.model_type in ["xlnet", "xlm"]:
                #     inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                #     # for lang_id-sensitive xlm models
                #     if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                #         inputs.update(
                #             {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                #         )

                outputs = self.model(**inputs)

            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    result = SquadResult(
                        unique_id,
                        start_logits,
                        end_logits,
                        start_top_index=start_top_index,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits,
                    )

                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        # Compute predictions
        prefix = ""
        output_prediction_file = os.path.join(
            self.args.output_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            self.args.output_dir, "nbest_predictions_{}.json".format(prefix))

        if self.args.version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                self.args.output_dir, "null_odds_{}.json".format(prefix))
        else:
            output_null_log_odds_file = None

        # XLNet and XLM use a more complex post-processing procedure
        if self.args.model_type in ["xlnet", "xlm"]:
            start_n_top = self.model.config.start_n_top if hasattr(
                self.model, "config") else self.model.module.config.start_n_top
            end_n_top = self.model.config.end_n_top if hasattr(
                self.model, "config") else self.model.module.config.end_n_top

            answers, nbest_answers = compute_predictions_log_probs(
                examples, features, all_results, self.args.n_best_size,
                self.args.max_answer_length, output_prediction_file,
                output_nbest_file, output_null_log_odds_file, start_n_top,
                end_n_top, self.args.version_2_with_negative, self.tokenizer,
                self.args.verbose_logging, self.args.chinese)
        else:
            answers, nbest_answers = compute_predictions_logits(
                examples, features, all_results, self.args.n_best_size,
                self.args.max_answer_length, self.args.do_lower_case,
                output_prediction_file, output_nbest_file,
                output_null_log_odds_file, self.args.verbose_logging,
                self.args.version_2_with_negative,
                self.args.null_score_diff_threshold, self.tokenizer,
                self.args.chinese)

        all_answers = []
        for answer_id, ans in enumerate(answers):
            ans_dict = {
                "id": id_,
                "answer": answers[ans][0],
                "phrase_score": answers[ans][1],
                "paragraph_score": paragraph_scores[answer_id],
            }
            all_answers.append(ans_dict)
        return all_answers
Exemple #25
def evaluate(args, model_path1, model1, model2, model3, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          model_path1,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model1, torch.nn.DataParallel):
        model1 = torch.nn.DataParallel(model1)

    if args.n_gpu > 1 and not isinstance(model2, torch.nn.DataParallel):
        model2 = torch.nn.DataParallel(model2)

    if args.n_gpu > 1 and not isinstance(model3, torch.nn.DataParallel):
        model3 = torch.nn.DataParallel(model3)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model1.eval()
        model2.eval()
        model3.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            feature_indices = batch[3]

            outputs1 = model1(**inputs)
            outputs2 = model2(**inputs)
            outputs3 = model3(**inputs)
            # print("outputs1", outputs1)
        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output1 = [to_list(output1[i]) for output1 in outputs1]
            # print("output1", output1)
            # print("len(output1)", len(output1[0]))
            output2 = [to_list(output2[i]) for output2 in outputs2]
            output3 = [to_list(output3[i]) for output3 in outputs3]

            start_logits1, end_logits1 = output1
            start_logits2, end_logits2 = output2
            start_logits3, end_logits3 = output3

            # Ensemble option 1: weighted sum of the three models' logits
            weights = [0.4, 0.2, 0.4]
            start_logits = [
                weights[0] * log1 + weights[1] * log2 + weights[2] * log3
                for log1, log2, log3 in zip(start_logits1, start_logits2,
                                            start_logits3)
            ]
            end_logits = [
                weights[0] * log1 + weights[1] * log2 + weights[2] * log3 for
                log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3)
            ]
            # # Ensemble option 2: arithmetic mean of the logits
            # start_logits = [
            #     (log1 + log2 + log3)/3
            #     for log1, log2, log3 in zip(start_logits1, start_logits2, start_logits3)
            # ]
            # end_logits = [
            #     (log1 + log2 + log3) / 3
            #     for log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3)
            # ]
            # # Ensemble option 3: element-wise (position-wise) max of the logits
            # start_logits = [
            #     max(log1, log2, log3)
            #     for log1, log2, log3 in zip(start_logits1, start_logits2, start_logits3)
            # ]
            # end_logits = [
            #     max(log1, log2, log3)
            #     for log1, log2, log3 in zip(end_logits1, end_logits2, end_logits3)
            # ]

            # print("start_logits1", start_logits1[0])
            # print("start_logits2", start_logits2[0])
            # print("start_logits3", start_logits3[0])
            # print("start_logits", start_logits[0])
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(
        "  Evaluation done in total %f secs (%f sec per example)",
        evalTime,
        evalTime / len(dataset),
    )

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
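The comments above sketch three ways to combine the three models' logits (weighted sum, arithmetic mean, element-wise max). A compact numpy sketch of all three on toy per-token logits:

import numpy as np

logits = np.array([[0.2, 1.5, -0.3],   # model 1
                   [0.1, 1.2,  0.4],   # model 2
                   [0.5, 0.9, -0.1]])  # model 3
weights = np.array([0.4, 0.2, 0.4])

weighted_sum = (weights[:, None] * logits).sum(axis=0)   # matches the active branch above
arithmetic_mean = logits.mean(axis=0)
elementwise_max = logits.max(axis=0)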
Exemple #26
def evaluate_ensemble(args, checkpoints, tokenizer, model_class, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate: each ensemble member is wrapped in DataParallel after loading (see below)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    model_list = []
    for ckpt in checkpoints:
        logger.info("Evaluate the following fine_tuned_model: %s", ckpt)
        model = model_class.from_pretrained(ckpt)
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)
        model_list.append(model)

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                raise NotImplementedError
                # inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # # for lang_id-sensitive xlm models
                # if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                #     inputs.update(
                #         {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                #     )

        outputs_list = []
        for model in model_list:
            model.to(args.device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs)
            outputs_list.append(outputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            start_logits_list, end_logits_list = [], []
            for outputs in outputs_list:
                output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
                if len(output) >= 5:
                    raise NotImplementedError
                    # start_logits = output[0]
                    # start_top_index = output[1]
                    # end_logits = output[2]
                    # end_top_index = output[3]
                    # cls_logits = output[4]

                    # result = SquadResult(
                    #     unique_id,
                    #     start_logits,
                    #     end_logits,
                    #     start_top_index=start_top_index,
                    #     end_top_index=end_top_index,
                    #     cls_logits=cls_logits,
                    # )

                else:
                    start_logits, end_logits = output
                    start_logits_list.append(start_logits)
                    end_logits_list.append(end_logits)
                    
            if args.model_type in ["xlnet", "xlm"]:
                raise NotImplementedError
            else:
                start_logits_list = np.array(start_logits_list)
                end_logits_list = np.array(end_logits_list)
                #Ensembling method (eg max/avg/etc)
                start_logits = list(start_logits_list.mean(axis=0))
                end_logits = list(end_logits_list.mean(axis=0))
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        raise NotImplementedError
        # start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        # end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
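evaluate_ensemble expects a list of checkpoint directories. One common way to build it, mirroring the checkpoint-discovery pattern in run_squad.py (hedged: args.output_dir and the eval_all_checkpoints flag are assumptions here):

import glob
import os

from transformers import WEIGHTS_NAME  # "pytorch_model.bin"

checkpoints = [args.output_dir]
if getattr(args, "eval_all_checkpoints", False):
    checkpoints = sorted(
        os.path.dirname(path)
        for path in glob.glob(os.path.join(args.output_dir, "**", WEIGHTS_NAME), recursive=True)
    )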
Exemple #27
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    dataset_cached = "./dataset_cached"
    if not os.path.exists(dataset_cached):
        os.makedirs(dataset_cached)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    calibration_iteration = int(
        (len(dataset) * 0.05 + args.eval_batch_size - 1) /
        args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    print("  Batch size = %d" % args.eval_batch_size)

    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils
        model = mkldnn_utils.to_mkldnn(model)
        print(model)

    all_results = []
    evalTime = 0
    nb_eval_steps = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        if calibration and nb_eval_steps >= calibration_iteration:
            break

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

            if args.model_type != 'distilbert':
                # XLM doesn't use segment_ids
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            if nb_eval_steps >= args.warmup:
                start_time = timeit.default_timer()
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

        if nb_eval_steps >= args.warmup:
            evalTime += (timeit.default_timer() - start_time)

        nb_eval_steps += 1

        if args.iter > 0 and nb_eval_steps >= (args.warmup + args.iter):
            break

    if nb_eval_steps > args.warmup:
        # Throughput is measured over the timed (post-warmup) steps only.
        perf = (nb_eval_steps - args.warmup) * args.eval_batch_size / evalTime
        if args.eval_batch_size == 1:
            print('Latency: %.3f ms' % (evalTime /
                                        (nb_eval_steps - args.warmup) * 1000))
        print("Evaluation done in total %f secs (Throughput: %f samples/sec)" %
              (evalTime, perf))
    else:
        perf = None
        logger.info(
            "***** No performance numbers: check the dataset length and the warmup count *****"
        )
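    # Illustrative throughput arithmetic (assumed numbers, not measured here):
    # with eval_batch_size=8, warmup=5, nb_eval_steps=105 and evalTime=20.0 s,
    # perf = (105 - 5) * 8 / 20.0 = 40 samples/sec.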

    # Compute predictions
    output_prediction_file = os.path.join(dataset_cached,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        dataset_cached, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    elif not calibration and args.iter == 0:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    if not calibration and args.iter == 0:
        results = squad_evaluate(examples, predictions)
        bert_task_acc_keys = ['best_f1', 'f1', 'mcc', 'spearmanr', 'acc']
        for key in bert_task_acc_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
    else:
        results = None
    return results, perf
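
# Usage sketch (not part of the example above): a minimal `args` namespace.
# Every attribute below is read by the evaluate() variant above; the concrete
# values are illustrative assumptions, not values taken from this document.
from argparse import Namespace
import torch

sketch_args = Namespace(
    per_gpu_eval_batch_size=8,      # multiplied by max(1, n_gpu) to get eval_batch_size
    n_gpu=torch.cuda.device_count(),
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    mkldnn_eval=False,              # True converts the model via torch.utils.mkldnn
    warmup=5,                       # batches excluded from the timing window
    iter=0,                         # 0 = run the whole dataset and compute accuracy
    model_type="bert",
    n_best_size=20,
    max_answer_length=30,
    do_lower_case=True,
    verbose_logging=False,
    version_2_with_negative=False,
    null_score_diff_threshold=0.0,
)
# `sketch_args` would be passed as `args`, alongside a model, tokenizer,
# dataset, features and examples prepared elsewhere in the script.
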
Exemple #28
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    eval_dataset, features, examples = load_and_cache_examples(
        args, tokenizer, labels, pad_token_label_id, mode=mode
    )

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = (
        SequentialSampler(eval_dataset)
        if args.local_rank == -1
        else DistributedSampler(eval_dataset)
    )
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
    )

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    all_results = []
    start_time = timeit.default_timer()
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            # Bounding boxes and segment ids for layout-aware models (e.g. LayoutLM-style inputs)
            inputs["bbox"] = batch[5]
            inputs["token_type_ids"] = batch[6]
            outputs = model(**inputs)
            example_indices = batch[7]
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)
    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        20,                 # n_best_size
        30,                 # max_answer_length
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        True,               # verbose_logging
        True,               # version_2_with_negative
        0.0,                # null_score_diff_threshold
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
def evaluate(args, model, tokenizer, prefix="", adapter_names=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "adapter_names": adapter_names,
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
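
# Standalone illustration of the `langs` tensor built for lang_id-sensitive XLM
# models in the loop above. Shapes and the lang_id value are assumptions made
# only for this sketch; the real script takes lang_id from args.lang_id.
import torch

input_ids = torch.zeros(2, 6, dtype=torch.long)   # stand-in for batch[0]
lang_id = 4                                       # stands in for a model.config.lang2id entry; arbitrary here
langs = torch.ones(input_ids.shape, dtype=torch.int64) * lang_id
print(langs.shape)                                # torch.Size([2, 6]) -- one language id per token position
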
def generate_model_outputs(args,
                           model,
                           tokenizer,
                           is_dev=False,
                           prefix='',
                           save_dir=''):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=is_dev,
                                                          output_examples=True)
    logger.info(
        f'REAL number of examples {len(examples)} and features {len(features)}!'
    )

    if not os.path.exists(save_dir) and args.local_rank in [-1, 0]:
        os.makedirs(save_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
                            sampler=sampler,
                            batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Output!
    logger.info("***** Generating outputs {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    # all_results = collections.defaultdict(list)
    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    print('# of results in all_results:', len(all_results))
    # Save features
    with open(os.path.join(save_dir, 'features.pkl'), 'wb') as f:
        pickle.dump(features, f)

    # Save all_results
    with open(os.path.join(save_dir, 'all_results.pkl'), 'wb') as f:
        pickle.dump(all_results, f)

    # Save tokenizer
    with open(os.path.join(save_dir, 'tokenizer.pkl'), 'wb') as f:
        pickle.dump(tokenizer, f)

    json_to_save = {
        'model_name': args.name,
        'type': 'dev' if is_dev else 'train',
        'num_examples': len(examples),
        'num_features': len(features)
    }
    util.save_json_file(os.path.join(save_dir, 'config.json'), json_to_save)
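
# A hedged sketch of reading back the artifacts written by generate_model_outputs().
# The paths mirror the pickles saved above; `examples` is not pickled there, so it
# would have to be rebuilt with the same load_and_cache_examples() call before
# computing predictions.
import os
import pickle

def load_saved_outputs(save_dir):
    """Load the features, SquadResults and tokenizer pickled by generate_model_outputs()."""
    with open(os.path.join(save_dir, 'features.pkl'), 'rb') as f:
        features = pickle.load(f)
    with open(os.path.join(save_dir, 'all_results.pkl'), 'rb') as f:
        all_results = pickle.load(f)
    with open(os.path.join(save_dir, 'tokenizer.pkl'), 'rb') as f:
        tokenizer = pickle.load(f)
    return features, all_results, tokenizer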