Example #1
    async def _custom_accuracy(self, examples, features, dataset, prefix=""):

        if (not os.path.exists(self.parent.config.output_dir)
                and self.parent.config.local_rank in [-1, 0]):
            os.makedirs(self.parent.config.output_dir)

        self.parent.config.eval_batch_size = (
            self.parent.config.per_gpu_eval_batch_size *
            max(1, self.parent.config.n_gpu))

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(
            dataset,
            sampler=eval_sampler,
            batch_size=self.parent.config.eval_batch_size,
        )

        # multi-gpu evaluate
        if self.parent.config.n_gpu > 1 and not isinstance(
                self.model, torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.parent.config.eval_batch_size)

        all_results = []
        start_time = timeit.default_timer()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            self.model.eval()
            batch = tuple(t.to(self.parent.config.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.parent.config.model_type in [
                        "xlm",
                        "roberta",
                        "distilbert",
                        "camembert",
                ]:
                    del inputs["token_type_ids"]

                feature_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.parent.config.model_type in ["xlnet", "xlm"]:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                    # for lang_id-sensitive xlm models
                    if hasattr(self.model, "config") and hasattr(
                            self.model.config, "lang2id"):
                        inputs.update({
                            "langs":
                            (torch.ones(batch[0].shape, dtype=torch.int64) *
                             self.parent.config.lang_id).to(
                                 self.parent.config.device)
                        })

                outputs = self.model(**inputs)

            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [self.to_list(output[i]) for output in outputs]

                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    result = SquadResult(
                        unique_id,
                        start_logits,
                        end_logits,
                        start_top_index=start_top_index,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits,
                    )
                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        evalTime = timeit.default_timer() - start_time
        logger.info(
            "  Evaluation done in total %f secs (%f sec per example)",
            evalTime,
            evalTime / len(dataset),
        )

        # Compute predictions
        output_prediction_file = os.path.join(
            self.parent.config.output_dir,
            "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            self.parent.config.output_dir,
            "nbest_predictions_{}.json".format(prefix),
        )

        # XLNet and XLM use a more complex post-processing procedure
        if self.parent.config.model_type in ["xlnet", "xlm"]:
            start_n_top = (self.model.config.start_n_top if hasattr(
                self.model, "config") else
                           self.model.module.config.start_n_top)
            end_n_top = (self.model.config.end_n_top if hasattr(
                self.model, "config") else self.model.module.config.end_n_top)

            predictions = compute_predictions_log_probs(
                examples,
                features,
                all_results,
                self.parent.config.n_best_size,
                self.parent.config.max_answer_length,
                output_prediction_file,
                output_nbest_file,
                None,  # output_null_log_odds_file
                start_n_top,
                end_n_top,
                False,  # version_2_with_negative
                self.tokenizer,
                True,  # verbose_logging
            )
        else:
            predictions = compute_predictions_logits(
                examples,
                features,
                all_results,
                self.parent.config.n_best_size,
                self.parent.config.max_answer_length,
                self.parent.config.do_lower_case,
                output_prediction_file,
                output_nbest_file,
                None,  # output_null_log_odds_file
                True,  # verbose_logging
                False,  # version_2_with_negative
                self.parent.config.null_score_diff_threshold,
                self.tokenizer,
            )

        return predictions
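Note: these snippets call a small `to_list` helper (invoked as `self.to_list` here and as a free function in the other examples) without showing it. A minimal sketch, assuming it follows the upstream `run_squad.py` convention of detaching a tensor and converting it to a nested Python list:

import torch


def to_list(tensor: torch.Tensor) -> list:
    # Detach from the graph, move to CPU, and convert to a plain Python list
    # so the logits can be fed to the SQuAD post-processing utilities.
    return tensor.detach().cpu().tolist()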
Example #2
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    dataset_cached = "./dataset_cached"
    if not os.path.exists(dataset_cached):
        os.makedirs(dataset_cached)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    calibration_iteration = int(
        (len(dataset) * 0.05 + args.eval_batch_size - 1) /
        args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    print("  Batch size = %d" % args.eval_batch_size)

    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils
        model = mkldnn_utils.to_mkldnn(model)
        print(model)

    all_results = []
    evalTime = 0
    nb_eval_steps = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        if calibration and nb_eval_steps >= calibration_iteration:
            break

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

            if args.model_type != 'distilbert':
                # XLM doesn't use segment_ids
                inputs['token_type_ids'] = (None if args.model_type == 'xlm'
                                            else batch[2])

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            if nb_eval_steps >= args.warmup:
                start_time = timeit.default_timer()
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

        if nb_eval_steps >= args.warmup:
            evalTime += (timeit.default_timer() - start_time)

        nb_eval_steps += 1

        if args.iter > 0 and nb_eval_steps >= (args.warmup + args.iter):
            break

    if nb_eval_steps >= args.warmup:
        perf = (nb_eval_steps - args.warmup) * args.eval_batch_size / evalTime
        if args.eval_batch_size == 1:
            print('Latency: %.3f ms' % (evalTime /
                                        (nb_eval_steps - args.warmup) * 1000))
        print("Evaluation done in total %f secs (Throughput: %f samples/sec)" %
              (evalTime, perf))
    else:
        perf = 0.0  # no timed steps, so there is no throughput to report
        logger.info(
            "***** No performance numbers: check the dataset length and the warmup count *****"
        )

    # Compute predictions
    output_prediction_file = os.path.join(dataset_cached,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        dataset_cached, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    elif not calibration and args.iter == 0:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    if not calibration and args.iter == 0:
        results = squad_evaluate(examples, predictions)
        bert_task_acc_keys = ['best_f1', 'f1', 'mcc', 'spearmanr', 'acc']
        for key in bert_task_acc_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
    else:
        results = None
    return results, perf
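For reference, the performance numbers above reduce to simple arithmetic: throughput = timed_steps * batch_size / elapsed_seconds, and per-batch latency = elapsed_seconds / timed_steps. A minimal sketch with invented numbers (not measured results):

# Illustrative values only: 100 timed steps after warmup, batch size 8, 20 s elapsed.
timed_steps = 100
batch_size = 8
elapsed_secs = 20.0

throughput = timed_steps * batch_size / elapsed_secs  # 40.0 samples/sec
latency_ms = elapsed_secs / timed_steps * 1000  # 200.0 ms per batch
print("Throughput: %.1f samples/sec, latency: %.1f ms" % (throughput, latency_ms))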
Example #3
def evaluate(args, model, tokenizer, device, prefix=""):
    eval_dataset, examples, features = data.load_and_cache_examples(
        args.validation,
        tokenizer,
        args,
        evaluate=True,
        output_examples=True,
    )
    eval_dataloader = data.get_dataloader(eval_dataset,
                                          args.per_gpu_eval_batch_size,
                                          evaluate=True)

    all_results = []
    start_time = timeit.default_timer()
    eval_batches = 0

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        eval_batches += 1

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime,
                evalTime / (eval_batches * args.per_gpu_eval_batch_size))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_data_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_data_dir, "nbest_predictions_{}.json".format(prefix))

    if args.has_unanswerable:
        output_null_log_odds_file = os.path.join(
            args.output_data_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = squad_metrics.compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_len,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.has_unanswerable,
            tokenizer,
            logger.level < logging.INFO,
        )
    else:
        predictions = squad_metrics.compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_len,
            args.uncased_model,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            logger.level < logging.INFO,
            args.has_unanswerable,
            args.null_score_diff_thresh,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_metrics.squad_evaluate(examples, predictions)
    return results
Example #4
    def find_answer(self,
                    question,
                    context,
                    n_best_size=20,
                    max_answer_length=30,
                    full_sentence=False):
        # heavily inspired by "https://github.com/huggingface/transformers/blob/v2.3.0/examples/run_squad.py#L212-L317"
        example_id = '55555'
        example = SquadExample(example_id, question, context, None, None, None)

        features, dataset = squad_convert_examples_to_features(
            [example],
            self.tokenizer,
            self.max_seq_length,
            self.doc_stride,
            self.max_query_length,
            False,
            return_dataset='pt')

        sampler = SequentialSampler(dataset)
        dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)

        all_results = []
        for batch in dataloader:
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.model_type in {"xlm", "roberta", "distilbert"}:
                    del inputs["token_type_ids"]

                example_index = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.model_type in {"xlnet", "xlm"}:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

                outputs = self.model(**inputs)
                output = [o.detach().cpu().tolist() for o in outputs]

                unique_id = int(features[example_index].unique_id)

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    squad_result = SquadResult(
                        unique_id,
                        start_logits[0],
                        end_logits[0],
                        start_top_index=start_top_index[0],
                        end_top_index=end_top_index[0],
                        cls_logits=cls_logits[0],
                    )

                else:
                    start_logits, end_logits = output
                    squad_result = SquadResult(unique_id, start_logits[0],
                                               end_logits[0])

                all_results.append(squad_result)

        # XLNet and XLM use a more complex post-processing procedure
        if self.model_type in {"xlnet", "xlm"}:
            if hasattr(self.model, "config"):
                start_n_top = self.model.config.start_n_top
                end_n_top = self.model.config.end_n_top
            else:
                start_n_top = self.model.module.config.start_n_top
                end_n_top = self.model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                [example],
                features,
                all_results,
                n_best_size,
                max_answer_length,
                '/tmp/pred.out',
                '/tmp/nbest.out',
                '/tmp/null.out',
                start_n_top,
                end_n_top,
                self.version_2_with_negative,
                self.tokenizer,
                self.verbose,
            )
        else:
            predictions = compute_predictions_logits(
                [example],
                features,
                all_results,
                n_best_size,
                max_answer_length,
                self.do_lower_case,
                '/tmp/pred.out',
                '/tmp/nbest.out',
                '/tmp/null.out',
                self.verbose,
                self.version_2_with_negative,
                self.null_score_diff_threshold,
            )

        prediction = predictions[example_id]

        logger.debug(f'found prediction: "{prediction}"')

        # empty prediction indicates unknown answer
        if not prediction:
            logger.debug('empty prediction')
            return None

        if full_sentence:
            doc = self.nlp(context)
            for sent in doc.sents:
                if prediction in sent.text:
                    prediction = sent.text
                    break

        return prediction
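The `full_sentence` branch above expands a span answer to its containing sentence with spaCy. A standalone sketch of just that step, assuming the `en_core_web_sm` pipeline is installed (an assumption; the snippet only requires `self.nlp` to be some loaded spaCy pipeline):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model name
context = "The Eiffel Tower is in Paris. It was completed in 1889."
prediction = "1889"

doc = nlp(context)
for sent in doc.sents:
    if prediction in sent.text:
        prediction = sent.text  # expand the span to the whole sentence
        break
print(prediction)  # -> "It was completed in 1889."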
Example #5
    def predict(self, id_, question, paragraph_texts, paragraph_scores):

        # dataset, examples, features = load_and_cache_examples(self.args, self.tokenizer, evaluate=True, output_examples=True)

        # processor = SquadV2Processor() if self.args.version_2_with_negative else SquadV1Processor()
        # todo convert to single query examples
        examples = create_inference_examples(question,
                                             paragraph_texts,
                                             paragraph_scores,
                                             chinese=self.args.chinese,
                                             tokenizer=self.tokenizer)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.args.max_seq_length,
            doc_stride=self.args.doc_stride,
            max_query_length=self.args.max_query_length,
            is_training=False,  # inference only, no training features needed
            return_dataset="pt",
            threads=self.args.threads,
            tqdm_enabled=False)

        # if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        #     os.makedirs(args.output_dir)

        self.args.eval_batch_size = self.args.per_gpu_eval_batch_size * max(
            1, self.args.n_gpu)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=self.args.eval_batch_size)

        # multi-gpu evaluate
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        # logger.info("***** Running evaluation {} *****".format(prefix))
        # logger.info("  Num examples = %d", len(dataset))
        # logger.info("  Batch size = %d", args.eval_batch_size)

        all_results = []
        # start_time = timeit.default_timer()

        for batch in eval_dataloader:
            self.model.eval()
            batch = tuple(t.to(self.args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                # if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                #     del inputs["token_type_ids"]

                feature_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                # if args.model_type in ["xlnet", "xlm"]:
                #     inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                #     # for lang_id-sensitive xlm models
                #     if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                #         inputs.update(
                #             {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                #         )

                outputs = self.model(**inputs)

            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    result = SquadResult(
                        unique_id,
                        start_logits,
                        end_logits,
                        start_top_index=start_top_index,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits,
                    )

                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        # Compute predictions
        prefix = ""
        output_prediction_file = os.path.join(
            self.args.output_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            self.args.output_dir, "nbest_predictions_{}.json".format(prefix))

        if self.args.version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                self.args.output_dir, "null_odds_{}.json".format(prefix))
        else:
            output_null_log_odds_file = None

        # XLNet and XLM use a more complex post-processing procedure
        if self.args.model_type in ["xlnet", "xlm"]:
            start_n_top = self.model.config.start_n_top if hasattr(
                self.model, "config") else self.model.module.config.start_n_top
            end_n_top = self.model.config.end_n_top if hasattr(
                self.model, "config") else self.model.module.config.end_n_top

            answers, nbest_answers = compute_predictions_log_probs(
                examples, features, all_results, self.args.n_best_size,
                self.args.max_answer_length, output_prediction_file,
                output_nbest_file, output_null_log_odds_file, start_n_top,
                end_n_top, self.args.version_2_with_negative, self.tokenizer,
                self.args.verbose_logging, self.args.chinese)
        else:
            answers, nbest_answers = compute_predictions_logits(
                examples, features, all_results, self.args.n_best_size,
                self.args.max_answer_length, self.args.do_lower_case,
                output_prediction_file, output_nbest_file,
                output_null_log_odds_file, self.args.verbose_logging,
                self.args.version_2_with_negative,
                self.args.null_score_diff_threshold, self.tokenizer,
                self.args.chinese)

        all_answers = []
        for answer_id, ans in enumerate(answers):
            ans_dict = {
                "id": id_,
                "answer": answers[ans][0],
                "phrase_score": answers[ans][1],
                "paragraph_score": paragraph_scores[answer_id],
            }
            all_answers.append(ans_dict)
        return all_answers
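For clarity, the value returned by `predict` above is a list with one dict per paragraph; a sketch of the expected shape (all values invented for illustration):

# Illustrative shape of the return value of predict(); values are invented.
all_answers = [
    {
        "id": "q-001",            # the id_ passed in
        "answer": "Paris",        # best answer text for this paragraph
        "phrase_score": 0.92,     # score of the answer span
        "paragraph_score": 13.4,  # retrieval score passed in
    },
]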
Example #6
def evaluate_ensemble(args, checkpoints, tokenizer, model_class, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate: each ensemble member is wrapped after it is loaded below

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    model_list = []
    for ckpt in checkpoints:
        logger.info("Evaluate the following fine_tuned_model: %s", ckpt)
        model = model_class.from_pretrained(ckpt)
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)
        model_list.append(model)

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                raise NotImplementedError
                # inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # # for lang_id-sensitive xlm models
                # if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                #     inputs.update(
                #         {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                #     )

        outputs_list = []
        for model in model_list:
            model.to(args.device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs)
            outputs_list.append(outputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            start_logits_list, end_logits_list = [], []
            for outputs in outputs_list:
                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    raise NotImplementedError
                    # start_logits = output[0]
                    # start_top_index = output[1]
                    # end_logits = output[2]
                    # end_top_index = output[3]
                    # cls_logits = output[4]

                    # result = SquadResult(
                    #     unique_id,
                    #     start_logits,
                    #     end_logits,
                    #     start_top_index=start_top_index,
                    #     end_top_index=end_top_index,
                    #     cls_logits=cls_logits,
                    # )

                else:
                    start_logits, end_logits = output
                    start_logits_list.append(start_logits)
                    end_logits_list.append(end_logits)
                    
            if args.model_type in ["xlnet", "xlm"]:
                raise NotImplementedError
            else:
                start_logits_list = np.array(start_logits_list)
                end_logits_list = np.array(end_logits_list)
                # Ensembling method (e.g. max/avg): here the member logits are averaged.
                start_logits = list(start_logits_list.mean(axis=0))
                end_logits = list(end_logits_list.mean(axis=0))
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        raise NotImplementedError
        # start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        # end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
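The ensembling step above is an element-wise mean of each member's start/end logits for a given feature. A standalone sketch of just that averaging (values invented):

import numpy as np

# One row of start logits per ensemble member, for a single feature.
start_logits_list = np.array([
    [0.1, 2.0, -1.0],  # model 1
    [0.3, 1.6, -0.8],  # model 2
])
start_logits = list(start_logits_list.mean(axis=0))
print(start_logits)  # ≈ [0.2, 1.8, -0.9]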
Example #7
    def test_epoch_end(self, outputs):
        example_indices = torch.cat([x["example_indices"] for x in outputs
                                     ]).detach().cpu().tolist()
        start_logits = torch.cat([x["start_logits"]
                                  for x in outputs]).detach().cpu().tolist()
        end_logits = torch.cat([x["end_logits"]
                                for x in outputs]).detach().cpu().tolist()

        if "cls_logits" in list(outputs[0].keys()):
            start_top_index = torch.cat([
                x["start_top_index"] for x in outputs
            ]).detach().cpu().tolist()
            end_top_index = torch.cat([x["end_top_index"] for x in outputs
                                       ]).detach().cpu().tolist()
            cls_logits = torch.cat([x["cls_logits"]
                                    for x in outputs]).detach().cpu().tolist()

        examples = self.trainer.datamodule.test_examples
        features = self.trainer.datamodule.test_features

        all_results = []
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index]
            unique_id = int(eval_feature.unique_id)

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            from transformers.data.processors.squad import SquadResult
            if "cls_logits" in list(outputs[0].keys()):
                result = SquadResult(
                    unique_id,
                    start_logits[i],
                    end_logits[i],
                    start_top_index=start_top_index[i],
                    end_top_index=end_top_index[i],
                    cls_logits=cls_logits[i],
                )

            else:
                result = SquadResult(unique_id, start_logits[i], end_logits[i])

            all_results.append(result)

        # Compute predictions
        output_prediction_file = os.path.join(
            self.trainer.checkpoint_callback.dirpath, "predictions_eval.json")
        output_nbest_file = os.path.join(
            self.trainer.checkpoint_callback.dirpath,
            "nbest_predictions_eval.json")

        if self.version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                self.trainer.checkpoint_callback.dirpath,
                "null_odds_eval.json")
        else:
            output_null_log_odds_file = None

        # XLNet and XLM use a more complex post-processing procedure
        if self.hparams.model_type in ["xlnet", "xlm"]:
            start_n_top = self.model.config.start_n_top if hasattr(
                self.model, "config") else self.model.module.config.start_n_top
            end_n_top = self.model.config.end_n_top if hasattr(
                self.model, "config") else self.model.module.config.end_n_top

            from transformers.data.metrics.squad_metrics import compute_predictions_log_probs
            predictions = compute_predictions_log_probs(
                examples,
                features,
                all_results,
                self.hparams.n_best_size,
                self.hparams.max_answer_length,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                start_n_top,
                end_n_top,
                self.version_2_with_negative,
                self.trainer.datamodule.tokenizer,
                False,  # do not use verbose logging
            )
        else:
            from transformers.data.metrics.squad_metrics import compute_predictions_logits
            predictions = compute_predictions_logits(
                examples,
                features,
                all_results,
                self.hparams.n_best_size,
                self.hparams.max_answer_length,
                self.hparams.do_lower_case,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                False,  # do not use verbose logging
                self.version_2_with_negative,
                self.hparams.null_score_diff_threshold,
                self.trainer.datamodule.tokenizer)

        # Compute the F1 and exact scores.
        from transformers.data.metrics.squad_metrics import squad_evaluate
        results = squad_evaluate(examples, predictions)
        return results
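`test_epoch_end` above assumes each `test_step` output dict carries `example_indices`, `start_logits`, and `end_logits` (plus the XLNet/XLM extras). A hedged sketch of a matching `test_step` for the simple two-logit case, assuming the batch layout used by the dataloaders in the other examples and a model that returns a (start_logits, end_logits) tuple:

    def test_step(self, batch, batch_idx):
        # Assumed batch layout: input_ids, attention_mask, token_type_ids, example_indices.
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }
        outputs = self.model(**inputs)  # assumed tuple output (start_logits, end_logits)
        return {
            "example_indices": batch[3],
            "start_logits": outputs[0],
            "end_logits": outputs[1],
        }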
Example #8
def evaluate(args, config, model, tokenizer, prefix="", global_step=0):
    dataset, examples, features = load_and_cache_examples(args,
                                                          config,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    config['eval']['eval_batch_size'] = (
        config.eval.per_gpu_eval_batch_size * max(1, args.n_gpu))

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=config.eval.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", config.eval.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if config.model.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if config.model.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         config.input.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(config.output.log_dir,
                                          f"predictions_{prefix}.json")
    output_nbest_file = os.path.join(
        config.output.log_dir,
        f"nbest_{config.model.n_best_size}_predictions_{prefix}.json")

    if config.input.version_2_with_negative:
        output_null_log_odds_file = os.path.join(config.output.log_dir,
                                                 f"null_odds_{prefix}.json")
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if config.model.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            config.model.n_best_size,
            config.model.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            config.input.version_2_with_negative,
            tokenizer,
            config.output.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            config.model.n_best_size,
            config.model.max_answer_length,
            config.model.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            config.output.verbose_logging,
            config.input.version_2_with_negative,
            config.model.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Save eval results to output file as well
    if prefix == "-1":
        # evaluate at the end of training, store in the log_dir directly
        output_eval_file = os.path.join(config.output.log_dir,
                                        "eval_results.tsv")
    else:
        # there is a 'prefix' subfolder
        output_eval_file = os.path.join(config.output.log_dir, prefix,
                                        "eval_results.tsv")

    if not os.path.exists(
            output_eval_file):  # file does not exist yet. write header first
        with open(output_eval_file, "a") as writer:
            writer.write("global_step\t" + "\t".join(results.keys()) +
                         "\n")  # write header

    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
        line = [str(global_step)] + [str(r) for r in results.values()]
        writer.write("\t".join(line) + "\n")

    return results
Example #9
def evaluate(args: Args,
             model,
             tokenizer,
             dataset,
             examples,
             features,
             suffix="",
             return_raw=False):
    if args.no_cuda is None:
        args.no_cuda = not _is_gpu_available()
    if args.predictions_folder:
        assert args.eval_file, "Need name of the eval file to save predictions!"
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = 0 if args.no_cuda else torch.cuda.device_count()

    eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu)

    # Note that DistributedSampler samples randomly

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)
    model.to(device)
    # multi-gpu evaluate
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    click.echo(
        f"Generating predictions for model {click.style(args.model_path, fg='blue')}, "
        f"running on {click.style(str(device), fg='green')}")
    click.echo("  Num examples = %d" % len(dataset))
    click.echo("  Batch size = %d" % eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(device)
                    })
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    eval_time = timeit.default_timer() - start_time
    logger.info(
        f"Evaluation done in total {eval_time} secs ({eval_time / len(dataset)} sec per example)"
    )
    eval_file = args.eval_file
    predictions_folder = args.predictions_folder
    v2 = args.v2
    if predictions_folder:
        out_file = get_output_predictions_file_name(eval_file,
                                                    predictions_folder, suffix)
        logger.info(f"Saving predictions in {out_file}")

        # Compute predictions
        file_name = os.path.basename(out_file)
        output_prediction_file = os.path.join(predictions_folder, file_name)
        # output_nbest_file = os.path.join(predictions_folder, f"nbest-{file_name}")
        output_nbest_file = None

        if v2:
            output_null_log_odds_file = os.path.join(predictions_folder,
                                                     f"null-odds-{file_name}")
        else:
            output_null_log_odds_file = None
    else:
        logger.info("Not saving predictions...")
        output_prediction_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.v2,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.v2,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    # results = squad_evaluate(examples, predictions)
    # return results
    if return_raw:
        return predictions
    else:
        return squad_evaluate(examples, predictions)
Example #10
    def answer_question(self, ranked_examples):
        squad_examples = [SquadExample(
            qas_id=str(x['id']),
            question_text=x['question'],
            context_text=x['document'],
            answer_text=None,
            start_position_character=None,
            title='',
            answers=[],
        ) for x in ranked_examples]

        squad_features, squad_dataset = squad_convert_examples_to_features(
            examples=squad_examples,
            tokenizer=self.tokenizer,
            max_seq_length=512,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=cpu_count(),
        )

        eval_batch_size = self.per_gpu_eval_batch_size * max(1, self.n_gpu)
        eval_sampler = SequentialSampler(squad_dataset)
        eval_dataloader = DataLoader(squad_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

        # multi-gpu evaluate
        if self.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        logger.info("***** Running evaluation of QA *****")
        logger.info("  Num examples = %d", len(squad_dataset))
        logger.info("  Batch size = %d", eval_batch_size)

        all_results = []

        for batch in tqdm(eval_dataloader, desc="Evaluating reader"):
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.model_type in ["xlm", "roberta", "distilbert"]:
                    del inputs["token_type_ids"]

                example_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.model_type in ["xlnet", "xlm"]:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                    # for lang_id-sensitive xlm models
                    if hasattr(self.model, "config") and hasattr(self.model.config, "lang2id"):
                        inputs.update(
                            {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * self.lang_id).to(self.device)}
                        )

                outputs = self.model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = squad_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    result = SquadResult(
                        unique_id,
                        start_logits,
                        end_logits,
                        start_top_index=start_top_index,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits,
                    )

                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        # Compute predictions
        output_prediction_file = os.path.join(self.model_name_or_path, "predictions.json")
        output_nbest_file = os.path.join(self.model_name_or_path, "nbest_predictions.json")

        # version_2_with_negative is assumed below, so always write the null-odds file
        output_null_log_odds_file = os.path.join(self.model_name_or_path, "null_odds.json")

        # XLNet and XLM use a more complex post-processing procedure

        if self.model_type in ["xlnet", "xlm"]:
            start_n_top = self.model.config.start_n_top if hasattr(self.model, "config") else self.model.module.config.start_n_top
            end_n_top = self.model.config.end_n_top if hasattr(self.model, "config") else self.model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                squad_examples,
                squad_features,
                all_results,
                n_best_size=self.n_best_size,
                max_answer_length=self.max_answer_length,
                output_prediction_file=output_prediction_file,
                output_nbest_file=output_nbest_file,
                output_null_log_odds_file=output_null_log_odds_file,
                start_n_top=start_n_top,
                end_n_top=end_n_top,
                version_2_with_negative=True,
                tokenizer=self.tokenizer,
                verbose_logging=True,
            )
        else:
            predictions = compute_predictions_logits(
                squad_examples,
                squad_features,
                all_results,
                n_best_size=self.n_best_size,
                max_answer_length=self.max_answer_length,
                do_lower_case=True,
                output_prediction_file=output_prediction_file,
                output_nbest_file=output_nbest_file,
                output_null_log_odds_file=output_null_log_odds_file,
                verbose_logging=True,
                version_2_with_negative=True,
                null_score_diff_threshold=0.0,
                tokenizer=self.tokenizer,
            )
        logger.info('predictions: {}'.format(predictions))
        with open(output_nbest_file) as f:
            output_nbest = json.load(f)
        return output_nbest
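Several of these examples call a to_list helper without defining it. A minimal version, matching the helper in Hugging Face's reference run_squad.py script, simply detaches the tensor, moves it to the CPU, and converts it to a plain Python list:

def to_list(tensor):
    # Detach from the graph and move off the GPU so the values can be stored
    # in a SquadResult and later serialized to JSON.
    return tensor.detach().cpu().tolist()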
Example #11
0
    def __get_predictions(self, dataloader, features, samples, prefix=""):
        self.model.eval()

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        all_results = []

        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.model_type in ["xlm", "roberta", "distilbert"]:
                    del inputs["token_type_ids"]

                example_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.model_type in ["xlnet", "xlm"]:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

                outputs = self.model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    result = SquadResult(
                        unique_id,
                        start_logits,
                        end_logits,
                        start_top_index=start_top_index,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits,
                    )

                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        # Compute predictions

        output_prediction_file = os.path.join(
            self.output_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            self.output_dir, "nbest_predictions_{}.json".format(prefix))

        output_null_log_odds_file = os.path.join(
            self.output_dir, "null_odds_{}.json".format(prefix))
        # XLNet and XLM use a more complex post-processing procedure
        if self.model_type in ["xlnet", "xlm"]:
            start_n_top = self.model.config.start_n_top if hasattr(
                self.model, "config") else self.model.module.config.start_n_top
            end_n_top = self.model.config.end_n_top if hasattr(
                self.model, "config") else self.model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                samples,
                features,
                all_results,
                self.n_best_size,
                self.max_answer_length,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                start_n_top,
                end_n_top,
                True,
                self.tokenizer,
                False,
            )
        else:
            predictions = compute_predictions_logits(
                samples, features, all_results, self.n_best_size,
                self.max_answer_length, True, output_prediction_file,
                output_nbest_file, output_null_log_odds_file, False, True,
                self.null_score_diff_threshold, self.tokenizer)

        return predictions
Example #12
0
def evaluate(model, tokenizer, output_dir, prefix="", bs=2):
    dataset, examples, features = load_and_cache_examples(tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    eval_batch_size = bs

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)

    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples = %d" % len(dataset))
    print("  Batch size = %d" % eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
            # for lang_id-sensitive xlm models
            if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                inputs.update({
                    "langs": (torch.ones(batch[0].shape, dtype=torch.int64) *
                              args.lang_id).to(device)
                })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    print("  Evaluation done in total %f secs (%f sec per example)" %
          (evalTime, evalTime / len(dataset)))

    # Compute predictions
    output_prediction_file = os.path.join(output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        output_dir, "nbest_predictions_{}.json".format(prefix))

    output_null_log_odds_file = os.path.join(
        output_dir, "null_odds_{}.json".format(prefix))

    # XLNet and XLM use a more complex post-processing procedure
    start_n_top = model.config.start_n_top if hasattr(
        model, "config") else model.module.config.start_n_top
    end_n_top = model.config.end_n_top if hasattr(
        model, "config") else model.module.config.end_n_top

    predictions = compute_predictions_log_probs(
        examples,
        features,
        all_results,
        20,
        30,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        start_n_top,
        end_n_top,
        True,
        tokenizer,
        False,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
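For reference, the SQuAD utilities that these evaluate functions rely on ship with Hugging Face Transformers. A typical set of imports looks like the sketch below; the exact module paths may differ slightly between library versions:

import torch
from torch.utils.data import DataLoader, SequentialSampler
from transformers.data.processors.squad import SquadResult
from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)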
Example #13
0
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
Example #14
0
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

            if args.model_type != 'distilbert':
                inputs[
                    'token_type_ids'] = None if args.model_type == 'xlm' else batch[
                        2]  # XLM don't use segment_ids

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example #15
0
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             save_dir='',
             save_log_path=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if save_dir and not os.path.exists(save_dir) and args.local_rank in [-1, 0]:
        os.makedirs(save_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # y_cls_correct = 0
    # y_cls_incorrect = 0
    y_cls_tp, y_cls_tn, y_cls_fp, y_cls_fn = 0, 0, 0, 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]
            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            is_impossible = eval_feature.is_impossible

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits, logits_cls, prob_cls = output

                prob_cls = np.asarray(prob_cls, dtype=float)
                predict_cls = np.argmax(prob_cls)

                if predict_cls == int(not is_impossible):
                    if is_impossible:
                        y_cls_tn += 1
                    else:
                        y_cls_tp += 1
                else:
                    if is_impossible:
                        y_cls_fp += 1
                    else:
                        y_cls_fn += 1
                result = SquadResult(unique_id, start_logits, end_logits)
                # Add cls prediction
                if args.force_cls_pred:
                    result.prob_cls = prob_cls

            all_results.append(result)

    # print(y_cls_correct, y_cls_incorrect)
    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    if args.force_cls_pred:
        example_index_to_features = collections.defaultdict(list)
        for feature in features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        n_force = 0
        for example_index, example in enumerate(examples):
            eval_features = example_index_to_features[example_index]
            prob = []
            for eval_feature in eval_features:
                eval_result = unique_id_to_result[eval_feature.unique_id]
                prob.append(eval_result.prob_cls[0])

            if np.mean(prob) >= 0.8:
                predictions[example.qas_id] = ""
                n_force += 1

        print("\n")
        print("num of force prediction:", n_force)
    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    cls_accuracy = (y_cls_tn + y_cls_tp) / (y_cls_tn + y_cls_tp + y_cls_fn +
                                            y_cls_fp)
    cls_no_ans_accuracy = y_cls_tn / (y_cls_tn + y_cls_fp)
    cls_has_ans_accuracy = y_cls_tp / (y_cls_tp + y_cls_fn)
    # Add CLS accuracy to result
    results.update({
        'cls_accuracy': cls_accuracy,
        'cls_no_ans_accuracy': cls_no_ans_accuracy,
        'cls_has_ans_accuracy': cls_has_ans_accuracy
    })
    # save log to file
    if save_log_path:
        util.save_json_file(save_log_path, results)

    return results
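Example #15 derives accuracy figures from the confusion-matrix counters it accumulates. If precision and recall on the has-answer class are also wanted, they can be computed from the same counters; this is a sketch using the example's variable names, not part of the original code:

# y_cls_tp / y_cls_fn: answerable questions predicted answerable / unanswerable
# y_cls_tn / y_cls_fp: unanswerable questions predicted unanswerable / answerable
precision = y_cls_tp / max(y_cls_tp + y_cls_fp, 1)
recall = y_cls_tp / max(y_cls_tp + y_cls_fn, 1)
f1 = 2 * precision * recall / max(precision + recall, 1e-12)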
Example #16
0
def evaluate(args, model, tokenizer, prefix="", adapter_names=None):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "adapter_names": adapter_names,
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
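A minimal, hypothetical way to drive an evaluate(args, model, tokenizer) function written in the style above is to pass a namespace carrying the attributes the function reads. The field names below mirror the example but are illustrative only, model and tokenizer are assumed to be loaded already, and the list is not exhaustive (load_and_cache_examples will typically read further fields such as the dataset paths):

from types import SimpleNamespace

import torch

args = SimpleNamespace(
    output_dir="./eval_out",       # where the prediction JSON files are written
    local_rank=-1,                 # single-process evaluation
    per_gpu_eval_batch_size=8,
    n_gpu=1,
    device="cuda" if torch.cuda.is_available() else "cpu",
    model_type="bert",             # anything outside ["xlnet", "xlm"] takes the logits path
    n_best_size=20,
    max_answer_length=30,
    do_lower_case=True,
    verbose_logging=False,
    version_2_with_negative=True,
    null_score_diff_threshold=0.0,
    lang_id=0,
)
results = evaluate(args, model, tokenizer, prefix="dev")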
Example #17
0
    def evaluate(self, dataset, examples, features, prefix=""):
        eval_batch_size = 8
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=eval_batch_size)

        all_results = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            self._model.eval()
            batch = tuple(t.to(self._device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                example_indices = batch[3]
                if self._model_name in ['xlnet']:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                print("Coding: inputs ", inputs)
                outputs = self._model(**inputs)

            # feature is needed
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                print("Coding: unique_id ", unique_id)

                output = [self._to_list(output[i]) for output in outputs]

                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)
                all_results.append(result)

        # Compute predictions
        output_dir = os.getcwd()
        output_prediction_file = os.path.join(
            output_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            output_dir, "nbest_predictions_{}.json".format(prefix))
        version_2_with_negative = True
        if version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                output_dir, "null_odds_{}.json".format(prefix))
        else:
            output_null_log_odds_file = None

        n_best_size = 20
        max_answer_length = 30
        verbose_logging = True
        if self._model_name in ['xlnet']:
            start_n_top = self._model.config.start_n_top
            end_n_top = self._model.config.end_n_top

            predictions = compute_predictions_log_probs(
                examples, features, all_results, n_best_size,
                max_answer_length, output_prediction_file, output_nbest_file,
                output_null_log_odds_file, start_n_top, end_n_top,
                version_2_with_negative, self._tokenizer, verbose_logging)
        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        return results
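As a brief usage note, the object returned by squad_evaluate() is a dict-like mapping of aggregate metrics, so the headline scores can be read by key. A sketch, assuming results was produced by one of the evaluate functions above:

print("Exact match: {:.2f}".format(results["exact"]))
print("F1 score:    {:.2f}".format(results["f1"]))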