Example #1
def evaluate(args, model, tokenizer, prefix=""):
    eval_task_names = ("span_detection", )
    eval_dataset, examples, features = load_and_cache_examples(
        args, eval_task_names, tokenizer, evaluate=True, output_examples=True)

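    # Evaluate with a fixed batch size of 1, iterating features in order.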
    args.eval_batch_size = 1
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader,
                      desc="Evaluating",
                      position=0,
                      leave=True,
                      ncols=100):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "cls_index": batch[4],
                "p_mask": batch[5],
                "task": 2,
            }

            feature_indices = batch[3]

            # Run the forward pass under no_grad so no autograd graph is kept.
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Pull the i-th feature's output tensors back as plain Python lists.
            output = [to_list(output[i]) for output in outputs]

            start_logits = output[0]
            start_top_index = output[1]
            end_logits = output[2]
            end_top_index = output[3]
            cls_logits = output[4]

            result = SpanDetectionResult(
                unique_id,
                start_logits,
                end_logits,
                start_top_index=start_top_index,
                end_top_index=end_top_index,
                cls_logits=cls_logits,
                top_n=model.config.start_n_top,
            )

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(eval_dataset))

    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir,
                                     "nbest_predictions_{}.json".format(prefix))
    output_best_file = os.path.join(args.output_dir,
                                    "best_predictions_{}.json".format(prefix))

    start_n_top = model.config.start_n_top
    end_n_top = model.config.end_n_top

    predictions = compute_predictions_log_probs(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.min_answer_length,
        output_prediction_file,
        output_nbest_file,
        start_n_top,
        end_n_top,
        tokenizer,
        args.verbose_logging,
    )

    results = span_detection_evaluate(examples, predictions, output_best_file)
    return results
Example #2
def evaluate(args, model, tokenizer, prefix="dev", step=0):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          set_type=prefix,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

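    # Scale the eval batch size by the number of available GPUs.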
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Evaluation uses a SequentialSampler; no DistributedSampler is involved here.
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

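            # These model types do not use token_type_ids, so drop them from the inputs.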
            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

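            # Convert the i-th example's output tensors to plain Python lists.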
            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) return five outputs for their predictions, while the other
            # "simpler" models return only two (start and end logits).
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

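    # For the test split, save the raw results to disk with pickle.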
    if prefix == 'test':
        with open(os.path.join(args.output_dir, args.test_prob_file),
                  'wb') as f:
            pickle.dump(all_results, f)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(
        args.output_dir, "predictions_{}_{}.json".format(prefix, step))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}_{}.json".format(prefix, step))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}_{}.json".format(prefix, step))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer)

    if prefix == 'dev':
        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        return results
    else:
        return None
Example #3
def evaluate(args, model, tokenizer, prefix=""):
    eval_task_names = ("span_detection", )
    eval_dataset = load_and_cache_examples(args,
                                           eval_task_names,
                                           tokenizer,
                                           evaluate=True)

    args.eval_batch_size = args.train_batch_size
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    device = 'cpu'
    #model.to(device)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    all_examples = []
    all_features = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader,
                      desc="Evaluating",
                      position=0,
                      leave=True,
                      ncols=100):
        model.eval()
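        # Squeeze away the leading batch dimension and move the first five tensors to the device.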
        input_ids, attention_mask, token_type_ids, cls_index, p_mask = [
            t.squeeze(0).to(args.device) for t in batch[0:5]
        ]

        with torch.no_grad():
            inputs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "cls_index": cls_index,
                "p_mask": p_mask,
                "task": 2,
            }

            example_index = batch[5]
            unique_id = batch[6]

            outputs = model(**inputs)

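            # The trailing batch entries carry raw text fields; rebuild the example and its feature from them.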
            description_text, context_text, span_text, start_position_character = [
                t[0] for t in batch[-5:-1]
            ]

            example = SpanDetectionExample(
                description_text=description_text,
                context_text=context_text,
                span_text=span_text,
                start_position_character=start_position_character,
                unique_id=unique_id,
            )

            feature = span_detection_convert_example_to_features(
                example,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=False,
                example_index=example_index,
                unique_id=unique_id,
            )

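            # Unpack the top start/end scores, their indices, and the cls logits from the model output.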
            start_logits = outputs[0]
            start_top_index = outputs[1]
            end_logits = outputs[2]
            end_top_index = outputs[3]
            cls_logits = outputs[4]

            result = SpanDetectionResult(
                unique_id,
                start_logits,
                end_logits,
                start_top_index=start_top_index,
                end_top_index=end_top_index,
                cls_logits=cls_logits,
                top_n=model.config.start_n_top,
            )

            all_results.append(result)
            all_examples.append(example)
            all_features.append(feature)
    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(eval_dataset))

    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir,
                                     "nbest_predictions_{}.json".format(prefix))
    output_best_file = os.path.join(args.output_dir,
                                    "best_predictions_{}.json".format(prefix))

    start_n_top = model.config.start_n_top
    end_n_top = model.config.end_n_top

    predictions = compute_predictions_log_probs(
        all_examples,
        all_features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.min_answer_length,
        output_prediction_file,
        output_nbest_file,
        start_n_top,
        end_n_top,
        tokenizer,
        args.verbose_logging,
    )

    results = span_detection_evaluate(all_examples, predictions,
                                      output_best_file)
    return results