Example 1
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    if output_null_log_odds_file:
        import json

        with open(output_null_log_odds_file, "r") as fin:
            no_answer_probs = json.load(fin)
    else:
        no_answer_probs = None

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions, no_answer_probs)

    output_eval_file = os.path.join(args.output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))

    return results
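All of the evaluate() variants collected here call a small to_list helper that converts a tensor into a plain Python list, but none of the snippets show its definition. A minimal sketch, matching the helper defined in HuggingFace's run_squad.py example script:

def to_list(tensor):
    return tensor.detach().cpu().tolist()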
Example 2
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]  # XLM don't use segment_ids
            example_indices = batch[3]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet and XLM use a more complex post-processing procedure
        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            model.config.start_n_top,
            model.config.end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example 3
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    # NOTE: unlike the other examples, this variant uses output_dir, local_rank,
    # n_gpu, etc. without the args. prefix, so they must be defined at module level.
    # Create output_dir if it does not already exist
    if not os.path.exists(output_dir) and local_rank in [-1, 0]:
        os.makedirs(output_dir)

    # eval_batch_size: since n_gpu = 1 here anyway, it equals the per-GPU batch size
    eval_batch_size = per_gpu_eval_batch_size

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    # multi-gpu evaluate
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))

    if version_2_with_negative:
        output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        verbose_logging,
        version_2_with_negative,
        null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example 4
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples = %d" % len(dataset))
    print("  Batch size = %d" % args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    eval_pbar = tqdm(total=len(dataset),
                     position=0,
                     leave=True,
                     file=sys.stdout,
                     bar_format="{l_bar}%s{bar}%s{r_bar}" %
                     (Fore.GREEN, Fore.RESET))
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2]
            }

            feature_indices = batch[3]
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)
        eval_pbar.update(batch[0].size(0))  # hiepnh
    eval_pbar.close()  # hiepnh

    evalTime = timeit.default_timer() - start_time
    print("  Evaluation done in total %f secs (%f sec per example)" %
          (evalTime, evalTime / len(dataset)))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example 5
def test_mrc(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': (None if args.model_type in ['xlm', 'roberta', 'distilbert']
                                   else batch[2]),
            }
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            outputs = model.mrc_forward(**inputs, output_embedding=True)
            start_logits, end_logits, classification_logits, emb = outputs

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits, *_ = output  # output also carries classification logits and embeddings
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold,
            tokenizer)

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
Example 6
def evaluate(args, config, model, tokenizer, prefix="", global_step=0):
    dataset, examples, features = load_and_cache_examples(args,
                                                          config,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    config['eval']['eval_batch_size'] = (
        config.eval.per_gpu_eval_batch_size * max(1, args.n_gpu))

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=config.eval.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", config.eval.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if config.model.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if config.model.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         config.input.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(config.output.log_dir,
                                          f"predictions_{prefix}.json")
    output_nbest_file = os.path.join(
        config.output.log_dir,
        f"nbest_{config.model.n_best_size}_predictions_{prefix}.json")

    if config.input.version_2_with_negative:
        output_null_log_odds_file = os.path.join(config.output.log_dir,
                                                 f"null_odds_{prefix}.json")
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if config.model.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            config.model.n_best_size,
            config.model.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            config.input.version_2_with_negative,
            tokenizer,
            config.output.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            config.model.n_best_size,
            config.model.max_answer_length,
            config.model.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            config.output.verbose_logging,
            config.input.version_2_with_negative,
            config.model.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Save eval results to output file as well
    if prefix == "-1":
        # evaluate at the end of training, store in the log_dir directly
        output_eval_file = os.path.join(config.output.log_dir,
                                        "eval_results.tsv")
    else:
        # there is a 'prefix' subfolder
        output_eval_file = os.path.join(config.output.log_dir, prefix,
                                        "eval_results.tsv")

    if not os.path.exists(
            output_eval_file):  # file does not exist yet. write header first
        with open(output_eval_file, "a") as writer:
            writer.write("global_step\t" + "\t".join(results.keys()) +
                         "\n")  # write header

    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
        line = [str(global_step)] + [str(r) for r in results.values()]
        writer.write("\t".join(line) + "\n")

    return results
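Example 6 above writes config['eval']['eval_batch_size'] and then reads config.eval.eval_batch_size, so it assumes a configuration object that supports both item and attribute access. A minimal sketch of such a config, assuming an addict-style Dict (the real project may use OmegaConf, YACS, or a custom class; only the keys touched by the snippet are shown, and the values are illustrative):

from addict import Dict

config = Dict({
    "eval": {"per_gpu_eval_batch_size": 8},
    "model": {
        "model_type": "bert",
        "n_best_size": 20,
        "max_answer_length": 30,
        "do_lower_case": True,
        "null_score_diff_threshold": 0.0,
    },
    "input": {"version_2_with_negative": True, "lang_id": 0},
    "output": {"log_dir": "./logs", "verbose_logging": False},
})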
Example 7
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            # example_indices tells us which entries of `features` this batch covers
            # (one example can be split into several features)
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            # tensor.item() only works for tensor containing only one element
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                # print("size of start_logits of {0} is {1}".format(unique_id, len(start_logits)))
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        # print('******* length of examples is {0}*******'.format(len(examples)))
        # print(len(examples))
        # print(len(features))
        # print(len(all_results))
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
            real_pred=args.do_real_pred,
        )

    # Compute the F1 and exact scores. Skip this step when we do real prediction (without ground truth)
    # Writing files are done in the previous steps,
    if not args.do_real_pred:
        results = squad_evaluate(examples, predictions)
        # print('ok, {0}'.format(results))
        return results

    else:
        return None
Example 8
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    if args.use_jit_trace:
        enable_tracing()

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    is_eval_traced = False

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        ## Habana devices don't support Long (int64) tensors,
        ## so cast the input tensors to int32 before moving them to the device
        if args.use_habana:
            batch[0] = batch[0].to(dtype=torch.int32)
            batch[1] = batch[1].to(dtype=torch.int32)
            batch[2] = batch[2].to(dtype=torch.int32)

        position_ids_cpu = compute_position_ids(batch[0])
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        position_ids = position_ids_cpu.to(args.device)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "position_ids": position_ids,
            }
            tensor_dummy = torch.zeros(1).to(args.device)

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })
            if args.use_jit_trace and not is_eval_traced:
                model_trace = torch.jit.trace(
                    model, (batch[0], batch[1], batch[2], position_ids,
                            tensor_dummy, tensor_dummy, tensor_dummy,
                            tensor_dummy, tensor_dummy, tensor_dummy),
                    check_trace=False)
                is_eval_traced = True
                model_trace.eval()
            if args.use_jit_trace:
                outputs = model_trace(batch[0], batch[1], batch[2],
                                      position_ids, tensor_dummy, tensor_dummy,
                                      tensor_dummy, tensor_dummy, tensor_dummy,
                                      tensor_dummy)
            else:
                outputs = model(**inputs)
            feature_indices = feature_indices.to("cpu")

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
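Example 8 above calls a compute_position_ids helper that the snippet never defines; its exact implementation belongs to the original Habana-targeted script. A plausible minimal version, assuming standard BERT-style sequential position ids, might look like this sketch:

import torch

def compute_position_ids(input_ids):
    # Sequential positions 0..seq_len-1, broadcast to the batch shape on CPU;
    # the caller moves the result to the target device.
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long)
    return position_ids.unsqueeze(0).expand(input_ids.size(0), seq_length)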
Example 9
def evaluate(args, model, tokenizer, prefix="", global_step=None):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in progress_bar(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in ["xlm", "roberta", "distilbert", "distilkobert", "xlm-roberta"]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    # Write the evaluation results to a file
    output_dir = os.path.join(args.output_dir, 'eval')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_eval_file = os.path.join(output_dir, "eval_result_{}_{}.txt".format(list(filter(None, args.model_name_or_path.split("/"))).pop(),
                                                                               global_step))

    logger.info("***** Official Eval results *****")
    with open(output_eval_file, "w", encoding='utf-8') as f:
        official_eval_results = eval_during_train(args)
        for key in sorted(official_eval_results.keys()):
            logger.info("  %s = %s", key, str(official_eval_results[key]))
            f.write(" {} = {}\n".format(key, str(official_eval_results[key])))
    return results
Example 10
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    dataset_cached = "./dataset_cached"
    if not os.path.exists(dataset_cached):
        os.makedirs(dataset_cached)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    calibration_iteration = int(
        (len(dataset) * 0.05 + args.eval_batch_size - 1) / args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils
        model = mkldnn_utils.to_mkldnn(model)
        print(model)

    all_results = []
    evalTime = 0
    nb_eval_steps = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        if calibration and nb_eval_steps >= calibration_iteration:
            break

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

            if args.model_type != 'distilbert':
                # XLM doesn't use segment_ids
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            if nb_eval_steps >= args.warmup:
                start_time = timeit.default_timer()
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)
            if nb_eval_steps >= args.warmup:
                evalTime += (timeit.default_timer() - start_time)

        nb_eval_steps += 1
    if nb_eval_steps >= args.warmup:
        perf = (len(eval_dataloader) - args.warmup) * args.eval_batch_size / evalTime
        logger.info("Evaluation done in total %f secs (%f samples/sec)", evalTime, perf)
    else:
        perf = 0.0
        logger.info("***** No performance measured; check dataset length and warmup count *****")

    # Compute predictions
    output_prediction_file = os.path.join(dataset_cached,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        dataset_cached, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    elif not calibration:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    if not calibration:
        results = squad_evaluate(examples, predictions)
        return results, perf
Example 11
def run_prediction(question_texts, context_text):
    """Setup function to compute predictions"""
    examples = []

    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=4,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions


# context = "New Zealand (Māori: Aotearoa) is a sovereign island country in the southwestern Pacific Ocean. It has a total land area of 268,000 square kilometres (103,500 sq mi), and a population of 4.9 million. New Zealand's capital city is Wellington, and its most populous city is Auckland."
# questions = ["How many people live in New Zealand?",
#              "What's the largest city?"]
#
# # Run method
# predictions = run_prediction(questions, context)
#
# # Print results
# for key in predictions.keys():
#     print(predictions[key])
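The run_prediction example above, together with its commented-out usage, relies on module-level globals that the snippet never defines: model, tokenizer, device, n_best_size, max_answer_length, do_lower_case, and null_score_diff_threshold. A minimal sketch of that setup; the checkpoint name and parameter values are assumptions for illustration, not part of the original:

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"  # illustrative checkpoint
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)

# Decoding parameters consumed by compute_predictions_logits() above.
n_best_size = 20
max_answer_length = 30
do_lower_case = True
null_score_diff_threshold = 0.0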
Example 12
def evaluate(args, model_path1, model1, model2, model3, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          model_path1,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model1, torch.nn.DataParallel):
        model1 = torch.nn.DataParallel(model1)

    if args.n_gpu > 1 and not isinstance(model2, torch.nn.DataParallel):
        model2 = torch.nn.DataParallel(model2)

    if args.n_gpu > 1 and not isinstance(model3, torch.nn.DataParallel):
        model3 = torch.nn.DataParallel(model3)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model1.eval()
        model2.eval()
        model3.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            feature_indices = batch[3]

            outputs1 = model1(**inputs)
            outputs2 = model2(**inputs)
            outputs3 = model3(**inputs)
            # print("outputs1", outputs1)
        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output1 = [to_list(output1[i]) for output1 in outputs1]
            output2 = [to_list(output2[i]) for output2 in outputs2]
            output3 = [to_list(output3[i]) for output3 in outputs3]

            start_logits1, end_logits1 = output1
            start_logits2, end_logits2 = output2
            start_logits3, end_logits3 = output3

            # TODO homework: ensemble the three models with weights 0.4, 0.2, 0.4
            # to compute start_logits and end_logits.
            weight = [0.4, 0.2, 0.4]

            # The logits are Python lists here (see to_list), so average them
            # element-wise rather than multiplying the lists by a float.
            start_logits = [
                s1 * weight[0] + s2 * weight[1] + s3 * weight[2]
                for s1, s2, s3 in zip(start_logits1, start_logits2, start_logits3)
            ]
            end_logits = [
                e1 * weight[0] + e2 * weight[1] + e3 * weight[2]
                for e1, e2, e3 in zip(end_logits1, end_logits2, end_logits3)
            ]

            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info(
        "  Evaluation done in total %f secs (%f sec per example)",
        evalTime,
        evalTime / len(dataset),
    )

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results
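The ensemble above is a fixed-weight average of the per-token start/end logits from three models. A minimal stand-alone sketch of the same weighting with NumPy, just to make the arithmetic explicit (the arrays below are hypothetical, not the original tensors):

import numpy as np

# Hypothetical per-token start logits from three models for one feature.
start_logits1 = np.array([0.1, 2.3, -0.5])
start_logits2 = np.array([0.4, 1.9, -0.2])
start_logits3 = np.array([0.0, 2.8, -0.9])

weights = [0.4, 0.2, 0.4]

# Fixed-weight average: each model contributes its logits scaled by its weight.
start_logits = (weights[0] * start_logits1
                + weights[1] * start_logits2
                + weights[2] * start_logits3)
print(start_logits)  # approximately [0.12, 2.42, -0.6]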
Example no. 13
def evaluate(args, model, tokenizer, prefix=""):
    languages = args.language.split(',')
    all_languages_results = {}
    processor = MLQAProcessor()

    for split, lang in itertools.product(["dev", "test"], languages):
        print("evaluating on {0} {1}".format(split, lang))
        dataset, examples, features = load_and_cache_examples(
            args, tokenizer, language=lang, split=split, output_examples=True)

        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # multi-gpu evaluate
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_results = []
        start_time = timeit.default_timer()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if args.model_type in [
                        "xlm", "roberta", "distilbert", "camembert"
                ]:
                    del inputs["token_type_ids"]

                example_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if args.model_type in ["xlnet", "xlm"]:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                    # for lang_id-sensitive xlm models
                    if hasattr(model, "config") and hasattr(
                            model.config, "lang2id"):
                        inputs.update({
                            "langs":
                            (torch.ones(batch[0].shape, dtype=torch.int64) *
                             args.lang_id).to(args.device)
                        })

                outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    result = SquadResult(
                        unique_id,
                        start_logits,
                        end_logits,
                        start_top_index=start_top_index,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits,
                    )

                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        evalTime = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)",
                    evalTime, evalTime / len(dataset))

        # Compute predictions
        output_prediction_file = os.path.join(args.output_dir,
                                              "{}.prediction".format(lang))
        output_nbest_file = os.path.join(
            args.output_dir,
            "nbest_predictions_{}_{}_{}.json".format(prefix, split, lang))

        if args.version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                args.output_dir,
                "null_odds_{}_{}_{}.json".format(prefix, split, lang))
        else:
            output_null_log_odds_file = None

        # XLNet and XLM use a more complex post-processing procedure
        if args.model_type in ["xlnet", "xlm"]:
            start_n_top = model.config.start_n_top if hasattr(
                model, "config") else model.module.config.start_n_top
            end_n_top = model.config.end_n_top if hasattr(
                model, "config") else model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                examples,
                features,
                all_results,
                args.n_best_size,
                args.max_answer_length,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                start_n_top,
                end_n_top,
                args.version_2_with_negative,
                tokenizer,
                args.verbose_logging,
            )
        else:
            predictions = compute_predictions_logits(
                examples,
                features,
                all_results,
                args.n_best_size,
                args.max_answer_length,
                args.do_lower_case,
                output_prediction_file,
                output_nbest_file,
                output_null_log_odds_file,
                args.verbose_logging,
                args.version_2_with_negative,
                args.null_score_diff_threshold,
                tokenizer,
                map_to_origin=not (args.model_type == "xlmr" and lang == 'zh'))

        # Compute the F1 and exact scores.
        # results = squad_evaluate(examples, predictions)
        results = evaluate_with_path(
            processor.get_dataset_path(args.data_dir, split, lang),
            output_prediction_file, lang)
        all_languages_results["{0}_{1}".format(split, lang)] = results
    for split in ["dev", "test"]:
        all_languages_results["{0}_avg".format(split)] = average_dic([
            value for key, value in all_languages_results.items()
            if split in key
        ])

    return all_languages_results
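The final aggregation relies on an average_dic helper that is not shown in this example. A minimal sketch of what it presumably does, assuming a key-wise mean over a list of metric dicts (e.g. exact match and F1 per language):

def average_dic(dic_list):
    """Key-wise mean over a list of metric dicts (assumed helper)."""
    if not dic_list:
        return {}
    return {
        key: sum(d[key] for d in dic_list) / len(dic_list)
        for key in dic_list[0]
    }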
Example no. 14
def run_prediction(model, tokenizer, device, output_dir, filename, question_texts, context_text):
        """Setup function to compute predictions"""
        examples = []

        for i, question_text in enumerate(question_texts):
            example = SquadExample(
                qas_id=str(i),
                question_text=question_text,
                context_text=context_text,
                answer_text=None,
                start_position_character=None,
                title="Predict",
                is_impossible=False,
                answers=None,
            )

            examples.append(example)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=1,
        )

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

        all_results = []

        for batch in eval_dataloader:
            model.eval()
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                example_indices = batch[3]

                outputs = model(**inputs)

                for i, example_index in enumerate(example_indices):
                    eval_feature = features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)

                    output = [to_list(output[i]) for output in outputs]

                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)
                    all_results.append(result)

        output_prediction_file = os.path.join(output_dir, filename + "_predictions.json")
        output_nbest_file = os.path.join(output_dir, filename + "_nbest_predictions.json")
        output_null_log_odds_file = os.path.join(output_dir, filename + "_null_predictions.json")

        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            n_best_size,
            max_answer_length,
            do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            False,  # verbose_logging
            True,  # version_2_with_negative
            null_score_diff_threshold,
            tokenizer,
        )

        return predictions
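A hedged usage sketch for this run_prediction variant. The checkpoint name and output paths are placeholders, and, as in the function body, n_best_size, max_answer_length, do_lower_case and null_score_diff_threshold are assumed to exist as module-level settings:

import os
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "some-squad-finetuned-checkpoint"  # placeholder, not from the snippet
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)

output_dir = "qa_outputs"
os.makedirs(output_dir, exist_ok=True)

questions = ["How many people live in New Zealand?", "What's the largest city?"]
context = "New Zealand has a population of 4.9 million; its most populous city is Auckland."

# Predictions come back keyed by qas_id, as produced by compute_predictions_logits.
predictions = run_prediction(model, tokenizer, device, output_dir, "demo",
                             questions, context)
for qas_id, answer in predictions.items():
    print(qas_id, answer)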
Example no. 15
    def predict(
        self,
        query: Union[List[str], str],
        context: Union[List[str], str],
        n_best_size: int = 5,
        mini_batch_size: int = 32,
        max_answer_length: int = 10,
        do_lower_case: bool = False,
        version_2_with_negative: bool = False,
        verbose_logging: bool = False,
        null_score_diff_threshold: float = 0.0,
        max_seq_length: int = 512,
        doc_stride: int = 128,
        max_query_length: int = 64,
        **kwargs,
    ) -> Tuple[Tuple[str, List[OrderedDict]], Tuple[OrderedDict, OrderedDict]]:
        """ Predict method for running inference using the pre-trained question answering model

        * **query** - String or list of strings that specify the ordered questions corresponding to `context`
        * **context** - String or list of strings that specify the ordered contexts corresponding to `query`
        * **n_best_size** - Number of top n results you want
        * **mini_batch_size** - Mini batch size
        * **max_answer_length** - Maximum token length for answers that are returned
        * **do_lower_case** - Set as `True` if using uncased QA models
        * **version_2_with_negative** - Set as `True` if using a QA model trained on SQuAD 2.0
        * **verbose_logging** - Set as `True` if you want verbose prediction logging
        * **null_score_diff_threshold** - Threshold for predicting null (no answer) in a SQuAD 2.0 model.  Default is 0.0.  Raise this if you want fewer null answers
        * **max_seq_length** - Maximum context token length.  Check the model config to see the maximum sequence length the model was trained with
        * **doc_stride** - Number of token strides to take when splitting up the context into chunks of size `max_seq_length`
        * **max_query_length** - Maximum token length for queries
        * ****kwargs** (Optional) - Optional arguments for the Transformers model (mostly for saving evaluations)
        """
        # Make string input consistent as list
        if isinstance(query, str):
            query = [query]
            context = [context]
        assert len(query) == len(context)

        examples = self._mini_squad_processor(query=query, context=context)
        features, dataset = squad_convert_examples_to_features(
            examples,
            self.tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            return_dataset="pt",
            threads=1,
        )
        all_results = []

        with torch.no_grad():

            dataloader = DataLoader(dataset, batch_size=mini_batch_size)

            for batch in tqdm(dataloader, desc="Predicting answer"):
                self.model.eval()
                batch = tuple(t.to(self.device) for t in batch)

                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }
                example_indices = batch[3]

                if isinstance(
                        self.model,
                    (
                        XLMForQuestionAnswering,
                        RobertaForQuestionAnswering,
                        DistilBertForQuestionAnswering,
                        CamembertForQuestionAnswering,
                    ),
                ):
                    del inputs["token_type_ids"]

                # XLNet and XLM use more arguments for their predictions
                if isinstance(self.model, self.xmodel_instances):
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                    # for lang_id-sensitive xlm models
                    if hasattr(self.model, "config") and hasattr(
                            self.model.config, "lang2id"):
                        # Set language id as 0 for now
                        inputs.update({
                            "langs":
                            (torch.ones(batch[0].shape, dtype=torch.int64) *
                             0).to(self.device)
                        })

                outputs = self.model(**inputs)

                # Iterate through and produce `SquadResult`s
                for i, example_index in enumerate(example_indices):
                    eval_feature = features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)

                    output = [self.to_list(output[i]) for output in outputs]

                    if isinstance(self.model, self.xmodel_instances):
                        # Some models like the ones in `self.xmodel_instances` use 5 arguments for their predictions
                        start_logits = output[0]
                        start_top_index = output[1]
                        end_logits = output[2]
                        end_top_index = output[3]
                        cls_logits = output[4]

                        result = SquadResult(
                            unique_id,
                            start_logits,
                            end_logits,
                            start_top_index=start_top_index,
                            end_top_index=end_top_index,
                            cls_logits=cls_logits,
                        )

                    else:
                        start_logits, end_logits = output
                        result = SquadResult(unique_id, start_logits,
                                             end_logits)
                    all_results.append(result)

            if isinstance(self.model, self.xmodel_instances):
                start_n_top = (self.model.config.start_n_top if hasattr(
                    self.model, "config") else
                               self.model.module.config.start_n_top)
                end_n_top = (self.model.config.end_n_top if hasattr(
                    self.model, "config") else
                             self.model.module.config.end_n_top)

                answers, n_best = compute_predictions_log_probs(
                    examples,
                    features,
                    all_results,
                    n_best_size,
                    max_answer_length,
                    start_n_top,
                    end_n_top,
                    version_2_with_negative,
                    self.tokenizer,
                    verbose_logging,
                    **kwargs,
                )

            else:
                answers, n_best = compute_predictions_logits(
                    examples,
                    features,
                    all_results,
                    n_best_size,
                    max_answer_length,
                    do_lower_case,
                    verbose_logging,
                    version_2_with_negative,
                    null_score_diff_threshold,
                    self.tokenizer,
                    **kwargs,
                )

        return answers, n_best
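A hedged usage sketch for the predict method above. The class that owns the method is not shown in the snippet, so the wrapper name below is a placeholder; treat this as illustrative only:

# Hypothetical wrapper instance exposing the predict() method shown above.
qa = QuestionAnsweringModule()  # assumed class name, not from the snippet
answers, n_best = qa.predict(
    query="What's the largest city?",
    context="New Zealand's most populous city is Auckland.",
    n_best_size=5,
    max_answer_length=10,
)
print(answers)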
Example no. 16
    def evaluate(self, dataset, examples, features, prefix=""):
        eval_batch_size = 8
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=eval_batch_size)

        all_results = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            self._model.eval()
            batch = tuple(t.to(self._device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                example_indices = batch[3]
                if self._model_name in ['xlnet']:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                print("Coding: inputs ", inputs)
                outputs = self._model(**inputs)

            # feature is needed
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                print("Coding: unique_id ", unique_id)

                output = [self._to_list(output[i]) for output in outputs]

                # XLNet-style heads return five outputs per example.
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)
                all_results.append(result)

        # Compute predictions
        output_dir = os.getcwd()
        output_prediction_file = os.path.join(
            output_dir, "predictions_{}.json".format(prefix))
        output_nbest_file = os.path.join(
            output_dir, "nbest_predictions_{}.json".format(prefix))
        version_2_with_negative = True
        if version_2_with_negative:
            output_null_log_odds_file = os.path.join(
                output_dir, "null_odds_{}.json".format(prefix))
        else:
            output_null_log_odds_file = None

        n_best_size = 20
        max_answer_length = 30
        verbose_logging = True
        if self._model_name in ['xlnet']:
            start_n_top = self._model.config.start_n_top
            end_n_top = self._model.config.end_n_top

            predictions = compute_predictions_log_probs(
                examples, features, all_results, n_best_size,
                max_answer_length, output_prediction_file, output_nbest_file,
                output_null_log_odds_file, start_n_top, end_n_top,
                version_2_with_negative, self._tokenizer, verbose_logging)
        else:
            # Only XLNet-style models are handled above; without this guard
            # `predictions` would be undefined below.
            raise NotImplementedError(
                "evaluate() currently supports only XLNet-style models")

        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        return results