def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

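    # Initialize CUDA/device selection, random seeds, the effective train
    # batch size, and the output directory from the parsed arguments.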
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    # prepare examples, load model as encoder
    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path,
                                                      relaxed=True)

    # Load Model...
    if args.bert_load_mode == "state_model_only":
        state_dict = all_state['model']
        bert_as_encoder = BertModel.from_state_dict(
            config_file=args.bert_config_json_path, state_dict=state_dict)
    else:
        assert args.bert_load_mode == "from_pretrained"
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(
            args.local_rank)
        bert_as_encoder = BertModel.from_pretrained(
            pretrained_model_name_or_path=args.bert_model, cache_dir=cache_dir)

    bert_as_encoder.to(device)

    runner_param = RunnerParameters(
        max_seq_length=args.max_seq_length,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=None,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
    )

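    # The encoder is only run in inference mode here, so no optimizer is passed
    # to the runner.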
    runner = EmbeddingTaskRunner(bert_model=bert_as_encoder,
                                 optimizer=None,
                                 tokenizer=tokenizer,
                                 label_list=task.get_labels(),
                                 device=device,
                                 rparams=runner_param)

    # Run training set encoding...
    print("Run training set encoding ... ")
    train_examples = task.get_train_examples()
    train_dataset = runner.run_encoding(train_examples,
                                        verbose=True,
                                        mode='train')
    print("saving embeddings ... ")
    torch.save(train_dataset, os.path.join(args.output_dir, "train.dataset"))

    # Run development set encoding ...
    eval_examples = task.get_dev_examples()
    eval_dataset = runner.run_encoding(eval_examples,
                                       verbose=True,
                                       mode='eval')
    print("saving embeddings ... ")
    torch.save(eval_dataset, os.path.join(args.output_dir, 'dev.dataset'))

    # Run test set encoding ...
    test_examples = task.get_test_examples()
    test_dataset = runner.run_encoding(test_examples,
                                       verbose=True,
                                       mode='test')
    print("saving embeddings ... ")
    torch.save(test_dataset, os.path.join(args.output_dir, "test.dataset"))

    # HACK for MNLI mis-matched set ...
    if args.task_name == 'mnli':
        print("=== Start embedding task for MNLI mis-matched ===")
        mm_eval_examples = MnliMismatchedProcessor().get_dev_examples(
            task.data_dir)
        mm_eval_dataset = runner.run_encoding(mm_eval_examples,
                                              verbose=True,
                                              mode='eval')
        print("=== Saving eval dataset ===")
        torch.save(mm_eval_dataset,
                   os.path.join(args.output_dir, "mm_dev.dataset"))
        print("=== Saved ===")

        mm_test_examples = MnliMismatchedProcessor().get_test_examples(
            task.data_dir)
        mm_test_dataset = runner.run_encoding(mm_test_examples,
                                              verbose=True,
                                              mode='test')
        print("=== Saving tensor dataset ===")
        torch.save(mm_test_dataset,
                   os.path.join(args.output_dir, "mm_test.dataset"))
        print("=== Saved ===")

# Example 2

def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    # NOTE: may fail if the task (e.g. imdb) is not registered with get_task
    task = get_task(args.task_name, args.data_dir)

    # Create the tokenizer from the given model arguments; XLNet presumably
    # reuses the same tokenizer interface.
    tokenizer = shared_model_setup.create_tokenizer(
        xlnet_model_name=args.xlnet_model,  # TODO: still needs adapting for XLNet
        xlnet_load_mode=args.xlnet_load_mode,  # TODO: still needs adapting for XLNet
        do_lower_case=args.do_lower_case,
        xlnet_vocab_path=args.xlnet_vocab_path,  # TODO: confirm how the vocab path is handled
    )
    all_state = shared_model_setup.load_overall_state(
        args.xlnet_load_path,
        relaxed=True)  # probably will be the pre-trained one

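    # Build the task model (presumably a classification head on top of the
    # XLNet encoder) from the loaded state.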
    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        xlnet_model_name=args.xlnet_model,
        xlnet_load_mode=args.xlnet_load_mode,
        xlnet_load_args=args.xlnet_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        xlnet_config_json_path=args.xlnet_config_json_path,
    )
    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        if args.train_examples_number is not None:
            train_examples = random_sample(train_examples,
                                           args.train_examples_number)
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )

        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            # NOTE: presumably ported from the BERT script; use the XLNet
            # load-mode flag here rather than the BERT one.
            state_dict=all_state["optimizer"]
            if args.xlnet_load_mode == "state_all" else None,
        )
    else:
        train_examples = None
        t_total = 0
        optimizer = None
    # TODO: confirm that GlueTaskRunner behaves the same for XLNet as it does
    # for BERT.
    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            eval_batch_size=args.eval_batch_size,
        ))

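    # Training supports three modes: validate after every epoch
    # (do_val_history), checkpoint every N steps (train_save_every), or plain
    # training; the first two are mutually exclusive.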
    if args.do_train:
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(
                    os.path.join(args.output_dir, "val_metrics_history.json"),
                    "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(
                train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(
                        train_dataloader):
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_xlnet(
                            model=model,
                            optimizer=optimizer,
                            args=args,
                            save_path=os.path.join(
                                args.output_dir,
                                f"all_state___epoch{epoch:04d}___batch{step:06d}.p"
                            ),
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_xlnet(
            model=model,
            optimizer=optimizer,
            args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            save_mode=args.bert_save_mode,
        )
    # remove the hack part for MultiNLI Mismatched dataset
    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples,
                                 task_name=task.name,
                                 verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False,
                  index=False)
        metrics_str = json.dumps(
            {
                "loss": results["loss"],
                "metrics": results["metrics"]
            }, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"),
                  header=False,
                  index=False)

# Example 3

def main():
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)

    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)
    model = lm_model_setup.create_model(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        bert_load_args=args.bert_load_args,
        all_state=all_state,
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        bert_config_json_path=args.bert_config_json_path,
    )
    if args.print_trainable_params:
        log_info.print_trainable_params(model)

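    # Build the LM training dataset from the raw text file; on_memory
    # presumably controls whether the whole corpus is held in RAM.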
    train_dataset = lm_runners.LMDataset(
        args.train_file, tokenizer, seq_len=args.max_seq_length,
        corpus_lines=None, on_memory=args.on_memory,
    )
    t_total = shared_model_setup.get_opt_train_steps(
        num_train_examples=len(train_dataset),
        args=args,
    )
    optimizer = shared_model_setup.create_optimizer(
        model=model,
        learning_rate=args.learning_rate,
        t_total=t_total,
        loss_scale=args.loss_scale,
        fp16=args.fp16,
        warmup_proportion=args.warmup_proportion,
        state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
    )
    runner = lm_runners.LMRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        device=device,
        rparams=lm_runners.RunnerParameters(
            select_prob=args.select_prob, max_seq_length=args.max_seq_length,
            local_rank=args.local_rank, n_gpu=n_gpu, fp16=args.fp16,
            learning_rate=args.learning_rate, gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total, warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
        )
    )

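    # Train the LM, then save the final state (model and, depending on
    # save_mode, optimizer) to all_state.p.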
    runner.run_train(train_dataset)
    lm_model_setup.save_bert(
        model=model, optimizer=optimizer, args=args,
        save_path=os.path.join(args.output_dir, "all_state.p"),
        save_mode=args.bert_save_mode,
    )

# Example 4

def main():
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)
    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        bert_load_args=args.bert_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        bert_config_json_path=args.bert_config_json_path,
    )
    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )
        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
        )
    else:
        train_examples = None
        t_total = 0
        optimizer = None

    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank, n_gpu=n_gpu, fp16=args.fp16,
            learning_rate=args.learning_rate, gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total, warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size, eval_batch_size=args.eval_batch_size,
        )
    )

    if args.do_train:
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(train_dataloader):
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_bert(
                            model=model, optimizer=optimizer, args=args,
                            save_path=os.path.join(
                                args.output_dir, f"all_state___epoch{epoch:04d}___batch{step:06d}.p"
                            ),
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_bert(
            model=model, optimizer=optimizer, args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            save_mode=args.bert_save_mode,
        )

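    # Validation: write per-example logits to val_preds.csv and summary
    # metrics to val_metrics.json.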
    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"), header=False, index=False)
        metrics_str = json.dumps({"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_examples = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
            mm_results = runner.run_val(mm_val_examples, task_name=task.name, verbose=not args.not_verbose)
            df = pd.DataFrame(mm_results["logits"])
            df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"), header=False, index=False)
            combined_metrics = {}
            for k, v in results["metrics"].items():
                combined_metrics[k] = v
            for k, v in mm_results["metrics"].items():
                combined_metrics["mm-" + k] = v
            combined_metrics_str = json.dumps({
                "loss": results["loss"],
                "metrics": combined_metrics,
            }, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
                f.write(combined_metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"), header=False, index=False)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            test_examples = MnliMismatchedProcessor().get_test_examples(task.data_dir)
            logits = runner.run_test(test_examples)
            df = pd.DataFrame(logits)
            df.to_csv(os.path.join(args.output_dir, "mm_test_preds.csv"), header=False, index=False)

# Example 5

def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    tokenizer = AutoTokenizer.from_pretrained(args.bert_all_dir)

    classification_lm_model = ssl_reg_model_setup.MyBertClassificationLM(
        bert_load_path=args.bert_all_dir,
        num_labels=len(task.processor.get_labels()))

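    # Optionally list trainable parameters, grouped into the shared encoder,
    # the classification head, and the LM head.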
    if args.do_train:
        if args.print_trainable_params:
            print("TRAINABLE PARAMS:")
            print("  SHARED:")
            for param_name, param in classification_lm_model.classification_model.roberta.named_parameters():
                if param.requires_grad:
                    print("    {}  {}".format(param_name, tuple(param.shape)))
            print("  CLASSIFICATION:")
            for param_name, param in classification_lm_model.classification_model.named_parameters():
                if param.requires_grad and not param_name.startswith("roberta."):
                    print("    {}  {}".format(param_name, tuple(param.shape)))
            print("  LM:")
            for param_name, param in classification_lm_model.lm_model.named_parameters():
                if param.requires_grad and not param_name.startswith("roberta."):
                    print("    {}  {}".format(param_name, tuple(param.shape)))
        train_examples = task.get_train_examples()
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )

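        # Build AdamW parameter groups, splitting bias/LayerNorm weights from
        # the rest. NOTE: as written, both groups set weight_decay to 0.0, so
        # the weight_decay=0.1 passed to AdamW below never takes effect; if
        # decay is intended for the non-bias/LayerNorm group, its value
        # presumably needs to be set there.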
        parameters = list(
            classification_lm_model.classification_model.named_parameters()
        ) + list(classification_lm_model.lm_model.lm_head.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in parameters
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {
                "params": [p for n, p in parameters
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_parameters,
                          lr=args.learning_rate,
                          betas=(args.adam_beta1, args.adam_beta2),
                          eps=1e-6,
                          weight_decay=0.1)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion * t_total,
            num_training_steps=t_total)
    else:
        train_examples = None
        t_total = 0
        optimizer = None
        scheduler = None  # referenced below when building the runner

    runner = ClassificationLMTaskRunner(
        classification_lm_model=classification_lm_model,
        optimizer=optimizer,
        clip_grad_norm=args.clip_grad_norm,
        scheduler=scheduler,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            classification_loss_weight=args.classification_loss_weight,
            train_lm_loss_weight=args.train_lm_loss_weight,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            eval_batch_size=args.eval_batch_size,
        ),
        output_path=args.output_dir)

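    # Training either tracks validation (and, when test labels exist, test)
    # metrics every epoch, or just runs plain training.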
    if args.do_train:
        if args.do_val_history:
            # For GLUE datasets we do not have test-set labels; they can only
            # be evaluated by submitting predictions to the GLUE server.
            if not args.has_test_label:
                val_examples = task.get_dev_examples()
                results = runner.run_train_val(
                    train_examples=train_examples,
                    val_examples=val_examples,
                    task_name=task.name,
                )
                metrics_str = json.dumps(results, indent=2)
                with open(
                        os.path.join(args.output_dir,
                                     "val_metrics_history.json"), "w") as f:
                    f.write(metrics_str)
            else:
                val_examples = task.get_dev_examples()
                test_examples = task.get_test_examples()
                results_val, results_test = runner.run_train_val_test(
                    train_examples=train_examples,
                    val_examples=val_examples,
                    test_examples=test_examples,
                    task_name=task.name,
                    save_best_model=args.save_best_model,
                )
                metrics_str = json.dumps(results_val, indent=2)
                with open(
                        os.path.join(args.output_dir,
                                     "val_metrics_history.json"), "w") as f:
                    f.write(metrics_str)
                metrics_str = json.dumps(results_test, indent=2)
                with open(
                        os.path.join(args.output_dir,
                                     "test_metrics_history.json"), "w") as f:
                    f.write(metrics_str)
        else:
            runner.run_train(train_examples, task_name=task.name)

    if args.do_save:
        if not args.save_best_model:
            # Save a trained model at the last epoch.
            ssl_reg_model_setup.save_bert(
                classification_lm_model=classification_lm_model,
                optimizer=optimizer,
                args=args,
                save_path=os.path.join(args.output_dir, "all_state.p"),
                save_mode=args.bert_save_mode,
            )

    if args.do_val:
        val_examples = task.get_dev_examples()
        runner.load_best_model(os.path.join(args.output_dir, "all_state.p"))
        results = runner.run_evaluate_with_label(val_examples,
                                                 task_name=task.name,
                                                 verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False,
                  index=False)
        metrics_str = json.dumps(
            {
                "loss": results["loss"],
                "metrics": results["metrics"]
            }, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        runner.load_best_model(os.path.join(args.output_dir, "all_state.p"))
        results = runner.run_evaluate_with_label(test_examples,
                                                 task_name=task.name,
                                                 verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"),
                  header=False,
                  index=False)
        metrics_str = json.dumps(
            {
                "loss": results["loss"],
                "metrics": results["metrics"]
            }, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "test_metrics.json"),
                  "w") as f:
            f.write(metrics_str)