Beispiel #1
0
def main():
    """Run the GLUE task pipeline selected by the command-line arguments.

    Sets up logging, device, seed, batch size and output directory, builds the
    tokenizer / model / (optionally) optimizer, then executes whichever of the
    ``do_train``, ``do_save``, ``do_val`` and ``do_test`` stages are enabled
    on ``args``. Predictions and metrics are written under ``args.output_dir``.
    """
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)
    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        bert_load_args=args.bert_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        bert_config_json_path=args.bert_config_json_path,
    )
    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )
        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            # Only the "state_all" load mode passes a saved optimizer state.
            state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
        )
    else:
        # Evaluation-only run: the runner is built without an optimizer.
        train_examples = None
        t_total = 0
        optimizer = None

    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank, n_gpu=n_gpu, fp16=args.fp16,
            learning_rate=args.learning_rate, gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total, warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size, eval_batch_size=args.eval_batch_size,
        )
    )

    if args.do_train:
        # The two training variants below are mutually exclusive.
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(train_dataloader):
                    # Checkpoint every `train_save_every` steps and on the
                    # final step of each epoch.
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_bert(
                            model=model, optimizer=optimizer, args=args,
                            save_path=os.path.join(
                                args.output_dir, f"all_state___epoch{epoch:04d}___batch{step:06d}.p"
                            ),
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_bert(
            model=model, optimizer=optimizer, args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            save_mode=args.bert_save_mode,
        )

    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"), header=False, index=False)
        metrics_str = json.dumps({"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_examples = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
            mm_results = runner.run_val(mm_val_examples, task_name=task.name, verbose=not args.not_verbose)
            # Write the *mismatched* logits; previously the matched logits
            # were written here, duplicating val_preds.csv.
            mm_df = pd.DataFrame(mm_results["logits"])
            mm_df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"), header=False, index=False)
            # dict() accepts either a mapping or an iterable of (key, value)
            # pairs, so this works for both shapes; iterating a mapping
            # directly would have tried to unpack each key string.
            combined_metrics = dict(results["metrics"])
            for k, v in dict(mm_results["metrics"]).items():
                combined_metrics["mm-"+k] = v
            # NOTE: only the matched-set loss is reported in the combined file.
            combined_metrics_str = json.dumps({
                "loss": results["loss"],
                "metrics": combined_metrics,
            }, indent=2)
            # Overwrites the matched-only val_metrics.json written above.
            with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
                f.write(combined_metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"), header=False, index=False)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            test_examples = MnliMismatchedProcessor().get_test_examples(task.data_dir)
            # Pass the verbosity flag like every other run_val/run_test call.
            logits = runner.run_test(test_examples, verbose=not args.not_verbose)
            df = pd.DataFrame(logits)
            df.to_csv(os.path.join(args.output_dir, "mm_test_preds.csv"), header=False, index=False)
Beispiel #2
0
def main():
    """Run the XLNet variant of the GLUE task pipeline.

    Sets up logging, device, seed, batch size and output directory, builds the
    tokenizer / model / (optionally) optimizer from the ``xlnet_*`` arguments,
    then executes whichever of the ``do_train``, ``do_save``, ``do_val`` and
    ``do_test`` stages are enabled on ``args``. Outputs are written under
    ``args.output_dir``.
    """
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    # NOTE(review): may fail for task names (e.g. imdb) that get_task does
    # not define -- confirm against the task registry.
    task = get_task(args.task_name, args.data_dir)

    # Create the tokenizer from the given model arguments.
    # NOTE(review): assumes create_tokenizer accepts these xlnet_* keyword
    # arguments -- confirm against shared_model_setup.
    tokenizer = shared_model_setup.create_tokenizer(
        xlnet_model_name=args.xlnet_model,
        xlnet_load_mode=args.xlnet_load_mode,
        do_lower_case=args.do_lower_case,
        xlnet_vocab_path=args.xlnet_vocab_path,
    )
    # Presumably the pre-trained checkpoint; verify against the caller.
    all_state = shared_model_setup.load_overall_state(
        args.xlnet_load_path,
        relaxed=True)

    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        xlnet_model_name=args.xlnet_model,
        xlnet_load_mode=args.xlnet_load_mode,
        xlnet_load_args=args.xlnet_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        xlnet_config_json_path=args.xlnet_config_json_path,
    )
    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        # Optionally train on a random subset of the training examples.
        if args.train_examples_number is not None:
            train_examples = random_sample(train_examples,
                                           args.train_examples_number)
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )

        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            # Gate on the same load-mode argument used for the tokenizer and
            # model above (previously read args.bert_load_mode).
            state_dict=all_state["optimizer"]
            if args.xlnet_load_mode == "state_all" else None,
        )
    else:
        # Evaluation-only run: the runner is built without an optimizer.
        train_examples = None
        t_total = 0
        optimizer = None
    # TODO: confirm that GlueTaskRunner needs no XLNet-specific handling.
    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            eval_batch_size=args.eval_batch_size,
        ))

    if args.do_train:
        # The two training variants below are mutually exclusive.
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(
                    os.path.join(args.output_dir, "val_metrics_history.json"),
                    "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(
                train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(
                        train_dataloader):
                    # Checkpoint every `train_save_every` steps and on the
                    # final step of each epoch.
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_xlnet(
                            model=model,
                            optimizer=optimizer,
                            args=args,
                            save_path=os.path.join(
                                args.output_dir,
                                f"all_state___epoch{epoch:04d}___batch{step:06d}.p"
                            ),
                            # NOTE(review): still reads the bert-named save
                            # mode argument -- confirm get_args defines it.
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_xlnet(
            model=model,
            optimizer=optimizer,
            args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            # NOTE(review): still reads the bert-named save mode argument --
            # confirm get_args defines it.
            save_mode=args.bert_save_mode,
        )
    # The MultiNLI-mismatched special case is intentionally not handled here.
    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples,
                                 task_name=task.name,
                                 verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False,
                  index=False)
        metrics_str = json.dumps(
            {
                "loss": results["loss"],
                "metrics": results["metrics"]
            }, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"),
                  header=False,
                  index=False)
Beispiel #3
0
def main():
    """Train a BERT language model on ``args.train_file`` and save it.

    Configures logging and the run environment, builds the tokenizer, model,
    dataset, optimizer and runner from the parsed arguments, runs training,
    and writes the resulting state to ``all_state.p`` in ``args.output_dir``.
    """
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO,
    )
    log = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    # Environment / run-directory preparation.
    device, gpu_count = initialization.init_cuda_from_args(args, logger=log)
    initialization.init_seed(args, n_gpu=gpu_count, logger=log)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)

    # Tokenizer and model.
    bert_tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    loaded_state = shared_model_setup.load_overall_state(
        args.bert_load_path,
        relaxed=True,
    )
    lm_model = lm_model_setup.create_model(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        bert_load_args=args.bert_load_args,
        all_state=loaded_state,
        device=device,
        n_gpu=gpu_count,
        fp16=args.fp16,
        local_rank=args.local_rank,
        bert_config_json_path=args.bert_config_json_path,
    )
    if args.print_trainable_params:
        log_info.print_trainable_params(lm_model)

    # Training data and the step budget derived from its size.
    dataset = lm_runners.LMDataset(
        args.train_file,
        bert_tokenizer,
        seq_len=args.max_seq_length,
        corpus_lines=None,
        on_memory=args.on_memory,
    )
    total_steps = shared_model_setup.get_opt_train_steps(
        num_train_examples=len(dataset),
        args=args,
    )

    # Only the "state_all" load mode passes a saved optimizer state.
    if args.bert_load_mode == "state_all":
        optimizer_state = loaded_state["optimizer"]
    else:
        optimizer_state = None
    opt = shared_model_setup.create_optimizer(
        model=lm_model,
        learning_rate=args.learning_rate,
        t_total=total_steps,
        loss_scale=args.loss_scale,
        fp16=args.fp16,
        warmup_proportion=args.warmup_proportion,
        state_dict=optimizer_state,
    )

    runner_params = lm_runners.RunnerParameters(
        select_prob=args.select_prob,
        max_seq_length=args.max_seq_length,
        local_rank=args.local_rank,
        n_gpu=gpu_count,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=total_steps,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
    )
    lm_runner = lm_runners.LMRunner(
        model=lm_model,
        optimizer=opt,
        tokenizer=bert_tokenizer,
        device=device,
        rparams=runner_params,
    )

    lm_runner.run_train(dataset)

    # Persist the trained state once training has finished.
    final_state_path = os.path.join(args.output_dir, "all_state.p")
    lm_model_setup.save_bert(
        model=lm_model,
        optimizer=opt,
        args=args,
        save_path=final_state_path,
        save_mode=args.bert_save_mode,
    )