Example #1
def main():
    """Main training program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False
    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()
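    # Transformer-XL memory is only used when --transformer-xl is set; force
    # the memory length to zero otherwise.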
    args.mem_length = args.mem_length if args.transformer_xl else 0
    if args.load and not args.new_save_directory:
        args.experiment_name = os.path.basename(os.path.normpath(args.load))
    else:
        args.experiment_name = args.experiment_name + datetime.now().strftime(
            "%m-%d-%H-%M")
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)
    # PyTorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Tokenizer and data loaders.
    global tokenizer
    tokenizer = prepare_tokenizer(args)
    train_data, val_data, test_data = get_train_val_test_data(args, tokenizer)
    multi_train_data, multi_val_data = None, None
    if args.multi_task_ratio > 0.0:
        multi_train_data, multi_val_data = build_multi_task_dataset(
            args, tokenizer)

    # Model, optimizer, and learning rate scheduler.
    model, optimizer, lr_scheduler = setup_model_and_optimizer(args)

    if args.load is not None:
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                      timeout=-1):
            args.iteration = load_checkpoint(model, optimizer, lr_scheduler,
                                             args)
    else:
        args.iteration = 0
    torch.distributed.barrier()
    if args.switch_linear:
        lr_scheduler.switch_linear(args)

    summary_writer = None
    if torch.distributed.get_rank() == 0:
        print('Pretrain GPT2 model')
        args.log_dir = None
        if args.train_iters > 0:
            args.log_dir = get_log_dir(base=args.summary_dir,
                                       name=args.experiment_name)
            summary_writer = get_sample_writer(log_dir=args.log_dir,
                                               iteration=args.iteration)
        print_and_save_args(args, verbose=True, log_dir=args.log_dir)

    # Resume data loader if necessary.
    if args.resume_dataloader:
        print_rank_0("Resume dataloader")
        if train_data is not None:
            train_data.batch_sampler.start_iter = args.iteration % len(
                train_data)
        if val_data is not None:
            start_iter_val = (args.iteration //
                              args.eval_interval) * args.eval_iters
            val_data.batch_sampler.start_iter = start_iter_val % len(val_data)
        if multi_train_data is not None:
            multi_train_data.batch_sampler.start_iter = int(
                args.iteration * args.multi_task_ratio) % len(multi_train_data)
        if multi_val_data is not None:
            start_iter_val = (args.iteration // args.eval_interval
                              ) * args.eval_iters * args.multi_task_ratio
            multi_val_data.batch_sampler.start_iter = start_iter_val % len(
                multi_val_data)
    if train_data is not None:
        train_data_iterator = iter(train_data)
    else:
        train_data_iterator = None
    if multi_train_data is not None:
        multi_train_iterator = iter(multi_train_data)
    else:
        multi_train_iterator = None
    if val_data is not None:
        val_data_iterator = iter(val_data)
    else:
        val_data_iterator = None
    if multi_val_data is not None:
        multi_val_iterator = iter(multi_val_data)
    else:
        multi_val_iterator = None

    # TODO: figure out how to properly set this, especially when resuming training
    iteration = 0
    if args.train_iters > 0:
        if args.do_train:
            with ExitStack() as stack:

                def save_on_exit(args_, model_, optimizer_, lr_scheduler_):
                    save_checkpoint(args_.iteration, model_, optimizer_,
                                    lr_scheduler_, args_)

                # stack.callback(save_on_exit, args, model, optimizer, lr_scheduler)
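                # The checkpoint-on-exit callback is defined above, but its
                # registration with the ExitStack is currently commented out.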
                iteration, skipped = train(
                    model,
                    optimizer,
                    lr_scheduler, (train_data_iterator, multi_train_iterator),
                    (val_data_iterator, multi_val_iterator),
                    timers,
                    args,
                    summary_writer=summary_writer)

        if args.do_valid:
            prefix = 'the end of training for val data'
            val_loss = evaluate_and_print_results(
                prefix,
                val_data_iterator,
                model,
                args,
                timers,
                verbose=False,
                forward_step_func=forward_step)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler, args)

    if test_data is not None:
        test_data_iterator = iter(test_data)
    else:
        test_data_iterator = None

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, (test_data_iterator, None),
                                   model,
                                   args,
                                   timers,
                                   verbose=True,
                                   forward_step_func=forward_step)
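
A note on the resume logic above: the --resume-dataloader branch fast-forwards each batch sampler with modular arithmetic. Below is a minimal, self-contained sketch of that arithmetic; the helper name resume_offsets and the sample numbers are illustrative and not part of the original script.

def resume_offsets(iteration, train_len, val_len, eval_interval, eval_iters):
    """Compute sampler start offsets when resuming at `iteration`.

    train_len and val_len are the number of batches one pass over each
    loader yields.
    """
    # Training resumes inside the current pass over the training loader.
    train_start = iteration % train_len
    # Validation has run once every `eval_interval` iterations, consuming
    # `eval_iters` batches each time; skip the batches already consumed.
    val_start = ((iteration // eval_interval) * eval_iters) % val_len
    return train_start, val_start

# Resuming at iteration 25000 with 10000 train batches per pass, 2000 val
# batches, and evaluation every 1000 iterations for 100 batches:
print(resume_offsets(25000, 10000, 2000, 1000, 100))  # -> (5000, 500)
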
Example #2
    # Create an outputs/ folder in the blob storage
    if args.output_dir is None:
        # No explicit output dir was given; fall back to ./outputs under the
        # working directory (os.path.join with None would fail).
        parent_dir = os.path.join(os.getcwd(), 'outputs',
                                  str(experiment_name))
        output_dir = os.path.join(parent_dir, str(run_id))
        os.makedirs(output_dir, exist_ok=True)
        saved_model_path = os.path.join(output_dir, "saved_models", job_name)
        os.makedirs(saved_model_path, exist_ok=True)
    else:
        saved_model_path = args.output_dir

    summary_writer = None
    # Prepare Summary Writer and saved_models path
    if check_write_log():
        # azureml.tensorboard only streams from the ./logs directory, so the path is hardcoded.
        summary_writer = get_sample_writer(name=job_name + str(local_rank),
                                           base='./logs')

    # Load the tokenizer (vocabulary from blob storage, if it exists).
    logger.info("Extracting the vocabulary")
    if args.tokenizer_path:
        logger.info(f'Loading tokenizer from {args.tokenizer_path}')
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                  cache_dir=args.output_dir)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            job_config.get_token_file_type(), cache_dir=args.output_dir)
    logger.info("Vocabulary contains {} tokens".format(
        len(list(tokenizer.vocab.keys()))))

    # Loading Model
    logger.info("Initializing BertMultiTask model")
Example #3
    # Create an outputs/ folder in the blob storage
    if args.output_dir is None:
        # No explicit output dir was given; fall back to ./outputs under the
        # working directory (os.path.join with None would fail).
        parent_dir = os.path.join(os.getcwd(), 'outputs',
                                  str(run.experiment.name))
        output_dir = os.path.join(parent_dir, str(run.id))
        os.makedirs(output_dir, exist_ok=True)
        saved_model_path = os.path.join(output_dir, "saved_models", job_name)
        os.makedirs(saved_model_path, exist_ok=True)
    else:
        saved_model_path = args.output_dir

    summary_writer = None
    # Prepare Summary Writer and saved_models path
    if check_write_log():
        # azureml.tensorboard only streams from the ./logs directory, so the path is hardcoded.
        summary_writer = get_sample_writer(name=job_name, base='./logs')

    # Load the tokenizer (vocabulary from blob storage, if it exists).
    logger.info("Extracting the vocabulary")
    if args.tokenizer_path:
        logger.info(f'Loading tokenizer from {args.tokenizer_path}')
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                  cache_dir=args.output_dir)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            job_config.get_token_file_type(), cache_dir=args.output_dir)
    logger.info("Vocabulary contains {} tokens".format(
        len(list(tokenizer.vocab.keys()))))

    # Loading Model
    logger.info("Initializing BertMultiTask model")
Example #4
def main():
    """Main training program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False
    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()
    args.mem_length = args.mem_length if args.transformer_xl else 0
    if args.load:
        args.experiment_name = os.path.basename(os.path.normpath(args.load))
    else:
        args.experiment_name = args.experiment_name + datetime.now().strftime(
            "%m-%d-%H-%M")
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)
    # PyTorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data loaders.
    train_data, val_data, test_data, args.vocab_size, \
        args.eod_token = get_train_val_test_data(args)

    # Model, optimizer, and learning rate scheduler.
    model, optimizer, lr_scheduler = setup_model_and_optimizer(args)

    if args.load is not None:
        with FileLock("/root/checkpoint_lock", timeout=-1):
            args.iteration = load_checkpoint(model, optimizer, lr_scheduler,
                                             args)
    else:
        args.iteration = 0
    torch.distributed.barrier()

    summary_writer = None
    if torch.distributed.get_rank() == 0:
        print('Pretrain GPT2 model')
        print_args(args)
        summary_writer = get_sample_writer(base=args.summary_dir,
                                           name=args.experiment_name,
                                           iteration=args.iteration)

    # Resume data loader if necessary.
    if args.resume_dataloader:
        if train_data is not None:
            train_data.batch_sampler.start_iter = args.iteration % \
                                                  len(train_data)
        if val_data is not None:
            start_iter_val = (args.train_iters // args.save_interval) * \
                             args.eval_interval
            val_data.batch_sampler.start_iter = start_iter_val % \
                                                len(val_data)
    if train_data is not None:
        train_data_iterator = iter(train_data)
    else:
        train_data_iterator = None
    if val_data is not None:
        val_data_iterator = iter(val_data)
    else:
        val_data_iterator = None

    # TODO: figure out how to properly set this, especially when resuming training
    iteration = 0
    if args.train_iters > 0:
        if args.do_train:
            with ExitStack() as stack:

                def save_on_exit(args_, model_, optimizer_, lr_scheduler_):
                    save_checkpoint(args_.iteration, model_, optimizer_,
                                    lr_scheduler_, args_)

                # stack.callback(save_on_exit, args, model, optimizer, lr_scheduler)
                iteration, skipped = train(model,
                                           optimizer,
                                           lr_scheduler,
                                           train_data_iterator,
                                           val_data_iterator,
                                           timers,
                                           args,
                                           summary_writer=summary_writer)

        if args.do_valid:
            prefix = 'the end of training for val data'
            val_loss = evaluate_and_print_results(prefix, val_data_iterator,
                                                  model, args, timers, False)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler, args)

    if test_data is not None:
        test_data_iterator = iter(test_data)
    else:
        test_data_iterator = None

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, test_data_iterator, model, args,
                                   timers, True)
Example #5
def finetune(args,
             train_valid_datasets_provider,
             model_kwargs,
             forward_step=finetune_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    global tokenizer
    timers = Timers()
    tokenizer = prepare_tokenizer(args)
    pretrain_glm.tokenizer = tokenizer
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)
    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    train_dataloader, valid_dataloader = None, None
    train_block_dataloader, valid_block_dataloader = None, None
    if train_valid_datasets_provider is not None and args.epochs > 0:
        if mpu.get_model_parallel_rank() == 0:
            train_dataset, valid_dataset = train_valid_datasets_provider(
                args, tokenizer)
            train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
                train_dataset, valid_dataset, args)
            if args.no_validation:
                valid_dataloader = None
            train_iters = torch.cuda.LongTensor([len(train_dataloader)])
        else:
            train_iters = torch.cuda.LongTensor([0])
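        # Broadcast the per-epoch iteration count from the model-parallel source
        # rank so the other ranks can drive FakeDataloader stand-ins in lockstep.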
        torch.distributed.broadcast(train_iters,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        if mpu.get_model_parallel_rank() != 0:
            args.train_iters_per_epoch = train_iters[0].item()
            args.train_iters = args.epochs * args.train_iters_per_epoch

            train_dataloader = FakeDataloader(args.train_iters_per_epoch)
            if args.no_validation:
                valid_dataloader = None
            else:
                valid_dataloader = FakeDataloader(None)
        if args.block_lm_ratio > 0.0:
            if mpu.get_model_parallel_rank() == 0:
                train_block_dataset, valid_block_dataset = train_valid_datasets_provider(
                    args, tokenizer, pattern_text=True)
                train_block_dataloader = make_data_loader(
                    train_block_dataset,
                    tokenizer,
                    args.batch_size * mpu.get_data_parallel_world_size(),
                    args.train_iters,
                    args,
                    shuffle=True,
                    block_collate=True)
                valid_block_dataloader = make_data_loader(
                    valid_block_dataset,
                    tokenizer,
                    args.batch_size * mpu.get_data_parallel_world_size(),
                    (args.train_iters // args.eval_interval + 1) *
                    args.eval_iters,
                    args,
                    shuffle=True,
                    block_collate=True)
            else:
                train_block_dataloader = FakeDataloader(args.train_iters)
                valid_block_dataloader = FakeDataloader(None)
            train_block_dataloader, valid_block_dataloader = iter(
                train_block_dataloader), iter(valid_block_dataloader)

    timers('train/valid/test dataset/dataloader').stop()
    # Build callback functions.
    timers('callback function').start()
    end_of_epoch_callback, end_of_train_callback = None, None
    if end_of_epoch_callback_provider is not None:
        if train_valid_datasets_provider is not None and args.epochs > 0 and not args.no_validation:
            end_of_epoch_callback = end_of_epoch_callback_provider(
                args, tokenizer, is_test=False)
        end_of_train_callback = end_of_epoch_callback_provider(args,
                                                               tokenizer,
                                                               is_test=True)
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        args, **model_kwargs)
    timers('model and optimizer').stop()

    # If pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.load_pretrained is not None and not args.pretrained_bert:
        task_tokens = None
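        # For continuous prompts, model-parallel rank 0 collects the verbalizer
        # token ids for every label and broadcasts them so every rank passes the
        # same task tokens to load_pretrained.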
        if args.continuous_prompt and args.prompt_init:
            if mpu.get_model_parallel_rank() == 0:
                dataset = train_dataloader.dataset
                processor, pvp = dataset.processor, dataset.pvp
                task_tokens = []
                for label in processor.get_labels():
                    verbalizer = pvp.verbalize(label)[0]
                    verbalizer_ids = tokenizer.EncodeAsIds(
                        verbalizer).tokenization
                    task_tokens += verbalizer_ids
                print_rank_0("Task tokens: " +
                             tokenizer.DecodeIds(task_tokens))
                num_task_tokens = len(task_tokens)
            else:
                num_task_tokens, task_tokens = 0, []
            num_task_tokens = torch.cuda.LongTensor([num_task_tokens])
            torch.distributed.broadcast(num_task_tokens,
                                        mpu.get_model_parallel_src_rank(),
                                        group=mpu.get_model_parallel_group())
            num_task_tokens = num_task_tokens.item()
            if num_task_tokens > 0:
                if mpu.get_model_parallel_rank() == 0:
                    task_tokens = torch.cuda.LongTensor(task_tokens)
                else:
                    task_tokens = torch.empty(
                        num_task_tokens,
                        device=torch.cuda.current_device(),
                        dtype=torch.long)
                torch.distributed.broadcast(
                    task_tokens,
                    mpu.get_model_parallel_src_rank(),
                    group=mpu.get_model_parallel_group())
                task_tokens = task_tokens.tolist()
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                      timeout=-1):
            load_pretrained(model,
                            args.load_pretrained,
                            args,
                            task_tokens=task_tokens)
        # This is critical when only the model is loaded: make sure the
        # master parameters are also updated.
        if args.fp16 and optimizer is not None:
            if args.deepspeed:
                optimizer.refresh_fp32_params()
            else:
                optimizer._model_params_to_master_params()
    if args.load is not None:
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                      timeout=-1):
            load_checkpoint(model,
                            optimizer,
                            lr_scheduler,
                            args,
                            no_deepspeed=args.no_deepspeed_load)
        # This is critical when only the model is loaded: make sure the
        # master parameters are also updated.
        if args.fp16 and optimizer is not None:
            if args.deepspeed:
                optimizer.refresh_fp32_params()
            else:
                optimizer._model_params_to_master_params()
    torch.distributed.barrier()
    timers('pretrained checkpoint').stop()
    args.iteration = 0
    summary_writer = None
    if torch.distributed.get_rank() == 0:
        args.log_dir = get_log_dir(base=args.summary_dir,
                                   name=args.experiment_name)
        if os.path.exists(os.path.join(args.log_dir, "test_results.json")
                          ) and args.load is None and not args.overwrite:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.log_dir))
        summary_writer = get_sample_writer(log_dir=args.log_dir,
                                           iteration=args.iteration)
        print_and_save_args(args, verbose=True, log_dir=args.log_dir)

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    score_dict = None
    if train_dataloader is not None and args.epochs > 0:
        if args.block_lm_ratio > 0.0:
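            # Mix block-LM batches into finetuning by swapping in the combined
            # forward step.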
            forward_step = mix_forward_step
        best_iteration = _train(model,
                                optimizer,
                                lr_scheduler,
                                forward_step,
                                (train_dataloader, train_block_dataloader),
                                (valid_dataloader, valid_block_dataloader),
                                end_of_epoch_callback,
                                args,
                                timers,
                                summary_writer=summary_writer)
        if end_of_train_callback is not None and best_iteration is not None:
            with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"),
                          timeout=-1):
                args.load = os.path.join(args.save, "best")
                load_checkpoint(model,
                                optimizer,
                                lr_scheduler,
                                args,
                                no_load_optim=True,
                                no_deepspeed=True)
                args.load = None
        torch.distributed.barrier()
        if end_of_train_callback is not None:
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    # Or just evaluate.
    else:
        if end_of_train_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    if score_dict is not None and torch.distributed.get_rank() == 0:
        score_dict.update({"type": "test"})
        with open(os.path.join(args.log_dir, "test_results.json"),
                  "w") as output:
            output.write(json.dumps(score_dict) + "\n")

    print_rank_0('done :-)')
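
FakeDataloader above keeps the non-source model-parallel ranks iterating in lockstep without building real datasets; its implementation is not shown in this snippet. Below is a minimal sketch of the idea, under the assumption that the consumer only needs an iterator yielding placeholder batches a fixed number of times (or forever when the length is None); the class is hypothetical and named FakeDataloaderSketch to avoid confusion with the real one.

class FakeDataloaderSketch:
    """Hypothetical stand-in: yields None `num_iters` times, or forever if None."""

    def __init__(self, num_iters=None):
        self.num_iters = num_iters

    def __iter__(self):
        if self.num_iters is None:
            # Validation-style loader: iterate until the caller stops.
            while True:
                yield None
        else:
            for _ in range(self.num_iters):
                yield None

# Non-source ranks consume it exactly like a real loader; real batches are
# only produced on the model-parallel source rank.
for _ in FakeDataloaderSketch(3):
    pass
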
Example #6
def finetune(args,
             train_valid_datasets_provider,
             model_kwargs,
             forward_step=finetune_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    global tokenizer
    timers = Timers()
    tokenizer = prepare_tokenizer(args)
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)
    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    train_dataloader, valid_dataloader = None, None
    if train_valid_datasets_provider is not None and args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider(
            args, tokenizer)
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset, args)
    timers('train/valid/test dataset/dataloader').stop()
    # Build callback functions.
    timers('callback function').start()
    end_of_epoch_callback, end_of_train_callback = None, None
    if end_of_epoch_callback_provider is not None:
        if train_valid_datasets_provider is not None and args.epochs > 0:
            end_of_epoch_callback = end_of_epoch_callback_provider(
                args, tokenizer, is_test=False)
        end_of_train_callback = end_of_epoch_callback_provider(args,
                                                               tokenizer,
                                                               is_test=True)
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        args, **model_kwargs)
    timers('model and optimizer').stop()

    # If pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.load_pretrained is not None and not args.pretrained_bert and not args.load:
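        # Unwrap the DDP and FP16 wrappers so the pretrained checkpoint is
        # loaded into the bare GLMModel (task heads keep it on `.model`).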
        module = model
        if isinstance(module, (LocalDDP, TorchDDP)):
            module = module.module
        if isinstance(module, FP16_Module):
            module = module.module
        if not isinstance(module, GLMModel):
            module = module.model
        args.load = args.load_pretrained
        load_checkpoint(module, optimizer, lr_scheduler, args)
        args.load = None
        # This is critical when only the model is loaded: make sure the
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    if args.load is not None:
        load_checkpoint(model, optimizer, lr_scheduler, args)
        # This is critical when only the model is loaded: make sure the
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()
    args.iteration = 0
    summary_writer = None
    if torch.distributed.get_rank() == 0:
        args.log_dir = get_log_dir(base=args.summary_dir,
                                   name=args.experiment_name)
        if os.path.exists(os.path.join(args.log_dir, "test_results.json")
                          ) and args.load is None and not args.overwrite:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.log_dir))
        summary_writer = get_sample_writer(log_dir=args.log_dir,
                                           iteration=args.iteration)
        print_and_save_args(args, verbose=False, log_dir=args.log_dir)

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    score_dict = None
    if train_dataloader is not None and args.epochs > 0:
        best_iteration = _train(model,
                                optimizer,
                                lr_scheduler,
                                forward_step,
                                train_dataloader,
                                valid_dataloader,
                                end_of_epoch_callback,
                                args,
                                timers,
                                summary_writer=summary_writer)
        if best_iteration is not None and end_of_train_callback is not None:
            args.load = os.path.join(args.save, "best")
            load_checkpoint(model, optimizer, lr_scheduler, args)
            args.load = None
        if end_of_train_callback is not None:
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    # Or just evaluate.
    else:
        if end_of_train_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            score_dict = end_of_train_callback(model,
                                               epoch=-1,
                                               output_predictions=True)
    if score_dict is not None and torch.distributed.get_rank() == 0:
        score_dict.update({"type": "test"})
        with open(os.path.join(args.log_dir, "test_results.json"),
                  "w") as output:
            output.write(json.dumps(score_dict) + "\n")

    print_rank_0('done :-)')
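
The unwrapping chain in Example #6 (LocalDDP/TorchDDP, then FP16_Module, then the GLMModel held by a task-specific head) is a recurring pattern whenever a checkpoint targets the bare backbone. Below is a sketch of the same logic factored into a helper; the wrapper class names come from the example above, and the helper itself is illustrative rather than part of the original code.

def unwrap_model(model):
    """Peel distributed and mixed-precision wrappers off until the backbone remains."""
    module = model
    if isinstance(module, (LocalDDP, TorchDDP)):
        module = module.module
    if isinstance(module, FP16_Module):
        module = module.module
    if not isinstance(module, GLMModel):
        # Task-specific heads in the example keep the backbone on `.model`.
        module = module.model
    return module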