Example #1
def main():
    # get args
    args = get_args()

    # set up gpus
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    assert torch.cuda.is_available()

    # set up writer, logger, and save directory for models
    save_root = os.path.join('checkpoints', 
        'dverge', 'seed_{:d}'.format(args.seed), '{:d}_{:s}{:d}_eps_{:.2f}'.format(
            args.model_num, args.arch, args.depth, args.distill_eps)
    )
    if args.distill_fixed_layer:
        save_root += '_fixed_layer_{:d}'.format(args.distill_layer)
    if args.plus_adv:
        save_root += '_plus_adv_coeff_{:.1f}'.format(args.dverge_coeff)
    if args.start_from == 'scratch':
        save_root += '_start_from_scratch'
    if not os.path.exists(save_root):
        os.makedirs(save_root)
    else:
        print('*********************************')
        print('* The checkpoint already exists *')
        print('*********************************')

    writer = SummaryWriter(save_root.replace('checkpoints', 'runs'))

    # dump configurations for potential future references
    with open(os.path.join(save_root, 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4, sort_keys=True)
    with open(os.path.join(save_root.replace('checkpoints', 'runs'), 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4, sort_keys=True)

    # set up random seed
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    # initialize models
    if args.start_from == 'baseline':
        args.model_file = os.path.join(
            'checkpoints', 'baseline', 'seed_0',
            '{:d}_{:s}{:d}'.format(args.model_num, args.arch, args.depth),
            'epoch_200.pth')
    elif args.start_from == 'scratch':
        args.model_file = None
    models = utils.get_models(args, train=True, as_ensemble=False, model_file=args.model_file)

    # get data loaders
    trainloader, testloader = utils.get_loaders(args)

    # get optimizers and schedulers
    optimizers = utils.get_optimizers(args, models)
    schedulers = utils.get_schedulers(args, optimizers)

    # train the ensemble
    trainer = DVERGE_Trainer(models, optimizers, schedulers, trainloader, testloader, writer, save_root, **vars(args))
    trainer.run()
Example #2
def main():
    # get args
    args = get_args()

    # set up gpus
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    assert torch.cuda.is_available()

    # set up writer, logger, and save directory for models
    save_root = os.path.join(
        'checkpoints', 'baseline', 'seed_{:d}'.format(args.seed),
        '{:d}_{:s}{:d}'.format(args.model_num, args.arch, args.depth))
    if not os.path.exists(save_root):
        os.makedirs(save_root)
    else:
        print('*********************************')
        print('* The checkpoint already exists *')
        print('*********************************')

    writer = SummaryWriter(save_root.replace('checkpoints', 'runs'))

    # dump configurations for potential future references
    with open(os.path.join(save_root, 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4)
    with open(
            os.path.join(save_root.replace('checkpoints', 'runs'), 'cfg.json'),
            'w') as fp:
        json.dump(vars(args), fp, indent=4)

    # set up random seed
    torch.manual_seed(args.seed)

    # initialize models
    models = utils.get_models(args,
                              train=True,
                              as_ensemble=False,
                              model_file=None)

    # get data loaders
    trainloader, testloader = utils.get_loaders(args)

    # get optimizers and schedulers
    optimizers = utils.get_optimizers(args, models)
    schedulers = utils.get_schedulers(args, optimizers)

    # train the ensemble
    trainer = Baseline_Trainer(models, optimizers, schedulers, trainloader,
                               testloader, writer, save_root, **vars(args))
    trainer.run()
Example #3
                        choices=utils.get_datasets())

    # Restart train or continue
    PARSER.add_argument("--restart", action='store_true')

    # Learning rate decay arguments
    PARSER.add_argument("--lr_decay", action="store_true")
    PARSER.add_argument("--lr_decay_epochs", type=int, default=25)
    PARSER.add_argument("--lr_decay_factor", type=float, default=0.1)

    # L2 regularization arguments
    PARSER.add_argument("--l2_penalty", type=float, default=0.0)

    # Optimization arguments
    PARSER.add_argument("--optimizer",
                        choices=utils.get_optimizers(),
                        default="MomentumOptimizer")
    PARSER.add_argument("--optimizer_args",
                        type=json.loads,
                        default='''
    {
        "learning_rate": 1e-2,
        "momentum": 0.9
    }''')
    PARSER.add_argument("--batch_size", type=int, default=128)
    PARSER.add_argument("--epochs", type=int, default=150)

    # Hardware
    PARSER.add_argument("--train_device", default="/gpu:0")
    PARSER.add_argument("--eval_device", default="/gpu:0")
Example #4
def train_without_trainer(args):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    batch_size = int(args['--batch-size'])
    logging_steps = int(args['--log-every'])

    tokenizer = transformers.AlbertTokenizer.from_pretrained(
        'albert-base-v2', cache_dir=cache_dir)
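    # ALBERT configuration used when the model is trained from scratch (see the else branch below)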
    albert_for_math_config = transformers.AlbertConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )
    print('Loading Data...')
    train_data = torch.load(
        './data/train_data_train-easy_algebra__linear_1d.pt')
    dev_data = torch.load('./data/dev_data_train-easy_algebra__linear_1d.pt')
    print('Finished loading data')
    data_collator = AnswerMaskDataCollator(tokenizer)
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        sampler=torch.utils.data.sampler.RandomSampler(train_data),
        collate_fn=data_collator.collate_batch)

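    # resume model and optimizer state from a saved checkpoint, or initialize a fresh ALBERT model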
    if args['--load']:
        model = transformers.AlbertForMaskedLM.from_pretrained(
            args['--load-from'])
        optimizer = get_optimizers(model, float(args['--lr']))
        optimizer.load_state_dict(
            torch.load(os.path.join(args['--load-from'], "optimizer.pt"),
                       map_location=device))
        global_step = int(args['--load-from'].split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader))
        steps_trained_in_current_epoch = global_step % len(train_dataloader)
        epoch = epochs_trained
        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)
    else:
        model = transformers.AlbertForMaskedLM(albert_for_math_config)
        optimizer = get_optimizers(model, float(args['--lr']))
        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        epoch = 0
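    # move the model and any restored optimizer state tensors to the selected device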
    model.to(device)
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)
    max_epoch = int(args['--max-epoch'])
    t_total = len(train_dataloader) * max_epoch
    tr_loss = 0.0
    logging_loss = 0.0
    min_eval_loss = 1e20  # might be too high
    valid_niter = int(args['--valid-niter'])
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", max_epoch)
    logger.info("  train batch size = %d", batch_size)
    logger.info("  Total optimization steps = %d", t_total)
    num_eval_samples = 4096
    checkpoint_prefix = 'checkpoint'
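    # main training loop over epochs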
    while epoch < max_epoch:

        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, inputs in enumerate(epoch_iterator):
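            # when resuming from a checkpoint, skip batches already covered in the current epoch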
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
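            # forward/backward pass, gradient clipping, and optimizer update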
            tr_loss += train_step(model, inputs, device)
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           float(args['--clip-grad']))
            optimizer.step()
            model.zero_grad()
            global_step += 1
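            # periodic logging: average training loss since the last log, plus the configured learning rate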
            if global_step % logging_steps == 0:
                logs: Dict[str, float] = {}
                logs["loss"] = (tr_loss - logging_loss) / logging_steps
                logs["lr"] = (optimizer.defaults['lr']
                              )  # possible RuntimeError
                logs["epoch"] = epoch
                logs["step"] = global_step
                logging_loss = tr_loss
                log(logs)
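            # periodic evaluation on the first num_eval_samples dev examples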
            if global_step % valid_niter == 0:
                eval_loss = 0.0
                description = "Evaluation"
                sampler = torch.utils.data.sampler.SequentialSampler(
                    dev_data[:num_eval_samples])
                eval_dataloader = torch.utils.data.DataLoader(
                    dev_data[:num_eval_samples],
                    sampler=sampler,
                    batch_size=batch_size,
                    collate_fn=data_collator.collate_batch,
                )
                logger.info("***** Running %s *****", description)
                logger.info("   Num Examples = %d", num_eval_samples)
                logger.info("   Batch size = %d", batch_size)
                for inputs in tqdm(eval_dataloader, desc=description):
                    for k, v in inputs.items():
                        inputs[k] = v.to(device)
                    model.eval()
                    with torch.no_grad():
                        outputs = model(**inputs)
                        loss = outputs[0]
                        eval_loss += loss.item()
                print("\nEvaluation loss = %f" %
                      (eval_loss / num_eval_samples))
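                # checkpoint the best model so far under --save-to/validations/, rotating existing validation checkpoints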
                if eval_loss / num_eval_samples * batch_size < min_eval_loss:
                    min_eval_loss = eval_loss / num_eval_samples * batch_size
                    # save model and optimizer

                    output_dir = os.path.join(
                        args['--save-to'] + '/validations/',
                        f"{checkpoint_prefix}-{global_step}")
                    os.makedirs(output_dir, exist_ok=True)
                    model.save_pretrained(output_dir)
                    output_dir = os.path.join(args['--save-to'] +
                                              '/validations/')
                    rotate_checkpoints(output_dir)
                    output_dir = os.path.join(
                        args['--save-to'] + '/validations/',
                        f"{checkpoint_prefix}-{global_step}")
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
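            # unconditional checkpoint every --save-every steps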
            if global_step % int(args['--save-every']) == 0:
                output_dir = os.path.join(
                    args['--save-to'], f"{checkpoint_prefix}-{global_step}")
                os.makedirs(output_dir, exist_ok=True)
                model.save_pretrained(output_dir)
                output_dir = os.path.join(args['--save-to'])
                rotate_checkpoints(output_dir)
                output_dir = os.path.join(
                    args['--save-to'], f"{checkpoint_prefix}-{global_step}")
                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
        epoch_iterator.close()
        epoch += 1
    logger.info(
        "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n"
    )
Example #5
def main():
    # get args
    args = get_args()

    # set up gpus
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    assert torch.cuda.is_available()

    # set up writer, logger, and save directory for models
    save_root = os.path.join('checkpoints', 'transfer',
                             'seed_{:d}'.format(args.seed),
                             '{:s}{:d}'.format(args.arch, args.depth))

    save_root += "%.2f" % (args.transfer_coeff)

    if not os.path.exists(save_root):
        os.makedirs(save_root)
    else:
        print('*********************************')
        print('* The checkpoint already exists *')
        print('*********************************')

    writer = SummaryWriter(save_root.replace('checkpoints', 'runs'))

    # dump configurations for potential future references
    with open(os.path.join(save_root, 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4)
    with open(
            os.path.join(save_root.replace('checkpoints', 'runs'), 'cfg.json'),
            'w') as fp:
        json.dump(vars(args), fp, indent=4)

    # set up random seed
    torch.manual_seed(args.seed)

    # initialize models
    models = utils.get_models(args,
                              train=True,
                              as_ensemble=False,
                              model_file="/sync_transfer/CIFAR/epoch_200.pth",
                              dataset="CIFAR-10")

    # get data loaders
    source_trainloader, source_testloader = utils.get_loaders(
        args, dataset="CIFAR-10")
    target_trainloader, target_testloader = utils.get_loaders(args,
                                                              dataset="STL-10")
    # get optimizers and schedulers
    optimizers = utils.get_optimizers(args, models)
    schedulers = utils.get_schedulers(args, optimizers)

    surrogate = utils.get_models(args,
                                 train=False,
                                 as_ensemble=False,
                                 model_file="/sync_transfer/STL/epoch_200.pth",
                                 dataset="STL-10")
    trainer = Transfer_Trainer(models, optimizers, schedulers,
                               source_trainloader, source_testloader,
                               target_trainloader, surrogate, writer,
                               save_root, **vars(args))
    trainer.run()