Example #1
            clip = abs(args.clip)
        #settings.MILESTONES = [120,150,180]
        optimizer = FromageCSV7(net.parameters(),
                                lr=args.lr,
                                weight_decay=args.wd,
                                bias_clip=clip,
                                beta1=args.momentum,
                                beta2=args.beta)
    elif args.optimizer == 'fromage':
        print("using fromage!")
        if args.clip == 0:
            clip = math.inf
        else:
            clip = abs(args.clip)
        #settings.MILESTONES = [120,150,180]
        optimizer = Fromage(net.parameters(), lr=args.lr)
    elif args.optimizer == 'madam':
        print("using madam!")
        if args.clip == 0:
            clip = math.inf
        else:
            clip = abs(args.clip)
        #settings.MILESTONES = [120,150,180]
        optimizer = Madam(net.parameters(), lr=args.lr)

    if args.sch == "step":
        train_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=settings.MILESTONES,
            gamma=args.gamma)  #learning rate decay
    elif args.sch == "poly":
        train_scheduler = PolyLR(optimizer,
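
Example #1 is truncated at both ends; a minimal, self-contained sketch of the same "pick an optimizer, then a scheduler" pattern, using only stock torch.optim classes plus Fromage (FromageCSV7, Madam and PolyLR are project-specific and assumed to be defined elsewhere), might look like this:

import math
import torch.optim as optim
from fromage import Fromage  # assumes the reference Fromage implementation is installed

def build_optimizer_and_scheduler(net, args):
    # Interpret clip == 0 as "no clipping", mirroring the branches above.
    clip = math.inf if args.clip == 0 else abs(args.clip)
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=args.lr,
                              momentum=args.momentum, weight_decay=args.wd)
    elif args.optimizer == 'fromage':
        optimizer = Fromage(net.parameters(), lr=args.lr)
    else:
        raise ValueError(f"unknown optimizer {args.optimizer}")
    if args.sch == "step":
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[120, 150, 180], gamma=args.gamma)
    else:
        # Stand-in for the project's PolyLR: polynomial decay to zero over args.epochs.
        scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=lambda e: (1 - e / args.epochs) ** 0.9)
    return optimizer, scheduler, clip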
Example #2
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # set seeds
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('args = %s', args)

    # Get data loaders.
    traindir = os.path.join(args.data, 'train')
    validdir = os.path.join(args.data, 'val')

    # data augmentation
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4,
                               contrast=0.4,
                               saturation=0.4,
                               hue=0.2),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])

    train_data = dset.ImageFolder(traindir, transform=train_transform)
    valid_data = dset.ImageFolder(validdir, transform=val_transform)

    # dataset split
    valid_data, test_data = utils.dataset_split(valid_data, len(valid_data))

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=8,
                                              sampler=train_sampler)

    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=8)

    test_queue = torch.utils.data.DataLoader(test_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=8)

    # Create model and loss.
    torch.hub.set_dir('/tmp/hub_cache_%d' % args.local_rank)
    model = torch.hub.load('pytorch/vision:v0.4.2',
                           'resnet50',
                           pretrained=False)
    model = model.cuda()
    model = DDP(model, delay_allreduce=True)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    # Set up network weights optimizer.
    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == 'fromage':
        optimizer = Fromage(model.parameters(), args.learning_rate)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     args.learning_rate,
                                     weight_decay=args.weight_decay)
    else:
        raise NotImplementedError

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                gamma=0.1,
                                                step_size=30)

    # Train.
    global_step = 0
    best_acc_top1 = 0
    for epoch in range(args.epochs):
        # Shuffle the sampler, update lrs.
        train_queue.sampler.set_epoch(epoch + args.seed)

        # Training.
        train_acc_top1, train_acc_top5, train_obj, global_step = train(
            train_queue, model, criterion_smooth, optimizer, global_step)
        logging.info('epoch %d train_acc %f', epoch, train_acc_top1)
        writer.add_scalar('train/loss', train_obj, global_step)
        writer.add_scalar('train/acc_top1', train_acc_top1, global_step)
        writer.add_scalar('train/acc_top5', train_acc_top5, global_step)
        writer.add_scalar('train/lr',
                          optimizer.state_dict()['param_groups'][0]['lr'],
                          global_step)

        # Validation.
        valid_acc_top1, valid_acc_top5, valid_obj = infer(
            valid_queue, model, criterion)
        logging.info('valid_acc_top1 %f', valid_acc_top1)
        logging.info('valid_acc_top5 %f', valid_acc_top5)
        writer.add_scalar('val/acc_top1', valid_acc_top1, global_step)
        writer.add_scalar('val/acc_top5', valid_acc_top5, global_step)
        writer.add_scalar('val/loss', valid_obj, global_step)

        # Test
        test_acc_top1, test_acc_top5, test_obj = infer(test_queue, model,
                                                       criterion)
        logging.info('test_acc_top1 %f', test_acc_top1)
        logging.info('test_acc_top5 %f', test_acc_top5)
        writer.add_scalar('test/acc_top1', test_acc_top1, global_step)
        writer.add_scalar('test/acc_top5', test_acc_top5, global_step)
        writer.add_scalar('test/loss', test_obj, global_step)

        is_best = False
        if valid_acc_top1 > best_acc_top1:
            best_acc_top1 = valid_acc_top1
            is_best = True

        if args.local_rank == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, args.save)

        # Update LR.
        scheduler.step()

    writer.flush()
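
The train and infer helpers called in the loop above are not shown in this example. As a hedged illustration, an infer helper compatible with how its return values are used (top-1 accuracy, top-5 accuracy, average loss; the exact implementation is an assumption) could be sketched as:

import torch

def infer(queue, model, criterion):
    """Return (top-1 acc %, top-5 acc %, mean loss) over a data loader."""
    model.eval()
    total, correct1, correct5, loss_sum = 0, 0, 0, 0.0
    with torch.no_grad():
        for images, targets in queue:
            images, targets = images.cuda(), targets.cuda()
            logits = model(images)
            loss_sum += criterion(logits, targets).item() * targets.size(0)
            _, top5 = logits.topk(5, dim=1)
            correct1 += (top5[:, 0] == targets).sum().item()
            correct5 += (top5 == targets.unsqueeze(1)).any(dim=1).sum().item()
            total += targets.size(0)
    return 100.0 * correct1 / total, 100.0 * correct5 / total, loss_sum / total
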
Example #3
stored_loss = 100000000

# At any point you can hit Ctrl + C to break out of training early.
try:
    #optimizer = None
    # Ensure the optimizer is optimizing params, which include both the model's weights and the criterion's weights (i.e. the adaptive softmax).
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    weight_decay=args.wdecay)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params,
                                     lr=args.lr,
                                     weight_decay=args.wdecay)
    if args.optimizer == 'fromage':
        optimizer = Fromage(params, lr=args.lr)
    if args.optimizer == 'adamw':
        optimizer = AdamW(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'radam':
        optimizer = RAdam(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer.lower() == 'adabelief':
        optimizer = AdaBelief(params,
                              lr=args.lr,
                              weight_decay=args.wdecay,
                              eps=args.eps,
                              betas=(args.beta1, args.beta2))
    if args.optimizer == 'adabound':
        optimizer = AdaBound(params,
                             lr=args.lr,
                             weight_decay=args.wdecay,
                             final_lr=30,
Example #4
ntokens = len(corpus.dictionary)
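# Note: "model" here is still the imported model-definition module;
# the assignment below rebinds the name to the instantiated network.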
if args.model == 'Transformer':
    model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                                   args.nlayers, args.dropout).to(device)
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.NLLLoss()

if args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
if args.optim == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
if args.optim == 'fromage':
    optimizer = Fromage(model.parameters(), lr=args.lr, p_bound=args.p_bound)

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""

    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
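
The docstring above explains why hidden states must be detached between batches. A minimal illustration of how repackage_hidden is typically used in the training loop of such a language model (the loop itself is not part of this example, so get_batch, args.bptt and init_hidden are assumptions):

hidden = model.init_hidden(args.batch_size)        # RNN branch; assumes an init_hidden method
for i in range(0, train_data.size(0) - 1, args.bptt):
    data, targets = get_batch(train_data, i)       # hypothetical helper from the same script
    # Detach the hidden state so backprop does not reach through the whole corpus.
    hidden = repackage_hidden(hidden)
    model.zero_grad()
    output, hidden = model(data, hidden)
    loss = criterion(output, targets)
    loss.backward()
    optimizer.step()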

Example #5
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir="runs/" + args.output_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    if args.optim == "adam":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        print(f"\n using Adam with lr {args.learning_rate}\n")
    elif args.optim == "fromage":
        optimizer = Fromage(optimizer_grouped_parameters,
                            lr=args.learning_rate)
        print(f"\n using fromage with lr {args.learning_rate}\n")
    elif args.optim == "SGD":
        optimizer = torch.optim.SGD(optimizer_grouped_parameters,
                                    lr=args.learning_rate)
        print(f"\n using SGD with lr {args.learning_rate}\n")
    else:
        raise Exception("that optim is not implemented")
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel (not distributed) training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
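
The core gradient-accumulation pattern used in the loop above, shown in isolation (a sketch assuming an already-built dataloader, model, optimizer and scheduler; accum_steps stands in for args.gradient_accumulation_steps):

accum_steps = 4
model.zero_grad()
for step, batch in enumerate(dataloader):
    loss = model(**batch)[0]
    (loss / accum_steps).backward()   # scale so the accumulated gradient matches one large batch
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()              # one optimizer update per accum_steps micro-batches
        scheduler.step()
        model.zero_grad()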
Example #6
netG = Generator().to(device)
netD = Discriminator().to(device)

print("Generator:")
print(f"{sum(p.numel() for p in netG.parameters())} parameters")
print(f"{len(list(netG.parameters()))} tensors")

print("\nDiscriminator:")
print(f"{sum(p.numel() for p in netD.parameters())} parameters")
print(f"{len(list(netD.parameters()))} tensors")

if args.optim == 'sgd':
    optG = torch.optim.SGD(netG.parameters(), lr=args.initial_lr)
    optD = torch.optim.SGD(netD.parameters(), lr=args.initial_lr)
elif args.optim == 'fromage':
    optG = Fromage(netG.parameters(), lr=args.initial_lr)
    optD = Fromage(netD.parameters(), lr=args.initial_lr)
elif args.optim == 'lars':
    optG = Lars(netG.parameters(), lr=args.initial_lr)
    optD = Lars(netD.parameters(), lr=args.initial_lr)
elif args.optim == 'adam':
    optG = torch.optim.Adam(netG.parameters(),
                            lr=args.initial_lr,
                            betas=(0.0, 0.999),
                            eps=1e-08)
    optD = torch.optim.Adam(netD.parameters(),
                            lr=args.initial_lr,
                            betas=(0.0, 0.999),
                            eps=1e-08)
else:
    raise Exception("Unsupported optim")
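
Example #6 only constructs the two optimizers; a minimal sketch of one alternating GAN update with them (the non-saturating logistic loss, the real batch and the latent size 128 are assumptions, not part of the example):

import torch
import torch.nn.functional as F

z = torch.randn(real.size(0), 128, device=device)   # latent size is assumed

# Discriminator step: push D(real) up and D(fake) down.
optD.zero_grad()
d_loss = F.softplus(-netD(real)).mean() + F.softplus(netD(netG(z).detach())).mean()
d_loss.backward()
optD.step()

# Generator step: push D(G(z)) up.
optG.zero_grad()
g_loss = F.softplus(-netD(netG(z))).mean()
g_loss.backward()
optG.step()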