def train(self,
          model,
          data_loader,
          validation_loader,
          tb=None,
          epochs=20,
          log_interval=100,
          checkpoint_interval=100):
    """Train `model` with teacher forcing and a label-smoothed MLE loss.

    Args:
        model: seq2seq model called as
            ``model(input_ids=..., decoder_input_ids=...)``.
        data_loader: yields (source_ids, target_ids) tensor batches.
        validation_loader: currently unused (the validation call below is
            commented out); kept for interface compatibility.
        tb: optional tensorboard writer consumed by ``tb_mle_batch``.
        epochs: number of full passes over ``data_loader``.
        log_interval: batches between tensorboard logs; the interval
            counters are reset after every log.
        checkpoint_interval: batches between sample dumps and checkpoints.
    """
    optimizer = AdamW(model.parameters(), lr=6e-4)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=3000,
        num_training_steps=epochs * len(data_loader))
    for epoch in range(epochs):
        model.train()
        # Interval counters feed the periodic tb logging and are reset
        # there; epoch counters are kept separately so the end-of-epoch
        # stats are correct (the original divided by an interval counter
        # that could have just been reset to 0.0 -> ZeroDivisionError).
        interval_loss = interval_words = interval_correct = 0.0
        epoch_loss = epoch_words = epoch_correct = 0.0
        for batch_idx, batch in enumerate(
                tqdm(data_loader, mininterval=2, leave=False)):
            batch_xs, batch_ys = map(lambda x: x.to(self.device), batch)
            # Teacher forcing: decoder sees target[:-1], scored on target[1:].
            trg_ys = batch_ys[:, 1:]
            pred_logits = model(input_ids=batch_xs,
                                decoder_input_ids=batch_ys[:, :-1])
            pred_logits = pred_logits.contiguous().view(
                -1, pred_logits.size(2))

            loss, n_correct, n_total = self.compute_mle_loss(
                pred_logits, trg_ys, smoothing=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            interval_loss += loss.item()
            interval_words += n_total
            interval_correct += n_correct
            epoch_loss += loss.item()
            epoch_words += n_total
            epoch_correct += n_correct

            if tb is not None and batch_idx % log_interval == 0:
                tb_mle_batch(tb, interval_loss, interval_words,
                             interval_correct, epoch, batch_idx,
                             len(data_loader))
                interval_loss = interval_words = interval_correct = 0.0

            if batch_idx != 0 and batch_idx % checkpoint_interval == 0:
                # Decode a sample prediction/target pair to words for
                # eyeballing.  Reshape back to (batch, seq_len, vocab)
                # instead of the old hard-coded seq_len of 127 so any
                # sequence length works.
                pred_max = pred_logits.reshape(
                    batch_ys.size(0), -1, len(idx2word)).max(2)[1]
                pred = pd.DataFrame(pred_max.to('cpu').numpy())
                pred_words = np.where(pred.isin(idx2word.keys()),
                                      pred.replace(idx2word), UNKNOWN_WORD)
                trg_df = pd.DataFrame(batch_ys[:, 1:].to('cpu').numpy())
                trg_words = np.where(trg_df.isin(idx2word.keys()),
                                     trg_df.replace(idx2word),
                                     UNKNOWN_WORD)
                with open('output_tests.txt', 'a') as f:
                    # Fixed: missing newlines around the header lines and
                    # the "prediciton" typo in the original message.
                    f.write("On the iteration %d\n" % batch_idx)
                    f.write("The actual line:\n")
                    f.write(str(trg_words[0]))
                    f.write("\nThe prediction of the line:\n")
                    f.write(str(pred_words[0]))
                    f.write('\n\n\n\n\n')

                save_checkpoint(epoch,
                                model,
                                optimizer,
                                scheduler,
                                suffix=str(batch_idx))

        # Per-word stats over the whole epoch (computed but the
        # epoch-level logging below is still disabled).
        loss_per_word = epoch_loss / max(epoch_words, 1.0)
        accuracy = epoch_correct / max(epoch_words, 1.0)

        # if tb is not None:
        #     tb_mle_epoch(tb, loss_per_word, accuracy, epoch)
        # self.validate_BLEU(model, deepcopy(validation_loader), epoch, tb)


# for batch_idx, batch in enumerate(tqdm(validation_loader, mininterval=2, leave=False)):
# 	with torch.no_grad():
# 		batch_xs, batch_ys = map(lambda x: x.to(self.device), batch)
# 		trg_ys = pd.DataFrame(batch_ys[:, 1:].to('cpu').numpy())
# 		pred = model(batch_xs, batch_ys[:, :-1])
# 		pred_max = pred.to('cpu').max(2)[1]
# 		pred = pd.DataFrame(pred_max.numpy())
# 		pred_words = np.where(pred.isin(idx2word.keys()), pred.replace(idx2word), UNKNOWN_WORD)
# 		trg_words = np.where(trg_ys.isin(idx2word.keys()), trg_ys.replace(idx2word), UNKNOWN_WORD)
# 		print(pred_words[0])
# 		print(trg_words[0])
# 		break
def train_sim(epoch_num=10,
              optim_type='ACGD',
              startPoint=None,
              start_n=0,
              z_dim=128,
              batchsize=64,
              l2_penalty=0.0,
              momentum=0.0,
              log=False,
              loss_name='WGAN',
              model_name='dc',
              model_config=None,
              data_path='None',
              show_iter=100,
              logdir='test',
              dataname='CIFAR10',
              device='cpu',
              gpu_num=1):
    """Train a GAN with plain simultaneous RMSprop updates.

    Both optimizers step on the same discriminator-side loss each
    iteration.  Every `show_iter` iterations an image grid sampled from a
    fixed noise batch is saved and a checkpoint is written; scores are
    optionally streamed to wandb.
    """
    lr_d, lr_g = 1e-4, 1e-4
    loader = DataLoader(dataset=get_data(dataname=dataname, path=data_path),
                        batch_size=batchsize,
                        shuffle=True,
                        num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)

    optim_d = RMSprop(D.parameters(), lr=lr_d)
    optim_g = RMSprop(G.parameters(), lr=lr_g)

    if startPoint is not None:
        # Resume both networks and both optimizer states.
        state = torch.load(startPoint)
        D.load_state_dict(state['D'])
        G.load_state_dict(state['G'])
        optim_d.load_state_dict(state['d_optim'])
        optim_g.load_state_dict(state['g_optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        device_ids = list(range(gpu_num))
        D = nn.DataParallel(D, device_ids)
        G = nn.DataParallel(G, device_ids)

    # DCGAN-style generators expect NCHW noise; MLP ones take flat noise.
    conv_noise = 'DCGAN' in model_name
    if conv_noise:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)

    tick = time.time()
    count = 0
    for e in range(epoch_num):
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in loader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            n = d_real.shape[0]
            noise_shape = (n, z_dim, 1, 1) if conv_noise else (n, z_dim)
            z = torch.randn(noise_shape, device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            loss = get_loss(name=loss_name,
                            g_loss=False,
                            d_real=d_real,
                            d_fake=d_fake,
                            l2_weight=l2_penalty,
                            D=D)
            D.zero_grad()
            G.zero_grad()
            loss.backward()
            optim_d.step()
            # NOTE(review): G also *descends* the D-side loss here rather
            # than ascending it -- confirm this is the intended
            # "simultaneous" update rule.
            optim_g.step()

            if count % show_iter == 0:
                elapsed = time.time() - tick
                print('Iter :%d , Loss: %.5f, time: %.3fs' %
                      (count, loss.item(), elapsed))
                tick = time.time()
                with torch.no_grad():
                    samples = G(fixed_noise).detach()
                    fig_dir = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(fig_dir):
                        os.makedirs(fig_dir)
                    vutils.save_image(samples,
                                      fig_dir + 'iter_%d.png' %
                                      (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s%.3f_%d.pth' %
                                (optim_type, model_name, lr_g,
                                 count + start_n),
                                D=D,
                                G=G,
                                optimizer=optim_d,
                                g_optimizer=optim_g)
            if wandb and log:
                wandb.log({
                    'Real score': d_real.mean().item(),
                    'Fake score': d_fake.mean().item(),
                    'Loss': loss.item()
                })
            count += 1
# Esempio n. 3
def train(model,
          optimizer,
          train_loader,
          valid_loader,
          save_path,
          criterion,
          num_epochs=50,
          eval_every=50,
          best_valid_loss=float("Inf"),
          model_name="model"):
    """Binary-classification training loop with periodic validation.

    Every `eval_every` optimizer steps the model is evaluated on the full
    `valid_loader`; whenever validation loss improves, a checkpoint and the
    loss curves are saved.  Metrics are saved once more at the end.
    """
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list, valid_loss_list, global_steps_list = [], [], []

    print("Start training for", num_epochs, "epochs...")
    model.float()
    model.train()
    total_steps = num_epochs * len(train_loader)
    for epoch in range(num_epochs):
        print("Epoch", epoch + 1, "of", num_epochs)
        for train_batch in train_loader:
            labels = train_batch['binary_label'].unsqueeze(1).to(device)
            content = torch.stack(train_batch['content'], dim=1).to(device)
            output = model(content).unsqueeze(1).to(device)

            loss = criterion(output, labels.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            global_step += 1

            # Guard clause: only evaluate every `eval_every` steps.
            if global_step % eval_every != 0:
                continue

            model.eval()
            with torch.no_grad():
                for val_batch in valid_loader:
                    labels = val_batch['binary_label'].unsqueeze(1).to(
                        device)
                    content = torch.stack(val_batch['content'],
                                          dim=1).to(device)
                    output = model(content).unsqueeze(1).to(device)
                    valid_running_loss += criterion(
                        output, labels.float()).item()

            average_train_loss = running_loss / eval_every
            average_valid_loss = valid_running_loss / len(valid_loader)
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # Reset the running accumulators and resume training mode.
            running_loss = 0.0
            valid_running_loss = 0.0
            model.train()

            print(
                'Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                .format(epoch + 1, num_epochs, global_step, total_steps,
                        average_train_loss, average_valid_loss))

            # Checkpoint only on improvement.
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                save_checkpoint(save_path + model_name + '.pt', model,
                                optimizer, best_valid_loss, log_file)
                save_metrics(save_path + model_name + '_metrics.pt',
                             train_loss_list, valid_loss_list,
                             global_steps_list, log_file)

    save_metrics(save_path + model_name + '_metrics.pt', train_loss_list,
                 valid_loss_list, global_steps_list, log_file)
    print('Finished Training!')
            if epoch % 10 == 0:
                checkpoint = {
                    'epoch': epoch,
                    'batch_size': batch_size,
                    'ME_state': me.state_dict(),
                    'CE_state': ce.state_dict(),
                    'Pred_state': predictor.state_dict(),
                    'CE_hidden_size': grid[iter]['hce_hidden'],
                    'Classifier_hidden_size': grid[iter]['pred_hidden'],
                    'dropout': grid[iter]['dropout'],
                    'optimME_state': optimME.state_dict(),
                    'optimCE_state': optimCE.state_dict(),
                    'optimPred_state': optimPred.state_dict()
                }

                save_checkpoint(checkpoint)

            # Check for early stopping
            if val_pred_loss < min_val_loss:
                epoch_no_improv = 0
                min_val_loss = val_pred_loss
            else:
                epoch_no_improv += 1

            if epoch_no_improv == es_patience:
                print('=> Early stopping!')
                break
            else:
                continue

        # Save trained model
def train_cgd(epoch_num=10,
              optim_type='ACGD',
              startPoint=None,
              start_n=0,
              z_dim=128,
              batchsize=64,
              tols=None,
              l2_penalty=0.0,
              momentum=0.0,
              loss_name='WGAN',
              model_name='dc',
              model_config=None,
              data_path='None',
              show_iter=100,
              logdir='test',
              dataname='CIFAR10',
              device='cpu',
              gpu_num=1,
              ada_train=True,
              log=False,
              collect_info=False,
              args=None):
    """Train a GAN with a competitive-gradient-descent style optimizer.

    G is the max player and D the min player of a single loss.  With
    `ada_train`, D's learning rate is rescaled from a moving average of
    discriminator accuracy over the last 10 batches.  Samples and a
    checkpoint are saved every `show_iter` iterations (skipping iter 0).

    Args:
        optim_type: one of 'BCGD', 'ICR', 'ACGD'.
        tols: CG solver tolerances; defaults to
            ``{'tol': 1e-10, 'atol': 1e-16}``.
        args: dict with required keys 'lr_d' and 'lr_g'.
    """
    if tols is None:
        # Fresh dict per call -- the old dict-literal default was a shared
        # mutable default argument.
        tols = {'tol': 1e-10, 'atol': 1e-16}
    lr_d = args['lr_d']
    lr_g = args['lr_g']
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    if optim_type == 'BCGD':
        optimizer = BCGD(max_params=G.parameters(),
                         min_params=D.parameters(),
                         lr_max=lr_g,
                         lr_min=lr_d,
                         momentum=momentum,
                         tol=tols['tol'],
                         atol=tols['atol'],
                         device=device)
        # scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    elif optim_type == 'ICR':
        optimizer = ICR(max_params=G.parameters(),
                        min_params=D.parameters(),
                        lr=lr_d,
                        alpha=1.0,
                        device=device)
        # scheduler = icrScheduler(optimizer, milestone)
    elif optim_type == 'ACGD':
        optimizer = ACGD(max_params=G.parameters(),
                         min_params=D.parameters(),
                         lr_max=lr_g,
                         lr_min=lr_d,
                         tol=tols['tol'],
                         atol=tols['atol'],
                         device=device,
                         solver='cg')
        # scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    else:
        # Previously an unknown type left `optimizer` unbound and crashed
        # later with a confusing NameError.
        raise ValueError('Unknown optim_type: %s' % optim_type)
    if startPoint is not None:
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        # optimizer.load_state_dict(chk['optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0
    if 'DCGAN' in model_name:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)

    # Ring buffer of D's accuracy over the last `mod` batches, seeded at 0.8.
    mod = 10
    accs = torch.tensor([0.8 for _ in range(mod)])

    for e in range(epoch_num):
        # scheduler.step(epoch=e)
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            if 'DCGAN' in model_name:
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            loss = get_loss(name=loss_name,
                            g_loss=False,
                            d_real=d_real,
                            d_fake=d_fake,
                            l2_weight=l2_penalty,
                            D=D)
            optimizer.zero_grad()
            # CGD-style optimizers take the loss and update both players.
            optimizer.step(loss)

            # D accuracy: real scored positive, fake scored negative.
            num_correct = torch.sum(d_real > 0) + torch.sum(d_fake < 0)
            acc = num_correct.item() / (d_real.shape[0] + d_fake.shape[0])
            accs[count % mod] = acc
            acc_indicator = sum(accs) / mod
            # Shrink D's lr when its moving accuracy leaves the mid band.
            # NOTE(review): both the >0.9 and <0.80 branches *reduce* the
            # lr (0.05 / 0.1) while the mid band keeps it -- confirm the
            # low-accuracy branch is intended.
            if acc_indicator > 0.9:
                ada_ratio = 0.05
            elif acc_indicator < 0.80:
                ada_ratio = 0.1
            else:
                ada_ratio = 1.0
            if ada_train:
                optimizer.set_lr(lr_max=lr_g, lr_min=ada_ratio * lr_d)

            if count % show_iter == 0 and count != 0:
                time_cost = time.time() - timer
                print('Iter :%d , Loss: %.5f, time: %.3fs' %
                      (count, loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s_%d.pth' %
                                (optim_type, model_name, count + start_n),
                                D=D,
                                G=G,
                                optimizer=optimizer)
            if wandb and log:
                wandb.log(
                    {
                        'Real score': d_real.mean().item(),
                        'Fake score': d_fake.mean().item(),
                        'Loss': loss.item(),
                        'Acc_indicator': acc_indicator,
                        'Ada ratio': ada_ratio
                    },
                    step=count,
                )

            if collect_info and wandb:
                # Conjugate-gradient solver diagnostics from the optimizer.
                cgd_info = optimizer.get_info()
                wandb.log(
                    {
                        'CG iter num': cgd_info['iter_num'],
                        'CG runtime': cgd_info['time'],
                        'D gradient': cgd_info['grad_y'],
                        'G gradient': cgd_info['grad_x'],
                        'D hvp': cgd_info['hvp_y'],
                        'G hvp': cgd_info['hvp_x'],
                        'D cg': cgd_info['cg_y'],
                        'G cg': cgd_info['cg_x']
                    },
                    step=count)
            count += 1
# Esempio n. 6
    else:
        cycle = int((epoch - args.epochs) // args.cycle_interval + 2)
        print('In %d -th cycle' % cycle)

    # do the fastSWA updates
    if args.fastswa_frequencies is not None:
        for fastswa_freq, fastswa_net, fastswa_opt in zip(
                fastswa_freqs,
                fastswa_nets,
                fastswa_optims,
        ):
            if epoch >= (args.epochs - args.cycle_interval) and (
                    epoch - args.epochs +
                    args.cycle_interval) % fastswa_freq == 0:
                save_checkpoint(epoch, model, ema_model, swa_model,
                                fastswa_nets[0], accuracy, args,
                                path_checkpoint)
                print("Evaluate fast-swa-{} at epoch {}".format(
                    fastswa_freq, epoch))
                fastswa_opt.update(model)
                update_batchnorm(fastswa_net, trainloader)
                fastswa_acc = fastswa.test(testloader)
                accuracy['test_fastswa_acc'].append(fastswa_acc)
            else:
                accuracy['test_fastswa_acc'].append(None)

    # swa update
    if ((epoch >= args.epochs)) and ((epoch - args.epochs) %
                                     args.cycle_interval) == 0:
        swa_model_optim.update(model)
        print("SWA Model Updated!")
# Esempio n. 7
def main():
    """Build the classifier/decoder pair, their optimizers and ImageNet-style
    data loaders, then run the train/eval loop for `args.epochs`,
    checkpointing and appending stats to the log file each epoch."""
    global args

    ## create models and optimizers
    print("=> creating models...")
    classifier = archs.resnet50shared(pretrained=True).to(device)
    decoder = archs.decoder(final_upsample_mode=args.upsample).to(device)

    optimizer = {
        'classifier': torch.optim.SGD(classifier.parameters(),
                                      args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay),
        'decoder': torch.optim.Adam(decoder.parameters(),
                                    args.lr_casme,
                                    weight_decay=args.weight_decay),
    }

    cudnn.benchmark = True

    ## data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])
    train_set = datasets.ImageFolder(os.path.join(args.data, 'train'),
                                     train_transform)
    val_set = datasets.ImageFolder(os.path.join(args.data, 'val'),
                                   val_transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               sampler=None)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=False)

    ## training loop
    for epoch in range(args.epochs):
        epoch_start_time = time.time()

        adjust_learning_rate(optimizer, epoch, args)

        ## train for one epoch, then evaluate on the validation set
        tr_s = train_or_eval(train_loader, classifier, decoder, True,
                             optimizer, epoch)
        val_s = train_or_eval(val_loader, classifier, decoder)

        ## save checkpoint
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict_classifier': classifier.state_dict(),
                'state_dict_decoder': decoder.state_dict(),
                'optimizer_classifier': optimizer['classifier'].state_dict(),
                'optimizer_decoder': optimizer['decoder'].state_dict(),
                'args': args,
            }, args)

        ## append one space-separated stats line (values arrive as strings)
        stats = [
            str(epoch + 1),
            str(time.time() - epoch_start_time),
            tr_s['acc'], val_s['acc'],
            tr_s['acc_m'], val_s['acc_m'],
            tr_s['avg_mask'], val_s['avg_mask'],
            tr_s['std_mask'], val_s['std_mask'],
            tr_s['entropy'], val_s['entropy'],
            tr_s['tv'], val_s['tv'],
        ]
        with open(args.log_path, 'a') as f:
            f.write(' '.join(stats) + '\n')
# Esempio n. 8
import argparse
import train_utils

# Command-line interface: all flags are optional and fall back to the
# defaults below (flowers dataset, vgg16 backbone, gpu mode).
parser = argparse.ArgumentParser(
    description='This script helps in training the model',
)
parser.add_argument('--data_directory', dest='data_directory',
                    action='store', default='./flowers')
parser.add_argument('--model_name', dest='model_name',
                    action='store', default='vgg16')
parser.add_argument('--save_dir', dest='save_dir',
                    action='store', default='checkpoint.pth')
parser.add_argument('--learning_rate', dest='learning_rate',
                    action='store', type=float, default=0.001)
parser.add_argument('--hidden_input', dest='hidden_input',
                    action='store', type=int, default=1024)
parser.add_argument('--epochs', dest='epochs',
                    action='store', type=int, default=5)
parser.add_argument('--gpu', dest="mode", action="store", default="gpu")

args = parser.parse_args()

# Build the data loaders for training/validation/testing.
train_data, train_dataloader, test_dataloader, validate_dataloader = (
    train_utils.load_data(args.data_directory))

# Assemble model, optimizer and loss from the CLI choices.
model, optimizer, criterion = train_utils.create_model(
    args.model_name, args.hidden_input, args.learning_rate, args.mode)

# Train, then persist the result as a checkpoint.
train_utils.train_model(model, optimizer, criterion, train_dataloader,
                        validate_dataloader, args.epochs, args.mode)

train_utils.save_checkpoint(model, args, optimizer, train_data)
# Esempio n. 9
def train_d(epoch_num=10,
            logdir='test',
            optim='SGD',
            loss_name='JSD',
            show_iter=500,
            model_weight=None,
            load_d=False,
            load_g=False,
            compare_path=None,
            info_time=100,
            run_select=None,
            device='cpu'):
    """Train only the discriminator on MNIST against a fixed generator.

    With ``optim='SGD'`` D is updated by plain SGD; otherwise a BCGD2
    optimizer with ``update_max=False`` is used, so G's parameters are
    left untouched either way (checkpoints are named ``FixG-...``).
    Optionally tracks the L2 distance of D's weights and predictions from
    a reference checkpoint (``compare_path`` / ``run_select``) and logs
    losses, gradients and outputs to tensorboard.
    """
    lr_d = 0.001
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('discriminator lr: %.3f' % lr_d)
    dataset = get_data(dataname='MNIST', path='../datas/mnist')
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)
    # Optionally warm-start D and/or G from a saved checkpoint.
    if model_weight is not None:
        chk = torch.load(model_weight)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)
    # Flatten a reference discriminator's weights into one vector so the
    # parameter-space distance can be tracked during training.
    if compare_path is not None:
        discriminator = dc_D().to(device)
        model_weight = torch.load(compare_path)
        discriminator.load_state_dict(model_weight['D'])
        model_vec = torch.cat(
            [p.contiguous().view(-1) for p in discriminator.parameters()])
        print('Load discriminator from %s' % compare_path)
    # Fixed real/fake batches plus the reference model's outputs on them,
    # used to measure prediction drift.
    if run_select is not None:
        fixed_data = torch.load(run_select)
        real_set = fixed_data['real_set']
        fake_set = fixed_data['fake_set']
        real_d = fixed_data['real_d']
        fake_d = fixed_data['fake_d']
        fixed_vec = fixed_data['pred_vec']
        print('load fixed data set')
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' %
                           (logdir, current_time, lr_d))
    if optim == 'SGD':
        d_optimizer = SGD(D.parameters(), lr=lr_d)
        print('Optimizer SGD')
    else:
        # update_max=False: the max player (G) is not updated.
        d_optimizer = BCGD2(max_params=G.parameters(),
                            min_params=D.parameters(),
                            lr_max=lr_g,
                            lr_min=lr_d,
                            update_max=False,
                            device=device,
                            collect_info=True)
        print('Optimizer BCGD2')
    timer = time.time()
    count = 0
    d_losses = []   # currently accumulated nowhere; kept for compatibility
    g_losses = []
    for e in range(epoch_num):
        tol_correct = 0   # per-epoch running totals (logged nowhere yet)
        tol_dloss = 0
        tol_gloss = 0
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((real_x.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            # Both losses are computed for logging; only D_loss drives updates.
            D_loss = get_loss(name=loss_name,
                              g_loss=False,
                              d_real=d_real,
                              d_fake=d_fake)
            tol_dloss += D_loss.item() * real_x.shape[0]
            G_loss = get_loss(name=loss_name,
                              g_loss=True,
                              d_real=d_real,
                              d_fake=d_fake)
            tol_gloss += G_loss.item() * fake_x.shape[0]
            # Periodically log distance from the reference checkpoint.
            if compare_path is not None and count % info_time == 0:
                diff = get_diff(net=D, model_vec=model_vec)
                writer.add_scalar('Distance from checkpoint',
                                  diff.item(),
                                  global_step=count)
                if run_select is not None:
                    with torch.no_grad():
                        d_real_set = D(real_set)
                        d_fake_set = D(fake_set)
                        diff_real = torch.norm(d_real_set - real_d, p=2)
                        diff_fake = torch.norm(d_fake_set - fake_d, p=2)
                        d_vec = torch.cat([d_real_set, d_fake_set])
                        # in-place sub_ mutates d_vec, which is local here
                        diff = torch.norm(d_vec.sub_(fixed_vec), p=2)
                        writer.add_scalars('L2 norm of pred difference', {
                            'Total': diff.item(),
                            'real set': diff_real.item(),
                            'fake set': diff_fake.item()
                        },
                                           global_step=count)
            d_optimizer.zero_grad()
            if optim == 'SGD':
                D_loss.backward()
                d_optimizer.step()
                # Gradient-norm probes for both players.
                # NOTE(review): zero_grad() above clears only D's grads;
                # G's grads from backward() accumulate across iterations,
                # so `gg` is a running-accumulated norm — confirm intended.
                gd = torch.norm(torch.cat(
                    [p.grad.contiguous().view(-1) for p in D.parameters()]),
                                p=2)
                gg = torch.norm(torch.cat(
                    [p.grad.contiguous().view(-1) for p in G.parameters()]),
                                p=2)
            else:
                d_optimizer.step(D_loss)
                cgdInfo = d_optimizer.get_info()
                gd = cgdInfo['grad_y']
                gg = cgdInfo['grad_x']
                writer.add_scalars('Grad', {'update': cgdInfo['update']},
                                   global_step=count)
            # D accuracy proxy: real scored positive, fake scored negative.
            tol_correct += (d_real > 0).sum().item() + (d_fake <
                                                        0).sum().item()
            writer.add_scalars('Loss', {
                'D_loss': D_loss.item(),
                'G_loss': G_loss.item()
            },
                               global_step=count)
            writer.add_scalars('Grad', {
                'D grad': gd,
                'G grad': gg
            },
                               global_step=count)
            writer.add_scalars('Discriminator output', {
                'Generated image': d_fake.mean().item(),
                'Real image': d_real.mean().item()
            },
                               global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs' %
                      (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                save_checkpoint(path=logdir,
                                name='FixG-%.3f_%d.pth' % (lr_d, count),
                                D=D,
                                G=G)
            count += 1
    writer.close()
# Esempio n. 10
def run_lstm(learning_rate, batch_size, cuda, num_inputs, num_outputs,
             num_hidden, checkpoint_interval, total_batches, model_file):
    """
    Train an LSTM baseline on the copy task.

    :param learning_rate: RMSprop learning rate.
    :param batch_size: number of sequences per batch.
    :param cuda: move the model and data to the GPU when True.
    :param num_inputs: input vector dimensionality.
    :param num_outputs: output vector dimensionality (kept for checkpoint
        compatibility; the LSTM itself is built from num_inputs/num_hidden).
    :param num_hidden: LSTM hidden size.
    :param checkpoint_interval: save and evaluate every this many training
        examples; 0 disables checkpointing.
    :param total_batches: stop after this many checkpoint intervals.
    :param model_file: path to a saved checkpoint, or the string 'None' to
        train from scratch.
    """
    # Seeding for reproducible weight init and data generation
    SEED = 1000
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Model Loading
    if model_file == 'None':
        lstm = LSTM(num_inputs, num_hidden)
        # Constants for keeping track
        total_examples = 0
        losses = []
        costs = []
        seq_lens = []
    else:
        from_before = torch.load(model_file)
        state_dict = from_before['state_dict']
        num_inputs = from_before['num_inputs']
        num_outputs = from_before['num_outputs']
        batch_size = from_before['batch_size']
        cuda = from_before['cuda']
        lstm = LSTM(num_inputs, num_hidden)
        # BUGFIX: the resume branch never restored the saved weights or the
        # bookkeeping lists, so training restarted from random weights and
        # the loop below crashed with NameError on total_examples / losses /
        # costs / seq_lens.  Restore them the same way the NTM `run` loader
        # does (same save_checkpoint producer).
        lstm.load_state_dict(state_dict)
        losses = from_before['loss']
        costs = from_before['cost']
        seq_lens = from_before['seq_lengths']
        total_examples = from_before['total_examples']
    if cuda:
        # Hoisted out of the fresh-model branch so a resumed model also
        # lands on the GPU when the checkpoint says cuda=True.
        lstm.cuda()

    # Dataset creation
    training_dataset = random_binary(max_seq_length=20,
                                     num_sequences=200,
                                     vector_dim=8,
                                     batch_Size=batch_size)
    testing_dataset = random_binary(max_seq_length=10,
                                    num_sequences=50,
                                    vector_dim=8,
                                    batch_Size=batch_size)

    # Optimizer type and loss function
    optimizer = torch.optim.RMSprop(lstm.parameters(),
                                    lr=learning_rate,
                                    momentum=0.9)
    criterion = torch.nn.BCELoss()

    np.random.seed(
        SEED
    )  # reset training seed to ensure that batches remain the same between runs!
    for batch in training_dataset:
        lstm.init_hidden(batch_size)
        batch = Variable(batch)
        if cuda:
            batch = batch.cuda()
        optimizer.zero_grad()
        output = Variable(torch.zeros(batch.size()))
        if cuda:
            output = output.cuda()
        # Read phase: feed the whole sequence in.  The per-step outputs are
        # discarded here -- the original wrote them into `output` only to
        # overwrite every column in the response phase below, so the final
        # result is unchanged.
        for i in range(batch.size()[2]):
            lstm.forward(batch[:, :, i])

        # Output response: query with zero vectors and collect predictions
        x = Variable(torch.zeros(batch.size()[0:2]))
        if cuda:
            x = x.cuda()
        for i in range(batch.size()[2]):
            output[:, :, i] = lstm.forward(x)

        loss = criterion(output, batch)
        loss.backward()
        optimizer.step()

        # BUGFIX: loss.data[0] only works on pre-0.4 PyTorch (it raises
        # IndexError on a 0-dim tensor today); .item() is the supported
        # scalar accessor.
        print("Current Batch Loss:", round(loss.item(), 3))
        total_examples += batch_size

        # The cost is the number of error bits per sequence
        binary_output = output.clone().data
        binary_output = binary_output > 0.5
        cost = torch.sum(torch.abs(binary_output.float() - batch.data))

        losses += [loss.item()]
        costs += [cost / batch_size]
        seq_lens += [batch.size(2)]

        # Checkpoint model
        if (checkpoint_interval != 0) and (total_examples % checkpoint_interval
                                           == 0):
            print("Saving checkpoint!")
            save_checkpoint(lstm, total_examples / batch_size, losses, costs,
                            seq_lens, total_examples, None, num_inputs,
                            num_outputs, None, None, None, None, None,
                            batch_size, cuda, num_hidden, 'LSTM')

            # Evaluate model on this saved checkpoint
            test_cost, prediction, input = evaluate_lstm_baseline(
                model=lstm,
                testset=testing_dataset,
                batch_size=batch_size,
                cuda=cuda)
            print("Total Test Cost (in bits per sequence):", test_cost)
            print("Example of Input/Output")
            print("prediction:", prediction[0])
            print("Input:", input[0])
        # BUGFIX: guard the division so checkpoint_interval == 0 (explicitly
        # allowed by the guard above) no longer raises ZeroDivisionError;
        # with checkpointing disabled we simply run through the dataset.
        if checkpoint_interval != 0 and \
                total_examples / checkpoint_interval >= total_batches:
            break
Esempio n. 11
0
def train_cgd(epoch_num=10,
              milestone=None,
              optim_type='ACGD',
              startPoint=None,
              start_n=0,
              z_dim=128,
              batchsize=64,
              tols={
                  'tol': 1e-10,
                  'atol': 1e-16
              },
              l2_penalty=0.0,
              momentum=0.0,
              loss_name='WGAN',
              model_name='dc',
              model_config=None,
              data_path='None',
              show_iter=100,
              logdir='test',
              dataname='CIFAR10',
              device='cpu',
              gpu_num=1,
              collect_info=False):
    """Train a GAN discriminator/generator pair with a competitive-gradient
    optimizer (BCGD, ICR or ACGD), logging losses and discriminator outputs
    to TensorBoard and periodically saving sample images and checkpoints.

    :param epoch_num: number of passes over the dataset.
    :param milestone: LR-schedule milestones passed to the scheduler.
    :param optim_type: one of 'BCGD', 'ICR', 'ACGD'.
    :param startPoint: optional checkpoint path to resume D, G and the
        optimizer state from.
    :param start_n: iteration offset added to saved image/checkpoint names.
    :param z_dim: latent noise dimensionality.
    :param batchsize: minibatch size for the DataLoader.
    :param tols: {'tol', 'atol'} solver tolerances for BCGD/ACGD.
    :param l2_penalty: L2 weight penalty folded into the loss.
    :param momentum: momentum for BCGD.
    :param loss_name: loss family passed to get_loss (e.g. 'WGAN').
    :param model_name: model architecture key for get_model.
    :param model_config: extra architecture configuration dict.
    :param data_path: dataset location passed to get_data.
    :param show_iter: print/snapshot every this many iterations.
    :param logdir: log/checkpoint directory suffix.
    :param dataname: dataset name for get_data (e.g. 'CIFAR10').
    :param device: torch device string.
    :param gpu_num: wrap D and G in DataParallel when > 1.
    :param collect_info: also log the optimizer's internal diagnostics.
    :return: None

    NOTE(review): `tols` is a mutable default argument shared across calls;
    it is only read here so it is harmless, but worth knowing.
    NOTE(review): an `optim_type` outside {'BCGD', 'ICR', 'ACGD'} leaves
    `optimizer` and `scheduler` unbound and raises NameError below.
    """
    # Learning rates are fixed here rather than exposed as parameters.
    lr_d = 0.01
    lr_g = 0.01
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    # Apply the custom weight initializers before moving to the device.
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    # One TensorBoard run per invocation, stamped with time and lr_d.
    writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' %
                           (logdir, current_time, lr_d))
    # Select the competitive optimizer; G maximizes, D minimizes.
    if optim_type == 'BCGD':
        optimizer = BCGD(max_params=G.parameters(),
                         min_params=D.parameters(),
                         lr_max=lr_g,
                         lr_min=lr_d,
                         momentum=momentum,
                         tol=tols['tol'],
                         atol=tols['atol'],
                         device=device)
        scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    elif optim_type == 'ICR':
        optimizer = ICR(max_params=G.parameters(),
                        min_params=D.parameters(),
                        lr=lr_d,
                        alpha=1.0,
                        device=device)
        scheduler = icrScheduler(optimizer, milestone)
    elif optim_type == 'ACGD':
        optimizer = ACGD(max_params=G.parameters(),
                         min_params=D.parameters(),
                         lr_max=lr_g,
                         lr_min=lr_d,
                         tol=tols['tol'],
                         atol=tols['atol'],
                         device=device,
                         solver='cg')
        scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    if startPoint is not None:
        # Resume models and optimizer state from a saved checkpoint.
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        optimizer.load_state_dict(chk['optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0
    # DCGAN variants expect 4-D noise (N, z, 1, 1); others take 2-D noise.
    if 'DCGAN' in model_name:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        # Epoch-indexed LR schedule step.
        scheduler.step(epoch=e)
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            # Match the noise batch size to the actual (possibly partial)
            # real batch.
            if 'DCGAN' in model_name:
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            loss = get_loss(name=loss_name,
                            g_loss=False,
                            d_real=d_real,
                            d_fake=d_fake,
                            l2_weight=l2_penalty,
                            D=D)
            # Competitive optimizers take the loss directly and update both
            # players in one step.
            optimizer.zero_grad()
            optimizer.step(loss)

            if count % show_iter == 0:
                # Periodic console report, sample grid, and checkpoint.
                time_cost = time.time() - timer
                print('Iter :%d , Loss: %.5f, time: %.3fs' %
                      (count, loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(
                    path=logdir,
                    name='%s-%s%.3f_%d.pth' %
                    (optim_type, model_name, lr_g, count + start_n),
                    D=D,
                    G=G,
                    optimizer=optimizer)
            writer.add_scalars('Discriminator output', {
                'Generated image': d_fake.mean().item(),
                'Real image': d_real.mean().item()
            },
                               global_step=count)
            writer.add_scalar('Loss', loss.item(), global_step=count)
            if collect_info:
                # Optimizer-internal diagnostics (CG iterations, timings,
                # gradient/hvp/cg magnitudes for both players).
                cgd_info = optimizer.get_info()
                writer.add_scalar('Conjugate Gradient/iter num',
                                  cgd_info['iter_num'],
                                  global_step=count)
                writer.add_scalar('Conjugate Gradient/running time',
                                  cgd_info['time'],
                                  global_step=count)
                writer.add_scalars('Delta', {
                    'D gradient': cgd_info['grad_y'],
                    'G gradient': cgd_info['grad_x'],
                    'D hvp': cgd_info['hvp_y'],
                    'G hvp': cgd_info['hvp_x'],
                    'D cg': cgd_info['cg_y'],
                    'G cg': cgd_info['cg_x']
                },
                                   global_step=count)
            count += 1
    writer.close()
Esempio n. 12
0
def run(learning_rate, batch_size, cuda, memory_feature_size, num_inputs,
        num_outputs, controller_size, controller_type, controller_layers,
        memory_size, integer_shift, checkpoint_interval, total_batches,
        model_file):
    """
    Train a Neural Turing Machine on the copy task, periodically
    checkpointing and evaluating on a held-out set.

    :param learning_rate: RMSprop learning rate.
    :param batch_size: sequences per batch.
    :param cuda: run on the GPU when True.
    :param memory_feature_size: width of each memory slot.
    :param num_inputs: input vector dimensionality.
    :param num_outputs: output vector dimensionality.
    :param controller_size: hidden size of the controller.
    :param controller_type: 'LSTM' or 'MLP'.
    :param controller_layers: number of controller layers.
    :param memory_size: number of memory slots.
    :param integer_shift: allowed head shift range.
    :param checkpoint_interval: save/evaluate every this many examples;
        0 disables checkpointing.
    :param total_batches: stop after this many checkpoint intervals.
    :param model_file: checkpoint path, or the string 'None' to train
        from scratch.
    """
    # Seeding for reproducible weight init and data generation
    SEED = 1000
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Model Loading
    if model_file == 'None':
        ntm = NTM(num_inputs=num_inputs,
                  num_outputs=num_outputs,
                  controller_size=controller_size,
                  controller_type=controller_type,
                  controller_layers=controller_layers,
                  memory_size=memory_size,
                  memory_feature_size=memory_feature_size,
                  integer_shift=integer_shift,
                  batch_size=batch_size,
                  use_cuda=cuda)
        # Constants for keeping track
        total_examples = 0
        losses = []
        costs = []
        seq_lens = []
    else:
        # Resume: hyperparameters stored in the checkpoint override the
        # ones passed in, so the rebuilt NTM matches the saved weights.
        from_before = torch.load(model_file)
        state_dict = from_before['state_dict']
        controller_type = from_before['controller_type']
        num_inputs = from_before['num_inputs']
        num_outputs = from_before['num_outputs']
        controller_size = from_before['controller_size']
        controller_layers = from_before['controller_layers']
        memory_size = from_before['memory_size']
        memory_feature_size = from_before['memory_feature_size']
        integer_shift = from_before['integer_shift']
        batch_size = from_before['batch_size']
        cuda = from_before['cuda']
        saved_biases = True
        ntm = NTM(num_inputs=num_inputs,
                  num_outputs=num_outputs,
                  controller_size=controller_size,
                  controller_type=controller_type,
                  controller_layers=controller_layers,
                  memory_size=memory_size,
                  memory_feature_size=memory_feature_size,
                  integer_shift=integer_shift,
                  batch_size=batch_size,
                  use_cuda=cuda,
                  saved_biases=saved_biases)
        ntm.load_state_dict(state_dict)
        losses = from_before['loss']
        costs = from_before['cost']
        seq_lens = from_before['seq_lengths']
        total_examples = from_before['total_examples']

    # Dataset creation
    training_dataset = random_binary(max_seq_length=20,
                                     num_sequences=500,
                                     vector_dim=8,
                                     batch_Size=batch_size)
    testing_dataset = random_binary(max_seq_length=10,
                                    num_sequences=50,
                                    vector_dim=8,
                                    batch_Size=batch_size)

    # Optimizer type and loss function
    optimizer = torch.optim.RMSprop(ntm.parameters(),
                                    lr=learning_rate,
                                    momentum=0.9,
                                    alpha=0.95)
    criterion = torch.nn.BCELoss()

    np.random.seed(
        SEED
    )  # reset training seed to ensure that batches remain the same between runs!
    for batch in training_dataset:

        optimizer.zero_grad()
        # Initialize head weights and memory to zero
        ntm.init_headweights()
        ntm.init_memory()

        batch = Variable(batch)
        if cuda:
            batch = batch.cuda()
        next_r = ntm.read_head.create_state(batch_size)
        if controller_type == 'LSTM':
            lstm_h, lstm_c = ntm.controller.create_state(batch_size)

        #  Read batch in: feed the sequence through, keeping only the
        #  recurrent state (outputs are discarded during the read phase).
        for i in range(batch.size()[2]):
            x = batch[:, :, i]
            if controller_type == 'LSTM':
                _, next_r, lstm_h, lstm_c = ntm.forward(x=x,
                                                        r=next_r,
                                                        lstm_h=lstm_h,
                                                        lstm_c=lstm_c)
            elif controller_type == 'MLP':
                _, next_r = ntm.forward(x=x, r=next_r)

        # Output response: query with zero vectors; the target excludes the
        # final (delimiter) column, hence the [:, :, :-1] slices.
        x = Variable(torch.zeros(batch.size()[0:2]))
        output = Variable(torch.zeros(batch[:, :, :-1].size()))
        if cuda:
            x = x.cuda()
            output = output.cuda()

        for i in range(output.size()[2]):
            if controller_type == 'LSTM':
                output[:, :,
                       i], next_r, lstm_h, lstm_c = ntm.forward(x=x,
                                                                r=next_r,
                                                                lstm_h=lstm_h,
                                                                lstm_c=lstm_c)
            elif controller_type == 'MLP':
                output[:, :, i], next_r = ntm.forward(x=x, r=next_r)

        loss = criterion(output, batch[:, :, :-1])
        loss.backward(retain_graph=True)
        optimizer.step()

        # BUGFIX: loss.data[0] only works on pre-0.4 PyTorch (it raises
        # IndexError on a 0-dim tensor today); .item() is the supported
        # scalar accessor.
        print("Current Batch Loss:", round(loss.item(), 3))
        total_examples += batch_size

        # The cost is the number of error bits per sequence
        binary_output = output.clone().data
        binary_output = binary_output > 0.5
        cost = torch.sum(
            torch.abs(binary_output.float() - batch.data[:, :, :-1]))

        losses += [loss.item()]
        costs += [cost / batch_size]
        seq_lens += [batch.size(2)]

        # Checkpoint model
        if (checkpoint_interval != 0) and (total_examples % checkpoint_interval
                                           == 0):
            print("Saving Checkpoint!")
            save_checkpoint(ntm, total_examples / batch_size, losses, costs,
                            seq_lens, total_examples, controller_type,
                            num_inputs, num_outputs, controller_size,
                            controller_layers, memory_size,
                            memory_feature_size, integer_shift, batch_size,
                            cuda)

            # Evaluate model on this saved checkpoint
            test_cost, prediction, input = evaluate(
                model=ntm,
                testset=testing_dataset,
                batch_size=batch_size,
                memory_feature_size=memory_feature_size,
                controller_type=controller_type,
                cuda=cuda)
            print("Total Test Cost (in bits per sequence):", test_cost)
            print("Example of Input/Output")
            print("prediction:", prediction[0])
            print("Input:", input[0])

        # BUGFIX: guard the division so checkpoint_interval == 0 (explicitly
        # allowed by the guard above) no longer raises ZeroDivisionError;
        # with checkpointing disabled we simply run through the dataset.
        if checkpoint_interval != 0 and \
                total_examples / checkpoint_interval >= total_batches:
            break
Esempio n. 13
0
def trainValidateSegmentation(args):
    '''
    Main function for training and validation of the EESPNet segmentation
    model: builds data loaders at several scales, trains one epoch per scale,
    validates, and checkpoints (keeping the best model by validation mIOU).
    :param args: global arguments (argparse namespace; reads classes, s,
        pretrained, savedir, cached_data_file, data_dir, inWidth, inHeight,
        scaleIn, batch_size, num_workers, lr, max_epochs, step_loss, resume,
        logFile; sets onGPU)
    :return: None
    '''

    # load the model
    cuda_available = torch.cuda.is_available()
    num_gpus = torch.cuda.device_count()
    model = net.EESPNet_Seg(args.classes,
                            s=args.s,
                            pretrained=args.pretrained,
                            gpus=num_gpus)

    if num_gpus >= 1:
        model = torch.nn.DataParallel(model)

    args.savedir = args.savedir + str(args.s) + '/'

    # create the directory if not exist
    # BUGFIX: os.mkdir fails when intermediate directories are missing and
    # races with concurrent runs; makedirs(..., exist_ok=True) handles both.
    os.makedirs(args.savedir, exist_ok=True)

    # check if processed data file exists or not
    if not os.path.isfile(args.cached_data_file):
        dataLoad = ld.LoadData(args.data_dir, args.classes,
                               args.cached_data_file)
        data = dataLoad.processData()
        if data is None:
            print('Error while pickling data. Please check.')
            exit(-1)
    else:
        data = pickle.load(open(args.cached_data_file, "rb"))

    # BUGFIX: args.onGPU was only assigned inside the cuda_available branch,
    # so the `if args.onGPU:` checks below raised AttributeError on CPU-only
    # machines.  Set it unconditionally.
    args.onGPU = cuda_available
    if cuda_available:
        model = model.cuda()

    total_paramters = netParams(model)
    print('Total network parameters: ' + str(total_paramters))

    # define optimization criteria; class weights compensate for the class
    # imbalance computed during data preprocessing
    weight = torch.from_numpy(
        data['classWeights'])  # convert the numpy array to torch
    if args.onGPU:
        weight = weight.cuda()

    criteria = torch.nn.CrossEntropyLoss(weight)  #weight

    if args.onGPU:
        criteria = criteria.cuda()

    print('Data statistics')
    print(data['mean'], data['std'])
    print(data['classWeights'])

    #compose the data with transforms
    trainDataset_main = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(args.inWidth, args.inHeight)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDataset_scale1 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(int(args.inWidth * 1.5),
                                            int(1.5 * args.inHeight))),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDataset_scale2 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(int(args.inWidth * 1.25),
                                            int(1.25 *
                                                args.inHeight))),  # 1536, 768
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDataset_scale3 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(int(args.inWidth * 0.75),
                                            int(0.75 * args.inHeight))),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDataset_scale4 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(int(args.inWidth * 0.5),
                                            int(0.5 * args.inHeight))),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    valDataset = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(1024, 512),
        myTransforms.ToTensor(args.scaleIn),
    ])

    # since we training from scratch, we create data loaders at different scales
    # so that we can generate more augmented data and prevent the network from overfitting

    trainLoader = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        data['trainIm'], data['trainAnnot'], transform=trainDataset_main),
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers,
                                              pin_memory=True)

    trainLoader_scale1 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'],
                               data['trainAnnot'],
                               transform=trainDataset_scale1),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True)

    trainLoader_scale2 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'],
                               data['trainAnnot'],
                               transform=trainDataset_scale2),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True)

    trainLoader_scale3 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'],
                               data['trainAnnot'],
                               transform=trainDataset_scale3),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True)

    trainLoader_scale4 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'],
                               data['trainAnnot'],
                               transform=trainDataset_scale4),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True)

    valLoader = torch.utils.data.DataLoader(myDataLoader.MyDataset(
        data['valIm'], data['valAnnot'], transform=valDataset),
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            num_workers=args.num_workers,
                                            pin_memory=True)

    if args.onGPU:
        cudnn.benchmark = True

    start_epoch = 0
    best_val = 0
    lr = args.lr

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr, (0.9, 0.999),
                                 eps=1e-08,
                                 weight_decay=5e-4)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_val = checkpoint['best_val']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    logFileLoc = args.savedir + args.logFile
    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
    else:
        logger = open(logFileLoc, 'w')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write(
            "\n%s\t%s\t%s\t%s\t%s\t" %
            ('Epoch', 'Loss(Tr)', 'Loss(val)', 'mIOU (tr)', 'mIOU (val'))
    logger.flush()

    for epoch in range(start_epoch, args.max_epochs):

        # polynomial LR decay instead of a step scheduler
        poly_lr_scheduler(args, optimizer, epoch)
        lr = 0
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        print("Learning rate: " + str(lr))

        # train for one epoch
        # We consider 1 epoch with all the training data (at different scales)
        train(args, trainLoader_scale1, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale2, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale4, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale3, model, criteria, optimizer, epoch)
        lossTr, overall_acc_tr, per_class_acc_tr, per_class_iu_tr, mIOU_tr = train(
            args, trainLoader, model, criteria, optimizer, epoch)

        # evaluate on validation set
        lossVal, overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = val(
            args, valLoader, model, criteria)

        is_best = mIOU_val > best_val
        # BUGFIX: actually track the best validation mIOU.  Previously
        # best_val stayed 0 forever, so every epoch "improved", model_best.pth
        # was overwritten unconditionally, and checkpoints/resumes carried a
        # stale best_val.
        if is_best:
            best_val = mIOU_val
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr': lr,
                'best_val': best_val,
            }, args.savedir + 'checkpoint.pth.tar')

        #save the model also
        if is_best:
            model_file_name = args.savedir + os.sep + 'model_best.pth'
            torch.save(model.state_dict(), model_file_name)

        with open(args.savedir + 'acc_' + str(epoch) + '.txt', 'w') as log:
            log.write(
                "\nEpoch: %d\t Overall Acc (Tr): %.4f\t Overall Acc (Val): %.4f\t mIOU (Tr): %.4f\t mIOU (Val): %.4f"
                % (epoch, overall_acc_tr, overall_acc_val, mIOU_tr, mIOU_val))
            log.write('\n')
            log.write('Per Class Training Acc: ' + str(per_class_acc_tr))
            log.write('\n')
            log.write('Per Class Validation Acc: ' + str(per_class_acc_val))
            log.write('\n')
            log.write('Per Class Training mIOU: ' + str(per_class_iu_tr))
            log.write('\n')
            log.write('Per Class Validation mIOU: ' + str(per_class_iu_val))

        logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.7f" %
                     (epoch, lossTr, lossVal, mIOU_tr, mIOU_val, lr))
        logger.flush()
        print("Epoch : " + str(epoch) + ' Details')
        print(
            "\nEpoch No.: %d\tTrain Loss = %.4f\tVal Loss = %.4f\t mIOU(tr) = %.4f\t mIOU(val) = %.4f"
            % (epoch, lossTr, lossVal, mIOU_tr, mIOU_val))
    logger.close()
Esempio n. 14
0
def train_scg(config, tols, milestone, device='cpu'):
    """Train a GAN with the SCG (stochastic competitive gradient) optimizer,
    driven entirely by a configuration dict.

    :param config: dict read for 'lr_d', 'lr_g', 'optimizer', 'z_dim',
        'model', 'epoch_num', 'show_iter', 'loss_type', 'd_penalty',
        'logdir', 'startn', 'dataset', 'datapath', 'batchsize',
        'checkpoint', 'gpu_num'.
    :param tols: dict with 'tol' and 'atol' solver tolerances for SCG.
    :param milestone: LR-schedule milestones passed to lr_scheduler.
    :param device: torch device string, e.g. 'cpu' or 'cuda:0'.
    :return: None
    """
    lr_d = config['lr_d']
    lr_g = config['lr_g']
    optim_type = config['optimizer']
    z_dim = config['z_dim']
    model_name = config['model']
    epoch_num = config['epoch_num']
    show_iter = config['show_iter']
    loss_name = config['loss_type']
    l2_penalty = config['d_penalty']
    logdir = config['logdir']
    start_n = config['startn']
    dataset = get_data(dataname=config['dataset'], path='../datas/%s' % config['datapath'])
    # Two independent loaders over the same dataset: `dataloader` drives the
    # outer training loop; `inner_loader` is handed to SCG for its own
    # internal sampling.
    dataloader = DataLoader(dataset=dataset, batch_size=config['batchsize'],
                            shuffle=True, num_workers=4)
    inner_loader = DataLoader(dataset=dataset, batch_size=config['batchsize'],
                              shuffle=True, num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim)
    # Apply the custom weight initializers before moving to the device.
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    optimizer = SCG(max_params=G.parameters(), min_params=D.parameters(),
                    lr_max=lr_g, lr_min=lr_d,
                    tol=tols['tol'], atol=tols['atol'],
                    dataloader=inner_loader,
                    device=device, solver='cg')
    scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    if config['checkpoint'] is not None:
        # Resume models and optimizer state from a saved checkpoint.
        startPoint = config['checkpoint']
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        optimizer.load_state_dict(chk['optim'])
        print('Start from %s' % startPoint)
    gpu_num = config['gpu_num']
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))

    timer = time.time()
    count = 0
    # DCGAN variants expect 4-D noise (N, z, 1, 1); others take 2-D noise.
    if model_name == 'DCGAN' or model_name == 'DCGAN-WBN':
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        # Epoch-indexed LR schedule step.
        scheduler.step(epoch=e)
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            optimizer.zero_grad()
            real_x = real_x[0]
            if model_name == 'DCGAN' or model_name == 'DCGAN-WBN':
                z = torch.randn((real_x.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((real_x.shape[0], z_dim), device=device)
            # Closure re-evaluated by the SCG optimizer; it captures this
            # iteration's noise `z` and takes the real minibatch as an
            # argument (presumably so SCG can re-sample real data from its
            # inner loader -- confirm against the SCG implementation).
            def closure(train_x):
                train_x = train_x.to(device)
                fake_x = G(z)
                d_fake = D(fake_x)
                d_real = D(train_x)
                loss = get_loss(name=loss_name, g_loss=False,
                                d_real=d_real, d_fake=d_fake,
                                l2_weight=l2_penalty, D=D)
                return loss
            loss = optimizer.step(closure=closure, img=real_x)
            if count % show_iter == 0:
                # Periodic console report, sample grid, and checkpoint.
                time_cost = time.time() - timer
                print('Iter :%d , Loss: %.5f, time: %.3fs'
                      % (count, loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (config['dataset'], logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img, path + 'iter_%d.png' % (count + start_n), normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s%.3f_%d.pth' % (optim_type, model_name, lr_g, count + start_n),
                                D=D, G=G, optimizer=optimizer)
            count += 1
Esempio n. 15
0
def main():
    """Top-level training driver.

    Parses CLI args, snapshots source/config into the log dir, builds the
    train/eval dataloaders and the UNet3d + ImNet model pair, optionally
    resumes from a checkpoint, then runs the epoch loop with evaluation,
    LR scheduling and best-loss checkpointing.
    """
    args = get_args()

    use_cuda = (not args.no_cuda) and torch.cuda.is_available()
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")
    # adjust batch size based on the number of gpus available
    # NOTE(review): on a CPU-only machine device_count() is 0, which makes
    # batch_size 0 -- presumably GPU-only usage is assumed; confirm.
    args.batch_size = int(torch.cuda.device_count()) * args.batch_size_per_gpu

    # log and create snapshots of the current sources + the full arg dict
    os.makedirs(args.log_dir, exist_ok=True)
    filenames_to_snapshot = glob("*.py") + glob("*.sh")
    utils.snapshot_files(filenames_to_snapshot, args.log_dir)
    logger = utils.get_logger(log_dir=args.log_dir)
    with open(os.path.join(args.log_dir, "params.json"), 'w') as fh:
        json.dump(args.__dict__, fh, indent=2)
    logger.info("%s", repr(args))

    # tensorboard writer
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, 'tensorboard'))

    # random seed for reproducability
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # create dataloaders; eval set additionally returns the high-res
    # reference crops (return_hres=True) while the train set does not
    trainset = loader.RB2DataLoader(
        data_dir=args.data_folder,
        data_filename=args.train_data,
        nx=args.nx,
        nz=args.nz,
        nt=args.nt,
        n_samp_pts_per_crop=args.n_samp_pts_per_crop,
        downsamp_xz=args.downsamp_xz,
        downsamp_t=args.downsamp_t,
        normalize_output=args.normalize_channels,
        return_hres=False,
        lres_filter=args.lres_filter,
        lres_interp=args.lres_interp)
    evalset = loader.RB2DataLoader(
        data_dir=args.data_folder,
        data_filename=args.eval_data,
        nx=args.nx,
        nz=args.nz,
        nt=args.nt,
        n_samp_pts_per_crop=args.n_samp_pts_per_crop,
        downsamp_xz=args.downsamp_xz,
        downsamp_t=args.downsamp_t,
        normalize_output=args.normalize_channels,
        return_hres=True,
        lres_filter=args.lres_filter,
        lres_interp=args.lres_interp)

    # sample WITH replacement so an "epoch" is a fixed pseudo-epoch of
    # pseudo_epoch_size draws (train) / num_log_images draws (eval)
    train_sampler = RandomSampler(trainset,
                                  replacement=True,
                                  num_samples=args.pseudo_epoch_size)
    eval_sampler = RandomSampler(evalset,
                                 replacement=True,
                                 num_samples=args.num_log_images)

    train_loader = DataLoader(trainset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              drop_last=True,
                              sampler=train_sampler,
                              **kwargs)
    eval_loader = DataLoader(evalset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             drop_last=False,
                             sampler=eval_sampler,
                             **kwargs)

    # setup model: UNet3d encodes 4 input channels into lat_dims latent
    # features; ImNet decodes latent features back to 4 channels at
    # continuous 3D query points
    unet = UNet3d(in_features=4,
                  out_features=args.lat_dims,
                  igres=trainset.scale_lres,
                  nf=args.unet_nf,
                  mf=args.unet_mf)
    imnet = ImNet(dim=3,
                  in_features=args.lat_dims,
                  out_features=4,
                  nf=args.imnet_nf,
                  activation=NONLINEARITIES[args.nonlin])
    all_model_params = list(unet.parameters()) + list(imnet.parameters())

    if args.optim == "sgd":
        optimizer = optim.SGD(all_model_params, lr=args.lr)
    else:
        optimizer = optim.Adam(all_model_params, lr=args.lr)

    start_ep = 0
    # 1-element array: presumably mutated in place by train() so the step
    # count survives across epochs -- TODO confirm; uint32 wraps at 2**32
    global_step = np.zeros(1, dtype=np.uint32)
    tracked_stats = np.inf  # best (lowest) training loss seen so far

    if args.resume:
        resume_dict = torch.load(args.resume)
        start_ep = resume_dict["epoch"]
        global_step = resume_dict["global_step"]
        tracked_stats = resume_dict["tracked_stats"]
        unet.load_state_dict(resume_dict["unet_state_dict"])
        imnet.load_state_dict(resume_dict["imnet_state_dict"])
        optimizer.load_state_dict(resume_dict["optim_state_dict"])
        # optimizer state tensors are restored on CPU; move them to the
        # training device so optimizer.step() does not mix devices
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    # wrap AFTER loading state dicts (checkpoints store unwrapped keys)
    unet = nn.DataParallel(unet)
    unet.to(device)
    imnet = nn.DataParallel(imnet)
    imnet.to(device)

    model_param_count = lambda model: sum(x.numel()
                                          for x in model.parameters())
    logger.info("{}(unet) + {}(imnet) paramerters in total".format(
        model_param_count(unet), model_param_count(imnet)))

    checkpoint_path = os.path.join(args.log_dir, "checkpoint_latest.pth.tar")

    # get pdelayer for the RB2 equations; crop extents are expressed in
    # physical units (t: 0.125/step, x/z: 1/128 per grid point)
    if args.normalize_channels:
        mean = trainset.channel_mean
        std = trainset.channel_std
    else:
        mean = std = None
    pde_layer = get_rb2_pde_layer(mean=mean,
                                  std=std,
                                  t_crop=args.nt * 0.125,
                                  z_crop=args.nz * (1. / 128),
                                  x_crop=args.nx * (1. / 128),
                                  prandtl=args.prandtl,
                                  rayleigh=args.rayleigh,
                                  use_continuity=args.use_continuity)

    # scheduler only exists when args.lr_scheduler is set; both uses below
    # are guarded by the same flag
    if args.lr_scheduler:
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    # training loop
    for epoch in range(start_ep + 1, args.epochs + 1):
        loss = train(args, unet, imnet, train_loader, epoch, global_step,
                     device, logger, writer, optimizer, pde_layer)
        # NOTE(review): 'eval' is presumably a project function (it shadows
        # the builtin at the call site) -- confirm the import
        eval(args, unet, imnet, eval_loader, epoch, global_step, device,
             logger, writer, optimizer, pde_layer)
        if args.lr_scheduler:
            scheduler.step(loss)
        # "best" is tracked on TRAINING loss, not eval loss
        if loss < tracked_stats:
            tracked_stats = loss
            is_best = True
        else:
            is_best = False

        # save .module state dicts so checkpoints load without DataParallel
        utils.save_checkpoint(
            {
                "epoch": epoch,
                "unet_state_dict": unet.module.state_dict(),
                "imnet_state_dict": imnet.module.state_dict(),
                "optim_state_dict": optimizer.state_dict(),
                "tracked_stats": tracked_stats,
                "global_step": global_step,
            }, is_best, epoch, checkpoint_path, "_pdenet", logger)
Esempio n. 16
0
def train(epoch_num=10,
          milestone=None,
          optim_type='Adam',
          lr_d=1e-4,
          lr_g=1e-4,
          startPoint=None,
          start_n=0,
          z_dim=128,
          batchsize=64,
          loss_name='WGAN',
          model_name='dc',
          model_config=None,
          data_path='None',
          show_iter=100,
          logdir='test',
          dataname='cifar10',
          device='cpu',
          gpu_num=1,
          saturating=False):
    """Alternating-update GAN training loop.

    Per batch: one Adam step on the discriminator loss, then one step on
    the generator. Losses and mean D outputs go to TensorBoard; every
    ``show_iter`` iterations a sample grid from a fixed noise vector and
    a checkpoint are written.

    Parameters (selected):
        startPoint -- checkpoint path to resume D/G and both optimizers.
        start_n -- iteration offset added to saved file names on resume.
        saturating -- if True, skip the separate generator loss and step
            G with the gradients it received from the D-loss backward.
        gpu_num -- wrap D and G in DataParallel when > 1.

    NOTE(review): 'milestone' is never used in this body -- dead
    parameter, or consumed by an overload elsewhere? Confirm.
    """
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s' % (logdir, current_time))
    d_optimizer = Adam(D.parameters(), lr=lr_d, betas=(0.5, 0.999))
    g_optimizer = Adam(G.parameters(), lr=lr_g, betas=(0.5, 0.999))
    if startPoint is not None:
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        d_optimizer.load_state_dict(chk['d_optim'])
        g_optimizer.load_state_dict(chk['g_optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0  # global iteration counter across all epochs
    # DCGAN variants expect 4-D noise (N, z, 1, 1); others take flat (N, z)
    if 'DCGAN' in model_name:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)

    for e in range(epoch_num):
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            if 'DCGAN' in model_name:
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            d_loss = get_loss(name=loss_name,
                              g_loss=False,
                              d_real=d_real,
                              d_fake=d_fake)
            # zero BOTH optimizers: d_loss.backward() also populates G's
            # grads (fake_x is in the graph), which the saturating branch
            # below deliberately reuses
            d_optimizer.zero_grad()
            g_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

            if not saturating:
                # non-saturating mode: fresh noise, dedicated G loss
                if 'DCGAN' in model_name:
                    z = torch.randn((d_real.shape[0], z_dim, 1, 1),
                                    device=device)
                else:
                    z = torch.randn((d_real.shape[0], z_dim), device=device)
                fake_x = G(z)
                d_fake = D(fake_x)
                g_loss = get_loss(name=loss_name, g_loss=True, d_fake=d_fake)
                g_optimizer.zero_grad()
                g_loss.backward()
            else:
                # saturating mode: step G with the gradients accumulated
                # from d_loss.backward() above; g_loss aliased for logging
                g_loss = d_loss
            g_optimizer.step()

            writer.add_scalar('Loss/D loss', d_loss.item(), count)
            writer.add_scalar('Loss/G loss', g_loss.item(), count)
            # NOTE: these D outputs were computed BEFORE the G step above
            writer.add_scalars('Discriminator output', {
                'Generated image': d_fake.mean().item(),
                'Real image': d_real.mean().item()
            },
                               global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter %d, D Loss: %.5f, G loss: %.5f, time: %.2f s' %
                      (count, d_loss.item(), g_loss.item(), time_cost))
                timer = time.time()
                # sample from the fixed noise for a comparable image grid
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s_%d.pth' %
                                (optim_type, model_name, count + start_n),
                                D=D,
                                G=G,
                                optimizer=d_optimizer,
                                g_optimizer=g_optimizer)
            count += 1
    writer.close()
Esempio n. 17
0
def main_worker(gpu, ngpus_per_node, args):
    """Single-worker training driver: build the model, data pipeline and
    optimizer, then run the epoch loop with validation, weight tracking
    and checkpointing.

    Parameters
    ----------
    gpu : GPU index handed to this worker (unused here; ``args.gpu``
        controls device placement).
    ngpus_per_node : number of GPUs per node (unused in this body).
    args : argparse namespace supplying every option referenced below.

    Side effects: mutates the module-global ``best_acc1`` and writes
    ``log.json``, the run config and checkpoints via ``save_checkpoint``.
    """
    global best_acc1

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # Count classes from the training directory (a class folder counts
    # only if it holds at least min_allowed_imgs images).
    main_file = args.root / args.main_file
    num_classes = len([cur_dir.name for cur_dir in main_file.iterdir()
                       if len(list(cur_dir.iterdir())) >= args.min_allowed_imgs])
    if not num_classes == 1000:
        print('[INFO]: Using {} classes instead of 1000 ImageNet classes'.format(num_classes))

    # create model
    # NOTE(review): the pretrained branch keys on args.arch while the
    # fresh-model branch keys on args.model -- confirm both are always
    # set consistently by the caller.
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.model))
        model = models.__dict__[args.model](num_classes=num_classes)

    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    if args.loss_func in ['cross', 'cross_entropy', 'entropy']:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    elif args.loss_func in ['l2', 'l2_squared', 'squared', 'MSE']:
        print('[INFO] Using MSE loss function instead of Cross Entropy.')
        args.loss_func = 'l2'
        criterion = nn.MSELoss().cuda(args.gpu)
    else:
        # BUG FIX: an unrecognized loss_func previously left `criterion`
        # undefined and crashed later with a NameError; fail fast instead,
        # mirroring the optimizer selection below.
        raise ValueError('Incorrect loss function selection {}'.format(args.loss_func))

    if args.opt.lower() == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adam':
        print('[INFO] Using Adam optimizer instead of SGD.')
        optimizer = torch.optim.Adam(model.parameters(), args.lr,
                                     weight_decay=args.weight_decay)
    elif args.opt.lower() == 'lbfgs':
        print('[INFO] Using LBFGS optimizer instead of SGD.')
        optimizer = torch.optim.LBFGS(model.parameters(), args.lr,
                                      history_size=20)
    else:
        raise ValueError('Incorrect optimizer selection {}'.format(args.opt))

    # Optionally rebuild the optimizer with one param group per layer.
    # NOTE(review): iterating `model` assumes an nn.Sequential-like
    # container; a DataParallel-wrapped model is not iterable -- confirm
    # this flag is only used with compatible models.
    if args.initial_lr:
        param_setup = [{'params': cur_lay.parameters()}
                       for i, cur_lay in enumerate(model)
                       if 'weight' in dir(cur_lay)]
        optimizer = torch.optim.SGD(param_setup, args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    # Cyclic LR between lr/100 and lr; only defined when schedule_lr is
    # set (all later uses are guarded by the same flag).
    if args.schedule_lr:
        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer,
                                                      args.lr / 100, args.lr)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    test_file = args.root / args.test_file
    if args.sub_file:
        sub_file = args.root / args.sub_file
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_trans_list = []
    if not args.norandomcrop:
        train_trans_list.append(transforms.RandomResizedCrop(224))
    if not args.norandomflip:
        train_trans_list.append(transforms.RandomHorizontalFlip())
    train_trans_list = train_trans_list + [transforms.ToTensor(), normalize]

    train_dataset = datasets.ImageFolder(
        main_file,
        transforms.Compose(train_trans_list)
    )

    # NOTE(review): torchvision's ImageFolder takes no 'train' kwarg --
    # presumably `datasets` here is a project wrapper; confirm.
    test_dataset = datasets.ImageFolder(test_file,
                                        transforms.Compose([
                                            transforms.Resize(256),
                                            transforms.CenterCrop(224),
                                            transforms.ToTensor(),
                                            normalize,
                                        ]),
                                        train=False)

    if args.sub_file:
        # BUG FIX: this dataset previously loaded from test_file, leaving
        # the sub_file path computed above unused.
        sub_dataset = datasets.ImageFolder(sub_file,
                                           transforms.Compose([
                                               transforms.Resize(256),
                                               transforms.CenterCrop(224),
                                               transforms.ToTensor(),
                                               normalize,
                                           ]),
                                           train=False)

    # Optionally restrict to a class subset and/or cap images per class,
    # remapping the selected class ids to 0..len(select_class_list)-1.
    # NOTE(review): indexing .samples/.targets with index lists and
    # boolean masks assumes they are numpy arrays, not plain lists --
    # confirm against the dataset wrapper.
    if args.train_size or args.select_class_list:
        if not args.select_class_list:
            args.select_class_list = list(range(args.num_classes))
        sel_idx = []
        for lbl in args.select_class_list:
            lbl_idx = [i for i, t in enumerate(train_dataset.targets) if t == lbl]
            sel_idx += random.sample(lbl_idx, (args.train_size if args.train_size else len(lbl_idx)))
        train_dataset.samples = train_dataset.samples[sel_idx]
        train_dataset.targets = train_dataset.targets[sel_idx]
        for cur_idx, cur_cls in enumerate(args.select_class_list):
            train_dataset.targets[train_dataset.targets==cur_cls] = cur_idx

        sel_idx = []
        for lbl in args.select_class_list:
            lbl_idx = [i for i, t in enumerate(test_dataset.targets) if t == lbl]
            sel_idx += lbl_idx
        test_dataset.samples = test_dataset.samples[sel_idx]
        test_dataset.targets = test_dataset.targets[sel_idx]
        for cur_idx, cur_cls in enumerate(args.select_class_list):
            test_dataset.targets[test_dataset.targets==cur_cls] = cur_idx

    # Inject symmetric noise to training set: within each class, reassign
    # a fraction (inject_noise) of labels evenly to every class.
    if args.inject_noise:
        im_per_class = int(len(train_dataset) / args.num_classes)
        noisy_labels = np.zeros((len(train_dataset),), dtype=int)
        num_shuffle = int(im_per_class * args.inject_noise)
        for i in range(args.num_classes):
            noisy_idx = []
            cur_idx = [idx for idx, label in enumerate(train_dataset.targets) if label==i]
            shuffled_idx = random.sample(cur_idx, len(cur_idx))
            for r in range(args.num_classes):
                noisy_idx += [r for idx in shuffled_idx[im_per_class - (r+1)*num_shuffle:im_per_class - r*num_shuffle]]
            noisy_idx += [i for idx in shuffled_idx[:im_per_class - args.num_classes*num_shuffle]]
            noisy_labels[cur_idx] = np.array(noisy_idx)
        train_dataset.targets = noisy_labels

    # BUG FIX: moved before the mix_cifar branch, which builds a loader
    # referencing train_sampler -- previously that was a NameError.
    train_sampler = None

    # TODO: Replace fraction of one training set randomly with another.
    if args.mix_cifar:
        assert args.mix_rate, "mix_rate should be given when mix_cifar is set"
        assert args.traindir2, "traindir2 must be given when mix_cifar is set"
        assert not args.inject_noise, "inject_noise should not be given when mix_cifar is set"
        assert not args.testdir2, "only one testdir can be set when mix_cifar is set"

        traindir2 = os.path.join(args.root, args.traindir2)
        clean_dataset = datasets.ImageFolder(
            traindir2,
            transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ]))

        # Per class, replace num_shuffle training images with images
        # drawn from the clean (second) dataset.
        im_per_class = int(len(train_dataset) / len(train_dataset.classes))
        num_shuffle = int(im_per_class * args.mix_rate)
        shuffled_samples = []
        clean_samples = []
        for i in range(len(train_dataset.classes)):
            cur_imgs = [s[0] for s in train_dataset.samples if s[1]==i]
            cur_imgs = random.sample(cur_imgs, im_per_class - num_shuffle)
            mix_imgs = [s[0] for s in clean_dataset.samples if s[1]==i]
            mix_imgs = random.sample(mix_imgs, num_shuffle)
            clean_samples += [(img, i) for img in mix_imgs]
            shuffled_samples += [(img, i) for img in cur_imgs + mix_imgs]

        train_dataset.samples = shuffled_samples
        clean_dataset.samples = clean_samples

        val_loader2 = torch.utils.data.DataLoader(
            clean_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
            num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.sub_file:
        val_loader2 = torch.utils.data.DataLoader(
            sub_dataset,
            batch_size=args.batch_size, shuffle=False,
            num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    # Random probe vector for Jacobian-vector products, scaled by the
    # dataset size.
    if args.compute_jacobian:
        gvec = (torch.randn((1, args.num_classes)) / len(train_dataset)).cuda(args.gpu, non_blocking=True)

    # TODO: tracking weights of the model
    # NOTE(review): enumerate(model) again assumes a Sequential-like model.
    if args.track_weights:
        layer_idx = [i for i, cl in enumerate(model) if 'weight' in dir(cl)]
        cur_weights = get_weights(model, layer_idx)
        if args.track_weights == 'filters':
            filter_w_file = args.outpath / 'filter_weights.pickle'
            filter_w_dict = {('layer_'+str(l)): [] for i, l in enumerate(layer_idx)
                             if cur_weights[i].ndim > 2}
        if args.track_weights == 'norm':
            w_norm_dict = {('layer_'+str(l)): 0 for i, l in enumerate(layer_idx)
                             if cur_weights[i].ndim > 1}

    # TODO: scaling the weights of the model manually
    # NOTE(review): `cur_weights` only exists when track_weights is also
    # set -- confirm scale_weights is never used without it.
    if args.scale_weights:
        scale_dict = {}
        for cur_l, cur_w in enumerate(cur_weights):
            if not (cur_w.ndim > 2):
                continue
            scale_dict['layer_' + str(layer_idx[cur_l])] = np.linalg.norm(cur_w.flatten()).item()
        rescale_weights(model, scale_dict)

    save_config(args)
    train_log = []
    log_file = args.outpath / 'log.json'

    for epoch in range(args.start_epoch, args.epochs):
        # Manual LR decay only while below max_lr_adjusting_epoch and no
        # scheduler is active.
        if (epoch < args.max_lr_adjusting_epoch) and (not args.schedule_lr):
            adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        epoch_log = {'epoch': epoch}

        # update learning rate with scheduler
        if args.schedule_lr:
            scheduler.step()

        # evaluate on the training set (accuracy), then the test set
        dum_acc1, dum_acc5 = validate(train_loader, model, criterion, args)
        epoch_log.update({'train': {'acc1': dum_acc1.cpu().numpy().item(),
                                    'acc5': dum_acc5.cpu().numpy().item()}})

        acc1, acc5 = validate(val_loader, model, criterion, args)
        epoch_log.update({'test': {'acc1': acc1.cpu().numpy().item(),
                                   'acc5': acc5.cpu().numpy().item()}})

        if args.sub_file or args.mix_cifar:
            dum_acc1, dum_acc5 = validate(val_loader2, model, criterion, args)
            epoch_log.update({'subset': {'acc1': dum_acc1.cpu().numpy().item(),
                                         'acc5': dum_acc5.cpu().numpy().item()}})

        # compute the jacobian of the network
        if args.compute_jacobian:
            jTg = get_jacobian_prod(train_loader, model, criterion, gvec, args)
            epoch_log.update({'J_norm': {str(k): v.item() for k, v in enumerate(jTg)}})

        # TODO: tracking the weights of the layers
        if args.track_weights:
            w_change_dict = {('layer_'+str(l)): 0 for l in layer_idx}
            new_weights = get_weights(model, layer_idx)

            if args.track_weights == 'norm':
                # log the norm of each (>=2-D) layer's weights
                for cur_l, cur_w in enumerate(new_weights):
                    if not (cur_w.ndim > 1):
                        continue
                    w_norm_dict['layer_' + str(layer_idx[cur_l])] = np.linalg.norm(cur_w.flatten()).item()
                epoch_log.update({'w_norm': {k: v for k, v in w_norm_dict.items()}})

            else:
                # log the mean absolute per-layer weight change since the
                # previous epoch (per-filter history in 'filters' mode)
                for cur_l in range(len(layer_idx)):
                    cur_change = new_weights[cur_l] - cur_weights[cur_l]

                    if args.track_weights == 'filters':
                        if cur_change.ndim > 2:
                            cur_change = np.mean(cur_change, axis=(2,3))
                            filter_w_dict['layer_' + str(layer_idx[cur_l])].append(np.absolute(cur_change))

                    chng = np.absolute(np.mean(cur_change))
                    w_change_dict['layer_' + str(layer_idx[cur_l])] = chng.item()

                epoch_log.update({'weight_change': {k: v for k, v in w_change_dict.items()}})

                if args.track_weights == 'filters':
                    with open(filter_w_file, 'wb') as fn:
                        pickle.dump({k: np.stack(v) for k, v in filter_w_dict.items()}, fn)

                cur_weights = [wh for wh in new_weights]
                new_weight = None

        train_log.append(epoch_log)
        # rewrite the full log each epoch so crashes lose nothing
        with open(log_file, 'w') as fn:
            json.dump(train_log, fn, indent=2)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)
Esempio n. 18
0
def main(args):
    """Train ``Net`` on the dataloader from ``load_dataloaer`` and log to
    TensorBoard.

    Saves the argparse config as YAML next to the run logs, writes a
    checkpoint every 5000 iterations, and records the per-epoch average
    training loss in ``total_train_loss``.
    """
    set_seed(args)
    save_dir = os.path.join(args.CHK_DIR, args.LOG_DIR, args.train_id)
    log_path = os.path.join('runs/', args.LOG_DIR, args.train_id)
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(log_path, exist_ok=True)
    ## save argparse parameters
    with open(os.path.join(log_path, args.train_id + '_args.yaml'), 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}: {}\n'.format(k, v))
    writer = SummaryWriter(log_path)

    train_loader = load_dataloaer(args)
    model = Net()
    model = try_gpu(model)
    model.train()
    criterion = Loss()
    # only optimize parameters that require gradients
    optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                               model.parameters()),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    init_epoch = 0
    # BUG FIX: this list was never initialized before being appended to
    # below, which raised a NameError at the end of the first epoch.
    total_train_loss = []
    ## init time
    zero_time = time.time()

    for epoch in range(init_epoch, args.epochs):
        start_time = time.time()

        avg_loss = 0
        for cnt, (img, target) in enumerate(train_loader, 1):
            print(cnt, img.shape, target.shape)
            img, target = try_gpu(img), try_gpu(target)
            pred = model(img)
            loss = criterion(pred, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()

            # log the raw batch loss every 10 iterations
            if (cnt % 10 == 0):
                writer.add_scalar(
                    'training loss', loss.item(),
                    epoch * (len(train_loader) // args.batch_size) + cnt)

            # mid-epoch checkpoint every 5000 iterations
            if (cnt % 5000 == 0):
                cp_file = os.path.join(
                    save_dir,
                    'epoch_' + str(epoch) + '_itr_' + str(cnt)) + '.pt'
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, cp_file)

        avg_loss /= len(train_loader)
        end_time = time.time()

        # wall-clock bookkeeping (computed but not yet reported)
        epoch_time = end_time - start_time
        total_time = end_time - zero_time

        total_train_loss.append(avg_loss)

    # BUG FIX: close the writer once after ALL epochs finish; it was
    # previously closed inside the epoch loop, so any add_scalar call
    # from epoch 1 onward hit a closed writer.
    writer.close()
Esempio n. 19
0
def perform_training(params):
    '''
    Runs the remaining training epochs for the currently loaded model.
    Bails out early when no model is loaded or when stale downstream
    checkpoints cannot be cleared.

    Keyword arguments: params
    > params (dict) -- currently loaded state dict.

    Returns: N/A
    '''
    if params['model'] is None:
        print(
            'No model loaded! Type -n to create a new model, or -l to load an existing one from file.\n'
        )
        return

    # Drop checkpoints with higher epoch numbers so the stored history
    # stays consistent with the run we are about to perform.
    if not train_utils.delete_future_checkpoints(params):
        return
    setup_cuda(params)

    print('\n--- COMMENCE TRAINING ---\n')

    # Adversarial VAE training needs an auxiliary classifier model.
    classifier_state = None
    if params['is_generator'] and params['adversarial_train']:
        print(
            '\nAdversarially training a VAE. Please load a classifier model.')
        classifier_state = load_model(param_factory(False))
        setup_cuda(classifier_state)

    first_ep = params['cur_epoch']
    last_ep = params['total_epochs']

    # Training/val loop
    for ep in range(first_ep, last_ep + 1):

        print('--- TRAINING: begin epoch', ep, '---')

        # Stepwise learning-rate decay.
        adjust_learning_rate(ep, params)

        train_one_epoch(ep, params, classifier_state=classifier_state)
        print('--- TRAINING: end epoch', ep, '---')

        if params['evaluate']:
            # Standard validation pass.
            acc1 = validate(params, save=True, adversarial=False)
            classifier_mode = not params['is_generator']
            if params['adversarial_train'] and classifier_mode:
                # TODO: MAKE VALIDATE ACTUALLY SAVE PROPERLY FOR ADVERSARIAL VALIDATION
                ad_acc1 = validate(params,
                                   save=False,
                                   adversarial=True,
                                   adversarial_attack='FGSM',
                                   whitebox=True)

            # Track the best validation accuracies seen so far.
            if classifier_mode:
                if params['adversarial_train']:
                    params['best_ad_val_acc'] = max(ad_acc1,
                                                    params['best_ad_val_acc'])
                params['best_val_acc'] = max(acc1, params['best_val_acc'])

        # Advance to the next epoch to be trained.
        params['cur_epoch'] += 1

        # Checkpoint every 'save_every' finished epochs. Note that
        # params['cur_epoch'] is always the NEXT epoch to train, so the
        # epoch recorded in the save file equals params['cur_epoch'] - 1.
        if ep % params['save_every'] == 0:
            train_utils.save_checkpoint(params, ep)

    # Make sure the final epoch is checkpointed even when it does not
    # land on a 'save_every' boundary.
    if last_ep % params['save_every'] != 0:
        train_utils.save_checkpoint(params, last_ep)
    print('\n--- END TRAINING ---\n')
def train_mnist(epoch_num=10,
                show_iter=100,
                logdir='test',
                model_weight=None,
                load_d=False,
                load_g=False,
                compare_path=None,
                info_time=100,
                run_select=None,
                dataname='CIFAR10',
                data_path='None',
                device='cpu'):
    """Train a GAN (dc_D discriminator vs. dc_G generator) with plain SGD.

    Each batch does one generator step followed by one discriminator step
    using the 'JSD' loss from get_loss. Every `show_iter` iterations it
    prints losses, saves a sample-image grid under figs/<logdir>/ and a
    checkpoint via save_checkpoint.

    Args:
        epoch_num: number of passes over the dataset.
        show_iter: iterations between console logs / image + checkpoint dumps.
        logdir: subdirectory name used for figures and checkpoints.
        model_weight: optional checkpoint path to warm-start D and/or G.
        load_d: load the discriminator weights from `model_weight`.
        load_g: load the generator weights from `model_weight`.
        compare_path: optional checkpoint whose discriminator is used as a
            reference point for drift diagnostics below.
        info_time: iterations between the compare/diagnostic computations.
        run_select: optional path to frozen samples (and reference
            predictions on them) used to track discriminator prediction drift.
        dataname: dataset name passed to get_data (e.g. 'MNIST', 'CIFAR10').
        data_path: filesystem path passed to get_data.
        device: torch device string.
    """
    lr_d = 0.01
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    # NOTE(review): message hard-codes 'MNIST' but `dataname` defaults to
    # 'CIFAR10' — the printed label may not match the data actually loaded.
    print('MNIST, discriminator lr: %.3f, generator lr: %.3f' % (lr_d, lr_g))
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)
    if model_weight is not None:
        chk = torch.load(model_weight)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)
    if compare_path is not None:
        # Reference discriminator, flattened to one vector; presumably
        # get_diff compares D's current weights against it — verify get_diff.
        discriminator = dc_D().to(device)
        model_weight = torch.load(compare_path)
        discriminator.load_state_dict(model_weight['D'])
        model_vec = torch.cat(
            [p.contiguous().view(-1) for p in discriminator.parameters()])
        print('Load discriminator from %s' % compare_path)
    if run_select is not None:
        # Frozen real/fake samples plus reference predictions on them,
        # used below to measure how D's outputs drift over training.
        fixed_data = torch.load(run_select)
        real_set = fixed_data['real_set']
        fake_set = fixed_data['fake_set']
        real_d = fixed_data['real_d']
        fake_d = fixed_data['fake_d']
        fixed_vec = fixed_data['pred_vec']
        print('load fixed data set')

    d_optimizer = SGD(D.parameters(), lr=lr_d)
    g_optimizer = SGD(G.parameters(), lr=lr_g)
    timer = time.time()
    count = 0
    # Fixed noise so the periodically-saved sample grids are comparable.
    fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            # Detached copy: the later D update must not backprop into G.
            fake_x_c = fake_x.clone().detach()
            # update generator
            d_fake = D(fake_x)

            # writer.add_scalars('Discriminator output', {'Generated image': d_fake.mean().item(),
            #                                             'Real image': d_real.mean().item()},
            #                    global_step=count)
            G_loss = get_loss(name='JSD', g_loss=True, d_fake=d_fake)
            g_optimizer.zero_grad()
            G_loss.backward()
            g_optimizer.step()
            # L2 norm of the generator gradient (only used by the
            # commented-out tensorboard logging below).
            gg = torch.norm(torch.cat(
                [p.grad.contiguous().view(-1) for p in G.parameters()]),
                            p=2)

            # Discriminator loss on real data and the detached fakes.
            d_fake_c = D(fake_x_c)
            D_loss = get_loss(name='JSD',
                              g_loss=False,
                              d_real=d_real,
                              d_fake=d_fake_c)
            if compare_path is not None and count % info_time == 0:
                diff = get_diff(net=D, model_vec=model_vec)
                # writer.add_scalar('Distance from checkpoint', diff.item(), global_step=count)
                if run_select is not None:
                    with torch.no_grad():
                        d_real_set = D(real_set)
                        d_fake_set = D(fake_set)
                        diff_real = torch.norm(d_real_set - real_d, p=2)
                        diff_fake = torch.norm(d_fake_set - fake_d, p=2)
                        # sub_ mutates d_vec in place; d_vec is local so
                        # nothing outside this block is affected.
                        d_vec = torch.cat([d_real_set, d_fake_set])
                        diff = torch.norm(d_vec.sub_(fixed_vec), p=2)
                        # writer.add_scalars('L2 norm of pred difference',
                        #                    {'Total': diff.item(),
                        #                     'real set': diff_real.item(),
                        #                     'fake set': diff_fake.item()},
                        #                    global_step=count)
            d_optimizer.zero_grad()
            D_loss.backward()
            d_optimizer.step()
            # L2 norm of the discriminator gradient (logging only, as above).
            gd = torch.norm(torch.cat(
                [p.grad.contiguous().view(-1) for p in D.parameters()]),
                            p=2)
            # writer.add_scalars('Loss', {'D_loss': D_loss.item(),
            #                             'G_loss': G_loss.item()}, global_step=count)
            # writer.add_scalars('Grad', {'D grad': gd.item(),
            #                             'G grad': gg.item()}, global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs' %
                      (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s/' % logdir
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % count,
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='SGD-%.3f_%d.pth' % (lr_d, count),
                                D=D,
                                G=G)
            count += 1
# --- Esempio n. 21 (score: 0) — scraped-example separator, commented out so the file parses ---
def train_g(epoch_num=10,
            logdir='test',
            loss_name='JSD',
            show_iter=500,
            model_weight=None,
            load_d=False,
            load_g=False,
            device='cpu'):
    """Train only the generator against a fixed discriminator on MNIST.

    Both losses are computed every batch, but only the generator optimizer
    ever steps — the discriminator's optimizer is created and zeroed yet
    never stepped, so D stays fixed (hence the 'FixD' checkpoint prefix).

    Args:
        epoch_num: number of passes over the MNIST dataset.
        logdir: checkpoint directory name passed to save_checkpoint.
        loss_name: loss identifier forwarded to get_loss (e.g. 'JSD').
        show_iter: iterations between timed console logs and checkpoints.
        model_weight: optional checkpoint path to warm-start D and/or G.
        load_d: load the discriminator weights from `model_weight`.
        load_g: load the generator weights from `model_weight`.
        device: torch device string.
    """
    lr_d, lr_g = 0.01, 0.01
    batchsize, z_dim = 128, 96
    print('MNIST, discriminator lr: %.3f' % lr_d)
    loader = DataLoader(dataset=get_data(dataname='MNIST', path='../datas/mnist'),
                        batch_size=batchsize,
                        shuffle=True,
                        num_workers=4)
    net_d = dc_D().to(device)
    net_g = dc_G(z_dim=z_dim).to(device)
    net_d.apply(weights_init_d)
    net_g.apply(weights_init_g)
    if model_weight is not None:
        ckpt = torch.load(model_weight)
        if load_d:
            net_d.load_state_dict(ckpt['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            net_g.load_state_dict(ckpt['G'])
            print('Load G from %s' % model_weight)
    from datetime import datetime
    # Timestamp kept for the (disabled) tensorboard run naming below.
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    # writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' % (logdir, current_time, lr_g))
    opt_d = SGD(net_d.parameters(), lr=lr_d)
    opt_g = SGD(net_g.parameters(), lr=lr_g)
    tic = time.time()
    step = 0
    for _ in range(epoch_num):
        for batch in loader:
            real_x = batch[0].to(device)
            d_real = net_d(real_x)
            noise = torch.randn((real_x.shape[0], z_dim), device=device)
            fake_x = net_g(noise)
            d_fake = net_d(fake_x)
            D_loss = get_loss(name=loss_name,
                              g_loss=False,
                              d_real=d_real,
                              d_fake=d_fake)
            G_loss = get_loss(name=loss_name,
                              g_loss=True,
                              d_real=d_real,
                              d_fake=d_fake)
            # Clear both sets of gradients, but only the generator steps.
            opt_d.zero_grad()
            opt_g.zero_grad()
            G_loss.backward()
            opt_g.step()
            print('D_loss: {}, G_loss: {}'.format(D_loss.item(),
                                                  G_loss.item()))
            if step % show_iter == 0:
                elapsed = time.time() - tic
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs' %
                      (step, D_loss.item(), G_loss.item(), elapsed))
                tic = time.time()
                save_checkpoint(path=logdir,
                                name='FixD-%.3f_%d.pth' % (lr_d, step),
                                D=net_d,
                                G=net_g)
            step += 1
# --- Esempio n. 22 (score: 0) — scraped-example separator, commented out so the file parses ---
    def forward(self,
                num_updates,
                data_queue,
                data_event,
                process_event,
                tb=None,
                log_interval=100,
                checkpoint_interval=10000):
        """Distributed meta-training worker loop (one call per process).

        Repeatedly: waits for a meta-batch from `data_queue`, broadcasts the
        master (rank 0) weights to all workers, runs `num_updates` inner-loop
        adaptation steps on the support set, computes the query-set loss and
        accuracy, all-reduces gradients to rank 0, and lets rank 0 write the
        averaged meta-gradients back via `_write_grads`. Never returns.

        Args:
            num_updates: inner-loop adaptation steps on the support set.
            data_queue: queue yielding (support_x, support_y, query_x, query_y).
            data_event: set by the producer when a new batch is available.
            process_event: set by rank 0 once a batch is fully consumed.
            tb: optional tensorboard writer (used on rank 0 only).
            log_interval: iterations between tensorboard logs.
            checkpoint_interval: iterations between checkpoints on rank 0.
        """
        temp_grads = None

        while True:
            data_event.wait()
            data = data_queue.get()
            dist.barrier(async_op=True)

            if self.process_id == 0:
                original_state_dict = {}
                data_event.clear()

            if self.process_id == 0 and self.num_iter != 0 and self.num_iter % checkpoint_interval == 0:
                save_checkpoint(0,
                                self.model,
                                self.optimizer,
                                suffix=str(self.num_iter))

            # Broadcast weights from the master process to all others; rank 0
            # keeps a detached copy so the pre-adaptation state can be restored.
            for k, v in self.model.state_dict().items():
                if self.process_id == 0:
                    original_state_dict[k] = v.clone().detach()
                dist.broadcast(v, src=0, async_op=True)

            self.model.to(self.device)
            self.model.train()

            support_x, support_y, query_x, query_y = map(
                lambda x: torch.LongTensor(x).to(self.device), data)

            # Inner loop: adapt the model on the support set.
            for i in range(num_updates):
                self.meta_optimizer.zero_grad()
                pred_logits = self.model(input_ids=support_x,
                                         decoder_input_ids=support_y[:, :-1])
                pred_logits = pred_logits.contiguous().view(
                    -1, pred_logits.size(2))
                loss, n_correct = self.compute_mle_loss(pred_logits,
                                                        support_y[:, 1:],
                                                        smoothing=True)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.meta_optimizer.step()

            # Outer step: evaluate the adapted model on the query set.
            pred_logits = self.model(input_ids=query_x,
                                     decoder_input_ids=query_y[:, :-1])
            pred_logits = pred_logits.contiguous().view(
                -1, pred_logits.size(2))
            loss, n_correct = self.compute_mle_loss(pred_logits,
                                                    query_y[:, 1:],
                                                    smoothing=True)

            # BUG FIX: was `query_y[:1:]`, which counted tokens of only the
            # first sequence (BOS included). Count non-pad *target* tokens
            # over the whole batch, matching the loss targets above.
            non_pad_mask = query_y[:, 1:].ne(PAD_IDX)
            n_word = non_pad_mask.sum().item()

            acc = torch.FloatTensor([n_correct / n_word]).to(self.device)

            all_grads = autograd.grad(loss, self.model.parameters())
            dist.reduce(loss, 0, op=dist.ReduceOp.SUM, async_op=True)
            dist.reduce(acc, 0, op=dist.ReduceOp.SUM)

            # Sum gradients onto rank 0 and average by world size.
            for idx in range(len(all_grads)):
                dist.reduce(all_grads[idx].data,
                            0,
                            op=dist.ReduceOp.SUM,
                            async_op=True)
                all_grads[idx].data = (all_grads[idx].data / self.world_size)

            if self.process_id == 0 and tb is not None and self.num_iter % log_interval == 0:
                tb_mle_meta_batch(tb,
                                  loss.item() / self.world_size,
                                  acc / self.world_size, self.num_iter)

            if self.process_id == 0:
                self.num_iter += 1
                self._write_grads(original_state_dict, temp_grads,
                                  (query_x, query_y))
                # Batch fully consumed: let the producer enqueue more data.
                process_event.set()