Example #1
0
    load_dict = checkpoint_io.load(model_file)
  except FileNotFoundError:
    it = epoch_idx = -1
  else:
    it = load_dict.get('it', -1)
    epoch_idx = load_dict.get('epoch_idx', -1)
    fid_best = load_dict.get('fid_best', float('inf'))
    logger.load_stats('stats.p')
  
  # Additional losses to GAN loss
  losses_g, losses_g_2d = build_g_losses(generator.module, config)

  # Trainer
  trainer = Trainer(
    generator, discriminator, g_optimizer, d_optimizer,
    'standard', 'real', 10.,
    losses_g=losses_g, losses_g_2d=losses_g_2d,
    n_labels=nlabels
  )
  
  # Training loop
  print('Start training...')
  while True:
    epoch_idx += 1
    print('Start epoch %d...' % epoch_idx)
    
    for x_real, y in train_loader:
      it += 1
      
      x_real, y = x_real.to(device), y.to(device)
      y.clamp_(None, nlabels - 1)
      
Example #2
0
    epoch_idx = -1

    # Reinitialize model average if needed
    if (config['training']['take_model_average']
            and config['training']['model_average_reinit']):
        update_average(generator_test, generator, 0.)

    # Learning rate anneling
    g_scheduler = build_lr_scheduler(g_optimizer, config, last_epoch=it)
    d_scheduler = build_lr_scheduler(d_optimizer, config, last_epoch=it)

    # Trainer
    trainer = Trainer(generator,
                      discriminator,
                      g_optimizer,
                      d_optimizer,
                      gan_type=config['training']['gan_type'],
                      reg_type=config['training']['reg_type'],
                      reg_param=config['training']['reg_param'])

# Training loop
print('Start training...')
save_dir = config['training']['out_dir'] + '/models/'
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

get_parameter_number(generator)
get_parameter_number(discriminator)

inception_mean_all = []
inception_std_all = []
Example #3
0
def main():
    pp = pprint.PrettyPrinter(indent=1)
    pp.pprint({
        'data': config['data'],
        'generator': config['generator'],
        'discriminator': config['discriminator'],
        'clusterer': config['clusterer'],
        'training': config['training']
    })
    is_cuda = torch.cuda.is_available()

    # Short hands
    batch_size = config['training']['batch_size']
    log_every = config['training']['log_every']
    inception_every = config['training']['inception_every']
    backup_every = config['training']['backup_every']
    sample_nlabels = config['training']['sample_nlabels']
    nlabels = config['data']['nlabels']
    sample_nlabels = min(nlabels, sample_nlabels)

    checkpoint_dir = path.join(out_dir, 'chkpts')
    nepochs = args.nepochs

    # Create missing directories
    if not path.exists(out_dir):
        os.makedirs(out_dir)
    if not path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Logger
    checkpoint_io = CheckpointIO(checkpoint_dir=checkpoint_dir)

    device = torch.device("cuda:0" if is_cuda else "cpu")

    train_dataset, _ = get_dataset(
        name=config['data']['type'],
        data_dir=config['data']['train_dir'],
        size=config['data']['img_size'],
        deterministic=config['data']['deterministic'])

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=config['training']['nworkers'],
        shuffle=True,
        pin_memory=True,
        sampler=None,
        drop_last=True)

    # Create models
    generator, discriminator = build_models(config)

    # Put models on gpu if needed
    generator = generator.to(device)
    discriminator = discriminator.to(device)

    for name, module in discriminator.named_modules():
        if isinstance(module, nn.Sigmoid):
            print('Found sigmoid layer in discriminator; not compatible with BCE with logits')
            exit()

    g_optimizer, d_optimizer = build_optimizers(generator, discriminator, config)

    devices = [int(x) for x in args.devices]
    generator = nn.DataParallel(generator, device_ids=devices)
    discriminator = nn.DataParallel(discriminator, device_ids=devices)

    # Register modules to checkpoint
    checkpoint_io.register_modules(generator=generator,
                                   discriminator=discriminator,
                                   g_optimizer=g_optimizer,
                                   d_optimizer=d_optimizer)

    # Logger
    logger = Logger(log_dir=path.join(out_dir, 'logs'),
                    img_dir=path.join(out_dir, 'imgs'),
                    monitoring=config['training']['monitoring'],
                    monitoring_dir=path.join(out_dir, 'monitoring'))

    # Distributions
    ydist = get_ydist(nlabels, device=device)
    zdist = get_zdist(config['z_dist']['type'], config['z_dist']['dim'], device=device)

    ntest = config['training']['ntest']
    x_test, y_test = utils.get_nsamples(train_loader, ntest)
    x_cluster, y_cluster = utils.get_nsamples(train_loader, config['clusterer']['nimgs'])
    x_test, y_test = x_test.to(device), y_test.to(device)
    z_test = zdist.sample((ntest, ))
    utils.save_images(x_test, path.join(out_dir, 'real.png'))
    logger.add_imgs(x_test, 'gt', 0)

    # Test generator
    if config['training']['take_model_average']:
        print('Taking model average')
        bad_modules = [nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d]
        for model in [generator, discriminator]:
            for name, module in model.named_modules():
                for bad_module in bad_modules:
                    if isinstance(module, bad_module):
                        print('Batch norm in discriminator not compatible with exponential moving average')
                        exit()
        generator_test = copy.deepcopy(generator)
        checkpoint_io.register_modules(generator_test=generator_test)
    else:
        generator_test = generator

    clusterer = get_clusterer(config)(discriminator=discriminator,
                                      x_cluster=x_cluster,
                                      x_labels=y_cluster,
                                      gt_nlabels=config['data']['nlabels'],
                                      **config['clusterer']['kwargs'])

    # Load checkpoint if it exists
    it = utils.get_most_recent(checkpoint_dir, 'model') if args.model_it == -1 else args.model_it
    it, epoch_idx, loaded_clusterer = checkpoint_io.load_models(it=it, load_samples='supervised' != config['clusterer']['name'])

    if loaded_clusterer is None:
        print('Initializing new clusterer. The first clustering can be quite slow.')
        clusterer.recluster(discriminator=discriminator)
        checkpoint_io.save_clusterer(clusterer, it=0)
        np.savez(os.path.join(checkpoint_dir, 'cluster_samples.npz'), x=x_cluster)
    else:
        print('Using loaded clusterer')
        clusterer = loaded_clusterer

    # Evaluator
    evaluator = Evaluator(
        generator_test,
        zdist,
        ydist,
        train_loader=train_loader,
        clusterer=clusterer,
        batch_size=batch_size,
        device=device,
        inception_nsamples=config['training']['inception_nsamples'])

    # Trainer
    trainer = Trainer(generator,
                      discriminator,
                      g_optimizer,
                      d_optimizer,
                      gan_type=config['training']['gan_type'],
                      reg_type=config['training']['reg_type'],
                      reg_param=config['training']['reg_param'])

    # Training loop
    print('Start training...')
    while it < args.nepochs * len(train_loader):
        epoch_idx += 1

        for x_real, y in train_loader:
            it += 1

            x_real, y = x_real.to(device), y.to(device)
            z = zdist.sample((batch_size, ))
            y = clusterer.get_labels(x_real, y).to(device)

            # Discriminator updates
            dloss, reg = trainer.discriminator_trainstep(x_real, y, z)
            logger.add('losses', 'discriminator', dloss, it=it)
            logger.add('losses', 'regularizer', reg, it=it)

            # Generators updates
            gloss = trainer.generator_trainstep(y, z)
            logger.add('losses', 'generator', gloss, it=it)

            if config['training']['take_model_average']:
                update_average(generator_test, generator, beta=config['training']['model_average_beta'])

            # Print stats
            if it % log_every == 0:
                g_loss_last = logger.get_last('losses', 'generator')
                d_loss_last = logger.get_last('losses', 'discriminator')
                d_reg_last = logger.get_last('losses', 'regularizer')
                print('[epoch %0d, it %4d] g_loss = %.4f, d_loss = %.4f, reg=%.4f'
                      % (epoch_idx, it, g_loss_last, d_loss_last, d_reg_last))

            if it % config['training']['recluster_every'] == 0 and it > config['training']['burnin_time']:
                # print cluster distribution for online methods
                if it % 100 == 0 and config['training']['recluster_every'] <= 100:
                    print(f'[epoch {epoch_idx}, it {it}], distribution: {clusterer.get_label_distribution(x_real)}')
                clusterer.recluster(discriminator=discriminator, x_batch=x_real)

            # (i) Sample if necessary
            if it % config['training']['sample_every'] == 0:
                print('Creating samples...')
                x = evaluator.create_samples(z_test, y_test)
                x = evaluator.create_samples(z_test, clusterer.get_labels(x_test, y_test).to(device))
                logger.add_imgs(x, 'all', it)

                for y_inst in range(sample_nlabels):
                    x = evaluator.create_samples(z_test, y_inst)
                    logger.add_imgs(x, '%04d' % y_inst, it)

            # (ii) Compute inception if necessary
            if it % inception_every == 0 and it > 0:
                print('PyTorch Inception score...')
                inception_mean, inception_std = evaluator.compute_inception_score()
                logger.add('metrics', 'pt_inception_mean', inception_mean, it=it)
                logger.add('metrics', 'pt_inception_stddev', inception_std, it=it)
                print(f'[epoch {epoch_idx}, it {it}] pt_inception_mean: {inception_mean}, pt_inception_stddev: {inception_std}')

            # (iii) Backup if necessary
            if it % backup_every == 0:
                print('Saving backup...')
                checkpoint_io.save('model_%08d.pt' % it, it=it)
                checkpoint_io.save_clusterer(clusterer, int(it))
                logger.save_stats('stats_%08d.p' % it)

                if it > 0: checkpoint_io.save('model.pt', it=it)
Example #4
0
def main(outdir):
    for subdir in ['all', 'snapshots', 'clusters']:
        if not os.path.exists(os.path.join(outdir, subdir)):
            os.makedirs(os.path.join(outdir, subdir), exist_ok=True)

    if data_type == 'grid':
        get_data = inputs.get_data_grid
        percent_good = evaluation.percent_good_grid
    elif data_type == 'ring':
        get_data = inputs.get_data_ring
        percent_good = evaluation.percent_good_ring
    else:
        raise NotImplementedError()

    zdist = distributions.Normal(torch.zeros(z_dim, device=device),
                                 torch.ones(z_dim, device=device))
    z_test = zdist.sample((test_batch_size, ))

    x_test, y_test = get_test(get_data=get_data,
                              batch_size=test_batch_size,
                              variance=variance,
                              k_value=k_value,
                              device=device)

    x_cluster, _ = get_test(get_data=get_data,
                            batch_size=10000,
                            variance=variance,
                            k_value=k_value,
                            device=device)

    train_loader = get_dataset(get_data=get_data,
                               batch_size=train_batch_size,
                               npts=npts,
                               variance=variance,
                               k_value=k_value)

    def train(trainer, g, d, clusterer, exp_dir):
        it = 0
        if os.path.exists(os.path.join(exp_dir, 'log.txt')):
            os.remove(os.path.join(exp_dir, 'log.txt'))

        for epoch in range(nepochs):
            for x_real, y in train_loader:
                z = zdist.sample((train_batch_size, ))
                x_real, y = x_real.to(device), y.to(device)
                y = clusterer.get_labels(x_real, y)

                dloss, _ = trainer.discriminator_trainstep(x_real, y, z)
                gloss = trainer.generator_trainstep(y, z)

                if it % args.recluster_every == 0 and args.clusterer != 'supervised':
                    if args.clusterer != 'burnin' or it >= args.burnin_time:
                        clusterer.recluster(discriminator, x_batch=x_real)

                if it % 1000 == 0:
                    x_fake = g(z_test, clusterer.get_labels(
                        x_test, y_test)).detach().cpu().numpy()

                    visualize_generated(x_fake,
                                        x_test.detach().cpu().numpy(), y, it,
                                        exp_dir)

                    visualize_clusters(x_test.detach().cpu().numpy(),
                                       clusterer.get_labels(x_test, y_test),
                                       it, exp_dir)

                    torch.save(
                        {
                            'generator': g.state_dict(),
                            'discriminator': d.state_dict(),
                            'g_optimizer': g_optimizer.state_dict(),
                            'd_optimizer': d_optimizer.state_dict()
                        },
                        os.path.join(exp_dir, 'snapshots', 'model_%d.pt' % it))

                if it % 1000 == 0:
                    g.eval()
                    d.eval()

                    x_fake = g(z_test, clusterer.get_labels(
                        x_test, y_test)).detach().cpu().numpy()
                    percent, modes, kl = percent_good(x_fake, var=variance)
                    log_message = f'[epoch {epoch} it {it}] dloss = {dloss}, gloss = {gloss}, prop_real = {percent}, modes = {modes}, kl = {kl}'
                    with open(os.path.join(exp_dir, 'log.txt'), 'a+') as f:
                        f.write(log_message + '\n')
                    print(log_message)

                it += 1

    # train a G/D from scratch
    generator, discriminator = get_models(args.model_type, 'conditional',
                                          num_clusters, args.d_act_dim, device)
    g_optimizer, d_optimizer = get_optimizers(generator, discriminator)
    trainer = Trainer(generator,
                      discriminator,
                      g_optimizer,
                      d_optimizer,
                      gan_type='standard',
                      reg_type='none',
                      reg_param=0)
    clusterer = clusterer_dict[args.clusterer](discriminator=discriminator,
                                               k_value=num_clusters,
                                               x_cluster=x_cluster)
    clusterer.recluster(discriminator=discriminator)
    train(trainer, generator, discriminator, clusterer, os.path.join(outdir))
Example #5
0
        epoch_idx = -1

        # Reinitialize model average if needed
        if (config['training']['take_model_average']
                and config['training']['model_average_reinit']):
            update_average(generator_test, generator, 0.)

        # Learning rate anneling
        g_scheduler = build_lr_scheduler(g_optimizer, config, last_epoch=it)
        d_scheduler = build_lr_scheduler(d_optimizer, config, last_epoch=it)

        # Trainer
        trainer = Trainer(generator,
                          discriminator,
                          g_optimizer,
                          d_optimizer,
                          gan_type=config['training']['gan_type'],
                          reg_type=config['training']['reg_type'],
                          reg_param=config['training']['reg_param'],
                          D_fix_layer=config['discriminator']['layers'])

    # Training loop
    print('Start training...')
    save_dir = config['training']['out_dir'] + '/models/'
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    FLAG = 500

    inception_mean_all = []
    inception_std_all = []
    fid_all = []
Example #6
0
    logger.load_stats('stats.p')

# Reinitialize model average if needed
if (config['training']['take_model_average']
        and config['training']['model_average_reinit']):
    update_average(generator_test, generator, 0.)

# Learning rate anneling
g_scheduler = build_lr_scheduler(g_optimizer, config, last_epoch=it)
d_scheduler = build_lr_scheduler(d_optimizer, config, last_epoch=it)

# Trainer
trainer = Trainer(dvae,
                  generator,
                  discriminator,
                  g_optimizer,
                  d_optimizer,
                  reg_param=config['training']['reg_param'],
                  w_info=config['training']['w_info'])

# Training loop
tqdm.write('Start training...')
pbar = tqdm(total=max_iter)
if it > 0:
    pbar.update(it)

out = False
while not out:
    epoch_idx += 1
    tqdm.write('Start epoch %d...' % epoch_idx)
Example #7
0
        config['training']['reg_param'] = reg_param

# Reinitialize model average if needed
if (config['training']['take_model_average']
        and config['training']['model_average_reinit']):
    update_average(generator_test, generator, 0.)

# Learning rate anneling
g_scheduler = build_lr_scheduler(g_optimizer, config, last_epoch=it)
d_scheduler = build_lr_scheduler(d_optimizer, config, last_epoch=it)

# Trainer
trainer = Trainer(
    generator, discriminator, g_optimizer, d_optimizer,
    gan_type=config['training']['gan_type'],
    reg_type=config['training']['reg_type'],
    reg_param=config['training']['reg_param'],
    adaptive_beta=adaptive_beta,
    **config['training']['kwargs']
)

# Training loop
print('Start training...')
while True:
    epoch_idx += 1
    print('Start epoch %d...' % epoch_idx)

    for x_real, y in train_loader:
        it += 1
        g_scheduler.step()
        d_scheduler.step()