# Load checkpoint if it exists
try:
    load_dict = checkpoint_io.load(model_file)
except FileNotFoundError:
    it = epoch_idx = -1
else:
    it = load_dict.get('it', -1)
    epoch_idx = load_dict.get('epoch_idx', -1)
    fid_best = load_dict.get('fid_best', float('inf'))
    logger.load_stats('stats.p')

# Additional losses to GAN loss
losses_g, losses_g_2d = build_g_losses(generator.module, config)

# Trainer
trainer = Trainer(
    generator, discriminator, g_optimizer, d_optimizer,
    'standard', 'real', 10.,
    losses_g=losses_g, losses_g_2d=losses_g_2d,
    n_labels=nlabels
)

# Training loop
print('Start training...')
while True:
    epoch_idx += 1
    print('Start epoch %d...' % epoch_idx)

    for x_real, y in train_loader:
        it += 1
        x_real, y = x_real.to(device), y.to(device)
        y.clamp_(None, nlabels - 1)
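# Note on the Trainer call above: unlike the other snippets, the regularization
# settings are passed positionally; by analogy with the keyword form used
# elsewhere, 'standard', 'real', and 10. presumably map to gan_type, reg_type,
# and reg_param. A reg_type of 'real' is commonly an R1 gradient penalty on
# real samples. A minimal sketch under that assumption -- the helper name
# r1_penalty and the commented usage are illustrative, not from this codebase.
import torch

def r1_penalty(d_out_real, x_real):
    # Squared norm of the gradient of D's real-sample logits w.r.t. the inputs.
    grad, = torch.autograd.grad(outputs=d_out_real.sum(),
                                inputs=x_real,
                                create_graph=True)
    return grad.pow(2).reshape(grad.size(0), -1).sum(1).mean()

# Hypothetical usage inside a discriminator step:
#   x_real.requires_grad_()
#   d_real = discriminator(x_real, y)
#   reg = 10. * r1_penalty(d_real, x_real)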
epoch_idx = -1

# Reinitialize model average if needed
if (config['training']['take_model_average']
        and config['training']['model_average_reinit']):
    update_average(generator_test, generator, 0.)

# Learning rate annealing
g_scheduler = build_lr_scheduler(g_optimizer, config, last_epoch=it)
d_scheduler = build_lr_scheduler(d_optimizer, config, last_epoch=it)

# Trainer
trainer = Trainer(generator, discriminator, g_optimizer, d_optimizer,
                  gan_type=config['training']['gan_type'],
                  reg_type=config['training']['reg_type'],
                  reg_param=config['training']['reg_param'])

# Training loop
print('Start training...')

save_dir = config['training']['out_dir'] + '/models/'
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

get_parameter_number(generator)
get_parameter_number(discriminator)

inception_mean_all = []
inception_std_all = []
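# The snippet above reinitializes the model average via
# update_average(generator_test, generator, 0.), and the training loop later
# calls it with a beta close to one, which suggests an exponential moving
# average (EMA) of generator weights. A minimal sketch of such a helper,
# assuming that interpretation -- not the repository's actual implementation.
import torch

@torch.no_grad()
def update_average(model_tgt, model_src, beta):
    # tgt = beta * tgt + (1 - beta) * src; with beta=0. this copies the source
    # weights, which is the "reinitialization" performed above.
    src_params = dict(model_src.named_parameters())
    for name, p_tgt in model_tgt.named_parameters():
        p_tgt.copy_(beta * p_tgt + (1. - beta) * src_params[name])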
def main():
    pp = pprint.PrettyPrinter(indent=1)
    pp.pprint({
        'data': config['data'],
        'generator': config['generator'],
        'discriminator': config['discriminator'],
        'clusterer': config['clusterer'],
        'training': config['training']
    })
    is_cuda = torch.cuda.is_available()

    # Short hands
    batch_size = config['training']['batch_size']
    log_every = config['training']['log_every']
    inception_every = config['training']['inception_every']
    backup_every = config['training']['backup_every']
    sample_nlabels = config['training']['sample_nlabels']
    nlabels = config['data']['nlabels']
    sample_nlabels = min(nlabels, sample_nlabels)

    checkpoint_dir = path.join(out_dir, 'chkpts')
    nepochs = args.nepochs

    # Create missing directories
    if not path.exists(out_dir):
        os.makedirs(out_dir)
    if not path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Logger
    checkpoint_io = CheckpointIO(checkpoint_dir=checkpoint_dir)

    device = torch.device("cuda:0" if is_cuda else "cpu")

    train_dataset, _ = get_dataset(
        name=config['data']['type'],
        data_dir=config['data']['train_dir'],
        size=config['data']['img_size'],
        deterministic=config['data']['deterministic'])

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=config['training']['nworkers'],
        shuffle=True,
        pin_memory=True,
        sampler=None,
        drop_last=True)

    # Create models
    generator, discriminator = build_models(config)

    # Put models on gpu if needed
    generator = generator.to(device)
    discriminator = discriminator.to(device)

    for name, module in discriminator.named_modules():
        if isinstance(module, nn.Sigmoid):
            print('Found sigmoid layer in discriminator; not compatible with BCE with logits')
            exit()

    g_optimizer, d_optimizer = build_optimizers(generator, discriminator, config)

    devices = [int(x) for x in args.devices]
    generator = nn.DataParallel(generator, device_ids=devices)
    discriminator = nn.DataParallel(discriminator, device_ids=devices)

    # Register modules to checkpoint
    checkpoint_io.register_modules(generator=generator,
                                   discriminator=discriminator,
                                   g_optimizer=g_optimizer,
                                   d_optimizer=d_optimizer)

    # Logger
    logger = Logger(log_dir=path.join(out_dir, 'logs'),
                    img_dir=path.join(out_dir, 'imgs'),
                    monitoring=config['training']['monitoring'],
                    monitoring_dir=path.join(out_dir, 'monitoring'))

    # Distributions
    ydist = get_ydist(nlabels, device=device)
    zdist = get_zdist(config['z_dist']['type'], config['z_dist']['dim'], device=device)

    ntest = config['training']['ntest']
    x_test, y_test = utils.get_nsamples(train_loader, ntest)
    x_cluster, y_cluster = utils.get_nsamples(train_loader, config['clusterer']['nimgs'])
    x_test, y_test = x_test.to(device), y_test.to(device)
    z_test = zdist.sample((ntest, ))
    utils.save_images(x_test, path.join(out_dir, 'real.png'))
    logger.add_imgs(x_test, 'gt', 0)

    # Test generator
    if config['training']['take_model_average']:
        print('Taking model average')
        bad_modules = [nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d]
        for model in [generator, discriminator]:
            for name, module in model.named_modules():
                for bad_module in bad_modules:
                    if isinstance(module, bad_module):
                        print('Batch norm in discriminator not compatible with exponential moving average')
                        exit()
        generator_test = copy.deepcopy(generator)
        checkpoint_io.register_modules(generator_test=generator_test)
    else:
        generator_test = generator

    clusterer = get_clusterer(config)(discriminator=discriminator,
                                      x_cluster=x_cluster,
                                      x_labels=y_cluster,
                                      gt_nlabels=config['data']['nlabels'],
                                      **config['clusterer']['kwargs'])

    # Load checkpoint if it exists
    it = utils.get_most_recent(checkpoint_dir, 'model') if args.model_it == -1 else args.model_it
    it, epoch_idx, loaded_clusterer = checkpoint_io.load_models(
        it=it, load_samples='supervised' != config['clusterer']['name'])

    if loaded_clusterer is None:
        print('Initializing new clusterer. The first clustering can be quite slow.')
        clusterer.recluster(discriminator=discriminator)
        checkpoint_io.save_clusterer(clusterer, it=0)
        np.savez(os.path.join(checkpoint_dir, 'cluster_samples.npz'), x=x_cluster)
    else:
        print('Using loaded clusterer')
        clusterer = loaded_clusterer

    # Evaluator
    evaluator = Evaluator(
        generator_test,
        zdist,
        ydist,
        train_loader=train_loader,
        clusterer=clusterer,
        batch_size=batch_size,
        device=device,
        inception_nsamples=config['training']['inception_nsamples'])

    # Trainer
    trainer = Trainer(generator, discriminator, g_optimizer, d_optimizer,
                      gan_type=config['training']['gan_type'],
                      reg_type=config['training']['reg_type'],
                      reg_param=config['training']['reg_param'])

    # Training loop
    print('Start training...')
    while it < args.nepochs * len(train_loader):
        epoch_idx += 1

        for x_real, y in train_loader:
            it += 1

            x_real, y = x_real.to(device), y.to(device)
            z = zdist.sample((batch_size, ))
            y = clusterer.get_labels(x_real, y).to(device)

            # Discriminator updates
            dloss, reg = trainer.discriminator_trainstep(x_real, y, z)
            logger.add('losses', 'discriminator', dloss, it=it)
            logger.add('losses', 'regularizer', reg, it=it)

            # Generator updates
            gloss = trainer.generator_trainstep(y, z)
            logger.add('losses', 'generator', gloss, it=it)

            if config['training']['take_model_average']:
                update_average(generator_test, generator,
                               beta=config['training']['model_average_beta'])

            # Print stats
            if it % log_every == 0:
                g_loss_last = logger.get_last('losses', 'generator')
                d_loss_last = logger.get_last('losses', 'discriminator')
                d_reg_last = logger.get_last('losses', 'regularizer')
                print('[epoch %0d, it %4d] g_loss = %.4f, d_loss = %.4f, reg=%.4f'
                      % (epoch_idx, it, g_loss_last, d_loss_last, d_reg_last))

            if it % config['training']['recluster_every'] == 0 and it > config['training']['burnin_time']:
                # print cluster distribution for online methods
                if it % 100 == 0 and config['training']['recluster_every'] <= 100:
                    print(f'[epoch {epoch_idx}, it {it}], distribution: {clusterer.get_label_distribution(x_real)}')
                clusterer.recluster(discriminator=discriminator, x_batch=x_real)

            # (i) Sample if necessary
            if it % config['training']['sample_every'] == 0:
                print('Creating samples...')
                x = evaluator.create_samples(z_test, y_test)
                x = evaluator.create_samples(z_test, clusterer.get_labels(x_test, y_test).to(device))
                logger.add_imgs(x, 'all', it)
                for y_inst in range(sample_nlabels):
                    x = evaluator.create_samples(z_test, y_inst)
                    logger.add_imgs(x, '%04d' % y_inst, it)

            # (ii) Compute inception if necessary
            if it % inception_every == 0 and it > 0:
                print('PyTorch Inception score...')
                inception_mean, inception_std = evaluator.compute_inception_score()
                logger.add('metrics', 'pt_inception_mean', inception_mean, it=it)
                logger.add('metrics', 'pt_inception_stddev', inception_std, it=it)
                print(f'[epoch {epoch_idx}, it {it}] pt_inception_mean: {inception_mean}, pt_inception_stddev: {inception_std}')

            # (iii) Backup if necessary
            if it % backup_every == 0:
                print('Saving backup...')
                checkpoint_io.save('model_%08d.pt' % it, it=it)
                checkpoint_io.save_clusterer(clusterer, int(it))
                logger.save_stats('stats_%08d.p' % it)

                if it > 0:
                    checkpoint_io.save('model.pt', it=it)
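# For context on what discriminator_trainstep and generator_trainstep
# presumably optimize when gan_type is 'standard': the sigmoid check earlier
# ('not compatible with BCE with logits') points to the non-saturating GAN
# loss computed on raw discriminator logits. A hedged sketch of those losses;
# the function name and the three-logit split are illustrative only.
import torch
import torch.nn.functional as F

def standard_gan_losses(d_real, d_fake_for_d, d_fake_for_g):
    # Discriminator: push real logits toward 1 and fake logits toward 0.
    dloss = F.binary_cross_entropy_with_logits(d_real, torch.ones_like(d_real)) + \
            F.binary_cross_entropy_with_logits(d_fake_for_d, torch.zeros_like(d_fake_for_d))
    # Generator: non-saturating objective, push fake logits toward 1.
    gloss = F.binary_cross_entropy_with_logits(d_fake_for_g, torch.ones_like(d_fake_for_g))
    return dloss, gloss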
def main(outdir):
    for subdir in ['all', 'snapshots', 'clusters']:
        if not os.path.exists(os.path.join(outdir, subdir)):
            os.makedirs(os.path.join(outdir, subdir), exist_ok=True)

    if data_type == 'grid':
        get_data = inputs.get_data_grid
        percent_good = evaluation.percent_good_grid
    elif data_type == 'ring':
        get_data = inputs.get_data_ring
        percent_good = evaluation.percent_good_ring
    else:
        raise NotImplementedError()

    zdist = distributions.Normal(torch.zeros(z_dim, device=device),
                                 torch.ones(z_dim, device=device))
    z_test = zdist.sample((test_batch_size, ))

    x_test, y_test = get_test(get_data=get_data,
                              batch_size=test_batch_size,
                              variance=variance,
                              k_value=k_value,
                              device=device)
    x_cluster, _ = get_test(get_data=get_data,
                            batch_size=10000,
                            variance=variance,
                            k_value=k_value,
                            device=device)
    train_loader = get_dataset(get_data=get_data,
                               batch_size=train_batch_size,
                               npts=npts,
                               variance=variance,
                               k_value=k_value)

    def train(trainer, g, d, clusterer, exp_dir):
        it = 0
        if os.path.exists(os.path.join(exp_dir, 'log.txt')):
            os.remove(os.path.join(exp_dir, 'log.txt'))

        for epoch in range(nepochs):
            for x_real, y in train_loader:
                z = zdist.sample((train_batch_size, ))
                x_real, y = x_real.to(device), y.to(device)
                y = clusterer.get_labels(x_real, y)

                dloss, _ = trainer.discriminator_trainstep(x_real, y, z)
                gloss = trainer.generator_trainstep(y, z)

                if it % args.recluster_every == 0 and args.clusterer != 'supervised':
                    if args.clusterer != 'burnin' or it >= args.burnin_time:
                        clusterer.recluster(discriminator, x_batch=x_real)

                if it % 1000 == 0:
                    x_fake = g(z_test, clusterer.get_labels(x_test, y_test)).detach().cpu().numpy()
                    visualize_generated(x_fake, x_test.detach().cpu().numpy(), y, it, exp_dir)
                    visualize_clusters(x_test.detach().cpu().numpy(),
                                       clusterer.get_labels(x_test, y_test), it, exp_dir)

                    torch.save(
                        {
                            'generator': g.state_dict(),
                            'discriminator': d.state_dict(),
                            'g_optimizer': g_optimizer.state_dict(),
                            'd_optimizer': d_optimizer.state_dict()
                        }, os.path.join(exp_dir, 'snapshots', 'model_%d.pt' % it))

                if it % 1000 == 0:
                    g.eval()
                    d.eval()
                    x_fake = g(z_test, clusterer.get_labels(x_test, y_test)).detach().cpu().numpy()
                    percent, modes, kl = percent_good(x_fake, var=variance)
                    log_message = f'[epoch {epoch} it {it}] dloss = {dloss}, gloss = {gloss}, prop_real = {percent}, modes = {modes}, kl = {kl}'
                    with open(os.path.join(exp_dir, 'log.txt'), 'a+') as f:
                        f.write(log_message + '\n')
                    print(log_message)

                it += 1

    # train a G/D from scratch
    generator, discriminator = get_models(args.model_type, 'conditional',
                                          num_clusters, args.d_act_dim, device)
    g_optimizer, d_optimizer = get_optimizers(generator, discriminator)
    trainer = Trainer(generator, discriminator, g_optimizer, d_optimizer,
                      gan_type='standard',
                      reg_type='none',
                      reg_param=0)
    clusterer = clusterer_dict[args.clusterer](discriminator=discriminator,
                                               k_value=num_clusters,
                                               x_cluster=x_cluster)
    clusterer.recluster(discriminator=discriminator)

    train(trainer, generator, discriminator, clusterer, os.path.join(outdir))
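# All of these scripts drive the clusterer through the same two calls,
# recluster(...) and get_labels(...). As a rough illustration of that
# interface only, here is a self-contained k-means stand-in that clusters
# flattened inputs; the real clusterers presumably operate on discriminator
# features, and every name below is hypothetical.
import torch
from sklearn.cluster import KMeans

class SimpleKMeansClusterer:
    def __init__(self, discriminator=None, k_value=10, x_cluster=None, **kwargs):
        self.k = k_value
        self.x_cluster = x_cluster
        self.kmeans = None

    def recluster(self, discriminator=None, x_batch=None):
        # Fit k-means on the held-out clustering pool (flattened inputs here).
        data = self.x_cluster.reshape(self.x_cluster.size(0), -1).cpu().numpy()
        self.kmeans = KMeans(n_clusters=self.k, n_init=10).fit(data)

    def get_labels(self, x, y=None):
        # Assign each sample in the batch to its nearest cluster centroid.
        data = x.reshape(x.size(0), -1).cpu().numpy()
        return torch.from_numpy(self.kmeans.predict(data)).long()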
epoch_idx = -1

# Reinitialize model average if needed
if (config['training']['take_model_average']
        and config['training']['model_average_reinit']):
    update_average(generator_test, generator, 0.)

# Learning rate annealing
g_scheduler = build_lr_scheduler(g_optimizer, config, last_epoch=it)
d_scheduler = build_lr_scheduler(d_optimizer, config, last_epoch=it)

# Trainer
trainer = Trainer(generator, discriminator, g_optimizer, d_optimizer,
                  gan_type=config['training']['gan_type'],
                  reg_type=config['training']['reg_type'],
                  reg_param=config['training']['reg_param'],
                  D_fix_layer=config['discriminator']['layers'])

# Training loop
print('Start training...')

save_dir = config['training']['out_dir'] + '/models/'
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

FLAG = 500
inception_mean_all = []
inception_std_all = []
fid_all = []
logger.load_stats('stats.p')

# Reinitialize model average if needed
if (config['training']['take_model_average']
        and config['training']['model_average_reinit']):
    update_average(generator_test, generator, 0.)

# Learning rate annealing
g_scheduler = build_lr_scheduler(g_optimizer, config, last_epoch=it)
d_scheduler = build_lr_scheduler(d_optimizer, config, last_epoch=it)

# Trainer
trainer = Trainer(dvae, generator, discriminator, g_optimizer, d_optimizer,
                  reg_param=config['training']['reg_param'],
                  w_info=config['training']['w_info'])

# Training loop
tqdm.write('Start training...')
pbar = tqdm(total=max_iter)
if it > 0:
    pbar.update(it)

out = False
while not out:
    epoch_idx += 1
    tqdm.write('Start epoch %d...' % epoch_idx)
config['training']['reg_param'] = reg_param

# Reinitialize model average if needed
if (config['training']['take_model_average']
        and config['training']['model_average_reinit']):
    update_average(generator_test, generator, 0.)

# Learning rate annealing
g_scheduler = build_lr_scheduler(g_optimizer, config, last_epoch=it)
d_scheduler = build_lr_scheduler(d_optimizer, config, last_epoch=it)

# Trainer
trainer = Trainer(
    generator, discriminator, g_optimizer, d_optimizer,
    gan_type=config['training']['gan_type'],
    reg_type=config['training']['reg_type'],
    reg_param=config['training']['reg_param'],
    adaptive_beta=adaptive_beta,
    **config['training']['kwargs']
)

# Training loop
print('Start training...')

while True:
    epoch_idx += 1
    print('Start epoch %d...' % epoch_idx)

    for x_real, y in train_loader:
        it += 1
        g_scheduler.step()
        d_scheduler.step()