def GetLoss(self):
    """Build the per-GPU training and evaluation losses.

    For every GPU, runs the model twice on the same placeholders:
    once with live weights and dropout (training), once with EMA
    weights and no dropout (evaluation).

    Returns:
        (train_losses, test_losses): two lists of loss tensors,
        one entry per GPU.
    """
    train_losses, test_losses = [], []
    for gpu in range(self.args.nr_gpu):
        with tf.device('/gpu:%d' % gpu):
            # Training graph: current parameters, dropout active.
            # NOTE(review): sum_all=False yields a per-image loss here while
            # the test loss below is fully summed — confirm callers expect
            # this asymmetry.
            params = self.model(self.xs[gpu], self.hs[gpu], ema=None,
                                dropout_p=self.args.dropout_p,
                                **self.model_opt)
            train_losses.append(
                nn.discretized_mix_logistic_loss(self.xs[gpu], params,
                                                 sum_all=False))
            # Evaluation graph: EMA (Polyak-averaged) parameters, no dropout.
            params = self.model(self.xs[gpu], self.hs[gpu], ema=self.ema,
                                dropout_p=0., **self.model_opt)
            test_losses.append(
                nn.discretized_mix_logistic_loss(self.xs[gpu], params))
    return train_losses, test_losses
def GetOverallLoss(self):
    """Build train/test losses on every GPU, sum them on /gpu:0, and set
    ``self.bits_per_dim`` / ``self.bits_per_dim_test`` as scalar
    bits-per-dimension tensors.

    Side effects:
        self.bits_per_dim       -- scalar training bits/dim tensor
        self.bits_per_dim_test  -- scalar evaluation bits/dim tensor
    """
    loss_gen = []
    loss_gen_test = []
    for i in range(self.args.nr_gpu):
        with tf.device('/gpu:%d' % i):
            # Training graph: current parameters, dropout active.
            gen_par = self.model(self.xs[i], self.hs[i], ema=None,
                                 dropout_p=self.args.dropout_p,
                                 **self.model_opt)
            # BUG FIX: previously passed sum_all=False, producing a per-image
            # loss vector for training while the test loss below was a summed
            # scalar; dividing both by the same normalization constant made
            # self.bits_per_dim a [batch]-shaped tensor inconsistent with the
            # scalar self.bits_per_dim_test. Use the summed reduction for both.
            loss_gen.append(
                nn.discretized_mix_logistic_loss(self.xs[i], gen_par))
            # Evaluation graph: EMA parameters, no dropout.
            gen_par = self.model(self.xs[i], self.hs[i], ema=self.ema,
                                 dropout_p=0., **self.model_opt)
            loss_gen_test.append(
                nn.discretized_mix_logistic_loss(self.xs[i], gen_par))
    # Accumulate the per-GPU losses onto /gpu:0.
    with tf.device('/gpu:0'):
        for i in range(1, self.args.nr_gpu):
            loss_gen[0] += loss_gen[i]
            loss_gen_test[0] += loss_gen_test[i]
    # Convert total negative log-likelihood (nats over all images on all
    # GPUs) to bits per dimension.
    norm_const = (self.args.nr_gpu * np.log(2.) *
                  np.prod(self.image_shape) * self.args.batch_size)
    self.bits_per_dim = loss_gen[0] / norm_const
    self.bits_per_dim_test = loss_gen_test[0] / norm_const
dropout_p=args.dropout_p, **encoder_opt) gen_par = model(xs[i], encoder.pred, ema=None, dropout_p=args.dropout_p, **model_opt) loss_gen_reg.append(encoder.reg_loss) loss_gen_elbo.append(encoder.elbo_loss) else: gen_par = model(xs[i], hs[i], ema=None, dropout_p=args.dropout_p, **model_opt) loss_gen.append(nn.discretized_mix_logistic_loss(xs[i], gen_par)) # gradients if args.use_autoencoder: total_loss = loss_gen[i] + loss_gen_reg[i] else: total_loss = loss_gen[i] grads.append(tf.gradients(total_loss, all_params)) # test if args.use_autoencoder: encoder = encoder_model(encoder_x[i], ema=ema, dropout_p=0., **encoder_opt) gen_par = model(xs[i], encoder.pred, ema=ema,
def main(args):
    """Distributed (MPI) multi-GPU PixelCNN++ training entry point.

    Builds the model on every local GPU, averages gradients across MPI
    tasks, trains with Adam + EMA, and periodically saves checkpoints and
    samples (task 0 only).

    Args:
        args: parsed argparse namespace (data/dirs, model sizes, training
            hyper-parameters). See the lprint dump below for the full set.
    """
    import os
    import sys
    import time
    import json
    from mpi4py import MPI
    import numpy as np
    import tensorflow as tf
    from tqdm import trange
    import pixel_cnn_pp.nn as nn
    import pixel_cnn_pp.plotting as plotting
    from pixel_cnn_pp import model as pxpp_models
    import data.cifar10_data as cifar10_data
    import data.imagenet_data as imagenet_data
    import tf_utils as tfu

    comm = MPI.COMM_WORLD
    num_tasks, task_id = comm.Get_size(), comm.Get_rank()
    save_dir = args.save_dir
    # Only the rank-0 task creates the save dir and writes the log file.
    if task_id == 0:
        os.makedirs(save_dir)
        f_log = open(os.path.join(save_dir, 'print.log'), 'w')

    def lprint(*a, **kw):
        # Log to stdout and to the log file, rank 0 only.
        if task_id == 0:
            print(*a, **kw)
            print(*a, **kw, file=f_log)

    lprint('input args:\n', json.dumps(vars(args), indent=4,
                                       separators=(',', ':')))  # pretty print args
    # -------------------------------------------------------------------------
    # fix random seed for reproducibility (offset by rank so tasks differ)
    rng = np.random.RandomState(args.seed + task_id)
    tf.set_random_seed(args.seed + task_id)

    # initialize data loaders for train/test splits
    if args.data_set == 'imagenet' and args.class_conditional:
        # BUG FIX: was `raise("...")`, which raises a bare str and produces
        # "TypeError: exceptions must derive from BaseException" in Python 3.
        raise ValueError(
            "We currently don't have labels for the small imagenet data set")
    DataLoader = {'cifar': cifar10_data.DataLoader,
                  'imagenet': imagenet_data.DataLoader}[args.data_set]
    train_data = DataLoader(args.data_dir, 'train', args.batch_size, rng=rng,
                            shuffle=True,
                            return_labels=args.class_conditional)
    test_data = DataLoader(args.data_dir, 'test', args.batch_size,
                           shuffle=False,
                           return_labels=args.class_conditional)
    obs_shape = train_data.get_observation_size()  # e.g. a tuple (32,32,3)
    assert len(obs_shape) == 3, 'assumed right now'

    # Auto-detect GPU count when not given.
    if args.nr_gpu is None:
        from tensorflow.python.client import device_lib
        args.nr_gpu = len([d for d in device_lib.list_local_devices()
                           if d.device_type == 'GPU'])

    # data place holders
    x_init = tf.placeholder(tf.float32,
                            shape=(args.init_batch_size,) + obs_shape)
    xs = [tf.placeholder(tf.float32, shape=(args.batch_size, ) + obs_shape)
          for _ in range(args.nr_gpu)]

    def _get_batch(is_training):
        # Pull a batch from the appropriate loader and scale to [-1, 1].
        if is_training:
            x = train_data.__next__(args.batch_size)
        else:
            x = test_data.__next__(args.batch_size)
        x = np.cast[np.float32]((x - 127.5) / 127.5)
        return dict(x=x)

    batch_def = dict(x=tfu.vdef(args.batch_size, obs_shape))
    qr = tfu.Struct(
        train=tfu.PyfuncRunner(batch_def, 64, 8, True, _get_batch,
                               is_training=True),
        test=tfu.PyfuncRunner(batch_def, 64, 8, True, _get_batch,
                              is_training=False),
    )
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, qr.train)
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, qr.test)
    # (A second, verbatim-duplicated `if args.nr_gpu is None:` auto-detection
    # block used to live here; it was dead code — nr_gpu is always set by the
    # first check above — and has been removed.)

    sess = tfu.Session(allow_soft_placement=True).__enter__()

    # if the model is class-conditional we'll set up label placeholders +
    # one-hot encodings 'h' to condition on
    if args.class_conditional:
        # Class-conditional training is not wired up in this script yet; the
        # statements below are intentionally kept as a sketch but unreachable.
        raise NotImplementedError
        num_labels = train_data.get_num_labels()
        y_init = tf.placeholder(tf.int32, shape=(args.init_batch_size,))
        h_init = tf.one_hot(y_init, num_labels)
        y_sample = np.split(
            np.mod(np.arange(args.batch_size), num_labels), args.nr_gpu)
        h_sample = [tf.one_hot(tf.Variable(y_sample[i], trainable=False),
                               num_labels) for i in range(args.nr_gpu)]
        ys = [tf.placeholder(tf.int32, shape=(args.batch_size,))
              for i in range(args.nr_gpu)]
        hs = [tf.one_hot(ys[i], num_labels) for i in range(args.nr_gpu)]
    else:
        h_init = None
        h_sample = [None] * args.nr_gpu
        hs = h_sample

    # create the model
    model_opt = {'nr_resnet': args.nr_resnet,
                 'nr_filters': args.nr_filters,
                 'nr_logistic_mix': args.nr_logistic_mix,
                 'resnet_nonlinearity': args.resnet_nonlinearity}
    model = tf.make_template('model',
                             getattr(pxpp_models, args.model + "_spec"))

    # run once for data dependent initialization of parameters
    with tf.device('/gpu:0'):
        gen_par = model(x_init, h_init, init=True,
                        dropout_p=args.dropout_p, **model_opt)

    # keep track of moving average
    all_params = tf.trainable_variables()
    lprint('# of Parameters',
           sum(np.prod(p.get_shape().as_list()) for p in all_params))
    ema = tf.train.ExponentialMovingAverage(decay=args.polyak_decay)
    maintain_averages_op = tf.group(ema.apply(all_params))

    # Per-GPU train loss + gradients, and EMA test loss.
    loss_gen, loss_gen_test, grads = [], [], []
    for i in range(args.nr_gpu):
        with tf.device('/gpu:%d' % i):
            x = qr.train.batch().x
            gen_par = model(x, hs[i], ema=None, dropout_p=args.dropout_p,
                            **model_opt)
            # A 3-tuple output means the model emits per-channel parameters.
            if isinstance(gen_par, tuple) and len(gen_par) == 3:
                loss_gen.append(
                    nn.discretized_mix_logistic_loss_per_chn(x, *gen_par))
            else:
                loss_gen.append(nn.discretized_mix_logistic_loss(x, gen_par))
            grads.append(tf.gradients(loss_gen[i], all_params))
            x = qr.test.batch().x
            gen_par = model(x, hs[i], ema=ema, dropout_p=0., **model_opt)
            if isinstance(gen_par, tuple) and len(gen_par) == 3:
                loss_gen_test.append(
                    nn.discretized_mix_logistic_loss_per_chn(x, *gen_par))
            else:
                loss_gen_test.append(
                    nn.discretized_mix_logistic_loss(x, gen_par))

    # add losses and gradients together and get training updates
    tf_lr = tf.placeholder(tf.float32, shape=[])
    with tf.device('/gpu:0'):
        for i in range(1, args.nr_gpu):
            loss_gen[0] += loss_gen[i]
            loss_gen_test[0] += loss_gen_test[i]
            for j in range(len(grads[0])):
                grads[0][j] += grads[i][j]

    if num_tasks > 1:
        lprint('creating mpi optimizer')
        # If we have multiple mpi processes, average gradients across them by
        # flattening into one vector, Allreduce-ing, then splitting back.
        flat_grad = tf.concat([tf.reshape(g, (-1,)) for g in grads[0]],
                              axis=0)
        shapes = [g.shape.as_list() for g in grads[0]]
        sizes = [int(np.prod(s)) for s in shapes]
        buf = np.zeros(sum(sizes), np.float32)

        def _gather_grads(my_flat_grad):
            # Sum gradients over all tasks, then divide in place by the
            # task count to get the mean.
            comm.Allreduce(my_flat_grad, buf, op=MPI.SUM)
            np.divide(buf, float(num_tasks), out=buf)
            return buf

        avg_flat_grad = tf.py_func(_gather_grads, [flat_grad], tf.float32)
        avg_flat_grad.set_shape(flat_grad.shape)
        avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
        grads[0] = [tf.reshape(g, v.shape)
                    for g, v in zip(avg_grads, grads[0])]

    # training op
    optimizer = tf.group(
        nn.adam_updates(all_params, grads[0], lr=tf_lr, mom1=0.95,
                        mom2=0.9995, eps=1e-6),
        maintain_averages_op)

    # convert loss to bits/dim
    total_gpus = sum(comm.allgather(args.nr_gpu))
    lprint('using %d gpus across %d machines' % (total_gpus, num_tasks))
    # Each task only sums over its own GPUs, hence total_gpus / num_tasks.
    norm_const = np.log(2.) * np.prod(obs_shape) * args.batch_size
    norm_const *= total_gpus / num_tasks
    bits_per_dim = loss_gen[0] / norm_const
    bits_per_dim_test = loss_gen_test[0] / norm_const
    bits_per_dim = tf.check_numerics(bits_per_dim, 'train loss is nan')
    bits_per_dim_test = tf.check_numerics(bits_per_dim_test,
                                          'test loss is nan')

    # Sampling graph: one pixel at a time, using EMA weights.
    new_x_gen = []
    for i in range(args.nr_gpu):
        with tf.device('/gpu:%d' % i):
            gen_par = model(xs[i], hs[i], ema=ema, dropout_p=0, **model_opt)
            new_x_gen.append(
                nn.sample_from_discretized_mix_logistic(
                    gen_par, args.nr_logistic_mix))

    def sample_from_model(sess, n_samples=args.nr_gpu * args.batch_size):
        # Autoregressively sample images pixel by pixel, then save a tiled
        # PNG and the raw array under save_dir.
        sample_x = np.zeros((0,) + obs_shape, dtype=np.float32)
        while len(sample_x) < n_samples:
            x_gen = [np.zeros((args.batch_size,) + obs_shape,
                              dtype=np.float32) for i in range(args.nr_gpu)]
            for yi in range(obs_shape[0]):
                for xi in range(obs_shape[1]):
                    new_x_gen_np = sess.run(
                        new_x_gen,
                        {xs[i]: x_gen[i] for i in range(args.nr_gpu)})
                    for i in range(args.nr_gpu):
                        x_gen[i][:, yi, xi, :] = new_x_gen_np[i][:, yi, xi, :]
            sample_x = np.concatenate([sample_x] + x_gen, axis=0)
        img_tile = plotting.img_tile(
            sample_x[:int(np.floor(np.sqrt(n_samples))**2)],
            aspect_ratio=1.0, border_color=1.0, stretch=True)
        img = plotting.plot_img(img_tile, title=args.data_set + ' samples')
        plotting.plt.savefig(
            os.path.join(save_dir, '%s_samples.png' % args.data_set))
        np.save(os.path.join(save_dir, '%s_samples.npy' % args.data_set),
                sample_x)
        plotting.plt.close('all')

    # init & save
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()

    # turn numpy inputs into feed_dict for use with tensorflow
    def make_feed_dict(data, init=False):
        if type(data) is tuple:
            x, y = data
        else:
            x = data
            y = None
        # input to pixelCNN is scaled from uint8 [0,255] to float in [-1,1]
        x = np.cast[np.float32]((x - 127.5) / 127.5)
        if init:
            feed_dict = {x_init: x}
            if y is not None:
                feed_dict.update({y_init: y})
        else:
            x = np.split(x, args.nr_gpu)
            feed_dict = {xs[i]: x[i] for i in range(args.nr_gpu)}
            if y is not None:
                y = np.split(y, args.nr_gpu)
                feed_dict.update({ys[i]: y[i] for i in range(args.nr_gpu)})
        return feed_dict

    # //////////// perform training //////////////
    lprint('dataset size: %d' % len(train_data.data))
    test_bpd = []
    lr = args.learning_rate
    # manually retrieve exactly init_batch_size examples
    feed_dict = make_feed_dict(
        train_data.next(args.init_batch_size), init=True)
    train_data.reset()  # rewind the iterator back to 0 to do one full epoch
    lprint('initializing the model...')
    sess.run(initializer, feed_dict)
    if args.load_params:
        # ckpt_file = save_dir + '/params_' + args.data_set + '.ckpt'
        ckpt_file = args.load_params
        lprint('restoring parameters from', ckpt_file)
        saver.restore(sess, ckpt_file)

    # Sync params across tasks before starting: average every variable.
    my_vals = sess.run(all_params)
    vals = [np.zeros_like(v) for v in my_vals]
    [comm.Allreduce(mv, v, op=MPI.SUM) for mv, v in zip(my_vals, vals)]
    assign_ops = [var.assign(val / num_tasks)
                  for var, val in zip(all_params, vals)]
    sess.run(assign_ops)

    coord = tfu.start_queue_runners(sess)
    batch_size = args.batch_size * total_gpus
    iters_per_train_epoch = len(train_data.data) // batch_size
    iters_per_test_epoch = len(test_data.data) // batch_size
    lprint('starting training')
    for epoch in range(args.max_epochs):
        begin = time.time()
        # train for one epoch
        train_losses = []
        ti = trange(iters_per_train_epoch)
        for itr in ti:
            # NOTE(review): this stops the queue runners but does not break
            # out of the epoch loop — confirm whether a `break` is intended.
            if coord.should_stop():
                tfu.stop_queue_runners(coord)
            # forward/backward/update model on each gpu
            lr *= args.lr_decay
            l, _ = sess.run([bits_per_dim, optimizer], {tf_lr: lr})
            train_losses.append(l)
            ti.set_postfix(loss=l, lr=lr)
        train_loss_gen = np.mean(train_losses)

        # compute likelihood over test data
        test_losses = []
        for itr in trange(iters_per_test_epoch):
            if coord.should_stop():
                tfu.stop_queue_runners(coord)
            l = sess.run(bits_per_dim_test)
            test_losses.append(l)
        test_loss_gen = np.mean(test_losses)
        test_bpd.append(test_loss_gen)

        # log progress to console (stats averaged over all MPI tasks)
        stats = dict(epoch=epoch, time=time.time() - begin, lr=lr,
                     train_bpd=train_loss_gen, test_bpd=test_loss_gen)
        all_stats = comm.gather(stats)
        if task_id == 0:
            lprint('-' * 16)
            for k in stats:
                lprint('%s:\t%s' % (k, np.mean([s[k] for s in all_stats])))
            if epoch % args.save_interval == 0:
                path = os.path.join(save_dir, str(epoch))
                os.makedirs(path, exist_ok=True)
                saver.save(sess,
                           os.path.join(path, 'params_%s.ckpt' % args.data_set))
                sample_from_model(sess)
def main():
    """Single-machine multi-GPU PixelCNN++ training entry point.

    Relies on module-level globals visible in the enclosing file: ``args``,
    ``rng``, ``model_spec``, ``GraphTemplate``, ``gradients``, ``nn``,
    ``np``, ``tf``, ``os``, ``sys``, ``time``.

    Trains with Adam + EMA, periodically saving samples, parameters and the
    test bits/dim history to ``args.save_dir``. Contains a built-in
    regression check that exits after ~50 iterations.
    """
    # initialize data loaders for train/test splits
    if args.data_set == 'imagenet' and args.class_conditional:
        # BUG FIX: was `raise("...")`, which raises a bare str and produces
        # "TypeError: exceptions must derive from BaseException" in Python 3.
        raise ValueError(
            "We currently don't have labels for the small imagenet data set")
    if args.data_set == 'cifar':
        import data.cifar10_data as cifar10_data
        DataLoader = cifar10_data.DataLoader
    elif args.data_set == 'imagenet':
        import data.imagenet_data as imagenet_data
        DataLoader = imagenet_data.DataLoader
    else:
        # BUG FIX: same string-raise defect as above.
        raise ValueError("unsupported dataset")
    train_data = DataLoader(args.data_dir, 'train',
                            args.batch_size * args.nr_gpu, rng=rng,
                            shuffle=True,
                            return_labels=args.class_conditional)
    test_data = DataLoader(args.data_dir, 'test',
                           args.batch_size * args.nr_gpu, shuffle=False,
                           return_labels=args.class_conditional)
    obs_shape = train_data.get_observation_size()  # e.g. a tuple (32,32,3)
    assert len(obs_shape) == 3, 'assumed right now'

    # data place holders
    x_init = tf.placeholder(tf.float32,
                            shape=(args.init_batch_size,) + obs_shape)
    xs = [tf.placeholder(tf.float32, shape=(args.batch_size, ) + obs_shape)
          for i in range(args.nr_gpu)]

    # if the model is class-conditional we'll set up label placeholders +
    # one-hot encodings 'h' to condition on
    if args.class_conditional:
        num_labels = train_data.get_num_labels()
        y_init = tf.placeholder(tf.int32, shape=(args.init_batch_size,))
        h_init = tf.one_hot(y_init, num_labels)
        y_sample = np.split(
            np.mod(np.arange(args.batch_size * args.nr_gpu), num_labels),
            args.nr_gpu)
        h_sample = [tf.one_hot(tf.Variable(y_sample[i], trainable=False),
                               num_labels) for i in range(args.nr_gpu)]
        ys = [tf.placeholder(tf.int32, shape=(args.batch_size,))
              for i in range(args.nr_gpu)]
        hs = [tf.one_hot(ys[i], num_labels) for i in range(args.nr_gpu)]
    else:
        h_init = None
        h_sample = [None] * args.nr_gpu
        hs = h_sample

    # create the model
    model_opt = {'nr_resnet': args.nr_resnet,
                 'nr_filters': args.nr_filters,
                 'nr_logistic_mix': args.nr_logistic_mix,
                 'resnet_nonlinearity': args.resnet_nonlinearity}
    model = tf.make_template('model', model_spec)

    # run once for data dependent initialization of parameters
    data_dependent_init = model(x_init, h_init, init=True,
                                dropout_p=args.dropout_p, **model_opt)

    # keep track of moving average
    all_params = tf.trainable_variables()
    ema = tf.train.ExponentialMovingAverage(decay=args.polyak_decay)
    maintain_averages_op = tf.group(ema.apply(all_params))
    ema_params = [ema.average(p) for p in all_params]

    # get loss gradients over multiple GPUs + sampling
    grads = []
    loss_gen = []
    loss_gen_test = []
    new_x_gen = []
    for i in range(args.nr_gpu):
        with tf.device('/gpu:%d' % i):
            if args.graph_cloning and i > 0:
                # already defined the graph once, use it again via template
                # rather than redefining again
                in_ = [xs[i]] + tf.global_variables()
                res = gpu_template.apply(in_)
                loss_train, loss_test, sx = res[:3]
                grad = res[3:]
                loss_gen.append(loss_train)
                loss_gen_test.append(loss_test)
                new_x_gen.append(sx)
                grads.append(grad)
            else:
                # train
                out = model(xs[i], hs[i], ema=None,
                            dropout_p=args.dropout_p, **model_opt)
                loss_gen.append(nn.discretized_mix_logistic_loss(
                    tf.stop_gradient(xs[i]), out))
                # gradients (NOTE(review): `gradients` here is a module-level
                # name, not tf.gradients — presumably a memory-saving variant;
                # confirm against the file's imports)
                grads.append(gradients(loss_gen[i], all_params))
                # test
                out = model(xs[i], hs[i], ema=ema, dropout_p=0., **model_opt)
                loss_gen_test.append(
                    nn.discretized_mix_logistic_loss(xs[i], out))
                # sample
                out = model(xs[i], h_sample[i], ema=ema, dropout_p=0,
                            **model_opt)
                new_x_gen.append(nn.sample_from_discretized_mix_logistic(
                    out, args.nr_logistic_mix))
            # Capture the gpu:0 subgraph as a reusable template so later GPUs
            # can clone it. NOTE(review): placement reconstructed from
            # collapsed source — it must run after iteration 0 and before
            # iteration 1 for the cloning branch above to find gpu_template.
            if args.graph_cloning:
                in_ = [xs[0]] + tf.global_variables()
                out_ = ([loss_gen[0], loss_gen_test[0], new_x_gen[0]]
                        + grads[0])
                gpu_template = GraphTemplate(in_, outputs=out_)

    # add losses and gradients together and get training updates
    tf_lr = tf.placeholder(tf.float32, shape=[])
    with tf.device('/gpu:0'):
        for i in range(1, args.nr_gpu):
            loss_gen[0] += loss_gen[i]
            loss_gen_test[0] += loss_gen_test[i]
            for j in range(len(grads[0])):
                grads[0][j] += grads[i][j]
        # training op
        optimizer = tf.group(
            nn.adam_updates(all_params, grads[0], lr=tf_lr, mom1=0.95,
                            mom2=0.9995),
            maintain_averages_op)

    # convert loss to bits/dim
    bits_per_dim = loss_gen[0] / (args.nr_gpu * np.log(2.) *
                                  np.prod(obs_shape) * args.batch_size)
    bits_per_dim_test = loss_gen_test[0] / (args.nr_gpu * np.log(2.) *
                                            np.prod(obs_shape) *
                                            args.batch_size)

    # sample from the model
    def sample_from_model(sess):
        # Autoregressive pixel-by-pixel sampling across all GPUs.
        x_gen = [np.zeros((args.batch_size,) + obs_shape, dtype=np.float32)
                 for i in range(args.nr_gpu)]
        for yi in range(obs_shape[0]):
            for xi in range(obs_shape[1]):
                new_x_gen_np = sess.run(
                    new_x_gen, {xs[i]: x_gen[i] for i in range(args.nr_gpu)})
                for i in range(args.nr_gpu):
                    x_gen[i][:, yi, xi, :] = new_x_gen_np[i][:, yi, xi, :]
        return np.concatenate(x_gen, axis=0)

    # turn numpy inputs into feed_dict for use with tensorflow
    def make_feed_dict(data, init=False):
        if type(data) is tuple:
            x, y = data
        else:
            x = data
            y = None
        # input to pixelCNN is scaled from uint8 [0,255] to float in [-1,1]
        x = np.cast[np.float32]((x - 127.5) / 127.5)
        if init:
            feed_dict = {x_init: x}
            if y is not None:
                feed_dict.update({y_init: y})
        else:
            x = np.split(x, args.nr_gpu)
            feed_dict = {xs[i]: x[i] for i in range(args.nr_gpu)}
            if y is not None:
                y = np.split(y, args.nr_gpu)
                feed_dict.update({ys[i]: y[i] for i in range(args.nr_gpu)})
        return feed_dict

    # //////////// perform training //////////////
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    print('starting training')
    test_bpd = []
    lr = args.learning_rate
    saver = tf.train.Saver()
    with tf.Session() as sess:
        for epoch in range(args.max_epochs):
            begin = time.time()
            # init
            if epoch == 0:
                # manually retrieve exactly init_batch_size examples
                feed_dict = make_feed_dict(
                    train_data.next(args.init_batch_size), init=True)
                train_data.reset()  # rewind the iterator back to 0
                print('initializing the model...')
                sess.run(tf.global_variables_initializer())
                sess.run(data_dependent_init, feed_dict)
            # train for one epoch
            train_losses = []
            counter = 0
            for d in train_data:
                counter += 1
                feed_dict = make_feed_dict(d)
                # forward/backward/update model on each gpu
                lr *= args.lr_decay
                feed_dict.update({tf_lr: lr})
                l, _ = sess.run([bits_per_dim, optimizer], feed_dict)
                print(counter, l)
                train_losses.append(l)
                # Built-in regression check: after 50 iterations the loss
                # must have dropped below 6.5, then the process exits.
                if counter > 50:
                    if l > 6.5:
                        assert False, \
                            "Test failed, expected loss 6.28537 at iteration 50"
                    else:
                        print("Test passed, loss %f (expected %f)"
                              % (l, 6.28537))
                    sys.exit()
            train_loss_gen = np.mean(train_losses)

            # compute likelihood over test data
            test_losses = []
            for d in test_data:
                feed_dict = make_feed_dict(d)
                l = sess.run(bits_per_dim_test, feed_dict)
                test_losses.append(l)
            test_loss_gen = np.mean(test_losses)
            test_bpd.append(test_loss_gen)

            # log progress to console
            print("Iteration %d, time = %ds, train bits_per_dim = %.4f, "
                  "test bits_per_dim = %.4f"
                  % (epoch, time.time() - begin, train_loss_gen,
                     test_loss_gen))
            sys.stdout.flush()

            if epoch % args.save_interval == 0:
                # generate samples from the model
                sample_x = []
                for i in range(args.num_samples):
                    sample_x.append(sample_from_model(sess))
                sample_x = np.concatenate(sample_x, axis=0)
                #img_tile = plotting.img_tile(sample_x[:100], aspect_ratio=1.0, border_color=1.0, stretch=True)
                #img = plotting.plot_img(img_tile, title=args.data_set + ' samples')
                #plotting.plt.savefig(os.path.join(args.save_dir,'%s_sample%d.png' % (args.data_set, epoch)))
                #plotting.plt.close('all')
                np.savez(os.path.join(args.save_dir, '%s_sample%d.npz'
                                      % (args.data_set, epoch)), sample_x)
                # save params
                saver.save(sess,
                           args.save_dir + '/params_' + args.data_set + '.ckpt')
                np.savez(args.save_dir + '/test_bpd_' + args.data_set + '.npz',
                         test_bpd=np.array(test_bpd))
} model = tf.make_template('model', model_spec) gen_par_init = model(x_init, None, init=True, ema=None, dropout_p=0., **model_opt) grads = [] loss_gen = [] for i in range(args.nr_gpu): with tf.device('/gpu:%d' % i): gen_par = model(xs[i], None, ema=None, dropout_p=0., **model_opt) loss, prior = nn.discretized_mix_logistic_loss(xs[i], gen_par) loss_gen.append(loss) grads.append(tf.gradients(loss_gen, [xs[i]])) with tf.device('/gpu:0'): for i in range(1, args.nr_gpu): loss_gen[0] += loss_gen[i] loss_gen_sum = loss_gen[0] / (args.nr_gpu * np.log(2.) * np.prod([64, 64, 3]) * batch_size) grads_sum = tf.squeeze(tf.concat(grads, axis=1)) / ( args.nr_gpu * np.log(2.) * np.prod([64, 64, 3]) * batch_size) saver = tf.train.Saver() sess = tf.Session() print 'Restoring weights'
# get loss gradients over multiple GPUs grads = [] loss_gen = [] loss_gen_test = [] for i in range(args.nr_gpu): with tf.device('/gpu:%d' % i): # train gen_par = model(xs[i], masks, hs[i], ema=None, dropout_p=args.dropout_p, **model_opt) loss_gen.append( nn.discretized_mix_logistic_loss(xs[i], gen_par, masks=masks)) # gradients grads.append(tf.gradients(loss_gen[i], all_params)) # test gen_par = model(xs[i], masks, hs[i], ema=ema, dropout_p=0., **model_opt) loss_gen_test.append( nn.discretized_mix_logistic_loss(xs[i], gen_par, masks=masks)) # add losses and gradients together and get training updates tf_lr = tf.placeholder(tf.float32, shape=[]) with tf.device('/gpu:0'):
def __init__(self, layer_name, target_direction, naturalness, norm_penalty,
             weight_min=0, target_metric='max'):
    """Build a texture-optimization graph over a VGG-19 feature target.

    A single (2*rf-1)^2 texture variable is cropped into all rf x rf
    receptive-field windows; each crop is masked, scaled to a fixed norm,
    pushed through VGG-19 up to `layer_name`, and scored against
    `target_direction`. Optionally a PixelCNN++ log-likelihood term
    regularizes the texture toward natural images. A tf.Session is created
    and VGG (and PixelCNN++, if used) weights are restored immediately.

    Args:
        layer_name: VGG-19 endpoint; must be a key of RF_SIZES.
        target_direction: either an int unit index (converted to a one-hot
            direction) or a direction vector in feature space.
        naturalness: weight of the PixelCNN++ likelihood term; <= 0 disables
            the prior entirely.
        norm_penalty: currently unused in this constructor body.
        weight_min: interpolates the objective between the mean (0) and the
            minimum (1) of the per-crop predictions.
        target_metric: 'max' (dot product), 'cos' (normalized dot product),
            or anything else for negative mean squared error.
    """
    assert layer_name in RF_SIZES, 'Invalid layer name: %s' % layer_name
    rf_size = RF_SIZES[layer_name]
    # Texture is big enough that every rf x rf crop offset fits inside it.
    texture_size = 2 * rf_size - 1
    num_images = (rf_size - 1)**2
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.texture_raw = tf.get_variable(
            'texture_raw',
            shape=[1, texture_size, texture_size, 3],
            initializer=tf.random_normal_initializer())
        # NOTE(review): this `norm` lambda is never used below.
        norm = lambda x: tf.sqrt(tf.reduce_sum(tf.square(x)))
        # mask: smooth super-Gaussian window exp(-(r/sigma)^q) with an
        # L^p radial distance, applied to every crop
        t = (np.arange(rf_size) - (rf_size - 1) / 2).astype(np.float32)
        x, y = np.meshgrid(t, t)
        p_ = tf.constant(2.0)
        q_ = tf.constant(4.0)
        sigma_ = tf.constant(
            rf_size / 2.4
        )  # 2.4 is a heuristic based on the ratio between conv3_1 rf size and 10
        r = (tf.abs(tf.constant(x))**p_ + tf.abs(tf.constant(y))**p_)**(1 / p_)
        self.mask = tf.exp(-(tf.abs(r[None, ..., None]) / sigma_)**q_)
        self.mask_p = p_
        self.mask_q = q_
        self.mask_sigma = sigma_
        # crops with mask applied: one masked rf x rf window per (i, j) offset
        self.image_list = []
        self.norms = []
        for i in range(rf_size - 1):
            for j in range(rf_size - 1):
                img = self.texture_raw[:, i:i + rf_size, j:j +
                                       rf_size, :] * self.mask
                self.image_list.append(img)
                self.norms.append(tf.reduce_sum(tf.square(img)))
        # Rescale so the average crop norm equals NORMS[layer_name] / 2.
        avg_norm = tf.sqrt(tf.reduce_mean(tf.stack(self.norms)))
        scale = (NORMS[layer_name] / 2) / avg_norm
        self.texture = scale * self.texture_raw
        self.images = scale * tf.concat(
            self.image_list, axis=0, name='image_batch')
        # precondition gradient (only spatial whitening) — routes the
        # backward pass through the custom "mygrad_sqrt" gradient
        with self.graph.gradient_override_map({"Identity": "mygrad_sqrt"}):
            self.images = tf.identity(self.images, name="Identity")
        self.vgg = vgg19(self.images,
                         subtract_mean=False,
                         final_endpoint=layer_name)
        vgg_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope='vgg_19')
        saver_vgg = tf.train.Saver(var_list=vgg_vars)
        # An int target selects a single unit via a one-hot direction; a
        # vector target is L2-normalized for 'max'/'cos' metrics.
        if type(target_direction) == int:
            unit_num = target_direction
            target_direction = np.zeros(int(
                self.vgg[layer_name].shape[-1]), dtype=np.float32)
            target_direction[unit_num] = 1
        else:
            if target_metric in ['max', 'cos']:
                target_direction /= np.sqrt(np.sum(target_direction**2))
        self.target_direction = tf.constant(target_direction,
                                            shape=[target_direction.size],
                                            name='target_direction')
        # Flatten per-crop features and score them against the target.
        vgg_flat = tf.reshape(self.vgg[layer_name], [num_images, -1])
        if target_metric in ['max', 'cos']:
            if target_metric == 'cos':
                # +0.01 guards against division by ~zero feature norms.
                vgg_flat /= tf.sqrt(
                    tf.reduce_sum(tf.square(vgg_flat), axis=1) +
                    0.01)[:, None]
            self.predictions = tf.tensordot(vgg_flat,
                                            self.target_direction,
                                            axes=[[1], [0]],
                                            name='predictions')
        else:
            self.predictions = -tf.reduce_mean(
                tf.square(vgg_flat - self.target_direction), axis=1)
        # natural image prior (PixelCNN++)
        if naturalness > 0:
            model = tf.make_template('model', pxpp_model.model_spec)
            # pad if input_size not divisible by 4 (the +/-0.1 nudges make
            # ceil/floor split an odd padding unevenly but consistently)
            input_size_pxpp = int(np.ceil(texture_size / 4.0) * 4.0)
            pad1 = int(
                np.ceil((input_size_pxpp - texture_size - 0.1) / 2.0))
            pad2 = int(
                np.floor((input_size_pxpp - texture_size + 0.1) / 2.0))
            init_images = tf.placeholder(
                tf.float32, shape=[1, input_size_pxpp, input_size_pxpp, 3])
            # Data-dependent init pass required by PixelCNN++ weight norm.
            self.pxpp_init = model(init_images, init=True)
            # Clamp to the [-1, 1] input range PixelCNN++ expects.
            self.texture_pxpp = tf.minimum(
                tf.maximum(self.texture / 128, -1), 1)
            self.texture_pxpp = tf.pad(
                self.texture_pxpp,
                [[0, 0], [pad1, pad2], [pad1, pad2], [0, 0]])
            self.pxpp = model(self.texture_pxpp)
            self.image_likelihood = pxpp_nn.discretized_mix_logistic_loss(
                self.texture_pxpp, self.pxpp)
            pxpp_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='model')
            saver_pxpp = tf.train.Saver(var_list=pxpp_vars)
            init_feed_dict = {init_images: np.zeros(init_images.shape)}
        else:
            self.image_likelihood = tf.constant(0.0)
            init_feed_dict = None
        self.naturalness_loss = naturalness * self.image_likelihood
        # Minimize: weighted blend of -mean and -min of predictions, plus
        # the (weighted) PixelCNN++ negative log-likelihood.
        self.loss = - (1 - weight_min) * tf.reduce_mean(self.predictions) \
            - weight_min * tf.reduce_min(self.predictions) \
            + self.naturalness_loss
        self.learning_rate = tf.placeholder(tf.float32,
                                            shape=(),
                                            name='learning_rate')
        # Only the raw texture is optimized; VGG / PixelCNN++ stay frozen.
        var_list = [self.texture_raw]
        self.train_step = tf.train.AdamOptimizer(
            self.learning_rate).minimize(self.loss, var_list=var_list)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer(),
                         feed_dict=init_feed_dict)
        saver_vgg.restore(self.session, VGG_CHECKPOINT_FILE)
        if naturalness > 0:
            saver_pxpp.restore(self.session, PXPP_CHECKPOINT_FILE)
def __init__(self, layer_name, target_direction, num_images, diversity,
             naturalness, feature_layer_name=None, div_metric='euclidean',
             target_metric='max', input_size=None):
    """Build a multi-image feature-visualization graph for VGG-19.

    Optimizes `num_images` norm-constrained images to maximize a target
    direction in `layer_name` features, optionally encouraged to be
    mutually diverse and regularized by a PixelCNN++ natural-image prior.
    A tf.Session is created and VGG (and PixelCNN++, if used) weights are
    restored immediately.

    Args:
        layer_name: VGG-19 endpoint; must be a key of RF_SIZES.
        target_direction: int unit index (one-hot direction) or a direction
            vector in feature space.
        num_images: number of images optimized jointly.
        diversity: weight of the pairwise min-distance diversity bonus.
        naturalness: weight of the PixelCNN++ likelihood term; <= 0
            disables the prior.
        feature_layer_name: layer whose features define the diversity
            distance; None means distances are taken in pixel space.
        div_metric: 'euclidean' or 'cosine' pairwise distance.
            NOTE(review): any other value leaves `dist` undefined and will
            NameError when num_images > 1 — confirm callers only pass these
            two values.
        target_metric: 'max', 'cos', or anything else for negative MSE.
        input_size: image side length; defaults to the layer's RF size.
    """
    assert layer_name in RF_SIZES, 'Invalid layer name: %s' % layer_name
    assert (feature_layer_name is None or feature_layer_name in RF_SIZES
            ), 'Invalid feature layer name: %s' % feature_layer_name
    if input_size is None:
        input_size = RF_SIZES[layer_name]
    self.graph = tf.Graph()
    with self.graph.as_default():
        images_raw = tf.get_variable(
            'images',
            shape=[num_images, input_size, input_size, 3],
            initializer=tf.random_normal_initializer())
        # Per-image L2 norm, kept broadcastable for the division below.
        norm = tf.sqrt(
            tf.reduce_sum(tf.square(images_raw), axis=[1, 2, 3],
                          keepdims=True))
        self.images_raw = images_raw
        # Project every image onto a sphere of radius NORMS[layer_name]/2.
        self.images = (NORMS[layer_name] / 2) * images_raw / norm
        # precondition gradient (only spatial whitening) — routes the
        # backward pass through the custom "mygrad_sqrt" gradient
        with self.graph.gradient_override_map({"Identity": "mygrad_sqrt"}):
            self.images = tf.identity(self.images, name="Identity")
        self.vgg = vgg19(self.images,
                         subtract_mean=False,
                         final_endpoint=layer_name)
        vgg_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope='vgg_19')
        saver_vgg = tf.train.Saver(var_list=vgg_vars)
        # An int target selects a single unit via a one-hot direction; a
        # vector target is L2-normalized for 'max'/'cos' metrics.
        if type(target_direction) == int:
            unit_num = target_direction
            target_direction = np.zeros(int(
                self.vgg[layer_name].shape[-1]), dtype=np.float32)
            target_direction[unit_num] = 1
        else:
            if target_metric in ['max', 'cos']:
                target_direction /= np.sqrt(np.sum(target_direction**2))
        self.target_direction = tf.constant(target_direction,
                                            shape=[target_direction.size],
                                            name='target_direction')
        # Flatten per-image features and score them against the target.
        vgg_flat = tf.reshape(self.vgg[layer_name], [num_images, -1])
        if target_metric in ['max', 'cos']:
            if target_metric == 'cos':
                # +0.01 guards against division by ~zero feature norms.
                vgg_flat /= tf.sqrt(
                    tf.reduce_sum(tf.square(vgg_flat), axis=1) +
                    0.01)[:, None]
            self.predictions = tf.tensordot(vgg_flat,
                                            self.target_direction,
                                            axes=[[1], [0]],
                                            name='predictions')
        else:
            self.predictions = -tf.reduce_mean(
                tf.square(vgg_flat - self.target_direction), axis=1)
        # diversity penalty: pairwise distances between all image pairs,
        # measured in pixel space or in `feature_layer_name` features
        if div_metric == 'euclidean':
            dist = lambda x, y: tf.sqrt(tf.reduce_mean(tf.square(x - y)))
        elif div_metric == 'cosine':
            dist = lambda x, y: 1 - tf.abs(tf.reduce_sum(x * y)) / (
                AVG_RF_SIZE**2)
        if num_images > 1:
            distances = []
            features = self.vgg[
                feature_layer_name] if feature_layer_name else self.images
            for i in range(num_images - 1):
                for j in range(i + 1, num_images):
                    distances.append(
                        dist(features[i, :, :, :], features[j, :, :, :]))
            self.min_distance = tf.reduce_min(tf.stack(distances))
            self.mean_distance = tf.reduce_mean(tf.stack(distances))
        else:
            # Single image: no pairs, distances degenerate to zero.
            self.min_distance = tf.constant(0.0, dtype=tf.float32)
            self.mean_distance = tf.constant(0.0, dtype=tf.float32)
        # natural image prior (PixelCNN++)
        if naturalness > 0:
            model = tf.make_template('model', pxpp_model.model_spec)
            # pad if input_size not divisible by 4 (the +/-0.1 nudges make
            # ceil/floor split an odd padding unevenly but consistently)
            input_size_pxpp = int(np.ceil(input_size / 4.0) * 4.0)
            pad1 = int(np.ceil((input_size_pxpp - input_size - 0.1) / 2.0))
            pad2 = int(np.floor(
                (input_size_pxpp - input_size + 0.1) / 2.0))
            init_images = tf.placeholder(
                tf.float32,
                shape=[num_images, input_size_pxpp, input_size_pxpp, 3])
            # Data-dependent init pass required by PixelCNN++ weight norm.
            self.pxpp_init = model(init_images, init=True)
            # Clamp to the [-1, 1] input range PixelCNN++ expects.
            self.images_pxpp = tf.minimum(
                tf.maximum(self.images / 128, -1), 1)
            self.images_pxpp = tf.pad(
                self.images_pxpp,
                [[0, 0], [pad1, pad2], [pad1, pad2], [0, 0]])
            self.pxpp = model(self.images_pxpp)
            self.image_likelihood = pxpp_nn.discretized_mix_logistic_loss(
                self.images_pxpp, self.pxpp)
            pxpp_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='model')
            saver_pxpp = tf.train.Saver(var_list=pxpp_vars)
            init_feed_dict = {init_images: np.zeros(init_images.shape)}
        else:
            self.image_likelihood = tf.constant(0.0)
            init_feed_dict = None
        self.diversity_loss = diversity * self.min_distance
        self.naturalness_loss = naturalness * self.image_likelihood
        # Minimize: -mean(predictions) - diversity bonus + naturalness NLL.
        self.loss = -tf.reduce_mean(self.predictions) \
            - self.diversity_loss \
            + self.naturalness_loss
        self.learning_rate = tf.placeholder(tf.float32,
                                            shape=(),
                                            name='learning_rate')
        # Only the raw images are optimized; VGG / PixelCNN++ stay frozen.
        self.train_step = tf.train.AdamOptimizer(
            self.learning_rate).minimize(self.loss, var_list=[images_raw])
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer(),
                         feed_dict=init_feed_dict)
        saver_vgg.restore(self.session, VGG_CHECKPOINT_FILE)
        if naturalness > 0:
            saver_pxpp.restore(self.session, PXPP_CHECKPOINT_FILE)
dropout_p=args.dropout_p, **model_opt) # keep track of moving average all_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model_1') #tf.trainable_variables(scope="model_1") ema = tf.train.ExponentialMovingAverage(decay=args.polyak_decay) maintain_averages_op = tf.group(ema.apply(all_params)) loss_gen_test = [] outputs = [] for i in range(args.nr_gpu): with tf.device('/gpu:%d' % i): gen_par = model(xs[i], masks[i], hs[i], ema=ema, dropout_p=0., **model_opt) outputs.append(gen_par) loss_gen_test.append(nn.discretized_mix_logistic_loss(xs[i], gen_par, masks=masks)) with tf.device('/gpu:0'): for i in range(1, args.nr_gpu): loss_gen_test[0] += loss_gen_test[i] bits_per_dim_test = loss_gen_test[ 0] / (args.nr_gpu * np.log(2.) * np.prod(obs_shape) * args.batch_size) mgen = mk.RecMaskGenerator(obs_shape[0], obs_shape[1]) agen = mk.AllOnesMaskGenerator(obs_shape[0], obs_shape[1]) # def make_feed_dict(data, init=False, masks=None, is_test=False):
# keep track of moving average all_params = tf.trainable_variables() ema = tf.train.ExponentialMovingAverage(decay=args.polyak_decay) maintain_averages_op = tf.group(ema.apply(all_params)) # get loss gradients over multiple GPUs grads = [] loss_gen = [] loss_gen_test = [] for i in range(args.nr_gpu): with tf.device('/gpu:%d' % i): # train gen_par = model(xs[i], hs[i], ema=None, dropout_p=args.dropout_p, **model_opt) loss_gen.append(nn.discretized_mix_logistic_loss(xs[i], gen_par)) # gradients grads.append(tf.gradients(loss_gen[i], all_params)) # test gen_par = model(xs[i], hs[i], ema=ema, dropout_p=0., **model_opt) loss_gen_test.append(nn.discretized_mix_logistic_loss(xs[i], gen_par)) # add losses and gradients together and get training updates tf_lr = tf.placeholder(tf.float32, shape=[]) with tf.device('/gpu:0'): for i in range(1, args.nr_gpu): loss_gen[0] += loss_gen[i] loss_gen_test[0] += loss_gen_test[i] for j in range(len(grads[0])): grads[0][j] += grads[i][j] # training op