def test_encoder_decoder(self):
  with tf.Graph().as_default():
    hparams = glow.glow_hparams()
    hparams.n_levels = 3
    hparams.depth = 6
    rng = np.random.RandomState(0)
    x_np = rng.rand(1, 64, 64, 4)
    x_t = tf.convert_to_tensor(x_np, dtype=tf.float32)
    init_ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
    with arg_scope(init_ops, init=True):
      x_inv, _, eps, z_levels, _ = glow_ops.encoder_decoder(
          "encoder_decoder", x_t, hparams, reverse=False)
      x_inv_inv, _, z_inv_levels, _ = glow_ops.encoder_decoder(
          "encoder_decoder", x_inv, hparams, eps=eps, reverse=True)
    with tf.Session() as session:
      session.run(tf.global_variables_initializer())
      x_inv_np = session.run(x_inv)
      z_levels_np, z_inv_levels_np, x_inv_inv_np = session.run(
          [z_levels, z_inv_levels, x_inv_inv])
      diff = x_inv_inv_np - x_np
      self.assertLen(z_levels_np, 2)
      self.assertLen(z_inv_levels_np, 2)
      # (h_i, w_i, c_i) = (h_{i-1}/f, w_{i-1}/f, c_{i-1}*(2f)/2) where f=2.
      self.assertEqual(z_levels_np[0].shape, (1, 32, 32, 8))
      self.assertEqual(z_levels_np[1].shape, (1, 16, 16, 16))
      self.assertEqual(z_inv_levels_np[0].shape, (1, 32, 32, 8))
      self.assertEqual(z_inv_levels_np[1].shape, (1, 16, 16, 16))
      self.assertEqual(x_inv_np.shape, (1, 8, 8, 64))
      self.assertTrue(np.allclose(diff, 0.0, atol=1e-2))
def test_encoder_decoder(self):
  with tf.Graph().as_default():
    hparams = glow.glow_hparams()
    hparams.n_levels = 3
    hparams.depth = 2
    x = tf.random_uniform(shape=(16, 64, 64, 4), seed=0)
    x_inv, _, eps, z_levels, _ = glow_ops.encoder_decoder(
        "encoder_decoder", x, hparams, reverse=False)
    x_inv_inv, _, z_inv_levels, _ = glow_ops.encoder_decoder(
        "encoder_decoder", x_inv, hparams, eps=eps, reverse=True)
    with tf.Session() as session:
      session.run(tf.global_variables_initializer())
      diff, x_inv_np, z_levels_np, z_inv_levels_np = session.run(
          [x - x_inv_inv, x_inv, z_levels, z_inv_levels])
      self.assertLen(z_levels_np, 2)
      self.assertLen(z_inv_levels_np, 2)
      # (h_i, w_i, c_i) = (h_{i-1}/f, w_{i-1}/f, c_{i-1}*(2f)/2) where f=2.
      self.assertEqual(z_levels_np[0].shape, (16, 32, 32, 8))
      self.assertEqual(z_levels_np[1].shape, (16, 16, 16, 16))
      self.assertEqual(z_inv_levels_np[0].shape, (16, 32, 32, 8))
      self.assertEqual(z_inv_levels_np[1].shape, (16, 16, 16, 16))
      self.assertEqual(x_inv_np.shape, (16, 8, 8, 64))
      self.assertTrue(np.allclose(diff, 0.0, atol=1e-2))
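# A minimal sketch (not part of the tests above) of the shape bookkeeping
# behind the assertions: with f=2, each level squeezes 2x2 spatial blocks
# into channels, i.e. (h, w, c) -> (h/2, w/2, 4c), then splits half the
# channels off as the latent z_i, so each z_i halves the spatial dims and
# doubles the channels of its input. The helper name
# `expected_latent_shapes` is hypothetical, for illustration only.
def expected_latent_shapes(batch, h, w, c, n_levels):
  """Derives per-level latent shapes for a Glow-style multi-scale stack."""
  shapes = []
  for _ in range(n_levels - 1):
    h, w, c = h // 2, w // 2, c * 4  # squeeze: (h/2, w/2, 4c)
    c //= 2                          # split: half the channels become z_i
    shapes.append((batch, h, w, c))
  # The final level squeezes once more but does not split.
  top_shape = (batch, h // 2, w // 2, c * 4)
  return shapes, top_shape

# expected_latent_shapes(16, 64, 64, 4, n_levels=3)
# -> ([(16, 32, 32, 8), (16, 16, 16, 16)], (16, 8, 8, 64))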
def test_encoder_decoder_practical_usage(self):
  """Tests the following sequence of operations.

  1. Define forward network with arg_scope(init=True).
  2. Run one forward pass to do data-dependent initialization and save.
  3. Define forward and reverse network with arg_scope(init=False).
  4. Check that reverse(forward(x)) == x.
  """
  hparams = glow.glow_hparams()
  hparams.n_levels = 2
  hparams.depth = 12

  with tf.Graph().as_default():
    rng = np.random.RandomState(0)
    x_rand = np.asarray(rng.rand(1, 4, 4, 4), dtype=np.float32)
    x_t = tf.convert_to_tensor(x_rand)
    ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
    with arg_scope(ops, init=True):
      x_inv, _, _, _ = glow_ops.encoder_decoder(
          "revnet", x_t, hparams, reverse=False)
    curr_dir = tempfile.mkdtemp()
    model_path = os.path.join(curr_dir, "model")
    with tf.Session() as session:
      saver = tf.train.Saver()
      session.run(tf.global_variables_initializer())
      session.run(x_inv)
      saver.save(session, model_path)

  with tf.Graph().as_default():
    rng = np.random.RandomState(0)
    x_rand = np.asarray(rng.rand(1, 4, 4, 4), dtype=np.float32)
    x_t = tf.convert_to_tensor(x_rand)
    ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
    with arg_scope(ops, init=False):
      x_inv2, _, all_eps, _ = glow_ops.encoder_decoder(
          "revnet", x_t, hparams, reverse=False)
      x_inv_inv_, _ = glow_ops.encoder_decoder(
          "revnet", x_inv2, hparams, eps=all_eps, reverse=True)
    with tf.Session() as session:
      saver = tf.train.Saver()
      saver.restore(session, model_path)
      x_inv_inv_np = session.run(x_inv_inv_)
      diff = np.abs(x_inv_inv_np - x_rand)
      self.assertTrue(np.allclose(diff, 0.0, atol=1e-3))
def glow_encoder(self, frame, condition=False, cond_latents=None, init=False):
  """Glow network that encodes frame to a hierarchy of latents.

  Args:
    frame: 5-D Tensor of shape (batch_size, 1, height, width, channels).
    condition: Whether or not to condition on cond_latents.
    cond_latents: optional, list of tensors with length equal to
      hparams.n_levels - 1. If provided, the latent at level l is
      conditioned on the cond_latent at level l.
    init: Whether the given batch is an "init" batch or a "train" batch.

  Returns:
    objective: log-likelihood of the frame per the model.
    z_top: top-level latent.
    z_levels: a list of tensors with latents at all levels.
  """
  frame = self.squeeze_video(frame, init=init)
  frame = self.preprocess(frame)
  frame, objective = glow_ops.uniform_binning_correction(frame)
  glow_vals = glow_ops.encoder_decoder(
      "codec", frame, self.hparams, eps=None, reverse=False,
      cond_latents=cond_latents, states=self.level_states,
      condition=condition)
  z_top, encoder_objective, self.eps, z_levels, self.level_states = glow_vals
  objective += encoder_objective
  return objective, z_top, z_levels
def body(self, features):
  x = features["inputs"]

  # Scale x so that the pixels lie between -0.5 and 0.5.
  x = self.preprocess(x)
  x, objective = glow_ops.uniform_binning_correction(x)

  # The arg_scope call ensures that the actnorm parameters are set such that
  # the per-channel output activations have zero mean and unit variance
  # ONLY during the first step. After that the parameters are learned
  # through optimisation.
  global_step = tf.train.get_or_create_global_step()
  init_op = tf.logical_and(tf.equal(global_step, 0), self.is_training)
  ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
  with arg_scope(ops, init=init_op):
    self.z, encoder_objective, self.eps, _ = glow_ops.encoder_decoder(
        "codec", x, self.hparams, eps=None, reverse=False)
  objective += encoder_objective

  prior_objective, prior_dist = self.top_prior(self.z)
  tf.summary.scalar("top_prior", tf.reduce_mean(prior_objective))
  self.z_sample = prior_dist.sample()
  objective += prior_objective

  # Bits per pixel.
  _, h, w, c = common_layers.shape_list(x)
  objective = -objective / (np.log(2) * h * w * c)
  return tf.zeros_like(features["targets"]), {"training": objective}
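# A minimal numeric sketch (values hypothetical) of the bits-per-pixel
# conversion used above: the objective accumulates log-likelihood in nats,
# which is then negated and divided by log(2) times the number of
# sub-pixels to report bits per (sub-)pixel.
import numpy as np

log_likelihood_nats = -8000.0  # hypothetical per-image log p(x), in nats
h, w, c = 32, 32, 3
bits_per_pixel = -log_likelihood_nats / (np.log(2) * h * w * c)
# 8000 / (log(2) * 3072) ~= 3.76 bits per sub-pixel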
def latents_to_frames(z_top_interp, level_eps_interp, hparams):
  """Decodes latents to frames."""
  # Decode [z^1_t, z^2_t .. z^l_t] to [X_t].
  images, _, _, _ = glow_ops.encoder_decoder(
      "codec", z_top_interp, hparams, eps=level_eps_interp, reverse=True)
  images = glow_ops.postprocess(images)
  return images
def test_encoder_decoder_practical_usage(self):
  """Tests the following sequence of operations.

  1. Define forward network with arg_scope(init=True).
  2. Run one forward pass to do data-dependent initialization and save.
  3. Define forward and reverse network with arg_scope(init=False).
  4. Check that reverse(forward(x)) == x.
  """
  hparams = glow.glow_hparams()
  hparams.n_levels = 2
  hparams.depth = 12

  with tf.Graph().as_default():
    rng = np.random.RandomState(0)
    x_rand = np.asarray(rng.rand(1, 4, 4, 4), dtype=np.float32)
    x_t = tf.convert_to_tensor(x_rand)
    ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
    with arg_scope(ops, init=True):
      x_inv, _, _, _, _ = glow_ops.encoder_decoder(
          "revnet", x_t, hparams, reverse=False)
    curr_dir = tempfile.mkdtemp()
    model_path = os.path.join(curr_dir, "model")
    with tf.Session() as session:
      saver = tf.train.Saver()
      session.run(tf.global_variables_initializer())
      session.run(x_inv)
      saver.save(session, model_path)

  with tf.Graph().as_default():
    rng = np.random.RandomState(0)
    x_rand = np.asarray(rng.rand(1, 4, 4, 4), dtype=np.float32)
    x_t = tf.convert_to_tensor(x_rand)
    ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
    with arg_scope(ops, init=False):
      x_inv2, _, all_eps, _, _ = glow_ops.encoder_decoder(
          "revnet", x_t, hparams, reverse=False)
      x_inv_inv_, _, _, _ = glow_ops.encoder_decoder(
          "revnet", x_inv2, hparams, eps=all_eps, reverse=True)
    with tf.Session() as session:
      saver = tf.train.Saver()
      saver.restore(session, model_path)
      x_inv_inv_np = session.run(x_inv_inv_)
      diff = np.abs(x_inv_inv_np - x_rand)
      self.assertTrue(np.allclose(diff, 0.0, atol=1e-3))
def frame_to_latents(frame, hparams):
  """Encodes frames to latents."""
  # Preprocess.
  frame = preprocess_frame(frame)

  # Encode [X_t] to [z^1_t, z^2_t .. z^l_t].
  glow_vals = glow_ops.encoder_decoder(
      "codec", frame, hparams, eps=None, reverse=False)
  z_top, _, level_eps, _, _ = glow_vals
  return z_top, level_eps
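# A hypothetical usage sketch of the two helpers above: encode two frames,
# linearly interpolate their latents, and decode the midpoint back to image
# space. `frame_a`, `frame_b`, and `alpha` are assumed inputs; the actual
# interpolation utilities shipped with the library may differ.
z_top_a, eps_a = frame_to_latents(frame_a, hparams)
z_top_b, eps_b = frame_to_latents(frame_b, hparams)

alpha = 0.5
z_top_interp = (1 - alpha) * z_top_a + alpha * z_top_b
level_eps_interp = [(1 - alpha) * ea + alpha * eb
                    for ea, eb in zip(eps_a, eps_b)]
frames = latents_to_frames(z_top_interp, level_eps_interp, hparams)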
def test_encoder_decoder(self):
  with tf.Graph().as_default():
    hparams = glow.glow_hparams()
    hparams.n_levels = 2
    hparams.depth = 2
    x = tf.random_uniform(shape=(16, 64, 64, 4), seed=0)
    x_inv, _, eps = glow_ops.encoder_decoder(
        "encoder_decoder", x, hparams, reverse=False)
    x_inv_inv, _ = glow_ops.encoder_decoder(
        "encoder_decoder", x_inv, hparams, eps=eps, reverse=True)
    with tf.Session() as session:
      session.run(tf.global_variables_initializer())
      diff, x_inv_np = session.run([x - x_inv_inv, x_inv])
      self.assertEqual(x_inv_np.shape, (16, 8, 8, 64))
      self.assertTrue(np.allclose(diff, 0.0, atol=1e-2))
def infer(self, features, *args, **kwargs):  # pylint: disable=arguments-differ
  del args, kwargs
  x = features["inputs"]
  batch_size = common_layers.shape_list(x)[0]
  features["targets"] = tf.zeros(shape=(batch_size, 1, 1, 1))
  _, _ = self(features)  # pylint: disable=not-callable

  ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
  var_scope = tf.variable_scope("glow/body", reuse=True)
  # If eps=None, images are sampled from the prior.
  with arg_scope(ops, init=False), var_scope:
    predictions, _, _, _ = glow_ops.encoder_decoder(
        "codec", self.z_sample, self.hparams, eps=None, reverse=True)
  return self.scale(predictions)
def infer(self, features, *args, **kwargs):  # pylint: disable=arguments-differ
  del args, kwargs
  x = features["inputs"]
  batch_size = common_layers.shape_list(x)[0]
  features["targets"] = tf.zeros(shape=(batch_size, 1, 1, 1))
  _, _ = self(features)  # pylint: disable=not-callable

  ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
  var_scope = tf.variable_scope("glow/body", reuse=True)
  # If eps=None, images are sampled from the prior.
  with arg_scope(ops, init=False), var_scope:
    predictions, _, _, _ = glow_ops.encoder_decoder(
        "codec", self.z_sample, self.hparams, eps=None, reverse=True,
        temperature=self.temperature)
  return self.scale(predictions)
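# A minimal numpy sketch of the assumed temperature semantics in the infer
# variant above: the prior's standard deviation is scaled by `temperature`
# before sampling, so lower temperatures trade sample diversity for
# fidelity. The latent shape is illustrative only.
import numpy as np

temperature = 0.7
z_shape = (1, 8, 8, 64)  # hypothetical top-level latent shape
z_sample = temperature * np.random.normal(0.0, 1.0, size=z_shape)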
def objective_tower(self, features, init=True):
  """Objective in terms of bits-per-pixel.

  Args:
    features: dict of tensors with "inputs" and "targets" keys.
    init: Whether or not to run data-dependent init.

  Returns:
    objective: float, bits-per-pixel.
  """
  x = features["inputs"]

  # Scale x so that the pixels lie between -0.5 and 0.5.
  x = self.preprocess(x)
  x, objective = glow_ops.uniform_binning_correction(x)

  # The arg_scope call ensures that the actnorm parameters are set such that
  # the per-channel output activations have zero mean and unit variance
  # ONLY during the first step. After that the parameters are learned
  # through optimisation.
  ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
  with arg_scope(ops, init=init):
    self.z, encoder_objective, self.eps, _, _ = glow_ops.encoder_decoder(
        "codec", x, self.hparams, eps=None, reverse=False)
  objective += encoder_objective

  self.z_top_shape = common_layers.shape_list(self.z)
  prior_dist = self.top_prior()
  prior_objective = tf.reduce_sum(
      prior_dist.log_prob(self.z), axis=[1, 2, 3])
  self.z_sample = prior_dist.sample()
  objective += prior_objective

  # Bits per pixel.
  _, h, w, c = common_layers.shape_list(x)
  objective = -objective / (np.log(2) * h * w * c)
  return objective
def body(self, features):
  x = features["inputs"]

  # Scale x so that the pixels lie between -0.5 and 0.5.
  x = self.preprocess(x)
  n_bins = 2**self.hparams.n_bits_x
  batch_size, height, width, n_channels = common_layers.shape_list(x)
  hwc = float(height * width * n_channels)

  x = x + tf.random_uniform(
      shape=(batch_size, height, width, n_channels),
      minval=0.0, maxval=1.0 / n_bins)
  objective = -np.log(n_bins) * hwc * tf.ones(batch_size)

  # The arg_scope call ensures that the actnorm parameters are set such that
  # the per-channel output activations have zero mean and unit variance
  # ONLY during the first step. After that the parameters are learned
  # through optimisation.
  global_step = tf.train.get_or_create_global_step()
  init_op = tf.logical_and(tf.equal(global_step, 0), self.is_training)
  ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
  with arg_scope(ops, init=init_op):
    self.z, encoder_objective, self.eps = glow_ops.encoder_decoder(
        "codec", x, self.hparams, eps=None, reverse=False)
  objective += encoder_objective

  prior_objective, prior_dist = glow_ops.top_prior(
      "top_prior", self.z, learn_prior=self.hparams.learn_prior)
  self.z_sample = prior_dist.sample()
  objective += prior_objective

  # Bits per pixel.
  objective = -objective / (np.log(2) * hwc)
  return tf.zeros_like(features["targets"]), {"training": objective}
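# A minimal numpy sketch of the dequantization performed inline above:
# pixels quantized to n_bins levels are jittered with U(0, 1/n_bins) noise,
# and the objective pays -log(n_bins) nats per sub-pixel for the change of
# measure. The glow_ops.uniform_binning_correction call used in the other
# snippets implements the same correction.
import numpy as np

n_bits_x = 8
n_bins = 2 ** n_bits_x
h, w, c = 32, 32, 3

x = np.random.randint(0, n_bins, size=(1, h, w, c)) / float(n_bins)
x_dequantized = x + np.random.uniform(0.0, 1.0 / n_bins, size=x.shape)
objective = -np.log(n_bins) * h * w * c  # per-image correction, in nats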
def infer(self, features, *args, **kwargs):  # pylint: disable=arguments-differ
  del args, kwargs

  # Make a copy of features that can be used in the call to self
  # that builds the graph.
  new_features = {}
  new_features["inputs"] = features["inputs"]
  new_features["targets"] = features["infer_targets"]
  _, _ = self(new_features)  # pylint: disable=not-callable

  if self.hparams.gen_mode == "unconditional":
    num_target_frames = 1
  else:
    num_target_frames = self.hparams.video_num_target_frames

  ops = [glow_ops.get_variable_ddi, glow_ops.actnorm, glow_ops.get_dropout]
  var_scope = tf.variable_scope("next_frame_glow/body", reuse=True)
  all_frames = []

  # If eps=None, images are sampled from the prior.
  with arg_scope(ops, init=False), var_scope:
    for target_frame in range(1, num_target_frames + 1):
      # Subscript -> timestep, superscript -> level.
      # self.z_sample equals z^0_{t} (top-level latent).
      # (X_{t}, z^{1..l}_{t}) = Glow(z^0_{t}, z^{1..l}_{t-1})
      # Get the current set of cond_latents.
      cond_level, cond_level_latents = get_cond_latents(
          self.all_level_latents, self.hparams)

      glow_vals = glow_ops.encoder_decoder(
          "codec", self.z_sample, self.hparams, eps=None, reverse=True,
          cond_latents=cond_level_latents, states=self.level_states,
          condition=cond_level, temperature=self.temperature)
      predicted_frame, _, curr_latents, self.level_states = glow_vals
      all_frames.append(predicted_frame)
      self.all_level_latents.append(curr_latents)

      # Compute z^0_{t+1} = f(z^0_{t}).
      if target_frame < num_target_frames:
        cond_top, cond_top_latents = get_cond_latents(
            self.all_top_latents, self.hparams)
        prior_dist = self.top_prior(
            condition=cond_top, cond_latents=cond_top_latents)
        self.z_sample = prior_dist.sample()
        self.all_top_latents.append(self.z_sample)

  all_frames = tf.stack(all_frames)
  predicted_video = common_video.swap_time_and_batch_axes(all_frames)

  # The video-decode API requires the predicted video to be the same shape
  # as the target video. Hence, for unconditional generation, tile across
  # time to ensure the same shape.
  if self.hparams.gen_mode == "unconditional":
    predicted_video = tf.tile(
        predicted_video, [1, self.hparams.video_num_target_frames, 1, 1, 1])
  predicted_video = glow_ops.postprocess(predicted_video)

  # Output of a single decode / sample.
  output_features = {}
  output_features["targets"] = tf.zeros_like(predicted_video)
  output_features["outputs"] = predicted_video
  output_features["scores"] = tf.zeros_like(predicted_video)
  return output_features