def glow_encoder(self, frame, condition=False, cond_latents=None, init=False):
    """Encode a frame into a hierarchy of Glow latents.

    Args:
        frame: 5-D Tensor of shape (batch_size, 1, height, width, channels).
        condition: Whether or not to condition on cond_latents.
        cond_latents: optional list of tensors whose length equals
            hparams.n_levels - 1. If provided, the latent at level l is
            conditioned on the cond_latent at level l.
        init: Whether the given batch is an "init" batch or a "train" batch.
            It is only forwarded to self.squeeze_video here.

    Returns:
        objective: log-likelihood of the frame per the model.
        z_top: top-level latent.
        z_levels: a list of tensors with latents at all levels.

    Side effects:
        Overwrites self.eps and self.level_states with the values returned
        by the codec.
    """
    squeezed = self.squeeze_video(frame, init=init)
    preprocessed = self.preprocess(squeezed)
    codec_input, total_objective = glow_ops.uniform_binning_correction(
        preprocessed)

    # The current per-level states are passed in and the states returned by
    # the codec are stored back on the instance.
    (z_top, encoder_objective, self.eps, z_levels,
     self.level_states) = glow_ops.encoder_decoder(
         "codec", codec_input, self.hparams,
         eps=None, reverse=False,
         cond_latents=cond_latents,
         states=self.level_states,
         condition=condition)

    total_objective += encoder_objective
    return total_objective, z_top, z_levels
def body(self, features):
    """Build the training objective, expressed in bits per pixel.

    Args:
        features: dict of tensors; the "inputs" and "targets" entries are
            read.

    Returns:
        A pair (zeros shaped like features["targets"],
        {"training": objective}), where objective is the negated total
        log-likelihood divided by log(2) * h * w * c.

    Side effects:
        Sets self.z, self.eps and self.z_sample, and records a "top_prior"
        scalar summary.
    """
    # Scale the inputs such that the pixels lie in-between -0.5 and 0.5.
    images = self.preprocess(features["inputs"])
    images, log_likelihood = glow_ops.uniform_binning_correction(images)

    # The arg_scope call ensures that the actnorm parameters are set such
    # that the per-channel output activations have zero mean and unit
    # variance ONLY during the first step. After that the parameters are
    # learned through optimisation.
    step = tf.train.get_or_create_global_step()
    at_first_step = tf.equal(step, 0)
    init_op = tf.logical_and(at_first_step, self.is_training)

    # NOTE(review): the prior is built inside the arg_scope and the
    # bits-per-pixel normalisation outside it -- confirm the scope extent
    # against the upstream file.
    with arg_scope([glow_ops.get_variable_ddi, glow_ops.actnorm],
                   init=init_op):
        # NOTE(review): four values are unpacked here; confirm this matches
        # the arity returned by the glow_ops.encoder_decoder in use.
        codec_out = glow_ops.encoder_decoder(
            "codec", images, self.hparams, eps=None, reverse=False)
        self.z, encoder_objective, self.eps, _ = codec_out
        log_likelihood += encoder_objective

        prior_objective, prior_dist = self.top_prior(self.z)
        mean_prior_objective = tf.reduce_mean(prior_objective)
        tf.summary.scalar("top_prior", mean_prior_objective)
        self.z_sample = prior_dist.sample()
        log_likelihood += prior_objective

    # bits per pixel
    _, h, w, c = common_layers.shape_list(images)
    negated = -log_likelihood
    bits_per_pixel = negated / (np.log(2) * h * w * c)

    placeholder_output = tf.zeros_like(features["targets"])
    return placeholder_output, {"training": bits_per_pixel}
def preprocess_frame(frame):
    """Preprocess a frame for the Glow model.

    Steps:
        1. Converts [0, 255] pixel values to [-0.5, 0.5].
        2. Adds uniform noise via glow_ops.uniform_binning_correction.

    Args:
        frame: 3-D Tensor representing pixels.

    Returns:
        frame: 3-D Tensor with values roughly in [-0.5, 0.5].
            NOTE(review): the noise is added after centring, so values may
            slightly leave that interval -- confirm against glow_ops.
    """
    # convert_rgb_to_real is expected to yield [0.0, 1.0]; subtracting 0.5
    # re-centres that to [-0.5, 0.5].
    centred = common_layers.convert_rgb_to_real(frame) - 0.5
    # The objective term returned with the corrected frame is not needed.
    noisy, _ = glow_ops.uniform_binning_correction(centred)
    return noisy
def objective_tower(self, features, init=True):
    """Compute the objective in terms of bits-per-pixel.

    Args:
        features: dict of tensors; only the "inputs" entry is read here.
        init: Whether or not to run data-dependent init. It is forwarded
            through arg_scope to glow_ops.get_variable_ddi,
            glow_ops.actnorm and glow_ops.get_dropout.

    Returns:
        objective: bits-per-pixel, i.e. the negated total log-likelihood
            divided by log(2) * h * w * c.

    Side effects:
        Sets self.z, self.eps, self.z_top_shape and self.z_sample.
    """
    # Scale the inputs such that the pixels lie in-between -0.5 and 0.5.
    images = self.preprocess(features["inputs"])
    images, log_likelihood = glow_ops.uniform_binning_correction(images)

    # With init enabled, the arg_scope makes the actnorm parameters be set
    # such that the per-channel output activations have zero mean and unit
    # variance. Otherwise the parameters are learned through optimisation.
    # NOTE(review): the prior is built inside the arg_scope and the
    # bits-per-pixel normalisation outside it -- confirm the scope extent
    # against the upstream file.
    scoped_ops = [
        glow_ops.get_variable_ddi,
        glow_ops.actnorm,
        glow_ops.get_dropout,
    ]
    with arg_scope(scoped_ops, init=init):
        codec_out = glow_ops.encoder_decoder(
            "codec", images, self.hparams, eps=None, reverse=False)
        self.z, encoder_objective, self.eps, _, _ = codec_out
        log_likelihood += encoder_objective

        self.z_top_shape = common_layers.shape_list(self.z)
        prior_dist = self.top_prior()
        z_log_prob = prior_dist.log_prob(self.z)
        prior_objective = tf.reduce_sum(z_log_prob, axis=[1, 2, 3])
        self.z_sample = prior_dist.sample()
        log_likelihood += prior_objective

    # bits per pixel
    _, h, w, c = common_layers.shape_list(images)
    negated = -log_likelihood
    return negated / (np.log(2) * h * w * c)
def objective_tower(self, features, init=True):
    """Compute the objective in terms of bits-per-pixel.

    Args:
        features: dict of tensors; only the "inputs" entry is read here.
        init: Whether or not to run data-dependent init. It is forwarded
            through arg_scope to glow_ops.get_variable_ddi and
            glow_ops.actnorm.

    Returns:
        objective: bits-per-pixel, i.e. the negated total log-likelihood
            divided by log(2) * h * w * c.

    Side effects:
        Sets self.z, self.eps, self.z_top_shape and self.z_sample.
    """
    # Scale the inputs such that the pixels lie in-between -0.5 and 0.5.
    pixels = self.preprocess(features["inputs"])
    pixels, total = glow_ops.uniform_binning_correction(pixels)

    # With init enabled, the arg_scope makes the actnorm parameters be set
    # such that the per-channel output activations have zero mean and unit
    # variance. Otherwise the parameters are learned through optimisation.
    # NOTE(review): the prior is built inside the arg_scope and the
    # bits-per-pixel normalisation outside it -- confirm the scope extent
    # against the upstream file.
    with arg_scope([glow_ops.get_variable_ddi, glow_ops.actnorm], init=init):
        encode = glow_ops.encoder_decoder
        self.z, encoder_objective, self.eps, _, _ = encode(
            "codec", pixels, self.hparams, eps=None, reverse=False)
        total += encoder_objective

        self.z_top_shape = common_layers.shape_list(self.z)
        prior_dist = self.top_prior()
        per_element_log_prob = prior_dist.log_prob(self.z)
        prior_objective = tf.reduce_sum(per_element_log_prob, axis=[1, 2, 3])
        self.z_sample = prior_dist.sample()
        total += prior_objective

    # bits per pixel
    _, h, w, c = common_layers.shape_list(pixels)
    negative_log_likelihood = -total
    bits_per_pixel = negative_log_likelihood / (np.log(2) * h * w * c)
    return bits_per_pixel