def generator(self, z, is_training, out_shape):
  """Generator outputting image in [0, 1]."""
  hparams = self.hparams
  height, width, c_dim = out_shape
  batch_size = hparams.batch_size
  with tf.variable_scope(
      "generator", initializer=tf.random_normal_initializer(stddev=0.02)):
    net = tf.layers.dense(z, 1024, name="g_fc1")
    net = tf.layers.batch_normalization(net, training=is_training,
                                        momentum=0.999, name="g_bn1")
    net = lrelu(net)
    net = tf.layers.dense(net, 128 * (height // 4) * (width // 4),
                          name="g_fc2")
    net = tf.layers.batch_normalization(net, training=is_training,
                                        momentum=0.999, name="g_bn2")
    net = lrelu(net)
    net = tf.reshape(net, [batch_size, height // 4, width // 4, 128])
    net = deconv2d(net, [batch_size, height // 2, width // 2, 64],
                   4, 4, 2, 2, name="g_dc3")
    net = tf.layers.batch_normalization(net, training=is_training,
                                        momentum=0.999, name="g_bn3")
    net = lrelu(net)
    net = deconv2d(net, [batch_size, height, width, c_dim],
                   4, 4, 2, 2, name="g_dc4")
    out = tf.nn.sigmoid(net)
    return common_layers.convert_real_to_rgb(out)
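# The generator above assumes `lrelu` and `deconv2d` helpers that are not
# defined in this section. Below is a minimal sketch of plausible TF1
# implementations consistent with the call sites (kernel 4x4, stride 2x2);
# the names and signatures here are assumptions, not the actual helpers.
def lrelu(x, leak=0.2, name="lrelu"):
  """Leaky ReLU: max(x, leak * x)."""
  return tf.maximum(x, leak * x, name=name)


def deconv2d(x, output_shape, k_h, k_w, d_h, d_w, stddev=0.02,
             name="deconv2d"):
  """Transposed convolution with a k_h x k_w kernel and (d_h, d_w) strides."""
  with tf.variable_scope(name):
    # Filter shape for conv2d_transpose is [h, w, out_channels, in_channels].
    w = tf.get_variable(
        "w", [k_h, k_w, output_shape[-1], x.get_shape()[-1]],
        initializer=tf.random_normal_initializer(stddev=stddev))
    return tf.nn.conv2d_transpose(
        x, w, output_shape=output_shape, strides=[1, d_h, d_w, 1])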
def top(self, body_output, _):
  frames = body_output
  if isinstance(body_output, list):
    frames = tf.stack(body_output, axis=1)
  rgb_frames = common_layers.convert_real_to_rgb(frames)
  common_video.gif_summary("body_output", rgb_frames)
  return tf.expand_dims(rgb_frames, axis=-1)
def get_sampled_frame(self, pred_frame):
  """Samples the frame based on modality.

  If the modality uses L2/L1 loss, the next predicted frame is simply the
  prediction itself and there is no sampling, but with a softmax loss the
  next frame must be sampled from the predicted distribution. This enables
  multi-frame target prediction with softmax loss.

  Args:
    pred_frame: predicted frame.

  Returns:
    sampled frame.
  """
  # TODO(lukaszkaiser): the logic below heavily depends on the current
  # (a bit strange) video modalities - we should change that.
  if self.is_per_pixel_softmax:
    frame_shape = common_layers.shape_list(pred_frame)
    target_shape = frame_shape[:-1] + [self.hparams.problem.num_channels]
    sampled_frame = tf.reshape(pred_frame, target_shape + [256])
    sampled_frame = pixels_from_softmax(
        sampled_frame, temperature=self.hparams.pixel_sampling_temperature)
    # TODO(lukaszkaiser): this should be consistent with modality.bottom()
    sampled_frame = common_layers.standardize_images(sampled_frame)
  else:
    x = common_layers.convert_real_to_rgb(pred_frame)
    # Straight-through rounding: the forward value is tf.round(x) while the
    # gradient flows through as identity (the original `x + tf.round(x)`
    # inside stop_gradient had the sign flipped and produced -round(x)).
    x = x - tf.stop_gradient(x - tf.round(x))
    x = common_layers.convert_rgb_to_real(x)
    return x
  return sampled_frame
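# A minimal sketch of the straight-through rounding used above (the helper
# name is illustrative, not part of the codebase): forward evaluates to
# tf.round(x), while backpropagation treats the op as the identity, so the
# rounded pixels stay differentiable with respect to the prediction.
def straight_through_round(x):
  return x - tf.stop_gradient(x - tf.round(x))  # forward: round(x); grad: 1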
def body(self, features):
  hparams = self.hparams
  is_training = hparams.mode == tf.estimator.ModeKeys.TRAIN
  if hparams.mode != tf.estimator.ModeKeys.PREDICT:
    x = features["targets"]
    shape = common_layers.shape_list(x)
    is1d = shape[2] == 1
    self.is1d = is1d
    # Run encoder.
    x = self.encoder(x)
    # Bottleneck (mix during early training, not too important but stable).
    b, b_loss = self.bottleneck(x)
    self._cur_bottleneck_tensor = b
    b = self.unbottleneck(b, common_layers.shape_list(x)[-1])
    b = common_layers.mix(b, x, hparams.bottleneck_warmup_steps, is_training)
    if hparams.gan_loss_factor != 0.0:
      # Add a purely sampled batch on which we'll compute the GAN loss.
      g = self.unbottleneck(
          self.sample(), common_layers.shape_list(x)[-1], reuse=True)
      b = tf.concat([g, b], axis=0)
    # With probability bottleneck_max_prob use the bottleneck, otherwise x
    # (the original comparison against -1.0 was always false, which always
    # used the bottleneck and contradicted the comment above).
    if hparams.bottleneck_max_prob < 1.0:
      x = tf.where(
          tf.less(tf.random_uniform([]), hparams.bottleneck_max_prob), b, x)
    else:
      x = b
  else:
    if self._cur_bottleneck_tensor is None:
      b = self.sample()
    else:
      b = self._cur_bottleneck_tensor
    res_size = self.hparams.hidden_size * 2**self.hparams.num_hidden_layers
    res_size = min(res_size, hparams.max_hidden_size)
    x = self.unbottleneck(b, res_size)
  # Run decoder.
  x = self.decoder(x)
  if hparams.mode == tf.estimator.ModeKeys.PREDICT:
    return x, {"bottleneck_loss": 0.0}
  # Cut to the right size and mix before returning.
  res = x[:, :shape[1], :shape[2], :]

  # Add GAN loss if requested.
  gan_loss = 0.0
  if hparams.gan_loss_factor != 0.0:
    # Split back if we added a purely sampled batch.
    res_gan, res = tf.split(res, 2, axis=0)
    num_channels = self.hparams.problem.num_channels
    res_rgb = common_layers.convert_real_to_rgb(
        tf.nn.sigmoid(tf.layers.dense(res_gan, num_channels, name="gan_rgb")))
    tf.summary.image(
        "gan", common_layers.tpu_safe_image_summary(res_rgb), max_outputs=1)
    orig_rgb = tf.to_float(features["targets_raw"])

    def discriminate(x):
      return self.discriminator(x, is_training=is_training)

    gan_loss = common_layers.sliced_gan_loss(orig_rgb,
                                             reverse_gradient(res_rgb),
                                             discriminate,
                                             self.hparams.num_sliced_vecs)
    gan_loss *= hparams.gan_loss_factor

  # Mix the final result and return.
  res = common_layers.mix(res, features["targets"],
                          hparams.bottleneck_warmup_steps // 2, is_training)
  return res, {"bottleneck_loss": b_loss, "gan_loss": -gan_loss}
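# `body` above calls a `reverse_gradient` helper that is not shown in this
# section. The standard gradient-reversal trick in TF1 looks like the sketch
# below (an assumption about this codebase, but the forward/backward behavior
# is exactly what sliced_gan_loss needs to train generator and discriminator
# adversarially with a single optimizer).
def reverse_gradient(x):
  return -x + tf.stop_gradient(2 * x)  # forward: x; backward: negated grad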
def top(self, body_output, _):
  frames = tf.stack(body_output, axis=1)
  rgb_frames = common_layers.convert_real_to_rgb(frames)
  common_layers.summarize_video(rgb_frames, "body_output")
  # TODO(lukaszkaiser): remove the need for the last dimension of 1 in eval.
  return tf.expand_dims(rgb_frames, axis=-1)
def process(self, inputs, targets):
  all_frames = tf.unstack(inputs, axis=1) + tf.unstack(targets, axis=1)
  hparams = self.hparams
  batch_size = common_layers.shape_list(all_frames[0])[0]
  z_dim = hparams.z_dim
  g_dim = hparams.g_dim
  rnn_size = hparams.rnn_size
  prior_rnn_layers = hparams.prior_rnn_layers
  posterior_rnn_layers = hparams.posterior_rnn_layers
  predictor_rnn_layers = hparams.predictor_rnn_layers
  num_input_frames = hparams.num_input_frames
  num_target_frames = hparams.num_target_frames
  num_all_frames = num_input_frames + num_target_frames

  # Create the RNN cells.
  predictor_cell = self.rnn_model(rnn_size, "predictor",
                                  n_layers=predictor_rnn_layers)
  prior_cell = self.rnn_model(rnn_size, "prior", n_layers=prior_rnn_layers)
  posterior_cell = self.rnn_model(rnn_size, "posterior",
                                  n_layers=posterior_rnn_layers)

  # Get the initial RNN states.
  predictor_state = predictor_cell.zero_state(batch_size, tf.float32)
  prior_state = prior_cell.zero_state(batch_size, tf.float32)
  posterior_state = posterior_cell.zero_state(batch_size, tf.float32)

  # Encode all frames at training time; only the input frames at inference.
  enc_frames, enc_skips = [], []
  frames_to_encode = (
      all_frames if self.is_training else all_frames[:num_input_frames])
  for frame in frames_to_encode:
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
      enc, skip = self.encoder(frame)
      enc_frames.append(enc)
      enc_skips.append(skip)

  # Predict the frames.
  prior_mus = []
  prior_logvars = []
  posterior_mus = []
  posterior_logvars = []
  predicted_frames = []
  z_positions = []
  skip = None
  if self.is_training:
    for i in range(1, num_all_frames):
      h = enc_frames[i - 1]
      h_target = enc_frames[i]
      if i < num_input_frames:
        skip = enc_skips[i - 1]
      with tf.variable_scope("prediction", reuse=tf.AUTO_REUSE):
        mu, log_var, posterior_state = self.gaussian_rnn(
            posterior_cell, h_target, posterior_state, z_dim, "posterior")
        mu_p, log_var_p, prior_state = self.gaussian_rnn(
            prior_cell, h, prior_state, z_dim, "prior")
        z = utils.get_gaussian_tensor(mu, log_var)
        h_pred, predictor_state = self.deterministic_rnn(
            predictor_cell, tf.concat([h, z], axis=1), predictor_state,
            g_dim, "predictor")
      with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        x_pred = self.decoder(h_pred, skip)
      predicted_frames.append(x_pred)
      prior_mus.append(mu_p)
      prior_logvars.append(log_var_p)
      posterior_mus.append(mu)
      posterior_logvars.append(log_var)
      z_positions.append(z)
  else:
    for i in range(1, num_all_frames):
      if i < num_input_frames:
        h = enc_frames[i - 1]
        skip = enc_skips[i - 1]
      else:
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
          h, _ = self.encoder(predicted_frames[-1])
      mu = log_var = mu_p = log_var_p = None
      if i < num_input_frames:
        h_target = enc_frames[i]
        with tf.variable_scope("prediction", reuse=tf.AUTO_REUSE):
          mu, log_var, posterior_state = self.gaussian_rnn(
              posterior_cell, h_target, posterior_state, z_dim, "posterior")
          mu_p, log_var_p, prior_state = self.gaussian_rnn(
              prior_cell, h, prior_state, z_dim, "prior")
          z = utils.get_gaussian_tensor(mu, log_var)
          _, predictor_state = self.deterministic_rnn(
              predictor_cell, tf.concat([h, z], axis=1), predictor_state,
              g_dim, "predictor")
        x_pred = all_frames[i]
      else:
        with tf.variable_scope("prediction", reuse=tf.AUTO_REUSE):
          mu_p, log_var_p, prior_state = self.gaussian_rnn(
              prior_cell, h, prior_state, z_dim, "prior")
          z = utils.get_gaussian_tensor(mu_p, log_var_p)
          h_pred, predictor_state = self.deterministic_rnn(
              predictor_cell, tf.concat([h, z], axis=1), predictor_state,
              g_dim, "predictor")
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
          x_pred = self.decoder(h_pred, skip)
      predicted_frames.append(x_pred)
      prior_mus.append(mu_p)
      prior_logvars.append(log_var_p)
      posterior_mus.append(mu)
      posterior_logvars.append(log_var)
      z_positions.append(z)

  # Reconstruction loss over all predicted frames.
  recon_loss = l2_loss(tf.stack(predicted_frames),
                       tf.stack(all_frames[1:])) * (num_all_frames - 1)
  # KL loss (only defined at training time, when the posterior is used).
  kl_loss = 0
  if self.is_training:
    kl_loss = self.get_kl_loss(posterior_mus, posterior_logvars,
                               prior_mus, prior_logvars)

  pred_outputs = tf.stack(predicted_frames[num_input_frames - 1:], axis=1)

  rgb_frames = tf.tile(
      common_layers.convert_real_to_rgb(tf.stack(predicted_frames, axis=1)),
      [1, 1, 1, 1, 3])
  all_frames = tf.stack(all_frames, axis=1)
  all_frames_rgb = tf.tile(
      common_layers.convert_real_to_rgb(all_frames), [1, 1, 1, 1, 3])
  common_video.gif_summary("body_output", rgb_frames)
  common_video.gif_summary("all_ground_frames", all_frames_rgb)
  tf.summary.scalar("kl_loss", kl_loss)
  tf.summary.scalar("recon_loss", recon_loss)

  loss = recon_loss + kl_loss
  return pred_outputs, loss, tf.stack(z_positions, axis=1)
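# `process` relies on two standard VAE pieces that this section assumes but
# does not show: the reparameterization trick behind
# `utils.get_gaussian_tensor`, and a KL divergence between the
# diagonal-Gaussian posterior and prior (one plausible building block of
# `get_kl_loss`). The sketches below are minimal illustrations under those
# assumptions, not the actual implementations.
def get_gaussian_tensor(mu, log_var):
  """Sample z ~ N(mu, exp(log_var)) via z = mu + sigma * eps."""
  eps = tf.random_normal(common_layers.shape_list(mu))
  return mu + tf.exp(log_var / 2.0) * eps


def gaussian_kl(mu_q, log_var_q, mu_p, log_var_p):
  """KL(N(mu_q, var_q) || N(mu_p, var_p)) for diagonal Gaussians."""
  return 0.5 * tf.reduce_sum(
      log_var_p - log_var_q
      + (tf.exp(log_var_q) + tf.square(mu_q - mu_p)) / tf.exp(log_var_p)
      - 1.0)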