def testTrainingLossIsNotNan(self):
  """Tests a minimal Keras training loop for the dynamics model."""
  observed_keypoints = np.random.RandomState(0).normal(size=(
      self.cfg.batch_size,
      self.cfg.observed_steps + self.cfg.predicted_steps,
      self.cfg.num_keypoints, 3))
  model = dynamics.build_vrnn(self.cfg)
  model.add_loss(tf.nn.l2_loss(model.inputs[0] - model.outputs[0]))  # KP loss
  model.add_loss(tf.reduce_mean(model.outputs[1]))  # KL loss
  model.compile(tf.keras.optimizers.Adam(lr=1e-5))
  history = model.fit(x=observed_keypoints, steps_per_epoch=1, epochs=1)
  self.assertFalse(
      np.any(np.isnan(history.history['loss'])),
      'Loss contains nans: {}'.format(history.history['loss']))
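
# The method above assumes an enclosing tf.test.TestCase that provides
# self.cfg. A minimal sketch of such a harness follows; the field values are
# illustrative placeholders (not the repo's defaults), SimpleNamespace merely
# stands in for the real ConfigDict, and dynamics.build_vrnn will likely
# require additional config fields not shown here. The import path is assumed
# from the repo layout.

import types

import numpy as np
import tensorflow as tf

from video_structure import dynamics  # Assumed import path.


class DynamicsTest(tf.test.TestCase):

  def setUp(self):
    super(DynamicsTest, self).setUp()
    # Hypothetical minimal config; the real code builds this from the repo's
    # hyperparameter definitions.
    self.cfg = types.SimpleNamespace(
        batch_size=4,
        observed_steps=2,
        predicted_steps=2,
        num_keypoints=3)

  # testTrainingLossIsNotNan (above) would live here.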
def build_model(cfg, data_shapes):
  """Builds the complete model with image encoder plus dynamics model.

  This architecture is meant for testing/illustration only.

  Model architecture:

    image_sequence --> keypoints --> reconstructed_image_sequence
                          |
                          V
                   dynamics_model --> predicted_keypoints

  The model takes a [batch_size, timesteps, H, W, C] image sequence as input.
  It "observes" all frames, detects keypoints, and reconstructs the images.
  The dynamics model learns to predict future keypoints based on the detected
  keypoints.

  Args:
    cfg: ConfigDict with model hyperparameters.
    data_shapes: Dict of shapes of model input tensors, as returned by
      datasets.get_sequence_dataset.

  Returns:
    tf.keras.Model object.
  """
  input_shape_no_batch = data_shapes['image'][1:]  # Keras uses shape w/o batch.
  input_images = tf.keras.Input(shape=input_shape_no_batch, name='image')

  # Vision model:
  observed_keypoints, _ = vision.build_images_to_keypoints_net(
      cfg, input_shape_no_batch)(input_images)
  keypoints_to_images_net = vision.build_keypoints_to_images_net(
      cfg, input_shape_no_batch)
  reconstructed_images = keypoints_to_images_net([
      observed_keypoints, input_images[:, 0, Ellipsis],
      observed_keypoints[:, 0, Ellipsis]
  ])

  # Dynamics model:
  observed_keypoints_stop = tf.keras.layers.Lambda(
      tf.stop_gradient)(observed_keypoints)
  dynamics_model = dynamics.build_vrnn(cfg)
  predicted_keypoints, kl_divergence = dynamics_model(observed_keypoints_stop)

  model = tf.keras.Model(
      inputs=[input_images],
      outputs=[reconstructed_images, observed_keypoints, predicted_keypoints],
      name='autoencoder')

  # Losses:
  image_loss = tf.nn.l2_loss(input_images - reconstructed_images)
  # Normalize by batch size and sequence length:
  image_loss /= tf.to_float(
      tf.shape(input_images)[0] * tf.shape(input_images)[1])
  model.add_loss(image_loss)

  separation_loss = losses.temporal_separation_loss(
      cfg, observed_keypoints[:, :cfg.observed_steps, Ellipsis])
  model.add_loss(cfg.separation_loss_scale * separation_loss)

  vrnn_coord_pred_loss = tf.nn.l2_loss(
      observed_keypoints_stop - predicted_keypoints)
  # Normalize by batch size and sequence length:
  vrnn_coord_pred_loss /= tf.to_float(
      tf.shape(input_images)[0] * tf.shape(input_images)[1])
  model.add_loss(vrnn_coord_pred_loss)

  kl_loss = tf.reduce_mean(kl_divergence)  # Mean over batch and timesteps.
  model.add_loss(cfg.kl_loss_scale * kl_loss)

  return model
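
# A minimal usage sketch for build_model. Everything here is illustrative:
# train_smoke_test is a hypothetical helper, the 64x64x3 image shape and the
# cfg.learning_rate field are assumptions, and in the real pipeline cfg and
# data_shapes would come from the repo's hyperparameter definitions and
# datasets.get_sequence_dataset respectively. Because all losses were attached
# via model.add_loss, compile() takes only an optimizer and fit() receives no
# targets; this mirrors the dynamics-model test above.


def train_smoke_test(cfg):  # Hypothetical helper, not part of the repo.
  # Assumed [batch, timesteps, H, W, C] image shape; batch dim is None.
  data_shapes = {
      'image': (None, cfg.observed_steps + cfg.predicted_steps, 64, 64, 3)
  }
  model = build_model(cfg, data_shapes)
  model.compile(tf.keras.optimizers.Adam(lr=cfg.learning_rate))
  # Random images stand in for a real dataset in this smoke test:
  images = np.random.RandomState(0).uniform(
      size=(cfg.batch_size,) + data_shapes['image'][1:]).astype(np.float32)
  history = model.fit(x=images, steps_per_epoch=1, epochs=1)
  return history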