def encoder_model(frames,
                  sequence_length,
                  initializer,
                  scope='encoder',
                  fc_conv_layer=False):
    """Encode a video clip frame by frame with a conv / ConvLSTM stack.

    At every time step one frame is passed through five stride-2
    convolutions that alternate with five ConvLSTM cells. Variables are
    created at the first time step and reused afterwards; the ConvLSTM
    states are carried over from one time step to the next.

    Args:
      frames: 5D tensor of batched videos, indexed as
        (batch_size, num_frames, frame_width, frame_height, num_channels).
      sequence_length: number of leading frames to encode; must be >= 1.
      initializer: weights initializer handed to every conv layer and
        ConvLSTM cell.
      scope: tensorflow variable scope name.
      fc_conv_layer: if True, a 4x4 VALID convolution with FC_LAYER_SIZE
        output channels is applied on top of the last ConvLSTM output.

    Returns:
      The representation computed at the last time step: the output of the
      'fc_conv' layer when fc_conv_layer is True, otherwise the
      layer-normalised output of the fifth ConvLSTM cell.

    Raises:
      ValueError: if sequence_length is smaller than 1 (no time step would
        run, so there would be nothing to return).
    """
    if sequence_length < 1:
        raise ValueError(
            'sequence_length must be at least 1, got %r' % (sequence_length,))

    lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5 = None, None, None, None, None

    for i in range(sequence_length):

        frame = frames[:, i, :, :, :]

        # create the variables at the first time step, share them afterwards
        reuse = (i > 0)

        with tf.variable_scope(scope, reuse=reuse):
            #LAYER 1: conv1
            conv1 = slim.layers.conv2d(
                frame,
                16, [5, 5],
                stride=2,
                scope='conv1',
                normalizer_fn=tf_layers.layer_norm,
                weights_initializer=initializer,
                normalizer_params={'scope': 'layer_norm1'})

            #LAYER 2: convLSTM1
            hidden1, lstm_state1 = basic_conv_lstm_cell(conv1,
                                                        lstm_state1,
                                                        16,
                                                        initializer,
                                                        filter_size=5,
                                                        scope='convlstm1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            #LAYER 3: conv2 (keeps the channel count of its input)
            conv2 = slim.layers.conv2d(
                hidden1,
                hidden1.get_shape()[3], [5, 5],
                stride=2,
                scope='conv2',
                normalizer_fn=tf_layers.layer_norm,
                weights_initializer=initializer,
                normalizer_params={'scope': 'layer_norm3'})

            #LAYER 4: convLSTM2
            hidden2, lstm_state2 = basic_conv_lstm_cell(conv2,
                                                        lstm_state2,
                                                        16,
                                                        initializer,
                                                        filter_size=5,
                                                        scope='convlstm2')
            hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4')

            #LAYER 5: conv3
            conv3 = slim.layers.conv2d(
                hidden2,
                hidden2.get_shape()[3], [5, 5],
                stride=2,
                scope='conv3',
                normalizer_fn=tf_layers.layer_norm,
                weights_initializer=initializer,
                normalizer_params={'scope': 'layer_norm5'})

            #LAYER 6: convLSTM3
            hidden3, lstm_state3 = basic_conv_lstm_cell(conv3,
                                                        lstm_state3,
                                                        16,
                                                        initializer,
                                                        filter_size=3,
                                                        scope='convlstm3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6')

            #LAYER 7: conv4
            conv4 = slim.layers.conv2d(
                hidden3,
                hidden3.get_shape()[3], [3, 3],
                stride=2,
                scope='conv4',
                normalizer_fn=tf_layers.layer_norm,
                weights_initializer=initializer,
                normalizer_params={'scope': 'layer_norm7'})

            #LAYER 8: convLSTM4
            # (8x8 feature map size - presumably for 128x128 input frames, TODO confirm)
            hidden4, lstm_state4 = basic_conv_lstm_cell(conv4,
                                                        lstm_state4,
                                                        32,
                                                        initializer,
                                                        filter_size=3,
                                                        scope='convlstm4')
            hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm8')

            #LAYER 9: conv5
            conv5 = slim.layers.conv2d(
                hidden4,
                hidden4.get_shape()[3], [3, 3],
                stride=2,
                scope='conv5',
                normalizer_fn=tf_layers.layer_norm,
                weights_initializer=initializer,
                normalizer_params={'scope': 'layer_norm9'})

            #LAYER 10: convLSTM5
            hidden5, lstm_state5 = basic_conv_lstm_cell(conv5,
                                                        lstm_state5,
                                                        32,
                                                        initializer,
                                                        filter_size=3,
                                                        scope='convlstm5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm10')

            #LAYER 11 (optional): Fully Convolutional Layer with FC_LAYER_SIZE channels
            # A 4x4 VALID convolution; it yields a 1x1 map only if hidden5 is
            # 4x4 spatially - assumes 128x128 input frames, TODO confirm.
            if fc_conv_layer:
                fc_conv = slim.layers.conv2d(hidden5,
                                             FC_LAYER_SIZE, [4, 4],
                                             stride=1,
                                             scope='fc_conv',
                                             padding='VALID',
                                             weights_initializer=initializer)
                hidden_repr = fc_conv
            else:
                hidden_repr = hidden5

    # only the representation of the last encoded frame is handed back
    return hidden_repr
def decoder_model(hidden_repr,
                  sequence_length,
                  initializer,
                  num_channels=3,
                  scope='decoder',
                  fc_conv_layer=False):
    """Decode a latent representation into a sequence of frames.

    The same hidden_repr is fed into the network at every time step. Each
    step runs five ConvLSTM cells that alternate with five stride-2
    transposed convolutions. Variables are created at the first time step
    and reused afterwards; the ConvLSTM states are carried across steps.

    Args:
      hidden_repr: tensor holding the latent space representation.
      sequence_length: number of frames to decode from hidden_repr.
      initializer: weights initializer handed to every (transposed) conv
        layer and ConvLSTM cell.
      num_channels: number of channels of the generated frames.
      scope: tensorflow variable scope name.
      fc_conv_layer: if True, hidden_repr must have static height and width
        of 1 (asserted) and is first expanded by a 4x4 VALID transposed
        convolution with 32 output channels.

    Returns:
      List with sequence_length tensors, one generated frame (the output of
      the 'upconv5' layer) per time step.
    """

    def _conv_lstm(inputs, state, num_filters, kernel, cell_scope, norm_scope):
        # One ConvLSTM step followed by layer normalisation of its output.
        output, next_state = basic_conv_lstm_cell(inputs,
                                                  state,
                                                  num_filters,
                                                  initializer,
                                                  filter_size=kernel,
                                                  scope=cell_scope)
        return tf_layers.layer_norm(output, scope=norm_scope), next_state

    def _upconv(inputs, num_filters, kernel, conv_scope, norm_scope):
        # Stride-2 transposed convolution with layer normalisation.
        return slim.layers.conv2d_transpose(
            inputs,
            num_filters,
            kernel,
            stride=2,
            scope=conv_scope,
            weights_initializer=initializer,
            normalizer_fn=tf_layers.layer_norm,
            normalizer_params={'scope': norm_scope})

    # the fully convolutional input layer needs a 1x1xdepth representation
    assert (not fc_conv_layer) or (hidden_repr.get_shape()[1] ==
                                   hidden_repr.get_shape()[2] == 1)

    state1 = state2 = state3 = state4 = state5 = None
    generated = []

    for step in range(sequence_length):
        # variables are shared (recurrence) from the second time step on
        with tf.variable_scope(scope, reuse=(step > 0)):

            # optional Fully Convolutional Layer (expands the 1x1 representation)
            if fc_conv_layer:
                net = slim.layers.conv2d_transpose(
                    hidden_repr,
                    32, [4, 4],
                    stride=1,
                    scope='fc_conv',
                    padding='VALID',
                    weights_initializer=initializer)
            else:
                net = hidden_repr

            # convLSTM1 -> upconv1 (upconv keeps the channel count of its input)
            net, state1 = _conv_lstm(net, state1, 32, 3,
                                     'convlstm1', 'layer_norm1')
            net = _upconv(net, net.get_shape()[3], 3, 'upconv1', 'layer_norm2')

            # convLSTM2 -> upconv2
            net, state2 = _conv_lstm(net, state2, 32, 3,
                                     'convlstm2', 'layer_norm3')
            net = _upconv(net, net.get_shape()[3], 3, 'upconv2', 'layer_norm4')

            # convLSTM3 -> upconv3
            net, state3 = _conv_lstm(net, state3, 16, 3,
                                     'convlstm3', 'layer_norm5')
            net = _upconv(net, net.get_shape()[3], 5, 'upconv3', 'layer_norm6')

            # convLSTM4 -> upconv4 (fixed 16 output channels)
            net, state4 = _conv_lstm(net, state4, 16, 5,
                                     'convlstm4', 'layer_norm7')
            net = _upconv(net, 16, 5, 'upconv4', 'layer_norm8')

            # convLSTM5 -> upconv5 (output layer, no normalisation)
            net, state5 = _conv_lstm(net, state5, 16, 5,
                                     'convlstm5', 'layer_norm9')
            frame = slim.layers.conv2d_transpose(
                net,
                num_channels,
                5,
                stride=2,
                scope='upconv5',
                weights_initializer=initializer)

            generated.append(frame)

    assert len(generated) == sequence_length
    return generated
def encoder_model(frames, sequence_length, initializer, keep_prob_dropout=0.9, scope='encoder', fc_conv_layer=False):
  """Variational encoder: encodes a video clip frame by frame and samples a latent code.

  At every time step one frame is passed through five stride-2 convolutions
  that alternate with five ConvLSTM cells, then through a 4x4 VALID
  convolution ('fc_conv') and a sixth ConvLSTM cell with 1x1 filters.
  Variables are created at the first time step and reused afterwards; the
  ConvLSTM states are carried over from one time step to the next.

  Args:
    frames: 5D tensor of batched videos, indexed as
      (batch_size, num_frames, frame_width, frame_height, num_channels).
    sequence_length: number of leading frames to encode; must be >= 1.
    initializer: weights initializer handed to every conv layer and ConvLSTM cell.
    keep_prob_dropout: keep probability handed to tf.nn.dropout, which is
      applied after every layer up to and including 'fc_conv'.
    scope: tensorflow variable scope name.
    fc_conv_layer: not read by this implementation - the 'fc_conv' layer is
      always added. Kept so the signature stays compatible with callers.
  Returns:
    z: latent sample, mu + sigma * standard normal noise (reparameterization trick).
    mu: output of a linear fully connected layer on the state of the last ConvLSTM.
    sigma: output of a softplus fully connected layer on the same state.
    All three are taken from the last time step.
  Raises:
    ValueError: if sequence_length is smaller than 1 (no time step would
      run, so there would be nothing to return).
  """
  if sequence_length < 1:
    raise ValueError('sequence_length must be at least 1, got %r' % (sequence_length,))

  lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6 = None, None, None, None, None, None

  for i in range(sequence_length):

    frame = frames[:,i,:,:,:]

    # create the variables at the first time step, share them afterwards
    reuse = (i > 0)

    with tf.variable_scope(scope, reuse=reuse):
      #LAYER 1: conv1
      conv1 = slim.layers.conv2d(frame, 32, [5, 5], stride=2, scope='conv1', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
          normalizer_params={'scope': 'layer_norm1'})
      conv1 = tf.nn.dropout(conv1, keep_prob_dropout)

      #LAYER 2: convLSTM1
      hidden1, lstm_state1 = basic_conv_lstm_cell(conv1, lstm_state1, 32, initializer, filter_size=5, scope='convlstm1')
      hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
      hidden1 = tf.nn.dropout(hidden1, keep_prob_dropout)

      #LAYER 3: conv2 (keeps the channel count of its input)
      conv2 = slim.layers.conv2d(hidden1, hidden1.get_shape()[3], [5, 5], stride=2, scope='conv2', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                  normalizer_params={'scope': 'layer_norm3'})
      conv2 = tf.nn.dropout(conv2, keep_prob_dropout)

      #LAYER 4: convLSTM2
      hidden2, lstm_state2 = basic_conv_lstm_cell(conv2, lstm_state2, 32, initializer, filter_size=5, scope='convlstm2')
      hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4')
      hidden2 = tf.nn.dropout(hidden2, keep_prob_dropout)

      #LAYER 5: conv3
      conv3 = slim.layers.conv2d(hidden2, hidden2.get_shape()[3], [5, 5], stride=2, scope='conv3', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                  normalizer_params={'scope': 'layer_norm5'})
      conv3 = tf.nn.dropout(conv3, keep_prob_dropout)

      #LAYER 6: convLSTM3
      hidden3, lstm_state3 = basic_conv_lstm_cell(conv3, lstm_state3, 32, initializer, filter_size=3, scope='convlstm3')
      hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6')
      hidden3 = tf.nn.dropout(hidden3, keep_prob_dropout)

      #LAYER 7: conv4
      conv4 = slim.layers.conv2d(hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv4', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                 normalizer_params={'scope': 'layer_norm7'})
      conv4 = tf.nn.dropout(conv4, keep_prob_dropout)

      #LAYER 8: convLSTM4
      # (8x8 feature map size - presumably for 128x128 input frames, TODO confirm)
      hidden4, lstm_state4 = basic_conv_lstm_cell(conv4, lstm_state4, 64, initializer, filter_size=3, scope='convlstm4')
      hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm8')
      hidden4 = tf.nn.dropout(hidden4, keep_prob_dropout)

      #LAYER 9: conv5
      conv5 = slim.layers.conv2d(hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv5', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                 normalizer_params={'scope': 'layer_norm9'})
      conv5 = tf.nn.dropout(conv5, keep_prob_dropout)

      #LAYER 10: convLSTM5
      # (4x4 feature map size - presumably for 128x128 input frames, TODO confirm)
      hidden5, lstm_state5 = basic_conv_lstm_cell(conv5, lstm_state5, 64, initializer, filter_size=3, scope='convlstm5')
      hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm10')
      hidden5 = tf.nn.dropout(hidden5, keep_prob_dropout)

      #LAYER 11: Fully Convolutional Layer with FC_LAYER_SIZE output channels
      # 4x4 VALID convolution; necessary for dimension compatibility with the conv lstm cell below
      fc_conv = slim.layers.conv2d(hidden5, FC_LAYER_SIZE, [4,4], stride=1, scope='fc_conv', padding='VALID', weights_initializer=initializer)
      fc_conv = tf.nn.dropout(fc_conv, keep_prob_dropout)

      #LAYER 12: Fully Convolutional LSTM (1x1 filters, FC_LSTM_LAYER_SIZE filters)
      # The cell output is discarded; only the returned state is used below.
      # NOTE(review): mu/sigma are computed from the LSTM state rather than from
      # the cell's hidden output - confirm this is intended.
      _, lstm_state6 = basic_conv_lstm_cell(fc_conv, lstm_state6, FC_LSTM_LAYER_SIZE, initializer, filter_size=1, scope='convlstm6')
      #no dropout since its the last encoder layer --> hidden repr should be steady

      # mu and sigma for sampling latent variable
      # (both layers rely on slim's default scope names, so the creation order
      # sigma -> mu must stay as it is to keep the variable names stable)
      sigma = slim.layers.fully_connected(inputs=lstm_state6, num_outputs=VAE_REPR_SIZE, activation_fn=tf.nn.softplus)
      mu = slim.layers.fully_connected(inputs=lstm_state6, num_outputs=VAE_REPR_SIZE, activation_fn=None)

      # reparameterization trick: the noise is sampled separately so that
      # gradients can flow through the deterministic nodes sigma and mu
      z = mu + sigma * tf.random_normal(tf.shape(mu), mean=0., stddev=1.)

  # only the values of the last encoded time step are handed back
  return z, mu, sigma