Example #1
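A frame-level GRU model: a stacked GRU consumes the (optionally reversed, optionally randomly sampled) frame features, and its final recurrent state is handed to a video-level classifier.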
 def create_model(self,
                  model_input,
                  vocab_size,
                  num_frames,
                  is_training=True,
                  **unused_params):
     gru_size = FLAGS.gru_cells
     number_of_layers = FLAGS.gru_layers
     backward = FLAGS.gru_backward
     random_frames = FLAGS.random_frames
     iterations = FLAGS.iterations
     if random_frames:
         num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
         model_input = sample_random_frames(model_input, num_frames_2, iterations)
     if backward:
         model_input = tf.reverse_sequence(model_input,
                                           num_frames,
                                           seq_axis=1)
     stacked_GRU = tf.contrib.rnn.MultiRNNCell(
         [tf.contrib.rnn.GRUCell(gru_size) for _ in range(number_of_layers)],
         state_is_tuple=False)
     with tf.variable_scope("RNN"):
         # Only the final state is kept; the per-step outputs are unused.
         _, state = tf.nn.dynamic_rnn(stacked_GRU,
                                      model_input,
                                      sequence_length=num_frames,
                                      dtype=tf.float32)
     aggregated_model = getattr(video_level_models,
                                FLAGS.video_level_clf_model)
     return aggregated_model().create_model(model_input=state,
                                            vocab_size=vocab_size,
                                            is_training=is_training,
                                            **unused_params)
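The examples call a sample_random_frames helper that is not shown in the listing. A minimal sketch of what it typically looks like, modeled on the SampleRandomFrames utility from the YouTube-8M starter code (the exact name and signature here are assumptions):

 import tensorflow as tf

 def sample_random_frames(model_input, num_frames, num_samples):
     """Uniformly samples num_samples frames from each video (sketch).

     model_input: [batch, max_frames, features]; num_frames: float [batch, 1].
     """
     batch_size = tf.shape(model_input)[0]
     # Draw a random position in [0, num_frames) for every sample slot.
     frame_index = tf.cast(
         tf.multiply(tf.random_uniform([batch_size, num_samples]),
                     tf.tile(num_frames, [1, num_samples])), tf.int32)
     batch_index = tf.tile(tf.expand_dims(tf.range(batch_size), 1),
                           [1, num_samples])
     index = tf.stack([batch_index, frame_index], 2)
     return tf.gather_nd(model_input, index)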
Example #2
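A late-fusion NetFV (Fisher-vector) model: video and audio features are pooled separately, concatenated, projected through a hidden layer, optionally context-gated, and passed to a video-level classifier.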
 def create_model(self,
                  model_input,
                  vocab_size,
                  num_frames,
                  iterations=None,
                  batch_norm=None,
                  random_frames=None,
                  cluster_size=None,
                  hidden_size=None,
                  is_training=True,
                  **unused_params):
     iterations = iterations or FLAGS.iterations
     batch_norm = batch_norm or FLAGS.batch_norm
     random_frames = random_frames or FLAGS.random_frames
     cluster_size = cluster_size or FLAGS.fv_cluster_size
     hidden1_size = hidden_size or FLAGS.fv_hidden_size
     relu = FLAGS.relu
     gating = FLAGS.gating
     num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
     if random_frames:
         model_input = sample_random_frames(model_input, num_frames, iterations)
     else:
         model_input = sample_random_seq(model_input, num_frames, iterations)
     max_frames = model_input.get_shape().as_list()[1]
     feature_size = model_input.get_shape().as_list()[2]
     reshaped_input = tf.reshape(model_input, [-1, feature_size])
     tf.summary.histogram("input_hist", reshaped_input)
     video_NetFV = NetFV(1024, max_frames, cluster_size, batch_norm,
                         is_training)
     audio_NetFV = NetFV(128, max_frames, cluster_size // 2, batch_norm,
                         is_training)
     if batch_norm:
         reshaped_input = slim.batch_norm(reshaped_input,
                                          center=True,
                                          scale=True,
                                          is_training=is_training,
                                          scope="input_bn")
     with tf.variable_scope("video_FV"):
         fv_video = video_NetFV.forward(reshaped_input[:, 0:1024])
     with tf.variable_scope("audio_FV"):
         fv_audio = audio_NetFV.forward(reshaped_input[:, 1024:])
     fv = tf.concat([fv_video, fv_audio], 1)
     fv_dim = fv.get_shape().as_list()[1]
     hidden1_weights = tf.get_variable(
         "hidden1_weights", [fv_dim, hidden1_size],
         initializer=tf.random_normal_initializer(
             stddev=1 / math.sqrt(cluster_size)))
     activation = tf.matmul(fv, hidden1_weights)
     if batch_norm and relu:
         activation = slim.batch_norm(activation,
                                      center=True,
                                      scale=True,
                                      is_training=is_training,
                                      scope="hidden1_bn")
     else:
         hidden1_biases = tf.get_variable(
             "hidden1_biases", [hidden1_size],
             initializer=tf.random_normal_initializer(stddev=0.01))
         tf.summary.histogram("hidden1_biases", hidden1_biases)
         activation += hidden1_biases
     if relu:
         activation = tf.nn.relu6(activation)
     if gating:
         gating_weights = tf.get_variable(
             "gating_weights_2", [hidden1_size, hidden1_size],
             initializer=tf.random_normal_initializer(
                 stddev=1 / math.sqrt(hidden1_size)))
         gates = tf.matmul(activation, gating_weights)
         if batch_norm:
             gates = slim.batch_norm(gates,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="gating_bn")
         else:
              # The bias width must match `gates`, i.e. [hidden1_size].
              gating_biases = tf.get_variable(
                  "gating_biases", [hidden1_size],
                  initializer=tf.random_normal_initializer(
                      stddev=1 / math.sqrt(feature_size)))
              gates += gating_biases
         gates = tf.sigmoid(gates)
         activation = tf.multiply(activation, gates)
     aggregated_model = getattr(video_level_models,
                                FLAGS.video_level_clf_model)
     return aggregated_model().create_model(model_input=activation,
                                            vocab_size=vocab_size,
                                            is_training=is_training,
                                            **unused_params)
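A hypothetical invocation of this model (the class name NetFVModelLF, the 300x1152 input shape, and the vocabulary size are illustrative assumptions; YouTube-8M concatenates 1024-d video and 128-d audio features per frame):

 model_input = tf.placeholder(tf.float32, [None, 300, 1152])  # frame features
 num_frames = tf.placeholder(tf.int32, [None])                # frames per video
 outputs = NetFVModelLF().create_model(model_input,
                                       vocab_size=3862,
                                       num_frames=num_frames,
                                       is_training=True)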
Example #3
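A SoftDBoF (soft deep bag-of-frames) variant of the same pattern, with separate video/audio pooling and an optional fully connected dimensionality-reduction layer before the video-level classifier.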
 def create_model(self,
                  model_input,
                  vocab_size,
                  num_frames,
                  iterations=None,
                  batch_norm=None,
                  random_frames=None,
                  cluster_size=None,
                  hidden_size=None,
                  is_training=True,
                  **unused_params):
     iterations = iterations or FLAGS.iterations
     batch_norm = batch_norm or FLAGS.batch_norm
     random_frames = random_frames or FLAGS.random_frames
     cluster_size = cluster_size or FLAGS.dbof_cluster_size
     hidden1_size = hidden_size or FLAGS.dbof_hidden_size
     relu = FLAGS.relu
     fc_dimred = FLAGS.fc_dimred
     max_pool = FLAGS.softdbof_maxpool
     num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
     if random_frames:
         model_input = sample_random_frames(model_input, num_frames, iterations)
     else:
         model_input = sample_random_seq(model_input, num_frames, iterations)
     max_frames = model_input.get_shape().as_list()[1]
     feature_size = model_input.get_shape().as_list()[2]
     reshaped_input = tf.reshape(model_input, [-1, feature_size])
     tf.summary.histogram("input_hist", reshaped_input)
     video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                           batch_norm, is_training)
     audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                           batch_norm, is_training)
     if batch_norm:
         reshaped_input = slim.batch_norm(reshaped_input,
                                          center=True,
                                          scale=True,
                                          is_training=is_training,
                                          scope="input_bn")
     with tf.variable_scope("video_DBOF"):
         dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])
     with tf.variable_scope("audio_DBOF"):
         dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])
     dbof = tf.concat([dbof_video, dbof_audio], 1)
     dbof_dim = dbof.get_shape().as_list()[1]
     if fc_dimred:
         hidden1_weights = tf.get_variable(
             "hidden1_weights", [dbof_dim, hidden1_size],
             initializer=tf.random_normal_initializer(
                 stddev=1 / math.sqrt(cluster_size)))
         tf.summary.histogram("hidden1_weights", hidden1_weights)
         activation = tf.matmul(dbof, hidden1_weights)
         if batch_norm and relu:
             activation = slim.batch_norm(activation,
                                          center=True,
                                          scale=True,
                                          is_training=is_training,
                                          scope="hidden1_bn")
         else:
             hidden1_biases = tf.get_variable(
                 "hidden1_biases", [hidden1_size],
                 initializer=tf.random_normal_initializer(stddev=0.01))
             tf.summary.histogram("hidden1_biases", hidden1_biases)
             activation += hidden1_biases
         if relu:
             activation = tf.nn.relu6(activation)
         tf.summary.histogram("hidden1_output", activation)
     else:
         activation = dbof
     aggregated_model = getattr(video_level_models,
                                FLAGS.video_level_clf_model)
     return aggregated_model().create_model(model_input=activation,
                                            vocab_size=vocab_size,
                                            is_training=is_training,
                                            **unused_params)
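All three examples read their hyperparameters from a module-level FLAGS object. A sketch of the flag definitions they assume (the flag names come from the code above; the default values are illustrative assumptions, not taken from the original training config):

 from tensorflow import flags

 FLAGS = flags.FLAGS
 flags.DEFINE_integer("iterations", 30, "Frames sampled per video.")
 flags.DEFINE_bool("batch_norm", True, "Batch-normalize the reshaped input.")
 flags.DEFINE_bool("random_frames", True, "Sample frames at random positions.")
 flags.DEFINE_integer("gru_cells", 1024, "Number of GRU units per layer.")
 flags.DEFINE_integer("gru_layers", 2, "Number of stacked GRU layers.")
 flags.DEFINE_bool("gru_backward", False, "Process the sequence in reverse.")
 flags.DEFINE_integer("fv_cluster_size", 64, "NetFV cluster size.")
 flags.DEFINE_integer("fv_hidden_size", 1024, "NetFV hidden layer size.")
 flags.DEFINE_integer("dbof_cluster_size", 4096, "DBoF cluster size.")
 flags.DEFINE_integer("dbof_hidden_size", 1024, "DBoF hidden layer size.")
 flags.DEFINE_bool("relu", True, "Apply ReLU6 after the hidden projection.")
 flags.DEFINE_bool("gating", True, "Apply context gating to the activations.")
 flags.DEFINE_bool("fc_dimred", True, "Add an FC dimensionality-reduction layer.")
 flags.DEFINE_bool("softdbof_maxpool", False, "Use max pooling in SoftDBoF.")
 flags.DEFINE_string("video_level_clf_model", "MoeModel",
                     "Video-level classifier applied to the pooled features.")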