    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     is_training=True,
                     **unused_params):
        """Creates a model which uses a stack of LSTMs to represent the video.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                       input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
               frames for each video (before padding).

        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          'batch_size' x 'num_classes'.
        """
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers
        random_frames = FLAGS.lstm_random_sequence
        iterations = FLAGS.iterations
        backward = FLAGS.lstm_backward

        if random_frames:
            num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
            model_input = utils.SampleRandomFrames(model_input, num_frames_2,
                                                   iterations)
        if backward:
            model_input = tf.reverse_sequence(model_input,
                                              num_frames,
                                              seq_axis=1)

        stacked_lstm = tf.contrib.rnn.MultiRNNCell(
            [
                tf.contrib.rnn.BasicLSTMCell(
                    lstm_size, forget_bias=1.0, state_is_tuple=False)
                for _ in range(number_of_layers)
            ],
            state_is_tuple=False)

        loss = 0.0
        with tf.variable_scope("RNN"):
            outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                               model_input,
                                               sequence_length=num_frames,
                                               dtype=tf.float32)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=state,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
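All of these snippets read their hyperparameters from a module-level FLAGS object rather than taking them as arguments. A minimal sketch of the flag definitions this first example assumes (the flag names come from the code above; the defaults and help strings are illustrative guesses):

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells per layer.")
flags.DEFINE_integer("lstm_layers", 2, "Number of stacked LSTM layers.")
flags.DEFINE_bool("lstm_random_sequence", False,
                  "If true, sample random frames before running the LSTM.")
flags.DEFINE_integer("iterations", 30, "Number of frames to sample per video.")
flags.DEFINE_bool("lstm_backward", False,
                  "If true, process the frame sequence in reverse order.")
flags.DEFINE_string("video_level_classifier_model", "MoeModel",
                    "Video-level classifier applied to the final LSTM state.")
FLAGS = flags.FLAGS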
Example #2
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     is_training=True,
                     **unused_params):
        """Creates a model which uses a stack of LSTMs to represent the video.
        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                       input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
               frames for each video (before padding).
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          'batch_size' x 'num_classes'.
        """
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers
        random_frames = FLAGS.lstm_random_sequence
        iterations = FLAGS.iterations
        dropout_rate = FLAGS.dropout_rate
        if random_frames:
            num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
            model_input = utils.SampleRandomFrames(model_input, num_frames_2,
                                                   iterations)
        def make_lstm_cell():
            """Builds one LSTM cell, with a fresh set of weights per layer."""
            cell = tf.contrib.rnn.BasicLSTMCell(lstm_size,
                                                forget_bias=1.0,
                                                state_is_tuple=False)
            if is_training:
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    cell, output_keep_prob=(1 - dropout_rate))
            return cell

        # Forward and backward direction cells. Note: reusing one cell object
        # for every layer ([cell] * n, as the original snippet did) either ties
        # the layers' weights together or fails, depending on the TF version.
        lstm_fw_cell = tf.nn.rnn_cell.MultiRNNCell(
            [make_lstm_cell() for _ in range(number_of_layers - 1)],
            state_is_tuple=False)
        lstm_bw_cell = tf.nn.rnn_cell.MultiRNNCell(
            [make_lstm_cell() for _ in range(number_of_layers - 1)],
            state_is_tuple=False)
        loss = 0.0
        with tf.variable_scope("RNN"):
            # static_bidirectional_rnn takes a time-major list of inputs and
            # returns (outputs, final_state_fw, final_state_bw).
            outputs, state_fw, state_bw = tf.contrib.rnn.static_bidirectional_rnn(
                lstm_fw_cell,
                lstm_bw_cell,
                tf.unstack(model_input, axis=1),
                dtype=tf.float32)
        # Concatenate the forward and backward final states for the classifier.
        state = tf.concat([state_fw, state_bw], 1)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=state,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
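A note on the shape contract that distinguishes the two RNN APIs in these examples: tf.nn.dynamic_rnn consumes a single [batch, time, features] tensor, while the static (bidirectional) RNNs want a Python list of `time` tensors, each [batch, features]. A minimal sketch of converting between the two:

import tensorflow as tf

batch, time, features = 4, 8, 16
x = tf.random_normal([batch, time, features])  # dynamic_rnn-style input

as_list = tf.unstack(x, axis=1)   # list of 8 tensors, each [4, 16]
as_tensor = tf.stack(as_list, 1)  # back to [4, 8, 16]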
Example #3
  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
           frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm_fw = tf.contrib.rnn.MultiRNNCell(
            [
                tf.contrib.rnn.BasicLSTMCell(
                    lstm_size, forget_bias=1.0)
                for _ in range(number_of_layers)
                ])
    stacked_lstm_bw = tf.contrib.rnn.MultiRNNCell(
            [
                tf.contrib.rnn.BasicLSTMCell(
                    lstm_size, forget_bias=1.0)
                for _ in range(number_of_layers)
                ])

    loss = 0.0

    outputs, state = tf.nn.bidirectional_dynamic_rnn(stacked_lstm_fw, stacked_lstm_bw, model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    # Concatenate the top-layer final hidden states of the forward and
    # backward stacks.
    s = tf.concat([state[0][-1].h, state[1][-1].h], 1)
    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    if FLAGS.video_level_classifier_model == 'FrameMoeModel':
      nf = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
      outputs = utils.SampleRandomFrames(tf.concat(outputs,2), nf, FLAGS.iterations)
      return aggregated_model().create_model(
        model_input=outputs,
        vocab_size=vocab_size,
        num_frames=num_frames,
        **unused_params)
    else:
      return aggregated_model().create_model(
        model_input=s,
        vocab_size=vocab_size,
        **unused_params)
Example #4
    def create_model(self, model_input, vocab_size, num_frames,
                     **unused_params):
        """Creates a model which uses a stack of LSTMs to represent the video.
        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                       input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
               frames for each video (before padding).
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          'batch_size' x 'num_classes'.
        """
        max_frame = 128
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               max_frame)
        # max_frame = model_input.get_shape().as_list()[1]
        # Treat each 1024-D frame feature as a 32x32 single-channel image.
        image = tf.reshape(model_input, [-1, 32, 32])
        image = tf.expand_dims(image, 3)
        with slim.arg_scope(
            [slim.conv2d],
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.01),
                weights_regularizer=slim.l2_regularizer(0.0005),
                normalizer_fn=slim.batch_norm):
            net = slim.conv2d(image,
                              32, [5, 5],
                              padding='VALID',
                              scope='conv1')
            net = slim.relu(net, 32, scope='relu1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.conv2d(net, 64, [5, 5], padding='VALID', scope='conv2')
            net = slim.relu(net, 64, scope='relu2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.conv2d(net, 128, [5, 5], padding='VALID', scope='conv3')
            net = slim.relu(net, 128, scope='relu3')
            net = tf.squeeze(net, [1, 2], name='squeezed')
            print(net)

        net = tf.reshape(net, [-1, max_frame, 128])
        net = utils.FramePooling(net, 'max')
        net = slim.fully_connected(net, 512, scope='fc4')
        print(net)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=net,
                                               vocab_size=vocab_size,
                                               **unused_params)
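The spatial sizes in the conv stack above work out exactly because of the 'VALID' padding: a 32x32 input collapses to 1x1 by the third convolution, which is what makes tf.squeeze(net, [1, 2]) safe. A quick sanity check of that arithmetic (out = in - kernel + 1 for VALID convolutions, halving for the stride-2 pools):

def valid_conv(size, kernel):
    return size - kernel + 1

size = 32
size = valid_conv(size, 5)  # conv1: 28
size //= 2                  # pool1: 14
size = valid_conv(size, 5)  # conv2: 10
size //= 2                  # pool2: 5
size = valid_conv(size, 5)  # conv3: 1
print(size)                 # 1, so squeezing axes [1, 2] works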
Example #5
 def get_input():
     num_frames_cast = tf.cast(tf.expand_dims(num_frames, 1),
                               tf.float32)
     if random_frames:
         sample_model_input = utils.SampleRandomFrames(
             model_input, num_frames_cast, iterations)
     else:
         sample_model_input = utils.SampleRandomSequence(
             model_input, num_frames_cast, iterations)
     max_frames = sample_model_input.get_shape().as_list()[1]
     feature_size = sample_model_input.get_shape().as_list()[2]
     reshaped_input = tf.reshape(sample_model_input, [-1, feature_size])
     tf.summary.histogram("input", reshaped_input)
     if input_add_batch_norm:
         reshaped_input = slim.batch_norm(reshaped_input,
                                          center=True,
                                          scale=True,
                                          is_training=is_training,
                                          scope="input_bn",
                                          reuse=tf.AUTO_REUSE)
     return reshaped_input, max_frames, feature_size
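This fragment is a nested helper rather than a complete model: it closes over num_frames, model_input, random_frames, iterations, input_add_batch_norm, and is_training from an enclosing method. A sketch of the scope it assumes (every name below is inferred from the helper's free variables; the input_add_batch_norm flag name in particular is a guess):

def create_model(self, model_input, vocab_size, num_frames,
                 is_training=True, **unused_params):
    iterations = FLAGS.iterations
    random_frames = FLAGS.sample_random_frames
    input_add_batch_norm = FLAGS.input_add_batch_norm  # hypothetical flag

    def get_input():
        ...  # body exactly as in the snippet above

    reshaped_input, max_frames, feature_size = get_input()
    # ... the rest of the model consumes reshaped_input ...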
Example #6
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.netvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
        relu = FLAGS.netvlad_relu
        dimred = FLAGS.netvlad_dimred
        gating = FLAGS.gating
        remove_diag = FLAGS.gating_remove_diag
        print("FLAGS.lightvlad", FLAGS.lightvlad)
        lightvlad = FLAGS.lightvlad
        vlagd = FLAGS.vlagd

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        print("num_frames:", num_frames)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])

        if lightvlad:
            video_NetVLAD = LightVLAD(1024, max_frames, cluster_size,
                                      add_batch_norm)
            audio_NetVLAD = LightVLAD(128, max_frames, cluster_size // 2,
                                      add_batch_norm)
        else:
            # Assumed fallback: a standard NetVLAD module with the same
            # constructor signature as LightVLAD, so the variables below are
            # always defined.
            video_NetVLAD = NetVLAD(1024, max_frames, cluster_size,
                                    add_batch_norm)
            audio_NetVLAD = NetVLAD(128, max_frames, cluster_size // 2,
                                    add_batch_norm)

        if add_batch_norm:  # and not lightvlad:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             scope="input_bn")

        with tf.variable_scope("video_VLAD"):
            vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_VLAD"):
            vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

        vlad = tf.concat([vlad_video, vlad_audio], 1)

        vlad_dim = vlad.get_shape().as_list()[1]
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [vlad_dim, hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(cluster_size)))

        activation = tf.matmul(vlad, hidden1_weights)

        if add_batch_norm and relu:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         scope="hidden1_bn")

        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases

        if relu:
            activation = tf.nn.relu6(activation)

        if gating:
            gating_weights = tf.get_variable(
                "gating_weights_2", [hidden1_size, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(hidden1_size)))

            gates = tf.matmul(activation, gating_weights)

            if remove_diag:
                #removes diagonals coefficients
                diagonals = tf.matrix_diag_part(gating_weights)
                gates = gates - tf.multiply(diagonals, activation)

            if add_batch_norm:
                gates = slim.batch_norm(gates,
                                        center=True,
                                        scale=True,
                                        scope="gating_bn")
            else:
                # Bias shape must match the gates' last dimension.
                gating_biases = tf.get_variable(
                    "gating_biases", [hidden1_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(feature_size)))
                gates += gating_biases

            gates = tf.sigmoid(gates)

            activation = tf.multiply(activation, gates)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               **unused_params)
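The gating branch above appears to be the "context gating" used in the Willow-style NetVLAD models: a learned sigmoid gate re-weights each dimension of the hidden activation. Stripped of the batch-norm and diagonal-removal options, the core is a minimal sketch like this:

import math
import tensorflow as tf

def context_gating(activation, hidden_size):
    """Element-wise sigmoid gating: activation * sigmoid(activation @ W)."""
    gating_weights = tf.get_variable(
        "gating_weights", [hidden_size, hidden_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(hidden_size)))
    gates = tf.sigmoid(tf.matmul(activation, gating_weights))
    return tf.multiply(activation, gates)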
Example #7
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        model_input = utils.SampleRandomFrames(model_input, num_frames, 64)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]

        regularizer = slim.l2_regularizer(1e-8)
        padding = 'SAME'
        ##############
        # get attention state
        lstm_size = 512

        # Shrink the feature dimension to lstm_size.
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        rnn_in = slim.fully_connected(reshaped_input,
                                      lstm_size,
                                      weights_regularizer=regularizer)
        rnn_in = tf.reshape(rnn_in, [-1, max_frames, lstm_size])

        with tf.variable_scope('BLSTM1'):
            lstm_fw_cell = tf.contrib.rnn.LSTMCell(lstm_size,
                                                   forget_bias=1.0,
                                                   cell_clip=10.)
            lstm_bw_cell = tf.contrib.rnn.LSTMCell(lstm_size,
                                                   forget_bias=1.0,
                                                   cell_clip=10.)
            outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
                lstm_fw_cell,
                lstm_bw_cell,
                tf.unstack(rnn_in, axis=1),
                dtype=tf.float32)
        with tf.variable_scope('BLSTM2'):
            lstm_fw_cell1 = tf.contrib.rnn.LSTMCell(lstm_size,
                                                    forget_bias=1.0,
                                                    cell_clip=10.)
            lstm_bw_cell1 = tf.contrib.rnn.LSTMCell(lstm_size,
                                                    forget_bias=1.0,
                                                    cell_clip=10.)
            outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
                lstm_fw_cell1, lstm_bw_cell1, outputs, dtype=tf.float32)

        # Stack the time-major list of outputs into a batch-major tensor first,
        # so the later reshape to [-1, max_frames, 1] lines up per video.
        rnn_out = tf.reshape(tf.stack(outputs, axis=1), [-1, 2 * lstm_size])
        att_pooling = slim.fully_connected(rnn_out,
                                           1,
                                           activation_fn=None,
                                           weights_regularizer=regularizer)

        att_pooling = tf.reshape(att_pooling, [-1, max_frames, 1])
        att_pooling = tf.nn.softmax(att_pooling, axis=1)
        ###############
        cluster_size = 8192
        activation = slim.fully_connected(reshaped_input,
                                          cluster_size,
                                          weights_regularizer=regularizer)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        # Attention-weighted pooling over frames (softmax weights, then mean).
        att_pooled = tf.reduce_mean(tf.multiply(activation, att_pooling), 1)

        activation = slim.fully_connected(att_pooled,
                                          2048,
                                          weights_regularizer=regularizer)

        activation = tf.reshape(activation, [-1, 2, 2, 512])

        #init_act = slim.flatten(activation)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training)
        activation = tf.nn.relu6(activation)
        activation = slim.conv2d(activation,
                                 64, [1, 1],
                                 padding=padding,
                                 weights_regularizer=regularizer)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training)
        activation = tf.nn.relu6(activation)
        activation = slim.conv2d(activation,
                                 64, [3, 3],
                                 padding=padding,
                                 weights_regularizer=regularizer)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training)
        activation = slim.flatten(activation)
        #init_act = slim.fully_connected(init_act, activation.get_shape().as_list()[1])
        #activation = activation + init_act
        #activation = tf.nn.relu6(activation)
        activation = slim.dropout(activation, 0.8, is_training=is_training)

        activation = slim.fully_connected(activation,
                                          2048,
                                          weights_regularizer=regularizer)

        output = slim.fully_connected(activation,
                                      vocab_size,
                                      activation_fn=tf.nn.sigmoid,
                                      weights_regularizer=regularizer)
        return {"predictions": output}
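The pooling above is soft attention over frames: per-frame scalar scores from the BLSTM are softmax-normalized over time and used to weight the cluster activations. (The snippet uses reduce_mean rather than reduce_sum, which only rescales the weighted average by a constant 1/max_frames.) A compact sketch of the same idea:

import tensorflow as tf

def attention_pool(frames, scores):
    """frames: [batch, time, features]; scores: [batch, time, 1]."""
    weights = tf.nn.softmax(scores, axis=1)         # normalize over time
    return tf.reduce_sum(frames * weights, axis=1)  # [batch, features]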
Example #8
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     is_training=True,
                     hidden=None,
                     n_enc_layers=None,
                     n_dec_layers=None,
                     heads=None,
                     **unused_params):

        iterations = iterations or FLAGS.iterations
        # add_batch_norm = add_batch_norm or FLAGS.add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames

        heads = heads or FLAGS.heads
        self.hidden = hidden or FLAGS.hidden
        n_enc_layers = n_enc_layers or FLAGS.n_enc_layers
        n_dec_layers = n_dec_layers or FLAGS.n_dec_layers

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        batch_size = model_input.get_shape().as_list()[0]
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        # reshaped_input = tf.reshape(model_input, [-1, feature_size])

        encoding = tf.layers.dense(
            inputs=model_input,
            units=self.hidden,
            # dtype=tf.float32,
            name="input_embedding")

        for i in np.arange(n_enc_layers):
            encoding = self.attention("enc_self_{}".format(i),
                                      encoding,
                                      h=heads)
            dense = tf.layers.dense(
                inputs=encoding,
                units=self.hidden * 2,
                activation=tf.nn.relu,
                name="enc_dense_{}_1".format(i),
            )
            encoding += tf.layers.dense(
                inputs=dense,
                units=self.hidden,
                activation=None,
                name="enc_dense_{}_2".format(i),
            )

        decoder_query = tf.get_variable(
            name="decoder_query",
            shape=(1, 1, self.hidden),
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=1e-2),
        )
        decoding = self.attention(
            "dec_enc_0",
            tf.tile(decoder_query,
                    multiples=tf.concat(([tf.shape(model_input)[0]], [1], [1]),
                                        axis=0)),
            encoding,
            h=heads)
        for i in np.arange(n_dec_layers - 1):
            # decoding = self.attention("dec_self_{}".format(i + 1), decoding, h=heads)
            decoding = self.attention("dec_enc_{}".format(i + 1),
                                      decoding,
                                      encoding,
                                      h=heads)
            dense = tf.layers.dense(
                inputs=decoding,
                units=self.hidden * 2,
                activation=tf.nn.relu,
                name="dec_dense_{}_1".format(i + 1),
            )
            decoding += tf.layers.dense(
                inputs=dense,
                units=self.hidden,
                activation=None,
                name="dec_dense_{}_2".format(i + 1),
            )

        activation = tf.reshape(decoding, [-1, self.hidden])

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
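This transformer-style example leans on a self.attention helper that is not shown. A rough sketch of a multi-head scaled dot-product attention with the same call signature (inferred from the call sites above; this is an assumption, not the original helper):

import math
import tensorflow as tf

def attention(name, queries, memory=None, h=8, hidden=512):
    """queries: [batch, time_q, hidden]; memory=None means self-attention."""
    if memory is None:
        memory = queries
    with tf.variable_scope(name):
        q = tf.layers.dense(queries, hidden, name="q")
        k = tf.layers.dense(memory, hidden, name="k")
        v = tf.layers.dense(memory, hidden, name="v")
        # Fold the h heads into the batch axis: [batch * h, time, hidden / h].
        split = lambda x: tf.concat(tf.split(x, h, axis=2), axis=0)
        q, k, v = split(q), split(k), split(v)
        logits = tf.matmul(q, k, transpose_b=True) / math.sqrt(hidden / h)
        out = tf.matmul(tf.nn.softmax(logits), v)
        # Undo the head folding: back to [batch, time_q, hidden].
        return tf.concat(tf.split(out, h, axis=0), axis=2)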
Example #9
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     labels,
                     scope='default',
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
            if random_frames:
                model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                       iterations)
            else:
                model_input = utils.SampleRandomSequence(
                    model_input, num_frames, iterations)
            max_frames = model_input.get_shape().as_list()[1]
            feature_size = model_input.get_shape().as_list()[2]
            reshaped_input = tf.reshape(model_input, [-1, feature_size])

            tf.summary.histogram("input_hist", reshaped_input)
            reshaped_input = tf.expand_dims(reshaped_input, -1)
            reshaped_input = tf.expand_dims(reshaped_input, -1)

            out1 = tf.layers.conv2d(
                reshaped_input,
                128, (32, 1),
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                padding='same')
            out1_norm = tf.layers.batch_normalization(out1,
                                                      training=is_training)
            out1_pool = tf.layers.max_pooling2d(out1_norm, (8, 1),
                                                2,
                                                padding='same')

            out2 = tf.layers.conv2d(
                out1_pool,
                256, (32, 1),
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                padding='same')
            out2_norm = tf.layers.batch_normalization(out2,
                                                      training=is_training)
            out2_pool = tf.layers.max_pooling2d(out2_norm, (8, 1),
                                                2,
                                                padding='same')

            out3 = tf.layers.conv2d(
                out2_pool,
                256, (32, 1),
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                padding='same')
            out3_norm = tf.layers.batch_normalization(out3,
                                                      training=is_training)
            out3_pool = tf.layers.max_pooling2d(out3_norm, (8, 1),
                                                2,
                                                padding='same')

            out = tf.reduce_max(out3_pool, axis=[2, 3])
            activation = tf.reshape(out, [-1, max_frames, out.shape[1]])
            cluster_size = out.shape[1]

            activation = utils.FramePooling(activation,
                                            FLAGS.dbof_pooling_method)

            activation = tf.layers.dense(
                activation,
                hidden1_size,
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer())

            tf.summary.histogram("activation", activation)

            aggregated_model = getattr(video_level_models,
                                       FLAGS.video_level_classifier_model)
            results = aggregated_model().create_model(model_input=activation,
                                                      vocab_size=vocab_size,
                                                      is_training=is_training,
                                                      **unused_params)

            results['features'] = activation
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
        return results
Example #10
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     mix_number=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     groups=None,
                     expansion=None,
                     drop_rate=None,
                     gating_reduction=None,
                     **unused_params):
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        '''
        ####################################
        n_bag = 300 // 5
        model_input = utils.ReshapeFramesToMIL(model_input)  # B x FLAGS.pad_seq_length/5 x 5 x 1132
        max_frames = 5
        feature_size = model_input.get_shape().as_list()[3]
        #####################################
        '''
        if FLAGS.sample_random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   FLAGS.iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     FLAGS.iterations)

        cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
        gating_reduction = gating_reduction or FLAGS.gating_reduction
        groups = groups or FLAGS.groups
        drop_rate = drop_rate or FLAGS.drop_rate
        mix_number = mix_number or FLAGS.mix_number
        expansion = expansion or FLAGS.expansion

        max_frames = model_input.get_shape().as_list()[1]
        mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

        ftr_mean = tf.reduce_mean(model_input, axis=-1)
        ftr_mean = slim.batch_norm(ftr_mean,
                                   center=True,
                                   scale=True,
                                   fused=True,
                                   is_training=is_training,
                                   scope="mix_weights_bn")
        mix_weights = slim.fully_connected(
            ftr_mean,
            mix_number,
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer(),
            scope="mix_weights")
        mix_weights = tf.nn.softmax(mix_weights, axis=-1)
        tf.summary.histogram("mix_weights", mix_weights)

        results = []
        for n in range(mix_number):
            with tf.variable_scope("branch_%d" % n):
                res = self.nextvlad_model(video_ftr=model_input[:, :, 0:1024],
                                          audio_ftr=model_input[:, :, 1024:],
                                          vocab_size=vocab_size,
                                          max_frames=max_frames,
                                          cluster_size=cluster_size,
                                          groups=groups,
                                          expansion=expansion,
                                          drop_rate=drop_rate,
                                          hidden1_size=hidden1_size,
                                          is_training=is_training,
                                          gating_reduction=gating_reduction,
                                          mask=mask,
                                          **unused_params)
                results.append(res)

        aux_preds = [res["predictions"] for res in results]
        logits = [res["logits"] for res in results]
        logits = tf.stack(logits, axis=1)

        mix_logit = tf.reduce_sum(tf.multiply(tf.expand_dims(mix_weights, -1),
                                              logits),
                                  axis=1)

        pred = tf.nn.sigmoid(mix_logit)

        if is_training:
            rank_pred = tf.expand_dims(tf.nn.softmax(tf.div(
                mix_logit, FLAGS.cl_temperature),
                                                     axis=-1),
                                       axis=1)
            aux_rank_preds = tf.nn.softmax(tf.div(logits,
                                                  FLAGS.cl_temperature),
                                           axis=-1)
            epsilon = 1e-8
            kl_loss = tf.reduce_sum(rank_pred *
                                    (tf.log(rank_pred + epsilon) -
                                     tf.log(aux_rank_preds + epsilon)),
                                    axis=-1)

            regularization_loss = FLAGS.cl_lambda * tf.reduce_mean(
                tf.reduce_sum(kl_loss, axis=-1), axis=-1)

            return {
                "predictions": pred,
                "regularization_loss": regularization_loss,
                "aux_predictions": aux_preds
            }
        else:
            return {"predictions": pred}
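The training branch above distills the mixture back into its branches: each branch's temperature-scaled distribution is pulled toward the mixture's via a KL term. A small numeric sketch of the same computation in NumPy (shapes and constants are illustrative):

import numpy as np

temperature, cl_lambda = 2.0, 1.0
logits = np.array([[[2.0, -1.0], [1.5, 0.0]]])  # [batch=1, branches=2, classes=2]
mix_w = np.array([[0.6, 0.4]])                  # [batch, branches]

mix_logit = (mix_w[..., None] * logits).sum(axis=1)  # [batch, classes]

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

p = softmax(mix_logit / temperature)[:, None, :]  # mixture as the "teacher"
q = softmax(logits / temperature)                 # each branch as a "student"
kl = (p * (np.log(p + 1e-8) - np.log(q + 1e-8))).sum(-1)  # [batch, branches]
regularization_loss = cl_lambda * kl.sum(-1).mean()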
Example #11
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     mix_number=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     groups=None,
                     expansion=None,
                     drop_rate=None,
                     gating_reduction=None,
                     **unused_params):
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)

        config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
        config = copy.deepcopy(config)

        config.num_hidden_layers = FLAGS.bert_hidden_layer
        config.num_attention_heads = FLAGS.bert_attention_heads
        config.hidden_dropout_prob = FLAGS.bert_dropout_prob
        config.attention_probs_dropout_prob = FLAGS.bert_dropout_prob

        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        #breakpoint()
        with tf.variable_scope("encoder"):
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=model_input,
                attention_mask=None,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        model_input = self.all_encoder_layers[-1]

        if FLAGS.sample_random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   FLAGS.iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     FLAGS.iterations)

        cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
        gating_reduction = gating_reduction or FLAGS.gating_reduction
        groups = groups or FLAGS.groups
        drop_rate = drop_rate or FLAGS.drop_rate
        mix_number = mix_number or FLAGS.mix_number
        expansion = expansion or FLAGS.expansion

        max_frames = model_input.get_shape().as_list()[1]
        mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

        ftr_mean = tf.reduce_mean(model_input, axis=-1)
        ftr_mean = slim.batch_norm(ftr_mean,
                                   center=True,
                                   scale=True,
                                   fused=True,
                                   is_training=is_training,
                                   scope="mix_weights_bn")
        mix_weights = slim.fully_connected(
            ftr_mean,
            mix_number,
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer(),
            scope="mix_weights")
        mix_weights = tf.nn.softmax(mix_weights, axis=-1)
        tf.summary.histogram("mix_weights", mix_weights)

        results = []
        for n in range(mix_number):
            with tf.variable_scope("branch_%d" % n):
                res = self.nextvlad_model(video_ftr=model_input[:, :, 0:1024],
                                          audio_ftr=model_input[:, :, 1024:],
                                          vocab_size=vocab_size,
                                          max_frames=max_frames,
                                          cluster_size=cluster_size,
                                          groups=groups,
                                          expansion=expansion,
                                          drop_rate=drop_rate,
                                          hidden1_size=hidden1_size,
                                          is_training=is_training,
                                          gating_reduction=gating_reduction,
                                          mask=mask,
                                          **unused_params)
                results.append(res)

        aux_preds = [res["predictions"] for res in results]
        logits = [res["logits"] for res in results]
        logits = tf.stack(logits, axis=1)

        mix_logit = tf.reduce_sum(tf.multiply(tf.expand_dims(mix_weights, -1),
                                              logits),
                                  axis=1)

        pred = tf.nn.sigmoid(mix_logit)

        if is_training:
            rank_pred = tf.expand_dims(tf.nn.softmax(tf.div(
                mix_logit, FLAGS.cl_temperature),
                                                     axis=-1),
                                       axis=1)
            aux_rank_preds = tf.nn.softmax(tf.div(logits,
                                                  FLAGS.cl_temperature),
                                           axis=-1)
            epsilon = 1e-8
            kl_loss = tf.reduce_sum(rank_pred *
                                    (tf.log(rank_pred + epsilon) -
                                     tf.log(aux_rank_preds + epsilon)),
                                    axis=-1)

            regularization_loss = FLAGS.cl_lambda * tf.reduce_mean(
                tf.reduce_sum(kl_loss, axis=-1), axis=-1)

            return {
                "predictions": pred,
                "regularization_loss": regularization_loss,
                "aux_predictions": aux_preds
            }
        else:
            return {"predictions": pred}
Example #12
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = model_utils.SampleRandomFrames(
                model_input, num_frames, iterations)
        else:
            model_input = model_utils.SampleRandomSequence(
                model_input, num_frames, iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.summary.histogram("input_hist", reshaped_input)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        cluster_weights = tf.Variable(
            tf.random_normal([feature_size, cluster_size],
                             stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.Variable(
                tf.random_normal([cluster_size],
                                 stddev=1 / math.sqrt(feature_size)))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        activation = model_utils.FramePooling(activation,
                                              FLAGS.dbof_pooling_method)

        hidden1_weights = tf.Variable(
            tf.random_normal([cluster_size, hidden1_size],
                             stddev=1 / math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.Variable(
                tf.random_normal([hidden1_size], stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        return aggregated_model().create_model(model_input=activation,
                                               original_input=model_input,
                                               vocab_size=vocab_size,
                                               **unused_params)
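Several of these DBoF variants pool per-frame cluster activations into one video-level vector with utils.FramePooling and the dbof_pooling_method flag. A hedged sketch of what that helper presumably does for the common methods (the real utility may support more options):

import tensorflow as tf

def frame_pooling(frames, method):
    """frames: [batch, max_frames, features] -> [batch, features]."""
    if method == "average":
        return tf.reduce_mean(frames, axis=1)
    if method == "max":
        return tf.reduce_max(frames, axis=1)
    raise ValueError("Unknown pooling method: %s" % method)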
Example #13
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     labels,
                     scope='default',
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
            if random_frames:
                model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                       iterations)
            else:
                model_input = utils.SampleRandomSequence(
                    model_input, num_frames, iterations)
            max_frames = model_input.get_shape().as_list()[1]
            feature_size = model_input.get_shape().as_list()[2]
            reshaped_input = tf.reshape(model_input, [-1, feature_size])
            tf.summary.histogram("input_hist", reshaped_input)

            if add_batch_norm:
                reshaped_input = slim.batch_norm(reshaped_input,
                                                 center=True,
                                                 scale=True,
                                                 is_training=is_training,
                                                 scope="input_bn")

            cluster_weights = tf.get_variable(
                "cluster_weights", [feature_size, cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            tf.summary.histogram("cluster_weights", cluster_weights)
            activation = tf.matmul(reshaped_input, cluster_weights)
            if add_batch_norm:
                activation = slim.batch_norm(activation,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="cluster_bn")
            else:
                cluster_biases = tf.get_variable(
                    "cluster_biases", [cluster_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(feature_size)))
                tf.summary.histogram("cluster_biases", cluster_biases)
                activation += cluster_biases
            activation = tf.nn.relu6(activation)
            tf.summary.histogram("cluster_output", activation)

            activation = tf.reshape(activation, [-1, max_frames, cluster_size])
            activation = utils.FramePooling(activation,
                                            FLAGS.dbof_pooling_method)

            hidden1_weights = tf.get_variable(
                "hidden1_weights", [cluster_size, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(cluster_size)))
            tf.summary.histogram("hidden1_weights", hidden1_weights)
            activation = tf.matmul(activation, hidden1_weights)
            if add_batch_norm:
                activation = slim.batch_norm(activation,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="hidden1_bn")
            else:
                hidden1_biases = tf.get_variable(
                    "hidden1_biases", [hidden1_size],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                tf.summary.histogram("hidden1_biases", hidden1_biases)
                activation += hidden1_biases
            activation = tf.nn.relu6(activation)
            tf.summary.histogram("hidden1_output", activation)

            aggregated_model = getattr(video_level_models,
                                       FLAGS.video_level_classifier_model)
            results = aggregated_model().create_model(model_input=activation,
                                                      vocab_size=vocab_size,
                                                      **unused_params)
            results['features'] = activation
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
        return results
Example #14
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = False
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)


    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    video_audio_NetVLAD = NetVLAD_NonLocal(1024 + 128, max_frames, cluster_size,
                                           add_batch_norm, is_training)

    if add_batch_norm:# and not lightvlad:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")
    with tf.variable_scope("video_audio_VLAD"):
        vlad = video_audio_NetVLAD.forward(reshaped_input)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable("hidden1_weights",
      [vlad_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))

    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")

    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)


    if gating:
        gating_weights = tf.get_variable("gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))

        gates = tf.matmul(activation, gating_weights)

        if remove_diag:
            #removes diagonals coefficients
            diagonals = tf.matrix_diag_part(gating_weights)
            gates = gates - tf.multiply(diagonals,activation)


        if add_batch_norm:
          gates = slim.batch_norm(
              gates,
              center=True,
              scale=True,
              is_training=is_training,
              scope="gating_bn")
        else:
          # Bias shape must match the gates' last dimension.
          gating_biases = tf.get_variable(
              "gating_biases", [hidden1_size],
              initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
          gates += gating_biases

        gates = tf.sigmoid(gates)

        activation = tf.multiply(activation,gates)

    aggregated_model = getattr(video_level_models,
                               'willow_MoeModel_moe2')


    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
Example #15
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = 2048
        cluster_size_2 = 512
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.EqualSpaceMeans(model_input, num_frames,
                                                iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        cluster_weights = tf.get_variable(
            "cluster_weights", [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(feature_size)))
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.get_variable(
                "cluster_biases", [cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            activation += cluster_biases
        activation = tf.nn.relu6(activation)

        # Assumes iterations == 30: frames are grouped as 3 x 10 before pooling.
        activation = tf.reshape(activation, [-1, 3, 10, cluster_size])
        activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
        activation = tf.reshape(activation, [-1, 2, 5, cluster_size])
        activation = tf.transpose(activation, [0, 2, 3, 1])
        activation = tf.reshape(activation, [-1, cluster_size * 2])
        cluster_weights_2 = tf.get_variable(
            "cluster_weights2", [cluster_size * 2, cluster_size_2],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size * 2)))
        activation = tf.matmul(activation, cluster_weights_2)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn2")
        else:
            cluster_biases_2 = tf.get_variable(
                "cluster_biases2", [cluster_size_2],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(cluster_size * 2)))
            activation += cluster_biases_2
        activation = tf.nn.relu6(activation)
        activation = tf.reshape(activation, [-1, cluster_size_2 * 5])

        hidden1_weights = tf.get_variable(
            "hidden1_weights", [cluster_size_2 * 5, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size_2 * 5)))
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        activation = tf.nn.relu6(activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               **unused_params)
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.netvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
        relu = FLAGS.netvlad_relu
        gating = FLAGS.gating
        remove_diag = FLAGS.gating_remove_diag
        lightvlad = FLAGS.lightvlad
        vlagd = FLAGS.vlagd

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
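        # Layout assumption inherited from the YouTube-8M features: the first
        # 1024 values of each frame are the visual embedding and the last 128
        # are audio. The slices below additionally carve the (nominally 300)
        # sampled frames into three 100-frame segments so NetVLAD can be
        # applied per temporal segment as well as over the whole clip.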
        dimred_video = tf.get_variable(
            "dimred_video", [1024, 400],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(1024)))
        dimred_audio = tf.get_variable(
            "dimred_audio", [128, 50],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(128)))

        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        reshaped_input1_video = tf.matmul(
            tf.reshape(model_input[:, :100, :1024], [-1, 1024]), dimred_video)
        reshaped_input2_video = tf.matmul(
            tf.reshape(model_input[:, 100:200, :1024], [-1, 1024]),
            dimred_video)
        reshaped_input3_video = tf.matmul(
            tf.reshape(model_input[:, 200:, :1024], [-1, 1024]), dimred_video)

        reshaped_input1_audio = tf.matmul(
            tf.reshape(model_input[:, :100, 1024:], [-1, 128]), dimred_audio)
        reshaped_input2_audio = tf.matmul(
            tf.reshape(model_input[:, 100:200, 1024:], [-1, 128]),
            dimred_audio)
        reshaped_input3_audio = tf.matmul(
            tf.reshape(model_input[:, 200:, 1024:], [-1, 128]), dimred_audio)

        video_NetVLAD = NetVLAD(1024, max_frames, 128, add_batch_norm,
                                is_training)
        audio_NetVLAD = NetVLAD(128, max_frames, 128 // 2, add_batch_norm,
                                is_training)

        video_NetVLAD1 = NetVLAD(400, max_frames // 3, 64, add_batch_norm,
                                 is_training)
        audio_NetVLAD1 = NetVLAD(50, max_frames // 3, 64 // 2, add_batch_norm,
                                 is_training)

        video_NetVLAD2 = NetVLAD(400, max_frames // 3, 64, add_batch_norm,
                                 is_training)
        audio_NetVLAD2 = NetVLAD(50, max_frames // 3, 64 // 2, add_batch_norm,
                                 is_training)

        video_NetVLAD3 = NetVLAD(400, max_frames // 3, 64, add_batch_norm,
                                 is_training)
        audio_NetVLAD3 = NetVLAD(50, max_frames // 3, 64 // 2, add_batch_norm,
                                 is_training)

        with tf.variable_scope("video_VLAD"):
            vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_VLAD"):
            vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

        with tf.variable_scope("video_VLAD1"):
            vlad_video1 = video_NetVLAD1.forward(reshaped_input1_video)

        with tf.variable_scope("audio_VLAD1"):
            vlad_audio1 = audio_NetVLAD1.forward(reshaped_input1_audio)

        with tf.variable_scope("video_VLAD2"):
            vlad_video2 = video_NetVLAD2.forward(reshaped_input2_video)

        with tf.variable_scope("audio_VLAD2"):
            vlad_audio2 = audio_NetVLAD2.forward(reshaped_input2_audio)

        with tf.variable_scope("video_VLAD3"):
            vlad_video3 = video_NetVLAD3.forward(reshaped_input3_video)

        with tf.variable_scope("audio_VLAD3"):
            vlad_audio3 = audio_NetVLAD3.forward(reshaped_input3_audio)

        vlad = tf.concat([
            vlad_video, vlad_video1, vlad_video2, vlad_video3, vlad_audio,
            vlad_audio1, vlad_audio2, vlad_audio3
        ], 1)
        vlad_dim = vlad.get_shape().as_list()[1]
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [vlad_dim, hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(vlad_dim)))

        activation = tf.matmul(vlad, hidden1_weights)

        if add_batch_norm and relu:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")

        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases

        if relu:
            activation = tf.nn.relu6(activation)

        if gating:
            gating_weights = tf.get_variable(
                "gating_weights_2", [hidden1_size, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(hidden1_size)))

            gates = tf.matmul(activation, gating_weights)

            if remove_diag:
                # remove the diagonal coefficients
                diagonals = tf.matrix_diag_part(gating_weights)
                gates = gates - tf.multiply(diagonals, activation)

            if add_batch_norm:
                gates = slim.batch_norm(gates,
                                        center=True,
                                        scale=True,
                                        is_training=is_training,
                                        scope="gating_bn")
            else:
                gating_biases = tf.get_variable(
                    "gating_biases", [hidden1_size],  # must match gate width
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(feature_size)))
                gates += gating_biases

            gates = tf.sigmoid(gates)

            activation = tf.multiply(activation, gates)

        # activation = tf.layers.dropout(activation, rate = 0.1, training=is_training)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
Exemple #17
    def create_model(
            self,
            model_input,
            vocab_size,
            num_frames,
            iterations=None,
            add_batch_norm=None,
            sample_random_frames=None,
            cluster_size=None,
            hidden_size=None,
            is_training=True,
            expansion=2,
            groups=None,
            #mask=None,
            drop_rate=0.5,
            gating_reduction=None,
            **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = FLAGS.sample_random_frames if sample_random_frames is None else sample_random_frames
        cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
        hidden_size = hidden_size or FLAGS.nextvlad_hidden_size
        groups = groups or FLAGS.groups
        gating_reduction = gating_reduction or FLAGS.gating_reduction

        num_frames_exp = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames_exp,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input,
                                                     num_frames_exp,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        #reshaped_input = tf.reshape(model_input, [-1, feature_size])
        #tf.summary.histogram("input_hist", reshaped_input)

        mask = tf.sequence_mask(num_frames, max_frames, dtype=tf.float32)
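        # 1.0 for real frames, 0.0 for padding: padded frames are zeroed out of
        # the soft attention weights computed below.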

        expanded_input = slim.fully_connected(
            model_input,
            expansion * feature_size,
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer())

        attention = slim.fully_connected(
            model_input,
            groups,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=slim.variance_scaling_initializer())

        if mask is not None:
            attention = tf.multiply(attention, tf.expand_dims(mask, -1))
        attention = tf.reshape(attention, [-1, max_frames * groups, 1])
        tf.summary.histogram("sigmoid_attention", attention)
        reduce_size = expansion * feature_size // groups

        cluster_weights = tf.get_variable(
            "cluster_weights",
            [expansion * feature_size, groups * cluster_size],
            initializer=slim.variance_scaling_initializer())

        # tf.summary.histogram("cluster_weights", cluster_weights)
        reshaped_input = tf.reshape(expanded_input,
                                    [-1, expansion * feature_size])
        activation = tf.matmul(reshaped_input, cluster_weights)

        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="cluster_bn",
                                     fused=False)

        activation = tf.reshape(activation,
                                [-1, max_frames * groups, cluster_size])
        activation = tf.nn.softmax(activation, axis=-1)
        activation = tf.multiply(activation, attention)
        # tf.summary.histogram("cluster_output", activation)
        a_sum = tf.reduce_sum(activation, -2, keepdims=True)
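
        # a_sum[b, 1, k] is the total soft-assignment mass of cluster k;
        # multiplying it by the learned cluster centers below yields the
        # "sum of centers" term that is subtracted to form VLAD residuals.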

        cluster_weights2 = tf.get_variable(
            "cluster_weights2", [1, reduce_size, cluster_size],
            initializer=slim.variance_scaling_initializer())
        a = tf.multiply(a_sum, cluster_weights2)

        activation = tf.transpose(activation, perm=[0, 2, 1])

        reshaped_input = tf.reshape(expanded_input,
                                    [-1, max_frames * groups, reduce_size])
        vlad = tf.matmul(activation, reshaped_input)
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.subtract(vlad, a)

        vlad = tf.nn.l2_normalize(vlad, 1)

        vlad = tf.reshape(vlad, [-1, cluster_size * reduce_size])
        vlad = slim.batch_norm(vlad,
                               center=True,
                               scale=True,
                               is_training=is_training,
                               scope="vlad_bn",
                               fused=False)

        if drop_rate > 0.:
            vlad = slim.dropout(vlad,
                                keep_prob=1. - drop_rate,
                                is_training=is_training,
                                scope="vlad_dropout")

        vlad_dim = vlad.get_shape().as_list()[1]
        print("VLAD dimension", vlad_dim)
        hidden_weights = tf.get_variable(
            "hidden_weights", [vlad_dim, hidden_size],
            initializer=slim.variance_scaling_initializer())

        activation = tf.matmul(vlad, hidden_weights)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden_bn",
                                     fused=False)

        activation = tf.nn.relu(activation, name='embedding1')

        gating_weights_1 = tf.get_variable(
            "gating_weights_1", [hidden_size, hidden_size // gating_reduction],
            initializer=slim.variance_scaling_initializer())

        gates = tf.matmul(activation, gating_weights_1)

        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                activation_fn=slim.nn.relu,
                                scope="gating_bn")

        gating_weights_2 = tf.get_variable(
            "gating_weights_2", [hidden_size // gating_reduction, hidden_size],
            initializer=slim.variance_scaling_initializer())
        gates = tf.matmul(gates, gating_weights_2)

        gates = tf.sigmoid(gates)
        tf.summary.histogram("final_gates", gates)

        activation = tf.multiply(activation, gates, name="embedding2")

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
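
The descriptor size this NeXtVLAD block prints follows directly from the reshapes above. A quick back-of-the-envelope check, using typical flag values rather than anything read from FLAGS:

# illustrative defaults, not read from FLAGS
feature_size = 1024 + 128                          # video + audio per frame
expansion = 2
groups = 8
cluster_size = 64
reduce_size = expansion * feature_size // groups   # 288
vlad_dim = cluster_size * reduce_size              # 18432, the printed "VLAD dimension"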
Exemple #18
    def create_model(self, model_input, vocab_size, num_frames,
                     **unused_params):
        """Creates a model which uses a logistic classifier over the average of the

    frame-level features.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
        num_frames_t = num_frames
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        feature_size = model_input.get_shape().as_list()[2]
        iterations = 5  #150
        attention_size = 8
        if FLAGS.is_train:
            iterations = 120
            model_input = utils.SampleRandomFrames(model_input[:, 15:, :],
                                                   num_frames - 15 - 15,
                                                   iterations)
            model_input = model_input + tf.random_normal(
                shape=tf.shape(model_input),
                mean=0.0,
                stddev=1e-3,
                dtype=tf.float32)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        video_attention = MultiAttentionLayers(1024, iterations, 256,
                                               attention_size)
        audio_attention = MultiAttentionLayers(128, iterations, 256 // 4,
                                               attention_size)

        model_input = slim.batch_norm(model_input,
                                      center=True,
                                      scale=True,
                                      is_training=True,
                                      scope="model_input_bn")

        with tf.variable_scope("video_Attention"):
            attention_video = video_attention.forward(model_input[:, :,
                                                                  0:1024])
        with tf.variable_scope("audio_Attention"):
            attention_audio = audio_attention.forward(model_input[:, :, 1024:])

        pooled = tf.concat([attention_video, attention_audio], axis=1)
        #instance_att#tf.reduce_mean(pooledi,axis=1)

        print('pooled is', pooled)
        pooled = tf.reshape(tf.transpose(pooled, perm=[0, 2, 1]), [-1, 1152])
        dr2 = tf.get_variable("dr2", [feature_size, 1024],
                              initializer=tf.random_normal_initializer(
                                  stddev=1 / math.sqrt(feature_size)))
        pooled = tf.matmul(pooled, dr2)

        pooled = slim.batch_norm(pooled,
                                 center=True,
                                 scale=True,
                                 is_training=True,
                                 scope="pooled_bn")

        gating_weights = tf.get_variable(
            "gating_weights_2", [1024, 1024],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(1024)))
        gates = tf.matmul(pooled, gating_weights)
        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=True,
                                scope="gating_bn")
        gates = tf.sigmoid(gates)
        pooled = tf.multiply(pooled, gates)

        results_temp = aggregated_model().create_model(model_input=pooled,
                                                       vocab_size=vocab_size,
                                                       **unused_params)
        results_temp['predictions'] = tf.reduce_max(tf.reshape(
            results_temp['predictions'], [-1, attention_size, vocab_size]),
                                                    axis=1)
        print(results_temp)
        return results_temp
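
The final reduce_max implements per-class max pooling over the attention heads: the video-level classifier scores all `attention_size` pooled vectors in one batch, and each class keeps its best-scoring head. In shapes (a toy sketch, names illustrative):

import tensorflow as tf

batch, heads, classes = 2, 8, 5
scores = tf.random_uniform([batch * heads, classes])  # classifier output per head
per_head = tf.reshape(scores, [-1, heads, classes])   # [batch, heads, classes]
final = tf.reduce_max(per_head, axis=1)               # [batch, classes]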
Exemple #19
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
    relu = FLAGS.dbof_relu
    cluster_activation = FLAGS.dbof_activation

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if cluster_activation == 'glu':
        cluster_size = 2*cluster_size

    video_Dbof = DBoF(1024, max_frames, cluster_size, cluster_activation,
                      add_batch_norm, is_training)
    audio_Dbof = DBoF(128, max_frames, cluster_size // 8, cluster_activation,
                      add_batch_norm, is_training)


    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_DBOF"):
        dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 

    with tf.variable_scope("audio_DBOF"):
        dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:])

    dbof = tf.concat([dbof_video, dbof_audio],1)

    dbof_dim = dbof.get_shape().as_list()[1] 

    hidden1_weights = tf.get_variable("hidden1_weights",
      [dbof_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(dbof, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        **unused_params)
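
When `dbof_activation` is 'glu', the cluster size is doubled above because a gated linear unit splits the projection into values and gates. A minimal sketch of that activation (the DBoF class itself is defined elsewhere in this file; this helper is illustrative):

import tensorflow as tf


def glu(x):
    # GLU(x) = a * sigmoid(b), with the last dimension of x split in half
    values, gates = tf.split(x, num_or_size_splits=2, axis=-1)
    return values * tf.sigmoid(gates)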
Exemple #20
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = 300
        add_batch_norm = True
        random_frames = True
        cluster_size = 32
        hidden1_size = 1024
        relu = False
        gating = True

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.summary.histogram("input_hist", reshaped_input)

        video_NetFV = NetFV(1024, max_frames, cluster_size, add_batch_norm,
                            is_training)
        audio_NetFV = NetFV(128, max_frames, cluster_size // 2, add_batch_norm,
                            is_training)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        with tf.variable_scope("video_FV"):
            fv_video = video_NetFV.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_FV"):
            fv_audio = audio_NetFV.forward(reshaped_input[:, 1024:])

        fv = tf.concat([fv_video, fv_audio], 1)

        fv_dim = fv.get_shape().as_list()[1]
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [fv_dim, hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(cluster_size)))

        activation = tf.matmul(fv, hidden1_weights)

        if add_batch_norm and relu:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases

        if relu:
            activation = tf.nn.relu6(activation)

        if gating:
            gating_weights = tf.get_variable(
                "gating_weights_2", [hidden1_size, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(hidden1_size)))

            gates = tf.matmul(activation, gating_weights)

            if add_batch_norm:
                gates = slim.batch_norm(gates,
                                        center=True,
                                        scale=True,
                                        is_training=is_training,
                                        scope="gating_bn")
            else:
                gating_biases = tf.get_variable(
                    "gating_biases", [hidden1_size],  # must match gate width
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(feature_size)))
                gates += gating_biases

            gates = tf.sigmoid(gates)

            activation = tf.multiply(activation, gates)

        aggregated_model = getattr(video_level_models, 'willow_MoeModel_moe4')

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 8000
    hidden1_size = 1024
    fc_dimred = True
    relu = False
    max_pool = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                          add_batch_norm, is_training)
    audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                          add_batch_norm, is_training)


    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_DBOF"):
        dbof_video = video_Dbof.forward(reshaped_input[:,0:1024])

    with tf.variable_scope("audio_DBOF"):
        dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:])

    dbof = tf.concat([dbof_video, dbof_audio],1)

    dbof_dim = dbof.get_shape().as_list()[1]

    if fc_dimred:
        hidden1_weights = tf.get_variable("hidden1_weights",
          [dbof_dim, hidden1_size],
          initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(dbof, hidden1_weights)

        if add_batch_norm and relu:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=is_training,
              scope="hidden1_bn")
        else:
          hidden1_biases = tf.get_variable("hidden1_biases",
            [hidden1_size],
            initializer = tf.random_normal_initializer(stddev=0.01))
          tf.summary.histogram("hidden1_biases", hidden1_biases)
          activation += hidden1_biases

        if relu:
          activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)
    else:
        activation = dbof

    aggregated_model = getattr(video_level_models,
                               'willow_MoeModel_moe2_noGP')


    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
  def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
    """Creates a model which uses a seqtoseq model to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
           frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    if True:
        self.dim_image = dim_image = model_input.get_shape().as_list()[2]
        self.n_words = n_words = vocab_size
        self.dim_hidden = dim_hidden = FLAGS.lstm_cells
        self.batch_size = tf.shape(model_input)[0]
        self.n_lstm_steps = n_lstm_steps = 20
        self.drop_out_rate = drop_out_rate = 0.4
        bias_init_vector = None
        n_caption_step = 20  # model_input.get_shape().as_list()[1]

        self.Wemb = tf.get_variable(
            'Wemb', [n_words, dim_hidden],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))

        self.lstm3 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                             use_peepholes=True,
                                             state_is_tuple=True)
        if is_training:
            self.lstm3_dropout = tf.contrib.rnn.DropoutWrapper(
                self.lstm3, output_keep_prob=1 - self.drop_out_rate)
        else:
            self.lstm3_dropout = self.lstm3
        
        self.lstm31 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                              use_peepholes=True,
                                              state_is_tuple=True)
        if is_training:
            self.lstm3_dropout1 = tf.contrib.rnn.DropoutWrapper(
                self.lstm31, output_keep_prob=1 - self.drop_out_rate)
        else:
            self.lstm3_dropout1 = self.lstm31
        self.encode_image_W = tf.get_variable(
            'encode_image_W', [dim_image, dim_hidden],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))
        self.encode_image_b = tf.get_variable(
            'encode_image_b', [dim_hidden],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))
        self.embed_att_w = tf.get_variable(
            'embed_att_w', [dim_hidden, 1],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))
        self.embed_att_Wa = tf.get_variable(
            'embed_att_Wa', [dim_hidden, dim_hidden],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))
        self.embed_att_Ua = tf.get_variable(
            'embed_att_Ua', [dim_hidden, dim_hidden],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))
        self.embed_att_ba = tf.get_variable(
            'embed_att_ba', [dim_hidden],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))

        self.embed_word_W = tf.get_variable(
            'embed_word_W', [dim_hidden, n_words],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(n_words)))
        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32),
                                            name='embed_word_b')
        else:
            self.embed_word_b = tf.get_variable(
                'embed_word_b', [n_words],
                initializer=tf.random_normal_initializer(stddev=1 /
                                                         math.sqrt(n_words)))

        self.embed_nn_Wp = tf.get_variable(
            'embed_nn_Wp', [3 * dim_hidden, dim_hidden],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))
        self.embed_nn_bp = tf.get_variable(
            'embed_nn_bp', [dim_hidden],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(dim_hidden)))
        
        #print(model_input.get_shape().as_list())
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        video = utils.SampleRandomFrames(model_input, num_frames, n_lstm_steps)
        #print(video.get_shape().as_list())
        video_flat = tf.reshape(video, [-1, self.dim_image]) # (b x n) x d
        image_emb = tf.nn.xw_plus_b( video_flat, self.encode_image_W, self.encode_image_b) # (b x n) x h
        image_emb = tf.reshape(image_emb, [self.batch_size, self.n_lstm_steps, self.dim_hidden]) # b x n x h
        image_emb = tf.transpose(image_emb, [1,0,2]) # n x b x h

        state1 = self.lstm3.zero_state(self.batch_size, dtype=tf.float32)#tf.zeros([self.batch_size, self.lstm3.state_size]) # b x s
        h_prev = tf.zeros([self.batch_size, self.dim_hidden]) # b x h
        
        state11 = self.lstm31.zero_state(self.batch_size, dtype=tf.float32)# # b x s
        h_prev1 = tf.zeros([self.batch_size, self.dim_hidden]) # b x h

        loss_caption = 0.0
        
        probs = []

        current_embed = tf.zeros([self.batch_size, self.dim_hidden]) # b x h
        
        image_part = tf.reshape(image_emb, [-1, self.dim_hidden])
        image_part = tf.matmul(image_part, self.embed_att_Ua) + self.embed_att_ba
        image_part = tf.reshape(image_part, [self.n_lstm_steps, self.batch_size, self.dim_hidden])
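        # image_part precomputes U_a * h_img + b_a once; inside the decoding
        # loop only the query term W_a * h_prev changes, so each attention
        # score is e = w^T tanh(W_a * h_prev + image_part).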
        with tf.variable_scope("model") as scope:
            for i in range(n_caption_step):
                e = tf.tanh(tf.matmul(h_prev, self.embed_att_Wa) + image_part) # n x b x h
    #            e = tf.batch_matmul(e, brcst_w)    # unnormalized relevance score 
                e = tf.reshape(e, [-1, self.dim_hidden])
                e = tf.matmul(e, self.embed_att_w) # n x b
                e = tf.reshape(e, [self.n_lstm_steps, self.batch_size])
    #            e = tf.reduce_sum(e,2) # n x b
                e_hat_exp = tf.exp(e)  # tf.multiply(tf.transpose(video_mask), tf.exp(e)) # n x b
                denomin = tf.reduce_sum(e_hat_exp, 0)  # b
                denomin = denomin + tf.to_float(tf.equal(denomin, 0))  # regularize denominator
                # normalize to obtain alpha: n x b x h
                alphas = tf.tile(tf.expand_dims(tf.div(e_hat_exp, denomin), 2),
                                 [1, 1, self.dim_hidden])
                attention_list = tf.multiply(alphas, image_emb)  # n x b x h
                atten = tf.reduce_sum(attention_list, 0)  # b x h, soft-attention weighted sum
#                if i > 0: tf.get_variable_scope().reuse_variables()
                if i > 0: scope.reuse_variables()

                with tf.variable_scope("LSTM3"):
                    output12, state11 = self.lstm3_dropout1(tf.concat([atten, current_embed], 1), state11 ) # b x h
                with tf.variable_scope("LSTM31"):    
                    output1, state1 = self.lstm3_dropout(output12, state1 ) # b x h

                output2 = tf.tanh(tf.nn.xw_plus_b(tf.concat([output1,atten,current_embed], 1), self.embed_nn_Wp, self.embed_nn_bp)) # b x h
                h_prev = output1 # b x h               

                logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W, self.embed_word_b) # b x w
                probs.append(logit_words)
                        
        tf_probs = tf.stack(probs, 0)
        tf_probs = tf.transpose(tf_probs, [1, 0, 2])
        return {'predictions': tf.nn.softmax(tf.reduce_mean(tf_probs, 1))}
Exemple #23
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        """See base class.

    Args:
       model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
         input features.
       vocab_size: The number of classes in the dataset.
       num_frames: A vector of length 'batch' which indicates the number of
         frames for each video (before padding).
       iterations: the number of frames to be sampled.
       add_batch_norm: whether to add batch norm during training.
       sample_random_frames: whether to sample random frames or random sequences.
       cluster_size: the output neuron number of the cluster layer.
       hidden_size: the output neuron number of the hidden layer.
       is_training: whether to build the graph in training mode.

    Returns:
       A dictionary with a tensor containing the probability predictions of the
       model in the 'predictions' key. The dimensions of the tensor are
       'batch_size' x 'num_classes'.
    """
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size
        act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation)
        assert act_fn is not None, ("dbof_activation is not valid: %s." %
                                    FLAGS.dbof_activation)

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.compat.v1.summary.histogram("input_hist", reshaped_input)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(
                reshaped_input,
                center=True,
                scale=True,
                is_training=is_training,
                scope="input_bn",
            )

        cluster_weights = tf.compat.v1.get_variable(
            "cluster_weights",
            [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(feature_size)),
        )
        tf.compat.v1.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=is_training,
                scope="cluster_bn",
            )
        else:
            cluster_biases = tf.compat.v1.get_variable(
                "cluster_biases",
                [cluster_size],
                initializer=tf.random_normal_initializer(stddev=1 /
                                                         math.sqrt(feature_size)),
            )
            tf.compat.v1.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases
        # activation = act_fn(activation)
        # xxx 2018
        activation = tf.nn.relu6(activation)
        tf.compat.v1.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

        hidden1_weights = tf.compat.v1.get_variable(
            "hidden1_weights",
            [cluster_size, hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(cluster_size)),
        )
        tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=is_training,
                scope="hidden1_bn",
            )
        else:
            hidden1_biases = tf.compat.v1.get_variable(
                "hidden1_biases",
                [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01),
            )
            tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        # xxx 2018
        # activation = tf.nn.relu6(activation)
        activation = act_fn(activation)
        tf.compat.v1.summary.histogram("hidden1_output", activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               **unused_params)
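
This variant dispatches on `self.ACT_FN_MAP`, which is defined on the class outside this snippet. A plausible minimal definition (an assumption, not copied from this file):

    # hypothetical map from the flag value to a TF activation function
    ACT_FN_MAP = {
        "sigmoid": tf.nn.sigmoid,
        "relu6": tf.nn.relu6,
    }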
Exemple #24
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     groups=None,
                     expansion=None,
                     drop_rate=None,
                     gating_reduction=None,
                     **unused_params):
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if FLAGS.sample_random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   FLAGS.iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     FLAGS.iterations)

        cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
        gating_reduction = gating_reduction or FLAGS.gating_reduction
        groups = groups or FLAGS.groups
        drop_rate = drop_rate or FLAGS.drop_rate
        expansion = expansion or FLAGS.expansion

        max_frames = model_input.get_shape().as_list()[1]
        mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)
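        # After SampleRandomFrames/SampleRandomSequence every clip has exactly
        # FLAGS.iterations frames, so this mask is effectively all ones; it is
        # kept to satisfy the NeXtVLAD.forward(mask=...) interface.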
        video_nextvlad = NeXtVLAD(1024,
                                  max_frames,
                                  cluster_size,
                                  is_training,
                                  groups=groups,
                                  expansion=expansion)
        audio_nextvlad = NeXtVLAD(128,
                                  max_frames,
                                  cluster_size // 2,
                                  is_training,
                                  groups=groups // 2,
                                  expansion=expansion)

        with tf.variable_scope("video_VLAD"):
            vlad_video = video_nextvlad.forward(model_input[:, :, 0:1024],
                                                mask=mask)

        with tf.variable_scope("audio_VLAD"):
            vlad_audio = audio_nextvlad.forward(model_input[:, :, 1024:],
                                                mask=mask)

        vlad = tf.concat([vlad_video, vlad_audio], 1)

        if drop_rate > 0.:
            vlad = slim.dropout(vlad,
                                keep_prob=1. - drop_rate,
                                is_training=is_training,
                                scope="vlad_dropout")

        vlad_dim = vlad.get_shape().as_list()[1]
        print("VLAD dimension", vlad_dim)
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [vlad_dim, hidden1_size],
            initializer=slim.variance_scaling_initializer())

        activation = tf.matmul(vlad, hidden1_weights)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn",
                                     fused=False)

        # activation = tf.nn.relu(activation)

        gating_weights_1 = tf.get_variable(
            "gating_weights_1",
            [hidden1_size, hidden1_size // gating_reduction],
            initializer=slim.variance_scaling_initializer())

        gates = tf.matmul(activation, gating_weights_1)

        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                activation_fn=slim.nn.relu,
                                scope="gating_bn")

        gating_weights_2 = tf.get_variable(
            "gating_weights_2",
            [hidden1_size // gating_reduction, hidden1_size],
            initializer=slim.variance_scaling_initializer())
        gates = tf.matmul(gates, gating_weights_2)

        gates = tf.sigmoid(gates)
        tf.summary.histogram("final_gates", gates)

        activation = tf.multiply(activation, gates)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.netvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
        relu = FLAGS.netvlad_relu
        dimred = FLAGS.netvlad_dimred
        gating = FLAGS.gating
        remove_diag = FLAGS.gating_remove_diag
        lightvlad = FLAGS.lightvlad
        vlagd = FLAGS.vlagd
        SVD_dim = FLAGS.SVD_dim

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])

        video_NetVLAD = NetFV(1024, max_frames, int(cluster_size),
                              add_batch_norm, is_training)
        audio_NetVLAD = NetFV(128, max_frames, int(cluster_size / 2),
                              add_batch_norm, is_training)

        if add_batch_norm:  # and not lightvlad:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        with tf.variable_scope("video_VLAD"):
            vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_VLAD"):
            vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

        vlad = tf.concat([vlad_video, vlad_audio], 1)  # None x vlad_dim

        vlad_dim = vlad.get_shape().as_list()[1]

        ##### simpler SVD #####
        SVD_mat1 = tf.get_variable("hidden1_weights", [vlad_dim, SVD_dim],
                                   initializer=tf.glorot_uniform_initializer())

        SVD_mat2 = tf.get_variable("hidden2_weights",
                                   [SVD_dim, int(hidden1_size * 2)],
                                   initializer=tf.glorot_uniform_initializer())

        SVD_mat1_biases = tf.get_variable(
            "SVD_mat1_biases", [SVD_dim],
            initializer=tf.random_normal_initializer(stddev=0.01))

        SVD_mat2_biases = tf.get_variable(
            "SVD_mat2_biases", [int(hidden1_size * 2)],
            initializer=tf.random_normal_initializer(stddev=0.01))
        ##### simpler SVD #####
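
        # Factoring the vlad_dim x (2 * hidden1_size) projection through an
        # SVD_dim bottleneck cuts its parameters from
        # vlad_dim * 2 * hidden1_size to (vlad_dim + 2 * hidden1_size) * SVD_dim.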

        activation = tf.matmul(vlad, SVD_mat1)  # None x 256
        activation += SVD_mat1_biases
        tf.summary.histogram("activation_in_mid_of_SVD_before_tanh",
                             activation)

        activation = tf.nn.tanh(activation)
        tf.summary.histogram("activation_in_mid_of_SVD_after_tanh", activation)

        activation = tf.matmul(activation, SVD_mat2)  # None x 2*hidden1_size
        activation += SVD_mat2_biases
        tf.summary.histogram("activation_after_SVD_project", activation)

        ## gating part
        gating_weights = tf.get_variable(
            "gating_weights_2", [int(2 * hidden1_size), hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(hidden1_size)))

        gates = tf.matmul(activation, gating_weights)

        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                scope="gating_bn")

        gates = tf.sigmoid(gates)
        tf.summary.histogram("gates_layer", gates)
        ## gating part

        activation = tf.nn.tanh(activation)  # first tanh
        tf.summary.histogram("activation_after_1_tanh", activation)

        activation = tf.layers.dropout(activation,
                                       rate=0.3,
                                       training=is_training)
        tf.summary.histogram("activation_after_1_tanh_after_dropout",
                             activation)

        activation_hidden_weights = tf.get_variable(
            "activation_hidden_weights", [int(hidden1_size * 2), hidden1_size],
            initializer=tf.glorot_uniform_initializer())
        activation = tf.matmul(activation, activation_hidden_weights)
        tf.summary.histogram("activation_fter_1_tanh_after_hidden_weights",
                             activation)

        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden_layer_bn")
        tf.summary.histogram(
            "activation_fter_1_tanh_after_hidden_weights_after_bn", activation)

        activation = tf.nn.tanh(activation)  # second tanh
        tf.summary.histogram(
            "activation_fter_1_tanh_after_hidden_weights_after_bn_after_2_tanh",
            activation)

        activation = tf.multiply(activation, gates)
        tf.summary.histogram("activation_right_before_video", activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)