def create_model(self, model_input, vocab_size, num_frames,
                 is_training=True, **unused_params):
  """Creates a model which uses a stack of LSTMs to represent the video.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  lstm_size = FLAGS.lstm_cells
  number_of_layers = FLAGS.lstm_layers
  random_frames = FLAGS.lstm_random_sequence
  iterations = FLAGS.iterations
  backward = FLAGS.lstm_backward

  if random_frames:
    # NOTE: after sampling, the sequence has `iterations` steps while
    # sequence_length below still reflects the original frame counts.
    num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    model_input = utils.SampleRandomFrames(model_input, num_frames_2,
                                           iterations)
  if backward:
    model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1)

  stacked_lstm = tf.contrib.rnn.MultiRNNCell(
      [
          tf.contrib.rnn.BasicLSTMCell(
              lstm_size, forget_bias=1.0, state_is_tuple=False)
          for _ in range(number_of_layers)
      ],
      state_is_tuple=False)

  with tf.variable_scope("RNN"):
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=state,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames,
                 is_training=True, **unused_params):
  """Creates a model which uses a stack of bidirectional LSTMs to represent
  the video.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  lstm_size = FLAGS.lstm_cells
  number_of_layers = FLAGS.lstm_layers
  random_frames = FLAGS.lstm_random_sequence
  iterations = FLAGS.iterations
  dropout_rate = FLAGS.dropout_rate

  if random_frames:
    num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    model_input = utils.SampleRandomFrames(model_input, num_frames_2,
                                           iterations)

  lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(
      lstm_size, forget_bias=1.0)  # Forward direction cell
  lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(
      lstm_size, forget_bias=1.0)  # Backward direction cell
  if is_training:
    lstm_fw_cell = tf.nn.rnn_cell.DropoutWrapper(
        lstm_fw_cell, output_keep_prob=(1 - dropout_rate))
    lstm_bw_cell = tf.nn.rnn_cell.DropoutWrapper(
        lstm_bw_cell, output_keep_prob=(1 - dropout_rate))
  # NOTE: list multiplication reuses one cell object across layers
  # (see the demonstration after this function).
  lstm_fw_cell = tf.nn.rnn_cell.MultiRNNCell(
      [lstm_fw_cell] * (number_of_layers - 1), state_is_tuple=False)
  lstm_bw_cell = tf.nn.rnn_cell.MultiRNNCell(
      [lstm_bw_cell] * (number_of_layers - 1), state_is_tuple=False)

  with tf.variable_scope("RNN"):
    # static_bidirectional_rnn expects a time-major list of inputs and
    # returns (outputs, output_state_fw, output_state_bw).
    outputs, state_fw, state_bw = tf.contrib.rnn.static_bidirectional_rnn(
        lstm_fw_cell, lstm_bw_cell, tf.unstack(model_input, axis=1),
        dtype=tf.float32)
    state = tf.concat([state_fw, state_bw], 1)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=state,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
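# Why the MultiRNNCell construction above deserves caution: Python list
# multiplication repeats one object rather than creating independent cells,
# so every layer wraps the same instance. Minimal demonstration:
def _shared_instance_demo():
  cell = object()  # stand-in for an RNN cell
  layers = [cell] * 3
  return layers[0] is layers[1] is layers[2]  # True: one shared instance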
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
  """Creates a model which uses a stack of bidirectional LSTMs to represent
  the video.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  lstm_size = FLAGS.lstm_cells
  number_of_layers = FLAGS.lstm_layers

  stacked_lstm_fw = tf.contrib.rnn.MultiRNNCell([
      tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
      for _ in range(number_of_layers)
  ])
  stacked_lstm_bw = tf.contrib.rnn.MultiRNNCell([
      tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
      for _ in range(number_of_layers)
  ])

  outputs, state = tf.nn.bidirectional_dynamic_rnn(stacked_lstm_fw,
                                                   stacked_lstm_bw,
                                                   model_input,
                                                   sequence_length=num_frames,
                                                   dtype=tf.float32)
  # Concatenate the final forward and backward hidden states.
  s = tf.concat([state[0][-1].h, state[1][-1].h], 1)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  if FLAGS.video_level_classifier_model == 'FrameMoeModel':
    nf = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    outputs = utils.SampleRandomFrames(tf.concat(outputs, 2), nf,
                                       FLAGS.iterations)
    return aggregated_model().create_model(model_input=outputs,
                                           vocab_size=vocab_size,
                                           num_frames=num_frames,
                                           **unused_params)
  else:
    return aggregated_model().create_model(model_input=s,
                                           vocab_size=vocab_size,
                                           **unused_params)
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
  """Creates a model which applies a small CNN to each frame feature and
  max-pools over frames to represent the video.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  max_frame = 128
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  model_input = utils.SampleRandomFrames(model_input, num_frames, max_frame)
  # max_frame = model_input.get_shape().as_list()[1]

  # Treat each (assumed 1024-D) frame feature as a 32x32 one-channel image.
  image = tf.reshape(model_input, [-1, 32, 32])
  image = tf.expand_dims(image, 3)

  with slim.arg_scope(
      [slim.conv2d],
      weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
      weights_regularizer=slim.l2_regularizer(0.0005),
      normalizer_fn=slim.batch_norm):
    # Spatial size: 32 -> 28 -> 14 -> 10 -> 5 -> 1 through the VALID
    # convolutions and 2x2 poolings below.
    net = slim.conv2d(image, 32, [5, 5], padding='VALID', scope='conv1')
    net = slim.relu(net, 32, scope='relu1')
    net = slim.max_pool2d(net, [2, 2], scope='pool1')
    net = slim.conv2d(net, 64, [5, 5], padding='VALID', scope='conv2')
    net = slim.relu(net, 64, scope='relu2')
    net = slim.max_pool2d(net, [2, 2], scope='pool2')
    net = slim.conv2d(net, 128, [5, 5], padding='VALID', scope='conv3')
    net = slim.relu(net, 128, scope='relu3')
    net = tf.squeeze(net, [1, 2], name='squeezed')
    net = tf.reshape(net, [-1, max_frame, 128])
    net = utils.FramePooling(net, 'max')
    net = slim.fully_connected(net, 512, scope='fc4')

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=net,
                                         vocab_size=vocab_size,
                                         **unused_params)
def get_input():
  num_frames_cast = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    sample_model_input = utils.SampleRandomFrames(model_input,
                                                  num_frames_cast, iterations)
  else:
    sample_model_input = utils.SampleRandomSequence(model_input,
                                                    num_frames_cast,
                                                    iterations)
  max_frames = sample_model_input.get_shape().as_list()[1]
  feature_size = sample_model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(sample_model_input, [-1, feature_size])
  tf.summary.histogram("input", reshaped_input)
  if input_add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn",
                                     reuse=tf.AUTO_REUSE)
  return reshaped_input, max_frames, feature_size
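# The samplers used by get_input live in the repo's utils module. As a
# reference, here is a minimal NumPy sketch of uniform random frame sampling
# (an assumption about SampleRandomFrames' behaviour, not the repo's code):
def _sample_random_frames_np(model_input, num_frames, iterations, seed=None):
  """model_input: [batch, max_frames, features]; num_frames: [batch];
  returns [batch, iterations, features]."""
  import numpy as np  # local import keeps the sketch self-contained
  rng = np.random.RandomState(seed)
  batch, _, features = model_input.shape
  out = np.empty((batch, iterations, features), dtype=model_input.dtype)
  for b in range(batch):
    # Sample with replacement so short videos still yield `iterations` frames.
    idx = rng.randint(0, max(int(num_frames[b]), 1), size=iterations)
    out[b] = model_input[b, idx]
  return out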
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.netvlad_cluster_size
  hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
  relu = FLAGS.netvlad_relu
  dimred = FLAGS.netvlad_dimred
  gating = FLAGS.gating
  remove_diag = FLAGS.gating_remove_diag
  print("FLAGS.lightvlad", FLAGS.lightvlad)
  lightvlad = FLAGS.lightvlad
  vlagd = FLAGS.vlagd

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  print("num_frames:", num_frames)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames, iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])

  # NOTE: only the lightvlad path constructs the aggregators here; running
  # with lightvlad disabled would leave the names below undefined.
  if lightvlad:
    video_NetVLAD = LightVLAD(1024, max_frames, cluster_size, add_batch_norm)
    audio_NetVLAD = LightVLAD(128, max_frames, cluster_size // 2,
                              add_batch_norm)

  if add_batch_norm:  # and not lightvlad:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     scope="input_bn")

  with tf.variable_scope("video_VLAD"):
    vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])
  with tf.variable_scope("audio_VLAD"):
    vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

  vlad = tf.concat([vlad_video, vlad_audio], 1)
  vlad_dim = vlad.get_shape().as_list()[1]
  hidden1_weights = tf.get_variable(
      "hidden1_weights", [vlad_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(cluster_size)))
  activation = tf.matmul(vlad, hidden1_weights)

  if add_batch_norm and relu:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases

  if relu:
    activation = tf.nn.relu6(activation)

  if gating:
    gating_weights = tf.get_variable(
        "gating_weights_2", [hidden1_size, hidden1_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(hidden1_size)))
    gates = tf.matmul(activation, gating_weights)
    if remove_diag:
      # Removes the diagonal coefficients of the gating matrix.
      diagonals = tf.matrix_diag_part(gating_weights)
      gates = gates - tf.multiply(diagonals, activation)
    if add_batch_norm:
      gates = slim.batch_norm(gates,
                              center=True,
                              scale=True,
                              scope="gating_bn")
    else:
      gating_biases = tf.get_variable(
          "gating_biases", [hidden1_size],  # must match the gates dimension
          initializer=tf.random_normal_initializer(stddev=1 /
                                                   math.sqrt(feature_size)))
      gates += gating_biases
    gates = tf.sigmoid(gates)
    activation = tf.multiply(activation, gates)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         **unused_params)
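# For reference, the aggregation NetVLAD/LightVLAD perform is, per cluster k:
# vlad_k = sum_i a_k(x_i) * (x_i - c_k), with soft assignments a = softmax(xW).
# A minimal NumPy sketch of that computation (an illustrative assumption, not
# the repo's NetVLAD class):
def _netvlad_np(frames, cluster_weights, cluster_centers):
  """frames: [num_frames, dim]; cluster_weights: [dim, K];
  cluster_centers: [K, dim]; returns a flattened [K * dim] descriptor."""
  import numpy as np
  logits = frames @ cluster_weights                  # [N, K]
  logits -= logits.max(axis=1, keepdims=True)        # numerically stable softmax
  assign = np.exp(logits)
  assign /= assign.sum(axis=1, keepdims=True)        # soft assignment a_k(x_i)
  # Assignment-weighted residuals against each cluster center.
  vlad = assign.T @ frames - assign.sum(axis=0)[:, None] * cluster_centers
  vlad /= np.linalg.norm(vlad, axis=1, keepdims=True) + 1e-12  # intra-norm
  flat = vlad.reshape(-1)
  return flat / (np.linalg.norm(flat) + 1e-12)       # final L2 normalization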
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  model_input = utils.SampleRandomFrames(model_input, num_frames, 64)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  regularizer = slim.l2_regularizer(1e-8)
  padding = 'SAME'

  ##############
  # Get the attention state.
  lstm_size = 512
  # Shrink the dimension to lstm_size.
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  rnn_in = slim.fully_connected(reshaped_input, lstm_size,
                                weights_regularizer=regularizer)
  rnn_in = tf.reshape(rnn_in, [-1, max_frames, lstm_size])

  with tf.variable_scope('BLSTM1'):
    lstm_fw_cell = tf.contrib.rnn.LSTMCell(lstm_size, forget_bias=1.0,
                                           cell_clip=10.)
    lstm_bw_cell = tf.contrib.rnn.LSTMCell(lstm_size, forget_bias=1.0,
                                           cell_clip=10.)
    outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
        lstm_fw_cell, lstm_bw_cell, tf.unstack(rnn_in, axis=1),
        dtype=tf.float32)
  with tf.variable_scope('BLSTM2'):
    lstm_fw_cell1 = tf.contrib.rnn.LSTMCell(lstm_size, forget_bias=1.0,
                                            cell_clip=10.)
    lstm_bw_cell1 = tf.contrib.rnn.LSTMCell(lstm_size, forget_bias=1.0,
                                            cell_clip=10.)
    outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
        lstm_fw_cell1, lstm_bw_cell1, outputs, dtype=tf.float32)

  rnn_out = tf.reshape(outputs, [-1, 2 * lstm_size])
  att_pooling = slim.fully_connected(rnn_out, 1, activation_fn=None,
                                     weights_regularizer=regularizer)
  att_pooling = tf.reshape(att_pooling, [-1, max_frames, 1])
  att_pooling = tf.nn.softmax(att_pooling, dim=1)
  ###############

  cluster_size = 8192
  activation = slim.fully_connected(reshaped_input, cluster_size,
                                    weights_regularizer=regularizer)
  activation = slim.batch_norm(activation,
                               center=True,
                               scale=True,
                               is_training=is_training)
  activation = tf.reshape(activation, [-1, max_frames, cluster_size])
  # Attention-weighted pooling over the frame axis.
  att_pooled_max = tf.reduce_mean(tf.multiply(activation, att_pooling), 1)
  activation = slim.fully_connected(att_pooled_max, 2048,
                                    weights_regularizer=regularizer)
  activation = tf.reshape(activation, [-1, 2, 2, 512])
  # init_act = slim.flatten(activation)
  activation = slim.batch_norm(activation,
                               center=True,
                               scale=True,
                               is_training=is_training)
  activation = tf.nn.relu6(activation)
  activation = slim.conv2d(activation, 64, [1, 1], padding=padding,
                           weights_regularizer=regularizer)
  activation = slim.batch_norm(activation,
                               center=True,
                               scale=True,
                               is_training=is_training)
  activation = tf.nn.relu6(activation)
  activation = slim.conv2d(activation, 64, [3, 3], padding=padding,
                           weights_regularizer=regularizer)
  activation = slim.batch_norm(activation,
                               center=True,
                               scale=True,
                               is_training=is_training)
  activation = slim.flatten(activation)
  # init_act = slim.fully_connected(init_act, activation.get_shape().as_list()[1])
  # activation = activation + init_act
  # activation = tf.nn.relu6(activation)
  activation = slim.dropout(activation, 0.8, is_training=is_training)
  activation = slim.fully_connected(activation, 2048,
                                    weights_regularizer=regularizer)
  output = slim.fully_connected(activation, vocab_size,
                                activation_fn=tf.nn.sigmoid,
                                weights_regularizer=regularizer)
  return {"predictions": output}
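# For reference: the att_pooling branch above scores each frame with a
# scalar, softmaxes the scores over the frame axis, and uses them as pooling
# weights. A minimal NumPy sketch of that pooling step (illustrative only):
def _attention_pool_np(frame_feats, scores):
  """frame_feats: [num_frames, dim]; scores: [num_frames]; returns [dim]."""
  import numpy as np
  w = np.exp(scores - scores.max())
  w /= w.sum()                                    # softmax over frames
  return (w[:, None] * frame_feats).sum(axis=0)   # weighted sum of frames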
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, is_training=True,
                 hidden=None, n_enc_layers=None, n_dec_layers=None,
                 heads=None, **unused_params):
  iterations = iterations or FLAGS.iterations
  # add_batch_norm = add_batch_norm or FLAGS.add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  heads = heads or FLAGS.heads
  self.hidden = hidden or FLAGS.hidden
  n_enc_layers = n_enc_layers or FLAGS.n_enc_layers
  n_dec_layers = n_dec_layers or FLAGS.n_dec_layers

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames, iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  batch_size = model_input.get_shape().as_list()[0]
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  # reshaped_input = tf.reshape(model_input, [-1, feature_size])

  encoding = tf.layers.dense(
      inputs=model_input,
      units=self.hidden,
      # dtype=tf.float32,
      name="input_embedding")

  for i in np.arange(n_enc_layers):
    encoding = self.attention("enc_self_{}".format(i), encoding, h=heads)
    dense = tf.layers.dense(
        inputs=encoding,
        units=self.hidden * 2,
        activation=tf.nn.relu,
        name="enc_dense_{}_1".format(i),
    )
    encoding += tf.layers.dense(
        inputs=dense,
        units=self.hidden,
        activation=None,
        name="enc_dense_{}_2".format(i),
    )

  decoder_query = tf.get_variable(
      name="decoder_query",
      shape=(1, 1, self.hidden),
      dtype=tf.float32,
      initializer=tf.random_normal_initializer(stddev=1e-2),
  )
  decoding = self.attention(
      "dec_enc_0",
      tf.tile(decoder_query,
              multiples=tf.concat(([tf.shape(model_input)[0]], [1], [1]),
                                  axis=0)),
      encoding,
      h=heads)
  for i in np.arange(n_dec_layers - 1):
    # decoding = self.attention("dec_self_{}".format(i + 1), decoding, h=heads)
    decoding = self.attention("dec_enc_{}".format(i + 1), decoding, encoding,
                              h=heads)
    dense = tf.layers.dense(
        inputs=decoding,
        units=self.hidden * 2,
        activation=tf.nn.relu,
        name="dec_dense_{}_1".format(i + 1),
    )
    decoding += tf.layers.dense(
        inputs=dense,
        units=self.hidden,
        activation=None,
        name="dec_dense_{}_2".format(i + 1),
    )

  activation = tf.reshape(decoding, [-1, self.hidden])
  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
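# The self.attention helper used above is defined elsewhere in this class.
# For reference, a minimal single-head scaled dot-product attention in NumPy
# (an illustrative assumption about what such a helper computes):
def _scaled_dot_attention_np(query, keys, values):
  """query: [m, d]; keys, values: [n, d]; returns [m, d]."""
  import numpy as np
  d = query.shape[-1]
  scores = query @ keys.T / np.sqrt(d)           # [m, n] similarity scores
  scores -= scores.max(axis=-1, keepdims=True)   # numerically stable softmax
  weights = np.exp(scores)
  weights /= weights.sum(axis=-1, keepdims=True)
  return weights @ values                        # attention-weighted values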
def create_model(self, model_input, vocab_size, num_frames, labels,
                 scope='default', iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    reshaped_input = tf.expand_dims(reshaped_input, -1)
    reshaped_input = tf.expand_dims(reshaped_input, -1)
    out1 = tf.layers.conv2d(
        reshaped_input, 128, (32, 1), activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        padding='same')
    out1_norm = tf.layers.batch_normalization(out1, training=is_training)
    out1_pool = tf.layers.max_pooling2d(out1_norm, (8, 1), 2, padding='same')
    out2 = tf.layers.conv2d(
        out1_pool, 256, (32, 1), activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        padding='same')
    out2_norm = tf.layers.batch_normalization(out2, training=is_training)
    out2_pool = tf.layers.max_pooling2d(out2_norm, (8, 1), 2, padding='same')
    out3 = tf.layers.conv2d(
        out2_pool, 256, (32, 1), activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        padding='same')
    out3_norm = tf.layers.batch_normalization(out3, training=is_training)
    out3_pool = tf.layers.max_pooling2d(out3_norm, (8, 1), 2, padding='same')
    out = tf.reduce_max(out3_pool, axis=[2, 3])
    activation = tf.reshape(out, [-1, max_frames, out.shape[1]])
    cluster_size = out.shape[1]

    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
    activation = tf.layers.dense(
        activation, hidden1_size, activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer())
    tf.summary.histogram("activation", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    results = aggregated_model().create_model(model_input=activation,
                                              vocab_size=vocab_size,
                                              is_training=is_training,
                                              **unused_params)
    results['features'] = activation
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
    return results
def create_model(self, model_input, vocab_size, num_frames,
                 mix_number=None, cluster_size=None, hidden_size=None,
                 is_training=True, groups=None, expansion=None,
                 drop_rate=None, gating_reduction=None, **unused_params):
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  '''
  ####################################
  n_bag = 300 // 5
  model_input = utils.ReshapeFramesToMIL(model_input)
  # B x FLAGS.pad_seq_length/5 x 5 x 1132
  max_frames = 5
  feature_size = model_input.get_shape().as_list()[3]
  #####################################
  '''
  if FLAGS.sample_random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames,
                                           FLAGS.iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             FLAGS.iterations)

  cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
  hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
  gating_reduction = gating_reduction or FLAGS.gating_reduction
  groups = groups or FLAGS.groups
  drop_rate = drop_rate or FLAGS.drop_rate
  mix_number = mix_number or FLAGS.mix_number
  expansion = expansion or FLAGS.expansion

  max_frames = model_input.get_shape().as_list()[1]
  mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

  ftr_mean = tf.reduce_mean(model_input, axis=-1)
  ftr_mean = slim.batch_norm(ftr_mean,
                             center=True,
                             scale=True,
                             fused=True,
                             is_training=is_training,
                             scope="mix_weights_bn")
  mix_weights = slim.fully_connected(
      ftr_mean,
      mix_number,
      activation_fn=None,
      weights_initializer=slim.variance_scaling_initializer(),
      scope="mix_weights")
  mix_weights = tf.nn.softmax(mix_weights, axis=-1)
  tf.summary.histogram("mix_weights", mix_weights)

  results = []
  for n in range(mix_number):
    with tf.variable_scope("branch_%d" % n):
      res = self.nextvlad_model(video_ftr=model_input[:, :, 0:1024],
                                audio_ftr=model_input[:, :, 1024:],
                                vocab_size=vocab_size,
                                max_frames=max_frames,
                                cluster_size=cluster_size,
                                groups=groups,
                                expansion=expansion,
                                drop_rate=drop_rate,
                                hidden1_size=hidden1_size,
                                is_training=is_training,
                                gating_reduction=gating_reduction,
                                mask=mask,
                                **unused_params)
      results.append(res)

  aux_preds = [res["predictions"] for res in results]
  logits = [res["logits"] for res in results]
  logits = tf.stack(logits, axis=1)
  mix_logit = tf.reduce_sum(tf.multiply(tf.expand_dims(mix_weights, -1),
                                        logits), axis=1)
  pred = tf.nn.sigmoid(mix_logit)

  if is_training:
    rank_pred = tf.expand_dims(tf.nn.softmax(
        tf.div(mix_logit, FLAGS.cl_temperature), axis=-1), axis=1)
    aux_rank_preds = tf.nn.softmax(tf.div(logits, FLAGS.cl_temperature),
                                   axis=-1)
    epsilon = 1e-8
    kl_loss = tf.reduce_sum(
        rank_pred * (tf.log(rank_pred + epsilon) -
                     tf.log(aux_rank_preds + epsilon)),
        axis=-1)
    regularization_loss = FLAGS.cl_lambda * tf.reduce_mean(
        tf.reduce_sum(kl_loss, axis=-1), axis=-1)
    return {
        "predictions": pred,
        "regularization_loss": regularization_loss,
        "aux_predictions": aux_preds
    }
  else:
    return {"predictions": pred}
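# The regularizer above is an online-distillation term: the mixture
# prediction teaches each branch through a KL divergence at temperature
# FLAGS.cl_temperature. A NumPy sketch of that loss for a single example,
# matching the formula above (illustrative only):
def _distillation_kl_np(mix_logit, branch_logits, temperature=1.0):
  """mix_logit: [classes]; branch_logits: [branches, classes]; returns a
  scalar: sum over branches of KL(teacher || branch)."""
  import numpy as np
  def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)
  eps = 1e-8
  p = softmax(mix_logit / temperature)      # teacher (mixture) distribution
  q = softmax(branch_logits / temperature)  # one student per branch
  kl = (p[None, :] * (np.log(p[None, :] + eps) - np.log(q + eps))).sum(-1)
  return kl.sum()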
def create_model(self, model_input, vocab_size, num_frames,
                 mix_number=None, cluster_size=None, hidden_size=None,
                 is_training=True, groups=None, expansion=None,
                 drop_rate=None, gating_reduction=None, **unused_params):
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)

  config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  config = copy.deepcopy(config)
  config.num_hidden_layers = FLAGS.bert_hidden_layer
  config.num_attention_heads = FLAGS.bert_attention_heads
  config.hidden_dropout_prob = FLAGS.bert_dropout_prob
  config.attention_probs_dropout_prob = FLAGS.bert_dropout_prob
  if not is_training:
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0

  # breakpoint()
  with tf.variable_scope("encoder"):
    self.all_encoder_layers = modeling.transformer_model(
        input_tensor=model_input,
        attention_mask=None,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(config.hidden_act),
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        initializer_range=config.initializer_range,
        do_return_all_layers=True)
  model_input = self.all_encoder_layers[-1]

  if FLAGS.sample_random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames,
                                           FLAGS.iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             FLAGS.iterations)

  cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
  hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
  gating_reduction = gating_reduction or FLAGS.gating_reduction
  groups = groups or FLAGS.groups
  drop_rate = drop_rate or FLAGS.drop_rate
  mix_number = mix_number or FLAGS.mix_number
  expansion = expansion or FLAGS.expansion

  max_frames = model_input.get_shape().as_list()[1]
  mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

  ftr_mean = tf.reduce_mean(model_input, axis=-1)
  ftr_mean = slim.batch_norm(ftr_mean,
                             center=True,
                             scale=True,
                             fused=True,
                             is_training=is_training,
                             scope="mix_weights_bn")
  mix_weights = slim.fully_connected(
      ftr_mean,
      mix_number,
      activation_fn=None,
      weights_initializer=slim.variance_scaling_initializer(),
      scope="mix_weights")
  mix_weights = tf.nn.softmax(mix_weights, axis=-1)
  tf.summary.histogram("mix_weights", mix_weights)

  results = []
  for n in range(mix_number):
    with tf.variable_scope("branch_%d" % n):
      res = self.nextvlad_model(video_ftr=model_input[:, :, 0:1024],
                                audio_ftr=model_input[:, :, 1024:],
                                vocab_size=vocab_size,
                                max_frames=max_frames,
                                cluster_size=cluster_size,
                                groups=groups,
                                expansion=expansion,
                                drop_rate=drop_rate,
                                hidden1_size=hidden1_size,
                                is_training=is_training,
                                gating_reduction=gating_reduction,
                                mask=mask,
                                **unused_params)
      results.append(res)

  aux_preds = [res["predictions"] for res in results]
  logits = [res["logits"] for res in results]
  logits = tf.stack(logits, axis=1)
  mix_logit = tf.reduce_sum(tf.multiply(tf.expand_dims(mix_weights, -1),
                                        logits), axis=1)
  pred = tf.nn.sigmoid(mix_logit)

  if is_training:
    rank_pred = tf.expand_dims(tf.nn.softmax(
        tf.div(mix_logit, FLAGS.cl_temperature), axis=-1), axis=1)
    aux_rank_preds = tf.nn.softmax(tf.div(logits, FLAGS.cl_temperature),
                                   axis=-1)
    epsilon = 1e-8
    kl_loss = tf.reduce_sum(
        rank_pred * (tf.log(rank_pred + epsilon) -
                     tf.log(aux_rank_preds + epsilon)),
        axis=-1)
    regularization_loss = FLAGS.cl_lambda * tf.reduce_mean(
        tf.reduce_sum(kl_loss, axis=-1), axis=-1)
    return {
        "predictions": pred,
        "regularization_loss": regularization_loss,
        "aux_predictions": aux_preds
    }
  else:
    return {"predictions": pred}
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = model_utils.SampleRandomFrames(model_input, num_frames,
                                                 iterations)
  else:
    model_input = model_utils.SampleRandomSequence(model_input, num_frames,
                                                   iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  tf.summary.histogram("input_hist", reshaped_input)

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  cluster_weights = tf.Variable(
      tf.random_normal([feature_size, cluster_size],
                       stddev=1 / math.sqrt(feature_size)))
  tf.summary.histogram("cluster_weights", cluster_weights)
  activation = tf.matmul(reshaped_input, cluster_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.Variable(
        tf.random_normal([cluster_size], stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_biases", cluster_biases)
    activation += cluster_biases
  activation = tf.nn.relu6(activation)
  tf.summary.histogram("cluster_output", activation)

  activation = tf.reshape(activation, [-1, max_frames, cluster_size])
  activation = model_utils.FramePooling(activation, FLAGS.dbof_pooling_method)

  hidden1_weights = tf.Variable(
      tf.random_normal([cluster_size, hidden1_size],
                       stddev=1 / math.sqrt(cluster_size)))
  tf.summary.histogram("hidden1_weights", hidden1_weights)
  activation = tf.matmul(activation, hidden1_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.Variable(tf.random_normal([hidden1_size],
                                                  stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  activation = tf.nn.relu6(activation)
  tf.summary.histogram("hidden1_output", activation)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         original_input=model_input,
                                         vocab_size=vocab_size,
                                         **unused_params)
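# DBoF in a nutshell: project every frame into a large "cluster" space,
# apply a nonlinearity, then pool over frames so the whole video becomes one
# fixed-size vector. A minimal NumPy sketch of the max-pooled variant
# (illustrative only; the model above also supports other pooling methods):
def _dbof_np(frames, cluster_weights):
  """frames: [num_frames, feature_size]; cluster_weights:
  [feature_size, cluster_size]; returns [cluster_size]."""
  import numpy as np
  activation = np.maximum(frames @ cluster_weights, 0.0)  # ReLU cluster code
  return activation.max(axis=0)                           # max-pool over frames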
def create_model(self, model_input, vocab_size, num_frames, labels,
                 scope='default', iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    cluster_weights = tf.get_variable(
        "cluster_weights", [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases", [cluster_size],
          initializer=tf.random_normal_initializer(stddev=1 /
                                                   math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.get_variable(
        "hidden1_weights", [cluster_size, hidden1_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)
    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    results = aggregated_model().create_model(model_input=activation,
                                              vocab_size=vocab_size,
                                              **unused_params)
    results['features'] = activation
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
    return results
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  # Hyperparameters are fixed here rather than read from FLAGS.
  iterations = 300
  add_batch_norm = True
  random_frames = True
  cluster_size = 64
  hidden1_size = 1024
  relu = False
  dimred = -1
  gating = True
  remove_diag = False

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames, iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])

  video_audio_NetVLAD = NetVLAD_NonLocal(1024 + 128, max_frames, cluster_size,
                                         add_batch_norm, is_training)

  if add_batch_norm:  # and not lightvlad:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  with tf.variable_scope("video_audio_VLAD"):
    vlad = video_audio_NetVLAD.forward(reshaped_input)

  vlad_dim = vlad.get_shape().as_list()[1]
  hidden1_weights = tf.get_variable(
      "hidden1_weights", [vlad_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(cluster_size)))
  activation = tf.matmul(vlad, hidden1_weights)

  if add_batch_norm and relu:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  if relu:
    activation = tf.nn.relu6(activation)

  if gating:
    gating_weights = tf.get_variable(
        "gating_weights_2", [hidden1_size, hidden1_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(hidden1_size)))
    gates = tf.matmul(activation, gating_weights)
    if remove_diag:
      # Removes the diagonal coefficients of the gating matrix.
      diagonals = tf.matrix_diag_part(gating_weights)
      gates = gates - tf.multiply(diagonals, activation)
    if add_batch_norm:
      gates = slim.batch_norm(gates,
                              center=True,
                              scale=True,
                              is_training=is_training,
                              scope="gating_bn")
    else:
      gating_biases = tf.get_variable(
          "gating_biases", [hidden1_size],  # must match the gates dimension
          initializer=tf.random_normal_initializer(stddev=1 /
                                                   math.sqrt(feature_size)))
      gates += gating_biases
    gates = tf.sigmoid(gates)
    activation = tf.multiply(activation, gates)

  aggregated_model = getattr(video_level_models, 'willow_MoeModel_moe2')
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = 2048
  cluster_size_2 = 512
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames, iterations)
  else:
    model_input = utils.EqualSpaceMeans(model_input, num_frames, iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  cluster_weights = tf.get_variable(
      "cluster_weights", [feature_size, cluster_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(feature_size)))
  activation = tf.matmul(reshaped_input, cluster_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.get_variable(
        "cluster_biases", [cluster_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(feature_size)))
    activation += cluster_biases
  activation = tf.nn.relu6(activation)

  # Assumes 30 sampled frames: pool over a 3 x 10 temporal grid first.
  activation = tf.reshape(activation, [-1, 3, 10, cluster_size])
  activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
  # Regroup the 10 pooled segments into a 2 x 5 grid of segment pairs.
  activation = tf.reshape(activation, [-1, 2, 5, cluster_size])
  activation = tf.transpose(activation, [0, 2, 3, 1])
  activation = tf.reshape(activation, [-1, cluster_size * 2])

  cluster_weights_2 = tf.get_variable(
      "cluster_weights2", [cluster_size * 2, cluster_size_2],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(cluster_size * 2)))
  activation = tf.matmul(activation, cluster_weights_2)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn2")
  else:
    cluster_biases_2 = tf.get_variable(
        "cluster_biases2", [cluster_size_2],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size * 2)))
    activation += cluster_biases_2
  activation = tf.nn.relu6(activation)

  activation = tf.reshape(activation, [-1, cluster_size_2 * 5])
  hidden1_weights = tf.get_variable(
      "hidden1_weights", [cluster_size_2 * 5, hidden1_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(cluster_size_2 * 5)))
  activation = tf.matmul(activation, hidden1_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  activation = tf.nn.relu6(activation)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.netvlad_cluster_size
  hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
  relu = FLAGS.netvlad_relu
  gating = FLAGS.gating
  remove_diag = FLAGS.gating_remove_diag
  lightvlad = FLAGS.lightvlad
  vlagd = FLAGS.vlagd

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames, iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]

  # Dimensionality-reduction matrices for the per-segment branches.
  dimred_video = tf.get_variable(
      "dimred_video", [1024, 400],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(1024)))
  dimred_audio = tf.get_variable(
      "dimred_audio", [128, 50],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(1024)))

  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  # Split the (assumed 300-frame) sequence into thirds and reduce each third.
  reshaped_input1_video = tf.matmul(
      tf.reshape(model_input[:, :100, :1024], [-1, 1024]), dimred_video)
  reshaped_input2_video = tf.matmul(
      tf.reshape(model_input[:, 100:200, :1024], [-1, 1024]), dimred_video)
  reshaped_input3_video = tf.matmul(
      tf.reshape(model_input[:, 200:, :1024], [-1, 1024]), dimred_video)
  reshaped_input1_audio = tf.matmul(
      tf.reshape(model_input[:, :100, 1024:], [-1, 128]), dimred_audio)
  reshaped_input2_audio = tf.matmul(
      tf.reshape(model_input[:, 100:200, 1024:], [-1, 128]), dimred_audio)
  reshaped_input3_audio = tf.matmul(
      tf.reshape(model_input[:, 200:, 1024:], [-1, 128]), dimred_audio)

  video_NetVLAD = NetVLAD(1024, max_frames, 128, add_batch_norm, is_training)
  audio_NetVLAD = NetVLAD(128, max_frames, 128 // 2, add_batch_norm,
                          is_training)
  video_NetVLAD1 = NetVLAD(400, max_frames // 3, 64, add_batch_norm,
                           is_training)
  audio_NetVLAD1 = NetVLAD(50, max_frames // 3, 64 // 2, add_batch_norm,
                           is_training)
  video_NetVLAD2 = NetVLAD(400, max_frames // 3, 64, add_batch_norm,
                           is_training)
  audio_NetVLAD2 = NetVLAD(50, max_frames // 3, 64 // 2, add_batch_norm,
                           is_training)
  video_NetVLAD3 = NetVLAD(400, max_frames // 3, 64, add_batch_norm,
                           is_training)
  audio_NetVLAD3 = NetVLAD(50, max_frames // 3, 64 // 2, add_batch_norm,
                           is_training)

  with tf.variable_scope("video_VLAD"):
    vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])
  with tf.variable_scope("audio_VLAD"):
    vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])
  with tf.variable_scope("video_VLAD1"):
    vlad_video1 = video_NetVLAD1.forward(reshaped_input1_video)
  with tf.variable_scope("audio_VLAD1"):
    vlad_audio1 = audio_NetVLAD1.forward(reshaped_input1_audio)
  with tf.variable_scope("video_VLAD2"):
    vlad_video2 = video_NetVLAD2.forward(reshaped_input2_video)
  with tf.variable_scope("audio_VLAD2"):
    vlad_audio2 = audio_NetVLAD2.forward(reshaped_input2_audio)
  with tf.variable_scope("video_VLAD3"):
    vlad_video3 = video_NetVLAD3.forward(reshaped_input3_video)
  with tf.variable_scope("audio_VLAD3"):
    vlad_audio3 = audio_NetVLAD3.forward(reshaped_input3_audio)

  vlad = tf.concat([
      vlad_video, vlad_video1, vlad_video2, vlad_video3, vlad_audio,
      vlad_audio1, vlad_audio2, vlad_audio3
  ], 1)

  vlad_dim = vlad.get_shape().as_list()[1]
  hidden1_weights = tf.get_variable(
      "hidden1_weights", [vlad_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(vlad_dim)))
  activation = tf.matmul(vlad, hidden1_weights)

  if add_batch_norm and relu:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  if relu:
    activation = tf.nn.relu6(activation)

  if gating:
    gating_weights = tf.get_variable(
        "gating_weights_2", [hidden1_size, hidden1_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(hidden1_size)))
    gates = tf.matmul(activation, gating_weights)
    if remove_diag:
      # Removes the diagonal coefficients of the gating matrix.
      diagonals = tf.matrix_diag_part(gating_weights)
      gates = gates - tf.multiply(diagonals, activation)
    if add_batch_norm:
      gates = slim.batch_norm(gates,
                              center=True,
                              scale=True,
                              is_training=is_training,
                              scope="gating_bn")
    else:
      gating_biases = tf.get_variable(
          "gating_biases", [hidden1_size],  # must match the gates dimension
          initializer=tf.random_normal_initializer(stddev=1 /
                                                   math.sqrt(feature_size)))
      gates += gating_biases
    gates = tf.sigmoid(gates)
    activation = tf.multiply(activation, gates)
    # activation = tf.layers.dropout(activation, rate=0.1, training=is_training)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, expansion=2,
                 groups=None,
                 # mask=None,
                 drop_rate=0.5, gating_reduction=None, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = (FLAGS.sample_random_frames
                   if sample_random_frames is None else sample_random_frames)
  cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
  hidden_size = hidden_size or FLAGS.nextvlad_hidden_size
  groups = groups or FLAGS.groups
  gating_reduction = gating_reduction or FLAGS.gating_reduction

  num_frames_exp = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames_exp,
                                           iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames_exp,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  # reshaped_input = tf.reshape(model_input, [-1, feature_size])
  # tf.summary.histogram("input_hist", reshaped_input)
  mask = tf.sequence_mask(num_frames, max_frames, dtype=tf.float32)

  input = slim.fully_connected(
      model_input,
      expansion * feature_size,
      activation_fn=None,
      weights_initializer=slim.variance_scaling_initializer())
  attention = slim.fully_connected(
      model_input,
      groups,
      activation_fn=tf.nn.sigmoid,
      weights_initializer=slim.variance_scaling_initializer())
  if mask is not None:
    attention = tf.multiply(attention, tf.expand_dims(mask, -1))
  attention = tf.reshape(attention, [-1, max_frames * groups, 1])
  tf.summary.histogram("sigmoid_attention", attention)

  reduce_size = expansion * feature_size // groups
  cluster_weights = tf.get_variable(
      "cluster_weights", [expansion * feature_size, groups * cluster_size],
      initializer=slim.variance_scaling_initializer())
  # tf.summary.histogram("cluster_weights", cluster_weights)
  reshaped_input = tf.reshape(input, [-1, expansion * feature_size])
  activation = tf.matmul(reshaped_input, cluster_weights)
  activation = slim.batch_norm(activation,
                               center=True,
                               scale=True,
                               is_training=is_training,
                               scope="cluster_bn",
                               fused=False)
  activation = tf.reshape(activation, [-1, max_frames * groups, cluster_size])
  activation = tf.nn.softmax(activation, axis=-1)
  activation = tf.multiply(activation, attention)
  # tf.summary.histogram("cluster_output", activation)
  a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

  cluster_weights2 = tf.get_variable(
      "cluster_weights2", [1, reduce_size, cluster_size],
      initializer=slim.variance_scaling_initializer())
  a = tf.multiply(a_sum, cluster_weights2)
  activation = tf.transpose(activation, perm=[0, 2, 1])
  reshaped_input = tf.reshape(input, [-1, max_frames * groups, reduce_size])
  vlad = tf.matmul(activation, reshaped_input)
  vlad = tf.transpose(vlad, perm=[0, 2, 1])
  vlad = tf.subtract(vlad, a)
  vlad = tf.nn.l2_normalize(vlad, 1)
  vlad = tf.reshape(vlad, [-1, cluster_size * reduce_size])
  vlad = slim.batch_norm(vlad,
                         center=True,
                         scale=True,
                         is_training=is_training,
                         scope="vlad_bn",
                         fused=False)

  if drop_rate > 0.:
    vlad = slim.dropout(vlad,
                        keep_prob=1. - drop_rate,
                        is_training=is_training,
                        scope="vlad_dropout")

  vlad_dim = vlad.get_shape().as_list()[1]
  print("VLAD dimension", vlad_dim)
  hidden_weights = tf.get_variable(
      "hidden_weights", [vlad_dim, hidden_size],
      initializer=slim.variance_scaling_initializer())
  activation = tf.matmul(vlad, hidden_weights)
  activation = slim.batch_norm(activation,
                               center=True,
                               scale=True,
                               is_training=is_training,
                               scope="hidden_bn",
                               fused=False)
  activation = tf.nn.relu(activation, name='embedding1')

  gating_weights_1 = tf.get_variable(
      "gating_weights_1", [hidden_size, hidden_size // gating_reduction],
      initializer=slim.variance_scaling_initializer())
  gates = tf.matmul(activation, gating_weights_1)
  gates = slim.batch_norm(gates,
                          center=True,
                          scale=True,
                          is_training=is_training,
                          activation_fn=slim.nn.relu,
                          scope="gating_bn")
  gating_weights_2 = tf.get_variable(
      "gating_weights_2", [hidden_size // gating_reduction, hidden_size],
      initializer=slim.variance_scaling_initializer())
  gates = tf.matmul(gates, gating_weights_2)
  gates = tf.sigmoid(gates)
  tf.summary.histogram("final_gates", gates)
  activation = tf.multiply(activation, gates, name="embedding2")

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
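# The two gating matmuls above implement a squeeze-and-excitation style
# context gating: compress the hidden vector by gating_reduction, expand it
# back, squash with a sigmoid, and rescale the hidden activation. A NumPy
# sketch of that step, with batch norm omitted (illustrative only):
def _context_gating_np(hidden, w1, w2):
  """hidden: [batch, h]; w1: [h, h // r]; w2: [h // r, h]."""
  import numpy as np
  gates = np.maximum(hidden @ w1, 0.0)          # squeeze + ReLU
  gates = 1.0 / (1.0 + np.exp(-(gates @ w2)))   # expand + sigmoid
  return hidden * gates                         # gated embedding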
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
  """Creates a model which pools frame-level video and audio features with
  multiple attention heads before a video-level classifier.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  num_frames_t = num_frames
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  feature_size = model_input.get_shape().as_list()[2]
  iterations = 5  # 150
  attention_size = 8
  if FLAGS.is_train:
    iterations = 120
  # Drop the first 15 frames and keep sampling clear of the last 15.
  model_input = utils.SampleRandomFrames(model_input[:, 15:, :],
                                         num_frames - 15 - 15, iterations)
  # Small Gaussian perturbation of the sampled features.
  model_input = model_input + tf.random_normal(shape=tf.shape(model_input),
                                               mean=0.0,
                                               stddev=1e-3,
                                               dtype=tf.float32)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  video_attention = MultiAttentionLayers(1024, iterations, 256,
                                         attention_size)  # 256
  audio_attention = MultiAttentionLayers(128, iterations, 256 // 4,
                                         attention_size)  # 256/4

  model_input = slim.batch_norm(model_input,
                                center=True,
                                scale=True,
                                is_training=True,
                                scope="model_input_bn")

  with tf.variable_scope("video_Attention"):
    attention_video = video_attention.forward(model_input[:, :, 0:1024])
  with tf.variable_scope("audio_Attention"):
    attention_audio = audio_attention.forward(model_input[:, :, 1024:])

  pooled = tf.concat([attention_video, attention_audio], axis=1)
  # instance_att  # tf.reduce_mean(pooledi, axis=1)
  pooled = tf.reshape(tf.transpose(pooled, perm=[0, 2, 1]), [-1, 1152])

  dr2 = tf.get_variable("dr2", [feature_size, 1024],
                        initializer=tf.random_normal_initializer(
                            stddev=1 / math.sqrt(feature_size)))
  pooled = tf.matmul(pooled, dr2)
  pooled = slim.batch_norm(pooled,
                           center=True,
                           scale=True,
                           is_training=True,
                           scope="pooled_bn")

  gating_weights = tf.get_variable(
      "gating_weights_2", [1024, 1024],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(1024)))
  gates = tf.matmul(pooled, gating_weights)
  gates = slim.batch_norm(gates,
                          center=True,
                          scale=True,
                          is_training=True,
                          scope="gating_bn")
  gates = tf.sigmoid(gates)
  pooled = tf.multiply(pooled, gates)

  results_temp = aggregated_model().create_model(model_input=pooled,
                                                 vocab_size=vocab_size,
                                                 **unused_params)
  # One prediction per attention head; keep the most confident head.
  results_temp['predictions'] = tf.reduce_max(
      tf.reshape(results_temp['predictions'],
                 [-1, attention_size, vocab_size]), axis=1)
  return results_temp
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size
  relu = FLAGS.dbof_relu
  cluster_activation = FLAGS.dbof_activation

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames, iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  tf.summary.histogram("input_hist", reshaped_input)

  if cluster_activation == 'glu':
    # A GLU halves its input, so double the cluster size up front.
    cluster_size = 2 * cluster_size

  video_Dbof = DBoF(1024, max_frames, cluster_size, cluster_activation,
                    add_batch_norm, is_training)
  audio_Dbof = DBoF(128, max_frames, cluster_size // 8, cluster_activation,
                    add_batch_norm, is_training)

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  with tf.variable_scope("video_DBOF"):
    dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])
  with tf.variable_scope("audio_DBOF"):
    dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])

  dbof = tf.concat([dbof_video, dbof_audio], 1)
  dbof_dim = dbof.get_shape().as_list()[1]

  hidden1_weights = tf.get_variable(
      "hidden1_weights", [dbof_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(cluster_size)))
  tf.summary.histogram("hidden1_weights", hidden1_weights)
  activation = tf.matmul(dbof, hidden1_weights)

  if add_batch_norm and relu:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  if relu:
    activation = tf.nn.relu6(activation)
  tf.summary.histogram("hidden1_output", activation)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         **unused_params)
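# Why the cluster size is doubled for 'glu' above: a gated linear unit splits
# its input into two halves and computes glu(x) = a * sigmoid(b), so the
# output has half the input width. A NumPy sketch of the standard GLU
# (an assumption about the DBoF class's activation, illustrative only):
def _glu_np(x):
  """x: [batch, 2 * d]; returns [batch, d]."""
  import numpy as np
  a, b = np.split(x, 2, axis=-1)
  return a * (1.0 / (1.0 + np.exp(-b)))  # linear half gated by sigmoid half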
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  # Hyperparameters are fixed here rather than read from FLAGS.
  iterations = 300
  add_batch_norm = True
  random_frames = True
  cluster_size = 32
  hidden1_size = 1024
  relu = False
  gating = True

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames, iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  tf.summary.histogram("input_hist", reshaped_input)

  video_NetFV = NetFV(1024, max_frames, cluster_size, add_batch_norm,
                      is_training)
  audio_NetFV = NetFV(128, max_frames, cluster_size // 2, add_batch_norm,
                      is_training)

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  with tf.variable_scope("video_FV"):
    fv_video = video_NetFV.forward(reshaped_input[:, 0:1024])
  with tf.variable_scope("audio_FV"):
    fv_audio = audio_NetFV.forward(reshaped_input[:, 1024:])

  fv = tf.concat([fv_video, fv_audio], 1)
  fv_dim = fv.get_shape().as_list()[1]
  hidden1_weights = tf.get_variable(
      "hidden1_weights", [fv_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(cluster_size)))
  activation = tf.matmul(fv, hidden1_weights)

  if add_batch_norm and relu:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  if relu:
    activation = tf.nn.relu6(activation)

  if gating:
    gating_weights = tf.get_variable(
        "gating_weights_2", [hidden1_size, hidden1_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(hidden1_size)))
    gates = tf.matmul(activation, gating_weights)
    if add_batch_norm:
      gates = slim.batch_norm(gates,
                              center=True,
                              scale=True,
                              is_training=is_training,
                              scope="gating_bn")
    else:
      gating_biases = tf.get_variable(
          "gating_biases", [hidden1_size],  # must match the gates dimension
          initializer=tf.random_normal_initializer(stddev=1 /
                                                   math.sqrt(feature_size)))
      gates += gating_biases
    gates = tf.sigmoid(gates)
    activation = tf.multiply(activation, gates)

  aggregated_model = getattr(video_level_models, 'willow_MoeModel_moe4')
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  # Hyperparameters are fixed here rather than read from FLAGS.
  iterations = 300
  add_batch_norm = True
  random_frames = True
  cluster_size = 8000
  hidden1_size = 1024
  fc_dimred = True
  relu = False
  max_pool = False

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames, iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  tf.summary.histogram("input_hist", reshaped_input)

  video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                        add_batch_norm, is_training)
  audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                        add_batch_norm, is_training)

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  with tf.variable_scope("video_DBOF"):
    dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])
  with tf.variable_scope("audio_DBOF"):
    dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])

  dbof = tf.concat([dbof_video, dbof_audio], 1)
  dbof_dim = dbof.get_shape().as_list()[1]

  if fc_dimred:
    hidden1_weights = tf.get_variable(
        "hidden1_weights", [dbof_dim, hidden1_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(dbof, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
    if relu:
      activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)
  else:
    activation = dbof

  aggregated_model = getattr(video_level_models, 'willow_MoeModel_moe2_noGP')
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames, is_training=True,
                 **unused_params):
  """Creates a model which uses a sequence-to-sequence attention model to
  represent the video.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  self.dim_image = dim_image = model_input.get_shape().as_list()[2]
  self.n_words = n_words = vocab_size
  self.dim_hidden = dim_hidden = FLAGS.lstm_cells
  self.batch_size = tf.shape(model_input)[0]
  self.n_lstm_steps = n_lstm_steps = 20
  self.drop_out_rate = drop_out_rate = 0.4
  bias_init_vector = None
  n_caption_step = 20  # model_input.get_shape().as_list()[1]

  self.Wemb = tf.get_variable(
      'Wemb', [n_words, dim_hidden],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))

  # Note: lstm3_dropout1 wraps lstm31 and runs first (scope "LSTM3");
  # lstm3_dropout wraps lstm3 and runs second (scope "LSTM31").
  self.lstm3 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                       use_peepholes=True,
                                       state_is_tuple=True)
  if is_training:
    self.lstm3_dropout = tf.contrib.rnn.DropoutWrapper(
        self.lstm3, output_keep_prob=1 - self.drop_out_rate)
  else:
    self.lstm3_dropout = self.lstm3

  self.lstm31 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                        use_peepholes=True,
                                        state_is_tuple=True)
  if is_training:
    self.lstm3_dropout1 = tf.contrib.rnn.DropoutWrapper(
        self.lstm31, output_keep_prob=1 - self.drop_out_rate)
  else:
    self.lstm3_dropout1 = self.lstm31

  self.encode_image_W = tf.get_variable(
      'encode_image_W', [dim_image, dim_hidden],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))
  self.encode_image_b = tf.get_variable(
      'encode_image_b', [dim_hidden],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))

  self.embed_att_w = tf.get_variable(
      'embed_att_w', [dim_hidden, 1],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))
  self.embed_att_Wa = tf.get_variable(
      'embed_att_Wa', [dim_hidden, dim_hidden],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))
  self.embed_att_Ua = tf.get_variable(
      'embed_att_Ua', [dim_hidden, dim_hidden],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))
  self.embed_att_ba = tf.get_variable(
      'embed_att_ba', [dim_hidden],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))

  self.embed_word_W = tf.get_variable(
      'embed_word_W', [dim_hidden, n_words],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(n_words)))
  if bias_init_vector is not None:
    self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32),
                                    name='embed_word_b')
  else:
    self.embed_word_b = tf.get_variable(
        'embed_word_b', [n_words],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(n_words)))

  self.embed_nn_Wp = tf.get_variable(
      'embed_nn_Wp', [3 * dim_hidden, dim_hidden],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))
  self.embed_nn_bp = tf.get_variable(
      'embed_nn_bp', [dim_hidden],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(dim_hidden)))

  # print(model_input.get_shape().as_list())
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  video = utils.SampleRandomFrames(model_input, num_frames, n_lstm_steps)
  # print(video.get_shape().as_list())
  video_flat = tf.reshape(video, [-1, self.dim_image])  # (b x n) x d
  image_emb = tf.nn.xw_plus_b(video_flat, self.encode_image_W,
                              self.encode_image_b)  # (b x n) x h
  image_emb = tf.reshape(
      image_emb,
      [self.batch_size, self.n_lstm_steps, self.dim_hidden])  # b x n x h
  image_emb = tf.transpose(image_emb, [1, 0, 2])  # n x b x h

  state1 = self.lstm3.zero_state(
      self.batch_size,
      dtype=tf.float32)  # tf.zeros([self.batch_size, self.lstm3.state_size])
  h_prev = tf.zeros([self.batch_size, self.dim_hidden])  # b x h
  state11 = self.lstm31.zero_state(self.batch_size, dtype=tf.float32)  # b x s
  h_prev1 = tf.zeros([self.batch_size, self.dim_hidden])  # b x h

  loss_caption = 0.0
  probs = []
  current_embed = tf.zeros([self.batch_size, self.dim_hidden])  # b x h

  image_part = tf.reshape(image_emb, [-1, self.dim_hidden])
  image_part = tf.matmul(image_part, self.embed_att_Ua) + self.embed_att_ba
  image_part = tf.reshape(
      image_part, [self.n_lstm_steps, self.batch_size, self.dim_hidden])

  with tf.variable_scope("model") as scope:
    for i in range(n_caption_step):
      e = tf.tanh(tf.matmul(h_prev, self.embed_att_Wa) +
                  image_part)  # n x b x h
      # e = tf.batch_matmul(e, brcst_w)  # unnormalized relevance score
      e = tf.reshape(e, [-1, self.dim_hidden])
      e = tf.matmul(e, self.embed_att_w)  # n x b
      e = tf.reshape(e, [self.n_lstm_steps, self.batch_size])
      # e = tf.reduce_sum(e, 2)  # n x b
      e_hat_exp = tf.exp(
          e)  # tf.multiply(tf.transpose(video_mask), tf.exp(e))  # n x b
      denomin = tf.reduce_sum(e_hat_exp, 0)  # b
      denomin = denomin + tf.to_float(tf.equal(denomin,
                                               0))  # regularize denominator
      # normalize to obtain alpha
      alphas = tf.tile(tf.expand_dims(tf.div(e_hat_exp, denomin), 2),
                       [1, 1, self.dim_hidden])  # n x b x h
      attention_list = tf.multiply(alphas, image_emb)  # n x b x h
      # soft-attention weighted sum
      atten = tf.reduce_sum(attention_list, 0)  # b x h

      # if i > 0: tf.get_variable_scope().reuse_variables()
      if i > 0:
        scope.reuse_variables()

      with tf.variable_scope("LSTM3"):
        output12, state11 = self.lstm3_dropout1(
            tf.concat([atten, current_embed], 1), state11)  # b x h
      with tf.variable_scope("LSTM31"):
        output1, state1 = self.lstm3_dropout(output12, state1)  # b x h

      output2 = tf.tanh(
          tf.nn.xw_plus_b(tf.concat([output1, atten, current_embed], 1),
                          self.embed_nn_Wp, self.embed_nn_bp))  # b x h
      h_prev = output1  # b x h
      logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W,
                                    self.embed_word_b)  # b x w
      probs.append(logit_words)

  tf_probs = tf.stack(probs, 0)
  tf_probs = tf.transpose(tf_probs, [1, 0, 2])
  return {'predictions': tf.nn.softmax(tf.reduce_mean(tf_probs, 1))}
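# --- Illustrative sketch, not part of the original file ---
# The e_hat_exp / denomin step in the loop above is a softmax over the frame
# axis followed by an attention-weighted sum of frame embeddings:
import numpy as np

def soft_attention(scores, frame_embs):
  """scores: [num_frames, batch]; frame_embs: [num_frames, batch, hidden]."""
  weights = np.exp(scores - scores.max(axis=0, keepdims=True))
  weights /= np.maximum(weights.sum(axis=0, keepdims=True), 1e-8)
  return (weights[..., None] * frame_embs).sum(axis=0)  # [batch, hidden]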
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 iterations=None,
                 add_batch_norm=None,
                 sample_random_frames=None,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 **unused_params):
  """See base class.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).
    iterations: the number of frames to be sampled.
    add_batch_norm: whether to add batch norm during training.
    sample_random_frames: whether to sample random frames or random sequences.
    cluster_size: the output neuron number of the cluster layer.
    hidden_size: the output neuron number of the hidden layer.
    is_training: whether to build the graph in training mode.

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size
  act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation)
  assert act_fn is not None, ("dbof_activation is not valid: %s." %
                              FLAGS.dbof_activation)

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames,
                                           iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  tf.compat.v1.summary.histogram("input_hist", reshaped_input)

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  cluster_weights = tf.compat.v1.get_variable(
      "cluster_weights", [feature_size, cluster_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(feature_size)))
  tf.compat.v1.summary.histogram("cluster_weights", cluster_weights)
  activation = tf.matmul(reshaped_input, cluster_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn")
  else:
    # Fixed: the original passed tf.random_normal, which is not a valid
    # initializer object; tf.random_normal_initializer is intended.
    cluster_biases = tf.compat.v1.get_variable(
        "cluster_biases", [cluster_size],
        initializer=tf.random_normal_initializer(stddev=1 /
                                                 math.sqrt(feature_size)))
    tf.compat.v1.summary.histogram("cluster_biases", cluster_biases)
    activation += cluster_biases
  # activation = act_fn(activation)  # xxx 2018
  activation = tf.nn.relu6(activation)
  tf.compat.v1.summary.histogram("cluster_output", activation)

  activation = tf.reshape(activation, [-1, max_frames, cluster_size])
  activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

  hidden1_weights = tf.compat.v1.get_variable(
      "hidden1_weights", [cluster_size, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(cluster_size)))
  tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights)
  activation = tf.matmul(activation, hidden1_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.compat.v1.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  # xxx 2018
  # activation = tf.nn.relu6(activation)
  activation = act_fn(activation)
  tf.compat.v1.summary.histogram("hidden1_output", activation)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         **unused_params)
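# --- Illustrative sketch, not part of the original file ---
# The DBoF pipeline above: project each frame into cluster space, apply a
# nonlinearity, then pool across frames (utils.FramePooling selects the
# method via FLAGS.dbof_pooling_method; max pooling shown here):
import numpy as np

def dbof_pool(frames, cluster_weights):
  """frames: [num_frames, feature]; cluster_weights: [feature, clusters]."""
  act = np.clip(frames @ cluster_weights, 0.0, 6.0)  # relu6 per frame
  return act.max(axis=0)                             # max-pool over frames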
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 groups=None,
                 expansion=None,
                 drop_rate=None,
                 gating_reduction=None,
                 **unused_params):
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if FLAGS.sample_random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames,
                                           FLAGS.iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             FLAGS.iterations)

  cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
  hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
  gating_reduction = gating_reduction or FLAGS.gating_reduction
  groups = groups or FLAGS.groups
  drop_rate = drop_rate or FLAGS.drop_rate
  expansion = expansion or FLAGS.expansion

  max_frames = model_input.get_shape().as_list()[1]
  mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

  video_nextvlad = NeXtVLAD(1024,
                            max_frames,
                            cluster_size,
                            is_training,
                            groups=groups,
                            expansion=expansion)
  audio_nextvlad = NeXtVLAD(128,
                            max_frames,
                            cluster_size // 2,
                            is_training,
                            groups=groups // 2,
                            expansion=expansion)

  with tf.variable_scope("video_VLAD"):
    vlad_video = video_nextvlad.forward(model_input[:, :, 0:1024], mask=mask)
  with tf.variable_scope("audio_VLAD"):
    vlad_audio = audio_nextvlad.forward(model_input[:, :, 1024:], mask=mask)

  vlad = tf.concat([vlad_video, vlad_audio], 1)

  if drop_rate > 0.:
    vlad = slim.dropout(vlad,
                        keep_prob=1. - drop_rate,
                        is_training=is_training,
                        scope="vlad_dropout")

  vlad_dim = vlad.get_shape().as_list()[1]
  print("VLAD dimension", vlad_dim)
  hidden1_weights = tf.get_variable(
      "hidden1_weights", [vlad_dim, hidden1_size],
      initializer=slim.variance_scaling_initializer())

  activation = tf.matmul(vlad, hidden1_weights)
  activation = slim.batch_norm(activation,
                               center=True,
                               scale=True,
                               is_training=is_training,
                               scope="hidden1_bn",
                               fused=False)
  # activation = tf.nn.relu(activation)

  gating_weights_1 = tf.get_variable(
      "gating_weights_1", [hidden1_size, hidden1_size // gating_reduction],
      initializer=slim.variance_scaling_initializer())
  gates = tf.matmul(activation, gating_weights_1)
  gates = slim.batch_norm(gates,
                          center=True,
                          scale=True,
                          is_training=is_training,
                          activation_fn=slim.nn.relu,
                          scope="gating_bn")

  gating_weights_2 = tf.get_variable(
      "gating_weights_2", [hidden1_size // gating_reduction, hidden1_size],
      initializer=slim.variance_scaling_initializer())
  gates = tf.matmul(gates, gating_weights_2)
  gates = tf.sigmoid(gates)
  tf.summary.histogram("final_gates", gates)

  activation = tf.multiply(activation, gates)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)

  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
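# --- Illustrative sketch, not part of the original file ---
# The two gating matmuls above form a squeeze-excitation style bottleneck:
# hidden -> hidden // gating_reduction -> hidden, ending in a sigmoid mask
# (the batch norms are omitted in this NumPy sketch; a plain relu stands in
# for the BN + relu in the middle):
import numpy as np

def bottleneck_gate(x, w1, w2):
  """x: [batch, hidden]; w1: [hidden, hidden//r]; w2: [hidden//r, hidden]."""
  squeezed = np.maximum(x @ w1, 0.0)              # squeeze with relu
  gates = 1.0 / (1.0 + np.exp(-(squeezed @ w2)))  # excite with sigmoid
  return x * gates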
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 iterations=None,
                 add_batch_norm=None,
                 sample_random_frames=None,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.netvlad_cluster_size
  hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
  relu = FLAGS.netvlad_relu
  dimred = FLAGS.netvlad_dimred
  gating = FLAGS.gating
  remove_diag = FLAGS.gating_remove_diag
  lightvlad = FLAGS.lightvlad
  vlagd = FLAGS.vlagd
  SVD_dim = FLAGS.SVD_dim

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames,
                                           iterations)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)

  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])

  # Note: despite the NetVLAD names, these instantiate NetFV aggregators.
  video_NetVLAD = NetFV(1024, max_frames, int(cluster_size), add_batch_norm,
                        is_training)
  audio_NetVLAD = NetFV(128, max_frames, int(cluster_size / 2),
                        add_batch_norm, is_training)

  if add_batch_norm:  # and not lightvlad:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  with tf.variable_scope("video_VLAD"):
    vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])
  with tf.variable_scope("audio_VLAD"):
    vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

  vlad = tf.concat([vlad_video, vlad_audio], 1)  # None x vlad_dim
  vlad_dim = vlad.get_shape().as_list()[1]

  ##### simpler SVD #####
  SVD_mat1 = tf.get_variable("hidden1_weights", [vlad_dim, SVD_dim],
                             initializer=tf.glorot_uniform_initializer())
  SVD_mat2 = tf.get_variable("hidden2_weights",
                             [SVD_dim, int(hidden1_size * 2)],
                             initializer=tf.glorot_uniform_initializer())
  SVD_mat1_biases = tf.get_variable(
      "SVD_mat1_biases", [SVD_dim],
      initializer=tf.random_normal_initializer(stddev=0.01))
  SVD_mat2_biases = tf.get_variable(
      "SVD_mat2_biases", [int(hidden1_size * 2)],
      initializer=tf.random_normal_initializer(stddev=0.01))
  ##### simpler SVD #####

  activation = tf.matmul(vlad, SVD_mat1)  # None x SVD_dim
  activation += SVD_mat1_biases
  tf.summary.histogram("activation_in_mid_of_SVD_before_tanh", activation)
  activation = tf.nn.tanh(activation)
  tf.summary.histogram("activation_in_mid_of_SVD_after_tanh", activation)
  activation = tf.matmul(activation, SVD_mat2)  # None x 2*hidden1_size
  activation += SVD_mat2_biases
  tf.summary.histogram("activation_after_SVD_project", activation)

  ## gating part
  gating_weights = tf.get_variable(
      "gating_weights_2", [int(2 * hidden1_size), hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(hidden1_size)))
  gates = tf.matmul(activation, gating_weights)
  gates = slim.batch_norm(gates,
                          center=True,
                          scale=True,
                          is_training=is_training,
                          scope="gating_bn")
  gates = tf.sigmoid(gates)
  tf.summary.histogram("gates_layer", gates)
  ## gating part

  activation = tf.nn.tanh(activation)  # first tanh
  tf.summary.histogram("activation_after_1_tanh", activation)
  activation = tf.layers.dropout(activation, rate=0.3, training=is_training)
  tf.summary.histogram("activation_after_1_tanh_after_dropout", activation)
  activation_hidden_weights = tf.get_variable(
      "activation_hidden_weights", [int(hidden1_size * 2), hidden1_size],
      initializer=tf.glorot_uniform_initializer())
  activation = tf.matmul(activation, activation_hidden_weights)
  tf.summary.histogram("activation_after_1_tanh_after_hidden_weights",
                       activation)
  activation = slim.batch_norm(activation,
                               center=True,
                               scale=True,
                               is_training=is_training,
                               scope="hidden_layer_bn")
  tf.summary.histogram("activation_after_1_tanh_after_hidden_weights_after_bn",
                       activation)
  activation = tf.nn.tanh(activation)  # second tanh
  tf.summary.histogram(
      "activation_after_1_tanh_after_hidden_weights_after_bn_after_2_tanh",
      activation)

  activation = tf.multiply(activation, gates)
  tf.summary.histogram("activation_right_before_video", activation)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)

  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
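# --- Illustrative sketch, not part of the original file ---
# The "simpler SVD" trick above factors one large [vlad_dim, 2 * hidden1_size]
# projection into two thin matrices through an SVD_dim bottleneck, cutting
# parameters from vlad_dim * 2 * hidden1_size down to
# (vlad_dim + 2 * hidden1_size) * SVD_dim, with a tanh between the factors:
import numpy as np

def low_rank_project(x, svd_mat1, b1, svd_mat2, b2):
  """x: [batch, vlad_dim]; svd_mat1: [vlad_dim, k]; svd_mat2: [k, out]."""
  return np.tanh(x @ svd_mat1 + b1) @ svd_mat2 + b2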
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
  num_frames_t = num_frames
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  feature_size = model_input.get_shape().as_list()[2]
  iterations = 5  # 150
  attention_size = 8
  if FLAGS.is_train:
    iterations = 120
    model_input = utils.SampleRandomFrames(model_input[:, 15:, :],
                                           num_frames - 15 - 15, iterations)
    model_input = model_input + tf.random_normal(shape=tf.shape(model_input),
                                                 mean=0.0,
                                                 stddev=1e-3,
                                                 dtype=tf.float32)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  video_attention = MultiAttentionLayers(1024, iterations, 256,
                                         attention_size)
  audio_attention = MultiAttentionLayers(128, iterations, 256 // 4,
                                         attention_size)

  model_input = slim.batch_norm(model_input,
                                center=True,
                                scale=True,
                                is_training=True,
                                scope="model_input_bn")

  with tf.variable_scope("video_Attention"):
    attention_video = video_attention.forward(model_input[:, :, 0:1024])
  with tf.variable_scope("audio_Attention"):
    attention_audio = audio_attention.forward(model_input[:, :, 1024:])

  # instance_att  # tf.reduce_mean(pooled, axis=1)
  pooled = tf.concat([attention_video, attention_audio], axis=1)
  print('pooled is', pooled)

  pooled = tf.reshape(tf.transpose(pooled, perm=[0, 2, 1]), [-1, 1152])

  dr2 = tf.get_variable(
      "dr2", [feature_size, 1024],
      initializer=tf.random_normal_initializer(stddev=1 /
                                               math.sqrt(feature_size)))
  pooled = tf.matmul(pooled, dr2)

  pooled = slim.batch_norm(pooled,
                           center=True,
                           scale=True,
                           is_training=True,
                           scope="pooled_bn")

  gating_weights = tf.get_variable(
      "gating_weights_2", [1024, 1024],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(1024)))
  gates = tf.matmul(pooled, gating_weights)
  gates = slim.batch_norm(gates,
                          center=True,
                          scale=True,
                          is_training=True,
                          scope="gating_bn")
  gates = tf.sigmoid(gates)
  pooled = tf.multiply(pooled, gates)

  results_temp = aggregated_model().create_model(model_input=pooled,
                                                 vocab_size=vocab_size,
                                                 **unused_params)
  results_temp['predictions'] = tf.reduce_max(
      tf.reshape(results_temp['predictions'],
                 [-1, attention_size, vocab_size]), axis=1)
  print(results_temp)
  return results_temp
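# --- Illustrative sketch, not part of the original file ---
# The final reshape + reduce_max above scores the video once per attention
# head and keeps, for each class, the prediction of the most confident head:
import numpy as np

def max_over_heads(head_predictions, num_heads, vocab_size):
  """head_predictions: [batch * num_heads, vocab_size] -> [batch, vocab_size]."""
  return head_predictions.reshape(-1, num_heads, vocab_size).max(axis=1)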