def visual_semantic_infer(self, visual_feature_train_pos, visual_feature_train_neg, sentence_embed_train, visual_feature_test, sentence_embed_test):
    """Build the CTRL train/test scoring graphs for mixed pos+neg clips.

    Train: positive and negative clip features are stacked along axis 0
    (2*batch rows), projected into the common semantic space, combined
    with the tiled sentence embedding, and scored by the shared
    'vs_multilayer_lt' head -> [batch_size*2, 3].
    Test: one clip/sentence pair is scored with the reused weights -> [3].
    NOTE(review): each row of the head output presumably holds an
    alignment score plus two offsets — confirm with vs_multilayer.
    """
    name = "CTRL_Model"
    with tf.variable_scope(name):
        print("Building training network...............................\n")
        # project the stacked pos+neg clips into the semantic space
        transformed_clip_train_mix = fc('v2s_lt', tf.concat([visual_feature_train_pos, visual_feature_train_neg], 0), output_dim=self.semantic_size)
        transformed_clip_train_norm_mix = tf.nn.l2_normalize(transformed_clip_train_mix, dim=1)
        transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
        transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
        # sentence embedding is tiled x2 to pair with both pos and neg clips
        cross_modal_vec_train_mix = self.cross_modal_comb(transformed_clip_train_norm_mix, tf.tile(transformed_sentence_train_norm, [2, 1]), self.batch_size)
        sim_score_mat_train_mix = vs_multilayer.vs_multilayer(cross_modal_vec_train_mix, "vs_multilayer_lt", middle_layer_dim=1000)
        sim_score_mat_train_mix = tf.reshape(sim_score_mat_train_mix, [self.batch_size * 2, 3])
        # share all weights with the test branch
        tf.get_variable_scope().reuse_variables()
        print("Building test network...............................\n")
        transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
        transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
        transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
        transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
        cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
        sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
        sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])
        return sim_score_mat_train_mix, sim_score_mat_test
def eval(self, visual_feature_test):
    """Score one test feature with the reused PATE head; returns the two class logits as a flat [2] tensor."""
    logits = vs_multilayer.vs_multilayer(
        visual_feature_test, "PATE", middle_layer_dim=1000, reuse=True)
    return tf.reshape(logits, [2])
def compute_loss_reg(self, visual_feature, offsets, labels):
    """Classification + masked regression loss for the APN proposal head.

    Args (shapes assume batch_size == 128):
        visual_feature: input feature, (batch_size, feature_dim)
        offsets: ground-truth coordinate offsets, (batch_size, 2)
        labels: 0/1 proposal labels, (batch_size,)
    Returns:
        (loss, offset_pred, loss_reg) with
        loss = lambda_reg * loss_reg + loss_cls.
    """
    cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature, "APN", middle_layer_dim=1000)
    cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])  # [128,4]
    """ cls_score_vec_0 : (128,1) cls_score_vec_1 : (128,1) p_reg_vec : (128,1) l_reg_vec : (128,1) """
    # split and regroup the classification and regression vectors
    cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(
        1, 4, cls_reg_vec)
    cls_score_vec = tf.concat(1, (cls_score_vec_0, cls_score_vec_1))
    offset_pred = tf.concat(1, (p_reg_vec, l_reg_vec))
    # classification loss
    loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
        cls_score_vec, labels)
    loss_cls = tf.reduce_mean(loss_cls_vec)
    # regression loss, masked by labels so only positives contribute
    label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
    label_for_reg = tf.concat(1, [label_tmp, label_tmp])
    # offset_pred: coordinate offsets predicted by the final FC layer
    # offsets: ground-truth coordinate offsets
    loss_reg = tf.reduce_mean(
        tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))
    loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
    return loss, offset_pred, loss_reg
def eval(self, central, start, end): central_cls, start_reg, end_reg = vs_multilayer.vs_multilayer( central, start, end, "BLA", reuse=True) outputs = tf.concat(1, (central_cls, start_reg, end_reg)) outputs = tf.reshape(outputs, [4]) print "eval output size: " + str(outputs.get_shape().as_list()) return outputs
def eval(self, visual_feature_test):
    """Evaluate the reused APN head on one test feature.

    Returns a flat [4] tensor: two class logits plus two regression values.
    """
    cls_reg = vs_multilayer.vs_multilayer(
        visual_feature_test, "APN", middle_layer_dim=1000, reuse=True)
    return tf.reshape(cls_reg, [4])
def compute_loss(self, visual_feature, labels):
    """Mean cross-entropy loss for the 2-way PATE head.

    The head's two outputs indicate whether the proposal can be
    correctly generated by TAG.
    """
    logits = vs_multilayer.vs_multilayer(
        visual_feature, "PATE", middle_layer_dim=1000)
    logits = tf.reshape(logits, [self.batch_size, 2])
    # classification loss
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
    return tf.reduce_mean(xent)
def eval(self, visual_feature_test):
    """Test-time CBR scoring (dropout off, weights reused).

    Returns a flat tensor of length (action_class_num+1)*3:
    class logits, then start offsets, then end offsets.
    """
    out_dim = (self.action_class_num + 1) * 3
    scores = vs_multilayer.vs_multilayer(
        visual_feature_test,
        "CBR",
        middle_layer_dim=self.middle_layer_size,
        output_layer_dim=out_dim,
        dropout=False,
        reuse=True)
    return tf.reshape(scores, [out_dim])
def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test):
    """Build the CTRL train and test scoring graphs under one variable scope.

    Train: clip and sentence features are projected ('v2s_lt' / 's2s_lt')
    into the common semantic space, L2-normalized, combined pairwise and
    scored by the shared 'vs_multilayer_lt' head
    -> [batch_size, batch_size, 3].
    Test: the same (reused) weights score one clip/sentence pair -> [3].
    NOTE(review): the 3 values per pair are presumably an alignment
    score plus two offsets — confirm with vs_multilayer.
    """
    name = "CTRL_Model"
    with tf.variable_scope(name):
        print "Building training network...............................\n"
        # project both modalities into the shared semantic space
        transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size)
        transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
        transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
        transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
        # all clip x sentence combinations within the batch
        cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
        sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
        sim_score_mat_train = tf.reshape(sim_score_mat_train, [self.batch_size, self.batch_size, 3])
        # share all weights with the test branch
        tf.get_variable_scope().reuse_variables()
        print "Building test network...............................\n"
        transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
        transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
        transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
        transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
        cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
        sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
        sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])
        return sim_score_mat_train, sim_score_mat_test
def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                          visual_feature_test, sentence_embed_test,
                          sentence_ph_train_len, sentence_ph_test_len):
    """Build CTRL train/test graphs, optionally LSTM-encoding the sentence.

    Args:
        visual_feature_train / visual_feature_test: clip features.
        sentence_embed_train / sentence_embed_test: sentence inputs
            (token sequences when self.useLSTM, else fixed embeddings).
        sentence_ph_train_len / sentence_ph_test_len: sequence lengths
            consumed by self.lstm_embed when self.useLSTM.

    Returns:
        sim_score_mat_train:  [batch_size, batch_size, 3]
        sim_score_mat_test:   [test_batch_size, test_batch_size, 3]
        sim_score_mat_test_1: [3] score for the single pair at index 1.

    Fix: the single-pair reshape used a hard-coded (1, 1024); it now uses
    self.semantic_size so the graph works for any embedding width.
    """
    name = "CTRL_Model"
    with tf.variable_scope(name):
        print("Building training network...............................\n")
        transformed_clip_train = fc('v2s_lt', visual_feature_train,
                                    output_dim=self.semantic_size)
        transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
        # optionally encode the sentence tokens with an LSTM first
        if self.useLSTM:
            sentence_embed_train = self.lstm_embed(sentence_embed_train,
                                                   sentence_ph_train_len)
        transformed_sentence_train = fc('s2s_lt', sentence_embed_train,
                                        output_dim=self.semantic_size)
        transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
        cross_modal_vec_train = self.cross_modal_comb(
            transformed_clip_train_norm, transformed_sentence_train_norm,
            self.batch_size)
        sim_score_mat_train = vs_multilayer.vs_multilayer(
            cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
        sim_score_mat_train = tf.reshape(
            sim_score_mat_train, [self.batch_size, self.batch_size, 3])
        # share all weights with the test branch
        tf.get_variable_scope().reuse_variables()
        print("Building test network...............................\n")
        transformed_clip_test = fc('v2s_lt', visual_feature_test,
                                   output_dim=self.semantic_size)
        transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
        if self.useLSTM:
            sentence_embed_test = self.lstm_embed(sentence_embed_test,
                                                  sentence_ph_test_len)
        transformed_sentence_test = fc('s2s_lt', sentence_embed_test,
                                       output_dim=self.semantic_size)
        transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
        cross_modal_vec_test = self.cross_modal_comb(
            transformed_clip_test_norm, transformed_sentence_test_norm,
            self.test_batch_size)
        sim_score_mat_test = vs_multilayer.vs_multilayer(
            cross_modal_vec_test, "vs_multilayer_lt", reuse=True,
            middle_layer_dim=1000)
        sim_score_mat_test = tf.reshape(
            sim_score_mat_test,
            [self.test_batch_size, self.test_batch_size, 3])
        # score a single clip/sentence pair (index 1 — presumably a debug
        # probe; confirm the intended sample with the caller)
        cross_modal_vec_test_1 = self.cross_modal_comb(
            tf.reshape(transformed_clip_test_norm[1], shape=(1, self.semantic_size)),
            tf.reshape(transformed_sentence_test_norm[1], shape=(1, self.semantic_size)),
            1)
        sim_score_mat_test_1 = vs_multilayer.vs_multilayer(
            cross_modal_vec_test_1, "vs_multilayer_lt", reuse=True,
            middle_layer_dim=1000)
        sim_score_mat_test_1 = tf.reshape(sim_score_mat_test_1, [3])
        return sim_score_mat_train, sim_score_mat_test, sim_score_mat_test_1
def compute_loss_reg(self, central, start, end, offsets, labels):
    """Joint loss for the BLA head: cross-entropy classification on the
    central feature plus label-masked L1 regression on start/end offsets.

    Returns:
        (loss, offset_pred, loss_reg) with
        loss = lambda_reg * loss_reg + loss_cls.
    """
    central_cls, start_reg, end_reg = vs_multilayer.vs_multilayer(
        central, start, end, "BLA")
    offset_pred = tf.concat(1, (start_reg, end_reg))
    # classification loss
    loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
        central_cls, labels)
    loss_cls = tf.reduce_mean(loss_cls_vec)
    # regression loss, masked by the 0/1 labels so only positives count
    label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
    label_for_reg = tf.concat(1, [label_tmp, label_tmp])
    loss_reg = tf.reduce_mean(
        tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))
    loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
    return loss, offset_pred, loss_reg
def compute_loss_reg(self, visual_feature, offsets, labels):
    """Joint classification + regression loss for the APN head.

    Returns (total_loss, predicted_offsets, regression_loss); the total
    is loss_cls + lambda_reg * loss_reg, with the regression term masked
    by the 0/1 labels so only positive proposals contribute.
    """
    raw = vs_multilayer.vs_multilayer(
        visual_feature, "APN", middle_layer_dim=1000)
    raw = tf.reshape(raw, [self.batch_size, 4])
    # 2 classification scores + 2 regression values per sample
    score_neg, score_pos, reg_p, reg_l = tf.split(1, 4, raw)
    class_scores = tf.concat(1, (score_neg, score_pos))
    pred_offsets = tf.concat(1, (reg_p, reg_l))
    # classification loss
    loss_cls = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(class_scores, labels))
    # regression loss, masked so negatives do not contribute
    mask = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
    mask_2col = tf.concat(1, [mask, mask])
    loss_reg = tf.reduce_mean(
        tf.mul(tf.abs(tf.sub(pred_offsets, offsets)), mask_2col))
    total = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
    return total, pred_offsets, loss_reg
def compute_loss_reg(self, visual_feature, offsets, labels, one_hot_labels):
    """CBR loss: (action_class_num+1)-way classification plus per-class
    boundary regression, masked so background samples (label 0) do not
    contribute to the regression term.

    Args:
        visual_feature: (batch_size, feature_dim)
        offsets: ground-truth start/end offsets, (batch_size, 2)
        labels: class index per sample, (batch_size,)
        one_hot_labels: unused in this function
    Returns:
        (loss, loss_reg) with loss = lambda_reg * loss_reg + loss_cls.
    """
    cls_reg_vec = vs_multilayer.vs_multilayer(
        visual_feature, "CBR", middle_layer_dim=self.middle_layer_size,
        output_layer_dim=(self.action_class_num + 1) * 3)
    cls_reg_vec = tf.reshape(
        cls_reg_vec, [self.batch_size, (self.action_class_num + 1) * 3])  # [128,21*3]
    # layout: class scores | per-class start offsets | per-class end offsets
    cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1]
    start_offset_pred = cls_reg_vec[:, self.action_class_num + 1:(self.action_class_num + 1) * 2]
    end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:]
    # classification loss
    loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
        cls_score_vec, labels)
    loss_cls = tf.reduce_mean(loss_cls_vec)
    # regression loss
    pick_start_offset_pred = []
    pick_end_offset_pred = []
    # pick the k-th sample's regression values for its own class
    # (see the regression formulation in the paper)
    for k in range(self.batch_size):
        # select the k-th sample's predicted offsets
        pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
        pick_end_offset_pred.append(end_offset_pred[k, labels[k]])
    pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred),
                                        [self.batch_size, 1])
    pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred),
                                      [self.batch_size, 1])
    # labels holds each sample's class; background (0) is masked out below
    labels_1 = tf.to_float(tf.not_equal(
        labels, 0))
    label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
    label_for_reg = tf.concat(1, [label_tmp, label_tmp])  # column concat -> [128,2]
    offset_pred = tf.concat(
        1, (pick_start_offset_pred, pick_end_offset_pred))  # [128,2]
    loss_reg = tf.reduce_mean(
        tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))
    loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
    return loss, loss_reg
def predict(self, visual_feature_test):
    """Score one test feature with the shared CBR head (dropout off).

    Args:
        visual_feature_test: Tensor, (test_batch_size, visual_feature_dim).

    Returns:
        Flat tensor of length (C+1)*3 where C = action_class_num:
        [0 : C+1] classification scores,
        [C+1 : 2*(C+1)] start offsets,
        [2*(C+1) : 3*(C+1)] end offsets.
    """
    print('To predict the label')
    n_out = (self.action_class_num + 1) * 3
    raw_scores = vs_multilayer.vs_multilayer(
        visual_feature_test,
        "CBR",
        middle_layer_dim=self.middle_layer_size,
        class_num=self.action_class_num,
        dropout=False,
        reuse=True)
    return tf.reshape(raw_scores, [n_out])
def compute_loss_reg(self, visual_feature, offsets, labels, test=False):
    """APN loss (TF >= 1.0 API): classification plus masked L1 regression.

    Returns (loss, offset_pred, loss_reg) where
    loss = loss_cls + lambda_reg * loss_reg; the regression term is
    masked by the 0/1 labels so only positive proposals contribute.
    """
    head_out = vs_multilayer.vs_multilayer(
        visual_feature, "APN", middle_layer_dim=1000, test=test)
    head_out = tf.reshape(head_out, [self.batch_size, 4])
    neg_score, pos_score, center_reg, length_reg = tf.split(head_out, 4, 1)
    class_logits = tf.concat((neg_score, pos_score), 1)
    offset_pred = tf.concat((center_reg, length_reg), 1)
    # classification loss
    loss_cls = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=class_logits, labels=labels))
    # regression loss, masked so only positive samples count
    pos_mask = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
    pos_mask = tf.concat([pos_mask, pos_mask], 1)
    loss_reg = tf.reduce_mean(
        tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), pos_mask))
    loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
    return loss, offset_pred, loss_reg
def eval(self, visual_feature_test):
    """Forward one test feature through the reused APN head.

    Returns a flat [4] tensor (2 class logits + 2 regression values).
    """
    head = vs_multilayer.vs_multilayer(
        visual_feature_test, "APN", middle_layer_dim=1000, reuse=True)
    flat = tf.reshape(head, [4])
    return flat
def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test): name = "CTRL_Model" with tf.variable_scope(name): print "Building training network...............................\n" """ embedding into common space dim 1024""" visual_feature_train = tf.transpose(visual_feature_train, [0, 2, 1]) # batch num fea inputs = tf.reshape(visual_feature_train, [-1, self.visual_feature_dim]) #batch x num,fe transformed_clip_train = fc( 'v2s_lt', inputs, output_dim=self.semantic_size) # batch x num, embed transformed_clip_train = tf.reshape(transformed_clip_train, [ self.batch_size, 2 * self.context_num + 1, self.semantic_size ]) #batch num embe transformed_sentence_train = fc( 's2s_lt', sentence_embed_train, output_dim=self.semantic_size) # batch, embed #### attention part print "attention part tanh(sum(x_1:t))*tanh(s) " concat_previous_feature = tf.zeros( [self.batch_size, 1, self.semantic_size]) for j in range(2 * self.context_num): now = tf.slice(transformed_clip_train, [0, 0, 0], [-1, j + 1, -1]) # print now.get_shape().as_list() now = tf.reduce_sum(now, 1) # print now.get_shape().as_list() now = tf.expand_dims(now, 1) # print now.get_shape().as_list() concat_previous_feature = tf.concat( [concat_previous_feature, now], 1) # batch num embed v = tf.tanh(tf.add(transformed_clip_train, concat_previous_feature)) relu_t = tf.tanh(transformed_sentence_train) #batch, embed concat_text = tf.reshape( tf.tile(relu_t, [1, 2 * self.context_num + 1]), [ self.batch_size, 2 * self.context_num + 1, self.semantic_size ]) # batch cont_num embed # computing weight a e = tf.reduce_sum(tf.multiply(concat_text, v), 2) # batch cont_num alpha = tf.nn.softmax(e) # batch, num_ctx a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]), [ self.batch_size, self.semantic_size, 2 * self.context_num + 1 ]) # batch 4096 cont_num visual_feature_train = tf.transpose(transformed_clip_train, [0, 2, 1]) # batch embed num input_vision = 
tf.reduce_sum(tf.multiply(visual_feature_train, a), 2) #batch embed transformed_clip_train_norm = tf.nn.l2_normalize(input_vision, dim=1) transformed_sentence_train_norm = tf.nn.l2_normalize( transformed_sentence_train, dim=1) # print transformed_clip_train_norm.shape # print transformed_sentence_train_norm.shape # exit() cross_modal_vec_train = self.cross_modal_comb( transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size) # batch batch 2*conmmon_space_dim # print cross_modal_vec_train.shape # exit() sim_score_mat_train = vs_multilayer.vs_multilayer( cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000) sim_score_mat_train = tf.reshape( sim_score_mat_train, [self.batch_size, self.batch_size, 3]) tf.get_variable_scope().reuse_variables() print "Building test network...............................\n" visual_feature_test = tf.transpose(visual_feature_test, [0, 2, 1]) # batch num fea inputs = tf.reshape(visual_feature_test, [-1, self.visual_feature_dim]) #batch x num,fe transformed_clip_test = fc('v2s_lt', inputs, output_dim=self.semantic_size) transformed_clip_test = tf.reshape(transformed_clip_test, [ self.test_batch_size, 2 * self.context_num + 1, self.semantic_size ]) #batch num embe transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size) #### attention part print "attention part tanh(sum(x_1:t))*tanh(s) " concat_previous_feature = tf.zeros( [self.test_batch_size, 1, self.semantic_size]) for j in range(2 * self.context_num): now = tf.slice(transformed_clip_test, [0, 0, 0], [-1, j + 1, -1]) print now.get_shape().as_list() now = tf.reduce_sum(now, 1) print now.get_shape().as_list() now = tf.expand_dims(now, 1) print now.get_shape().as_list() concat_previous_feature = tf.concat( 1, [concat_previous_feature, now]) # batch num embed v = tf.tanh(tf.add(transformed_clip_test, concat_previous_feature)) # batchx num, embed relu_t = tf.tanh(transformed_sentence_test) #batch, feature_embed concat_text = 
tf.reshape( tf.tile(relu_t, [1, 2 * self.context_num + 1]), [ self.test_batch_size, 2 * self.context_num + 1, self.semantic_size ]) # batch cont_num feature e = tf.reduce_sum(tf.mul(concat_text, v), 2) # batch cont_num alpha = tf.nn.softmax(e) # batch, num_ctx a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]), [ self.test_batch_size, self.semantic_size, 2 * self.context_num + 1 ]) # batch embed cont_num visual_feature_test = tf.transpose(transformed_clip_test, [0, 2, 1]) input_vision = tf.reduce_sum(tf.mul(visual_feature_test, a), 2) #batch embed transformed_clip_test_norm = tf.nn.l2_normalize(input_vision, dim=1) transformed_sentence_test_norm = tf.nn.l2_normalize( transformed_sentence_test, dim=1) cross_modal_vec_test = self.cross_modal_comb( transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size) sim_score_mat_test = vs_multilayer.vs_multilayer( cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000) sim_score_mat_test = tf.reshape(sim_score_mat_test, [3]) return sim_score_mat_train, sim_score_mat_test
def add_loss_op(self, visual_feature, offsets, labels, one_hot_labels, name='CBR'):
    """Compute the CBR training loss in the TensorFlow graph.

    Args:
        visual_feature: Tensor, feature, (batch_size, visual_feature_dim)
        offsets: Tensor, boundary offsets (start and end, frame-level),
            (batch_size, 2)
        labels: Tensor, class label per sample, (batch_size)
        one_hot_labels: Tensor, one-hot label,
            (batch_size, action_class_num+1) — unused in this function

    Returns:
        loss: loss_cls + lambda_reg * loss_reg (+ loss_l1 when
            config.l1_loss is set)
        loss_reg: L1 loss between ground-truth and predicted offsets,
            masked so background samples (label 0) contribute nothing
        loss_cls: cross-entropy classification loss

    Fix: removed the dead `else: loss = loss` branch (a no-op).
    """
    print('Add the standard loss')
    cls_reg_vec = vs_multilayer.vs_multilayer(
        visual_feature, name, middle_layer_dim=self.middle_layer_size,
        class_num=self.action_class_num, dropout=self.config.dropout)
    cls_reg_vec = tf.reshape(
        cls_reg_vec, [self.batch_size, (self.action_class_num + 1) * 3])
    # layout: class scores | per-class start offsets | per-class end offsets
    num_cls = self.action_class_num + 1
    cls_score_vec = cls_reg_vec[:, :num_cls]
    start_offset_pred = cls_reg_vec[:, num_cls:num_cls * 2]
    end_offset_pred = cls_reg_vec[:, num_cls * 2:]
    # NOTE(review): this "l1 loss" penalizes the magnitude of the class
    # logits themselves, not the weights — confirm this is intended.
    loss_l1 = tf.reduce_mean(tf.abs(cls_score_vec))
    # classification loss
    loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=cls_score_vec, labels=labels)
    loss_cls = tf.reduce_mean(loss_cls_vec)
    # regression loss: pick each sample's predicted offsets for its own class
    pick_start_offset_pred = []
    pick_end_offset_pred = []
    for k in range(self.batch_size):
        pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
        pick_end_offset_pred.append(end_offset_pred[k, labels[k]])
    pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred),
                                        [self.batch_size, 1])
    pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred),
                                      [self.batch_size, 1])
    # mask: 1 for action samples, 0 for background (label 0)
    labels_1 = tf.to_float(tf.not_equal(labels, 0))
    label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
    label_for_reg = tf.concat([label_tmp, label_tmp], 1)
    offset_pred = tf.concat((pick_start_offset_pred, pick_end_offset_pred), 1)
    loss_reg = tf.reduce_mean(
        tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), label_for_reg))
    loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
    if self.config.l1_loss:
        loss = tf.add(loss, loss_l1)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("loss_reg", loss_reg)
    tf.summary.scalar("loss_cls", loss_cls)
    return loss, loss_reg, loss_cls