def __init__(self, batch_size, train_video_length_info, unit_feature_size, unit_size, lambda_reg, lr,
             train_clip_path, test_clip_path, train_flow_feature_dir, train_appr_feature_dir,
             test_flow_feature_dir, test_appr_feature_dir):
    self.batch_size = batch_size
    self.test_batch_size = 1
    self.lr = lr
    self.lambda_reg = lambda_reg
    self.unit_feature_size = unit_feature_size  # 4096
    self.visual_feature_dim = unit_feature_size  # 4096
    self.train_set = TrainingDataSet(train_flow_feature_dir, train_appr_feature_dir, train_clip_path,
                                     batch_size, train_video_length_info, unit_feature_size, unit_size)
    self.test_set = TestingDataSet(test_flow_feature_dir, test_appr_feature_dir, test_clip_path,
                                   self.test_batch_size)
def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, train_visual_feature_dir):
    self.batch_size = batch_size
    self.test_batch_size = 1
    self.vs_lr = 0.005
    self.lambda_regression = 0.01
    self.alpha = 1.0 / batch_size
    self.semantic_size = 1024  # size of the common visual-semantic comparison space
    self.sentence_embedding_size = 4800
    self.visual_feature_dim = 4096 * 3
    self.train_set = TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)
    self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)
def __init__(self, config):
    """Initialization"""
    self.config = config
    self.sess = None
    self.saver = None
    self.train_clip_path = self.config.train_clip_path
    self.background_path = self.config.background_path
    self.test_clip_path = self.config.test_clip_path
    self.train_flow_feature_dir = self.config.train_flow_feature_dir
    self.train_appr_feature_dir = self.config.train_appr_feature_dir
    self.test_flow_feature_dir = self.config.test_flow_feature_dir
    self.test_appr_feature_dir = self.config.test_appr_feature_dir
    self.test_len_dict = self.config.test_len_dict
    self.batch_size = self.config.batch_size
    self.test_batch_size = 1
    self.middle_layer_size = 1000
    self.lambda_reg = float(self.config.lambda_reg)
    self.action_class_num = self.config.action_class_num
    self.feat_type = self.config.feat_type
    self.visual_feature_dim = self.config.visual_feature_dim
    # Initialize the training data and testing data
    self.train_set = TrainingDataSet(self.config, self.train_flow_feature_dir, self.train_appr_feature_dir,
                                     self.train_clip_path, self.background_path)
    self.test_set = TestingDataSet(self.config, self.test_flow_feature_dir, self.test_appr_feature_dir,
                                   self.test_clip_path, self.test_batch_size, self.test_len_dict)
    # Path to save the summaries of the models
    self.summary_dir = os.path.join('./summary', self.config.save_name)
    if not os.path.exists(self.summary_dir):
        os.mkdir(self.summary_dir)
    if self.config.issave == 'Yes':
        self.model_dir = os.path.join('./model', self.config.save_name)
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
def __init__(self, batch_size, train_video_length_info, ctx_num, unit_feature_size, unit_size,
             lambda_reg, lr, train_clip_path, background_path, test_clip_path,
             train_visual_feature_dir, test_visual_feature_dir):
    self.batch_size = batch_size
    self.test_batch_size = 1
    self.lr = lr
    self.lambda_reg = lambda_reg
    self.unit_feature_size = unit_feature_size
    self.visual_feature_dim = unit_feature_size * 3
    self.train_set = TrainingDataSet(train_visual_feature_dir, train_clip_path, background_path,
                                     batch_size, train_video_length_info, ctx_num,
                                     unit_feature_size, unit_size)
    self.test_set = TestingDataSet(test_visual_feature_dir, test_clip_path, self.test_batch_size, ctx_num)
def __init__(self, batch_size, pool_size, train_csv_path, test_csv_path, test_visual_feature_dir,
             train_visual_feature_dir):
    self.batch_size = batch_size
    self.test_batch_size = 1
    self.vs_lr = 0.0001
    self.lambda_regression = 0.01
    self.alpha = 1.0 / batch_size
    # self.alpha = 0.06
    self.pool_size = pool_size
    self.semantic_size = 1024
    self.sentence_embedding_size = 4800
    self.visual_feature_dim = 4096
    self.train_set = TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)
    self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)
    self.context_num = 1
def __init__(self, batch_size, train_video_length_info, ctx_num, unit_feature_size, unit_size,
             lambda_reg, lr, train_clip_path, background_path, test_clip_path,
             train_visual_feature_dir, test_visual_feature_dir):
    self.batch_size = batch_size
    self.test_batch_size = 1
    self.lr = lr
    self.lambda_reg = lambda_reg
    self.unit_feature_size = unit_feature_size
    self.visual_feature_dim = unit_feature_size * 3
    self.train_set = TrainingDataSet(train_visual_feature_dir, train_clip_path, background_path,
                                     batch_size, train_video_length_info, ctx_num,
                                     unit_feature_size, unit_size)
    self.test_set = TestingDataSet(test_visual_feature_dir, test_clip_path, self.test_batch_size, ctx_num)
def __init__(self, batch_size, ctx_num, unit_size, unit_feature_size, action_class_num, lr, lambda_reg,
             train_clip_path, background_path, test_clip_path, train_flow_feature_dir,
             train_appr_feature_dir, test_flow_feature_dir, test_appr_feature_dir):
    self.batch_size = batch_size
    self.test_batch_size = 1  # a batch is a single clip at test time; 128 clips at training time
    self.middle_layer_size = 1000
    self.vs_lr = lr
    self.lambda_reg = lambda_reg  # 1.0
    self.action_class_num = action_class_num  # 20
    self.visual_feature_dim = unit_feature_size * 3  # 4096*3
    self.train_set = TrainingDataSet(train_flow_feature_dir, train_appr_feature_dir, train_clip_path,
                                     background_path, batch_size, ctx_num, unit_size,
                                     unit_feature_size, action_class_num)
    self.test_set = TestingDataSet(test_flow_feature_dir, test_appr_feature_dir, test_clip_path,
                                   self.test_batch_size, unit_size)
def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, sliding_dir,
             sliding_training_sample_file, test_clip_sentence_pairs_path, test_swin_txt_path,
             train_softmax_dir, test_softmax_dir):
    self.batch_size = batch_size
    self.test_batch_size = 1
    self.vs_lr = 0.005
    self.lambda_regression = 0.01
    self.alpha = 1.0 / batch_size
    self.semantic_size = 1024  # size of the common visual-semantic comparison space
    self.action_semantic_size = 300
    self.sentence_embedding_size = 4800
    self.visual_feature_dim = 4096 * 3
    self.train_set = TrainingDataSet(sliding_dir, sliding_training_sample_file, train_csv_path,
                                     batch_size, train_softmax_dir)
    self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size,
                                   test_swin_txt_path, test_softmax_dir, test_clip_sentence_pairs_path)
class CTRL_Model(object):
    def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, train_visual_feature_dir):
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.005
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        self.semantic_size = 1024  # size of the common visual-semantic comparison space
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096 * 3
        self.train_set = TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)

    def fill_feed_dict_train(self):
        """Used in training the alignment model, CTRL(aln)."""
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    def fill_feed_dict_train_reg(self):
        """Used in training the alignment+regression model, CTRL(reg)."""
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        """Cross-modal processing module."""
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
                                [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed, [1, batch_size]),
                                [batch_size, batch_size, self.semantic_size])
        concat_feature = tf.reshape(tf.concat(2, [vv_feature, ss_feature]),
                                    [batch_size, batch_size, self.semantic_size + self.semantic_size])
        print concat_feature.get_shape().as_list()
        mul_feature = tf.mul(vv_feature, ss_feature)
        add_feature = tf.add(vv_feature, ss_feature)
        comb_feature = tf.reshape(tf.concat(2, [mul_feature, add_feature, concat_feature]),
                                  [1, batch_size, batch_size, self.semantic_size * 4])
        return comb_feature

    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                              visual_feature_test, sentence_embed_test):
        """Visual-semantic inference, including visual-semantic alignment and clip location regression."""
        name = "CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"
            transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size)
            transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
            cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm,
                                                          transformed_sentence_train_norm, self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt",
                                                              middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(sim_score_mat_train, [self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n"
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm,
                                                         transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt",
                                                             reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])
            return sim_score_mat_train, sim_score_mat_test

    def compute_loss_reg(self, sim_reg_mat, offset_label):
        """Compute alignment and regression loss."""
        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(2, 3, sim_reg_mat)
        sim_score_mat = tf.reshape(sim_score_mat, [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # diagonal matrix filled with -2
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #            | -1  1  1 ... |
        # mask_mat = |  1 -1  1 ... |
        #            |  1  1 -1 ... |
        mask_mat = tf.add(I_2, all1)
        # alignment (classification) loss, not considering IoU
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha, shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I, batch_para_mat)
        loss_mat = tf.log(tf.add(all1, tf.exp(tf.mul(mask_mat, sim_score_mat))))
        loss_mat = tf.mul(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
        # regression loss
        l_reg_diag = tf.matmul(tf.mul(l_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.mul(p_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat(1, (p_reg_diag, l_reg_diag))
        loss_reg = tf.reduce_mean(tf.abs(tf.sub(offset_pred, offset_label)))
        loss = tf.add(tf.mul(self.lambda_regression, loss_reg), loss_align)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        sentence_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        sentence_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.sentence_embedding_size))
        return visual_featmap_ph_train, sentence_ph_train, offset_ph, visual_featmap_ph_test, sentence_ph_test

    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "  " + v.name
        return v_dict

    def training(self, loss):
        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op

    def construct_model(self):
        # initialize the placeholders
        self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, \
            self.visual_featmap_ph_test, self.sentence_ph_test = self.init_placeholder()
        # build the inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(
            self.visual_featmap_ph_train, self.sentence_ph_train,
            self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute the loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg
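# A minimal NumPy sketch of the pairwise cross-modal combination built by
# cross_modal_comb above, with toy sizes batch_size=2 and semantic_size=3
# (not values from the source). Entry (i, j) pairs sentence i with clip j.
import numpy as np

B, D = 2, 3
visual = np.random.rand(B, D).astype(np.float32)    # one row per clip
sentence = np.random.rand(B, D).astype(np.float32)  # one row per sentence

vv = np.tile(visual, (B, 1)).reshape(B, B, D)    # clips vary along axis 1
ss = np.tile(sentence, (1, B)).reshape(B, B, D)  # sentences vary along axis 0
comb = np.concatenate([vv * ss, vv + ss, vv, ss], axis=2)  # mul | add | concat
comb = comb.reshape(1, B, B, 4 * D)
print(comb.shape)  # (1, 2, 2, 12), i.e. [1, batch, batch, semantic_size*4]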
class CBR_Model(object):
    def __init__(self, batch_size, ctx_num, unit_size, unit_feature_size, action_class_num, lr, lambda_reg,
                 train_clip_path, background_path, test_clip_path, train_flow_feature_dir,
                 train_appr_feature_dir, test_flow_feature_dir, test_appr_feature_dir):
        self.batch_size = batch_size
        self.test_batch_size = 1  # a batch is a single clip at test time; 128 clips at training time
        self.middle_layer_size = 1000
        self.vs_lr = lr
        self.lambda_reg = lambda_reg  # 1.0
        self.action_class_num = action_class_num  # 20
        self.visual_feature_dim = unit_feature_size * 3  # 4096*3
        self.train_set = TrainingDataSet(train_flow_feature_dir, train_appr_feature_dir, train_clip_path,
                                         background_path, batch_size, ctx_num, unit_size,
                                         unit_feature_size, action_class_num)
        self.test_set = TestingDataSet(test_flow_feature_dir, test_appr_feature_dir, test_clip_path,
                                       self.test_batch_size, unit_size)

    def fill_feed_dict_train(self):
        image_batch, label_batch, offset_batch, one_hot_label_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,  # loaded unit features plus context features
            self.label_ph: label_batch,
            self.offset_ph: offset_batch,
            self.one_hot_label_ph: one_hot_label_batch
        }
        return input_feed

    # compute the total loss and the regression loss;
    # vs_multilayer consists of two fully connected layers: it takes the clip's
    # features and produces the features used for classification and regression
    def compute_loss_reg(self, visual_feature, offsets, labels, one_hot_labels):
        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature, "CBR",
                                                  middle_layer_dim=self.middle_layer_size,
                                                  output_layer_dim=(self.action_class_num + 1) * 3)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, (self.action_class_num + 1) * 3])  # [128, 21*3]
        cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1]
        start_offset_pred = cls_reg_vec[:, self.action_class_num + 1:(self.action_class_num + 1) * 2]
        end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:]
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(cls_score_vec, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss: pick, for the k-th sample, the regression values belonging
        # to its own class (see the regression computation in the paper)
        pick_start_offset_pred = []
        pick_end_offset_pred = []
        for k in range(self.batch_size):
            # pick the predicted offsets of the k-th sample
            pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
            pick_end_offset_pred.append(end_offset_pred[k, labels[k]])
        pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred), [self.batch_size, 1])
        pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred), [self.batch_size, 1])
        # labels stores which class each sample belongs to;
        # background samples (label 0) are masked out of the regression loss
        labels_1 = tf.to_float(tf.not_equal(labels, 0))
        label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
        label_for_reg = tf.concat(1, [label_tmp, label_tmp])  # concatenate along columns, [128, 2]
        offset_pred = tf.concat(1, (pick_start_offset_pred, pick_end_offset_pred))  # [128, 2]
        loss_reg = tf.reduce_mean(tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))
        loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
        return loss, loss_reg

    # create the placeholders
    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(tf.float32,
                                                 shape=(self.batch_size, self.visual_feature_dim))  # (128, 12288)
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))  # (128,)
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))  # (128, 2)
        one_hot_label_ph = tf.placeholder(tf.float32,
                                          shape=(self.batch_size, self.action_class_num + 1))  # (128, 21)
        visual_featmap_ph_test = tf.placeholder(tf.float32,
                                                shape=(self.test_batch_size, self.visual_feature_dim))  # (1, 12288)
        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph, one_hot_label_ph

    # testing: produce the output scores
    def eval(self, visual_feature_test):
        sim_score = vs_multilayer.vs_multilayer(visual_feature_test, "CBR",
                                                middle_layer_dim=self.middle_layer_size,
                                                output_layer_dim=(self.action_class_num + 1) * 3,
                                                dropout=False, reuse=True)
        sim_score = tf.reshape(sim_score, [(self.action_class_num + 1) * 3])
        return sim_score

    def get_variables_by_name(self, name_list):  # name_list: ['CBR']
        # tf.trainable_variables returns the list of variables to be trained
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        # collect every variable whose name contains one of the names into v_dict
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "  " + v.name
        return v_dict

    # training: fetch the trainable variables, build the optimizer, then minimize the loss
    def training(self, loss):
        v_dict = self.get_variables_by_name(["CBR"])  # get the list of trainable variables
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["CBR"])
        return vs_train_op

    def construct_model(self):
        # construct the network
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph, self.offset_ph, \
            self.one_hot_label_ph = self.init_placeholder()
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(self.visual_featmap_ph_train, dim=1)  # L2-normalize each row
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(self.visual_featmap_ph_test, dim=1)
        # compute the classification loss and the regression loss;
        # return the total loss and the regression loss
        self.loss, loss_reg = self.compute_loss_reg(visual_featmap_ph_train_norm, self.offset_ph,
                                                    self.label_ph, self.one_hot_label_ph)
        self.vs_train_op = self.training(self.loss)
        vs_eval_op = self.eval(visual_featmap_ph_test_norm)
        # return the total loss, the train op, the eval op, and the regression loss
        return self.loss, self.vs_train_op, vs_eval_op, loss_reg
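# A minimal NumPy sketch of how the (action_class_num+1)*3 output vector above is
# split and how per-class offsets are picked; K1=4 classes (incl. background) and
# batch_size=2 are toy values, not from the source.
import numpy as np

K1, B = 4, 2                          # K1 = action_class_num + 1
cls_reg = np.random.rand(B, K1 * 3)
cls_score = cls_reg[:, :K1]           # classification logits
start_off = cls_reg[:, K1:2 * K1]     # per-class start offsets
end_off = cls_reg[:, 2 * K1:]         # per-class end offsets

labels = np.array([2, 0])             # class index per sample; 0 = background
rows = np.arange(B)
offset_pred = np.stack([start_off[rows, labels], end_off[rows, labels]], axis=1)
reg_mask = (labels != 0).astype(np.float32)[:, None]  # background gives no regression loss
print(offset_pred.shape, reg_mask.ravel())  # (2, 2) [1. 0.]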
class PATE_Model(object):
    def __init__(self, batch_size, train_video_length_info, unit_feature_size, unit_size, lambda_reg, lr,
                 train_clip_path, test_clip_path, train_flow_feature_dir, train_appr_feature_dir,
                 test_flow_feature_dir, test_appr_feature_dir):
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.lr = lr
        self.lambda_reg = lambda_reg
        self.unit_feature_size = unit_feature_size
        self.visual_feature_dim = unit_feature_size
        self.train_set = TrainingDataSet(train_flow_feature_dir, train_appr_feature_dir, train_clip_path,
                                         batch_size, train_video_length_info, unit_feature_size, unit_size)
        self.test_set = TestingDataSet(test_flow_feature_dir, test_appr_feature_dir, test_clip_path,
                                       self.test_batch_size)

    def fill_feed_dict_train_reg(self):
        image_batch, label_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch
        }
        return input_feed

    # construct the top network and compute the loss
    def compute_loss(self, visual_feature, labels):
        cls_vec = vs_multilayer.vs_multilayer(visual_feature, "PATE", middle_layer_dim=1000)
        cls_vec = tf.reshape(cls_vec, [self.batch_size, 2])
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(cls_vec, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        return loss_cls

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph

    # set up the eval op
    def eval(self, visual_feature_test):
        outputs = vs_multilayer.vs_multilayer(visual_feature_test, "PATE", middle_layer_dim=1000, reuse=True)
        outputs = tf.reshape(outputs, [2])
        return outputs

    # return all the variables whose names contain a name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "  " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["PATE"])
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["PATE"])
        return vs_train_op

    # construct the network
    def construct_model(self):
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph = self.init_placeholder()
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(self.visual_featmap_ph_test, dim=1)
        self.loss_cls = self.compute_loss(visual_featmap_ph_train_norm, self.label_ph)
        self.train_op = self.training(self.loss_cls)
        eval_op = self.eval(visual_featmap_ph_test_norm)
        return self.loss_cls, self.train_op, eval_op
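# A hedged usage sketch for the class above: graph construction and one training
# step. All paths and hyper-parameter values are hypothetical placeholders, and
# TrainingDataSet/TestingDataSet must be importable with matching signatures.
import tensorflow as tf

model = PATE_Model(batch_size=128, train_video_length_info={}, unit_feature_size=4096,
                   unit_size=16.0, lambda_reg=1.0, lr=0.005,
                   train_clip_path='./train_clips.txt', test_clip_path='./test_clips.txt',
                   train_flow_feature_dir='./feat/flow/', train_appr_feature_dir='./feat/appr/',
                   test_flow_feature_dir='./feat/flow/', test_appr_feature_dir='./feat/appr/')
loss_cls, train_op, eval_op = model.construct_model()
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())  # TF 0.x initializer, matching the code era
    _, loss_value = sess.run([train_op, loss_cls], feed_dict=model.fill_feed_dict_train_reg())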
def __init__(self):
    self.train_set = TrainingDataSet(self.batch_size)
    self.val_set = ValidationDataSet()
def __init__(self):
    self.train_set = TrainingDataSet(self.batch_size)
    self.test_set = TestingDataSet()
class TURN_Model(object):
    def __init__(self, batch_size, train_video_length_info, ctx_num, unit_feature_size, unit_size,
                 lambda_reg, lr, train_clip_path, background_path, test_clip_path,
                 train_visual_feature_dir, test_visual_feature_dir):
        self.batch_size = batch_size  # 128
        self.test_batch_size = 1
        self.lr = lr
        self.lambda_reg = lambda_reg  # 2.0
        self.unit_feature_size = unit_feature_size  # 2048
        self.visual_feature_dim = unit_feature_size * 3  # 2048*3 = 6144
        # these mainly load the features of the corresponding clips
        self.train_set = TrainingDataSet(train_visual_feature_dir, train_clip_path, background_path,
                                         batch_size, train_video_length_info, ctx_num,
                                         unit_feature_size, unit_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_clip_path, self.test_batch_size, ctx_num)

    def fill_feed_dict_train_reg(self):
        """Load the data needed for one training step.

        image_batch holds the features of one batch (128 clips),
        label_batch the labels of that batch, and offset_batch the
        pre-computed ground-truth coordinate offsets. Returns a dict
        with one batch of features, labels and coordinate offsets.
        """
        image_batch, label_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch  # ground-truth offsets
        }
        return input_feed

    # construct the top network and compute the loss
    def compute_loss_reg(self, visual_feature, offsets, labels):
        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature, "APN", middle_layer_dim=1000)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])  # [128, 4]
        # split and regroup the classification and regression vectors;
        # each of cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec is (128, 1)
        cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(1, 4, cls_reg_vec)
        cls_score_vec = tf.concat(1, (cls_score_vec_0, cls_score_vec_1))
        offset_pred = tf.concat(1, (p_reg_vec, l_reg_vec))
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(cls_score_vec, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat(1, [label_tmp, label_tmp])
        # offset_pred holds the coordinate offsets predicted by the last FC layer;
        # offsets holds the ground-truth coordinate offsets
        loss_reg = tf.reduce_mean(tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))
        loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph

    # set up the eval op
    def eval(self, visual_feature_test):
        outputs = vs_multilayer.vs_multilayer(visual_feature_test, "APN", middle_layer_dim=1000, reuse=True)
        outputs = tf.reshape(outputs, [4])
        return outputs

    # return all the variables whose names contain a name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "  " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["APN"])  # fetch the trainable parameters
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["APN"])
        return vs_train_op

    # construct the network
    def construct_model(self):
        # visual_featmap_ph_train: (128, 6144), visual_featmap_ph_test: (1, 6144),
        # label_ph: (128,), offset_ph: (128, 2)
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph, self.offset_ph = \
            self.init_placeholder()
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(self.visual_featmap_ph_test, dim=1)
        # compute the total loss, the regression loss and the regressed offsets from the labels
        self.loss_cls_reg, offset_pred, loss_reg = self.compute_loss_reg(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph)
        self.train_op = self.training(self.loss_cls_reg)  # the train op minimizes the total loss
        eval_op = self.eval(visual_featmap_ph_test_norm)  # FC-layer outputs used later for testing
        return self.loss_cls_reg, self.train_op, eval_op, loss_reg
initial_steps = 0
max_steps = 20000
batch_size = 64
train_csv_path = "/home/wam/Action_Recognition/TACoS/train_clip-sentvec.pkl"
test_csv_path = "/home/wam/Action_Recognition/TACoS/test_clip-sentvec.pkl"
test_feature_dir = "/home/wam/Action_Recognition/Interval128_256_overlap0.8_c3d_fc6/"
train_feature_dir = "/home/wam/Action_Recognition/Interval64_128_256_512_overlap0.8_c3d_fc6/"
test_batch_size = 1
vs_lr = 0.001
lambda_regression = 0.01
alpha = 1.0 / batch_size
semantic_size = 1024  # size of the common visual-semantic comparison space
sentence_embedding_size = 4800
visual_feature_dim = 4096 * 3

train_set = TrainingDataSet(train_feature_dir, train_csv_path, batch_size)
test_set = TestingDataSet(test_feature_dir, test_csv_path, test_batch_size)

def compute_loss_reg(sim_reg_mat, offset_label):
    sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
    sim_score_mat = tf.reshape(sim_score_mat, [batch_size, batch_size])
    l_reg_mat = tf.reshape(l_reg_mat, [batch_size, batch_size])
    p_reg_mat = tf.reshape(p_reg_mat, [batch_size, batch_size])
    # diagonal matrix filled with -2
    I_2 = tf.diag(tf.constant(-2.0, shape=[batch_size]))
    all1 = tf.constant(1.0, shape=[batch_size, batch_size])
    mask_mat = tf.add(I_2, all1)
    # alignment (classification) loss, not considering IoU
    I = tf.diag(tf.constant(1.0, shape=[batch_size]))
    I_half = tf.diag(tf.constant(0.5, shape=[batch_size]))
    batch_para_mat = tf.constant(alpha, shape=[batch_size, batch_size])
    # the remainder mirrors the CTRL compute_loss_reg above, in TF 1.x API
    para_mat = tf.add(I, batch_para_mat)
    loss_mat = tf.log(tf.add(all1, tf.exp(tf.multiply(mask_mat, sim_score_mat))))
    loss_mat = tf.multiply(loss_mat, para_mat)
    loss_align = tf.reduce_mean(loss_mat)
    # regression loss
    l_reg_diag = tf.matmul(tf.multiply(l_reg_mat, I), tf.constant(1.0, shape=[batch_size, 1]))
    p_reg_diag = tf.matmul(tf.multiply(p_reg_mat, I), tf.constant(1.0, shape=[batch_size, 1]))
    offset_pred = tf.concat((p_reg_diag, l_reg_diag), 1)
    loss_reg = tf.reduce_mean(tf.abs(tf.subtract(offset_pred, offset_label)))
    loss = tf.add(tf.multiply(lambda_regression, loss_reg), loss_align)
    return loss, offset_pred, loss_reg
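# A worked toy example (batch_size=3) of the mask and weighting matrices built
# above: aligned clip-sentence pairs sit on the diagonal (entry -1), misaligned
# pairs off the diagonal (entry +1), and para_mat up-weights the diagonal terms.
import numpy as np

B, a = 3, 1.0 / 3
mask_mat = np.full((B, B), 1.0) - 2.0 * np.eye(B)  # -1 on the diagonal, +1 elsewhere
para_mat = np.eye(B) + np.full((B, B), a)          # 1 + alpha on diag, alpha off diag
sim = np.random.rand(B, B)                         # toy similarity scores
loss_mat = np.log(1.0 + np.exp(mask_mat * sim)) * para_mat
print(mask_mat)
# [[-1.  1.  1.]
#  [ 1. -1.  1.]
#  [ 1.  1. -1.]]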
class TURN_Model(object):
    def __init__(self, batch_size, train_video_length_info, ctx_num, unit_feature_size, unit_size,
                 lambda_reg, lr, train_clip_path, background_path, test_clip_path,
                 train_visual_feature_dir, test_visual_feature_dir):
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.lr = lr
        self.lambda_reg = lambda_reg
        self.unit_feature_size = unit_feature_size
        self.visual_feature_dim = unit_feature_size * 3
        self.train_set = TrainingDataSet(train_visual_feature_dir, train_clip_path, background_path,
                                         batch_size, train_video_length_info, ctx_num,
                                         unit_feature_size, unit_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_clip_path, self.test_batch_size, ctx_num)

    def fill_feed_dict_train_reg(self):
        image_batch, label_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    # construct the top network and compute the loss
    def compute_loss_reg(self, visual_feature, offsets, labels):
        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature, "APN", middle_layer_dim=1000)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])
        cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(1, 4, cls_reg_vec)
        cls_score_vec = tf.concat(1, (cls_score_vec_0, cls_score_vec_1))
        offset_pred = tf.concat(1, (p_reg_vec, l_reg_vec))
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(cls_score_vec, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat(1, [label_tmp, label_tmp])
        loss_reg = tf.reduce_mean(tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))
        loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph

    # set up the eval op
    def eval(self, visual_feature_test):
        outputs = vs_multilayer.vs_multilayer(visual_feature_test, "APN", middle_layer_dim=1000, reuse=True)
        outputs = tf.reshape(outputs, [4])
        return outputs

    # return all the variables whose names contain a name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "  " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["APN"])
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["APN"])
        return vs_train_op

    # construct the network
    def construct_model(self):
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph, self.offset_ph = \
            self.init_placeholder()
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(self.visual_featmap_ph_test, dim=1)
        self.loss_cls_reg, offset_pred, loss_reg = self.compute_loss_reg(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph)
        self.train_op = self.training(self.loss_cls_reg)
        eval_op = self.eval(visual_featmap_ph_test_norm)
        return self.loss_cls_reg, self.train_op, eval_op, loss_reg
class TURN_Model(object):
    ctx_num = 4
    unit_size = 16.0
    unit_feature_size = 2048
    lr = 0.005
    lambda_reg = 2.0
    batch_size = 128
    test_batch_size = 1
    visual_feature_dim = unit_feature_size * 3

    def __init__(self):
        self.train_set = TrainingDataSet(self.batch_size)
        self.test_set = TestingDataSet()

    def fill_feed_dict_train_reg(self):
        image_batch, label_batch, offset_batch = self.train_set.next_batch()
        image_batch = np.nan_to_num(image_batch)
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    # construct the top network and compute the loss
    def compute_loss_reg(self, visual_feature, offsets, labels, test=False):
        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature, "APN", middle_layer_dim=1000, test=test)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])
        cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(cls_reg_vec, 4, 1)
        cls_score_vec = tf.concat((cls_score_vec_0, cls_score_vec_1), 1)
        offset_pred = tf.concat((p_reg_vec, l_reg_vec), 1)
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        loss_reg = tf.reduce_mean(tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), label_for_reg))
        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph

    # set up the eval op
    def eval(self, visual_feature_test):
        outputs = vs_multilayer.vs_multilayer(visual_feature_test, "APN", middle_layer_dim=1000, reuse=True)
        outputs = tf.reshape(outputs, [4])
        return outputs

    # return all the variables whose names contain a name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "  " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["APN"])
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["APN"])
        return vs_train_op

    # construct the network
    def construct_model(self, test=False):
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph, self.offset_ph = \
            self.init_placeholder()
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(self.visual_featmap_ph_test, dim=1)
        self.loss_cls_reg, offset_pred, loss_reg = self.compute_loss_reg(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph, test)
        self.train_op = self.training(self.loss_cls_reg)
        eval_op = self.eval(visual_featmap_ph_test_norm)
        return self.loss_cls_reg, self.train_op, eval_op, loss_reg
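# This copy of TURN_Model differs from the earlier ones only in TensorFlow API
# era. A short reference of the calls that changed around TF 1.0 (the mapping
# below summarizes the differences visible in the two copies):
#
#   TF 0.x (earlier copies)             TF 1.x (this copy)
#   tf.split(1, 4, x)               ->  tf.split(x, 4, 1)
#   tf.concat(1, parts)             ->  tf.concat(parts, 1)
#   tf.mul / tf.sub                 ->  tf.multiply / tf.subtract
#   sparse_softmax_cross_entropy_with_logits(logits, labels)
#                                   ->  ...(logits=logits, labels=labels)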
class TAR_Model(object):
    def __init__(self, batch_size, train_video_length_info, ctx_num, central_num, unit_feature_size,
                 unit_size, lambda_reg, lr, train_clip_path, background_path, test_clip_path,
                 train_flow_feature_dir, train_appr_feature_dir, test_flow_feature_dir,
                 test_appr_feature_dir):
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.lr = lr
        self.lambda_reg = lambda_reg
        self.unit_feature_size = unit_feature_size
        self.visual_feature_dim = unit_feature_size
        self.train_set = TrainingDataSet(train_flow_feature_dir, train_appr_feature_dir, train_clip_path,
                                         background_path, batch_size, train_video_length_info, ctx_num,
                                         central_num, unit_feature_size, unit_size)
        self.test_set = TestingDataSet(test_flow_feature_dir, test_appr_feature_dir, test_clip_path,
                                       self.test_batch_size, ctx_num)
        self.ctx_num = ctx_num
        self.central_num = central_num

    def fill_feed_dict_train_reg(self):
        central_batch, left_batch, right_batch, label_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.central_ph_train: central_batch,
            self.left_ph_train: left_batch,
            self.right_ph_train: right_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    # construct the top network and compute the loss
    def compute_loss_reg(self, central, start, end, offsets, labels):
        central_cls, start_reg, end_reg = vs_multilayer.vs_multilayer(central, start, end, "BLA")
        offset_pred = tf.concat(1, (start_reg, end_reg))
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(central_cls, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat(1, [label_tmp, label_tmp])
        loss_reg = tf.reduce_mean(tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))
        loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        self.central_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.central_num, self.visual_feature_dim))
        self.left_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.ctx_num, self.visual_feature_dim))
        self.right_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.ctx_num, self.visual_feature_dim))
        self.label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        self.offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        self.central_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.central_num, self.visual_feature_dim))
        self.left_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.ctx_num, self.visual_feature_dim))
        self.right_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.ctx_num, self.visual_feature_dim))
        return

    # set up the eval op
    def eval(self, central, start, end):
        central_cls, start_reg, end_reg = vs_multilayer.vs_multilayer(central, start, end, "BLA", reuse=True)
        outputs = tf.concat(1, (central_cls, start_reg, end_reg))
        outputs = tf.reshape(outputs, [4])
        print "eval output size: " + str(outputs.get_shape().as_list())
        return outputs

    # return all the variables whose names contain a name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "  " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["BLA"])
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["BLA"])
        return vs_train_op

    # construct the network
    def construct_model(self):
        self.init_placeholder()
        self.loss_cls_reg, offset_pred, loss_reg = self.compute_loss_reg(
            self.central_ph_train, self.left_ph_train, self.right_ph_train, self.offset_ph, self.label_ph)
        self.train_op = self.training(self.loss_cls_reg)
        eval_op = self.eval(self.central_ph_test, self.left_ph_test, self.right_ph_test)
        return self.loss_cls_reg, self.train_op, eval_op, loss_reg
class ACRN_Model(object):
    def __init__(self, batch_size, pool_size, train_csv_path, test_csv_path, test_visual_feature_dir,
                 train_visual_feature_dir):
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.0001
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        # self.alpha = 0.06
        self.pool_size = pool_size
        self.semantic_size = 1024
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096
        self.train_set = TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)
        self.context_num = 1

    def fill_feed_dict_train(self):
        """Used in training the alignment model, CTRL(aln)."""
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    def fill_feed_dict_train_reg(self):
        """Used in training the alignment+regression model, CTRL(reg)."""
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        """Cross-modal processing module."""
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
                                [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed, [1, batch_size]),
                                [batch_size, batch_size, self.semantic_size])
        vv_feature1 = tf.reshape(vv_feature, [batch_size, batch_size, -1, 1])
        ss_feature1 = tf.reshape(ss_feature, [batch_size, batch_size, -1, 1])
        pool_vv = tf.nn.avg_pool(vv_feature1, ksize=[1, 1, self.pool_size, 1],
                                 strides=[1, 1, self.pool_size, 1], padding='SAME')
        pool_ss = tf.nn.avg_pool(ss_feature1, ksize=[1, 1, self.pool_size, 1],
                                 strides=[1, 1, self.pool_size, 1], padding='SAME')
        shape_vv = pool_vv.get_shape().as_list()
        shape_ss = pool_ss.get_shape().as_list()
        vv = tf.reshape(pool_vv, [batch_size * batch_size, shape_vv[2], 1])
        ss = tf.reshape(pool_ss, [batch_size * batch_size, 1, shape_ss[2]])  # batch x batch, feature
        print vv.shape, ss.shape
        concat_feature = tf.matmul(vv, ss)  # batch*batch, outer product of the pooled features
        print concat_feature.shape
        concat_feature = tf.reshape(concat_feature, [batch_size, batch_size, -1])
        comb_feature = tf.reshape(tf.concat([vv_feature, ss_feature, concat_feature], 2),
                                  [1, batch_size, batch_size, -1])
        return comb_feature

    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                              visual_feature_test, sentence_embed_test):
        """Visual-semantic inference, including visual-semantic alignment and clip location regression."""
        name = "CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"
            # embed into the common space of dimension 1024
            visual_feature_train = tf.transpose(visual_feature_train, [0, 2, 1])  # batch, num, feat
            inputs = tf.reshape(visual_feature_train, [-1, self.visual_feature_dim])  # batch x num, feat
            transformed_clip_train = fc('v2s_lt', inputs, output_dim=self.semantic_size)  # batch x num, embed
            transformed_clip_train = tf.reshape(
                transformed_clip_train,
                [self.batch_size, 2 * self.context_num + 1, self.semantic_size])  # batch, num, embed
            transformed_sentence_train = fc('s2s_lt', sentence_embed_train,
                                            output_dim=self.semantic_size)  # batch, embed
            # attention part: tanh(sum(x_1:t)) * tanh(s)
            print "attention part tanh(sum(x_1:t))*tanh(s) "
            concat_previous_feature = tf.zeros([self.batch_size, 1, self.semantic_size])
            for j in range(2 * self.context_num):
                now = tf.slice(transformed_clip_train, [0, 0, 0], [-1, j + 1, -1])
                now = tf.reduce_sum(now, 1)
                now = tf.expand_dims(now, 1)
                concat_previous_feature = tf.concat([concat_previous_feature, now], 1)  # batch, num, embed
            v = tf.tanh(tf.add(transformed_clip_train, concat_previous_feature))
            relu_t = tf.tanh(transformed_sentence_train)  # batch, embed
            concat_text = tf.reshape(
                tf.tile(relu_t, [1, 2 * self.context_num + 1]),
                [self.batch_size, 2 * self.context_num + 1, self.semantic_size])  # batch, ctx_num, embed
            # compute the attention weights alpha
            e = tf.reduce_sum(tf.multiply(concat_text, v), 2)  # batch, ctx_num
            alpha = tf.nn.softmax(e)  # batch, num_ctx
            a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]),
                           [self.batch_size, self.semantic_size, 2 * self.context_num + 1])  # batch, embed, ctx_num
            visual_feature_train = tf.transpose(transformed_clip_train, [0, 2, 1])  # batch, embed, num
            input_vision = tf.reduce_sum(tf.multiply(visual_feature_train, a), 2)  # batch, embed
            transformed_clip_train_norm = tf.nn.l2_normalize(input_vision, dim=1)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
            cross_modal_vec_train = self.cross_modal_comb(
                transformed_clip_train_norm, transformed_sentence_train_norm,
                self.batch_size)  # batch, batch, 2*common_space_dim
            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt",
                                                              middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(sim_score_mat_train, [self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n"
            visual_feature_test = tf.transpose(visual_feature_test, [0, 2, 1])  # batch, num, feat
            inputs = tf.reshape(visual_feature_test, [-1, self.visual_feature_dim])  # batch x num, feat
            transformed_clip_test = fc('v2s_lt', inputs, output_dim=self.semantic_size)
            transformed_clip_test = tf.reshape(
                transformed_clip_test,
                [self.test_batch_size, 2 * self.context_num + 1, self.semantic_size])  # batch, num, embed
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            # attention part: tanh(sum(x_1:t)) * tanh(s)
            print "attention part tanh(sum(x_1:t))*tanh(s) "
            concat_previous_feature = tf.zeros([self.test_batch_size, 1, self.semantic_size])
            for j in range(2 * self.context_num):
                now = tf.slice(transformed_clip_test, [0, 0, 0], [-1, j + 1, -1])
                now = tf.reduce_sum(now, 1)
                now = tf.expand_dims(now, 1)
                concat_previous_feature = tf.concat([concat_previous_feature, now], 1)  # batch, num, embed
            v = tf.tanh(tf.add(transformed_clip_test, concat_previous_feature))  # batch, num, embed
            relu_t = tf.tanh(transformed_sentence_test)  # batch, embed
            concat_text = tf.reshape(
                tf.tile(relu_t, [1, 2 * self.context_num + 1]),
                [self.test_batch_size, 2 * self.context_num + 1, self.semantic_size])  # batch, ctx_num, embed
            e = tf.reduce_sum(tf.multiply(concat_text, v), 2)  # batch, ctx_num
            alpha = tf.nn.softmax(e)  # batch, num_ctx
            a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]),
                           [self.test_batch_size, self.semantic_size, 2 * self.context_num + 1])  # batch, embed, ctx_num
            visual_feature_test = tf.transpose(transformed_clip_test, [0, 2, 1])
            input_vision = tf.reduce_sum(tf.multiply(visual_feature_test, a), 2)  # batch, embed
            transformed_clip_test_norm = tf.nn.l2_normalize(input_vision, dim=1)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(
                transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt",
                                                             reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])
            return sim_score_mat_train, sim_score_mat_test

    def compute_loss_reg(self, sim_reg_mat, offset_label):
        """Compute alignment and regression loss."""
        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
        sim_score_mat = tf.reshape(sim_score_mat, [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # diagonal matrix filled with -2
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #            | -1  1  1 ... |
        # mask_mat = |  1 -1  1 ... |
        #            |  1  1 -1 ... |
        mask_mat = tf.add(I_2, all1)
        # alignment (classification) loss, not considering IoU
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha, shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I, batch_para_mat)
        loss_mat = tf.log(tf.add(all1, tf.exp(tf.multiply(mask_mat, sim_score_mat))))
        loss_mat = tf.multiply(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
        # regression loss
        l_reg_diag = tf.matmul(tf.multiply(l_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.multiply(p_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat((p_reg_diag, l_reg_diag), 1)
        loss_reg = tf.reduce_mean(tf.abs(tf.subtract(offset_pred, offset_label)))
        loss = tf.add(tf.multiply(self.lambda_regression, loss_reg), loss_align)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        # input feature: current clip, pre-context, and post-context
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim, 2 * self.context_num + 1))
        sentence_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim, 2 * self.context_num + 1))
        sentence_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.sentence_embedding_size))
        return visual_featmap_ph_train, sentence_ph_train, offset_ph, visual_featmap_ph_test, sentence_ph_test

    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "  " + v.name
        return v_dict

    def training(self, loss):
        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op

    def construct_model(self):
        # initialize the placeholders
        self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, \
            self.visual_featmap_ph_test, self.sentence_ph_test = self.init_placeholder()
        # build the inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(
            self.visual_featmap_ph_train, self.sentence_ph_train,
            self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute the loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg
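# A minimal NumPy sketch of the attention above (tanh(sum(x_1:t)) * tanh(s)),
# assuming context_num=1 (so 3 clips per sample) and a toy embed size of 4;
# all values are random stand-ins, not real features.
import numpy as np

B, T, D = 2, 3, 4                     # batch, 2*context_num+1 clips, embed dim
clips = np.random.rand(B, T, D)       # transformed clip embeddings
sent = np.random.rand(B, D)           # transformed sentence embedding

# cumulative sum of the preceding clips, with a zero vector at t=0
prev = np.concatenate([np.zeros((B, 1, D)), np.cumsum(clips, axis=1)[:, :-1]], axis=1)
v = np.tanh(clips + prev)
e = np.sum(np.tanh(sent)[:, None, :] * v, axis=2)         # (B, T) attention scores
alpha = np.exp(e) / np.exp(e).sum(axis=1, keepdims=True)  # softmax weights
attended = np.sum(clips * alpha[:, :, None], axis=1)      # (B, D) weighted clip feature
print(attended.shape)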
class CBR_Model(object): """ This is the body of the network we are using Here you will get access to the network structure, function of training, evaluation """ def __init__(self, config): """Initialization """ self.config = config self.sess = None self.saver = None self.train_clip_path = self.config.train_clip_path self.background_path = self.config.background_path self.test_clip_path = self.config.test_clip_path self.train_flow_feature_dir = self.config.train_flow_feature_dir self.train_appr_feature_dir = self.config.train_appr_feature_dir self.test_flow_feature_dir = self.config.test_flow_feature_dir self.test_appr_feature_dir = self.config.test_appr_feature_dir self.test_len_dict = self.config.test_len_dict self.batch_size = self.config.batch_size self.test_batch_size = 1 self.middle_layer_size = 1000 self.lambda_reg = float(self.config.lambda_reg) self.action_class_num = self.config.action_class_num self.feat_type = self.config.feat_type self.visual_feature_dim = self.config.visual_feature_dim # Initialize the training data and testing data self.train_set = TrainingDataSet(self.config, self.train_flow_feature_dir, self.train_appr_feature_dir, self.train_clip_path, self.background_path) self.test_set = TestingDataSet(self.config, self.test_flow_feature_dir, self.test_appr_feature_dir, self.test_clip_path, self.test_batch_size, self.test_len_dict) # Path to save the summary of the models self.summary_dir = os.path.join('./summary', self.config.save_name) if not os.path.exists(self.summary_dir): os.mkdir(self.summary_dir) if self.config.issave == 'Yes': self.model_dir = os.path.join('./model', self.config.save_name) if not os.path.exists(self.model_dir): os.mkdir(self.model_dir) def init_session(self): """Create a session in tensorflow """ print('Initializing of session') gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3 ) # 30% memory of TITAN is enough self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver(max_to_keep=10) if self.config.ispretrain == 'Yes': self.restore_session('model/xx.ckpt') def get_feed_dict(self, lr_by_step): """Prepare training samples in each batch size to the network """ image_batch, label_batch, offset_batch, one_hot_label_batch = self.train_set.next_batch( ) input_feed = { self.visual_featmap_ph_train: image_batch, self.label_ph: label_batch, self.offset_ph: offset_batch, self.one_hot_label_ph: one_hot_label_batch, self.vs_lr: lr_by_step } return input_feed def add_loss_op(self, visual_feature, offsets, labels, one_hot_labels, name='CBR'): """This function is to compute the loss in tensorflow graph Args: visual_feature: Tensor, feature, (batch_size, visual_feature_dim) offsets: Tensor, boundary offset(both to the start and end in frame-level), (batch_size, 2) labels: Tensor, label, (batch_size) one_hot_labels: Tensor, one hot label, (batch_size, action_class_num+1) Returns: loss: loss_cls + lambda_reg * loss_reg loss_reg: L1 loss between ground truth offsets and prediction offsets loss_cls: cross entropy loss """ print('Add the standard loss') cls_reg_vec = vs_multilayer.vs_multilayer( visual_feature, name, middle_layer_dim=self.middle_layer_size, class_num=self.action_class_num, dropout=self.config.dropout) cls_reg_vec = tf.reshape( cls_reg_vec, [self.batch_size, (self.action_class_num + 1) * 3]) cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1] start_offset_pred = cls_reg_vec[:, self.action_class_num + 1:(self.action_class_num + 1) * 2] 
end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:] # l1 loss loss_l1 = tf.reduce_mean(tf.abs(cls_score_vec)) # classification loss loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=cls_score_vec, labels=labels) loss_cls = tf.reduce_mean(loss_cls_vec) # regression loss pick_start_offset_pred = [] pick_end_offset_pred = [] for k in range(self.batch_size): pick_start_offset_pred.append(start_offset_pred[k, labels[k]]) pick_end_offset_pred.append(end_offset_pred[k, labels[k]]) pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred), [self.batch_size, 1]) pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred), [self.batch_size, 1]) labels_1 = tf.to_float(tf.not_equal(labels, 0)) label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1])) label_for_reg = tf.concat([label_tmp, label_tmp], 1) offset_pred = tf.concat((pick_start_offset_pred, pick_end_offset_pred), 1) loss_reg = tf.reduce_mean( tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), label_for_reg)) loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls) if self.config.l1_loss: loss = tf.add(loss, loss_l1) else: loss = loss tf.summary.scalar("loss", loss) tf.summary.scalar("loss_reg", loss_reg) tf.summary.scalar("loss_cls", loss_cls) return loss, loss_reg, loss_cls def add_placeholders(self): """Add placeholders """ print('Add placeholders') self.visual_featmap_ph_train = tf.placeholder( tf.float32, name='train_featmap', shape=(self.batch_size, self.visual_feature_dim)) self.visual_featmap_ph_test = tf.placeholder( tf.float32, name='test_featmap', shape=(self.test_batch_size, self.visual_feature_dim)) self.label_ph = tf.placeholder(tf.int32, name='label', shape=(self.batch_size)) self.offset_ph = tf.placeholder(tf.float32, name='offset', shape=(self.batch_size, 2)) self.one_hot_label_ph = tf.placeholder( tf.float32, name='one_hot_label', shape=(self.batch_size, self.action_class_num + 1)) self.vs_lr = tf.placeholder(tf.float32, name='lr') def add_summary(self): """Add summary """ print('Add summay') self.merged = tf.summary.merge_all() self.file_writer = tf.summary.FileWriter(self.summary_dir, self.sess.graph) def save_session(self, step): """Save the session if needed """ if self.config.issave == 'Yes': print('Save session') model_name = os.path.join(self.model_dir, str(step) + '.ckpt') self.saver.save(self.sess, model_name) def restore_session(self, dir_model): """Restore session """ print('Restore the Session') self.saver.restore(self.sess, dir_model) def close_session(self): """ Close session once finished """ print('Close session') self.sess.close() def predict(self, visual_feature_test): """Inference during testing Args: visual_feature_test: Tensor, feature, (test_batch_size, visual_feature_dim) Returns: sim_score: Tensor, (action_class_num+1)*3 (Note: [0:action_class_num+1]: classification scores; [action_class_num+1:(action_class_num+1)*2: start offsets; [(action_class_num+1)*2:(action_class_num+1)*3]: end offsets) """ print('To predict the label') sim_score = vs_multilayer.vs_multilayer( visual_feature_test, "CBR", middle_layer_dim=self.middle_layer_size, class_num=self.action_class_num, dropout=False, reuse=True) sim_score = tf.reshape(sim_score, [(self.action_class_num + 1) * 3]) return sim_score def get_variables_by_name(self, name_list): """Get variables by name """ v_list = tf.trainable_variables() v_dict = {} for name in name_list: v_dict[name] = [] for v in v_list: for name in name_list: if name in v.name: v_dict[name].append(v) for name in 
        for name in name_list:
            print("Variables of <" + name + ">")
            for v in v_dict[name]:
                print("    " + v.name)
        return v_dict

    def add_train_op(self, loss):
        """Add the train operation."""
        print('Add train operation')
        v_dict = self.get_variables_by_name(["CBR"])
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if self.config.opm_type == 'adam_wd':
            vs_optimizer = tf.contrib.opt.extend_with_decoupled_weight_decay(
                tf.train.AdamOptimizer)
            optimizer = vs_optimizer(weight_decay=1e-4, learning_rate=self.vs_lr,
                                     name='vs_adam')
            with tf.control_dependencies(update_ops):
                vs_train_op = optimizer.minimize(loss, var_list=v_dict["CBR"])
        elif self.config.opm_type == 'adam':
            vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
            with tf.control_dependencies(update_ops):
                vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["CBR"])
        return vs_train_op

    def build(self):
        """Build the model."""
        print('Construct the network')
        self.add_placeholders()
        if self.config.norm == 'l2':
            visual_featmap_ph_train_norm = tf.nn.l2_normalize(
                self.visual_featmap_ph_train, dim=1)
            visual_featmap_ph_test_norm = tf.nn.l2_normalize(
                self.visual_featmap_ph_test, dim=1)
        elif self.config.norm == 'No':
            visual_featmap_ph_train_norm = self.visual_featmap_ph_train
            visual_featmap_ph_test_norm = self.visual_featmap_ph_test
        self.loss, self.loss_reg, self.loss_cls = self.add_loss_op(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph,
            self.one_hot_label_ph)
        self.vs_train_op = self.add_train_op(self.loss)
        self.vs_eval_op = self.predict(visual_featmap_ph_test_norm)
        self.init_session()

    def train(self):
        """Training."""
        self.add_summary()
        for step in range(self.config.max_steps):
            # if step <= 3000:
            lr = self.config.lr
            # else:
            #     lr = self.config.lr / 10
            start_time = time.time()
            feed_dict = self.get_feed_dict(lr)
            duration1 = time.time() - start_time
            [_, loss_value, loss_reg_value, loss_cls_value, summary] = self.sess.run(
                [self.vs_train_op, self.loss, self.loss_reg, self.loss_cls,
                 self.merged],
                feed_dict=feed_dict)
            duration2 = time.time() - start_time
            print('Step %d: loss=%.2f, loss_reg=%.2f, loss_cls=%.2f, (%.3f sec), (%.3f sec)'
                  % (step, loss_value, loss_reg_value, loss_cls_value,
                     duration1, duration2))
            self.file_writer.add_summary(summary, step)
            if (step + 1) == 4000 or (step + 1) % self.config.test_steps == 0:
                self.save_session(step + 1)

    def do_eval_slidingclips(self, save_name):
        """Run evaluation on the proposals and save the corresponding scores
        and offsets to a pickle file in the './eval/test_results' folder.
        """
        test_len_dict = tools.load_length_dict(type='test')
        reg_result_dict = {}
        for k, test_sample in enumerate(self.test_set.test_samples):
            reg_result_dict[k] = []
            if k % 1000 == 0:
                print(str(k) + "/" + str(len(self.test_set.test_samples)))
            movie_name = test_sample[0]
            init_clip_start = test_sample[1]
            init_clip_end = test_sample[2]
            clip_start = init_clip_start
            clip_end = init_clip_end
            final_action_prob = np.zeros([
                (self.config.action_class_num + 1) * 3 * self.config.cas_step])
            if clip_start >= clip_end:
                reg_result_dict[k].append(final_action_prob)
                continue
            for i in range(self.config.cas_step):
                if clip_start >= clip_end:
                    break
                if self.config.feat_type == 'Pool':
                    featmap = dataset.get_pooling_feature(
                        self.test_set.flow_feat_dir, self.test_set.appr_feat_dir,
                        movie_name, clip_start, clip_end,
                        self.config.pool_level, self.config.unit_size,
                        self.config.unit_feature_size, self.config.fusion_type)
                    left_feat = dataset.get_left_context_feature(
                        self.test_set.flow_feat_dir, self.test_set.appr_feat_dir,
                        movie_name, clip_start, clip_end,
                        self.config.ctx_num, self.config.unit_size,
                        self.config.unit_feature_size, self.config.fusion_type)
                    right_feat = dataset.get_right_context_feature(
                        self.test_set.flow_feat_dir, self.test_set.appr_feat_dir,
                        movie_name, clip_start, clip_end,
                        self.config.ctx_num, self.config.unit_size,
                        self.config.unit_feature_size, self.config.fusion_type)
                    feat = np.hstack((left_feat, featmap, right_feat))
                elif self.config.feat_type == 'SSN':
                    feat = dataset.get_SSN_feature(
                        self.test_set.flow_feat_dir, self.test_set.appr_feat_dir,
                        movie_name, clip_start, clip_end,
                        self.config.unit_size, self.config.unit_feature_size,
                        self.config.fusion_type)
                else:
                    feat = dataset.get_BSP_feature(
                        self.test_set.flow_feat_dir, self.test_set.appr_feat_dir,
                        movie_name, clip_start, clip_end,
                        self.config.unit_size, self.config.unit_feature_size,
                        self.config.bsp_level)
                feat = np.reshape(feat, [1, self.config.visual_feature_dim])
                feed_dict = {self.visual_featmap_ph_test: feat}
                outputs = self.sess.run(self.vs_eval_op, feed_dict=feed_dict)
                # action scores (the background score at index 0 is skipped)
                action_score = outputs[1:self.config.action_class_num + 1]
                action_prob = tools.softmax(action_score)
                final_action_prob[i * (self.config.action_class_num + 1) * 3:
                                  (i + 1) * (self.config.action_class_num + 1) * 3] = outputs
                action_cat = np.argmax(action_prob) + 1
                # refine the clip boundaries with the predicted offsets (in units)
                round_reg_start = clip_start + round(
                    outputs[self.config.action_class_num + 1 + action_cat]) * self.config.unit_size
                round_reg_end = clip_end + round(
                    outputs[(self.config.action_class_num + 1) * 2 + action_cat]) * self.config.unit_size
                if (round_reg_start < 0
                        or round_reg_end > test_len_dict[movie_name] - 15
                        or round_reg_start >= round_reg_end):
                    round_reg_end = clip_end
                    round_reg_start = clip_start
                # un-rounded boundaries (not used further here)
                reg_start = clip_start + outputs[
                    self.config.action_class_num + 1 + action_cat] * self.config.unit_size
                reg_end = clip_end + outputs[
                    (self.config.action_class_num + 1) * 2 + action_cat] * self.config.unit_size
                clip_start = round_reg_start
                clip_end = round_reg_end
            reg_result_dict[k].append(final_action_prob)
        pickle.dump(reg_result_dict,
                    open("./eval/test_results/" + save_name + "_outputs.pkl", "wb"))
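
# ---------------------------------------------------------------------------
# A minimal NumPy-only sketch of one cascade step of do_eval_slidingclips
# above: splitting the network output into scores and per-class offsets, then
# refining a clip window. The defaults (action_class_num=2, unit_size=16) and
# the example vector are illustrative assumptions, not values from this code.
import numpy as np

def refine_once(outputs, clip_start, clip_end, action_class_num=2, unit_size=16):
    """One cascade step: pick the best non-background class, apply its offsets."""
    n = action_class_num + 1                 # action classes + background
    scores = outputs[1:n]                    # skip the background score at index 0
    start_offsets = outputs[n:2 * n]         # per-class start offsets (in units)
    end_offsets = outputs[2 * n:3 * n]       # per-class end offsets (in units)
    cat = int(np.argmax(scores)) + 1         # +1 because index 0 is background
    new_start = clip_start + round(start_offsets[cat]) * unit_size
    new_end = clip_end + round(end_offsets[cat]) * unit_size
    return new_start, new_end

# Example: a fake 9-dim output ((2+1)*3) shifts the window [32, 96) by one unit:
#   refine_once(np.array([0.1, 2.0, 0.5, 0.0, -1.0, 0.0, 0.0, 1.0, 0.0]), 32, 96)
#   -> (16, 112)
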
class CTRL_Model(object):

    def __init__(self, batch_size, train_csv_path, test_csv_path,
                 test_visual_feature_dir, train_visual_feature_dir):
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.005
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        self.semantic_size = 1024  # dimensionality of the visual-semantic comparison space
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096 * 3
        self.train_set = TrainingDataSet(train_visual_feature_dir, train_csv_path,
                                         self.batch_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path,
                                       self.test_batch_size)

    def fill_feed_dict_train(self):
        """Used in training the alignment model, CTRL(aln)."""
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    def fill_feed_dict_train_reg(self):
        """Used in training the alignment+regression model, CTRL(reg)."""
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        """Cross-modal processing module: pair every clip with every sentence."""
        # visual features are tiled across rows, sentence embeddings across
        # columns, so cell (i, j) pairs sentence i with clip j
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
                                [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed, [1, batch_size]),
                                [batch_size, batch_size, self.semantic_size])
        concat_feature = tf.reshape(
            tf.concat([vv_feature, ss_feature], 2),
            [batch_size, batch_size, self.semantic_size + self.semantic_size])
        print(concat_feature.get_shape().as_list())
        mul_feature = tf.multiply(vv_feature, ss_feature)
        add_feature = tf.add(vv_feature, ss_feature)
        # stack element-wise product, element-wise sum, and concatenation:
        # (1, batch_size, batch_size, semantic_size * 4)
        comb_feature = tf.reshape(
            tf.concat([mul_feature, add_feature, concat_feature], 2),
            [1, batch_size, batch_size, self.semantic_size * 4])
        return comb_feature
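    # visual_semantic_infer below builds the training and test graphs with
    # shared weights: tf.get_variable_scope().reuse_variables() makes the two
    # fc layers and vs_multilayer reuse the variables created for training.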
    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                              visual_feature_test, sentence_embed_test):
        """Visual-semantic inference: visual-semantic alignment plus clip
        location regression.
        """
        name = "CTRL_Model"
        with tf.variable_scope(name):
            print("Building training network...............................\n")
            transformed_clip_train = fc('v2s_lt', visual_feature_train,
                                        output_dim=self.semantic_size)
            transformed_clip_train_norm = tf.nn.l2_normalize(
                transformed_clip_train, dim=1)
            transformed_sentence_train = fc('s2s_lt', sentence_embed_train,
                                            output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(
                transformed_sentence_train, dim=1)
            cross_modal_vec_train = self.cross_modal_comb(
                transformed_clip_train_norm, transformed_sentence_train_norm,
                self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(
                cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(
                sim_score_mat_train, [self.batch_size, self.batch_size, 3])
            tf.get_variable_scope().reuse_variables()
            print("Building test network...............................\n")
            transformed_clip_test = fc('v2s_lt', visual_feature_test,
                                       output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(
                transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test,
                                           output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(
                transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(
                transformed_clip_test_norm, transformed_sentence_test_norm,
                self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(
                cross_modal_vec_test, "vs_multilayer_lt", reuse=True,
                middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])
        return sim_score_mat_train, sim_score_mat_test

    def compute_loss_reg(self, sim_reg_mat, offset_label):
        """Compute the alignment and regression losses."""
        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
        sim_score_mat = tf.reshape(sim_score_mat, [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # diagonal matrix with -2 on the diagonal
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #            | -1  1  1 ... |
        # mask_mat = |  1 -1  1 ... |
        #            |  1  1 -1 ... |
        mask_mat = tf.add(I_2, all1)
        # alignment loss (does not take IoU into account)
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha,
                                     shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I, batch_para_mat)
        loss_mat = tf.log(
            tf.add(all1, tf.exp(tf.multiply(mask_mat, sim_score_mat))))
        loss_mat = tf.multiply(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
        # regression loss: only the diagonal (matched clip-sentence pairs) counts
        l_reg_diag = tf.matmul(tf.multiply(l_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.multiply(p_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat((p_reg_diag, l_reg_diag), 1)
        loss_reg = tf.reduce_mean(
            tf.abs(tf.subtract(offset_pred, offset_label)))
        loss = tf.add(tf.multiply(self.lambda_regression, loss_reg), loss_align)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        sentence_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        sentence_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.sentence_embedding_size))
        return (visual_featmap_ph_train, sentence_ph_train, offset_ph,
                visual_featmap_ph_test, sentence_ph_test)

    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print("Variables of <" + name + ">")
            for v in v_dict[name]:
                print("    " + v.name)
        return v_dict

    def training(self, loss):
        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op

    def construct_model(self):
        # initialize the placeholders
        (self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph,
         self.visual_featmap_ph_test, self.sentence_ph_test) = self.init_placeholder()
        # build the inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(
            self.visual_featmap_ph_train, self.sentence_ph_train,
            self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute the loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(
            sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return (self.loss_align_reg, self.vs_train_op, sim_reg_mat_test,
                offset_pred, loss_reg)
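
# ---------------------------------------------------------------------------
# A minimal NumPy sketch of the alignment loss assembled in compute_loss_reg
# above, for a toy batch. mask_mat has -1 on the diagonal (aligned
# clip-sentence pairs, whose scores are pushed up) and +1 off the diagonal
# (misaligned pairs, pushed down), yielding a log(1 + exp(+/- score)) logistic
# loss; alpha weights the off-diagonal terms as in __init__.
import numpy as np

def alignment_loss_sketch(sim_score_mat):
    """Illustrative re-computation of loss_align for a (B, B) score matrix."""
    b = sim_score_mat.shape[0]
    alpha = 1.0 / b
    mask_mat = np.ones((b, b)) - 2.0 * np.eye(b)  # -1 diagonal, +1 elsewhere
    para_mat = np.eye(b) + alpha                  # aligned pairs weighted 1 + alpha
    loss_mat = np.log(1.0 + np.exp(mask_mat * sim_score_mat)) * para_mat
    return loss_mat.mean()

# Example: alignment_loss_sketch(np.random.randn(3, 3)) returns a scalar loss.
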
class CBR_Model(object):

    def __init__(self, batch_size, ctx_num, unit_size, unit_feature_size,
                 action_class_num, lr, lambda_reg, train_clip_path,
                 background_path, test_clip_path, train_flow_feature_dir,
                 train_appr_feature_dir, test_flow_feature_dir,
                 test_appr_feature_dir):
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.middle_layer_size = 1000
        self.vs_lr = lr
        self.lambda_reg = lambda_reg
        self.action_class_num = action_class_num
        self.visual_feature_dim = unit_feature_size * 3
        self.train_set = TrainingDataSet(train_flow_feature_dir,
                                         train_appr_feature_dir, train_clip_path,
                                         background_path, batch_size, ctx_num,
                                         unit_size, unit_feature_size,
                                         action_class_num)
        self.test_set = TestingDataSet(test_flow_feature_dir, test_appr_feature_dir,
                                       test_clip_path, self.test_batch_size,
                                       unit_size)

    def fill_feed_dict_train(self):
        image_batch, label_batch, offset_batch, one_hot_label_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch,
            self.one_hot_label_ph: one_hot_label_batch
        }
        return input_feed

    def compute_loss_reg(self, visual_feature, offsets, labels, one_hot_labels):
        cls_reg_vec = vs_multilayer.vs_multilayer(
            visual_feature, "CBR",
            middle_layer_dim=self.middle_layer_size,
            output_layer_dim=(self.action_class_num + 1) * 3)
        cls_reg_vec = tf.reshape(
            cls_reg_vec, [self.batch_size, (self.action_class_num + 1) * 3])
        cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1]
        start_offset_pred = cls_reg_vec[:, self.action_class_num + 1:(self.action_class_num + 1) * 2]
        end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:]
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        pick_start_offset_pred = []
        pick_end_offset_pred = []
        for k in range(self.batch_size):
            pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
            pick_end_offset_pred.append(end_offset_pred[k, labels[k]])
        pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred),
                                            [self.batch_size, 1])
        pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred),
                                          [self.batch_size, 1])
        labels_1 = tf.to_float(tf.not_equal(labels, 0))
        label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        offset_pred = tf.concat((pick_start_offset_pred, pick_end_offset_pred), 1)
        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), label_for_reg))
        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        one_hot_label_ph = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.action_class_num + 1))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        return (visual_featmap_ph_train, visual_featmap_ph_test, label_ph,
                offset_ph, one_hot_label_ph)

    def eval(self, visual_feature_test):
        sim_score = vs_multilayer.vs_multilayer(
            visual_feature_test, "CBR",
            middle_layer_dim=self.middle_layer_size,
            output_layer_dim=(self.action_class_num + 1) * 3,
            dropout=False, reuse=True)
        sim_score = tf.reshape(sim_score, [(self.action_class_num + 1) * 3])
        return sim_score
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name:
                    v_dict[name].append(v)
        for name in name_list:
            print("Variables of <" + name + ">")
            for v in v_dict[name]:
                print("    " + v.name)
        return v_dict

    def training(self, loss):
        v_dict = self.get_variables_by_name(["CBR"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["CBR"])
        return vs_train_op

    def construct_model(self):
        # construct the network
        (self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph,
         self.offset_ph, self.one_hot_label_ph) = self.init_placeholder()
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_test, dim=1)
        self.loss, loss_reg = self.compute_loss_reg(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph,
            self.one_hot_label_ph)
        self.vs_train_op = self.training(self.loss)
        vs_eval_op = self.eval(visual_featmap_ph_test_norm)
        return self.loss, self.vs_train_op, vs_eval_op, loss_reg
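
# ---------------------------------------------------------------------------
# A minimal NumPy sketch of the regression-loss masking used by both CBR_Model
# variants above: each sample contributes the offset predictions of its own
# ground-truth class, and background samples (label 0) are masked out. All
# names and shapes here are toy assumptions for illustration.
import numpy as np

def masked_regression_loss_sketch(start_pred, end_pred, labels, offsets):
    """start_pred/end_pred: (B, C); labels: (B,); offsets: (B, 2)."""
    b = labels.shape[0]
    # pick the predicted (start, end) offsets of each sample's own class
    offset_pred = np.stack([start_pred[np.arange(b), labels],
                            end_pred[np.arange(b), labels]], axis=1)
    mask = (labels != 0).astype(np.float32)[:, None]  # 1 foreground, 0 background
    return np.mean(np.abs(offset_pred - offsets) * mask)

# Example with 4 samples and 3 classes (class 0 = background):
#   masked_regression_loss_sketch(np.random.randn(4, 3), np.random.randn(4, 3),
#                                 np.array([1, 0, 2, 1]), np.random.randn(4, 2))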