Example #1
 def __init__(self, batch_size, train_video_length_info, unit_feature_size, unit_size, lambda_reg, lr, train_clip_path, test_clip_path, train_flow_feature_dir, train_appr_feature_dir, test_flow_feature_dir, test_appr_feature_dir):
     
     self.batch_size = batch_size
     self.test_batch_size = 1
     self.lr = lr
     self.lambda_reg = lambda_reg
     self.unit_feature_size = unit_feature_size  # 4096
     self.visual_feature_dim = unit_feature_size # 4096
     self.train_set = TrainingDataSet(train_flow_feature_dir,train_appr_feature_dir,train_clip_path, batch_size, train_video_length_info,unit_feature_size,unit_size)
     self.test_set = TestingDataSet(test_flow_feature_dir, test_appr_feature_dir, test_clip_path, self.test_batch_size)
Example #2
 def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, train_visual_feature_dir):
     
     self.batch_size = batch_size
     self.test_batch_size = 1
     self.vs_lr = 0.005
     self.lambda_regression = 0.01
     self.alpha = 1.0/batch_size
     self.semantic_size = 1024 # the dimensionality of the visual-semantic comparison space
     self.sentence_embedding_size = 4800
     self.visual_feature_dim = 4096*3
     self.train_set=TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)
     self.test_set=TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)
Example #3
    def __init__(self, config):
        """Initialization
        """
        self.config = config

        self.sess = None
        self.saver = None

        self.train_clip_path = self.config.train_clip_path
        self.background_path = self.config.background_path
        self.test_clip_path = self.config.test_clip_path
        self.train_flow_feature_dir = self.config.train_flow_feature_dir
        self.train_appr_feature_dir = self.config.train_appr_feature_dir
        self.test_flow_feature_dir = self.config.test_flow_feature_dir
        self.test_appr_feature_dir = self.config.test_appr_feature_dir
        self.test_len_dict = self.config.test_len_dict

        self.batch_size = self.config.batch_size

        self.test_batch_size = 1
        self.middle_layer_size = 1000

        self.lambda_reg = float(self.config.lambda_reg)
        self.action_class_num = self.config.action_class_num
        self.feat_type = self.config.feat_type
        self.visual_feature_dim = self.config.visual_feature_dim

        # Initialize the training data and testing data
        self.train_set = TrainingDataSet(self.config,
                                         self.train_flow_feature_dir,
                                         self.train_appr_feature_dir,
                                         self.train_clip_path,
                                         self.background_path)
        self.test_set = TestingDataSet(self.config, self.test_flow_feature_dir,
                                       self.test_appr_feature_dir,
                                       self.test_clip_path,
                                       self.test_batch_size,
                                       self.test_len_dict)

        # Path to save the summary of the models
        self.summary_dir = os.path.join('./summary', self.config.save_name)

        if not os.path.exists(self.summary_dir):
            os.mkdir(self.summary_dir)

        if self.config.issave == 'Yes':
            self.model_dir = os.path.join('./model', self.config.save_name)

            if not os.path.exists(self.model_dir):
                os.mkdir(self.model_dir)
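One caveat in the directory handling above: os.mkdir only creates the final path component, so it raises if ./summary or ./model does not already exist. A defensive variant (my suggestion, not part of the original code) would be:

    import os

    def ensure_dir(path):
        # create intermediate directories as needed; no error if it already exists
        if not os.path.exists(path):
            os.makedirs(path)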
Example #4
    def __init__(self, batch_size, train_video_length_info, ctx_num,
                 unit_feature_size, unit_size, lambda_reg, lr, train_clip_path,
                 background_path, test_clip_path, train_visual_feature_dir,
                 test_visual_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.lr = lr
        self.lambda_reg = lambda_reg
        self.unit_feature_size = unit_feature_size
        self.visual_feature_dim = unit_feature_size * 3
        self.train_set = TrainingDataSet(train_visual_feature_dir,
                                         train_clip_path, background_path,
                                         batch_size, train_video_length_info,
                                         ctx_num, unit_feature_size, unit_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_clip_path,
                                       self.test_batch_size, ctx_num)
Example #5
    def __init__(self, batch_size, pool_size, train_csv_path, test_csv_path,
                 test_visual_feature_dir, train_visual_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.0001
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        #self.alpha=0.06
        self.pool_size = pool_size
        self.semantic_size = 1024
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096
        self.train_set = TrainingDataSet(train_visual_feature_dir,
                                         train_csv_path, self.batch_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path,
                                       self.test_batch_size)
        self.context_num = 1
Example #6
 def __init__(self, batch_size,train_video_length_info,ctx_num,unit_feature_size,unit_size,lambda_reg,lr,train_clip_path,background_path,test_clip_path,train_visual_feature_dir,test_visual_feature_dir):
     
     self.batch_size = batch_size
     self.test_batch_size=1
     self.lr=lr
     self.lambda_reg=lambda_reg
     self.unit_feature_size=unit_feature_size
     self.visual_feature_dim=unit_feature_size*3
     self.train_set=TrainingDataSet(train_visual_feature_dir,train_clip_path,background_path,batch_size, train_video_length_info,ctx_num,unit_feature_size,unit_size)
     self.test_set=TestingDataSet(test_visual_feature_dir,test_clip_path,self.test_batch_size,ctx_num)
Example #7
    def __init__(self, batch_size, ctx_num, unit_size, unit_feature_size,
                 action_class_num, lr, lambda_reg, train_clip_path,
                 background_path, test_clip_path, train_flow_feature_dir,
                 train_appr_feature_dir, test_flow_feature_dir,
                 test_appr_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1  # batch is 1 clip at test time; 128 clips at training time
        self.middle_layer_size = 1000
        self.vs_lr = lr
        self.lambda_reg = lambda_reg  # 1.0
        self.action_class_num = action_class_num  # 20
        self.visual_feature_dim = unit_feature_size * 3  # 4096*3
        self.train_set = TrainingDataSet(train_flow_feature_dir,
                                         train_appr_feature_dir,
                                         train_clip_path, background_path,
                                         batch_size, ctx_num, unit_size,
                                         unit_feature_size, action_class_num)
        self.test_set = TestingDataSet(test_flow_feature_dir,
                                       test_appr_feature_dir, test_clip_path,
                                       self.test_batch_size, unit_size)
Example #9
    def __init__(self, batch_size, train_csv_path, test_csv_path,
                 test_visual_feature_dir, sliding_dir,
                 sliding_training_sample_file, test_clip_sentence_pairs_path,
                 test_swin_txt_path, train_softmax_dir, test_softmax_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.005
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        self.semantic_size = 1024  # the dimensionality of the visual-semantic comparison space
        self.action_semantic_size = 300
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096 * 3

        self.train_set = TrainingDataSet(sliding_dir,
                                         sliding_training_sample_file,
                                         train_csv_path, batch_size,
                                         train_softmax_dir)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path,
                                       self.test_batch_size,
                                       test_swin_txt_path, test_softmax_dir,
                                       test_clip_sentence_pairs_path)
Example #10
class CTRL_Model(object):
    def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, train_visual_feature_dir):
        
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.005
        self.lambda_regression = 0.01
        self.alpha = 1.0/batch_size
        self.semantic_size = 1024 # the dimensionality of the visual-semantic comparison space
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096*3
        self.train_set=TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)
        self.test_set=TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)
   
    '''
    used in training alignment model, CTRL(aln)
    '''	
    def fill_feed_dict_train(self):
        image_batch,sentence_batch,offset_batch = self.train_set.next_batch()
        input_feed = {
                self.visual_featmap_ph_train: image_batch,
                self.sentence_ph_train: sentence_batch,
                self.offset_ph: offset_batch
        }

        return input_feed
    
    '''
    used in training alignment+regression model, CTRL(reg)
    '''
    def fill_feed_dict_train_reg(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()
        input_feed = {
                self.visual_featmap_ph_train: image_batch,
                self.sentence_ph_train: sentence_batch,
                self.offset_ph: offset_batch
        }

        return input_feed

    
    '''
    cross modal processing module
    '''
    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
            [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed,[1, batch_size]),[batch_size, batch_size, self.semantic_size])
        concat_feature = tf.reshape(tf.concat(2,[vv_feature, ss_feature]),[batch_size, batch_size, self.semantic_size+self.semantic_size])
        print concat_feature.get_shape().as_list()
        mul_feature = tf.mul(vv_feature, ss_feature) 
        add_feature = tf.add(vv_feature, ss_feature)
        
        comb_feature = tf.reshape(tf.concat(2, [mul_feature, add_feature, concat_feature]),[1, batch_size, batch_size, self.semantic_size*4])
        return comb_feature
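The combination above pairs every clip with every sentence and describes each pair three ways: element-wise product, element-wise sum, and concatenation. As a sanity check, here is a minimal NumPy restatement of the same tiling and fusion (my own sketch, not part of the original repo), confirming the output shape [1, B, B, 4*semantic_size]:

    import numpy as np

    def cross_modal_comb_np(visual_feat, sentence_embed):
        # visual_feat, sentence_embed: [B, d] with d = semantic_size
        B, d = visual_feat.shape
        vv = np.tile(visual_feat, (B, 1)).reshape(B, B, d)     # vv[i, j] = clip j
        ss = np.tile(sentence_embed, (1, B)).reshape(B, B, d)  # ss[i, j] = sentence i
        concat = np.concatenate([vv, ss], axis=2)              # [B, B, 2d]
        mul = vv * ss                                          # element-wise product
        add = vv + ss                                          # element-wise sum
        return np.concatenate([mul, add, concat], axis=2)[None]  # [1, B, B, 4d]

    comb = cross_modal_comb_np(np.random.rand(4, 8), np.random.rand(4, 8))
    assert comb.shape == (1, 4, 4, 32)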
    
    '''
    visual semantic inference, including visual semantic alignment and clip location regression
    '''
    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test):
        name="CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"     
            transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size) 
            transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)  
            cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n" 
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train, sim_score_mat_test

    '''
    compute alignment and regression loss
    '''
    def compute_loss_reg(self, sim_reg_mat, offset_label):

        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(2, 3, sim_reg_mat)
        sim_score_mat = tf.reshape(sim_score_mat, [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # diagonal matrix with -2 on the diagonal
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #               | -1   1   1 ... |
        #   mask_mat =  |  1  -1   1 ... |
        #               |  1   1  -1 ... |
        mask_mat = tf.add(I_2, all1)
        # loss cls, not considering iou
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha, shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I,batch_para_mat)
        loss_mat = tf.log(tf.add(all1, tf.exp(tf.mul(mask_mat, sim_score_mat))))
        loss_mat = tf.mul(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
        # regression loss
        l_reg_diag = tf.matmul(tf.mul(l_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.mul(p_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat(1, (p_reg_diag, l_reg_diag))
        loss_reg = tf.reduce_mean(tf.abs(tf.sub(offset_pred, offset_label)))

        loss=tf.add(tf.mul(self.lambda_regression, loss_reg), loss_align)
        return loss, offset_pred, loss_reg
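Written out, the alignment term is a weighted logistic loss over the B x B score matrix: matched (diagonal) pairs are pushed toward high scores with weight 1 + alpha, mismatched pairs toward low scores with weight alpha. A hedged NumPy restatement of just that term (function name is mine):

    import numpy as np

    def alignment_loss_np(sim_score_mat, alpha):
        B = sim_score_mat.shape[0]
        mask = np.ones((B, B)) - 2.0 * np.eye(B)   # -1 on the diagonal, +1 elsewhere
        weight = alpha + np.eye(B)                 # 1+alpha diagonal, alpha off-diagonal
        return np.mean(weight * np.log1p(np.exp(mask * sim_score_mat)))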


    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        sentence_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size,2))
        visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        sentence_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.sentence_embedding_size))

        return visual_featmap_ph_train,sentence_ph_train,offset_ph,visual_featmap_ph_test,sentence_ph_test
    

    def get_variables_by_name(self,name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <"+name+">"
            for v in v_dict[name]:
                print "    "+v.name
        return v_dict

    def training(self, loss):
        
        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op


    def construct_model(self):
        # initialize the placeholder
        self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, self.visual_featmap_ph_test, self.sentence_ph_test=self.init_placeholder()
        # build inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(self.visual_featmap_ph_train, self.sentence_ph_train, self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg
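A minimal driver sketch for this class, written against the same TF 0.x-era API the example uses; the constructor arguments and max_steps are assumed config values, and fc / vs_multilayer must be importable from the surrounding repo:

    model = CTRL_Model(batch_size, train_csv_path, test_csv_path,
                       test_visual_feature_dir, train_visual_feature_dir)
    loss_op, train_op, eval_op, offset_pred_op, loss_reg_op = model.construct_model()
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())  # TF 0.x-era initializer
        for step in range(max_steps):            # max_steps: assumed config value
            feed = model.fill_feed_dict_train_reg()
            _, loss_val = sess.run([train_op, loss_op], feed_dict=feed)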
Example #11
class CBR_Model(object):
    def __init__(self, batch_size, ctx_num, unit_size, unit_feature_size,
                 action_class_num, lr, lambda_reg, train_clip_path,
                 background_path, test_clip_path, train_flow_feature_dir,
                 train_appr_feature_dir, test_flow_feature_dir,
                 test_appr_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1  # batch is 1 clip at test time; 128 clips at training time
        self.middle_layer_size = 1000
        self.vs_lr = lr
        self.lambda_reg = lambda_reg  # 1.0
        self.action_class_num = action_class_num  # 20
        self.visual_feature_dim = unit_feature_size * 3  # 4096*3
        self.train_set = TrainingDataSet(train_flow_feature_dir,
                                         train_appr_feature_dir,
                                         train_clip_path, background_path,
                                         batch_size, ctx_num, unit_size,
                                         unit_feature_size, action_class_num)
        self.test_set = TestingDataSet(test_flow_feature_dir,
                                       test_appr_feature_dir, test_clip_path,
                                       self.test_batch_size, unit_size)

    def fill_feed_dict_train(self):
        image_batch, label_batch, offset_batch, one_hot_label_batch = self.train_set.next_batch(
        )
        input_feed = {
            self.visual_featmap_ph_train: image_batch,  # the loaded unit features together with their context features
            self.label_ph: label_batch,
            self.offset_ph: offset_batch,
            self.one_hot_label_ph: one_hot_label_batch
        }
        return input_feed

    # Compute the total loss and the regression loss.
    # vs_multilayer consists of two fully connected layers; it takes the
    # clip's features as input and produces the outputs used for
    # classification and regression.
    def compute_loss_reg(self, visual_feature, offsets, labels,
                         one_hot_labels):

        cls_reg_vec = vs_multilayer.vs_multilayer(
            visual_feature,
            "CBR",
            middle_layer_dim=self.middle_layer_size,
            output_layer_dim=(self.action_class_num + 1) * 3)
        cls_reg_vec = tf.reshape(
            cls_reg_vec, [self.batch_size,
                          (self.action_class_num + 1) * 3])  # [128,21*3]
        cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1]
        start_offset_pred = cls_reg_vec[:, self.action_class_num +
                                        1:(self.action_class_num + 1) * 2]
        end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:]

        #classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            cls_score_vec, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)

        # regression loss
        pick_start_offset_pred = []
        pick_end_offset_pred = []
        # Pick, for the k-th sample, the regression values belonging to its own
        # class; see the regression computation in the paper.
        for k in range(self.batch_size):  # pick the k-th sample's predicted offsets
            pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
            pick_end_offset_pred.append(end_offset_pred[k, labels[k]])
        pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred),
                                            [self.batch_size, 1])
        pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred),
                                          [self.batch_size, 1])
        labels_1 = tf.to_float(tf.not_equal(
            labels, 0))  # mask for non-background samples; labels stores each sample's class index
        label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
        label_for_reg = tf.concat(1, [label_tmp, label_tmp])  # concatenate along axis 1 -> [128,2]
        offset_pred = tf.concat(
            1, (pick_start_offset_pred, pick_end_offset_pred))  # [128,2]
        loss_reg = tf.reduce_mean(
            tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))

        loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
        return loss, loss_reg
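For reference, the layout assumed by the slicing above: with K action classes the head emits (K + 1) * 3 numbers per clip -- (K + 1) class logits (including background), then (K + 1) per-class start offsets, then (K + 1) per-class end offsets. A small NumPy illustration of that layout (mine, not from the repo):

    import numpy as np

    K = 20                                   # action_class_num in this example
    vec = np.arange((K + 1) * 3)             # one clip's raw head output
    logits        = vec[:K + 1]
    start_offsets = vec[K + 1:(K + 1) * 2]
    end_offsets   = vec[(K + 1) * 2:]
    assert len(logits) == len(start_offsets) == len(end_offsets) == K + 1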

    # create the placeholders
    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32,
            shape=(self.batch_size, self.visual_feature_dim))  # (128,12288)
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))  # (128,)
        offset_ph = tf.placeholder(tf.float32,
                                   shape=(self.batch_size, 2))  # (128,2)
        one_hot_label_ph = tf.placeholder(
            tf.float32,
            shape=(self.batch_size, self.action_class_num + 1))  # (128,21)
        visual_featmap_ph_test = tf.placeholder(
            tf.float32,
            shape=(self.test_batch_size, self.visual_feature_dim))  # (1,12288)

        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph, one_hot_label_ph

    # evaluation: produce the raw output scores
    def eval(self, visual_feature_test):
        sim_score = vs_multilayer.vs_multilayer(
            visual_feature_test,
            "CBR",
            middle_layer_dim=self.middle_layer_size,
            output_layer_dim=(self.action_class_num + 1) * 3,
            dropout=False,
            reuse=True)
        sim_score = tf.reshape(sim_score, [(self.action_class_num + 1) * 3])
        return sim_score

    def get_variables_by_name(self, name_list):  # name_list : ['CBR']
        v_list = tf.trainable_variables()  # tf.trainable_variables returns the list of variables to be trained
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        # iterate over the variables and add those whose name matches an entry of name_list to v_dict
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    # Training: first fetch the trainable variables, then build the optimizer,
    # and finally minimize the loss.
    def training(self, loss):
        v_dict = self.get_variables_by_name(["CBR"])  # fetch the list of trainable variables
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["CBR"])
        return vs_train_op

    def construct_model(self):
        #construct the network:
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph, self.offset_ph, self.one_hot_label_ph = self.init_placeholder(
        )
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_train, dim=1)  # L2-normalize each row
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_test, dim=1)
        # compute the classification and regression losses; return the total loss and the regression loss
        self.loss, loss_reg = self.compute_loss_reg(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph,
            self.one_hot_label_ph)
        self.vs_train_op = self.training(self.loss)
        vs_eval_op = self.eval(visual_featmap_ph_test_norm)
        return self.loss, self.vs_train_op, vs_eval_op, loss_reg  # return the total loss, the train op, the eval op, and the regression loss
Example #12
class PATE_Model(object):
    def __init__(self, batch_size, train_video_length_info, unit_feature_size,
                 unit_size, lambda_reg, lr, train_clip_path, test_clip_path,
                 train_flow_feature_dir, train_appr_feature_dir,
                 test_flow_feature_dir, test_appr_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.lr = lr
        self.lambda_reg = lambda_reg
        self.unit_feature_size = unit_feature_size
        self.visual_feature_dim = unit_feature_size
        self.train_set = TrainingDataSet(train_flow_feature_dir,
                                         train_appr_feature_dir,
                                         train_clip_path, batch_size,
                                         train_video_length_info,
                                         unit_feature_size, unit_size)
        self.test_set = TestingDataSet(test_flow_feature_dir,
                                       test_appr_feature_dir, test_clip_path,
                                       self.test_batch_size)

    def fill_feed_dict_train_reg(self):
        image_batch, label_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch
        }

        return input_feed

    # construct the top network and compute loss
    def compute_loss(self, visual_feature, labels):

        cls_vec = vs_multilayer.vs_multilayer(visual_feature,
                                              "PATE",
                                              middle_layer_dim=1000)
        cls_vec = tf.reshape(cls_vec, [self.batch_size, 2])

        #classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            cls_vec, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        return loss_cls
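The PATE head is a plain binary classifier over proposal features: two logits per clip, trained with sparse softmax cross-entropy. At test time the two values returned by eval can be turned into an action probability with a softmax; a hedged sketch (assuming index 1 is the positive class, which the source does not state):

    import numpy as np

    def pate_action_prob(outputs):
        # outputs: the 2-vector returned by eval(); index 1 = "action" logit (assumed)
        e = np.exp(outputs - np.max(outputs))   # numerically stable softmax
        return (e / e.sum())[1]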

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))

        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph

    # set up the eval op
    def eval(self, visual_feature_test):
        outputs = vs_multilayer.vs_multilayer(visual_feature_test,
                                              "PATE",
                                              middle_layer_dim=1000,
                                              reuse=True)
        outputs = tf.reshape(outputs, [2])
        return outputs

    # return all the variables that contains the name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["PATE"])
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["PATE"])
        return vs_train_op

    # construct the network
    def construct_model(self):
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph = self.init_placeholder(
        )
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_test, dim=1)
        self.loss_cls = self.compute_loss(visual_featmap_ph_train_norm,
                                          self.label_ph)
        self.train_op = self.training(self.loss_cls)
        eval_op = self.eval(visual_featmap_ph_test_norm)
        return self.loss_cls, self.train_op, eval_op
Example #13
 def __init__(self, ):
     self.train_set = TrainingDataSet(self.batch_size)
     self.val_set = ValidationDataSet()
Example #14
 def __init__(self, ):
     self.train_set = TrainingDataSet(self.batch_size)
     self.test_set = TestingDataSet()
Example #15
class TURN_Model(object):
    def __init__(self, batch_size, train_video_length_info, ctx_num,
                 unit_feature_size, unit_size, lambda_reg, lr, train_clip_path,
                 background_path, test_clip_path, train_visual_feature_dir,
                 test_visual_feature_dir):

        self.batch_size = batch_size  # 128
        self.test_batch_size = 1
        self.lr = lr
        self.lambda_reg = lambda_reg  # 2.0
        self.unit_feature_size = unit_feature_size  # 2048
        self.visual_feature_dim = unit_feature_size * 3  # 2048*3 = 6144
        # mainly loads the features of the corresponding clips
        self.train_set = TrainingDataSet(train_visual_feature_dir,
                                         train_clip_path, background_path,
                                         batch_size, train_video_length_info,
                                         ctx_num, unit_feature_size, unit_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_clip_path,
                                       self.test_batch_size, ctx_num)

    # Loads the data needed for training:
    # image_batch holds the features of one batch (128 clips),
    # label_batch holds the labels of the batch, and
    # offset_batch holds the precomputed ground-truth coordinate offsets.
    def fill_feed_dict_train_reg(self):
        image_batch, label_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch  # ground-truth offsets
        }

        return input_feed  # a dict holding one batch of features, labels and coordinate offsets

    # construct the top network and compute loss
    def compute_loss_reg(self, visual_feature, offsets, labels):

        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature,
                                                  "APN",
                                                  middle_layer_dim=1000)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])  # [128,4]
        """
        cls_score_vec_0 : (128,1)
        cls_score_vec_1 : (128,1)
        p_reg_vec : (128,1)
        l_reg_vec : (128,1)
        
        """
        # split the head output into classification and regression vectors and regroup them
        cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(
            1, 4, cls_reg_vec)
        cls_score_vec = tf.concat(1, (cls_score_vec_0, cls_score_vec_1))
        offset_pred = tf.concat(1, (p_reg_vec, l_reg_vec))

        #classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            cls_score_vec, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat(1, [label_tmp, label_tmp])

        # offset_pred holds the coordinate offsets predicted by the final
        # fully connected layer; offsets holds the ground-truth offsets
        loss_reg = tf.reduce_mean(
            tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))

        loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg
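The regression term only counts foreground clips: labels is 0/1, so tiling it into label_for_reg zeroes out the L1 offset error of background samples. A hedged NumPy restatement (my own names):

    import numpy as np

    def turn_reg_loss_np(offset_pred, offsets, labels):
        # offset_pred, offsets: [B, 2]; labels: [B] with 1 = foreground
        mask = labels.astype(np.float32).reshape(-1, 1)
        return np.mean(np.abs(offset_pred - offsets) * np.concatenate([mask, mask], axis=1))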

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))

        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph

    # set up the eval op
    def eval(self, visual_feature_test):
        #visual_feature_test=tf.reshape(visual_feature_test,[1,4096])
        outputs = vs_multilayer.vs_multilayer(visual_feature_test,
                                              "APN",
                                              middle_layer_dim=1000,
                                              reuse=True)
        outputs = tf.reshape(outputs, [4])
        return outputs

    # return all the variables that contains the name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["APN"])  # fetch the trainable parameters
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["APN"])
        return vs_train_op

    # construct the network
    def construct_model(self):
        """
        self.visual_featmap_ph_train : (128,6144)
        self.visual_featmap_ph_test : (1,6144)
        self.label_ph : (128,1)
        self.offset_ph : (128,2)
        """
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph, self.offset_ph = self.init_placeholder(
        )
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_test, dim=1)

        # compute the total loss, the regression loss and the predicted offsets
        self.loss_cls_reg, offset_pred, loss_reg = self.compute_loss_reg(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph)
        self.train_op = self.training(self.loss_cls_reg)  # the train op minimizes the total loss
        eval_op = self.eval(visual_featmap_ph_test_norm)  # FC-layer outputs, used for later testing
        return self.loss_cls_reg, self.train_op, eval_op, loss_reg
Example #16
initial_steps = 0
max_steps = 20000
batch_size = 64
train_csv_path = "/home/wam/Action_Recognition/TACoS/train_clip-sentvec.pkl"
test_csv_path = "/home/wam/Action_Recognition/TACoS/test_clip-sentvec.pkl"
test_feature_dir="/home/wam/Action_Recognition/Interval128_256_overlap0.8_c3d_fc6/"
train_feature_dir = "/home/wam/Action_Recognition/Interval64_128_256_512_overlap0.8_c3d_fc6/"

test_batch_size = 1
vs_lr = 0.001
lambda_regression = 0.01
alpha = 1.0/batch_size
semantic_size = 1024 # the dimensionality of the visual-semantic comparison space
sentence_embedding_size = 4800
visual_feature_dim = 4096*3
train_set=TrainingDataSet(train_feature_dir, train_csv_path, batch_size)
test_set=TestingDataSet(test_feature_dir, test_csv_path, test_batch_size)

def compute_loss_reg(sim_reg_mat, offset_label):
    sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
    sim_score_mat = tf.reshape(sim_score_mat, [batch_size, batch_size])
    l_reg_mat = tf.reshape(l_reg_mat, [batch_size, batch_size])
    p_reg_mat = tf.reshape(p_reg_mat, [batch_size, batch_size])
    # diagonal matrix with -2 on the diagonal
    I_2 = tf.diag(tf.constant(-2.0, shape=[batch_size]))
    all1 = tf.constant(1.0, shape=[batch_size, batch_size])
    mask_mat = tf.add(I_2, all1)
    # loss cls, not considering iou
    I = tf.diag(tf.constant(1.0, shape=[batch_size]))
    I_half = tf.diag(tf.constant(0.5, shape=[batch_size]))
    batch_para_mat = tf.constant(alpha, shape=[batch_size, batch_size])
Example #17
class TURN_Model(object):
    def __init__(self, batch_size,train_video_length_info,ctx_num,unit_feature_size,unit_size,lambda_reg,lr,train_clip_path,background_path,test_clip_path,train_visual_feature_dir,test_visual_feature_dir):
        
        self.batch_size = batch_size
        self.test_batch_size=1
        self.lr=lr
        self.lambda_reg=lambda_reg
        self.unit_feature_size=unit_feature_size
        self.visual_feature_dim=unit_feature_size*3
        self.train_set=TrainingDataSet(train_visual_feature_dir,train_clip_path,background_path,batch_size, train_video_length_info,ctx_num,unit_feature_size,unit_size)
        self.test_set=TestingDataSet(test_visual_feature_dir,test_clip_path,self.test_batch_size,ctx_num)
   
    	    
    def fill_feed_dict_train_reg(self):
        image_batch,label_batch,offset_batch=self.train_set.next_batch()
        input_feed = {
                self.visual_featmap_ph_train: image_batch,
                self.label_ph: label_batch,
                self.offset_ph: offset_batch
        }

        return input_feed
            
    # construct the top network and compute loss
    def compute_loss_reg(self,visual_feature,offsets,labels):

        cls_reg_vec=vs_multilayer.vs_multilayer(visual_feature,"APN",middle_layer_dim=1000)
        cls_reg_vec=tf.reshape(cls_reg_vec,[self.batch_size,4])
        cls_score_vec_0,cls_score_vec_1,p_reg_vec,l_reg_vec=tf.split(1,4,cls_reg_vec)
        cls_score_vec=tf.concat(1,(cls_score_vec_0,cls_score_vec_1))
        offset_pred=tf.concat(1,(p_reg_vec,l_reg_vec))

        #classification loss
        loss_cls_vec=tf.nn.sparse_softmax_cross_entropy_with_logits(cls_score_vec, labels)
        loss_cls=tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp=tf.to_float(tf.reshape(labels,[self.batch_size,1]))
        label_for_reg=tf.concat(1,[label_tmp,label_tmp])
        loss_reg=tf.reduce_mean(tf.mul(tf.abs(tf.sub(offset_pred,offsets)),label_for_reg))

        loss=tf.add(tf.mul(self.lambda_reg,loss_reg),loss_cls)
        return loss,offset_pred,loss_reg


    def init_placeholder(self):
        visual_featmap_ph_train=tf.placeholder(tf.float32, shape=(self.batch_size,self.visual_feature_dim))
        label_ph=tf.placeholder(tf.int32, shape=(self.batch_size))
        offset_ph=tf.placeholder(tf.float32, shape=(self.batch_size,2))
        visual_featmap_ph_test=tf.placeholder(tf.float32, shape=(self.test_batch_size,self.visual_feature_dim))

        return visual_featmap_ph_train,visual_featmap_ph_test,label_ph,offset_ph
    

    # set up the eval op
    def eval(self,visual_feature_test):
        #visual_feature_test=tf.reshape(visual_feature_test,[1,4096]) 
        outputs=vs_multilayer.vs_multilayer(visual_feature_test,"APN",middle_layer_dim=1000,reuse=True)
        outputs=tf.reshape(outputs,[4])
        return outputs

    # return all the variables that contains the name in name_list
    def get_variables_by_name(self,name_list):
        v_list=tf.trainable_variables()
        v_dict={}
        for name in name_list:
            v_dict[name]=[]
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <"+name+">"
            for v in v_dict[name]:
                print "    "+v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict=self.get_variables_by_name(["APN"])
        vs_optimizer=tf.train.AdamOptimizer(self.lr,name='vs_adam')
        vs_train_op=vs_optimizer.minimize(loss,var_list=v_dict["APN"])
        return vs_train_op

    # construct the network
    def construct_model(self):
        self.visual_featmap_ph_train,self.visual_featmap_ph_test,self.label_ph,self.offset_ph=self.init_placeholder()
        visual_featmap_ph_train_norm=tf.nn.l2_normalize(self.visual_featmap_ph_train,dim=1)
        visual_featmap_ph_test_norm=tf.nn.l2_normalize(self.visual_featmap_ph_test,dim=1)
        self.loss_cls_reg,offset_pred,loss_reg=self.compute_loss_reg(visual_featmap_ph_train_norm,self.offset_ph,self.label_ph)
        self.train_op=self.training(self.loss_cls_reg)
        eval_op=self.eval(visual_featmap_ph_test_norm)
        return self.loss_cls_reg,self.train_op, eval_op,loss_reg
Example #18
class TURN_Model(object):
    ctx_num = 4
    unit_size = 16.0
    unit_feature_size = 2048
    lr = 0.005
    lambda_reg = 2.0
    batch_size = 128
    test_batch_size = 1
    visual_feature_dim = unit_feature_size * 3

    def __init__(self, ):
        self.train_set = TrainingDataSet(self.batch_size)
        self.test_set = TestingDataSet()

    def fill_feed_dict_train_reg(self):
        image_batch, label_batch, offset_batch = self.train_set.next_batch()
        image_batch = np.nan_to_num(image_batch)
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch
        }
        return input_feed

    # construct the top network and compute loss
    def compute_loss_reg(self, visual_feature, offsets, labels, test=False):

        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature,
                                                  "APN",
                                                  middle_layer_dim=1000,
                                                  test=test)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])
        cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(
            cls_reg_vec, 4, 1)
        cls_score_vec = tf.concat((cls_score_vec_0, cls_score_vec_1), 1)
        offset_pred = tf.concat((p_reg_vec, l_reg_vec), 1)
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)),
                        label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg
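Note that this example is the same TURN head migrated to the TF 1.x API; compared with the older examples above, the argument order of several ops flipped and some names changed. A side-by-side sketch of the spellings involved:

    # TF 0.x (earlier examples)        TF 1.x (this example)
    # tf.split(1, 4, x)            ->  tf.split(x, 4, 1)
    # tf.concat(1, [a, b])         ->  tf.concat([a, b], 1)
    # tf.mul(a, b) / tf.sub(a, b)  ->  tf.multiply(a, b) / tf.subtract(a, b)
    # tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
    #                              ->  ...(logits=logits, labels=labels)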

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        # print(visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph)
        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph

    # set up the eval op
    def eval(self, visual_feature_test):
        # visual_feature_test=tf.reshape(visual_feature_test,[1,4096])
        outputs = vs_multilayer.vs_multilayer(visual_feature_test,
                                              "APN",
                                              middle_layer_dim=1000,
                                              reuse=True)
        outputs = tf.reshape(outputs, [4])
        return outputs

    # return all the variables that contains the name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["APN"])
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["APN"])
        return vs_train_op

    # construct the network
    def construct_model(self, test=False):
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph, self.offset_ph = self.init_placeholder(
        )
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_test, dim=1)
        self.loss_cls_reg, offset_pred, loss_reg = self.compute_loss_reg(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph, test)
        self.train_op = self.training(self.loss_cls_reg)
        eval_op = self.eval(visual_featmap_ph_test_norm)
        return self.loss_cls_reg, self.train_op, eval_op, loss_reg
Example #19
class TAR_Model(object):
    def __init__(self, batch_size, train_video_length_info, ctx_num,
                 central_num, unit_feature_size, unit_size, lambda_reg, lr,
                 train_clip_path, background_path, test_clip_path,
                 train_flow_feature_dir, train_appr_feature_dir,
                 test_flow_feature_dir, test_appr_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.lr = lr
        self.lambda_reg = lambda_reg
        self.unit_feature_size = unit_feature_size
        self.visual_feature_dim = unit_feature_size
        self.train_set = TrainingDataSet(train_flow_feature_dir,
                                         train_appr_feature_dir,
                                         train_clip_path, background_path,
                                         batch_size, train_video_length_info,
                                         ctx_num, central_num,
                                         unit_feature_size, unit_size)
        self.test_set = TestingDataSet(test_flow_feature_dir,
                                       test_appr_feature_dir, test_clip_path,
                                       self.test_batch_size, ctx_num)
        self.ctx_num = ctx_num
        self.central_num = central_num

    def fill_feed_dict_train_reg(self):
        central_batch, left_batch, right_batch, label_batch, offset_batch = self.train_set.next_batch(
        )
        input_feed = {
            self.central_ph_train: central_batch,
            self.left_ph_train: left_batch,
            self.right_ph_train: right_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    # construct the top network and compute loss
    def compute_loss_reg(self, central, start, end, offsets, labels):

        central_cls, start_reg, end_reg = vs_multilayer.vs_multilayer(
            central, start, end, "BLA")
        offset_pred = tf.concat(1, (start_reg, end_reg))

        #classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            central_cls, labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat(1, [label_tmp, label_tmp])
        loss_reg = tf.reduce_mean(
            tf.mul(tf.abs(tf.sub(offset_pred, offsets)), label_for_reg))

        loss = tf.add(tf.mul(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        self.central_ph_train = tf.placeholder(tf.float32,
                                               shape=(self.batch_size,
                                                      self.central_num,
                                                      self.visual_feature_dim))
        self.left_ph_train = tf.placeholder(tf.float32,
                                            shape=(self.batch_size,
                                                   self.ctx_num,
                                                   self.visual_feature_dim))
        self.right_ph_train = tf.placeholder(tf.float32,
                                             shape=(self.batch_size,
                                                    self.ctx_num,
                                                    self.visual_feature_dim))
        self.label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        self.offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        self.central_ph_test = tf.placeholder(tf.float32,
                                              shape=(self.test_batch_size,
                                                     self.central_num,
                                                     self.visual_feature_dim))
        self.left_ph_test = tf.placeholder(tf.float32,
                                           shape=(self.test_batch_size,
                                                  self.ctx_num,
                                                  self.visual_feature_dim))
        self.right_ph_test = tf.placeholder(tf.float32,
                                            shape=(self.test_batch_size,
                                                   self.ctx_num,
                                                   self.visual_feature_dim))

        return

    # set up the eval op
    def eval(self, central, start, end):
        central_cls, start_reg, end_reg = vs_multilayer.vs_multilayer(
            central, start, end, "BLA", reuse=True)
        outputs = tf.concat(1, (central_cls, start_reg, end_reg))
        outputs = tf.reshape(outputs, [4])
        print "eval output size: " + str(outputs.get_shape().as_list())

        return outputs

    # return all the variables that contains the name in name_list
    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    # set up the optimizer
    def training(self, loss):
        v_dict = self.get_variables_by_name(["BLA"])
        vs_optimizer = tf.train.AdamOptimizer(self.lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["BLA"])
        return vs_train_op

    # construct the network
    def construct_model(self):
        self.init_placeholder()
        self.loss_cls_reg, offset_pred, loss_reg = self.compute_loss_reg(
            self.central_ph_train, self.left_ph_train, self.right_ph_train,
            self.offset_ph, self.label_ph)
        self.train_op = self.training(self.loss_cls_reg)
        eval_op = self.eval(self.central_ph_test, self.left_ph_test,
                            self.right_ph_test)
        return self.loss_cls_reg, self.train_op, eval_op, loss_reg
Example #20
class ACRN_Model(object):
    def __init__(self, batch_size, pool_size, train_csv_path, test_csv_path,
                 test_visual_feature_dir, train_visual_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.0001
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        #self.alpha=0.06
        self.pool_size = pool_size
        self.semantic_size = 1024
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096
        self.train_set = TrainingDataSet(train_visual_feature_dir,
                                         train_csv_path, self.batch_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path,
                                       self.test_batch_size)
        self.context_num = 1

    '''
    used in training alignment model, CTRL(aln)
    '''

    def fill_feed_dict_train(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    '''
    used in training alignment+regression model, CTRL(reg)
    '''

    def fill_feed_dict_train_reg(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou(
        )
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    '''
    cross modal processing module
    '''

    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
                                [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed, [1, batch_size]),
                                [batch_size, batch_size, self.semantic_size])
        vv_feature1 = tf.reshape(vv_feature, [batch_size, batch_size, -1, 1])
        ss_feature1 = tf.reshape(ss_feature, [batch_size, batch_size, -1, 1])
        pool_vv = tf.nn.avg_pool(vv_feature1,
                                 ksize=[1, 1, self.pool_size, 1],
                                 strides=[1, 1, self.pool_size, 1],
                                 padding='SAME')
        pool_ss = tf.nn.avg_pool(ss_feature1,
                                 ksize=[1, 1, self.pool_size, 1],
                                 strides=[1, 1, self.pool_size, 1],
                                 padding='SAME')
        shape_vv = pool_vv.get_shape().as_list()
        shape_ss = pool_ss.get_shape().as_list()
        vv = tf.reshape(pool_vv, [batch_size * batch_size, shape_vv[2], 1])
        ss = tf.reshape(
            pool_ss,
            [batch_size * batch_size, 1, shape_ss[2]])  # [batch*batch, 1, feature]
        print vv.shape, ss.shape
        concat_feature = tf.matmul(vv, ss)  # outer product per pair: [batch*batch, 1024/pool, 1024/pool]
        print concat_feature.shape
        concat_feature = tf.reshape(concat_feature,
                                    [batch_size, batch_size, -1])
        comb_feature = tf.reshape(
            tf.concat([vv_feature, ss_feature, concat_feature], 2),
            [1, batch_size, batch_size, -1])

        return comb_feature
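Unlike the plain CTRL combination, this fusion average-pools both modalities by pool_size and then takes an outer product per (clip, sentence) pair before concatenating with the tiled raw features. A shape-only NumPy sketch of the outer-product step (assumed dimensions, for illustration):

    import numpy as np

    B, d, pool = 4, 16, 4
    vv = np.random.rand(B * B, d // pool, 1)    # pooled visual, one column vector per pair
    ss = np.random.rand(B * B, 1, d // pool)    # pooled sentence, one row vector per pair
    outer = vv @ ss                             # [B*B, d/pool, d/pool] interaction map
    assert outer.shape == (B * B, d // pool, d // pool)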

    '''
    visual semantic inference, including visual semantic alignment and clip location regression
    '''

    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                              visual_feature_test, sentence_embed_test):
        name = "CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"
            """ embedding into common space dim 1024"""
            visual_feature_train = tf.transpose(visual_feature_train,
                                                [0, 2, 1])  # [batch, num, feature]
            inputs = tf.reshape(visual_feature_train,
                                [-1, self.visual_feature_dim])  # [batch*num, feature]
            transformed_clip_train = fc(
                'v2s_lt', inputs,
                output_dim=self.semantic_size)  # [batch*num, embed]
            transformed_clip_train = tf.reshape(transformed_clip_train, [
                self.batch_size, 2 * self.context_num + 1, self.semantic_size
            ])  # [batch, num, embed]
            transformed_sentence_train = fc(
                's2s_lt', sentence_embed_train,
                output_dim=self.semantic_size)  # batch, embed
            #### attention part
            print "attention part tanh(sum(x_1:t))*tanh(s) "
            concat_previous_feature = tf.zeros(
                [self.batch_size, 1, self.semantic_size])
            for j in range(2 * self.context_num):
                now = tf.slice(transformed_clip_train, [0, 0, 0],
                               [-1, j + 1, -1])
                #    print now.get_shape().as_list()
                now = tf.reduce_sum(now, 1)
                #    print now.get_shape().as_list()
                now = tf.expand_dims(now, 1)
                #    print now.get_shape().as_list()

                concat_previous_feature = tf.concat(
                    [concat_previous_feature, now], 1)  # batch num embed
            v = tf.tanh(tf.add(transformed_clip_train,
                               concat_previous_feature))
            tanh_t = tf.tanh(transformed_sentence_train)  # batch, embed
            concat_text = tf.reshape(
                tf.tile(tanh_t, [1, 2 * self.context_num + 1]), [
                    self.batch_size, 2 * self.context_num + 1,
                    self.semantic_size
                ])  # batch, cont_num, embed
            # compute the attention weights alpha
            e = tf.reduce_sum(tf.multiply(concat_text, v), 2)  # batch, cont_num
            alpha = tf.nn.softmax(e)  # batch, num_ctx
            a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]), [
                self.batch_size, self.semantic_size, 2 * self.context_num + 1
            ])  # batch, embed, cont_num
            visual_feature_train = tf.transpose(transformed_clip_train,
                                                [0, 2, 1])  # batch embed num
            input_vision = tf.reduce_sum(tf.multiply(visual_feature_train, a),
                                         2)  #batch embed

            transformed_clip_train_norm = tf.nn.l2_normalize(input_vision,
                                                             dim=1)
            transformed_sentence_train_norm = tf.nn.l2_normalize(
                transformed_sentence_train, dim=1)

            cross_modal_vec_train = self.cross_modal_comb(
                transformed_clip_train_norm, transformed_sentence_train_norm,
                self.batch_size)  # batch, batch, comb_feature_dim
            sim_score_mat_train = vs_multilayer.vs_multilayer(
                cross_modal_vec_train,
                "vs_multilayer_lt",
                middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(
                sim_score_mat_train, [self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n"
            visual_feature_test = tf.transpose(visual_feature_test,
                                               [0, 2, 1])  # batch, num, feature
            inputs = tf.reshape(visual_feature_test,
                                [-1, self.visual_feature_dim])  # batch x num, feature
            transformed_clip_test = fc('v2s_lt',
                                       inputs,
                                       output_dim=self.semantic_size)
            transformed_clip_test = tf.reshape(transformed_clip_test, [
                self.test_batch_size, 2 * self.context_num + 1,
                self.semantic_size
            ])  # batch, num, embed
            transformed_sentence_test = fc('s2s_lt',
                                           sentence_embed_test,
                                           output_dim=self.semantic_size)
            #### attention part
            print "attention part tanh(sum(x_1:t))*tanh(s) "
            concat_previous_feature = tf.zeros(
                [self.test_batch_size, 1, self.semantic_size])
            for j in range(2 * self.context_num):
                now = tf.slice(transformed_clip_test, [0, 0, 0],
                               [-1, j + 1, -1])
                now = tf.reduce_sum(now, 1)
                now = tf.expand_dims(now, 1)
                concat_previous_feature = tf.concat(
                    [concat_previous_feature, now], 1)  # batch, num, embed
            v = tf.tanh(tf.add(transformed_clip_test,
                               concat_previous_feature))  # batch, num, embed
            tanh_t = tf.tanh(transformed_sentence_test)  # batch, embed

            concat_text = tf.reshape(
                tf.tile(tanh_t, [1, 2 * self.context_num + 1]), [
                    self.test_batch_size, 2 * self.context_num + 1,
                    self.semantic_size
                ])  # batch, cont_num, embed

            e = tf.reduce_sum(tf.multiply(concat_text, v), 2)  # batch, cont_num

            alpha = tf.nn.softmax(e)  # batch, num_ctx
            a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]), [
                self.test_batch_size, self.semantic_size,
                2 * self.context_num + 1
            ])  # batch embed cont_num
            visual_feature_test = tf.transpose(transformed_clip_test,
                                               [0, 2, 1])
            input_vision = tf.reduce_sum(tf.multiply(visual_feature_test, a),
                                         2)  # batch, embed
            transformed_clip_test_norm = tf.nn.l2_normalize(input_vision,
                                                            dim=1)
            transformed_sentence_test_norm = tf.nn.l2_normalize(
                transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(
                transformed_clip_test_norm, transformed_sentence_test_norm,
                self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(
                cross_modal_vec_test,
                "vs_multilayer_lt",
                reuse=True,
                middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train, sim_score_mat_test

    '''
    compute alignment and regression loss
    '''

    def compute_loss_reg(self, sim_reg_mat, offset_label):

        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
        sim_score_mat = tf.reshape(sim_score_mat,
                                   [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # identity matrix scaled by -2
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #              | -1   1   1 ... |
        #   mask_mat = |  1  -1   1 ... |
        #              |  1   1  -1 ... |
        mask_mat = tf.add(I_2, all1)
        # loss cls, not considering iou
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha,
                                     shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I, batch_para_mat)
        loss_mat = tf.log(
            tf.add(all1, tf.exp(tf.multiply(mask_mat, sim_score_mat))))
        loss_mat = tf.multiply(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
        # regression loss
        l_reg_diag = tf.matmul(tf.multiply(l_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.multiply(p_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat((p_reg_diag, l_reg_diag), 1)
        loss_reg = tf.reduce_mean(
            tf.abs(tf.subtract(offset_pred, offset_label)))

        loss = tf.add(tf.multiply(self.lambda_regression, loss_reg),
                      loss_align)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32,
            shape=(
                self.batch_size, self.visual_feature_dim,
                2 * self.context_num +
                1))  # input feature: current clip, pre-context, and post-context
        sentence_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32,
            shape=(
                self.test_batch_size, self.visual_feature_dim,
                2 * self.context_num +
                1))  # input feature: current clip, pre-context, and post-context
        sentence_ph_test = tf.placeholder(tf.float32,
                                          shape=(self.test_batch_size,
                                                 self.sentence_embedding_size))

        return visual_featmap_ph_train, sentence_ph_train, offset_ph, visual_featmap_ph_test, sentence_ph_test

    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    def training(self, loss):

        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op

    def construct_model(self):
        # initialize the placeholder
        self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, self.visual_featmap_ph_test, self.sentence_ph_test = self.init_placeholder(
        )

        # build inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(
            self.visual_featmap_ph_train, self.sentence_ph_train,
            self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(
            sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg
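
For reference, here is a minimal numpy sketch (hypothetical values; batch_size = 3, alpha = 1.0/3) of the mask and weighting matrices that compute_loss_reg builds above, mirroring the TF ops with plain arrays:

import numpy as np

batch_size, alpha = 3, 1.0 / 3
sim_score_mat = np.random.randn(batch_size, batch_size)  # stand-in alignment scores

I = np.eye(batch_size)
all1 = np.ones((batch_size, batch_size))
mask_mat = all1 - 2.0 * I    # -1 on the diagonal (matched pairs), +1 elsewhere
para_mat = I + alpha * all1  # up-weights the diagonal (matched) entries

# alignment loss: mean of log(1 + exp(mask * score)), weighted by para_mat
loss_mat = np.log(1.0 + np.exp(mask_mat * sim_score_mat)) * para_mat
loss_align = loss_mat.mean()
print(mask_mat)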
Example #21
0
class CBR_Model(object):
    """ This is the body of the network we are using

    Here you will get access to the network structure, function of training, evaluation

    """
    def __init__(self, config):
        """Initialization
        """
        self.config = config

        self.sess = None
        self.saver = None

        self.train_clip_path = self.config.train_clip_path
        self.background_path = self.config.background_path
        self.test_clip_path = self.config.test_clip_path
        self.train_flow_feature_dir = self.config.train_flow_feature_dir
        self.train_appr_feature_dir = self.config.train_appr_feature_dir
        self.test_flow_feature_dir = self.config.test_flow_feature_dir
        self.test_appr_feature_dir = self.config.test_appr_feature_dir
        self.test_len_dict = self.config.test_len_dict

        self.batch_size = self.config.batch_size

        self.test_batch_size = 1
        self.middle_layer_size = 1000

        self.lambda_reg = float(self.config.lambda_reg)
        self.action_class_num = self.config.action_class_num
        self.feat_type = self.config.feat_type
        self.visual_feature_dim = self.config.visual_feature_dim

        # Initialize the training data and testing data
        self.train_set = TrainingDataSet(self.config,
                                         self.train_flow_feature_dir,
                                         self.train_appr_feature_dir,
                                         self.train_clip_path,
                                         self.background_path)
        self.test_set = TestingDataSet(self.config, self.test_flow_feature_dir,
                                       self.test_appr_feature_dir,
                                       self.test_clip_path,
                                       self.test_batch_size,
                                       self.test_len_dict)

        # Path to save the summary of the models
        self.summary_dir = os.path.join('./summary', self.config.save_name)

        if not os.path.exists(self.summary_dir):
            os.mkdir(self.summary_dir)

        if self.config.issave == 'Yes':
            self.model_dir = os.path.join('./model', self.config.save_name)

            if not os.path.exists(self.model_dir):
                os.mkdir(self.model_dir)

    def init_session(self):
        """Create a session in tensorflow
        """
        print('Initializing session')

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3
                                    )  # 30% memory of TITAN is enough
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(max_to_keep=10)
        if self.config.ispretrain == 'Yes':
            self.restore_session('model/xx.ckpt')

    def get_feed_dict(self, lr_by_step):
        """Prepare training samples in each batch size to the network
        """
        image_batch, label_batch, offset_batch, one_hot_label_batch = self.train_set.next_batch(
        )

        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch,
            self.one_hot_label_ph: one_hot_label_batch,
            self.vs_lr: lr_by_step
        }

        return input_feed

    def add_loss_op(self,
                    visual_feature,
                    offsets,
                    labels,
                    one_hot_labels,
                    name='CBR'):
        """This function is to compute the loss in tensorflow graph

        Args:
            visual_feature: Tensor, feature, (batch_size, visual_feature_dim)
            offsets: Tensor, boundary offset(both to the start and end in frame-level), (batch_size, 2)
            labels: Tensor, label, (batch_size)
            one_hot_labels: Tensor, one hot label, (batch_size, action_class_num+1)

        Returns:
            loss: loss_cls + lambda_reg * loss_reg
            loss_reg: L1 loss between ground truth offsets and prediction offsets
            loss_cls: cross entropy loss

        """
        print('Add the standard loss')

        cls_reg_vec = vs_multilayer.vs_multilayer(
            visual_feature,
            name,
            middle_layer_dim=self.middle_layer_size,
            class_num=self.action_class_num,
            dropout=self.config.dropout)

        cls_reg_vec = tf.reshape(
            cls_reg_vec, [self.batch_size, (self.action_class_num + 1) * 3])
        # layout: [classification scores | start offsets | end offsets]
        cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1]
        start_offset_pred = cls_reg_vec[:, self.action_class_num +
                                        1:(self.action_class_num + 1) * 2]
        end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:]

        # l1 regularization on the classification scores
        loss_l1 = tf.reduce_mean(tf.abs(cls_score_vec))

        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)

        # regression loss
        pick_start_offset_pred = []
        pick_end_offset_pred = []
        for k in range(self.batch_size):

            pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
            pick_end_offset_pred.append(end_offset_pred[k, labels[k]])

        pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred),
                                            [self.batch_size, 1])
        pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred),
                                          [self.batch_size, 1])
        labels_1 = tf.to_float(tf.not_equal(labels, 0))
        label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        offset_pred = tf.concat((pick_start_offset_pred, pick_end_offset_pred),
                                1)

        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)),
                        label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)

        if self.config.l1_loss:
            loss = tf.add(loss, loss_l1)

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("loss_reg", loss_reg)
        tf.summary.scalar("loss_cls", loss_cls)

        return loss, loss_reg, loss_cls

    def add_placeholders(self):
        """Add placeholders
        """
        print('Add placeholders')

        self.visual_featmap_ph_train = tf.placeholder(
            tf.float32,
            name='train_featmap',
            shape=(self.batch_size, self.visual_feature_dim))
        self.visual_featmap_ph_test = tf.placeholder(
            tf.float32,
            name='test_featmap',
            shape=(self.test_batch_size, self.visual_feature_dim))

        self.label_ph = tf.placeholder(tf.int32,
                                       name='label',
                                       shape=(self.batch_size))
        self.offset_ph = tf.placeholder(tf.float32,
                                        name='offset',
                                        shape=(self.batch_size, 2))
        self.one_hot_label_ph = tf.placeholder(
            tf.float32,
            name='one_hot_label',
            shape=(self.batch_size, self.action_class_num + 1))
        self.vs_lr = tf.placeholder(tf.float32, name='lr')

    def add_summary(self):
        """Add summary
        """
        print('Add summary')
        self.merged = tf.summary.merge_all()
        self.file_writer = tf.summary.FileWriter(self.summary_dir,
                                                 self.sess.graph)

    def save_session(self, step):
        """Save the session if needed
        """
        if self.config.issave == 'Yes':

            print('Save session')

            model_name = os.path.join(self.model_dir, str(step) + '.ckpt')
            self.saver.save(self.sess, model_name)

    def restore_session(self, dir_model):
        """Restore session
        """
        print('Restore the Session')

        self.saver.restore(self.sess, dir_model)

    def close_session(self):
        """ Close session once finished
        """
        print('Close session')

        self.sess.close()

    def predict(self, visual_feature_test):
        """Inference during testing

        Args:
            visual_feature_test: Tensor, feature,  (test_batch_size, visual_feature_dim)

        Returns:
            sim_score: Tensor, (action_class_num+1)*3 (Note: [0:action_class_num+1]: classification scores;
                [action_class_num+1:(action_class_num+1)*2]: start offsets; [(action_class_num+1)*2:(action_class_num+1)*3]: end offsets)

        """
        print('To predict the label')

        sim_score = vs_multilayer.vs_multilayer(
            visual_feature_test,
            "CBR",
            middle_layer_dim=self.middle_layer_size,
            class_num=self.action_class_num,
            dropout=False,
            reuse=True)
        sim_score = tf.reshape(sim_score, [(self.action_class_num + 1) * 3])

        return sim_score

    def get_variables_by_name(self, name_list):
        """Get variables by name
        """
        v_list = tf.trainable_variables()

        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print("Variables of <" + name + ">")
            for v in v_dict[name]:
                print("    " + v.name)
        return v_dict

    def add_train_op(self, loss):
        """Add train operation
        """
        print('Add train operation')

        v_dict = self.get_variables_by_name(["CBR"])

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        if self.config.opm_type == 'adam_wd':
            vs_optimizer = tf.contrib.opt.extend_with_decoupled_weight_decay(
                tf.train.AdamOptimizer)
            optimizer = vs_optimizer(weight_decay=1e-4,
                                     learning_rate=self.vs_lr,
                                     name='vs_adam')
            with tf.control_dependencies(update_ops):
                vs_train_op = optimizer.minimize(loss, var_list=v_dict["CBR"])
        elif self.config.opm_type == 'adam':
            vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
            with tf.control_dependencies(update_ops):
                vs_train_op = vs_optimizer.minimize(loss,
                                                    var_list=v_dict["CBR"])
        else:
            raise ValueError('Unknown opm_type: %s' % self.config.opm_type)

        return vs_train_op

    def build(self):
        """Build the model
        """
        print('Construct the network')

        self.add_placeholders()

        if self.config.norm == 'l2':
            visual_featmap_ph_train_norm = tf.nn.l2_normalize(
                self.visual_featmap_ph_train, dim=1)
            visual_featmap_ph_test_norm = tf.nn.l2_normalize(
                self.visual_featmap_ph_test, dim=1)
        elif self.config.norm == 'No':
            visual_featmap_ph_train_norm = self.visual_featmap_ph_train
            visual_featmap_ph_test_norm = self.visual_featmap_ph_test
        else:
            raise ValueError('Unknown norm option: %s' % self.config.norm)

        self.loss, self.loss_reg, self.loss_cls = self.add_loss_op(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph,
            self.one_hot_label_ph)
        self.vs_train_op = self.add_train_op(self.loss)
        self.vs_eval_op = self.predict(visual_featmap_ph_test_norm)

        self.init_session()

    def train(self):
        """Training
        """
        self.add_summary()

        for step in range(self.config.max_steps):

            # if step <= 3000:
            lr = self.config.lr
            # else:
            #    lr = self.config.lr/10

            start_time = time.time()

            feed_dict = self.get_feed_dict(lr)
            duration1 = time.time() - start_time

            [_, loss_value, loss_reg_value, loss_cls_value,
             summary] = self.sess.run([
                 self.vs_train_op, self.loss, self.loss_reg, self.loss_cls,
                 self.merged
             ],
                                      feed_dict=feed_dict)

            duration2 = time.time() - start_time

            print(
                'Step %d: loss=%.2f, loss_reg=%.2f, loss_cls=%.2f, (%.3f sec),(%.3f sec)'
                % (step, loss_value, loss_reg_value, loss_cls_value, duration1,
                   duration2))

            self.file_writer.add_summary(summary, step)
            if (step + 1) == 4000 or (step + 1) % self.config.test_steps == 0:
                self.save_session(step + 1)

    def do_eval_slidingclips(self, save_name):
        """Do evaluation based on proposals and save the coresponding score and offset to a pickle file in './eval/test_results' folder
        """
        test_len_dict = tools.load_length_dict(type='test')
        reg_result_dict = {}

        for k, test_sample in enumerate(self.test_set.test_samples):

            reg_result_dict[k] = []

            if k % 1000 == 0:
                print(str(k) + "/" + str(len(self.test_set.test_samples)))
            movie_name = test_sample[0]

            init_clip_start = test_sample[1]
            init_clip_end = test_sample[2]

            clip_start = init_clip_start
            clip_end = init_clip_end
            final_action_prob = np.zeros([
                (self.config.action_class_num + 1) * 3 * self.config.cas_step
            ])

            if clip_start >= clip_end:
                reg_result_dict[k].append(final_action_prob)
                continue

            for i in range(self.config.cas_step):
                if clip_start >= clip_end:
                    break

                if self.config.feat_type == 'Pool':

                    featmap = dataset.get_pooling_feature(
                        self.test_set.flow_feat_dir,
                        self.test_set.appr_feat_dir, movie_name, clip_start,
                        clip_end, self.config.pool_level,
                        self.config.unit_size, self.config.unit_feature_size,
                        self.config.fusion_type)
                    left_feat = dataset.get_left_context_feature(
                        self.test_set.flow_feat_dir,
                        self.test_set.appr_feat_dir, movie_name, clip_start,
                        clip_end, self.config.ctx_num, self.config.unit_size,
                        self.config.unit_feature_size, self.config.fusion_type)
                    right_feat = dataset.get_right_context_feature(
                        self.test_set.flow_feat_dir,
                        self.test_set.appr_feat_dir, movie_name, clip_start,
                        clip_end, self.config.ctx_num, self.config.unit_size,
                        self.config.unit_feature_size, self.config.fusion_type)

                    mean_ = np.hstack((left_feat, featmap, right_feat))

                    feat = mean_

                elif self.config.feat_type == 'SSN':

                    feat = dataset.get_SSN_feature(
                        self.test_set.flow_feat_dir,
                        self.test_set.appr_feat_dir, movie_name, clip_start,
                        clip_end, self.config.unit_size,
                        self.config.unit_feature_size, self.config.fusion_type)

                else:
                    feat = dataset.get_BSP_feature(
                        self.test_set.flow_feat_dir,
                        self.test_set.appr_feat_dir, movie_name, clip_start,
                        clip_end, self.config.unit_size,
                        self.config.unit_feature_size, self.config.bsp_level)

                feat = np.reshape(feat, [1, self.config.visual_feature_dim])

                feed_dict = {self.visual_featmap_ph_test: feat}

                outputs = self.sess.run(self.vs_eval_op, feed_dict=feed_dict)

                action_score = outputs[1:self.config.action_class_num + 1]
                action_prob = tools.softmax(action_score)

                final_action_prob[i * (self.config.action_class_num + 1) *
                                  3:(i + 1) *
                                  (self.config.action_class_num + 1) *
                                  3] = outputs

                action_cat = np.argmax(action_prob) + 1
                round_reg_end = clip_end + round(outputs[
                    (self.config.action_class_num + 1) * 2 +
                    action_cat]) * self.config.unit_size
                round_reg_start = clip_start + round(
                    outputs[self.config.action_class_num + 1 +
                            action_cat]) * self.config.unit_size
                if round_reg_start < 0 or round_reg_end > test_len_dict[
                        movie_name] - 15 or round_reg_start >= round_reg_end:
                    round_reg_end = clip_end
                    round_reg_start = clip_start
                # unrounded boundaries (computed but not used further)
                reg_end = clip_end + outputs[
                    (self.config.action_class_num + 1) * 2 +
                    action_cat] * self.config.unit_size
                reg_start = clip_start + outputs[
                    self.config.action_class_num + 1 +
                    action_cat] * self.config.unit_size
                clip_start = round_reg_start
                clip_end = round_reg_end

            reg_result_dict[k].append(final_action_prob)

        pickle.dump(
            reg_result_dict,
            open("./eval/test_results/" + save_name + "_outputs.pkl", "wb"))
Example #22
0
class CTRL_Model(object):
    def __init__(self, batch_size, train_csv_path, test_csv_path,
                 test_visual_feature_dir, train_visual_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.005
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        self.semantic_size = 1024  # dimensionality of the common visual-semantic embedding space
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096 * 3
        self.train_set = TrainingDataSet(train_visual_feature_dir,
                                         train_csv_path, self.batch_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path,
                                       self.test_batch_size)

    '''
    used in training alignment model, CTRL(aln)
    '''

    def fill_feed_dict_train(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    '''
    used in training alignment+regression model, CTRL(reg)
    '''

    def fill_feed_dict_train_reg(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou(
        )
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    '''
    cross modal processing module
    '''

    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
                                [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed, [1, batch_size]),
                                [batch_size, batch_size, self.semantic_size])
        concat_feature = tf.reshape(
            tf.concat([vv_feature, ss_feature], 2),
            [batch_size, batch_size, self.semantic_size + self.semantic_size])
        mul_feature = tf.multiply(vv_feature, ss_feature)
        add_feature = tf.add(vv_feature, ss_feature)

        comb_feature = tf.reshape(
            tf.concat([mul_feature, add_feature, concat_feature], 2),
            [1, batch_size, batch_size, self.semantic_size * 4])
        return comb_feature

    '''
    visual semantic inference, including visual semantic alignment and clip location regression
    '''

    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                              visual_feature_test, sentence_embed_test):
        name = "CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"
            transformed_clip_train = fc('v2s_lt',
                                        visual_feature_train,
                                        output_dim=self.semantic_size)
            transformed_clip_train_norm = tf.nn.l2_normalize(
                transformed_clip_train, dim=1)
            transformed_sentence_train = fc('s2s_lt',
                                            sentence_embed_train,
                                            output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(
                transformed_sentence_train, dim=1)
            cross_modal_vec_train = self.cross_modal_comb(
                transformed_clip_train_norm, transformed_sentence_train_norm,
                self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(
                cross_modal_vec_train,
                "vs_multilayer_lt",
                middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(
                sim_score_mat_train, [self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n"
            transformed_clip_test = fc('v2s_lt',
                                       visual_feature_test,
                                       output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(
                transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt',
                                           sentence_embed_test,
                                           output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(
                transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(
                transformed_clip_test_norm, transformed_sentence_test_norm,
                self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(
                cross_modal_vec_test,
                "vs_multilayer_lt",
                reuse=True,
                middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train, sim_score_mat_test

    '''
    compute alignment and regression loss
    '''

    def compute_loss_reg(self, sim_reg_mat, offset_label):

        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
        sim_score_mat = tf.reshape(sim_score_mat,
                                   [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # identity matrix scaled by -2
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #              | -1   1   1 ... |
        #   mask_mat = |  1  -1   1 ... |
        #              |  1   1  -1 ... |
        mask_mat = tf.add(I_2, all1)
        # loss cls, not considering iou
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha,
                                     shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I, batch_para_mat)
        loss_mat = tf.log(
            tf.add(all1, tf.exp(tf.multiply(mask_mat, sim_score_mat))))
        loss_mat = tf.multiply(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
        # regression loss
        l_reg_diag = tf.matmul(tf.multiply(l_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.multiply(p_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat((p_reg_diag, l_reg_diag), 1)
        loss_reg = tf.reduce_mean(
            tf.abs(tf.subtract(offset_pred, offset_label)))

        loss = tf.add(tf.multiply(self.lambda_regression, loss_reg),
                      loss_align)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        sentence_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        sentence_ph_test = tf.placeholder(tf.float32,
                                          shape=(self.test_batch_size,
                                                 self.sentence_embedding_size))

        return visual_featmap_ph_train, sentence_ph_train, offset_ph, visual_featmap_ph_test, sentence_ph_test

    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    def training(self, loss):

        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op

    def construct_model(self):
        # initialize the placeholder
        self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, self.visual_featmap_ph_test, self.sentence_ph_test = self.init_placeholder(
        )
        # build inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(
            self.visual_featmap_ph_train, self.sentence_ph_train,
            self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(
            sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg
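
A minimal numpy analogue (hypothetical batch_size = 2, semantic_size = 4) of cross_modal_comb above, showing how the element-wise product, sum, and concatenation are fused into a [1, batch, batch, 4 * semantic_size] tensor; entry (i, j) pairs sentence i with clip j:

import numpy as np

batch_size, semantic_size = 2, 4
visual = np.random.randn(batch_size, semantic_size)
sentence = np.random.randn(batch_size, semantic_size)

# tile so that row i / column j pairs sentence i with clip j
vv = np.tile(visual[np.newaxis, :, :], (batch_size, 1, 1))
ss = np.tile(sentence[:, np.newaxis, :], (1, batch_size, 1))

comb = np.concatenate([vv * ss, vv + ss, np.concatenate([vv, ss], 2)], axis=2)
comb = comb.reshape(1, batch_size, batch_size, 4 * semantic_size)
print(comb.shape)  # (1, 2, 2, 16)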
Example #23
0
class CBR_Model(object):
    def __init__(self, batch_size, ctx_num, unit_size, unit_feature_size,
                 action_class_num, lr, lambda_reg, train_clip_path,
                 background_path, test_clip_path, train_flow_feature_dir,
                 train_appr_feature_dir, test_flow_feature_dir,
                 test_appr_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.middle_layer_size = 1000
        self.vs_lr = lr
        self.lambda_reg = lambda_reg
        self.action_class_num = action_class_num
        self.visual_feature_dim = unit_feature_size * 3
        self.train_set = TrainingDataSet(train_flow_feature_dir,
                                         train_appr_feature_dir,
                                         train_clip_path, background_path,
                                         batch_size, ctx_num, unit_size,
                                         unit_feature_size, action_class_num)
        self.test_set = TestingDataSet(test_flow_feature_dir,
                                       test_appr_feature_dir, test_clip_path,
                                       self.test_batch_size, unit_size)

    def fill_feed_dict_train(self):
        image_batch, label_batch, offset_batch, one_hot_label_batch = self.train_set.next_batch(
        )
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.label_ph: label_batch,
            self.offset_ph: offset_batch,
            self.one_hot_label_ph: one_hot_label_batch
        }
        return input_feed

    def compute_loss_reg(self, visual_feature, offsets, labels,
                         one_hot_labels):

        cls_reg_vec = vs_multilayer.vs_multilayer(
            visual_feature,
            "CBR",
            middle_layer_dim=self.middle_layer_size,
            output_layer_dim=(self.action_class_num + 1) * 3)
        cls_reg_vec = tf.reshape(
            cls_reg_vec, [self.batch_size, (self.action_class_num + 1) * 3])
        cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1]
        start_offset_pred = cls_reg_vec[:, self.action_class_num +
                                        1:(self.action_class_num + 1) * 2]
        end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:]

        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss

        pick_start_offset_pred = []
        pick_end_offset_pred = []
        for k in range(self.batch_size):
            pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
            pick_end_offset_pred.append(end_offset_pred[k, labels[k]])
        pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred),
                                            [self.batch_size, 1])
        pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred),
                                          [self.batch_size, 1])
        labels_1 = tf.to_float(tf.not_equal(labels, 0))
        label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        offset_pred = tf.concat((pick_start_offset_pred, pick_end_offset_pred),
                                1)
        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)),
                        label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        label_ph = tf.placeholder(tf.int32, shape=(self.batch_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        one_hot_label_ph = tf.placeholder(tf.float32,
                                          shape=(self.batch_size,
                                                 self.action_class_num + 1))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))

        return visual_featmap_ph_train, visual_featmap_ph_test, label_ph, offset_ph, one_hot_label_ph

    def eval(self, visual_feature_test):
        sim_score = vs_multilayer.vs_multilayer(
            visual_feature_test,
            "CBR",
            middle_layer_dim=self.middle_layer_size,
            output_layer_dim=(self.action_class_num + 1) * 3,
            dropout=False,
            reuse=True)
        sim_score = tf.reshape(sim_score, [(self.action_class_num + 1) * 3])
        return sim_score

    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    def training(self, loss):
        v_dict = self.get_variables_by_name(["CBR"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["CBR"])
        return vs_train_op

    def construct_model(self):
        # construct the network
        self.visual_featmap_ph_train, self.visual_featmap_ph_test, self.label_ph, self.offset_ph, self.one_hot_label_ph = self.init_placeholder(
        )
        visual_featmap_ph_train_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_train, dim=1)
        visual_featmap_ph_test_norm = tf.nn.l2_normalize(
            self.visual_featmap_ph_test, dim=1)
        self.loss, loss_reg = self.compute_loss_reg(
            visual_featmap_ph_train_norm, self.offset_ph, self.label_ph,
            self.one_hot_label_ph)
        self.vs_train_op = self.training(self.loss)
        vs_eval_op = self.eval(visual_featmap_ph_test_norm)
        return self.loss, self.vs_train_op, vs_eval_op, loss_reg
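
A minimal driver sketch (TF 1.x) for the CBR_Model in this example; all constructor arguments and paths below are hypothetical, and it assumes the TrainingDataSet/TestingDataSet classes the model references are importable:

import tensorflow as tf

# hypothetical configuration values and paths
model = CBR_Model(batch_size=128, ctx_num=4, unit_size=16,
                  unit_feature_size=4096, action_class_num=20,
                  lr=0.005, lambda_reg=1.0,
                  train_clip_path='train_clips.txt',
                  background_path='background.txt',
                  test_clip_path='test_clips.txt',
                  train_flow_feature_dir='feat/train_flow/',
                  train_appr_feature_dir='feat/train_appr/',
                  test_flow_feature_dir='feat/test_flow/',
                  test_appr_feature_dir='feat/test_appr/')

loss, train_op, eval_op, loss_reg = model.construct_model()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10000):
        feed_dict = model.fill_feed_dict_train()
        _, loss_val = sess.run([train_op, loss], feed_dict=feed_dict)
        if step % 100 == 0:
            print('step %d: loss=%.3f' % (step, loss_val))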