# Assumed imports for this listing (the project-local modules follow the
# CTRL/TALL reference code layout and may need adjusting to your setup):
import tensorflow as tf

from dataset import TestingDataSet, TrainingDataSet
from util.cnn import fc_layer as fc
import vs_multilayer


class ACRN_Model(object):
    def __init__(self, batch_size, pool_size, train_csv_path, test_csv_path,
                 test_visual_feature_dir, train_visual_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.0001
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        #self.alpha=0.06
        self.pool_size = pool_size
        self.semantic_size = 1024
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096
        self.train_set = TrainingDataSet(train_visual_feature_dir,
                                         train_csv_path, self.batch_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path,
                                       self.test_batch_size)
        self.context_num = 1
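        # one pre-context and one post-context clip: each sample stacks
        # 2 * context_num + 1 = 3 clips along its last axis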

    '''
    used in training the alignment model, CTRL(aln)
    '''

    def fill_feed_dict_train(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    '''
    used in training the alignment+regression model, CTRL(reg)
    '''

    def fill_feed_dict_train_reg(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    '''
    cross-modal processing module
    '''

    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
                                [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed, [1, batch_size]),
                                [batch_size, batch_size, self.semantic_size])
        vv_feature1 = tf.reshape(vv_feature, [batch_size, batch_size, -1, 1])
        ss_feature1 = tf.reshape(ss_feature, [batch_size, batch_size, -1, 1])
        pool_vv = tf.nn.avg_pool(vv_feature1,
                                 ksize=[1, 1, self.pool_size, 1],
                                 strides=[1, 1, self.pool_size, 1],
                                 padding='SAME')
        pool_ss = tf.nn.avg_pool(ss_feature1,
                                 ksize=[1, 1, self.pool_size, 1],
                                 strides=[1, 1, self.pool_size, 1],
                                 padding='SAME')
        shape_vv = pool_vv.get_shape().as_list()
        shape_ss = pool_ss.get_shape().as_list()
        vv = tf.reshape(pool_vv, [batch_size * batch_size, shape_vv[2], 1])
        ss = tf.reshape(
            pool_ss,
            [batch_size * batch_size, 1, shape_ss[2]])  # batch x batch, feature
        print vv.shape, ss.shape
        concat_feature = tf.matmul(vv, ss)  # batch*batch, (semantic_size/pool_size)^2 outer products
        print concat_feature.shape
        concat_feature = tf.reshape(concat_feature,
                                    [batch_size, batch_size, -1])
        comb_feature = tf.reshape(
            tf.concat([vv_feature, ss_feature, concat_feature], 2),
            [1, batch_size, batch_size, -1])

        return comb_feature
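
    # Illustrative shape walkthrough of the combination above (a sketch, with
    # B = batch_size, d = semantic_size = 1024, p = pool_size):
    #   visual_feat [B, d]    -> vv_feature [B, B, d]  (tiled along rows)
    #   sentence_embed [B, d] -> ss_feature [B, B, d]  (tiled along columns)
    #   avg_pool (k = stride = p) shrinks each to [B, B, d/p, 1]
    #   outer product: [B*B, d/p, 1] x [B*B, 1, d/p] -> [B*B, d/p, d/p]
    #   comb_feature = concat(vv, ss, outer) -> [1, B, B, 2*d + (d/p)**2]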

    '''
    visual-semantic inference, including visual-semantic alignment and clip location regression
    '''

    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                              visual_feature_test, sentence_embed_test):
        name = "CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"
            """ embedding into common space dim 1024"""
            visual_feature_train = tf.transpose(visual_feature_train,
                                                [0, 2, 1])  # batch num fea
            inputs = tf.reshape(visual_feature_train,
                                [-1, self.visual_feature_dim])  # batch x num, feature
            transformed_clip_train = fc(
                'v2s_lt', inputs,
                output_dim=self.semantic_size)  # batch x num, embed
            transformed_clip_train = tf.reshape(transformed_clip_train, [
                self.batch_size, 2 * self.context_num + 1, self.semantic_size
            ])  # batch, num, embed
            transformed_sentence_train = fc(
                's2s_lt', sentence_embed_train,
                output_dim=self.semantic_size)  # batch, embed
            #### attention part
            print "attention part tanh(sum(x_1:t))*tanh(s) "
            concat_previous_feature = tf.zeros(
                [self.batch_size, 1, self.semantic_size])
            for j in range(2 * self.context_num):
                now = tf.slice(transformed_clip_train, [0, 0, 0],
                               [-1, j + 1, -1])
                #    print now.get_shape().as_list()
                now = tf.reduce_sum(now, 1)
                #    print now.get_shape().as_list()
                now = tf.expand_dims(now, 1)
                #    print now.get_shape().as_list()

                concat_previous_feature = tf.concat(
                    [concat_previous_feature, now], 1)  # batch num embed
            v = tf.tanh(tf.add(transformed_clip_train,
                               concat_previous_feature))
            relu_t = tf.tanh(transformed_sentence_train)  #batch, embed
            concat_text = tf.reshape(
                tf.tile(relu_t, [1, 2 * self.context_num + 1]), [
                    self.batch_size, 2 * self.context_num + 1,
                    self.semantic_size
                ])  # batch cont_num embed
            # computing weight a
            e = tf.reduce_sum(tf.multiply(concat_text, v), 2)  # batch cont_num
            alpha = tf.nn.softmax(e)  # batch, num_ctx
            a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]), [
                self.batch_size, self.semantic_size, 2 * self.context_num + 1
            ])  # batch, semantic_size, cont_num
            visual_feature_train = tf.transpose(transformed_clip_train,
                                                [0, 2, 1])  # batch embed num
            input_vision = tf.reduce_sum(tf.multiply(visual_feature_train, a),
                                         2)  #batch embed
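
            # Attention recap (as computed above): for clip position t,
            #   h_t = tanh(x_t + sum_{k<t} x_k),  e_t = <tanh(s), h_t>,
            #   alpha = softmax(e),  v = sum_t alpha_t * x_t,
            # i.e. the sentence embedding scores each context-accumulated clip
            # feature and the clip features are pooled with those weights.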

            transformed_clip_train_norm = tf.nn.l2_normalize(input_vision,
                                                             dim=1)
            transformed_sentence_train_norm = tf.nn.l2_normalize(
                transformed_sentence_train, dim=1)

            # print transformed_clip_train_norm.shape
            # print transformed_sentence_train_norm.shape
            # exit()
            cross_modal_vec_train = self.cross_modal_comb(
                transformed_clip_train_norm, transformed_sentence_train_norm,
                self.batch_size)  # batch, batch, fused common-space feature
            # print cross_modal_vec_train.shape
            # exit()
            sim_score_mat_train = vs_multilayer.vs_multilayer(
                cross_modal_vec_train,
                "vs_multilayer_lt",
                middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(
                sim_score_mat_train, [self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n"
            visual_feature_test = tf.transpose(visual_feature_test,
                                               [0, 2, 1])  # batch num fea
            inputs = tf.reshape(visual_feature_test,
                                [-1, self.visual_feature_dim])  # batch x num, feature
            transformed_clip_test = fc('v2s_lt',
                                       inputs,
                                       output_dim=self.semantic_size)
            transformed_clip_test = tf.reshape(transformed_clip_test, [
                self.test_batch_size, 2 * self.context_num + 1,
                self.semantic_size
            ])  # batch, num, embed
            transformed_sentence_test = fc('s2s_lt',
                                           sentence_embed_test,
                                           output_dim=self.semantic_size)
            #### attention part
            print "attention part tanh(sum(x_1:t))*tanh(s) "
            concat_previous_feature = tf.zeros(
                [self.test_batch_size, 1, self.semantic_size])
            for j in range(2 * self.context_num):
                now = tf.slice(transformed_clip_test, [0, 0, 0],
                               [-1, j + 1, -1])
                print now.get_shape().as_list()
                now = tf.reduce_sum(now, 1)
                print now.get_shape().as_list()
                now = tf.expand_dims(now, 1)
                print now.get_shape().as_list()
                concat_previous_feature = tf.concat(
                    [concat_previous_feature, now], 1)  # batch num embed
            v = tf.tanh(tf.add(transformed_clip_test,
                               concat_previous_feature))  # batch, num, embed
            relu_t = tf.tanh(transformed_sentence_test)  #batch, feature_embed

            concat_text = tf.reshape(
                tf.tile(relu_t, [1, 2 * self.context_num + 1]), [
                    self.test_batch_size, 2 * self.context_num + 1,
                    self.semantic_size
                ])  # batch cont_num feature

            e = tf.reduce_sum(tf.multiply(concat_text, v), 2)  # batch cont_num

            alpha = tf.nn.softmax(e)  # batch, num_ctx
            a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]), [
                self.test_batch_size, self.semantic_size,
                2 * self.context_num + 1
            ])  # batch embed cont_num
            visual_feature_test = tf.transpose(transformed_clip_test,
                                               [0, 2, 1])
            input_vision = tf.reduce_sum(tf.multiply(visual_feature_test, a),
                                         2)  #batch embed
            transformed_clip_test_norm = tf.nn.l2_normalize(input_vision,
                                                            dim=1)
            transformed_sentence_test_norm = tf.nn.l2_normalize(
                transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(
                transformed_clip_test_norm, transformed_sentence_test_norm,
                self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(
                cross_modal_vec_test,
                "vs_multilayer_lt",
                reuse=True,
                middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])
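            # at test time the batch is 1, so the output collapses to a single
            # (alignment score, p_reg, l_reg) triple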

            return sim_score_mat_train, sim_score_mat_test

    '''
    compute the alignment and regression losses
    '''

    def compute_loss_reg(self, sim_reg_mat, offset_label):

        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
        sim_score_mat = tf.reshape(sim_score_mat,
                                   [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # diagonal matrix with -2 on the diagonal
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #              | -1   1   1 ... |
        #   mask_mat = |  1  -1   1 ... |
        #              |  1   1  -1 ... |
        mask_mat = tf.add(I_2, all1)
        # loss cls, not considering iou
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha,
                                     shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I, batch_para_mat)
        loss_mat = tf.log(tf.add(all1, tf.exp(tf.multiply(mask_mat,
                                                          sim_score_mat))))
        loss_mat = tf.multiply(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
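        # With mask_mat and para_mat as above, this is the CTRL-style
        # alignment loss:
        #   L_aln = (1/B^2) * sum_ij w_ij * log(1 + exp(m_ij * s_ij)),
        # where m_ii = -1 (pull matched pairs up), m_ij = +1 otherwise
        # (push mismatched pairs down), w_ii = 1 + alpha, w_ij = alpha.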
        # regression loss
        l_reg_diag = tf.matmul(tf.multiply(l_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.multiply(p_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat((p_reg_diag, l_reg_diag), 1)
        loss_reg = tf.reduce_mean(
            tf.abs(tf.subtract(offset_pred, offset_label)))

        loss = tf.add(tf.multiply(self.lambda_regression, loss_reg),
                      loss_align)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32,
            shape=(
                self.batch_size, self.visual_feature_dim,
                2 * self.context_num +
                1))  # input feature: current clip, pre-context, and post-context
        sentence_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32,
            shape=(
                self.test_batch_size, self.visual_feature_dim,
                2 * self.context_num +
                1))  # input feature: current clip, pre-context, and post-context
        sentence_ph_test = tf.placeholder(tf.float32,
                                          shape=(self.test_batch_size,
                                                 self.sentence_embedding_size))
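
        # Note: the visual placeholders are feature-major,
        # [batch, visual_feature_dim, 2 * context_num + 1], with the
        # (pre-context, center, post-context) clips along the last axis;
        # visual_semantic_infer transposes them to [batch, num_clips, feature].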

        return visual_featmap_ph_train, sentence_ph_train, offset_ph, visual_featmap_ph_test, sentence_ph_test

    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    def training(self, loss):

        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op

    def construct_model(self):
        # initialize the placeholder
        (self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph,
         self.visual_featmap_ph_test,
         self.sentence_ph_test) = self.init_placeholder()

        # build inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(
            self.visual_featmap_ph_train, self.sentence_ph_train,
            self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(
            sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg
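
# A minimal, hypothetical usage sketch for ACRN_Model (the paths, batch size,
# pool size, and step count below are placeholder assumptions, not part of the
# original code):
#
#   model = ACRN_Model(batch_size=56, pool_size=4,
#                      train_csv_path='train.csv', test_csv_path='test.csv',
#                      test_visual_feature_dir='feats/test/',
#                      train_visual_feature_dir='feats/train/')
#   loss, train_op, sim_test, offset_pred, loss_reg = model.construct_model()
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       for step in range(10000):
#           feed = model.fill_feed_dict_train_reg()
#           _, loss_val = sess.run([train_op, loss], feed_dict=feed)
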
class CTRL_Model(object):
    def __init__(self, batch_size, train_csv_path, test_csv_path,
                 test_visual_feature_dir, train_visual_feature_dir):

        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.005
        self.lambda_regression = 0.01
        self.alpha = 1.0 / batch_size
        self.semantic_size = 1024  # dimensionality of the shared visual-semantic embedding space
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096 * 3
        self.train_set = TrainingDataSet(train_visual_feature_dir,
                                         train_csv_path, self.batch_size)
        self.test_set = TestingDataSet(test_visual_feature_dir, test_csv_path,
                                       self.test_batch_size)

    '''
    used in training the alignment model, CTRL(aln)
    '''

    def fill_feed_dict_train(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    '''
    used in training the alignment+regression model, CTRL(reg)
    '''

    def fill_feed_dict_train_reg(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()
        input_feed = {
            self.visual_featmap_ph_train: image_batch,
            self.sentence_ph_train: sentence_batch,
            self.offset_ph: offset_batch
        }

        return input_feed

    '''
    cross-modal processing module
    '''

    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
                                [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed, [1, batch_size]),
                                [batch_size, batch_size, self.semantic_size])
        concat_feature = tf.reshape(
            tf.concat([vv_feature, ss_feature], 2),
            [batch_size, batch_size, self.semantic_size + self.semantic_size])
        print concat_feature.get_shape().as_list()
        mul_feature = tf.multiply(vv_feature, ss_feature)
        add_feature = tf.add(vv_feature, ss_feature)

        comb_feature = tf.reshape(
            tf.concat([mul_feature, add_feature, concat_feature], 2),
            [1, batch_size, batch_size, self.semantic_size * 4])
        return comb_feature
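
    # Shape recap for the combination above (with B = batch_size and
    # d = semantic_size): mul_feature and add_feature are elementwise fusions
    # of shape [B, B, d], concat_feature is [B, B, 2d], so comb_feature stacks
    # all three into [1, B, B, 4d] for the conv-style vs_multilayer head.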

    '''
    visual-semantic inference, including visual-semantic alignment and clip location regression
    '''

    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                              visual_feature_test, sentence_embed_test):
        name = "CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"
            transformed_clip_train = fc('v2s_lt',
                                        visual_feature_train,
                                        output_dim=self.semantic_size)
            transformed_clip_train_norm = tf.nn.l2_normalize(
                transformed_clip_train, dim=1)
            transformed_sentence_train = fc('s2s_lt',
                                            sentence_embed_train,
                                            output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(
                transformed_sentence_train, dim=1)
            cross_modal_vec_train = self.cross_modal_comb(
                transformed_clip_train_norm, transformed_sentence_train_norm,
                self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(
                cross_modal_vec_train,
                "vs_multilayer_lt",
                middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(
                sim_score_mat_train, [self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n"
            transformed_clip_test = fc('v2s_lt',
                                       visual_feature_test,
                                       output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(
                transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt',
                                           sentence_embed_test,
                                           output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(
                transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(
                transformed_clip_test_norm, transformed_sentence_test_norm,
                self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(
                cross_modal_vec_test,
                "vs_multilayer_lt",
                reuse=True,
                middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train, sim_score_mat_test

    '''
    compute the alignment and regression losses
    '''

    def compute_loss_reg(self, sim_reg_mat, offset_label):

        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(sim_reg_mat, 3, 2)
        sim_score_mat = tf.reshape(sim_score_mat,
                                   [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # diagonal matrix with -2 on the diagonal
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #              | -1   1   1 ... |
        #   mask_mat = |  1  -1   1 ... |
        #              |  1   1  -1 ... |
        mask_mat = tf.add(I_2, all1)
        # loss cls, not considering iou
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha,
                                     shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I, batch_para_mat)
        loss_mat = tf.log(
            tf.add(all1, tf.exp(tf.multiply(mask_mat, sim_score_mat))))
        loss_mat = tf.multiply(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
        # regression loss
        l_reg_diag = tf.matmul(tf.multiply(l_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.multiply(p_reg_mat, I),
                               tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat((p_reg_diag, l_reg_diag), 1)
        loss_reg = tf.reduce_mean(
            tf.abs(tf.subtract(offset_pred, offset_label)))
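        # Regression recap: multiplying by I keeps only the diagonal (matched
        # clip-sentence pairs) of p_reg_mat / l_reg_mat; the matmul with an
        # all-ones column row-sums that to a [B, 1] column, and the concat
        # yields [B, 2] predicted (start, end) offsets for the L1 loss above.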

        loss = tf.add(tf.multiply(self.lambda_regression, loss_reg),
                      loss_align)
        return loss, offset_pred, loss_reg

    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        sentence_ph_train = tf.placeholder(
            tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size, 2))
        visual_featmap_ph_test = tf.placeholder(
            tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        sentence_ph_test = tf.placeholder(tf.float32,
                                          shape=(self.test_batch_size,
                                                 self.sentence_embedding_size))

        return visual_featmap_ph_train, sentence_ph_train, offset_ph, visual_featmap_ph_test, sentence_ph_test

    def get_variables_by_name(self, name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <" + name + ">"
            for v in v_dict[name]:
                print "    " + v.name
        return v_dict

    def training(self, loss):

        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op

    def construct_model(self):
        # initialize the placeholder
        (self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph,
         self.visual_featmap_ph_test,
         self.sentence_ph_test) = self.init_placeholder()
        # build inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(
            self.visual_featmap_ph_train, self.sentence_ph_train,
            self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(
            sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg