def visual_semantic_infer(self, visual_feature_train_pos, visual_feature_train_neg, sentence_embed_train, visual_feature_test, sentence_embed_test):
        name="CTRL_Model"
        with tf.variable_scope(name):
            print("Building training network...............................\n")
            transformed_clip_train_mix = fc('v2s_lt', tf.concat([visual_feature_train_pos, visual_feature_train_neg], 0), output_dim=self.semantic_size)
            transformed_clip_train_norm_mix = tf.nn.l2_normalize(transformed_clip_train_mix, dim=1)

            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
            cross_modal_vec_train_mix = self.cross_modal_comb(transformed_clip_train_norm_mix,
                                                              tf.tile(transformed_sentence_train_norm, [2,1]),
                                                              self.batch_size)

            sim_score_mat_train_mix = vs_multilayer.vs_multilayer(cross_modal_vec_train_mix, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train_mix = tf.reshape(sim_score_mat_train_mix, [self.batch_size*2, 3])

            tf.get_variable_scope().reuse_variables()
            print("Building test network...............................\n")
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train_mix, sim_score_mat_test
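A minimal driver sketch for this graph, assuming `model` is an instance of this class with `batch_size` set and `test_batch_size == 1`, and assuming illustrative feature dims (4096 visual, 4800 sentence) that may differ from the real config:

    # Hedged usage sketch (TF 1.x); placeholder shapes are assumptions.
    import tensorflow as tf

    visual_pos = tf.placeholder(tf.float32, [model.batch_size, 4096])  # positive clips
    visual_neg = tf.placeholder(tf.float32, [model.batch_size, 4096])  # negative clips
    sentence = tf.placeholder(tf.float32, [model.batch_size, 4800])    # sentence embeddings
    visual_t = tf.placeholder(tf.float32, [1, 4096])                   # one test clip
    sentence_t = tf.placeholder(tf.float32, [1, 4800])                 # one test sentence

    train_scores, test_scores = model.visual_semantic_infer(
        visual_pos, visual_neg, sentence, visual_t, sentence_t)
    # train_scores: [batch_size * 2, 3]; test_scores: [3]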
Example #2
 def eval(self, visual_feature_test):
     outputs = vs_multilayer.vs_multilayer(visual_feature_test,
                                           "PATE",
                                           middle_layer_dim=1000,
                                           reuse=True)
     outputs = tf.reshape(outputs, [2])
     return outputs
Example #3
    def compute_loss_reg(self, visual_feature, offsets, labels):

        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature,
                                                  "APN",
                                                  middle_layer_dim=1000)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])  # [128,4]
        """
        cls_score_vec_0 : (128,1)
        cls_score_vec_1 : (128,1)
        p_reg_vec : (128,1)
        l_reg_vec : (128,1)
        
        """
        # 将分类和回归向量拆分和组合
        cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(
            1, 4, cls_reg_vec)
        cls_score_vec = tf.concat(1, (cls_score_vec_0, cls_score_vec_1))
        offset_pred = tf.concat(1, (p_reg_vec, l_reg_vec))

        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)

        # offset_pred: coordinate offsets predicted by the final fully connected layer
        # offsets: ground-truth coordinate offsets
        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg
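A hedged sketch of wiring this loss into a training step; `model`, the 4096 feature dim, and the learning rate are illustrative assumptions, not values taken from this repo:

    import tensorflow as tf

    features = tf.placeholder(tf.float32, [model.batch_size, 4096])
    offsets = tf.placeholder(tf.float32, [model.batch_size, 2])  # ground-truth offsets
    labels = tf.placeholder(tf.int32, [model.batch_size])        # class labels

    loss, offset_pred, loss_reg = model.compute_loss_reg(features, offsets, labels)
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)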
Example #4
    def eval(self, central, start, end):
        central_cls, start_reg, end_reg = vs_multilayer.vs_multilayer(
            central, start, end, "BLA", reuse=True)
        outputs = tf.concat((central_cls, start_reg, end_reg), 1)
        outputs = tf.reshape(outputs, [4])
        print("eval output size: " + str(outputs.get_shape().as_list()))

        return outputs
Example #5
 def eval(self, visual_feature_test):
     # visual_feature_test=tf.reshape(visual_feature_test,[1,4096])
     outputs = vs_multilayer.vs_multilayer(visual_feature_test,
                                           "APN",
                                           middle_layer_dim=1000,
                                           reuse=True)
     outputs = tf.reshape(outputs, [4])
     return outputs
Example #6
    def compute_loss(self, visual_feature, labels):
        # vs_multilayer outputs a 2-dim vector indicating whether the
        # proposal can be correctly generated by TAG
        cls_vec = vs_multilayer.vs_multilayer(visual_feature, "PATE", middle_layer_dim=1000)
        cls_vec = tf.reshape(cls_vec, [self.batch_size, 2])

        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        return loss_cls
Example #7
 def eval(self, visual_feature_test):
     sim_score = vs_multilayer.vs_multilayer(
         visual_feature_test,
         "CBR",
         middle_layer_dim=self.middle_layer_size,
         output_layer_dim=(self.action_class_num + 1) * 3,
         dropout=False,
         reuse=True)
     sim_score = tf.reshape(sim_score, [(self.action_class_num + 1) * 3])
     return sim_score
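The flat (action_class_num + 1) * 3 vector packs classification scores, start offsets, and end offsets in that order (see the predict docstring in Example #13). A small NumPy sketch of how a caller might unpack a fetched score vector; the helper name is hypothetical:

    import numpy as np

    def unpack_cbr_scores(sim_score, action_class_num):
        k = action_class_num + 1
        cls_scores = sim_score[:k]          # per-class scores, incl. background
        start_offsets = sim_score[k:2 * k]  # per-class start offsets
        end_offsets = sim_score[2 * k:]     # per-class end offsets
        pred = int(np.argmax(cls_scores))
        return pred, start_offsets[pred], end_offsets[pred]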
Example #8
    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test):
        name="CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"     
            transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size) 
            transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)  
            cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n" 
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train, sim_score_mat_test
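Channel 0 of the [batch_size, batch_size, 3] output holds a clip/sentence alignment score for every pair (the other two channels are offsets). A sketch in the spirit of the CTRL alignment loss, where aligned (diagonal) pairs are pulled up and misaligned pairs are pushed down with a small weight alpha; the constants and masking here are assumptions, not this repo's exact loss:

    import tensorflow as tf

    def alignment_loss(sim_score_mat_train, batch_size, alpha=1.0 / 56):
        sim = sim_score_mat_train[:, :, 0]  # [B, B] alignment scores
        # I: -1 on the diagonal (aligned pairs), +1 off-diagonal
        I = tf.diag(tf.constant(-2.0, shape=[batch_size])) + tf.ones([batch_size, batch_size])
        # weight aligned pairs by 1, misaligned pairs by alpha
        mask = tf.diag(tf.ones([batch_size])) * (1.0 - alpha) + alpha * tf.ones([batch_size, batch_size])
        loss_mat = tf.log(1.0 + tf.exp(I * sim))  # logistic loss per pair
        return tf.reduce_mean(mask * loss_mat)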
Example #9
    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test,
                              sentence_ph_train_len, sentence_ph_test_len):

        name="CTRL_Model"
        with tf.variable_scope(name):
            print("Building training network...............................\n")
            transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size) 
            transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)

            if self.useLSTM:
                sentence_embed_train = self.lstm_embed(sentence_embed_train, sentence_ph_train_len)

            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)  
            cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print("Building test network...............................\n")
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)

            if self.useLSTM:
                sentence_embed_test = self.lstm_embed(sentence_embed_test, sentence_ph_test_len)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)

            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [self.test_batch_size, self.test_batch_size, 3])

            cross_modal_vec_test_1 = self.cross_modal_comb(tf.reshape(transformed_clip_test_norm[1], shape=(1,1024)),
                                                           tf.reshape(transformed_sentence_test_norm[1], shape=(1,1024)), 1)
            sim_score_mat_test_1 = vs_multilayer.vs_multilayer(cross_modal_vec_test_1, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test_1 = tf.reshape(sim_score_mat_test_1, [3])
            return sim_score_mat_train, sim_score_mat_test, sim_score_mat_test_1
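The `self.lstm_embed` helper is not shown in this listing. A minimal sketch of what such a helper could look like, assuming padded word embeddings plus true sequence lengths, with the LSTM's final hidden state serving as the sentence embedding (hidden size 1024 is an assumption):

    import tensorflow as tf

    def lstm_embed(word_embeds, seq_lens, hidden_size=1024):
        # word_embeds: [batch, max_len, word_dim]; seq_lens: [batch]
        cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
        _, state = tf.nn.dynamic_rnn(cell, word_embeds,
                                     sequence_length=seq_lens, dtype=tf.float32)
        return state.h  # final hidden state, [batch, hidden_size]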
Example #10
    def compute_loss_reg(self, central, start, end, offsets, labels):

        central_cls, start_reg, end_reg = vs_multilayer.vs_multilayer(
            central, start, end, "BLA")
        offset_pred = tf.concat((start_reg, end_reg), 1)

        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=central_cls, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg
Example #11
    def compute_loss_reg(self, visual_feature, offsets, labels):

        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature, "APN", middle_layer_dim=1000)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])
        cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(cls_reg_vec, 4, 1)
        cls_score_vec = tf.concat((cls_score_vec_0, cls_score_vec_1), 1)
        offset_pred = tf.concat((p_reg_vec, l_reg_vec), 1)

        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg
Example #12
    def compute_loss_reg(self, visual_feature, offsets, labels,
                         one_hot_labels):

        cls_reg_vec = vs_multilayer.vs_multilayer(
            visual_feature,
            "CBR",
            middle_layer_dim=self.middle_layer_size,
            output_layer_dim=(self.action_class_num + 1) * 3)
        cls_reg_vec = tf.reshape(
            cls_reg_vec, [self.batch_size,
                          (self.action_class_num + 1) * 3])  # [128,21*3]
        cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1]
        start_offset_pred = cls_reg_vec[:, self.action_class_num +
                                        1:(self.action_class_num + 1) * 2]
        end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:]

        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)

        # regression loss
        pick_start_offset_pred = []
        pick_end_offset_pred = []
        # pick, for the k-th sample, the regression value belonging to its own
        # class (see the regression formulation in the paper)
        for k in range(self.batch_size):
            pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
            pick_end_offset_pred.append(end_offset_pred[k, labels[k]])
        pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred),
                                            [self.batch_size, 1])
        pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred),
                                          [self.batch_size, 1])
        # labels stores the class index of each sample; only foreground
        # samples (label != 0) contribute to the regression loss
        labels_1 = tf.to_float(tf.not_equal(labels, 0))
        label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)  # concatenated along columns, [128, 2]
        offset_pred = tf.concat(
            (pick_start_offset_pred, pick_end_offset_pred), 1)  # [128, 2]
        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)), label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, loss_reg
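The Python loop above adds batch_size slice ops to the graph. A hedged, equivalent vectorized selection with tf.gather_nd (same semantics, assuming int32 labels of shape [batch_size]):

    import tensorflow as tf

    def pick_per_class_offsets(start_offset_pred, end_offset_pred, labels, batch_size):
        # idx[k] = (k, labels[k]): row k, the column of its own class
        idx = tf.stack([tf.range(batch_size), labels], axis=1)  # [B, 2]
        pick_start = tf.reshape(tf.gather_nd(start_offset_pred, idx), [batch_size, 1])
        pick_end = tf.reshape(tf.gather_nd(end_offset_pred, idx), [batch_size, 1])
        return pick_start, pick_end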
Example #13
    def predict(self, visual_feature_test):
        """Inference during testing

        Args:
            visual_feature_test: Tensor, feature,  (test_batch_size, visual_feature_dim)

        Returns:
            sim_score: Tensor, (action_class_num+1)*3 (Note: [0:action_class_num+1]: classification scores;
                [action_class_num+1:(action_class_num+1)*2]: start offsets; [(action_class_num+1)*2:(action_class_num+1)*3]: end offsets)

        """
        print('To predict the label')

        sim_score = vs_multilayer.vs_multilayer(
            visual_feature_test,
            "CBR",
            middle_layer_dim=self.middle_layer_size,
            class_num=self.action_class_num,
            dropout=False,
            reuse=True)
        sim_score = tf.reshape(sim_score, [(self.action_class_num + 1) * 3])

        return sim_score
Example #14
    def compute_loss_reg(self, visual_feature, offsets, labels, test=False):

        cls_reg_vec = vs_multilayer.vs_multilayer(visual_feature,
                                                  "APN",
                                                  middle_layer_dim=1000,
                                                  test=test)
        cls_reg_vec = tf.reshape(cls_reg_vec, [self.batch_size, 4])
        cls_score_vec_0, cls_score_vec_1, p_reg_vec, l_reg_vec = tf.split(
            cls_reg_vec, 4, 1)
        cls_score_vec = tf.concat((cls_score_vec_0, cls_score_vec_1), 1)
        offset_pred = tf.concat((p_reg_vec, l_reg_vec), 1)
        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)
        # regression loss
        label_tmp = tf.to_float(tf.reshape(labels, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)),
                        label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)
        return loss, offset_pred, loss_reg
Example #15
 def eval(self, visual_feature_test):
     # visual_feature_test = tf.reshape(visual_feature_test, [1, 4096])
     outputs = vs_multilayer.vs_multilayer(visual_feature_test, "APN",
                                           middle_layer_dim=1000, reuse=True)
     outputs = tf.reshape(outputs, [4])
     return outputs
Example #16
    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train,
                              visual_feature_test, sentence_embed_test):
        name = "CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"
            """ embedding into common space dim 1024"""
            visual_feature_train = tf.transpose(visual_feature_train,
                                                [0, 2, 1])  # batch num fea
            inputs = tf.reshape(visual_feature_train,
                                [-1, self.visual_feature_dim])  #batch x num,fe
            transformed_clip_train = fc(
                'v2s_lt', inputs,
                output_dim=self.semantic_size)  # batch x num, embed
            transformed_clip_train = tf.reshape(transformed_clip_train, [
                self.batch_size, 2 * self.context_num + 1, self.semantic_size
            ])  #batch num embe
            transformed_sentence_train = fc(
                's2s_lt', sentence_embed_train,
                output_dim=self.semantic_size)  # batch, embed
            #### attention part
            print "attention part tanh(sum(x_1:t))*tanh(s) "
            concat_previous_feature = tf.zeros(
                [self.batch_size, 1, self.semantic_size])
            for j in range(2 * self.context_num):
                now = tf.slice(transformed_clip_train, [0, 0, 0],
                               [-1, j + 1, -1])
                #    print now.get_shape().as_list()
                now = tf.reduce_sum(now, 1)
                #    print now.get_shape().as_list()
                now = tf.expand_dims(now, 1)
                #    print now.get_shape().as_list()

                concat_previous_feature = tf.concat(
                    [concat_previous_feature, now], 1)  # batch num embed
            v = tf.tanh(tf.add(transformed_clip_train,
                               concat_previous_feature))
            relu_t = tf.tanh(transformed_sentence_train)  #batch, embed
            concat_text = tf.reshape(
                tf.tile(relu_t, [1, 2 * self.context_num + 1]), [
                    self.batch_size, 2 * self.context_num + 1,
                    self.semantic_size
                ])  # batch cont_num embed
            # computing weight a
            e = tf.reduce_sum(tf.multiply(concat_text, v), 2)  # batch cont_num
            alpha = tf.nn.softmax(e)  # batch, num_ctx
            a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]), [
                self.batch_size, self.semantic_size, 2 * self.context_num + 1
            ])  # batch 4096 cont_num
            visual_feature_train = tf.transpose(transformed_clip_train,
                                                [0, 2, 1])  # batch embed num
            input_vision = tf.reduce_sum(tf.multiply(visual_feature_train, a),
                                         2)  #batch embed

            transformed_clip_train_norm = tf.nn.l2_normalize(input_vision,
                                                             dim=1)
            transformed_sentence_train_norm = tf.nn.l2_normalize(
                transformed_sentence_train, dim=1)

            # print transformed_clip_train_norm.shape
            # print transformed_sentence_train_norm.shape
            # exit()
            cross_modal_vec_train = self.cross_modal_comb(
                transformed_clip_train_norm, transformed_sentence_train_norm,
                self.batch_size)  # batch batch 2*conmmon_space_dim
            # print cross_modal_vec_train.shape
            # exit()
            sim_score_mat_train = vs_multilayer.vs_multilayer(
                cross_modal_vec_train,
                "vs_multilayer_lt",
                middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(
                sim_score_mat_train, [self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n"
            visual_feature_test = tf.transpose(visual_feature_test,
                                               [0, 2, 1])  # batch num fea
            inputs = tf.reshape(visual_feature_test,
                                [-1, self.visual_feature_dim])  #batch x num,fe
            transformed_clip_test = fc('v2s_lt',
                                       inputs,
                                       output_dim=self.semantic_size)
            transformed_clip_test = tf.reshape(transformed_clip_test, [
                self.test_batch_size, 2 * self.context_num + 1,
                self.semantic_size
            ])  #batch num embe
            transformed_sentence_test = fc('s2s_lt',
                                           sentence_embed_test,
                                           output_dim=self.semantic_size)
            #### attention part
            print "attention part tanh(sum(x_1:t))*tanh(s) "
            concat_previous_feature = tf.zeros(
                [self.test_batch_size, 1, self.semantic_size])
            for j in range(2 * self.context_num):
                now = tf.slice(transformed_clip_test, [0, 0, 0],
                               [-1, j + 1, -1])
                print(now.get_shape().as_list())
                now = tf.reduce_sum(now, 1)
                print(now.get_shape().as_list())
                now = tf.expand_dims(now, 1)
                print(now.get_shape().as_list())
                concat_previous_feature = tf.concat(
                    [concat_previous_feature, now], 1)  # batch num embed
            v = tf.tanh(tf.add(transformed_clip_test,
                               concat_previous_feature))  # batchx num, embed
            relu_t = tf.tanh(transformed_sentence_test)  #batch, feature_embed

            concat_text = tf.reshape(
                tf.tile(relu_t, [1, 2 * self.context_num + 1]), [
                    self.test_batch_size, 2 * self.context_num + 1,
                    self.semantic_size
                ])  # batch cont_num feature

            e = tf.reduce_sum(tf.multiply(concat_text, v), 2)  # batch cont_num

            alpha = tf.nn.softmax(e)  # batch, num_ctx
            a = tf.reshape(tf.tile(alpha, [1, self.semantic_size]), [
                self.test_batch_size, self.semantic_size,
                2 * self.context_num + 1
            ])  # batch embed cont_num
            visual_feature_test = tf.transpose(transformed_clip_test,
                                               [0, 2, 1])
            input_vision = tf.reduce_sum(tf.multiply(visual_feature_test, a),
                                         2)  # batch embed
            transformed_clip_test_norm = tf.nn.l2_normalize(input_vision,
                                                            dim=1)
            transformed_sentence_test_norm = tf.nn.l2_normalize(
                transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(
                transformed_clip_test_norm, transformed_sentence_test_norm,
                self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(
                cross_modal_vec_test,
                "vs_multilayer_lt",
                reuse=True,
                middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train, sim_score_mat_test
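The attention above scores clip t as tanh(x_t + sum of the earlier clips) dotted with tanh(sentence), then softmax-normalizes over the 2 * context_num + 1 clips. A compact NumPy restatement of that weighting for a single sample (shapes illustrative):

    import numpy as np

    def attention_weights(clips, sentence):
        # clips: [T, D] embedded clip features; sentence: [D]
        prev_sum = np.cumsum(clips, axis=0) - clips  # sum of x_1..x_{t-1}; zero at t=1
        v = np.tanh(clips + prev_sum)                # [T, D]
        e = (v * np.tanh(sentence)).sum(axis=1)      # [T] attention energies
        alpha = np.exp(e - e.max())
        return alpha / alpha.sum()                   # softmax over clips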
Example #17
    def add_loss_op(self,
                    visual_feature,
                    offsets,
                    labels,
                    one_hot_labels,
                    name='CBR'):
        """This function is to compute the loss in tensorflow graph

        Args:
            visual_feature: Tensor, feature, (batch_size, visual_feature_dim)
            offsets: Tensor, boundary offset(both to the start and end in frame-level), (batch_size, 2)
            labels: Tensor, label, (batch_size)
            one_hot_labels: Tensor, one hot label, (batch_size, action_class_num+1)

        Returns:
            loss: loss_cls + lambda_reg * loss_reg
            loss_reg: L1 loss between ground truth offsets and prediction offsets
            loss_cls: cross entropy loss

        """
        print('Add the standard loss')

        cls_reg_vec = vs_multilayer.vs_multilayer(
            visual_feature,
            name,
            middle_layer_dim=self.middle_layer_size,
            class_num=self.action_class_num,
            dropout=self.config.dropout)

        cls_reg_vec = tf.reshape(
            cls_reg_vec, [self.batch_size, (self.action_class_num + 1) * 3])
        cls_score_vec = cls_reg_vec[:, :self.action_class_num + 1]
        start_offset_pred = cls_reg_vec[:, self.action_class_num +
                                        1:(self.action_class_num + 1) * 2]
        end_offset_pred = cls_reg_vec[:, (self.action_class_num + 1) * 2:]

        # l1 loss
        loss_l1 = tf.reduce_mean(tf.abs(cls_score_vec))

        # classification loss
        loss_cls_vec = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cls_score_vec, labels=labels)
        loss_cls = tf.reduce_mean(loss_cls_vec)

        # regression loss
        pick_start_offset_pred = []
        pick_end_offset_pred = []
        for k in range(self.batch_size):

            pick_start_offset_pred.append(start_offset_pred[k, labels[k]])
            pick_end_offset_pred.append(end_offset_pred[k, labels[k]])

        pick_start_offset_pred = tf.reshape(tf.stack(pick_start_offset_pred),
                                            [self.batch_size, 1])
        pick_end_offset_pred = tf.reshape(tf.stack(pick_end_offset_pred),
                                          [self.batch_size, 1])
        labels_1 = tf.to_float(tf.not_equal(labels, 0))
        label_tmp = tf.to_float(tf.reshape(labels_1, [self.batch_size, 1]))
        label_for_reg = tf.concat([label_tmp, label_tmp], 1)
        offset_pred = tf.concat((pick_start_offset_pred, pick_end_offset_pred),
                                1)

        loss_reg = tf.reduce_mean(
            tf.multiply(tf.abs(tf.subtract(offset_pred, offsets)),
                        label_for_reg))

        loss = tf.add(tf.multiply(self.lambda_reg, loss_reg), loss_cls)

        if self.config.l1_loss:
            loss = tf.add(loss, loss_l1)

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("loss_reg", loss_reg)
        tf.summary.scalar("loss_cls", loss_cls)

        return loss, loss_reg, loss_cls
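A hedged training-loop sketch that consumes the returned losses and the scalar summaries registered above; `model`, the feed shapes, the log directory, and `batches` are illustrative assumptions:

    import tensorflow as tf

    features = tf.placeholder(tf.float32, [model.batch_size, 4096])
    offsets = tf.placeholder(tf.float32, [model.batch_size, 2])
    labels = tf.placeholder(tf.int32, [model.batch_size])
    one_hot = tf.one_hot(labels, model.action_class_num + 1)

    loss, loss_reg, loss_cls = model.add_loss_op(features, offsets, labels, one_hot)
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
    merged = tf.summary.merge_all()  # picks up the loss/loss_reg/loss_cls scalars

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter("./logs", sess.graph)
        for step, feed in enumerate(batches):  # `batches`: iterable of feed_dicts
            _, summary = sess.run([train_op, merged], feed_dict=feed)
            writer.add_summary(summary, step)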