Example #1
0
    def discriminator(self, v_feature, q_feature, reuse_flag=False):
        """Score a (video feature, question feature) pair with a fusion network.

        Both inputs are projected to ``self.hidden_size``, combined by
        element-wise sum and product, passed through one more linear layer,
        and finally reduced to a single logit per example.

        Args:
            v_feature: video-side feature tensor. Assumed rank 2
                (batch, dim), since the fusion concatenates on axis 1 and the
                result is squeezed on axis 1 -- confirm against
                ``layers.linear_layer``.
            q_feature: question-side feature tensor, same assumption.
            reuse_flag: forwarded as ``reuse`` to the "discriminator" variable
                scope; pass True on the second and later calls so the weights
                are shared.

        Returns:
            The output of the final 1-unit linear layer with axis 1 squeezed
            away (raw logits, no sigmoid applied here).
        """
        with tf.variable_scope("discriminator", reuse=reuse_flag):
            # Project both modalities into a common hidden space. Dropout on
            # these projections is intentionally not applied.
            projected_v = layers.linear_layer(
                v_feature, self.hidden_size, scope_name='v_transfer')
            projected_q = layers.linear_layer(
                q_feature, self.hidden_size, scope_name='q_transfer')

            # Two cheap interaction views of the pair: additive and
            # multiplicative.
            summed = projected_v + projected_q
            product = projected_v * projected_q

            # A learned mixture of the two views ...
            mixed = layers.linear_layer(
                tf.concat([summed, product], axis=1),
                self.hidden_size,
                scope_name='fc_transfer')
            # ... concatenated with the raw views before the output layer.
            fused = tf.concat([summed, product, mixed], axis=1)

            with tf.variable_scope("output"):
                logits = layers.linear_layer(fused, 1, scope_name='output')
                return tf.squeeze(logits, 1)
Example #2
0
    def build_model(self):
        """Build the TF1 graph: encoders, attention, adversarial frame scoring, pointer head.

        Reads ``self.max_words``, ``self.input_ques_dim``, ``self.max_frames``,
        ``self.input_video_dim``, ``self.dropout``, ``self.hidden_size`` and
        ``self.regularization_beta``.

        Sets on ``self``:
            * placeholders: ``ques_vecs``, ``ques_len``, ``frame_vecs``,
              ``frame_len``, ``batch_size``, ``is_training``, ``gt_predict``,
              ``gt_windows``
            * masks / features: ``frame_mask``, ``ques_mask``, ``q_feature``,
              ``v2q``, ``q2v``
            * variable groups: ``G_variables``, ``D_variables``,
              ``pn_variables``
            * losses: ``reg_loss``, ``dist_loss``, ``G_pre_loss``, ``D_loss``,
              ``G_loss``, ``pn_loss``
            * predictions: ``frame_score``, ``predict_start_end``

        The three variable groups are defined purely by *when*
        ``tf.trainable_variables()`` is called, so the order of the sections
        below must not be changed.

        Returns None. Prints the three variable lists to stdout.
        """
        # input layer (batch_size, n_steps, input_dim)
        self.ques_vecs = tf.placeholder(
            tf.float32, [None, self.max_words, self.input_ques_dim])
        self.ques_len = tf.placeholder(tf.int32, [None])
        self.frame_vecs = tf.placeholder(
            tf.float32, [None, self.max_frames, self.input_video_dim])
        self.frame_len = tf.placeholder(tf.int32, [None])
        # NOTE: batch_size is created here but not consumed anywhere in this method.
        self.batch_size = tf.placeholder(tf.int32, [])
        self.is_training = tf.placeholder(tf.bool)
        # Per-frame ground-truth labels and the ground-truth (start, end) window.
        self.gt_predict = tf.placeholder(tf.float32, [None, self.max_frames])
        self.gt_windows = tf.placeholder(tf.float32, [None, 2])

        # Boolean masks marking the valid (non-padded) frames / words.
        self.frame_mask = tf.sequence_mask(self.frame_len,
                                           maxlen=self.max_frames)
        self.ques_mask = tf.sequence_mask(self.ques_len, maxlen=self.max_words)

        with tf.variable_scope("Frame_Embedding_Encoder_Layer"):
            # self.dropout is passed as the second positional argument of
            # tf.contrib.layers.dropout, i.e. it is a keep probability.
            input_frame_vecs = tf.contrib.layers.dropout(
                self.frame_vecs, self.dropout, is_training=self.is_training)
            frame_embedding, _ = layers.dynamic_origin_bilstm_layer(
                input_frame_vecs,
                self.hidden_size,
                'frame_embedding',
                input_len=self.frame_len)

            frame_embedding = tf.contrib.layers.dropout(
                frame_embedding, self.dropout, is_training=self.is_training)

        with tf.variable_scope("Ques_Embedding_Encoder_Layer"):

            input_ques_vecs = tf.contrib.layers.dropout(
                self.ques_vecs, self.dropout, is_training=self.is_training)
            ques_embedding, ques_states = layers.dynamic_origin_bilstm_layer(
                input_ques_vecs,
                self.hidden_size,
                'ques_embedding',
                input_len=self.ques_len)

            ques_embedding = tf.contrib.layers.dropout(
                ques_embedding, self.dropout, is_training=self.is_training)

            # Sentence-level question feature built from the encoder's final states.
            # Presumably ques_states is (forward, backward) LSTM state tuples and
            # index [1] selects the hidden state h -- TODO confirm against
            # layers.dynamic_origin_bilstm_layer.
            q_feature = tf.concat([ques_states[0][1], ques_states[1][1]], 1)
            self.q_feature = tf.contrib.layers.dropout(
                q_feature, self.dropout, is_training=self.is_training)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):

            # Earlier single-direction attention variant, kept for reference:
            # att_score = tf.matmul(frame_embedding, ques_embedding, transpose_b=True)  # M*N1*K  ** M*N2*K  --> M*N1*N2
            # att_score = tf.nn.softmax(mask_logits(att_score, mask=tf.expand_dims(self.ques_mask, 1)))
            #
            # length = tf.cast(tf.shape(ques_embedding), tf.float32)
            # att_out = tf.matmul(att_score, ques_embedding) * length[1] * tf.sqrt(
            #     1.0 / length[1])  # M*N1*N2  ** M*N2*K   --> M*N1*k
            #
            # attention_outputs = tf.concat([frame_embedding, att_out, tf.multiply(frame_embedding,att_out)])

            # Frame-by-word similarity matrix.
            att_score = tf.matmul(
                frame_embedding, ques_embedding,
                transpose_b=True)  # M*N1*K  ** M*N2*K  --> M*N1*N2
            # S_: softmax over the last (word) axis, with the question mask
            # broadcast across frames.
            mask_q = tf.expand_dims(self.ques_mask, 1)
            S_ = tf.nn.softmax(layers.mask_logits(att_score, mask=mask_q))
            # S_T: softmax over axis 1 (frames), with the frame mask broadcast
            # across words, then transposed to (batch, words, frames).
            mask_v = tf.expand_dims(self.frame_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(layers.mask_logits(att_score, mask=mask_v),
                              axis=1), (0, 2, 1))
            # v2q: per-frame attention-weighted sum of question embeddings.
            self.v2q = tf.matmul(S_, ques_embedding)
            # q2v: frames re-weighted through the frame->word->frame product S_ * S_T.
            self.q2v = tf.matmul(tf.matmul(S_, S_T), frame_embedding)
            attention_outputs = tf.concat([
                frame_embedding, self.v2q, frame_embedding * self.v2q,
                frame_embedding * self.q2v
            ], 2)

        with tf.variable_scope("Model_Encoder_Layer"):
            attention_outputs = tf.contrib.layers.dropout(
                attention_outputs, self.dropout, is_training=self.is_training)
            model_outputs, _ = layers.dynamic_origin_bilstm_layer(
                attention_outputs,
                self.hidden_size,
                'model_layer',
                input_len=self.frame_len)

            model_outputs = tf.contrib.layers.dropout(
                model_outputs, self.dropout, is_training=self.is_training)

        with tf.variable_scope("Output_Layer"):

            # Per-frame logits conditioned on the question feature. The loss below
            # pairs them with gt_predict ([None, max_frames]), so logit_score is
            # expected to have that shape -- confirm against layers.correlation_layer.
            logit_score = layers.correlation_layer(model_outputs,
                                                   self.q_feature,
                                                   self.hidden_size,
                                                   scope_name='output_layer')
            # logit_score = layers.linear_layer_3d(model_outputs, 1, scope_name='output_layer')
            # logit_score = tf.squeeze(logit_score, 2)
            logit_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logit_score, labels=self.gt_predict)
            # Sum over axis 1 (frames), then average over the batch.
            # NOTE(review): no frame mask is applied, so padded frames also
            # contribute to this loss -- confirm this is intended.
            avg_logit_loss = tf.reduce_mean(tf.reduce_sum(logit_loss, 1))

            # Snapshot of every trainable variable that exists in the graph at this
            # point, taken BEFORE the discriminator is built. This is the
            # "generator" group.
            self.G_variables = tf.trainable_variables()
            G_regularization_cost = tf.reduce_sum(
                [tf.nn.l2_loss(v) for v in self.G_variables])
            G_reg_loss = self.regularization_beta * G_regularization_cost

            # "Real" sample for the discriminator: video feature pooled with weights
            # derived from the ground-truth labels. gt_predict is passed as both
            # logits and mask; presumably this yields a distribution concentrated on
            # labelled frames -- confirm against layers.mask_logits.
            ground_prod = tf.nn.softmax(
                layers.mask_logits(self.gt_predict, self.gt_predict))
            ground_v_feature = tf.reduce_sum(
                tf.multiply(model_outputs, tf.expand_dims(ground_prod, 2)), 1)
            ground_out = self.discriminator(ground_v_feature, self.q_feature)

            # "Fake" sample: video feature pooled with the predicted frame scores,
            # normalised to sum to 1 along axis 1.
            # NOTE(review): the normalisation is not masked by frame_mask -- confirm.
            generated_prod = tf.nn.sigmoid(logit_score) / tf.reduce_sum(
                tf.nn.sigmoid(logit_score), keepdims=True, axis=1)
            generated_v_feature = tf.reduce_sum(
                tf.multiply(model_outputs, tf.expand_dims(generated_prod, 2)),
                1)
            # Second call reuses the discriminator's variable scope.
            generated_out = self.discriminator(generated_v_feature,
                                               self.q_feature,
                                               reuse_flag=True)

            # Discriminator group = variables created since the G snapshot above.
            all_variable = tf.trainable_variables()
            self.D_variables = [
                vv for vv in all_variable if vv not in self.G_variables
            ]
            D_regularization_cost = tf.reduce_sum(
                [tf.nn.l2_loss(v) for v in self.D_variables])
            D_reg_loss = self.regularization_beta * D_regularization_cost

            # Discriminator objective: label ground-truth pooling as 1, generated as 0.
            ground_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=tf.ones_like(ground_out), logits=ground_out))
            generated_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=tf.zeros_like(generated_out), logits=generated_out))

            # L2 over generator + discriminator variables (pointer variables do not
            # exist yet at this point).
            regularization_cost = tf.reduce_sum(
                [tf.nn.l2_loss(v) for v in all_variable])
            self.reg_loss = self.regularization_beta * regularization_cost
            self.dist_loss = avg_logit_loss

            # G_pre_loss: generator objective without the adversarial term.
            self.G_pre_loss = avg_logit_loss + G_reg_loss
            self.D_loss = ground_loss + generated_loss + D_reg_loss
            # Weight of the adversarial term in G_loss; the generator is rewarded
            # when the discriminator labels the generated pooling as 1.
            scale = self.max_frames / 5
            self.G_loss = avg_logit_loss + G_reg_loss + scale * tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=tf.ones_like(generated_out), logits=generated_out))
            self.frame_score = tf.nn.sigmoid(logit_score)

        with tf.variable_scope('Pointer_Layer'):
            # Regress the (start, end) window from the per-frame score curve.
            score_dist = tf.nn.sigmoid(logit_score)
            score_dist = conv_utils.normalize(score_dist, scope='layer_normal')
            # Stack of four 1-D convolutions over the score curve. The positional
            # arguments of conv1d_with_bias are opaque from here; presumably
            # (inputs, layer_idx, out_dim, kernel_size) -- TODO confirm.
            output = tf.nn.relu(
                conv_utils.conv1d_with_bias(tf.expand_dims(score_dist, 2), 1,
                                            16, 5))
            # output = tf.contrib.layers.dropout(output, self.dropout, is_training=self.is_training)
            output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 2, 32, 10))
            # output = tf.contrib.layers.dropout(output, self.dropout, is_training=self.is_training)
            output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 3, 64, 20))
            # output = tf.contrib.layers.dropout(output, self.dropout, is_training=self.is_training)
            output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 4, 1, 10))
            # output = tf.contrib.layers.dropout(output, self.dropout, is_training=self.is_training)
            # output = tf.nn.relu(layers.linear_layer(output, self.hidden_size,scope_name='pointer_1'))
            # output = tf.nn.relu(layers.linear_layer(output, self.hidden_size/2,scope_name='pointer_2'))
            # output = tf.nn.relu(layers.linear_layer(output, self.hidden_size/4,scope_name='pointer_3'))
            # 'pointrt_output' (sic) is a runtime scope name; presumably it becomes
            # part of the variable names, so do not "fix" the spelling without
            # checking existing checkpoints.
            output = layers.linear_layer(tf.squeeze(output, 2),
                                         2,
                                         scope_name='pointrt_output')
            self.predict_start_end = output
            gt_start_end = self.gt_windows
            # Mean squared error between predicted and ground-truth (start, end).
            pointer_loss = tf.reduce_mean(
                tf.square(tf.subtract(self.predict_start_end, gt_start_end)))

            # Pointer group = every trainable variable not already claimed by the
            # generator or discriminator groups.
            all_variable = tf.trainable_variables()
            self.pn_variables = [
                vv for vv in all_variable
                if vv not in self.G_variables and vv not in self.D_variables
            ]
            pn_regularization_cost = tf.reduce_sum(
                [tf.nn.l2_loss(v) for v in self.pn_variables])
            self.pn_loss = pointer_loss + self.regularization_beta * pn_regularization_cost

        # Debug output: dump the three variable groups.
        print(self.G_variables)
        print(self.D_variables)
        print(self.pn_variables)
Example #3
0
    def build_model(self):
        """Build the TF1 graph: conv + self-attention encoders, attention, frame scoring, pointer head.

        Reads ``self.max_words``, ``self.input_ques_dim``, ``self.max_frames``,
        ``self.input_video_dim``, ``self.dropout``, ``self.hidden_size`` and
        ``self.regularization_beta``.

        Sets on ``self``:
            * placeholders: ``ques_vecs``, ``ques_len``, ``frame_vecs``,
              ``frame_len``, ``batch_size``, ``is_training``, ``gt_predict``,
              ``gt_windows``
            * masks / features: ``frame_mask``, ``ques_mask``, ``q_feature``,
              ``v2q``, ``q2v``
            * variable groups: ``G_variables``, ``pn_variables``
            * losses: ``test_loss``, ``loss``, ``pn_loss``
            * predictions: ``frame_score``, ``predict_start_end``

        ``G_variables`` / ``pn_variables`` are defined by *when*
        ``tf.trainable_variables()`` is called, so the section order below
        must not be changed.

        Returns None. Prints the static shape of the question feature.
        """
        # input layer (batch_size, n_steps, input_dim)
        self.ques_vecs = tf.placeholder(
            tf.float32, [None, self.max_words, self.input_ques_dim])
        self.ques_len = tf.placeholder(tf.int32, [None])
        self.frame_vecs = tf.placeholder(
            tf.float32, [None, self.max_frames, self.input_video_dim])
        self.frame_len = tf.placeholder(tf.int32, [None])
        # NOTE: batch_size is created here but not consumed anywhere in this method.
        self.batch_size = tf.placeholder(tf.int32, [])
        self.is_training = tf.placeholder(tf.bool)
        # Per-frame ground-truth labels and the ground-truth (start, end) window.
        self.gt_predict = tf.placeholder(tf.float32, [None, self.max_frames])
        self.gt_windows = tf.placeholder(tf.float32, [None, 2])

        # Boolean masks marking the valid (non-padded) frames / words.
        self.frame_mask = tf.sequence_mask(self.frame_len,
                                           maxlen=self.max_frames)
        self.ques_mask = tf.sequence_mask(self.ques_len, maxlen=self.max_words)

        with tf.variable_scope("Frame_Embedding_Encoder_Layer"):

            # self.dropout is passed as the second positional argument of
            # tf.contrib.layers.dropout, i.e. it is a keep probability.
            frame_next_layer = tf.contrib.layers.dropout(
                self.frame_vecs, self.dropout, is_training=self.is_training)
            # Project raw frame features to hidden_size, normalise, then add a
            # positional encoding.
            frame_next_layer = conv_utils.linear_mapping(
                frame_next_layer,
                self.hidden_size,
                dropout=self.dropout,
                var_scope_name="linear_mapping_before_cnn")
            frame_next_layer = transformer.normalize(frame_next_layer)

            frame_next_layer += transformer.positional_encoding_v2(
                frame_next_layer,
                num_units=self.hidden_size,
                zero_pad=False,
                scale=False,
                scope="enc_pe")

            # Three encoder stacks, each: conv stack (three layers, kernel width 3)
            # -> 4-head self-attention -> feed-forward.
            for i in range(3):
                with tf.variable_scope("stack_%s" % i):

                    frame_next_layer = conv_utils.conv_encoder_stack(
                        frame_next_layer,
                        [self.hidden_size, self.hidden_size, self.hidden_size],
                        [3, 3, 3], {
                            'src': self.dropout,
                            'hid': self.dropout
                        },
                        mode=self.is_training)

                    # Self-attention: queries and keys are the same tensor.
                    # self.dropout is a keep probability (see above), so the drop
                    # rate passed here is 1 - self.dropout.
                    frame_next_layer = transformer.multihead_attention(
                        queries=frame_next_layer,
                        keys=frame_next_layer,
                        num_units=self.hidden_size,
                        num_heads=4,
                        dropout_rate=1 - self.dropout,
                        is_training=self.is_training,
                        causality=False)

                    frame_next_layer = transformer.feedforward(
                        frame_next_layer,
                        num_units=[2 * self.hidden_size, self.hidden_size],
                        is_training=self.is_training)

            frame_embedding = tf.contrib.layers.dropout(
                frame_next_layer, self.dropout, is_training=self.is_training)

        with tf.variable_scope("Ques_Embedding_Encoder_Layer"):

            ques_next_layer = tf.contrib.layers.dropout(
                self.ques_vecs, self.dropout, is_training=self.is_training)

            # Same inner scope name as the frame encoder, but nested under a
            # different enclosing variable scope.
            ques_next_layer = conv_utils.linear_mapping(
                ques_next_layer,
                self.hidden_size,
                dropout=self.dropout,
                var_scope_name="linear_mapping_before_cnn")
            ques_next_layer = transformer.normalize(ques_next_layer)

            ques_next_layer += transformer.positional_encoding_v2(
                ques_next_layer,
                num_units=self.hidden_size,
                zero_pad=False,
                scale=False,
                scope="enc_pe")

            # A single, shallower stack for the question (two conv layers).
            for i in range(1):
                with tf.variable_scope("stack_%s" % i):

                    ques_next_layer = conv_utils.conv_encoder_stack(
                        ques_next_layer, [self.hidden_size, self.hidden_size],
                        [3, 3], {
                            'src': self.dropout,
                            'hid': self.dropout
                        },
                        mode=self.is_training)
                    ques_next_layer = transformer.multihead_attention(
                        queries=ques_next_layer,
                        keys=ques_next_layer,
                        num_units=self.hidden_size,
                        num_heads=4,
                        dropout_rate=1 - self.dropout,
                        is_training=self.is_training,
                        causality=False)

                    ques_next_layer = transformer.feedforward(
                        ques_next_layer,
                        num_units=[2 * self.hidden_size, self.hidden_size],
                        is_training=self.is_training)

            ques_embedding = tf.contrib.layers.dropout(
                ques_next_layer, self.dropout, is_training=self.is_training)

            # Sentence-level question feature: sum over axis 1 of the masked encoder
            # output, divided by the question length. Note it is computed from
            # ques_next_layer (before the dropout that produces ques_embedding).
            # Presumably mask_zero zeroes the padded positions -- TODO confirm.
            # NOTE(review): a ques_len of 0 would divide by zero here.
            # q_feature, _ = layers.weight_attention_layer(ques_embedding,self.hidden_size,scope_name='q_feature')
            ques_mask_embedding = layers.mask_zero(
                ques_next_layer, tf.expand_dims(self.ques_mask, 2))
            q_feature = tf.reduce_sum(
                ques_mask_embedding, axis=1) / tf.expand_dims(
                    tf.cast(self.ques_len, tf.float32), 1)
            # q_feature = tf.reduce_mean(ques_next_layer,axis=1)
            # Debug output: static shape of the pooled question feature.
            print(q_feature.shape)

            self.q_feature = tf.contrib.layers.dropout(
                q_feature, self.dropout, is_training=self.is_training)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):

            # Frame-by-word similarity matrix.
            att_score = tf.matmul(
                frame_embedding, ques_embedding,
                transpose_b=True)  # M*N1*K  ** M*N2*K  --> M*N1*N2
            # S_: softmax over the last (word) axis, with the question mask
            # broadcast across frames.
            mask_q = tf.expand_dims(self.ques_mask, 1)
            S_ = tf.nn.softmax(layers.mask_logits(att_score, mask=mask_q))
            # S_T: softmax over axis 1 (frames), with the frame mask broadcast
            # across words, then transposed to (batch, words, frames).
            mask_v = tf.expand_dims(self.frame_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(layers.mask_logits(att_score, mask=mask_v),
                              axis=1), (0, 2, 1))
            # v2q: per-frame attention-weighted sum of question embeddings.
            self.v2q = tf.matmul(S_, ques_embedding)
            # q2v: frames re-weighted through the frame->word->frame product S_ * S_T.
            self.q2v = tf.matmul(tf.matmul(S_, S_T), frame_embedding)
            attention_outputs = tf.concat([
                frame_embedding, self.v2q, frame_embedding * self.v2q,
                frame_embedding * self.q2v
            ], 2)

        with tf.variable_scope("Model_Encoder_Layer"):

            # Project the concatenated attention features back to hidden_size.
            model_next_layer = conv_utils.linear_mapping(
                attention_outputs,
                self.hidden_size,
                dropout=self.dropout,
                var_scope_name="linear_mapping_before_model_layer")
            model_next_layer = transformer.normalize(model_next_layer)
            # Two encoder stacks of the same conv -> self-attention -> feed-forward
            # shape as above. No positional encoding is added in this section.
            for i in range(2):
                with tf.variable_scope("stack_%s" % i):
                    model_next_layer = conv_utils.conv_encoder_stack(
                        model_next_layer, [self.hidden_size, self.hidden_size],
                        [3, 3], {
                            'src': self.dropout,
                            'hid': self.dropout
                        },
                        mode=self.is_training)

                    model_next_layer = transformer.multihead_attention(
                        queries=model_next_layer,
                        keys=model_next_layer,
                        num_units=self.hidden_size,
                        num_heads=4,
                        dropout_rate=1 - self.dropout,
                        is_training=self.is_training,
                        causality=False)

                    model_next_layer = transformer.feedforward(
                        model_next_layer,
                        num_units=[2 * self.hidden_size, self.hidden_size],
                        is_training=self.is_training)
            model_outputs = model_next_layer

        with tf.variable_scope("Output_Layer"):

            # One logit per frame: a 1-unit linear layer followed by squeezing the
            # trailing axis. self.q_feature is not used by this output head.
            # logit_score = layers.correlation_layer(model_outputs,self.q_feature,self.hidden_size,scope_name='output_layer')
            logit_score = layers.linear_layer_3d(model_outputs,
                                                 1,
                                                 scope_name='output_layer')
            logit_score = tf.squeeze(logit_score, 2)

            logit_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logit_score, labels=self.gt_predict)
            # Sum over axis 1 (frames), then average over the batch.
            # NOTE(review): no frame mask is applied, so padded frames also
            # contribute to this loss -- confirm this is intended.
            avg_logit_loss = tf.reduce_mean(tf.reduce_sum(logit_loss, 1))

            # Snapshot of every trainable variable that exists in the graph at this
            # point, taken BEFORE the pointer layer is built.
            self.G_variables = tf.trainable_variables()
            G_regularization_cost = tf.reduce_sum(
                [tf.nn.l2_loss(v) for v in self.G_variables])
            # test_loss is the data term only; loss adds the L2 penalty.
            self.test_loss = avg_logit_loss
            self.loss = avg_logit_loss + self.regularization_beta * G_regularization_cost
            self.frame_score = tf.nn.sigmoid(logit_score)

        with tf.variable_scope('Pointer_Layer'):
            # Regress the (start, end) window from the per-frame score curve.
            score_dist = tf.nn.sigmoid(logit_score)
            score_dist = conv_utils.normalize(score_dist, scope='layer_normal')

            # Stack of four 1-D convolutions over the score curve. The positional
            # arguments of conv1d_with_bias are opaque from here; presumably
            # (inputs, layer_idx, out_dim, kernel_size) -- TODO confirm.
            output = tf.nn.relu(
                conv_utils.conv1d_with_bias(tf.expand_dims(score_dist, 2), 1,
                                            16, 5))
            output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 2, 32, 10))
            output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 3, 64, 20))
            output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 4, 1, 10))
            # 'pointrt_output' (sic) is a runtime scope name; presumably it becomes
            # part of the variable names, so do not "fix" the spelling without
            # checking existing checkpoints.
            output = layers.linear_layer(tf.squeeze(output, 2),
                                         2,
                                         scope_name='pointrt_output')
            self.predict_start_end = output
            gt_start_end = self.gt_windows
            # Mean squared error between predicted and ground-truth (start, end).
            pointer_loss = tf.reduce_mean(
                tf.square(tf.subtract(self.predict_start_end, gt_start_end)))

            # Pointer group = variables created since the G snapshot above.
            all_variable = tf.trainable_variables()
            self.pn_variables = [
                vv for vv in all_variable if vv not in self.G_variables
            ]
            pn_regularization_cost = tf.reduce_sum(
                [tf.nn.l2_loss(v) for v in self.pn_variables])
            self.pn_loss = pointer_loss + self.regularization_beta * pn_regularization_cost
    def build_train_proc(self):
        """Build the TF1 graph for the encoder, decoder parameters, and train/test losses.

        Reads ``self.max_n_q_words``, ``self.input_ques_dim``,
        ``self.input_n_frames``, ``self.input_frame_dim``,
        ``self.max_n_a_words``, ``self.lstm_dim``, ``self.dropout_prob``,
        ``self.decode_dim``, ``self.n_words``, ``self.attention_dim``,
        ``self.params['word_embedding']`` and ``self.regularization_beta``.

        Creates the input placeholders, encodes video and question with LSTMs,
        builds the decoder's initial input, declares the decoder / attention /
        word-projection parameters, loads the word-embedding table, and then
        delegates decoding to ``self.generate_answer_on_training()`` and
        ``self.generate_answer_on_testing()``.

        Sets ``answer_word_train``, ``train_loss``, ``answer_word_test`` and
        ``test_loss`` (both losses include the L2 penalty), and registers one
        scalar summary for the training loss.
        """
        # input layer (batch_size, n_steps, input_dim)
        self.input_q = tf.placeholder(
            tf.float32, [None, self.max_n_q_words, self.input_ques_dim])
        self.input_q_len = tf.placeholder(tf.int32, [None])
        self.input_x = tf.placeholder(
            tf.float32, [None, self.input_n_frames, self.input_frame_dim])
        self.input_x_len = tf.placeholder(tf.int32, [None])
        # The placeholders y, y_mask, ans_vec, batch_size and reward are not read
        # again in this method; presumably they are consumed by the
        # generate_answer_* methods or by the training loop -- confirm there.
        self.y = tf.placeholder(tf.int32, [None, self.max_n_a_words])
        self.y_mask = tf.placeholder(tf.float32, [None, self.max_n_a_words])
        self.ans_vec = tf.placeholder(
            tf.float32, [None, self.max_n_a_words, self.input_ques_dim])
        self.batch_size = tf.placeholder(tf.int32, [])
        self.is_training = tf.placeholder(tf.bool)
        self.reward = tf.placeholder(tf.float32, [None])

        lstm_dim = self.lstm_dim
        # video LSTM layer, [n_steps * (batch_size, input_dim)] -> [n_steps * (batch_size, 2*lstm_dim)]
        # self.dropout_prob is passed as the second positional argument of
        # tf.contrib.layers.dropout, i.e. it is a keep probability.
        input_x = tf.contrib.layers.dropout(self.input_x,
                                            self.dropout_prob,
                                            is_training=self.is_training)
        v_lstm_output, v_lstm_state = layers.dynamic_origin_lstm_layer(
            input_x, lstm_dim, 'v_lstm', input_len=self.input_x_len)

        # question LSTM layer
        # Two stacked LSTMs: the second consumes the first one's outputs. Unlike
        # the video input, no dropout is applied to the question input.
        q_lstm_output, q_lstm_state1 = layers.dynamic_origin_lstm_layer(
            self.input_q, lstm_dim, 'q_lstm', input_len=self.input_q_len)
        _, q_lstm_state2 = layers.dynamic_origin_lstm_layer(
            q_lstm_output, lstm_dim, 'q_lstm1', input_len=self.input_q_len)

        # Concatenate element [1] of both final states (presumably the hidden
        # state h of an LSTM state tuple -- TODO confirm against
        # layers.dynamic_origin_lstm_layer) and project to lstm_dim.
        q_lstm_state_temp = tf.concat([q_lstm_state1[1], q_lstm_state2[1]], 1)
        q_lstm_state = layers.linear_layer(q_lstm_state_temp, self.lstm_dim,
                                           'linear0')

        # Element-wise question/video interaction on the final states.
        qv_dot = tf.multiply(q_lstm_state, v_lstm_state[1])  # [None, 1024]

        # Fuse the question state with the question-video product along axis 1
        # (no softmax is applied at this point).

        concat_output = tf.concat([q_lstm_state, qv_dot], axis=1)
        # Exposed on self; presumably read by the decoder's attention in the
        # generate_answer_* methods -- confirm there.
        self.v_first_lstm_output = v_lstm_output
        self.q_last_state = q_lstm_state

        # decoder

        # output -> first_atten
        # self.decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.decode_dim)
        self.decoder_cell = tf.contrib.rnn.GRUCell(self.decode_dim)

        # Affine projection of the fused encoder state to the decoder width.
        with tf.variable_scope('linear'):
            decoder_input_W = tf.get_variable(
                'w',
                shape=[concat_output.shape[1], self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(
                ))  # initializer=tf.random_normal_initializer(stddev=0.03))
            # Note: the bias is also Xavier-initialised rather than zero-initialised.
            decoder_input_b = tf.get_variable(
                'b',
                shape=[self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(
                ))  # initializer=tf.random_normal_initializer(stddev=0.03))
            self.decoder_input = tf.matmul(
                concat_output,
                decoder_input_W) + decoder_input_b  # [None, decode_dim]

        # answer->word predict
        # Projection from decoder state (decode_dim) to vocabulary (n_words).
        self.embed_word_W = tf.Variable(tf.random_uniform(
            [self.decode_dim, self.n_words], -0.1, 0.1),
                                        name='embed_word_W')
        self.embed_word_b = tf.Variable(tf.random_uniform([self.n_words], -0.1,
                                                          0.1),
                                        name='embed_word_b')

        # word dim -> decode_dim
        self.word_to_lstm_w = tf.Variable(tf.random_uniform(
            [self.input_ques_dim, self.decode_dim], -0.1, 0.1),
                                          name='word_to_lstm_W')
        self.word_to_lstm_b = tf.Variable(tf.random_uniform([self.decode_dim],
                                                            -0.1, 0.1),
                                          name='word_to_lstm_b')

        # decoder attention layer
        # Parameters only; none of them is applied inside this method.
        with tf.variable_scope('decoder_attention'):
            self.attention_w_q = tf.get_variable(
                'attention_w_q',
                shape=[self.lstm_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_w_x = tf.get_variable(
                'attention_w_x',
                shape=[self.lstm_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_w_h = tf.get_variable(
                'attention_w_h',
                shape=[self.decode_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_b = tf.get_variable(
                'attention_b',
                shape=[self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_a = tf.get_variable(
                'attention_a',
                shape=[self.attention_dim, 1],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_to_decoder = tf.get_variable(
                'attention_to_decoder',
                shape=[self.lstm_dim, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
        # decoder
        # Three [2*decode_dim, decode_dim] matrices, declared here and not used in
        # this method. The r/z/w names suggest hand-written GRU gates -- confirm in
        # the generate_answer_* methods.
        with tf.variable_scope('decoder'):
            self.decoder_r = tf.get_variable(
                'decoder_r',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_z = tf.get_variable(
                'decoder_z',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_w = tf.get_variable(
                'decoder_w',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())

        # embedding layer
        # Loaded as a tf.constant, so the embedding table is not a trainable
        # variable and is excluded from the L2 term below.
        embeddings = load_file(self.params['word_embedding'])
        self.Wemb = tf.constant(embeddings, dtype=tf.float32)

        # generate training
        # Both decoding graphs are built by methods defined elsewhere on this class.
        answer_train, train_loss = self.generate_answer_on_training()
        answer_test, test_loss = self.generate_answer_on_testing()

        # final
        # L2 penalty over every trainable variable that exists in the graph at
        # this point, added to both the train and the test loss.
        variables = tf.trainable_variables()
        regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in variables])
        self.answer_word_train = answer_train
        self.train_loss = train_loss + self.regularization_beta * regularization_cost

        self.answer_word_test = answer_test
        self.test_loss = test_loss + self.regularization_beta * regularization_cost
        # NOTE(review): the summary name contains spaces, which TF1 is expected to
        # rewrite to underscores with a log message -- confirm. The value logged is
        # the regularised train_loss, not the bare cross entropy.
        tf.summary.scalar('training cross entropy', self.train_loss)