Example #1
    def logits_cnn_1d(self):
        logging.info("##########logit is cnn##########")
        embedded_words = self.embedding(self.sequence)
        embedded_words = tf.layers.dropout(embedded_words,
                                           self.fc_drop,
                                           training=self.is_train)
        self.global_fake = None
        self.represent_size = self.emb_size
        if self._config.attention_type == "same_init":
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            if self._config.encode_type == "attend_rnn":
                global_encode = tf.reduce_sum(global_encode, axis=1)
            else:
                global_encode = tf.reduce_max(global_encode, axis=1)
            global_encode_mlp = tf.layers.batch_normalization(
                global_encode, training=self.is_train)
            global_encode_mlp = layer.fc_fun(
                global_encode_mlp,
                self.emb_size,
                initial_type=self._config.initial_type,
                activation="relu")
            global_encode_mlp = tf.tile(global_encode_mlp,
                                        [1, self.max_seq_len])
            global_encode_mlp = tf.reshape(
                global_encode_mlp,
                [self.batch_size, self.max_seq_len, self.emb_size])
            self.global_fake = global_encode_mlp
        elif self._config.attention_type == "attend_init":
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            # global_encode_mlp = layer.fc_fun(
            #     global_encode, self.emb_size, initial_type=self._config.initial_type)
            # self.global_fake = tf.reshape(global_encode_mlp, [self.batch_size, self.max_seq_len, self.emb_size])
            self.global_fake = global_encode
        else:
            if self._config.attention_type is not None:
                raise NotImplementedError
        outputs = self.cnn(embedded_words, global_infor=self.global_fake)

        with tf.variable_scope("output"):
            outputs = tf.nn.leaky_relu(outputs)
            outputs = tf.layers.batch_normalization(outputs,
                                                    training=self.is_train)

            fcl_output = layer.fc_fun(outputs,
                                      2000,
                                      initial_type='xavier',
                                      activation=self._config.fc_activation_1)
            fcl_output = tf.layers.dropout(fcl_output,
                                           rate=self.fc_drop,
                                           training=self.is_train)
            fcl_output = layer.fc_fun(fcl_output,
                                      self.n_classes,
                                      initial_type='xavier')
        return fcl_output
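A minimal standalone sketch (TF 1.x; the toy sizes below are assumptions, not values from the code) of the tile-and-reshape step that the "same_init" branch above uses to broadcast a pooled [batch_size, emb_size] encoding across the time axis into global_fake:

import tensorflow as tf

batch_size, max_seq_len, emb_size = 4, 6, 8             # toy sizes (assumed)
global_vec = tf.random_normal([batch_size, emb_size])   # stands in for the pooled encoding

tiled = tf.tile(global_vec, [1, max_seq_len])           # [batch, max_seq_len * emb]
global_fake = tf.reshape(tiled, [batch_size, max_seq_len, emb_size])

# The same broadcast, written with an explicit extra axis:
global_fake_alt = tf.tile(tf.expand_dims(global_vec, 1), [1, max_seq_len, 1])

with tf.Session() as sess:
    a, b = sess.run([global_fake, global_fake_alt])
    assert a.shape == (4, 6, 8) and (a == b).all()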
Example #2
    def encoder(self, sequence, embedded_words):
        # returns (global_encode, global_encode_units, last_state);
        # global_encode: (batch_size, max_seq_len, global_encode_units)
        last_state = None
        global_encode = None
        with tf.variable_scope('encode_module'):
            if self._config.encode_type == "transformer":
                global_encode = self.encoder_func(sequence, embedded_words)
                global_encode_units = self.emb_size
            elif self._config.encode_type == "other_transformer":
                global_encode = layer.attention_fun(
                    embedded_words,
                    dropout_rate=self.attention_drop,
                    is_training=self.is_train,
                    config=self._config,
                    scope="attention_encode")
                global_encode_units = self.emb_size
            elif self._config.encode_type == "disan":
                sequence_length = tf.squeeze(self.sequence_length)
                rep_mask = tf.sequence_mask(sequence_length,
                                            self._config.max_sequence_length)
                global_encode = utils_fast_disa.fast_directional_self_attention(
                    embedded_words,
                    rep_mask,
                    hn=self._config.disan_units,
                    head_num=self._config.attention_head,
                    msl=self.max_seq_len,
                    is_train=self.is_train)
                global_encode_units = self._config.disan_units
            elif self._config.encode_type == "cnn":
                global_encode = self.cnn_enc(embedded_words)
                global_encode_units = 128
                print "encode is cnn"
            elif self._config.encode_type == "rnn":
                rnn_encode_cell = tf.nn.rnn_cell.GRUCell(
                    name="encode_gru",
                    num_units=self.rnn_units,
                    kernel_initializer=tf.initializers.orthogonal())
                # self.tmp2 = tf.get_variable("rnn/encode_gru/gates/bias:0")

                x = tf.unstack(embedded_words, self.max_seq_len, 1)
                global_encode, last_state = tf.nn.static_rnn(
                    cell=rnn_encode_cell, dtype=tf.float32, inputs=x)
                # global_encode, last_state = tf.nn.dynamic_rnn(cell=rnn_encode_cell, dtype=tf.float32, inputs=embedded_words)
                # global_encode = tf.reduce_max(outputs, 1)
                # last_state = tf.get_variable("encode_v", [self.batch_size, self.rnn_units])
                global_encode_units = self.rnn_units
                print "*****encode is rnn*******"
            elif self._config.encode_type == "w":
                global_encode = layer.fc_fun(
                    embedded_words,
                    self.emb_size,
                    initial_type=self._config.initial_type,
                    factor=self._config.xavier_factor,
                    activation="leaky_relu")
                global_encode_units = self.emb_size
                print "encode is w"
            else:
                raise NotImplementedError
            self.global_encode = global_encode
        return global_encode, global_encode_units, last_state
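For the "rnn" branch above, note that tf.nn.static_rnn returns a Python list with one [batch, units] tensor per timestep. A minimal sketch (TF 1.x; the toy sizes are assumptions) of how such a list maps onto the [batch_size, max_seq_len, units] layout described in the comment at the top of encoder:

import tensorflow as tf

batch_size, max_seq_len, emb_size, rnn_units = 4, 6, 8, 16   # toy sizes (assumed)
embedded_words = tf.random_normal([batch_size, max_seq_len, emb_size])

cell = tf.nn.rnn_cell.GRUCell(num_units=rnn_units)
x = tf.unstack(embedded_words, max_seq_len, 1)               # list of [batch, emb] tensors
outputs, last_state = tf.nn.static_rnn(cell, x, dtype=tf.float32)

# outputs is a list of max_seq_len tensors; stacking along axis 1 gives
# the [batch, max_seq_len, rnn_units] layout.
global_encode = tf.stack(outputs, axis=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(global_encode).shape)                     # (4, 6, 16)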
Example #3
    def logits(self):
        embedded_words = self.embedding(self.sequence)
        sequence_length = tf.squeeze(self.sequence_length)
        rep_mask = tf.sequence_mask(sequence_length,
                                    self._config.max_sequence_length)
        if self._config.disan_type == 'origin':
            outputs = utils_disan.disan(embedded_words,
                                        rep_mask,
                                        self._config,
                                        is_train=self.is_train,
                                        keep_prob=self.disan_keep)
        elif self._config.disan_type == 'fast':
            # outputs, tmp1, tmp2 = utils_fast_disa.fast_directional_self_attention(embedded_words, rep_mask, hn=512, msl=self.max_seq_len, is_train=self.is_train)
            outputs = utils_fast_disa.fast_directional_self_attention(
                embedded_words,
                rep_mask,
                hn=self._config.disan_units,
                head_num=self._config.attention_head,
                msl=self.max_seq_len,
                is_train=self.is_train)
            outputs = tf.reduce_max(outputs, 1)
        else:
            raise NotImplementedError

        # self.tmp1= tmp1
        # self.tmp2 = tmp2
        # self.tmp1 = outputs

        # self.tmp2 = outputs
        # outputs = layer.fc_fun(tf.layers.flatten(embedded_words), self.n_classes)
        # return outputs
        fcl_output = layer.fc_fun(outputs,
                                  self.mlp_units,
                                  initial_type='xavier',
                                  activation=self._config.fc_activation_1)
        fcl_output = tf.layers.dropout(fcl_output,
                                       rate=self.fc_drop,
                                       training=self.is_train)
        fcl_output = layer.fc_fun(fcl_output,
                                  self.n_classes,
                                  initial_type='xavier')
        # self.tmp2 = fcl_output
        return fcl_output
Example #4
    def logits(self):
        # x is fcl_output
        embedded_words = self.embedding(self.sequence)
        # outputs = self.transformer(self.sequence, embedded_words)
        outputs = self.transformer(self.sequence, embedded_words)
        # self.train_symbol_show = train_show

        outputs = tf.reduce_max(outputs, axis=1)

        fcl_output = layer.fc_fun(outputs, self.n_classes, initial_type='xavier')
        return fcl_output
Example #5
    def pre_pad(self, x):
        '''
        Pre-pad the input for d_rnn.
        :param x: A 3-D tensor with shape [batch_size, sequence_length, emb_size]
        :return: A 4-D tensor for d_rnn with shape [batch_size, block, window_size, emb_size]
        '''
        pad_input = tf.pad(x, [[0, 0], [self.window_size - 1, 0], [0, 0]],
                           mode="CONSTANT")
        # print("pad_input:", pad_input.get_shape()) (batch_size, seq_max_len + window_size - 1, embed_size)
        # self.tmp1 = pad_input
        rnn_inputs = []
        for i in range(self.max_seq_len):
            rnn_inputs.append(
                tf.slice(pad_input, [0, i, 0], [-1, self.window_size, -1],
                         name='rnn_input'))
        rnn_input_tensor = tf.stack(
            rnn_inputs,
            1)  # (batch_size, seq_max_len, window_size, embed_size)

        if self._config.attention_type in ["same_init", "diff_init"]:
            # self.global_fake: [self.batch_size, self.max_seq_len, self.emb_size])
            fake_input = tf.reshape(
                self.global_fake,
                [self.batch_size, self.max_seq_len, 1, self.emb_size])
            rnn_input_tensor = tf.concat([fake_input, rnn_input_tensor], 2)
            self.real_window_size = self.window_size + 1
        elif self._config.attention_type in ["attend_init"]:
            block_rep = tf.reduce_mean(
                rnn_input_tensor, 2)  # (batch_size, seq_max_len, embed_size)
            fake_input = layer.basic_attention(block_rep, self.global_fake,
                                               "rnn")
            fake_input = tf.layers.batch_normalization(fake_input,
                                                       training=self.is_train)
            fake_input = layer.fc_fun(fake_input,
                                      self.emb_size,
                                      initial_type=self._config.initial_type,
                                      activation="relu")
            fake_input = tf.reshape(
                fake_input,
                [self.batch_size, self.max_seq_len, 1, self.emb_size])
            rnn_input_tensor = tf.concat([fake_input, rnn_input_tensor], 2)
            self.real_window_size = self.window_size + 1
        # print("rnn_input_tensor:", rnn_input_tensor.get_shape())
        # self.tmp2 = rnn_input_tensor
        return rnn_input_tensor
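A minimal standalone sketch (TF 1.x; toy sizes are assumptions) of the windowing that pre_pad performs: left-pad the time axis with window_size - 1 zero steps, then take one window per position, which yields the [batch_size, block, window_size, emb_size] tensor described in the docstring:

import tensorflow as tf

batch_size, max_seq_len, emb_size, window_size = 2, 5, 3, 4   # toy sizes (assumed)
x = tf.random_normal([batch_size, max_seq_len, emb_size])

pad_input = tf.pad(x, [[0, 0], [window_size - 1, 0], [0, 0]], mode="CONSTANT")
windows = [tf.slice(pad_input, [0, i, 0], [-1, window_size, -1])
           for i in range(max_seq_len)]
rnn_input_tensor = tf.stack(windows, 1)      # [batch, max_seq_len, window_size, emb]

with tf.Session() as sess:
    print(sess.run(rnn_input_tensor).shape)  # (2, 5, 4, 3)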
Example #6
    def loss_encoder(self):
        if (self._config.attention_type is None
                or not self._config.encoder_fixed_epoch):
            return None
        label = self.label
        global_logits = tf.reduce_max(self.global_encode, axis=1)
        logits = layer.fc_fun(global_logits, self.n_classes)
        if self._config.type == 'single_label':
            global_loss = tf.reduce_sum(
                tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                        labels=label))
        elif self._config.type == 'multi_label':
            global_loss = tf.reduce_sum(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                        labels=label))
        else:
            raise NotImplementedError
        return global_loss
Example #7
    def logits_cnn_dp(self):
        logging.info("DPCNN")
        with tf.name_scope("embedding"):
            self.filter_num = 250
            self.kernel_size = 3
            embedding = self.embedding(self.sequence)
            self.embedding_dim = self.emb_size
            # embedding_inputs = tf.expand_dims(embedding, axis=-1)  # [None,seq,embedding,1]
            # region_embedding  # [batch,seq-3+1,1,250]
            # region_embedding = tf.layers.conv2d(embedding_inputs, self.num_filters,
            #                                     [self.kernel_size, self.embedding_dim])
            region_embedding = self.conv1d(embedding,
                                           3,
                                           self.emb_size,
                                           no_act=True)
            region_embedding = tf.expand_dims(region_embedding, axis=2)
            # (4, 254, 1, 250), max_sl: 256

            pre_activation = tf.nn.relu(region_embedding, name='preactivation')
        with tf.name_scope("conv3_0"):
            conv3 = tf.layers.conv2d(pre_activation,
                                     self.filter_num,
                                     self.kernel_size,
                                     padding="same")
            conv3 = tf.layers.batch_normalization(conv3,
                                                  training=self.is_train)

        with tf.name_scope("conv3_1"):
            conv3 = tf.layers.conv2d(conv3,
                                     self.filter_num,
                                     self.kernel_size,
                                     padding="same")
            conv3 = tf.layers.batch_normalization(conv3,
                                                  training=self.is_train)

        # print conv3.shape
        # (4, 254, 1, 250)

        # residual connection
        conv3 = conv3 + region_embedding

        for block in range(6):
            with tf.name_scope("block_{}".format(block)):
                with tf.name_scope("pool_1"):
                    pool = tf.pad(conv3,
                                  paddings=[[0, 0], [0, 1], [0, 0], [0, 0]])
                    # print "pool", pool.shape  # (4, 255, 1, 250)
                    pool = tf.nn.max_pool(pool, [1, 3, 1, 1],
                                          strides=[1, 2, 1, 1],
                                          padding='VALID')
                    # print "pool", pool.shape  # (4, 127, 1, 250)

                with tf.name_scope("conv3_2"):
                    conv3 = tf.layers.conv2d(pool,
                                             self.filter_num,
                                             self.kernel_size,
                                             padding="same",
                                             activation=tf.nn.relu)
                    # print conv3.shape   # (4, 127, 1, 250)
                    conv3 = tf.layers.batch_normalization(
                        conv3, training=self.is_train)

                with tf.name_scope("conv3_3"):
                    conv3 = tf.layers.conv2d(conv3,
                                             self.filter_num,
                                             self.kernel_size,
                                             padding="same",
                                             activation=tf.nn.relu)
                    # print conv3.shape  # (4, 127, 1, 250)
                    conv3 = tf.layers.batch_normalization(
                        conv3, training=self.is_train)
                # residual connection
                conv3 = conv3 + pool
            # print conv3.shape
        # pool_size = int((self.max_seq_len - 3 + 1) / 2)
        # conv3 = tf.layers.max_pooling1d(tf.squeeze(conv3, [2]), pool_size, 1)
        conv3 = tf.reduce_max(conv3, 1)

        conv3 = tf.squeeze(conv3)  # [batch,250]
        conv3 = tf.layers.dropout(conv3, self.fc_drop, training=self.is_train)
        fcl_output = layer.fc_fun(conv3, self.n_classes, initial_type='xavier')
        return fcl_output
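The shape comments in logits_cnn_dp track how each DPCNN block shrinks the time axis: pad by one step, then max-pool with window 3 and stride 2 (VALID), which roughly halves the length. A small sketch (plain Python; max_seq_len = 256 is taken from the comments, the helper name is made up) that reproduces those numbers:

def dpcnn_lengths(max_seq_len=256, kernel_size=3, blocks=6):
    # region embedding: a VALID 1-D convolution over the sequence
    length = max_seq_len - kernel_size + 1          # 254 for max_seq_len = 256
    trace = [length]
    for _ in range(blocks):
        padded = length + 1                         # tf.pad adds one time step
        length = (padded - 3) // 2 + 1              # max_pool, window 3, stride 2, VALID
        trace.append(length)
    return trace

print(dpcnn_lengths())   # [254, 127, 63, 31, 15, 7, 3]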
Example #8
    def cnn(self, x, global_infor):
        logging.info("##########cnn#########")
        self.conv_out = []
        with tf.variable_scope("extractor_cnn"):
            concat_vec = []
            if self._config.attention_type == "attend_init":
                global_infor_abstract = tf.reduce_max(global_infor, axis=1)
                global_infor_abstract = tf.layers.batch_normalization(
                    global_infor_abstract, training=self.is_train)
                global_infor_abstract = layer.fc_fun(global_infor_abstract,
                                                     self.emb_size,
                                                     activation="relu")

                global_infor_abstract = tf.reshape(
                    tf.tile(global_infor_abstract, [1, self.max_seq_len]),
                    [self.batch_size, self.max_seq_len, -1])
            for filter_i in self.filter_size:
                # filter_shape = [filter_i, self.represent_size, self.filter_num]
                with tf.variable_scope("cnn_filter_{}".format(filter_i)):
                    # conv_W = layer.conv_weight_variable(filter_shape, name=filter_i)
                    # conv_b = layer.bias_variable([self.filter_num], name=filter_i)
                    if self._config.attention_type == "attend_init":
                        pad_input = tf.pad(x,
                                           [[0, 0], [filter_i - 1, 0], [0, 0]],
                                           mode="CONSTANT")
                        #  print("pad_input:", pad_input.get_shape()) (batch_size, seq_max_len + window_size - 1, embed_size)
                        cnn_blocks = []
                        for tmp_i in range(self.max_seq_len):
                            cnn_blocks.append(
                                tf.slice(pad_input, [0, tmp_i, 0],
                                         [-1, filter_i, -1],
                                         name='cnn_block'))
                        cnn_blocks = tf.stack(
                            cnn_blocks, 1
                        )  # (batch_size, seq_max_len, filter_i, embed_size)
                        cnn_blocks = tf.reduce_mean(cnn_blocks, 2)

                        global_infor_attend = layer.basic_attention(
                            cnn_blocks, global_infor, name="cnn")
                        global_infor_attend = tf.layers.batch_normalization(
                            global_infor_attend, training=self.is_train)
                        global_infor_attend = layer.fc_fun(global_infor_attend,
                                                           self.emb_size,
                                                           activation="relu")

                        global_infor = tf.concat(
                            [global_infor_attend, global_infor_abstract], 2)
                        global_infor = tf.layers.batch_normalization(
                            global_infor, training=self.is_train)
                        global_infor = layer.fc_fun(
                            global_infor,
                            self.emb_size,
                            initial_type=self._config.initial_type,
                            activation="relu")
                conved = self.conv1d(x,
                                     filter_i,
                                     self.represent_size,
                                     global_infor=global_infor)
                # conved = self.conv1d(x, filter_i, self.represent_size, global_infor=global_infor, conv_W=conv_W, conv_b=conv_b)
                self.conv_out.append(conved)
                conved = tf.reduce_max(conved, axis=1)
                concat_vec.append(conved)
        self.conv_out = tf.stack(self.conv_out)
        self.conv_out = tf.transpose(self.conv_out, [1, 0, 2, 3])
        return tf.concat(concat_vec, -1)
Example #9
    def logits_drnn(self):
        logging.info("##########logit is rnn##########")
        self.represent_size = self.emb_size
        # self.initial_state = self.rnn_cell.zero_state(self.batch_size*self.max_seq_len, dtype=tf.float32)
        # self.initial_state = tf.cast(self.initial_state, tf.float32)

        embedded_words = self.embedding(self.sequence)
        embedded_words = tf.layers.dropout(embedded_words,
                                           self.fc_drop,
                                           training=self.is_train)
        if self._config.attention_type == "same_init":
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            if self._config.encode_type == "attend_rnn":
                global_encode = tf.reduce_sum(global_encode, axis=1)
            else:
                global_encode = tf.reduce_max(global_encode, axis=1)
            global_encode_mlp = tf.layers.batch_normalization(
                global_encode, training=self.is_train)
            global_encode_mlp = layer.fc_fun(
                global_encode_mlp,
                self.emb_size,
                initial_type=self._config.initial_type,
                activation="relu")

            global_encode_mlp = tf.tile(global_encode_mlp,
                                        [1, self.max_seq_len])
            global_encode_mlp = tf.reshape(
                global_encode_mlp,
                [self.batch_size, self.max_seq_len, self.emb_size])
            self.global_fake = global_encode_mlp
        elif self._config.attention_type == "attend_init":
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            global_encode_mlp = layer.fc_fun(
                global_encode,
                self.emb_size,
                initial_type=self._config.initial_type)
            self.global_fake = tf.reshape(
                global_encode_mlp,
                [self.batch_size, self.max_seq_len, self.emb_size])
            # even if we make self.initial_state a variable, it still doesn't update, since no gradient flows into it
            # init_state = tf.get_variable(name='initial_state', shape=[self.batch_size*self.max_seq_len, self.rnn_units])
        else:
            if self._config.attention_type is not None:
                raise NotImplementedError

        input_pad = self.pre_pad(
            embedded_words)  # [batch_size, block, window_size, emb_size]
        drnn_output = self.d_rnn(input_pad)  # [batch_size, block, rnn_units]

        # self.tmp2 = drnn_output
        drnn_output = tf.reshape(drnn_output, [-1, self.mlp_units])
        drnn_output = tf.matmul(drnn_output, self.WC)
        drnn_output = tf.reshape(drnn_output,
                                 [self.batch_size, -1, self.mlp_units])

        mask = tf.sequence_mask(
            self.sequence_length, self.max_seq_len,
            dtype=drnn_output.dtype)  # [batch_size, max_seq_len]
        mask = tf.reshape(mask, [self.batch_size, self.max_seq_len, 1])
        # self.tmp1 = mask
        drnn_output = drnn_output * mask

        hs = tf.reduce_max(drnn_output, axis=1)
        hs = tf.layers.dropout(hs, self.fc_drop, training=self.is_train)

        mlp = tf.matmul(hs, self.W)

        mlp = tf.layers.batch_normalization(mlp, training=self.is_train)
        mlp = tf.nn.relu(mlp)

        fcl_output = tf.matmul(mlp, self.u)
        # self.tmp2 = fcl_output

        return fcl_output
Example #10
    def logits_cnn_1d(self):
        embedded_words = self.embedding(self.sequence)
        embedded_words = tf.layers.dropout(embedded_words,
                                           self.fc_drop,
                                           training=self.is_train)
        if self._config.global_size:
            global_size = self._config.global_size
        else:
            global_size = self.rnn_units
        self.global_fake = None
        self.represent_size = self.emb_size
        if self._config.attention_type == "pre_attention":
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            embedded_words = global_encode
        elif self._config.attention_type == "diff_concat":
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            global_encode_mlp = layer.fc_fun(
                global_encode,
                global_size,
                initial_type=self._config.initial_type,
                factor=self._config.xavier_factor)
            self.represent_size += global_size
            print("represent_size: {}".format(self.represent_size))
            embedded_words = tf.concat([embedded_words, global_encode_mlp],
                                       axis=-1)
        elif self._config.attention_type == "same_concat":
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            global_encode_mlp = layer.fc_fun(
                global_encode,
                global_size,
                initial_type=self._config.initial_type,
                factor=self._config.xavier_factor)
            global_encode_mlp = tf.reduce_max(global_encode_mlp, axis=1)
            global_encode_mlp = tf.reshape(
                tf.tile(global_encode_mlp, [1, self.max_seq_len]),
                [self.batch_size, self.max_seq_len, -1])
            self.represent_size += global_size
            print("represent_size: {}".format(self.represent_size))
            embedded_words = tf.concat([embedded_words, global_encode_mlp],
                                       axis=-1)
        elif self._config.attention_type == "same_init":
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            global_encode = tf.reduce_max(global_encode, axis=1)
            global_encode_mlp = layer.fc_fun(
                global_encode,
                self.rnn_units,
                initial_type=self._config.initial_type)
            global_encode_mlp = tf.tile(global_encode_mlp,
                                        [1, self.max_seq_len])
            global_encode_mlp = tf.reshape(
                global_encode_mlp,
                [self.batch_size, self.max_seq_len, self.emb_size])
            self.global_fake = global_encode_mlp
        elif self._config.attention_type in ["diff_init", "attend_init"]:
            global_encode, global_encode_units = self.encoder(
                self.sequence, embedded_words)
            global_encode_mlp = layer.fc_fun(
                global_encode,
                self.rnn_units,
                initial_type=self._config.initial_type)
            self.global_fake = tf.reshape(
                global_encode_mlp,
                [self.batch_size, self.max_seq_len, self.emb_size])
        else:
            if self._config.attention_type is not None:
                raise NotImplementedError
        outputs = self.cnn(embedded_words, global_infor=self.global_fake)

        outputs = tf.nn.leaky_relu(outputs)
        outputs = tf.layers.batch_normalization(outputs,
                                                training=self.is_train)

        fcl_output = layer.fc_fun(outputs,
                                  2000,
                                  initial_type='xavier',
                                  activation=self._config.fc_activation_1)
        fcl_output = tf.layers.dropout(fcl_output,
                                       rate=self.fc_drop,
                                       training=self.is_train)
        fcl_output = layer.fc_fun(fcl_output,
                                  self.n_classes,
                                  initial_type='xavier')
        return fcl_output
Example #11
    def logits_rnn(self):
        self.represent_size = self.emb_size
        self.initial_state = None
        # self.initial_state = self.rnn_cell.zero_state(self.batch_size*self.max_seq_len, dtype=tf.float32)
        # self.initial_state = tf.cast(self.initial_state, tf.float32)

        embedded_words = self.embedding(self.sequence)
        # embedded_words = tf.layers.dropout(embedded_words, self.fc_drop, training=self.is_train)
        if self._config.global_size:
            global_size = self._config.global_size
        else:
            global_size = self.rnn_units
        if self._config.attention_type == "pre_attention":
            global_encode, global_encode_units, last_state = self.encoder(
                self.sequence, embedded_words)
            embedded_words = global_encode
        elif self._config.attention_type == "diff_concat":
            global_encode, global_encode_units, last_state = self.encoder(
                self.sequence, embedded_words)
            global_encode_mlp = layer.fc_fun(
                global_encode,
                global_size,
                initial_type=self._config.initial_type)
            self.represent_size += global_size
            embedded_words = tf.concat([embedded_words, global_encode_mlp],
                                       axis=-1)
        elif self._config.attention_type == "same_concat":
            global_encode, global_encode_units, last_state = self.encoder(
                self.sequence, embedded_words)
            global_encode_mlp = layer.fc_fun(
                global_encode,
                global_size,
                initial_type=self._config.initial_type)
            global_encode_mlp = tf.reduce_max(global_encode_mlp, axis=1)
            global_encode_mlp = tf.reshape(
                tf.tile(global_encode_mlp, [1, self.max_seq_len]),
                [self.batch_size, self.max_seq_len, -1])
            self.represent_size += global_size
            print(self.represent_size)
            embedded_words = tf.concat([embedded_words, global_encode_mlp],
                                       axis=-1)
        elif self._config.attention_type == "same_init":
            global_encode, global_encode_units, last_state = self.encoder(
                self.sequence, embedded_words)
            if self._config.encode_type == "rnn":
                pass  # keep the RNN encoder's last state as-is
                # last_state = layer.fc_fun(
                #     last_state, self.rnn_units, initial_type=self._config.initial_type,
                #     factor=self._config.xavier_factor, activation="relu")
            elif self._config.encode_type == "cnn":
                last_state = tf.reduce_max(global_encode, 1)
                last_state = tf.layers.batch_normalization(
                    last_state, training=self.is_train)
                last_state = layer.fc_fun(
                    last_state,
                    self.rnn_units,
                    initial_type=self._config.initial_type,
                    activation="relu")
            print("********initial state*********")
            last_state = tf.tile(last_state, [1, self.max_seq_len])
            # last_state = tf.reshape(last_state, [self.batch_size, self.rnn_units, self.max_seq_len])
            last_state = tf.reshape(
                last_state,
                [self.batch_size * self.max_seq_len, self.rnn_units])
            print("initial_state.shape {}".format(last_state.shape))
            self.initial_state = last_state
            # self.initial_state = tf.get_variable(name='initial_state', shape=[self.batch_size*self.max_seq_len, self.rnn_units])
        elif self._config.attention_type in ["diff_init", "attend_init"]:
            global_encode, global_encode_units, last_state = self.encoder(
                self.sequence, embedded_words)
            global_encode_mlp = layer.fc_fun(
                global_encode,
                self.rnn_units,
                initial_type=self._config.initial_type)
            self.global_fake = tf.reshape(
                global_encode_mlp,
                [self.batch_size, self.max_seq_len, self.emb_size])
            # even if we make self.initial_state a variable, it still doesn't update, since no gradient flows into it
            # init_state = tf.get_variable(name='initial_state', shape=[self.batch_size*self.max_seq_len, self.rnn_units])
        else:
            if self._config.attention_type is not None:
                raise NotImplementedError

        input_pad = self.pre_pad(
            embedded_words)  # [batch_size, block, window_size, emb_size]
        drnn_output = self.d_rnn(input_pad)  # [batch_size, block, rnn_units]

        # self.tmp2 = drnn_output
        drnn_output = tf.reshape(drnn_output, [-1, self.mlp_units])
        drnn_output = tf.matmul(drnn_output, self.WC)
        drnn_output = tf.reshape(drnn_output,
                                 [self.batch_size, -1, self.mlp_units])

        mask = tf.sequence_mask(
            self.sequence_length, self.max_seq_len,
            dtype=drnn_output.dtype)  # [batch_size, max_seq_len]
        mask = tf.reshape(mask, [self.batch_size, self.max_seq_len, 1])
        # self.tmp1 = mask
        drnn_output = drnn_output * mask

        hs = tf.reduce_max(drnn_output, axis=1)
        # hs = tf.layers.dropout(hs, self.fc_drop, training=self.is_train)

        mlp = tf.matmul(hs, self.W)

        mlp = tf.layers.batch_normalization(mlp, training=self.is_train)
        mlp = tf.nn.relu(mlp)

        # mlp = tf.layers.dropout(mlp, self.fc_drop, training=self.is_train)
        fcl_output = tf.matmul(mlp, self.u)
        # self.tmp2 = fcl_output

        return fcl_output