def __init__(self, FLAGS=None):
        self.FLAGS = FLAGS
        self.config = config

        self.diff_len = config.max_diff_len
        self.seq_len = config.max_sent_len
        self.embed_size = config.word_dim
        self.num_class = config.num_class
        self.lstm_size = config.lstm_size

        # Add Word Embedding
        self.we = tf.Variable(FLAGS.we, name='emb')
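        # NOTE: FLAGS.we is assumed to be a pre-trained embedding matrix of shape
        # [vocab_size, word_dim]; wrapping it in a Variable lets it be fine-tuned.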

        # Add PlaceHolder

        # define basic four input layers - for warrant0, warrant1, reason, claim
        self.input_warrant0 = tf.placeholder(tf.int32, (None, self.seq_len), name='warrant0')  # [batch_size, sent_len]
        self.input_warrant1 = tf.placeholder(tf.int32, (None, self.seq_len), name='warrant1')  # [batch_size, sent_len]
        self.input_reason = tf.placeholder(tf.int32, (None, self.seq_len), name='reason')  # [batch_size, sent_len]
        self.input_claim = tf.placeholder(tf.int32, (None, self.seq_len), name='claim')  # [batch_size, sent_len]
        self.input_debate = tf.placeholder(tf.int32, (None, self.seq_len), name='debate')  # [batch_size, sent_len]

        self.warrant0_len = tf.placeholder(tf.int32, (None, ), name='warrant0_len')  # [batch_size,]
        self.warrant1_len = tf.placeholder(tf.int32, (None, ), name='warrant1_len')  # [batch_size,]
        self.reason_len = tf.placeholder(tf.int32, (None, ), name='reason_len')  # [batch_size,]
        self.claim_len = tf.placeholder(tf.int32, (None, ), name='claim_len')  # [batch_size,]
        self.debate_len = tf.placeholder(tf.int32, (None, ), name='debate_len')  # [batch_size,]

        self.target_label = tf.placeholder(tf.int32, (None, self.num_class), name='label')  # [batch_size, num_class]

        self.drop_keep_rate = tf.placeholder(tf.float32)
        self.learning_rate = tf.placeholder(tf.float32)

        self.input_diff_warrant0 = tf.placeholder(tf.int32, (None, self.diff_len), name='diff_warrant0')  # [batch_size, sent_len]
        self.input_diff_warrant1 = tf.placeholder(tf.int32, (None, self.diff_len), name='diff_warrant1')  # [batch_size, sent_len]
        self.diff_warrant0_len = tf.placeholder(tf.int32, (None,), name='diff_warrant0_len')  # [batch_size,]
        self.diff_warrant1_len = tf.placeholder(tf.int32, (None,), name='diff_warrant1_len')  # [batch_size,]

        self.input_diff_claim = tf.placeholder(tf.int32, (None, self.diff_len), name='diff_claim')
        self.diff_claim_len = tf.placeholder(tf.int32, (None,), name='diff_claim_len')

        # now define embedded layers of the input
        embedded_warrant0 = tf.nn.embedding_lookup(self.we, self.input_warrant0)
        embedded_warrant1 = tf.nn.embedding_lookup(self.we, self.input_warrant1)
        embedded_reason = tf.nn.embedding_lookup(self.we, self.input_reason)
        embedded_claim = tf.nn.embedding_lookup(self.we, self.input_claim)
        embedded_debate = tf.nn.embedding_lookup(self.we, self.input_debate)

        embedded_diff_warrant0 = tf.nn.embedding_lookup(self.we, self.input_diff_warrant0)
        embedded_diff_warrant1 = tf.nn.embedding_lookup(self.we, self.input_diff_warrant1)

        embedded_diff_claim = tf.nn.embedding_lookup(self.we, self.input_diff_claim)

        def conv_ngram(input_x, filter_sizes=(1, 2, 3), num_filters=32):
            """
            Conv ngram
            """
            sent_len = int(input_x.get_shape()[1])
            embed_size = int(input_x.get_shape()[2])
            input_x = tf.expand_dims(input_x, axis=-1)
            outputs = []
            for i, filter_size in enumerate(filter_sizes):
                with tf.variable_scope("conv-maxpool-%s" % filter_size):
                    filter_shape = [filter_size, embed_size, 1, num_filters]
                    W = tf.get_variable("W", filter_shape, initializer=tf.random_normal_initializer())
                    b = tf.get_variable("b", [num_filters], initializer=tf.constant_initializer(0.0))
                    conv = tf.nn.conv2d(input_x, W, strides=[1, 1, embed_size, 1], padding='SAME', name="conv")
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                    h = tf.squeeze(h, axis=2)
                    outputs.append(h)
            outputs = tf.concat(outputs, axis=2)
            return outputs

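        # Apply the same conv-ngram filters (shared through s.reuse_variables()) to every
        # text field: each call maps [batch, sent_len, emb_size] to
        # [batch, sent_len, num_filters * len(filter_sizes)].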
        with tf.variable_scope("conv") as s:
            conv_warrant0 = conv_ngram(embedded_warrant0)
            s.reuse_variables()
            conv_warrant1 = conv_ngram(embedded_warrant1)
            conv_reason = conv_ngram(embedded_reason)
            conv_claim = conv_ngram(embedded_claim)
            conv_debate = conv_ngram(embedded_debate)
            conv_diff_warrant0 = conv_ngram(embedded_diff_warrant0)
            conv_diff_warrant1 = conv_ngram(embedded_diff_warrant1)
            conv_diff_claim = conv_ngram(embedded_diff_claim)

        def AttBiLSTM(attention_vector, input_x, input_x_len, hidden_size, rnn_type='lstm', return_sequence=True):
            """
            AttBiLSTM layer
            """
            if rnn_type == 'lstm':
                Cell = AttBasicLSTMCell
            elif rnn_type == 'gru':
                Cell = AttGRUCell
            else:
                raise NotImplementedError

            cell_fw = Cell(attention_vector, num_units=hidden_size)
            cell_bw = Cell(attention_vector, num_units=hidden_size)

            b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, input_x,
                                                                  sequence_length=input_x_len, dtype=tf.float32)
            if return_sequence:
                outputs = tf.concat(b_outputs, axis=2)
            else:
                # states: [c, h]
                if rnn_type == 'lstm':
                    outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
                elif rnn_type == 'gru':
                    outputs = tf.concat(b_states, axis=-1)
                else:
                    raise NotImplementedError
            return outputs

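        # Max-pool the conv features of the "diff" inputs (presumably the tokens by which
        # the two warrants / the claim differ); these pooled vectors are used as the
        # attention vectors for the AttBiLSTM calls below.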
        pooling_diff_warrant0 = tf_utils.MaxPooling(conv_diff_warrant0, self.diff_warrant0_len)
        pooling_diff_warrant1 = tf_utils.MaxPooling(conv_diff_warrant1, self.diff_warrant1_len)
        pooling_diff_claim = tf_utils.MaxPooling(conv_diff_claim, self.diff_claim_len)

        with tf.variable_scope("att_warrant_lstm") as s:
            bilstm_warrant0 = AttBiLSTM(pooling_diff_warrant0, conv_warrant0, self.warrant0_len, self.lstm_size,
                                        rnn_type=FLAGS.rnn_type)
            s.reuse_variables()
            bilstm_warrant1 = AttBiLSTM(pooling_diff_warrant1, conv_warrant1, self.warrant1_len, self.lstm_size,
                                        rnn_type=FLAGS.rnn_type)
            bilstm_claim = AttBiLSTM(pooling_diff_claim, conv_claim, self.claim_len, self.lstm_size,
                                     rnn_type=FLAGS.rnn_type)

        with tf.variable_scope("bi_lstm") as s:
            bilstm_reason = tf_utils.BiLSTM(conv_reason, self.reason_len, self.lstm_size, rnn_type=FLAGS.rnn_type)
            s.reuse_variables()
            # bilstm_claim = tf_utils.BiLSTM(conv_claim, self.claim_len, self.lstm_size, rnn_type=FLAGS.rnn_type)
            bilstm_debate = tf_utils.BiLSTM(conv_debate, self.debate_len, self.lstm_size, rnn_type=FLAGS.rnn_type)

        with tf.variable_scope("pooling") as s:
            ''' Pooling Layer '''
            pooling_warrant0 = tf_utils.MaxPooling(bilstm_warrant0, self.warrant0_len)
            pooling_warrant1 = tf_utils.MaxPooling(bilstm_warrant1, self.warrant1_len)
            pooling_reason = tf_utils.MaxPooling(bilstm_reason, self.reason_len)
            pooling_claim = tf_utils.MaxPooling(bilstm_claim, self.claim_len)
            pooling_debate = tf_utils.MaxPooling(bilstm_debate, self.debate_len)

        attention_vector_for_W0 = tf.concat([pooling_debate, pooling_reason, pooling_warrant0, pooling_claim, pooling_diff_warrant0], axis=-1)
        attention_vector_for_W1 = tf.concat([pooling_debate, pooling_reason, pooling_warrant1, pooling_claim, pooling_diff_warrant1], axis=-1)

        with tf.variable_scope("att_lstm") as s:
            attention_warrant0 = AttBiLSTM(attention_vector_for_W0, bilstm_warrant0, self.warrant0_len, self.lstm_size,
                                           rnn_type=FLAGS.rnn_type,
                                           return_sequence=False)
            s.reuse_variables()
            attention_warrant1 = AttBiLSTM(attention_vector_for_W1, bilstm_warrant1, self.warrant1_len, self.lstm_size,
                                           rnn_type=FLAGS.rnn_type,
                                           return_sequence=False)

        self.attention_warrant0 = attention_warrant0
        self.attention_warrant1 = attention_warrant1

        # concatenate them
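        # Matching features: reason-claim element-wise product, both attended warrant
        # vectors, and their difference and element-wise product.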
        merge_warrant = tf.concat([pooling_reason * pooling_claim,
                                   attention_warrant0, attention_warrant1,
                                   attention_warrant0 - attention_warrant1,
                                   attention_warrant0 * attention_warrant1], axis=-1)
        dropout_warrant = tf.nn.dropout(merge_warrant, self.drop_keep_rate)

        # and add one extra layer with ReLU
        with tf.variable_scope("linear") as s:
            dense1 = tf.nn.relu(tf_utils.linear(dropout_warrant, int(self.lstm_size / 2), bias=True, scope='dense'))
            logits = tf_utils.linear(dense1, self.num_class, bias=True, scope='logit')

        # Obtain the Predict, Loss, Train_op
        predict_prob = tf.nn.softmax(logits, name='predict_prob')
        predict_label = tf.cast(tf.argmax(logits, axis=1), tf.int32)

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.target_label)
        loss = tf.reduce_mean(loss)

        # Build the loss
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = tf_utils.optimize(loss, 'adam', FLAGS.lambda_l2, self.learning_rate, global_step, FLAGS.clipper)

        self.predict_prob = predict_prob
        self.predict_label = predict_label
        self.loss = loss
        self.train_op = train_op
        self.global_step = global_step
    def __init__(self, FLAGS=None):
        self.FLAGS = FLAGS
        self.config = config
        self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step")
        self.epoch_increment = tf.assign(self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
        self.seq_len = config.max_sent_len
        self.embed_size = config.word_dim
        self.num_class = config.num_class
        self.filter_sizes = [1, 2, 3, 4]
        self.num_filters = FLAGS.num_filters
        self.initializer = tf.random_normal_initializer(stddev=0.1)
        # Add PlaceHolder
        self.input_x = tf.placeholder(tf.int32, (None, self.seq_len))  # [batch_size, sent_len]
        self.input_x_len = tf.placeholder(tf.int32, (None,))
        self.input_y = tf.placeholder(tf.int32, (None, self.num_class))
        # self.mlp_h1_size = 200
        # self.mlp_h2_size = 140
        self.drop_keep_rate = tf.placeholder(tf.float32)
        self.drop_hidden1 = tf.placeholder(tf.float32)
        self.drop_hidden2 = tf.placeholder(tf.float32)
        self.learning_rate = tf.placeholder(tf.float32)

        # Add Word Embedding
        self.we = tf.Variable(FLAGS.we, name='emb')

        # Build the Computation Graph
        def CNN(input_x, seq_len, filter_sizes, num_filters=1, dropout_rate=None):
            """
            CNN Layer
            Args:
                input_x: [batch, sent_len, emb_size, 1]
                seq_len: int
                filter_sizes: list
                num_filters: int
                dropout_rate: float
            Returns:
                outputs: [batch, num_filters * len(filter_sizes)]
            """
            pooled_outputs = []
            for i, filter_size in enumerate(filter_sizes):
                with tf.name_scope("convolution-pooling-%s" % filter_size):
                    # ====>a.create filter
                    filter = tf.get_variable("filter-%s" % filter_size, [filter_size, self.embed_size, 1, num_filters],
                                             initializer=self.initializer)
                    # ====>b. conv operation: conv2d computes a 2-D convolution over a 4-D
                    # NHWC input [batch, in_height, in_width, in_channels] using a filter of
                    # shape [filter_height, filter_width, in_channels, out_channels].
                    # With VALID padding and stride 1, each filter size produces an output of
                    # shape [batch_size, seq_len - filter_size + 1, 1, num_filters].
                    conv = tf.nn.conv2d(input_x, filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                    # ====>c. apply nonlinearity
                    b = tf.get_variable("b-%s" % filter_size, [num_filters])  # ADD 2017-06-09
                    # shape: [batch_size, sent_len - filter_size + 1, 1, num_filters]. tf.nn.bias_add:adds `bias` to `value`
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), "relu")
                    # ====>d. max-pooling: the window spans the whole feature map
                    # (ksize height = seq_len - filter_size + 1), so the output has
                    # shape [batch_size, 1, 1, num_filters].
                    pooled = tf.nn.max_pool(h, ksize=[1, seq_len - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="pool")
                    pooled_outputs.append(pooled)
            # 3.=====>combine all pooled features along the channel dimension.
            # e.g. concatenating two [3, 3] tensors gives shape [6, 3] along axis 0
            # and [3, 6] along axis 1.
            # h_pool shape: [batch_size, 1, 1, num_filters_total],
            # where num_filters_total = num_filters * len(filter_sizes).
            h_pool = tf.concat(pooled_outputs, -1)
            num_filters_total = num_filters * len(filter_sizes)
            # reshape to [batch_size, num_filters_total]: flattens the pooled feature maps
            # (comparable to tf.squeeze here).
            outputs = tf.reshape(h_pool, [-1, num_filters_total])

            # 4.=====>add dropout: use tf.nn.dropout
            if dropout_rate is not None:
                # [None, num_filters_total]
                outputs = tf.nn.dropout(outputs, keep_prob=dropout_rate)

            # 5. logits(use linear layer)and predictions(argmax)
            # with tf.name_scope("output"):
            #     # shape:[None, self.num_classes]==tf.matmul([None,self.embed_size],[self.embed_size,self.num_classes])
            #     logits = tf.matmul(self.h_drop, self.W_projection) + self.b_projection
            return outputs

        # TODO: implement CNN Begin:
        inputs = tf.nn.embedding_lookup(self.we, self.input_x)  # [batch_size, sent_len, emb_size]
        inputs_embeddings_expanded = tf.expand_dims(inputs, -1)
        cnn_x = CNN(inputs_embeddings_expanded, self.seq_len, self.filter_sizes, self.num_filters, self.drop_keep_rate)
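        # cnn_x: [batch_size, num_filters * len(filter_sizes)] sentence representation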
        # TODO: implement CNN end
        # hidden1 = tf.nn.relu(tf_utils.linear(cnn_x, self.mlp_h1_size, bias=True, scope='h1'))
        # hidden1_drop = tf.nn.dropout(hidden1, keep_prob=self.drop_keep_rate)
        # hidden2 = tf.nn.relu(tf_utils.linear(hidden1_drop, self.mlp_h2_size, bias=True, scope='h2'))
        # hidden2_drop = tf.nn.dropout(hidden2, keep_prob=self.drop_keep_rate)
        logits = tf_utils.linear(cnn_x, self.num_class, bias=True, scope='softmax')

        # Obtain the Predict, Loss, Train_op
        predict_prob = tf.nn.softmax(logits, name='predict_prob')
        predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.input_y)
        loss = tf.reduce_mean(loss)

        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if v.get_shape().ndims > 1])
        reg_loss = loss + FLAGS.lambda_l2 * l2_loss
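        # NOTE: train_op below minimizes the unregularized `loss`; `reg_loss` is only
        # exposed as self.reg_loss for monitoring.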

        # Build the loss
        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # optimizer = tf.train.AdadeltaOptimizer(self.learning_rate)
        # optimizer = tf.train.AdagradOptimizer(self.learning_rate)

        if FLAGS.clipper:
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), FLAGS.clipper)
            train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
        else:
            train_op = optimizer.minimize(loss, global_step=global_step)

        self.predict_prob = predict_prob
        self.predict_label = predict_label
        self.seq_res = cnn_x
        self.logits = logits
        self.loss = loss
        self.reg_loss = reg_loss
        self.train_op = train_op
        self.global_step = global_step
    def __init__(self, FLAGS=None):
        self.FLAGS = FLAGS
        self.config = config
        self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step")
        self.epoch_increment = tf.assign(
            self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
        self.seq_len = config.max_sent_len
        self.word_len = config.max_word_len
        self.word_embed_size = config.word_dim
        self.char_embed_size = config.char_dim
        self.num_class = config.num_class
        self.num_vocab = FLAGS.num_vocab
        self.initializer = tf.random_normal_initializer(stddev=0.1)
        self.filter_sizes = [1, 2, 3, 4]
        self.num_filters = FLAGS.num_filters
        self.char_lstm_size = 50
        self.lstm_size = 512
        self.mlp_h1_size = 200
        self.layer_size = FLAGS.layer_size
        self.with_char = FLAGS.with_char
        self.char_type = FLAGS.char_type
        self.with_ner = FLAGS.with_ner
        self.with_pos = FLAGS.with_pos
        self.with_rf = FLAGS.with_rf
        self.with_pun = FLAGS.with_pun
        self.with_senti = FLAGS.with_senti
        self.with_attention = FLAGS.with_attention
        self.with_cnn = FLAGS.with_cnn
        self.with_cnn_lstm = FLAGS.with_cnn_lstm
        self.drop_keep_rate = tf.placeholder(tf.float32)
        self.drop_hidden1 = tf.placeholder(tf.float32)
        self.learning_rate = tf.placeholder(tf.float32)

        # Add PlaceHolder
        self.input_x = tf.placeholder(
            tf.int32, (None, self.seq_len))  # [batch_size, sent_len]
        self.input_x_len = tf.placeholder(tf.int32, (None, ))  # [batch_len]
        self.input_y = tf.placeholder(
            tf.int32, (None, self.num_class))  # [batch_size, label_size]
        # Add Word Embedding
        self.we = tf.Variable(FLAGS.we, name='emb')
        if self.with_ner:
            self.input_x_ner = tf.placeholder(tf.int32, (None, self.seq_len))
            self.ner_we = tf.Variable(FLAGS.ner_we, name='ner_emb')
        if self.with_pos:
            self.input_x_pos = tf.placeholder(tf.int32, (None, self.seq_len))
            self.pos_we = tf.Variable(FLAGS.pos_we, name='pos_emb')
        if self.with_rf:
            self.input_rf = tf.placeholder(tf.float32, (None, self.num_vocab))
        if self.with_pun:
            self.input_x_pun = tf.placeholder(tf.float32, (None, 9))
        if self.with_senti:
            self.input_x_senti = tf.placeholder(tf.float32, (None, 110))
        if self.with_char:
            # [batch_size, sent_len, word_len]
            self.input_x_char = tf.placeholder(
                tf.int32, (None, self.seq_len, self.word_len))
            self.input_x_char_len = tf.placeholder(
                tf.int32, (None, self.seq_len))  # [batch_size, sen_len]
            # The Char Embedding is Random Initialization
            self.char_we = tf.Variable(FLAGS.char_we, name='char_emb')

        # attention process:
        # 1.get logits for each word in the sentence.
        # 2.get possibility distribution for each word in the sentence.
        # 3.get weighted sum for the sentence as sentence representation.
        def attention_word_level(hidden_state,
                                 hidden_size,
                                 sequence_length,
                                 seq_len,
                                 scope=None,
                                 reuse=None):
            """
            hidden_state: [batch_size, sequence_length, hidden_size*2]
            context vector:
            :return [batch_size*num_sentences, hidden_size*2]
            """
            with tf.variable_scope(scope or "attention", reuse=reuse):
                self.W_w_attention_word = tf.get_variable(
                    "W_w_attention_word",
                    shape=[hidden_size * 2, hidden_size * 2])
                self.W_b_attention_word = tf.get_variable(
                    "W_b_attention_word", shape=[hidden_size * 2])
                self.context_vecotor_word = tf.get_variable(
                    "what_is_the_informative_word",
                    shape=[hidden_size * 2])  # TODO: is it o.k. to use batch_size as the first dimension?
                # 0) one layer of feed forward network
                # shape: [batch_size*sequence_length, hidden_size*2]
                hidden_state_ = tf.reshape(hidden_state,
                                           shape=[-1, hidden_size * 2])
                # hidden_state_: [batch_size*sequence_length, hidden_size*2]
                # W_w_attention_sentence: [hidden_size*2, hidden_size*2]
                hidden_representation = tf.nn.tanh(
                    tf.matmul(hidden_state_, self.W_w_attention_word) +
                    self.W_b_attention_word)
                # shape: [batch_size, sequence_length, hidden_size*2]
                hidden_representation = tf.reshape(
                    hidden_representation,
                    shape=[-1, sequence_length, hidden_size * 2])

                # 1) get logits for each word in the sentence.
                # hidden_representation: [batch_size, sequence_length, hidden_size*2]
                # context_vecotor_word: [hidden_size*2]
                hidden_state_context_similiarity = tf.multiply(
                    hidden_representation, self.context_vecotor_word)
                # element-wise multiply, then sum to get one attention weight per word
                # shape: [batch_size, sequence_length]
                attention_logits = tf.reduce_sum(
                    hidden_state_context_similiarity, axis=2)
                # subtract max for numerical stability (softmax is shift invariant).
                # tf.reduce_max:Computes the maximum of elements across dimensions of a tensor.
                # shape: [batch_size, 1]
                attention_logits_max = tf.reduce_max(attention_logits,
                                                     axis=1,
                                                     keep_dims=True)
                # 2) get possibility distribution for each word in the sentence.
                # shape: [batch_size, sequence_length]
                # normalize
                p_attention = tf.nn.softmax(attention_logits -
                                            attention_logits_max)
                # 3) get weighted hidden state by attention vector
                # shape: [batch_size, sequence_length, 1]
                p_attention_expanded = tf.expand_dims(p_attention, axis=2)
                # weighted hidden states:
                # p_attention_expanded: [batch_size, sequence_length, 1]
                # hidden_state:         [batch_size, sequence_length, hidden_size*2]
                # result:                [batch_size, sequence_length, hidden_size*2]
                sentence_representation = tf.multiply(p_attention_expanded,
                                                      hidden_state)
                # mask out padded positions before summing over the sequence
                sentence_representation = tf_utils.Mask(
                    sentence_representation, seq_len, config.max_sent_len)
                sentence_representation = tf.reduce_sum(
                    sentence_representation, axis=1)
                # shape: [batch_size, hidden_size*2]
                return sentence_representation

        def BiLSTM(input_x,
                   input_x_len,
                   hidden_size,
                   num_layers=1,
                   dropout_rate=None,
                   return_sequence=True):
            """
            Update 2017.11.21
            fix a bug
            ref: https://stackoverflow.com/questions/44615147/valueerror-trying-to-share-variable-rnn-multi-rnn-cell-cell-0-basic-lstm-cell-k
            ======
            BiLSTM Layer
            Args:
                input_x: [batch, sent_len, emb_size]
                input_x_len: [batch, ]
                hidden_size: int
                num_layers: int
                dropout_rate: float
                return_sequence: True/False
            Returns:
                if return_sequence=True:
                    outputs: [batch, sent_len, hidden_size*2]
                else:
                    output: [batch, hidden_size*2]
            """
            def lstm_cell():
                return tf.contrib.rnn.BasicLSTMCell(hidden_size)

            # cell = tf.contrib.rnn.GRUCell(hidden_size)
            # cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size)
            # cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size)

            if num_layers >= 1:
                # Note: each layer must use a separate cell instance (hence the factory)
                cell_fw = tf.contrib.rnn.MultiRNNCell(
                    [lstm_cell() for _ in range(num_layers)])
                cell_bw = tf.contrib.rnn.MultiRNNCell(
                    [lstm_cell() for _ in range(num_layers)])

            if dropout_rate is not None:
                cell_fw = tf.contrib.rnn.DropoutWrapper(
                    cell_fw, output_keep_prob=dropout_rate)
                cell_bw = tf.contrib.rnn.DropoutWrapper(
                    cell_bw, output_keep_prob=dropout_rate)

            b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                input_x,
                sequence_length=input_x_len,
                dtype=tf.float32)
            if return_sequence:
                # b_outputs: [[b, sl, h],[b, sl, h]]
                outputs = tf.concat(b_outputs, axis=2)
            else:
                # b_states: (([b, c], [b, h]), ([b, c], [b, h]))
                outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
            return outputs

        def CNN(input_x,
                seq_len,
                filter_sizes,
                num_filters,
                embed_size,
                dropout_rate=None):
            """
            CNN Layer
            Args:
                input_x: [batch, sent_len, emb_size, 1]
                seq_len: int
                filter_sizes: list
                num_filters: int
                dropout_rate: float
            Returns:
                outputs: [batch, num_filters * len(filter_sizes)]
            """
            pooled_outputs = []
            for i, filter_size in enumerate(filter_sizes):
                with tf.name_scope("convolution-pooling-%s" % filter_size):
                    # ====>a.create filter
                    filter = tf.get_variable(
                        "filter-%s" % filter_size,
                        [filter_size, embed_size, 1, num_filters],
                        initializer=self.initializer)
                    # ====>b. conv operation: conv2d computes a 2-D convolution over a 4-D
                    # NHWC input [batch, in_height, in_width, in_channels] using a filter of
                    # shape [filter_height, filter_width, in_channels, out_channels].
                    # With VALID padding and stride 1, each filter size produces an output of
                    # shape [batch_size, seq_len - filter_size + 1, 1, num_filters].
                    conv = tf.nn.conv2d(input_x,
                                        filter,
                                        strides=[1, 1, 1, 1],
                                        padding="VALID",
                                        name="conv")
                    # ====>c. apply nonlinearity
                    b = tf.get_variable("b-%s" % filter_size, [num_filters])
                    # shape: [batch_size, sent_len - filter_size + 1, 1, num_filters]. tf.nn.bias_add:adds `bias` to `value`
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), "relu")
                    # ====>d. max-pooling: the window spans the whole feature map
                    # (ksize height = seq_len - filter_size + 1), so the output has
                    # shape [batch_size, 1, 1, num_filters].
                    pooled = tf.nn.max_pool(
                        h,
                        ksize=[1, seq_len - filter_size + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name="pool")
                    pooled_outputs.append(pooled)
            # 3.=====>combine all pooled features along the channel dimension.
            # e.g. concatenating two [3, 3] tensors gives shape [6, 3] along axis 0
            # and [3, 6] along axis 1.
            # h_pool shape: [batch_size, 1, 1, num_filters_total],
            # where num_filters_total = num_filters * len(filter_sizes).
            h_pool = tf.concat(pooled_outputs, -1)
            num_filters_total = num_filters * len(filter_sizes)
            # reshape to [batch_size, num_filters_total]: flattens the pooled feature maps
            # (comparable to tf.squeeze here).
            outputs = tf.reshape(h_pool, [-1, num_filters_total])
            # 4.=====>add dropout: use tf.nn.dropout
            if dropout_rate is not None:
                # [None, num_filters_total]
                outputs = tf.nn.dropout(outputs, keep_prob=dropout_rate)

            # 5. logits(use linear layer)and predictions(argmax)
            # with tf.name_scope("output"):
            #     # shape:[None, self.num_classes]==tf.matmul([None,self.embed_size],[self.embed_size,self.num_classes])
            #     logits = tf.matmul(self.h_drop, self.W_projection) + self.b_projection
            return outputs

        # Build the Computation Graph
        # [batch_size, sent_len, word_emd_size]
        embedded_x = tf.nn.embedding_lookup(self.we, self.input_x)
        batch_size = tf.shape(embedded_x)[0]
        if self.with_char:
            if self.char_type == 'lstm':
                # [batch_size, sent_len, word_len, char_emd_size]
                embedded_x_char = tf.nn.embedding_lookup(
                    self.char_we, self.input_x_char)
                # batch_size = tf.shape(embedded_x_char)[0]
                # [batch_size * sent_len, word_len, char_emd_size]
                embedded_x_char = tf.reshape(
                    embedded_x_char, [-1, self.word_len, self.char_embed_size])
                input_x_char_lens = tf.reshape(self.input_x_char_len, [-1])
                with tf.variable_scope("char_bilstm") as clstm:
                    # [batch_size * sent_len, word_len, char_emd_size]
                    char_lstm_x = BiLSTM(embedded_x_char,
                                         input_x_char_lens,
                                         self.char_lstm_size,
                                         dropout_rate=1.0,
                                         return_sequence=True)
                    char_lstm_x = char_lstm_x[:, -1, :]
                    char_x = tf.reshape(
                        char_lstm_x,
                        [batch_size, self.seq_len, self.char_lstm_size * 2])
            if self.char_type == 'cnn':
                embedded_x_char = tf.nn.embedding_lookup(
                    self.char_we, self.input_x_char)
                embedded_x_char = tf.reshape(
                    embedded_x_char, [-1, self.word_len, self.char_embed_size])
                with tf.variable_scope("char_cnn") as ccnn:
                    inputs_char_embeddings_expanded = tf.expand_dims(
                        embedded_x_char, -1)
                    char_cnn_x = CNN(inputs_char_embeddings_expanded,
                                     self.word_len, self.filter_sizes,
                                     self.num_filters, self.char_embed_size,
                                     self.drop_keep_rate)
                    num_filters_total = self.num_filters * len(
                        self.filter_sizes)
                    char_x = tf.reshape(
                        char_cnn_x,
                        [batch_size, self.seq_len, num_filters_total])
        if self.with_ner:
            embedded_x_ner = tf.nn.embedding_lookup(self.ner_we,
                                                    self.input_x_ner)
        if self.with_pos:
            embedded_x_pos = tf.nn.embedding_lookup(self.pos_we,
                                                    self.input_x_pos)
        with tf.variable_scope("seq_bilstm") as s:
            if self.with_ner:
                embedded_x = tf.concat([embedded_x, embedded_x_ner], axis=-1)
            if self.with_pos:
                embedded_x = tf.concat([embedded_x, embedded_x_pos], axis=-1)
            if self.with_char:
                embedded_x = tf.concat([embedded_x, char_x], axis=-1)
            lstm_x = BiLSTM(embedded_x,
                            self.input_x_len,
                            self.lstm_size,
                            self.layer_size,
                            self.drop_keep_rate,
                            return_sequence=True)
        if self.with_cnn:
            inputs_embeddings_expanded = tf.expand_dims(embedded_x, -1)
            cnn_x = CNN(inputs_embeddings_expanded, self.seq_len,
                        self.filter_sizes, self.num_filters,
                        self.word_embed_size, self.drop_keep_rate)
        if self.with_cnn_lstm:
            inputs_hidden_expanded = tf.expand_dims(lstm_x, -1)
            cnn_x = CNN(inputs_hidden_expanded, self.seq_len,
                        self.filter_sizes, self.num_filters,
                        self.lstm_size * 2, self.drop_keep_rate)
        avg_pooling = tf_utils.AvgPooling(lstm_x, self.input_x_len,
                                          self.seq_len)
        max_pooling = tf_utils.MaxPooling(lstm_x, self.input_x_len)
        last_lstm = lstm_x[:, -1, :]
        last_lstm = tf.reshape(last_lstm, [batch_size, self.lstm_size * 2])
        seq_distribution = tf.concat([avg_pooling, max_pooling, last_lstm],
                                     axis=-1)
        if self.with_attention:
            attention = attention_word_level(lstm_x, self.lstm_size,
                                             self.seq_len, self.input_x_len)
            seq_distribution = tf.concat([last_lstm, attention], axis=-1)
        if self.with_rf:
            seq_distribution = tf.concat([seq_distribution, self.input_rf],
                                         axis=-1)
        if self.with_pun:
            seq_distribution = tf.concat([seq_distribution, self.input_x_pun],
                                         axis=-1)
        if self.with_senti:
            seq_distribution = tf.concat(
                [seq_distribution, self.input_x_senti], axis=-1)
        if self.with_cnn:
            seq_distribution = tf.concat([seq_distribution, cnn_x], axis=-1)
        if self.with_cnn_lstm:
            seq_distribution = tf.concat([seq_distribution, cnn_x], axis=-1)
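        # seq_distribution: the sentence representation fed to the MLP. Depending on the
        # FLAGS it is either [avg_pool, max_pool, last_state] or [last_state, attention],
        # optionally concatenated with rf / punctuation / sentiment and CNN features.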

        hidden1 = tf.nn.relu(
            tf_utils.linear(seq_distribution,
                            self.mlp_h1_size,
                            bias=True,
                            scope='h1'))
        logits = tf_utils.linear(hidden1,
                                 self.num_class,
                                 bias=True,
                                 scope='softmax')

        # Obtain the Predict, Loss, Train_op
        predict_prob = tf.nn.softmax(logits, name='predict_prob')
        predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=self.input_y)
        loss = tf.reduce_mean(loss)

        l2_loss = tf.add_n([
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if v.get_shape().ndims > 1
        ])
        reg_loss = loss + FLAGS.lambda_l2 * l2_loss

        # Build the loss
        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # optimizer = tf.train.AdadeltaOptimizer(self.learning_rate)
        # optimizer = tf.train.AdagradOptimizer(self.learning_rate)

        if FLAGS.clipper:
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                              FLAGS.clipper)
            train_op = optimizer.apply_gradients(list(zip(grads, tvars)),
                                                 global_step=global_step)
        else:
            train_op = optimizer.minimize(loss, global_step=global_step)

        self.predict_prob = predict_prob
        self.predict_label = predict_label
        self.seq_res = hidden1
        self.logits = logits
        self.loss = loss
        self.reg_loss = reg_loss
        self.train_op = train_op
        self.global_step = global_step
    def __init__(self, FLAGS=None):
        self.FLAGS = FLAGS
        self.config = config
        self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step")
        self.epoch_increment = tf.assign(
            self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
        self.seq_len = config.max_sent_len
        self.embed_size = config.word_dim
        self.num_class = config.num_class
        self.mlp_h1_size = 140
        self.mlp_h2_size = 140

        # Add PlaceHolder
        self.input_x = tf.placeholder(
            tf.int32, (None, self.seq_len))  # [batch_size, sent_len]
        self.input_x_len = tf.placeholder(tf.int32, (None, ))
        self.input_y = tf.placeholder(tf.int32, (None, self.num_class))

        self.drop_keep_rate = tf.placeholder(tf.float32)
        self.drop_hidden1 = tf.placeholder(tf.float32)
        self.drop_hidden2 = tf.placeholder(tf.float32)
        self.learning_rate = tf.placeholder(tf.float32)

        # Add Word Embedding
        self.we = tf.Variable(FLAGS.we, name='emb')

        # Build the Computation Graph
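        # A simple baseline: average the word embeddings over the (length-masked) sentence,
        # then classify with a two-layer ReLU MLP.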
        inputs = tf.nn.embedding_lookup(
            self.we, self.input_x)  # [batch_size, sent_len, emb_size]
        avg_pooling = tf_utils.AvgPooling(inputs, self.input_x_len,
                                          self.seq_len)
        hidden1 = tf.nn.relu(
            tf_utils.linear(avg_pooling,
                            self.mlp_h1_size,
                            bias=True,
                            scope='h1'))
        hidden1_drop = tf.nn.dropout(hidden1, keep_prob=self.drop_hidden1)
        hidden2 = tf.nn.relu(
            tf_utils.linear(hidden1_drop,
                            self.mlp_h2_size,
                            bias=True,
                            scope='h2'))
        hidden2_drop = tf.nn.dropout(hidden2, keep_prob=self.drop_hidden2)
        logits = tf_utils.linear(hidden2_drop,
                                 self.num_class,
                                 bias=True,
                                 scope='softmax')
        # logits = tf_utils.linear(avg_pooling, self.num_class, bias=True, scope='softmax')

        # Obtain the Predict, Loss, Train_op
        predict_prob = tf.nn.softmax(logits, name='predict_prob')
        predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=self.input_y)
        loss = tf.reduce_mean(loss)

        l2_loss = tf.add_n([
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if v.get_shape().ndims > 1
        ])
        reg_loss = loss + FLAGS.lambda_l2 * l2_loss

        # Build the loss
        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # optimizer = tf.train.AdagradOptimizer(self.learning_rate)

        if FLAGS.clipper:
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                              FLAGS.clipper)
            train_op = optimizer.apply_gradients(list(zip(grads, tvars)),
                                                 global_step=global_step)
        else:
            train_op = optimizer.minimize(loss, global_step=global_step)

        self.predict_prob = predict_prob
        self.predict_label = predict_label
        self.seq_res = hidden2_drop
        self.logits = logits
        self.loss = loss
        self.reg_loss = reg_loss
        self.train_op = train_op
        self.global_step = global_step
    def __init__(self, init='xavier', num_inputs=None, input_dim=None, embed_size=None, l2_w=None, l2_v=None,
                 norm=False, real_inputs=None, comb_mask=None, weight_base=0.6, third_prune=False, 
                 comb_mask_third=None, weight_base_third=0.6, retrain_stage=0):
        self.l2_w = l2_w
        self.l2_v = l2_v
        self.l2_ps = l2_v
        self.third_prune = third_prune
        self.retrain_stage = retrain_stage

        self.inputs, self.labels, self.training = create_placeholder(num_inputs, tf, True)

        inputs, mask, flag, num_inputs = split_data_mask(self.inputs, num_inputs, norm=norm, real_inputs=real_inputs)

        self.xw, self.xv, b, self.xps = embedding_lookup(init=init, input_dim=input_dim, factor=embed_size, inputs=inputs,
                                               apply_mask=flag, mask=mask, third_order=third_prune)

        l = linear(self.xw)
        self.cols, self.rows = generate_pairs(range(self.xv.shape[1]), mask=comb_mask)
        t_embedding_matrix = tf.transpose(self.xv, perm=[1, 0, 2])
        left = tf.transpose(tf.gather(t_embedding_matrix, self.rows), perm=[1, 0, 2])
        right = tf.transpose(tf.gather(t_embedding_matrix, self.cols), perm=[1, 0, 2])
        level_2_matrix = tf.reduce_sum(tf.multiply(left, right), axis=-1)
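        # level_2_matrix: for each (row, col) field pair kept by comb_mask, the inner
        # product of the two field embeddings (second-order interaction scores).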
        with tf.variable_scope("edge_weight", reuse=tf.AUTO_REUSE):
            self.edge_weights = tf.get_variable('weights', shape=[len(self.cols)],
                                                initializer=tf.random_uniform_initializer(
                                                minval=weight_base - 0.001,
                                                maxval=weight_base + 0.001))
            normed_wts = tf.identity(self.edge_weights, name="normed_wts")
            tf.add_to_collection("structure", self.edge_weights)
            tf.add_to_collection("edge_weights", self.edge_weights)
            mask = tf.identity(normed_wts, name="unpruned_mask")
            mask = tf.expand_dims(mask, axis=0)
        level_2_matrix = tf.layers.batch_normalization(level_2_matrix, axis=-1, training=self.training,
                                                       reuse=tf.AUTO_REUSE, scale=False, center=False, name='prune_BN')
        level_2_matrix *= mask
        if third_prune:
            self.first, self.second, self.third = generate_pairs(range(self.xps.shape[1]), mask=comb_mask_third, order=3)
            t_embedding_matrix = tf.transpose(self.xps, perm=[1, 0, 2])
            first_embed = tf.transpose(tf.gather(t_embedding_matrix, self.first), perm=[1, 0, 2])
            second_embed = tf.transpose(tf.gather(t_embedding_matrix, self.second), perm=[1, 0, 2])
            third_embed = tf.transpose(tf.gather(t_embedding_matrix, self.third), perm=[1, 0, 2])
            level_3_matrix = tf.reduce_sum(tf.multiply(tf.multiply(first_embed, second_embed), third_embed), axis=-1)
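            # level_3_matrix: for each retained (first, second, third) field triple, the
            # element-wise product of the three embeddings summed over the embedding dim.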
            with tf.variable_scope("third_edge_weight", reuse=tf.AUTO_REUSE):
                self.third_edge_weights = tf.get_variable('third_weights', shape=[len(self.first)],
                                                          initializer=tf.random_uniform_initializer(
                                                              minval=weight_base_third - 0.001,
                                                              maxval=weight_base_third + 0.001))
                third_normed_wts = tf.identity(self.third_edge_weights, name="third_normed_wts")
                tf.add_to_collection("third_structure", self.third_edge_weights)
                tf.add_to_collection("third_edge_weights", self.third_edge_weights)
                third_mask = tf.identity(third_normed_wts, name="third_unpruned_mask")
                third_mask = tf.expand_dims(third_mask, axis=0)
            level_3_matrix = tf.layers.batch_normalization(level_3_matrix, axis=-1, training=self.training,
                                                           reuse=tf.AUTO_REUSE, scale=False, center=False,
                                                           name="level_3_matrix_BN")
            level_3_matrix *= third_mask

        fm_out = tf.reduce_sum(level_2_matrix, axis=-1)
        if third_prune:
            fm_out2 = tf.reduce_sum(level_3_matrix, axis=-1)
            self.logits, self.outputs = output([l, fm_out, fm_out2, b])
        else:
            self.logits, self.outputs = output([l, fm_out, b])
    def __init__(self, FLAGS=None):
        self.FLAGS = FLAGS
        self.config = config

        self.seq_len = config.max_sent_len
        self.embed_size = config.word_dim
        self.num_class = config.num_class
        self.lstm_size = config.lstm_size

        # Add Word Embedding
        self.we = tf.Variable(FLAGS.we, name='emb')

        # Add PlaceHolder

        # define basic four input layers - for warrant0, warrant1, reason, claim
        self.input_warrant0 = tf.placeholder(
            tf.int32, (None, self.seq_len),
            name='warrant0')  # [batch_size, sent_len]
        self.input_warrant1 = tf.placeholder(
            tf.int32, (None, self.seq_len),
            name='warrant1')  # [batch_size, sent_len]
        self.input_reason = tf.placeholder(
            tf.int32, (None, self.seq_len),
            name='reason')  # [batch_size, sent_len]
        self.input_claim = tf.placeholder(
            tf.int32, (None, self.seq_len),
            name='claim')  # [batch_size, sent_len]
        self.input_debate = tf.placeholder(
            tf.int32, (None, self.seq_len),
            name='debate')  # [batch_size, sent_len]

        self.warrant0_len = tf.placeholder(
            tf.int32, (None, ), name='warrant0_len')  # [batch_size,]
        self.warrant1_len = tf.placeholder(
            tf.int32, (None, ), name='warrant1_len')  # [batch_size,]
        self.reason_len = tf.placeholder(tf.int32, (None, ),
                                         name='reason_len')  # [batch_size,]
        self.claim_len = tf.placeholder(tf.int32, (None, ),
                                        name='claim_len')  # [batch_size,]
        self.debate_len = tf.placeholder(tf.int32, (None, ),
                                         name='debate_len')  # [batch_size,]

        self.target_label = tf.placeholder(
            tf.int32, (None, self.num_class),
            name='label')  # [batch_size, num_class]

        self.drop_keep_rate = tf.placeholder(tf.float32)
        self.learning_rate = tf.placeholder(tf.float32)

        # now define embedded layers of the input
        embedded_warrant0 = tf.nn.embedding_lookup(self.we,
                                                   self.input_warrant0)
        embedded_warrant1 = tf.nn.embedding_lookup(self.we,
                                                   self.input_warrant1)
        embedded_reason = tf.nn.embedding_lookup(self.we, self.input_reason)
        embedded_claim = tf.nn.embedding_lookup(self.we, self.input_claim)
        embedded_debate = tf.nn.embedding_lookup(self.we, self.input_debate)
        ''' BiLSTM layer '''
        def BiLSTM(input_x,
                   input_x_len,
                   hidden_size,
                   num_layers=1,
                   dropout_rate=None,
                   return_sequence=True):
            """
            TODO: return_sequence Bug
            """
            # cell = tf.contrib.rnn.GRUCell(hidden_size)
            cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size)
            cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size)

            if num_layers > 1:
                # each layer needs its own cell instance (reusing one cell object makes
                # TF try to share variables across layers)
                cell_fw = tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.BasicLSTMCell(hidden_size) for _ in range(num_layers)])
                cell_bw = tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.BasicLSTMCell(hidden_size) for _ in range(num_layers)])

            if dropout_rate:
                cell_fw = tf.contrib.rnn.DropoutWrapper(
                    cell_fw, output_keep_prob=(1 - dropout_rate))
                cell_bw = tf.contrib.rnn.DropoutWrapper(
                    cell_bw, output_keep_prob=(1 - dropout_rate))

            b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                input_x,
                sequence_length=input_x_len,
                dtype=tf.float32)
            if return_sequence:
                outputs = tf.concat(b_outputs, axis=2)
            else:
                # b_states: ((c_fw, h_fw), (c_bw, h_bw)); concatenate the final hidden states
                outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
            return outputs

        with tf.variable_scope("bi_lstm") as s:
            bilstm_warrant0 = BiLSTM(embedded_warrant0, self.warrant0_len,
                                     self.lstm_size)
            s.reuse_variables()
            bilstm_warrant1 = BiLSTM(embedded_warrant1, self.warrant1_len,
                                     self.lstm_size)
            bilstm_reason = BiLSTM(embedded_reason, self.reason_len,
                                   self.lstm_size)
            bilstm_claim = BiLSTM(embedded_claim, self.claim_len,
                                  self.lstm_size)
            bilstm_debate = BiLSTM(embedded_debate, self.debate_len,
                                   self.lstm_size)
        ''' MaxPooling Layer '''
        pooling_warrant0 = tf_utils.MaxPooling(bilstm_warrant0,
                                               self.warrant0_len)
        pooling_warrant1 = tf_utils.MaxPooling(bilstm_warrant1,
                                               self.warrant1_len)
        pooling_reason = tf_utils.MaxPooling(bilstm_reason, self.reason_len)
        pooling_claim = tf_utils.MaxPooling(bilstm_claim, self.claim_len)
        pooling_debate = tf_utils.MaxPooling(bilstm_debate, self.debate_len)

        attention_vector_for_W0 = tf.concat(
            [pooling_debate, pooling_reason, pooling_warrant0, pooling_claim],
            axis=-1)
        attention_vector_for_W1 = tf.concat(
            [pooling_debate, pooling_reason, pooling_warrant1, pooling_claim],
            axis=-1)
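        # Each attention vector summarizes the context (debate, reason, the corresponding
        # warrant, claim); the AttBiLSTM cells below are constructed with it so the
        # recurrent pass can attend to this summary.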

        def AttBiLSTM(attention_vector,
                      input_x,
                      input_x_len,
                      hidden_size,
                      return_sequence=True):
            cell_fw = AttBasicLSTMCell(attention_vector, num_units=hidden_size)
            cell_bw = AttBasicLSTMCell(attention_vector, num_units=hidden_size)

            b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                input_x,
                sequence_length=input_x_len,
                dtype=tf.float32)
            if return_sequence:
                outputs = tf.concat(b_outputs, axis=2)
            else:
                # states: [c, h]
                outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
            return outputs

        with tf.variable_scope("att_lstm") as s:
            attention_warrant0 = AttBiLSTM(attention_vector_for_W0,
                                           bilstm_warrant0,
                                           self.warrant0_len,
                                           self.lstm_size,
                                           return_sequence=False)
            s.reuse_variables()
            attention_warrant1 = AttBiLSTM(attention_vector_for_W1,
                                           bilstm_warrant1,
                                           self.warrant1_len,
                                           self.lstm_size,
                                           return_sequence=False)

        self.attention_warrant0 = attention_warrant0
        self.attention_warrant1 = attention_warrant1

        # concatenate them
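        # Matching features: both attended warrant vectors plus their difference and
        # element-wise product.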
        merge_warrant = tf.concat([
            attention_warrant0, attention_warrant1,
            attention_warrant0 - attention_warrant1,
            attention_warrant0 * attention_warrant1
        ], axis=-1)
        dropout_warrant = tf.nn.dropout(merge_warrant, self.drop_keep_rate)

        # and add one extra layer with ReLU
        with tf.variable_scope("linear") as s:
            dense1 = tf.nn.relu(
                tf_utils.linear(dropout_warrant,
                                int(self.lstm_size / 2),
                                bias=True,
                                scope='dense'))
            logits = tf_utils.linear(dense1,
                                     self.num_class,
                                     bias=True,
                                     scope='logit')

        # Obtain the Predict, Loss, Train_op
        predict_prob = tf.nn.softmax(logits, name='predict_prob')
        predict_label = tf.cast(tf.argmax(logits, axis=1), tf.int32)

        loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=self.target_label)
        loss = tf.reduce_mean(loss)

        # Build the loss
        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # optimizer = tf.train.AdagradOptimizer(self.learning_rate)

        if FLAGS.clipper:
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                              FLAGS.clipper)
            train_op = optimizer.apply_gradients(list(zip(grads, tvars)),
                                                 global_step=global_step)
        else:
            train_op = optimizer.minimize(loss, global_step=global_step)

        self.predict_prob = predict_prob
        self.predict_label = predict_label
        self.loss = loss
        self.train_op = train_op
        self.global_step = global_step