Example #1
    def call(self, x, y, mask, training=False):
        self.step += 1
        x_ = x

        x = dropout(x, keep_prob=self.keep_prob, training=training)
        y = dropout(y, keep_prob=self.keep_prob, training=training)

        if self.step == 0:
            if not self.identity:
                self.linear = layers.Dense(melt.get_shape(x, -1),
                                           activation=tf.nn.relu)
            else:
                self.linear = None

        # NOTICE shared linear!
        if self.linear is not None:
            x = self.linear(x)
            y = self.linear(y)

        scores = tf.matmul(x, tf.transpose(y, [0, 2, 1]))

        if mask is not None:
            JX = melt.get_shape(x, 1)
            mask = tf.tile(tf.expand_dims(mask, axis=1), [1, JX, 1])
            scores = softmax_mask(scores, mask)

        alpha = tf.nn.softmax(scores)
        self.alpha = alpha

        y = tf.matmul(alpha, y)

        if self.combine is None:
            return y
        else:
            return self.combine(x_, y, training=training)
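
These snippets lean on a few helpers that are not shown: dropout, softmax_mask and melt.get_shape. The sketch below is only an assumption about what they likely look like, following the usual R-Net-style masking convention and TF 1.x; the actual melt implementations may differ.

import tensorflow as tf

INF = 1e30

def softmax_mask(val, mask):
    # Push padded positions towards -inf so softmax gives them ~0 probability.
    return -INF * (1 - tf.cast(mask, tf.float32)) + val

def dropout(x, keep_prob=1.0, training=False, mode=None):
    # Apply dropout only at training time; identity otherwise (TF 1.x keep_prob style).
    if training and keep_prob < 1.0:
        return tf.nn.dropout(x, keep_prob=keep_prob)
    return x

def get_shape(x, dim):
    # Stand-in for melt.get_shape: static dimension if known, else the dynamic one.
    static = x.get_shape().as_list()[dim]
    return static if static is not None else tf.shape(x)[dim]
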
Example #2
    def call(self, x, training=False):
        x = x['comment']
        batch_size = melt.get_shape(x, 0)
        length = melt.length(x)
        #with tf.device('/cpu:0'):
        x = self.embedding(x)

        num_units = [
            melt.get_shape(x, -1) if layer == 0 else 2 * self.num_units
            for layer in range(self.num_layers)
        ]
        #print('----------------length', tf.reduce_max(length), inputs.comment.shape)
        mask_fws = [
            melt.dropout(tf.ones([batch_size, 1, num_units[layer]],
                                 dtype=tf.float32),
                         keep_prob=self.keep_prob,
                         training=training,
                         mode=None) for layer in range(self.num_layers)
        ]
        mask_bws = [
            melt.dropout(tf.ones([batch_size, 1, num_units[layer]],
                                 dtype=tf.float32),
                         keep_prob=self.keep_prob,
                         training=training,
                         mode=None) for layer in range(self.num_layers)
        ]
        #x = self.encode(x, length, mask_fws=mask_fws, mask_bws=mask_bws)
        x = self.encode(x)

        x = self.pooling(x, length)
        #x = self.pooling(x)
        x = self.logits(x)
        return x
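
The mask_fws / mask_bws built above are per-layer dropout masks of shape [batch_size, 1, num_units]; since their time dimension is 1, the same mask is broadcast to every time step (variational / recurrent dropout). Note that in this snippet the masked call is commented out and self.encode(x) does not consume them. A small numpy sketch of the broadcasting idea (not the melt API):

import numpy as np

batch_size, time_steps, num_units, keep_prob = 2, 5, 4, 0.5
# One inverted-dropout mask per sequence, broadcast over time_steps.
mask = (np.random.rand(batch_size, 1, num_units) < keep_prob) / keep_prob
x = np.random.randn(batch_size, time_steps, num_units)
x_dropped = x * mask  # every time step of a sequence drops the same units
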
Example #3
    def call(self, input, training=False):
        x1 = input['query']
        x2 = input['passage']
        length1 = melt.length(x1)
        length2 = melt.length(x2)
        #with tf.device('/cpu:0'):
        x1 = self.embedding(x1)
        x2 = self.embedding(x2)

        x = x1
        batch_size = melt.get_shape(x1, 0)

        num_units = [
            melt.get_shape(x, -1) if layer == 0 else 2 * self.num_units
            for layer in range(self.num_layers)
        ]
        #print('----------------length', tf.reduce_max(length), inputs.comment.shape)
        mask_fws = [
            melt.dropout(tf.ones([batch_size, 1, num_units[layer]],
                                 dtype=tf.float32),
                         keep_prob=self.keep_prob,
                         training=training,
                         mode=None) for layer in range(self.num_layers)
        ]
        mask_bws = [
            melt.dropout(tf.ones([batch_size, 1, num_units[layer]],
                                 dtype=tf.float32),
                         keep_prob=self.keep_prob,
                         training=training,
                         mode=None) for layer in range(self.num_layers)
        ]

        x = self.encode(x1,
                        length1,
                        x2,
                        length2,
                        mask_fws=mask_fws,
                        mask_bws=mask_bws)
        x = self.pooling(x, length1, length2)
        #x = self.pooling(x)

        if FLAGS.use_type:
            x = tf.concat([x, tf.expand_dims(tf.to_float(input['type']), 1)],
                          1)

        if not FLAGS.split_type:
            x = self.logits(x)
        else:
            x1 = self.logits(x)
            x2 = self.logits2(x)
            x = tf.cond(tf.cast(input['type'] == 0, tf.bool), lambda:
                        (x1 + x2) / 2., lambda: x2)

        return x
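
Because input['type'] is typically a batched tensor, the tf.cond above expects a scalar predicate; a per-example blend can instead be written with an element-wise mask, which is how the last snippet in this listing handles it. A minimal sketch of that variant (TF 1.x style; blend_logits_by_type is a name introduced here, not part of the original code):

import tensorflow as tf

def blend_logits_by_type(x1, x2, example_type):
    # type_mask is 1.0 where type == 0 and 0.0 elsewhere, shape [batch_size, 1].
    type_mask = tf.expand_dims(tf.cast(tf.equal(example_type, 0), tf.float32), 1)
    return ((x1 + x2) / 2.) * type_mask + x2 * (1. - type_mask)
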
Example #4
  def encode(self, seq, seq_len=None, output_method='all'):
    with tf.variable_scope(self.scope):
      num_filters = self.num_units
      seqs = [seq]
      batch_size = melt.get_batch_size(seq)
     
      kernel_sizes = [3, 5, 7, 9, 11, 13]
      #kernel_sizes = [3] * 7
      assert self.num_layers <= len(kernel_sizes)

      for layer in range(self.num_layers):
        input_size_ = melt.get_shape(seq, -1) if layer == 0 else num_filters
        seq = melt.dropout(seq, self.keep_prob, self.is_train)
        seq = tf.layers.conv1d(seqs[-1], num_filters, kernel_size=kernel_sizes[layer], padding='same', activation=tf.nn.relu)
        # mask = melt.dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
        #                   keep_prob=self.keep_prob, is_train=self.is_train, mode=None)
        #seq = tf.layers.conv1d(seqs[-1] * mask, num_filters, kernel_size=3, padding='same', activation=tf.nn.relu)
        #seq = tf.layers.conv1d(seqs[-1] * mask, num_filters, kernel_size=kernel_sizes[layer], padding='same', activation=tf.nn.relu)
        
        # if self.is_train and self.keep_prob < 1:
        #   seq = tf.nn.dropout(seq, self.keep_prob)
        #seq = melt.layers.batch_norm(seq, self.is_train, name='layer_%d' % layer)
        seqs.append(seq)
      
      outputs = tf.concat(seqs[1:], 2)
      # do not apply any dropout inside the convnet; dropout is applied outside
      # if self.is_train and self.keep_prob < 1:
      #   outputs = tf.nn.dropout(outputs, self.keep_prob)

      # compatible with rnn encoders that also return state
      return melt.rnn.encode_outputs(outputs, seq_len, output_method)
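
The encoder above stacks conv1d layers with growing kernel sizes and concatenates every layer's output along the feature axis. A stripped-down sketch of that structure with plain tf.layers (TF 1.x style; dropout and the melt helpers are omitted):

import tensorflow as tf

def multi_scale_conv1d(seq, num_filters, num_layers=3, kernel_sizes=(3, 5, 7)):
    # Each layer reads the previous layer's output; all layer outputs are concatenated.
    outputs = []
    prev = seq
    for layer in range(num_layers):
        prev = tf.layers.conv1d(prev, num_filters,
                                kernel_size=kernel_sizes[layer],
                                padding='same', activation=tf.nn.relu)
        outputs.append(prev)
    return tf.concat(outputs, axis=2)
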
Example #5
    def call(self, inputs, memory, inputs_mask, memory_mask, training=False):
        combiner = self.combiner
        # DotAttention has already been converted to dot_attention
        #with tf.variable_scope(self.scope):
        d_inputs = dropout(inputs, keep_prob=self.keep_prob, training=training)
        d_memory = dropout(memory, keep_prob=self.keep_prob, training=training)
        JX = tf.shape(inputs)[1]

        with tf.variable_scope("attention"):
            inputs_ = self.inputs_dense(d_inputs)
            memory_ = self.memory_dense(d_memory)

            # shared matrix for c2q and q2c attention
            scores = tf.matmul(inputs_, tf.transpose(
                memory_, [0, 2, 1])) / (self.hidden**0.5)

            # c2q attention
            mask = memory_mask
            if mask is not None:
                mask = tf.tile(tf.expand_dims(mask, axis=1), [1, JX, 1])
                scores = softmax_mask(scores, mask)

            alpha = tf.nn.softmax(scores)
            self.alpha = alpha
            c2q = tf.matmul(alpha, memory)

            # TODO: check this against the allennlp implementation, since results are not good here...
            # q2c attention
            # (batch_size, clen)
            logits = tf.reduce_max(scores, -1)
            mask = inputs_mask
            if mask is not None:
                logits = softmax_mask(logits, mask)
            alpha2 = tf.nn.softmax(logits)
            # inputs (batch_size, clen, dim), probs (batch_size, clen)
            q2c = tf.matmul(tf.expand_dims(alpha2, 1), inputs)
            # (batch_size, clen, dim)
            q2c = tf.tile(q2c, [1, JX, 1])

            outputs = tf.concat([c2q, q2c], -1)

        if self.combine is not None:
            return self.combine(inputs, outputs, training=training)
        else:
            return outputs
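
For reference, the arithmetic and shapes of the c2q / q2c attention above on toy sizes, as a numpy sketch of the same computation (the dense projections and masks are omitted):

import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

batch, clen, qlen, dim = 2, 7, 5, 4
inputs = np.random.randn(batch, clen, dim)                  # context
memory = np.random.randn(batch, qlen, dim)                  # query
scores = inputs @ memory.transpose(0, 2, 1) / dim ** 0.5    # (batch, clen, qlen)
c2q = softmax(scores, axis=-1) @ memory                     # (batch, clen, dim)
alpha2 = softmax(scores.max(axis=-1), axis=-1)              # (batch, clen)
q2c = np.tile(alpha2[:, None, :] @ inputs, (1, clen, 1))    # (batch, clen, dim)
outputs = np.concatenate([c2q, q2c], axis=-1)               # (batch, clen, 2 * dim)
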
Example #6
    def call(self, inputs, memory, mask, self_match=False, training=False):
        combiner = self.combiner
        # DotAttention has already been converted to dot_attention
        #with tf.variable_scope(self.scope):
        # TODO: there may be a problem for self-match dot attention, since the same inputs get different dropout... Try self_match == True and verify.
        # NOTICE: self_match == False follows the HKUST rnet implementation
        d_inputs = dropout(inputs, keep_prob=self.keep_prob, training=training)
        if not self_match:
            d_memory = dropout(memory,
                               keep_prob=self.keep_prob,
                               training=training)
        else:
            d_memory = d_inputs
        JX = tf.shape(inputs)[1]

        # TODO remove scope ?
        with tf.variable_scope("attention"):
            inputs_ = self.inputs_dense(d_inputs)
            if not self_match:
                memory_ = self.memory_dense(d_memory)
            else:
                memory_ = inputs_

            scores = tf.matmul(inputs_, tf.transpose(
                memory_, [0, 2, 1])) / (self.hidden**0.5)

            if mask is not None:
                mask = tf.tile(tf.expand_dims(mask, axis=1), [1, JX, 1])
                #print(inputs_.shape, memory_.shape, weights.shape, mask.shape)
                # (32, 318, 100) (32, 26, 100) (32, 318, 26) (32, 318, 26)
                scores = softmax_mask(scores, mask)

            alpha = tf.nn.softmax(scores)
            self.alpha = alpha
            # logits (32, 326, 326)  memory(32, 326, 200)
            outputs = tf.matmul(alpha, memory)

        if self.combine is not None:
            return self.combine(inputs, outputs, training=training)
        else:
            return outputs
Example #7
 def call(self, x, y, training=False):
     self.step += 1
     #with tf.variable_scope(self.scope):
     res = tf.concat([x, y], axis=2)
     dim = melt.get_shape(res, -1)
     d_res = dropout(res, keep_prob=self.keep_prob, training=training)
     if self.step == 0:
         self.dense = layers.Dense(dim,
                                   use_bias=False,
                                   activation=tf.nn.sigmoid)
     gate = self.dense(d_res)
     return res * gate
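
The gate above computes output = concat(x, y) * sigmoid(Dense(dropout(concat(x, y)))). A numpy sketch of that arithmetic (the random matrix stands in for the bias-free Dense layer; dropout is omitted):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

batch, steps, dim = 2, 3, 4
x = np.random.randn(batch, steps, dim)
y = np.random.randn(batch, steps, dim)
res = np.concatenate([x, y], axis=2)     # (batch, steps, 2 * dim)
W = np.random.randn(2 * dim, 2 * dim)    # stand-in for the sigmoid Dense layer
gate = sigmoid(res @ W)                  # (batch, steps, 2 * dim)
out = res * gate                         # element-wise gating of the concatenation
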
Example #8
  def call(self, input, training=False):
    q = input['query']
    c = input['passage']
    q_len = melt.length(q)
    c_len = melt.length(c)
    q_mask = tf.cast(q, tf.bool)
    q_emb = self.embedding(q)
    c_emb = self.embedding(c)
    
    x = c_emb
    batch_size = melt.get_shape(x, 0)

    num_units = [melt.get_shape(x, -1) if layer == 0 else 2 * self.num_units for layer in range(self.num_layers)]
    mask_fws = [melt.dropout(tf.ones([batch_size, 1, num_units[layer]], dtype=tf.float32), keep_prob=self.keep_prob, training=training, mode=None) for layer in range(self.num_layers)]
    mask_bws = [melt.dropout(tf.ones([batch_size, 1, num_units[layer]], dtype=tf.float32), keep_prob=self.keep_prob, training=training, mode=None) for layer in range(self.num_layers)]
    
    c = self.encode(c_emb, c_len, mask_fws=mask_fws, mask_bws=mask_bws)
    q = self.encode(q_emb, q_len, mask_fws=mask_fws, mask_bws=mask_bws)

    qc_att = self.att_dot_attention(c, q, mask=q_mask, training=training)

    num_units = [melt.get_shape(qc_att, -1) if layer == 0 else 2 * self.num_units for layer in range(self.num_layers)]
    mask_fws = [melt.dropout(tf.ones([batch_size, 1, num_units[layer]], dtype=tf.float32), keep_prob=self.keep_prob, training=training, mode=None) for layer in range(1)]
    mask_bws = [melt.dropout(tf.ones([batch_size, 1, num_units[layer]], dtype=tf.float32), keep_prob=self.keep_prob, training=training, mode=None) for layer in range(1)]
    x = self.att_encode(qc_att, c_len, mask_fws=mask_fws, mask_bws=mask_bws)

    x = self.pooling(x, c_len)

    if FLAGS.use_type:
      x = tf.concat([x, tf.expand_dims(tf.to_float(input['type']), 1)], 1)

    if not FLAGS.split_type:
      x = self.logits(x)
    else:
      x1 = self.logits(x)
      x2 = self.logits2(x)
      x = tf.cond(tf.cast(input['type'] == 0, tf.bool), lambda: (x1 + x2) / 2., lambda: x2)
    
    return x
Example #9
  def encode(self, seq, seq_len=None, output_method='all'):
    with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
      if self.use_position_encoding:
        hidden_size = melt.get_shape(seq, -1)
        # Scale embedding by the sqrt of the hidden size
        seq *= hidden_size ** 0.5

        # Create a binary mask of size [batch_size, length]
        # where 1 = not padding, 0 = padding
        padding = tf.to_float(tf.sequence_mask(seq_len))

        # Set all padding embedding values to 0
        seq *= tf.expand_dims(padding, -1)

        pos_encoding = model_utils.get_position_encoding(
            tf.shape(seq)[1], tf.shape(seq)[-1])
        seq = seq + pos_encoding

      num_filters = self.num_filters
      seqs = [seq]
      #batch_size = melt.get_batch_size(seq)
     
      #kernel_sizes = [3, 5, 7, 9, 11, 13]
      kernel_sizes = [3] * 7
      assert self.num_layers <= len(kernel_sizes)

      for layer in range(self.num_layers):
        #input_size_ = melt.get_shape(seq, -1) if layer == 0 else num_filters
        seq = melt.dropout(seq, self.keep_prob, self.is_train)
        seq = tf.layers.conv1d(seqs[-1], num_filters, kernel_size=kernel_sizes[layer], padding='same', activation=tf.nn.relu)
        # mask = melt.dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
        #                   keep_prob=self.keep_prob, is_train=self.is_train, mode=None)
        #seq = tf.layers.conv1d(seqs[-1] * mask, num_filters, kernel_size=3, padding='same', activation=tf.nn.relu)
        #seq = tf.layers.conv1d(seqs[-1] * mask, num_filters, kernel_size=kernel_sizes[layer], padding='same', activation=tf.nn.relu)
        
        # if self.is_train and self.keep_prob < 1:
        #   seq = tf.nn.dropout(seq, self.keep_prob)
        #seq = melt.layers.batch_norm(seq, self.is_train, name='layer_%d' % layer)
        seqs.append(seq)
      
      outputs = tf.concat(seqs[1:], 2)
      # do not apply any dropout inside the convnet; dropout is applied outside
      # if self.is_train and self.keep_prob < 1:
      #   outputs = tf.nn.dropout(outputs, self.keep_prob)

      # compatible with rnn encoders that also return state
      return melt.rnn.encode_outputs(outputs, seq_len, output_method)
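
model_utils.get_position_encoding above is the sinusoidal position signal used by the Transformer. A numpy sketch of that formula, as an assumption (sin and cos halves concatenated along the feature axis; the exact layout in model_utils may differ):

import numpy as np

def position_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
    # sin/cos of the position over a geometric series of timescales.
    position = np.arange(length, dtype=np.float32)
    num_timescales = hidden_size // 2
    log_increment = np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * np.exp(np.arange(num_timescales) * -log_increment)
    scaled_time = position[:, None] * inv_timescales[None, :]
    return np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
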
Example #10
 def call(self, x, fusions, training=False):
     self.step += 1
     assert len(fusions) > 0
     vectors = tf.concat(
         [x] + fusions, axis=-1
     )  # size = [batch_size, ..., input_dim * (len(fusion_vectors) + 1)]
     dim = melt.get_shape(x, -1)
     dv = dropout(vectors, keep_prob=self.keep_prob, training=training)
     if self.step == 0:
         self.composition_dense = layers.Dense(dim,
                                               use_bias=True,
                                               activation=tf.nn.tanh,
                                               name='compostion_dense')
         self.gate_dense = layers.Dense(dim,
                                        use_bias=True,
                                        activation=tf.nn.sigmoid,
                                        name='gate_dense')
     r = self.composition_dense(dv)
     g = self.gate_dense(dv)
     return g * r + (1 - g) * x
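
This is a semantic fusion unit: r = tanh(Dense([x; fusions])), g = sigmoid(Dense([x; fusions])), and the output g * r + (1 - g) * x interpolates between the fused composition and the original input. A numpy sketch of that combination (random matrices stand in for the two Dense layers; dropout and biases are omitted):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

batch, steps, dim, n_fusion = 2, 3, 4, 2
x = np.random.randn(batch, steps, dim)
fusions = [np.random.randn(batch, steps, dim) for _ in range(n_fusion)]
v = np.concatenate([x] + fusions, axis=-1)    # (batch, steps, dim * (n_fusion + 1))
Wr = np.random.randn(v.shape[-1], dim)        # stand-in for composition_dense
Wg = np.random.randn(v.shape[-1], dim)        # stand-in for gate_dense
r = np.tanh(v @ Wr)
g = sigmoid(v @ Wg)
out = g * r + (1 - g) * x                     # gated blend of composition and input
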
Example #11
    def call(self, x, mask, training=False):
        self.step += 1
        x_ = x
        x = dropout(x, keep_prob=self.keep_prob, training=training)

        if self.step == 0:
            if not self.identity:
                self.linear = layers.Dense(melt.get_shape(x, -1),
                                           activation=tf.nn.relu)
            else:
                self.linear = None

        # NOTICE shared linear!
        if self.linear is not None:
            x = self.linear(x)

        scores = tf.matmul(x, tf.transpose(x, [0, 2, 1]))

        #  x = tf.constant([[[1,2,3], [4,5,6],[7,8,9]],[[1,2,3],[4,5,6],[7,8,9]]], dtype=tf.float32) # shape=(2, 3, 3)
        #  z = tf.matrix_set_diag(x, tf.zeros([2, 3]))
        if not self.diag:
            # TODO better dim
            dim0 = melt.get_shape(scores, 0)
            dim1 = melt.get_shape(scores, 1)
            scores = tf.matrix_set_diag(scores, tf.zeros([dim0, dim1]))

        if mask is not None:
            JX = melt.get_shape(x, 1)
            mask = tf.tile(tf.expand_dims(mask, axis=1), [1, JX, 1])
            scores = softmax_mask(scores, mask)

        alpha = tf.nn.softmax(scores)
        self.alpha = alpha

        x = tf.matmul(alpha, x)

        if self.combine is None:
            return x
        else:
            return self.combine(x_, x, training=training)
Example #12
  def call(self, seq, seq_len=None, masks=None, 
           output_method=OutputMethod.all, 
           training=False):
    if self.use_position_encoding:
      hidden_size = melt.get_shape(seq, -1)
      # Scale embedding by the sqrt of the hidden size
      seq *= hidden_size ** 0.5

      # Create a binary mask of size [batch_size, length]
      # where 1 = not padding, 0 = padding
      padding = tf.to_float(tf.sequence_mask(seq_len))

      # Set all padding embedding values to 0
      seq *= tf.expand_dims(padding, -1)

      pos_encoding = model_utils.get_position_encoding(
          tf.shape(seq)[1], tf.shape(seq)[-1])
      seq = seq + pos_encoding

    num_filters = self.num_filters
    seqs = [seq]
    #batch_size = melt.get_batch_size(seq)

    for layer in range(self.num_layers):
      if masks is None:
        seq_ = melt.dropout(seq, self.keep_prob, training)
      else:
        seq_ = seq * masks[layer]
      seq = self.conv1ds[layer](seq_)
      seqs.append(seq)
    
    outputs = tf.concat(seqs[1:], 2)
    # do not apply any dropout inside the convnet; dropout is applied outside
    # if self.is_train and self.keep_prob < 1:
    #   outputs = tf.nn.dropout(outputs, self.keep_prob)

    # compatible with rnn encoders that also return state
    return melt.rnn.encode_outputs(outputs, seq_len, output_method)
Example #13
    def call(self,
             x,
             sequence_length=None,
             mask_fws=None,
             mask_bws=None,
             concat_layers=None,
             output_method=None,
             training=False):

        concat_layers = concat_layers or self.concat_layers
        output_method = output_method or self.output_method

        if self.residual_connect:
            x = self.residual_linear(x)

        outputs = [x]

        #states = []
        keep_prob = self.keep_prob
        num_units = self.num_units
        batch_size = melt.get_batch_size(x)

        if sequence_length is None:
            len_ = melt.get_shape(x, 1)
            sequence_length = tf.ones([
                batch_size,
            ], dtype=tf.int64) * len_

        for layer in range(self.num_layers):
            input_size_ = melt.get_shape(x,
                                         -1) if layer == 0 else 2 * num_units

            gru_fw, gru_bw = self.gru_fws[layer], self.gru_bws[layer]

            if self.train_init_state:
                #init_fw = tf.tile(self.init_fw[layer], [batch_size, 1])
                #init_fw = tf.tile(self.init_fw_layer(layer), [batch_size, 1])
                init_fw = self.init_fw_layer(layer, batch_size)
                if self.cell == 'lstm':
                    init_fw = (init_fw, self.init_fw2_layer(layer, batch_size))
            else:
                init_fw = None

            if self.recurrent_dropout:
                if mask_fws is not None:
                    mask_fw = mask_fws[layer]
                else:
                    if not self.share_dropout:
                        mask_fw = dropout(tf.ones([batch_size, 1, input_size_],
                                                  dtype=tf.float32),
                                          keep_prob=keep_prob,
                                          training=training,
                                          mode=None)
                    else:
                        if self.dropout_mask_fw[layer] is None or (
                                tf.executing_eagerly() and batch_size !=
                                self.dropout_mask_fw[layer].shape[0]):
                            mask_fw = dropout(
                                tf.ones([batch_size, 1, input_size_],
                                        dtype=tf.float32),
                                keep_prob=keep_prob,
                                training=training,
                                mode=None)
                            self.dropout_mask_fw[layer] = mask_fw
                        else:
                            mask_fw = self.dropout_mask_fw[layer]

                inputs_fw = outputs[-1] * mask_fw
            else:
                inputs_fw = dropout(outputs[-1],
                                    keep_prob=keep_prob,
                                    training=training,
                                    mode=None)

            # https://stackoverflow.com/questions/48233400/lstm-initial-state-from-dense-layer
            # gru and lstm differ: lstm needs a tuple of (h, c) states as the input state
            if self.cell == 'gru':
                out_fw, state_fw = gru_fw(inputs_fw, init_fw)
            else:
                out_fw, state_fw1, state_fw2 = gru_fw(inputs_fw, init_fw)
                state_fw = (state_fw1, state_fw2)

            if self.train_init_state:
                #init_bw = tf.tile(self.init_bw[layer], [batch_size, 1])
                #init_bw = tf.tile(self.init_bw_layer(layer), [batch_size, 1])
                init_bw = self.init_bw_layer(layer, batch_size)
                if self.cell == 'lstm':
                    init_bw = (init_bw, self.init_bw2_layer(layer, batch_size))
            else:
                init_bw = None

            if mask_bws is not None:
                mask_bw = mask_bws[layer]
            else:
                if not self.share_dropout:
                    mask_bw = dropout(tf.ones([batch_size, 1, input_size_],
                                              dtype=tf.float32),
                                      keep_prob=keep_prob,
                                      training=training,
                                      mode=None)
                else:
                    if self.dropout_mask_bw[layer] is None or (
                            tf.executing_eagerly() and batch_size !=
                            self.dropout_mask_bw[layer].shape[0]):
                        mask_bw = dropout(tf.ones([batch_size, 1, input_size_],
                                                  dtype=tf.float32),
                                          keep_prob=keep_prob,
                                          training=training,
                                          mode=None)
                        self.dropout_mask_bw[layer] = mask_bw
                    else:
                        mask_bw = self.dropout_mask_bw[layer]

            if self.recurrent_dropout:
                inputs_bw = outputs[-1] * mask_bw
            else:
                if self.bw_dropout:
                    inputs_bw = dropout(outputs[-1],
                                        keep_prob=keep_prob,
                                        training=training,
                                        mode=None)
                else:
                    inputs_bw = inputs_fw

            inputs_bw = tf.reverse_sequence(inputs_bw,
                                            seq_lengths=sequence_length,
                                            seq_axis=1,
                                            batch_axis=0)

            if self.cell == 'gru':
                out_bw, state_bw = gru_bw(inputs_bw, init_bw)
            else:
                out_bw, state_bw1, state_bw2 = gru_bw(inputs_bw, init_bw)
                state_bw = (state_bw1, state_bw2)

            out_bw = tf.reverse_sequence(out_bw,
                                         seq_lengths=sequence_length,
                                         seq_axis=1,
                                         batch_axis=0)

            outputs.append(tf.concat([out_fw, out_bw], axis=2))
            if self.residual_connect:
                outputs[-1] = self.batch_norm(outputs[-2] + outputs[-1])

        if concat_layers:
            res = tf.concat(outputs[1:], axis=2)
        else:
            res = outputs[-1]

        res = encode_outputs(res,
                             output_method=output_method,
                             sequence_length=sequence_length)

        self.state = (state_fw, state_bw)
        if not self.return_state:
            return res
        else:
            return res, self.state
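
The backward direction above is computed by reversing each variable-length sequence, running a forward RNN over it, and reversing the outputs back so that padding stays at the end. A minimal sketch of that reverse-run-reverse pattern (cell_fn is a stand-in for the gru_bw call above, not part of the original code):

import tensorflow as tf

def run_backward(cell_fn, inputs, sequence_length):
    # inputs: [batch, time, dim]; cell_fn maps such a tensor to outputs of the same
    # leading shape (e.g. a forward RNN layer).
    rev = tf.reverse_sequence(inputs, seq_lengths=sequence_length,
                              seq_axis=1, batch_axis=0)
    out = cell_fn(rev)
    return tf.reverse_sequence(out, seq_lengths=sequence_length,
                               seq_axis=1, batch_axis=0)
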
Example #14
 def call(self, x, training=False):
     x_proj = dropout(self.linear1(x),
                      keep_prob=self.keep_prob,
                      training=training)
     x_proj = self.linear2(x_proj)
     return x_proj
Example #15
    def call(self, input, training=False):
        q = input['query']
        c = input['passage']

        # reverse worse
        if FLAGS.cq_reverse:
            q, c = c, q

        #print(input['type'])
        # print('q', q)
        # print('c', c)

        q_len = melt.length(q)
        c_len = melt.length(c)
        q_mask = tf.cast(q, tf.bool)
        c_mask = tf.cast(c, tf.bool)
        q_emb = self.embedding(q)
        c_emb = self.embedding(c)

        x = c_emb
        batch_size = melt.get_shape(x, 0)

        if FLAGS.share_dropout:
            num_units = [
                melt.get_shape(x, -1) if layer == 0 else 2 * self.num_units
                for layer in range(self.num_layers)
            ]
            mask_fws = [
                melt.dropout(tf.ones([batch_size, 1, num_units[layer]],
                                     dtype=tf.float32),
                             keep_prob=self.keep_prob,
                             training=training,
                             mode=None) for layer in range(self.num_layers)
            ]
            mask_bws = [
                melt.dropout(tf.ones([batch_size, 1, num_units[layer]],
                                     dtype=tf.float32),
                             keep_prob=self.keep_prob,
                             training=training,
                             mode=None) for layer in range(self.num_layers)
            ]

            # NOTICE: query and passage share the same dropout mask, so the same word still has the same embedding vector after dropout in both query and passage
            c = self.encode(c_emb,
                            c_len,
                            mask_fws=mask_fws,
                            mask_bws=mask_bws,
                            training=training)
            q = self.encode(q_emb,
                            q_len,
                            mask_fws=mask_fws,
                            mask_bws=mask_bws,
                            training=training)
        else:
            c = self.encode(c_emb, c_len, training=training)
            q = self.encode(q_emb, q_len, training=training)

        # using qc attention helps a lot; bidaf attention is currently worse..
        for i in range(FLAGS.hop):
            if not FLAGS.use_bidaf_att:
                x = self.att_dot_attentions[i](c,
                                               q,
                                               mask=q_mask,
                                               training=training)
            else:
                x = self.att_dot_attentions[i](c,
                                               q,
                                               c_mask,
                                               q_mask,
                                               training=training)
            if FLAGS.use_att_encode:
                x = self.att_encodes[i](x, c_len, training=training)
            x = self.match_dot_attentions[i](x,
                                             x,
                                             mask=c_mask,
                                             training=training)
            #x = self.match_dot_attentions[i](x, mask=c_mask, training=training)
            x = self.match_encodes[i](x, c_len, training=training)

        x = self.pooling(x, c_len, calc_word_scores=self.debug)

        if FLAGS.use_type:
            x = tf.concat([x, tf.expand_dims(tf.to_float(input['type']), 1)],
                          1)

        # might help ensembling
        if FLAGS.use_answer_emb:
            x1 = x

            neg = input['candidate_neg']
            pos = input['candidate_pos']
            na = input['candidate_na']
            neg_len = melt.length(neg)
            pos_len = melt.length(pos)
            na_len = melt.length(na)
            neg_emb = self.embedding(neg)
            pos_emb = self.embedding(pos)
            na_emb = self.embedding(na)

            if FLAGS.share_dropout:
                neg = self.encode(neg_emb,
                                  neg_len,
                                  mask_fws=mask_fws,
                                  mask_bws=mask_bws,
                                  training=training)
                pos = self.encode(pos_emb,
                                  pos_len,
                                  mask_fws=mask_fws,
                                  mask_bws=mask_bws,
                                  training=training)
                na = self.encode(na_emb,
                                 na_len,
                                 mask_fws=mask_fws,
                                 mask_bws=mask_bws,
                                 training=training)
            else:
                neg = self.encode(neg_emb, neg_len, training=training)
                pos = self.encode(pos_emb, pos_len, training=training)
                na = self.encode(na_emb, na_len, training=training)

            neg = self.pooling(neg, neg_len)
            pos = self.pooling(pos, pos_len)
            na = self.pooling(na, na_len)

            answer = tf.stack([neg, pos, na], 1)

            # [batch_size, emb_dim]
            x = self.context_dense(x)
            # [batch_size, 3, emb_dim]
            answer = self.answer_dense(answer)
            x = tf.matmul(answer, tf.transpose(tf.expand_dims(x, 1),
                                               [0, 2, 1]))
            x = tf.reshape(x, [batch_size, NUM_CLASSES])

            x = tf.concat([x1, x], -1)

            #return x

        # does not help
        if FLAGS.combine_query:
            q = self.pooling(q, q_len)
            x = tf.concat([x, q], -1)

        if not FLAGS.use_label_emb:
            # splitting logits by type is useful, especially for type1, and improves a lot when finetuning on type1 only
            if not FLAGS.split_type:
                x = self.logits(x)
            else:
                x1 = self.logits(x)
                x2 = self.logits2(x)
                mask = tf.expand_dims(tf.to_float(tf.equal(input['type'], 0)),
                                      1)
                x = x1 * mask + x2 * (1 - mask)
        else:
            # using label emb does not seem to help?
            x = self.label_dense(x)
            # TODO..
            x = melt.dot(x, self.label_embedding(None))

        return x
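
The use_answer_emb branch above scores the pooled context against the three pooled candidate encodings (neg / pos / na) with a dot product in a shared space. A numpy sketch of just that scoring step (random matrices stand in for context_dense and answer_dense):

import numpy as np

batch, dim, emb_dim, num_classes = 2, 8, 6, 3
x = np.random.randn(batch, dim)                       # pooled context
answer = np.random.randn(batch, num_classes, dim)     # stacked neg / pos / na encodings
Wc = np.random.randn(dim, emb_dim)                    # stand-in for context_dense
Wa = np.random.randn(dim, emb_dim)                    # stand-in for answer_dense
x_proj = x @ Wc                                       # (batch, emb_dim)
a_proj = answer @ Wa                                  # (batch, num_classes, emb_dim)
scores = (a_proj @ x_proj[:, :, None]).reshape(batch, num_classes)
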