Example #1
 def test_attention(self):
     inputs = tf.placeholder(dtype=tf.float32, shape=(2, 2, 3))
     output = attention(inputs)
     init_op = tf.global_variables_initializer()
     with tf.Session() as sess:
         sess.run(init_op)
         output_val = sess.run(output, feed_dict={
             inputs: np.asarray([[[1, 2, 3], [4, 5, 6]],
                                 [[7, 8, 9], [10, 11, 12]]], dtype='float32')})
         self.assertTrue((output_val == np.asarray([[2.5, 3.5, 4.5], [8.5, 9.5, 10.5]])).all(), 'output')
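The asserted rows are exactly the per-timestep means of the two inputs, so the test only pins `attention` down to an average while its weights are untrained. A minimal TF 1.x sketch consistent with that behaviour (an assumption, not the helper under test): score each timestep with a zero-initialized dense layer, so the softmax weights start uniform and the pooled output equals the mean.

import tensorflow as tf


def attention(inputs):
    # Sketch only: additive attention pooling over the time axis.
    # inputs: (batch, time, channels) -> output: (batch, channels).
    scores = tf.layers.dense(inputs, 1, use_bias=False,
                             kernel_initializer=tf.zeros_initializer())  # (batch, time, 1)
    alphas = tf.nn.softmax(tf.squeeze(scores, axis=2))                   # (batch, time)
    return tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), axis=1)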
Example #2
def encoder(inputs,
            params,
            is_training=True,
            ):
    # inputs: batch_size, time steps, channel
    filters = list(params['filters'])[0]
    blocks = params['blocks']
    kernel_size = params['kernel_size']
    is_training = is_training
    strides = params['strides']
    embedding_size = params['embedding_size']
    memory_cells = params['memory_cells']

    output = inputs
    with tf.variable_scope("encoder"):
        for l in range(blocks):
            with tf.variable_scope('block_{}'.format(l + 1)):
                output = tf.layers.conv1d(output,
                                          filters=filters,
                                          kernel_size=kernel_size,
                                          padding='same')
                output = tf.layers.batch_normalization(output,
                                                       training=is_training)
                output = tf.nn.relu(output)
                output = tf.layers.max_pooling1d(output,
                                                 pool_size=kernel_size,
                                                 strides=strides,
                                                 padding='same')

        # reduce the time axis to a fixed-length vector (batch_size, num_channels) using the attention mechanism
        output = attention(output)
        with tf.variable_scope('output_transformer'):
            output = tf.layers.dense(output, embedding_size)

        # apply memory
        if memory_cells > 0:
            output = read_memory(output)

        # apply l2 norm
        output = tf.nn.l2_normalize(output, 1, name="l2_embedding")
    return output
Example #3
def encoder(
    inputs,
    params,
    is_training=True,
):
    # inputs: batch_size, time steps, channel
    filters_list = list(params['filters'])
    blocks = params['blocks']
    kernel_size = params['kernel_size']
    is_training = is_training
    strides = params['strides']
    embedding_size = params['embedding_size']
    memory_cells = params['memory_cells']

    output = inputs

    with tf.variable_scope("encoder"):
        for stage, filters in enumerate(filters_list):
            output = conv_and_res_block(output,
                                        kernel_size=kernel_size,
                                        filters=filters,
                                        strides=strides,
                                        stage=stage + 1,
                                        blocks=blocks,
                                        is_training=is_training)

        # reduce the time axis to a fixed-length vector (batch_size, num_channels) using the attention mechanism
        output = attention(output)

        with tf.variable_scope('output_transformer'):
            output = tf.layers.dense(output, embedding_size)

        # apply memory
        if memory_cells > 0:
            output = read_memory(output)

        # apply l2 norm
        output = tf.nn.l2_normalize(output, 1, name="l2_embedding")
    return output
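For reference, a hypothetical `params` dictionary and call covering exactly the keys both encoder variants above read; the key names come from the code, the values are illustrative assumptions.

import tensorflow as tf

params = {
    'filters': [64, 128],    # output channels per stage (the first encoder keeps only the first entry)
    'blocks': 3,             # conv / residual blocks per stage
    'kernel_size': 3,
    'strides': 2,            # pooling stride along the time axis
    'embedding_size': 512,
    'memory_cells': 0,       # 0 skips the read_memory step
}
inputs = tf.placeholder(tf.float32, shape=(None, None, 40))  # (batch, time, channels)
embedding = encoder(inputs, params, is_training=True)        # (batch, embedding_size), L2-normalized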
Example #4
  def self_attention(self, num_heads=1, residual='concat', queries_eq_keys=False):
    word_embs = glove(self.words, self.params['words'], self.params['glove'])
    char_embs = get_char_representations(
      self.chars, self.nchars, 
      self.params['chars'], mode='lstm',
      training=self.training
    )
    html_embs = get_soft_html_representations(
      self.html, self.params['html_tags'],
      self.css_chars, self.css_lengths,
      self.params['chars'], training=self.training
    )

    embs = tf.concat([word_embs, char_embs, html_embs], axis=-1)
    embs = self.dropout(embs)
    output = self.lstm(embs, self.params['lstm_size'])
    output = self.dropout(output)

    return attention(
      output, output, num_heads,
      residual=residual, queries_eq_keys=queries_eq_keys,
      training=self.training
    )
Example #5
    def forward(self, query, key, value, mask=None):
        """
        query, key, value - shape (batch_size, sentence_len_enc, d_model) - for encoder values
                          - shape (batch_size, sentence_len_dec, d_model) - for decoder values

        mask - shape (batch_size,            1, sentence_len) for encoder mask
             - shape (batch_size, sentence_len, sentence_len) for decoder mask
        """
        "Implements Figure 2"
        if mask is not None:
            mask = mask.unsqueeze(1)
        batch_size = query.size(0)
        "tensor into shape (batch_size, h, sentence_len, d_k)"
        query, key, value = \
            [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        # 2) Apply attention on all the projected vectors in batch
        "x has shape (batch_size, h, sent_len, d_k)"
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        x = x.transpose(1, 2).contiguous() \
            .view(batch_size, -1, self.h * self.d_k)
        "return shape: (batch_size, sent_len, d_model)"
        return self.linears[-1](x)
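The `attention` call above matches the scaled dot-product helper popularized by the Annotated Transformer, which returns both the weighted values and the attention weights. The snippet does not include it, so here is that common definition as a self-contained sketch (an assumption, not necessarily the original helper):

import math
import torch
import torch.nn.functional as F


def attention(query, key, value, mask=None, dropout=None):
    # Scaled dot-product attention: scores (batch, h, len_q, len_k)
    # -> softmax weights -> weighted sum of the values.
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn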
Example #6
    def __init__(self,
                 sequence_length,
                 num_classes,
                 vocab_size,
                 embedding_size,
                 pos_vocab_size,
                 pos_embedding_size,
                 hidden_size,
                 num_heads,
                 attention_size,
                 use_elmo=False,
                 l2_reg_lambda=0.0):
        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32,
                                      shape=[None, sequence_length],
                                      name='input_x')
        self.input_y = tf.placeholder(tf.float32,
                                      shape=[None, num_classes],
                                      name='input_y')
        self.input_text = tf.placeholder(tf.string,
                                         shape=[
                                             None,
                                         ],
                                         name='input_text')
        self.input_e1 = tf.placeholder(tf.int32,
                                       shape=[
                                           None,
                                       ],
                                       name='input_e1')
        self.input_e2 = tf.placeholder(tf.int32,
                                       shape=[
                                           None,
                                       ],
                                       name='input_e2')
        self.input_p1 = tf.placeholder(tf.int32,
                                       shape=[None, sequence_length],
                                       name='input_p1')
        self.input_p2 = tf.placeholder(tf.int32,
                                       shape=[None, sequence_length],
                                       name='input_p2')
        self.emb_dropout_keep_prob = tf.placeholder(
            tf.float32, name='emb_dropout_keep_prob')
        self.rnn_dropout_keep_prob = tf.placeholder(
            tf.float32, name='rnn_dropout_keep_prob')
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name='dropout_keep_prob')

        if use_elmo:
            # Contextual Embedding Layer
            with tf.variable_scope("elmo-embeddings"):
                elmo_model = hub.Module("https://tfhub.dev/google/elmo/2",
                                        trainable=True)
                self.embedded_chars = elmo_model(self.input_text,
                                                 signature="default",
                                                 as_dict=True)["elmo"]
        else:
            # Word Embedding Layer
            with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
                self.W_text = tf.Variable(tf.random_uniform(
                    [vocab_size, embedding_size], -0.25, 0.25),
                                          name="W_text")
                self.embedded_chars = tf.nn.embedding_lookup(
                    self.W_text, self.input_x)

        # Position Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("position-embeddings"):
            self.W_pos = tf.get_variable("W_pos",
                                         [pos_vocab_size, pos_embedding_size],
                                         initializer=initializer())
            self.p1 = tf.nn.embedding_lookup(
                self.W_pos,
                self.input_p1)[:, :tf.shape(self.embedded_chars)[1]]
            self.p2 = tf.nn.embedding_lookup(
                self.W_pos,
                self.input_p2)[:, :tf.shape(self.embedded_chars)[1]]

        # Dropout for Word Embedding
        with tf.variable_scope('dropout-embeddings'):
            self.embedded_chars = tf.nn.dropout(self.embedded_chars,
                                                self.emb_dropout_keep_prob)

        # Self Attention
        with tf.variable_scope("self-attention"):
            self.self_attn, self.self_alphas = multihead_attention(
                self.embedded_chars,
                self.embedded_chars,
                num_units=embedding_size,
                num_heads=num_heads)

        # Bidirectional LSTM
        with tf.variable_scope("bi-lstm"):
            _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size,
                                               initializer=initializer())
            fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell,
                                                    self.rnn_dropout_keep_prob)
            _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size,
                                               initializer=initializer())
            bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell,
                                                    self.rnn_dropout_keep_prob)
            self.rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=fw_cell,
                cell_bw=bw_cell,
                inputs=self.self_attn,
                sequence_length=self._length(self.input_x),
                dtype=tf.float32)
            self.rnn_outputs = tf.concat(self.rnn_outputs, axis=-1)

        # Attention
        with tf.variable_scope('attention'):
            self.attn, self.alphas, self.e1_alphas, self.e2_alphas = attention(
                self.rnn_outputs,
                self.input_e1,
                self.input_e2,
                self.p1,
                self.p2,
                attention_size=attention_size)

        # Dropout
        with tf.variable_scope('dropout'):
            self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

        # Fully connected layer
        with tf.variable_scope('output'):
            self.logits = tf.layers.dense(self.h_drop,
                                          num_classes,
                                          kernel_initializer=initializer())
            self.predictions = tf.argmax(self.logits, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.variable_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.logits, labels=self.input_y)
            self.l2 = tf.add_n(
                [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2

        # Accuracy
        with tf.variable_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   tf.float32),
                                           name="accuracy")
Example #7
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--mode", "-m", type=str)
    arg_parser.add_argument("--preprocess", "-p", type=bool, default=False)
    arg_parser.add_argument("--data", "-d", type=str, default="./data")

    args = arg_parser.parse_args()
    mode = args.mode
    if_preprocess = args.preprocess
    data_dir = args.data

    logging.set_verbosity(logging.INFO)

    train_x, train_y, test_x, train_data, test_data = load_data(data_dir=data_dir, if_preprocess=if_preprocess)

    logging.info("building model...")
    model = attention()
    restored = model.restore()

    if mode == "train":
        logging.info("training...")
        model.train(train_x, train_y,epochs=100,batch_size=150)
    elif mode == "evaluate":
        logging.info("evaluating...")
        if restored:
            for name, value in model.evaluate(train_x,train_y,batch_size=150):
                print("name: %s, value: %f" % (name, value))
        else:
            logging.error("error: model weights not exist!")
    elif mode == "submit":
        logging.info("predicting final result...")
        test_data[LABEL_LIST] = model.predict(test_x, batch_size=150)
Example #8
    def __init__(self, max_sequence_length, num_classes, pos_vocab_size, init_embed, \
                 hidden_size, attention_size, keep_prob, attention_lambda, attention_loss_type, \
                 l2_reg_lambda, use_pos_flag=True, rnn_cell="lstm"):
        # word index
        self.input_word = tf.placeholder(tf.int32, [None, max_sequence_length],
                                         name="input_word")
        # pos index
        self.input_pos = tf.placeholder(tf.int32, [None, max_sequence_length],
                                        name="input_pos")
        # sequence length of words
        self.sequence_length = tf.placeholder(tf.int32, [None], name="length")
        # attention over x
        self.input_attention = tf.placeholder(tf.float32,
                                              [None, max_sequence_length],
                                              name="input_attention")
        # output probability
        self.input_y = tf.placeholder(tf.float32, [None, num_classes],
                                      name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")
        l2_loss = tf.constant(0.0)

        # embedding layer with initialization of words and pos tags
        with tf.name_scope("embedding"):
            W = tf.Variable(init_embed, name="W", dtype=tf.float32)
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_word)
            self.embedded_input = self.embedded_chars

        if (use_pos_flag):
            with tf.name_scope("pos_embedding"):
                W_pos = tf.Variable(tf.eye(pos_vocab_size),
                                    name="W_pos",
                                    dtype=tf.float32)
                self.embedded_pos = tf.nn.embedding_lookup(
                    W_pos, self.input_pos)
                self.embedded_input = tf.concat(
                    [self.embedded_chars, self.embedded_pos], axis=-1)

        # RNN layer + attention for words
        with tf.variable_scope("bi-rnn"):
            if rnn_cell == "gru":
                rnn_outputs, _ = bi_rnn(GRUCell(hidden_size), GRUCell(hidden_size),\
                                        inputs=self.embedded_input, sequence_length=self.sequence_length, \
                                        dtype=tf.float32)
            elif rnn_cell == "lstm":
                rnn_outputs, _ = bi_rnn(LSTMCell(hidden_size), LSTMCell(hidden_size),\
                                        inputs=self.embedded_input, sequence_length=self.sequence_length, \
                                        dtype=tf.float32)
            else:
                raise Exception(
                    "Cell type {} is not supported!".format(rnn_cell))
            attention_outputs, self.alphas = attention(rnn_outputs,
                                                       attention_size,
                                                       return_alphas=True)
            drop_outputs = tf.nn.dropout(attention_outputs, keep_prob)

        # Fully connected layer by taking both rnn-words and rnn-pos as inputs
        with tf.name_scope("fc-layer-1"):
            fc_dim = 10
            W = tf.Variable(tf.truncated_normal(
                [drop_outputs.get_shape()[1].value, fc_dim], stddev=0.1),
                            name="W")
            b = tf.Variable(tf.constant(0.1, shape=[fc_dim]), name="b")
            fc_outputs = tf.nn.tanh(tf.nn.xw_plus_b(drop_outputs, W, b))
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)

        with tf.name_scope("fc-layer-2"):
            W = tf.Variable(tf.truncated_normal(
                [fc_outputs.get_shape()[1].value, num_classes], stddev=0.1),
                            name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            self.logits = tf.nn.xw_plus_b(fc_outputs, W, b)
            self.prob = tf.nn.softmax(self.logits)
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)

        with tf.name_scope("cross_entropy"):
            entropy_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,
                                                        logits=self.logits))
            if (attention_loss_type == "encoded"):
                print("Supervised attention with encoded loss.")
                att_shared_dim = 20
                # rationale input_attention: (batch_size, max_sent_len)
                # W: (max_sent_len, att_shared_dim)
                # b: (att_shared_dim,)
                # proj: (batch_size, att_shared_dim)
                ration_W = tf.Variable(tf.truncated_normal([
                    self.input_attention.get_shape()[1].value, att_shared_dim
                ],
                                                           stddev=0.1),
                                       name="ration_W")
                ration_b = tf.Variable(tf.constant(0.05,
                                                   shape=[att_shared_dim]),
                                       name="ration_b")
                proj_ration = tf.nn.tanh(
                    tf.nn.xw_plus_b(self.input_attention, ration_W, ration_b))
                alpha_W = tf.Variable(tf.truncated_normal(
                    [self.alphas.get_shape()[1].value, att_shared_dim],
                    stddev=0.1),
                                      name="alpha_W")
                alpha_b = tf.Variable(tf.constant(0.05,
                                                  shape=[att_shared_dim]),
                                      name="alpha_b")
                proj_alphas = tf.nn.tanh(
                    tf.nn.xw_plus_b(self.alphas, alpha_W, alpha_b))
                # negative of inner product
                attention_loss = -1 * tf.reduce_mean(
                    tf.multiply(proj_ration, proj_alphas))
            elif (attention_loss_type == "l1"):
                print("Supervised attention with L1 loss.")
                attention_loss = tf.reduce_mean(
                    tf.abs(
                        tf.subtract(tf.nn.softmax(self.input_attention),
                                    self.alphas)))
            elif (attention_loss_type == "l2"):
                print("Supervised attention with L2 loss.")
                attention_loss = tf.reduce_mean(
                    tf.square(
                        tf.subtract(tf.nn.softmax(self.input_attention),
                                    self.alphas)))
            else:
                print("No supervised attention.")
                attention_loss = tf.constant(0.0)
            self.loss = entropy_loss + attention_lambda * attention_loss + l2_reg_lambda * l2_loss
Example #9
    batch_x = tf.placeholder(tf.int32, [None, MAX_DOCUMENT_LENGTH])
    batch_y = tf.placeholder(tf.float32, [None, MAX_LABEL])
    keep_prob = tf.placeholder(tf.float32)

    embeddings_var = tf.Variable(tf.random_uniform(
        [vocab_size, EMBEDDING_SIZE], -1.0, 1.0),
                                 trainable=True)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_x)
    print(batch_embedded.shape)  # (?, 256, 128)

    cell = tf.contrib.rnn.BasicLSTMCell(HIDDEN_SIZE)
    rnn_outputs, _ = tf.nn.dynamic_rnn(cell, batch_embedded, dtype=tf.float32)

    # Attention
    attention_output, alphas = attention(rnn_outputs,
                                         ATTENTION_SIZE,
                                         return_alphas=True)
    drop = tf.nn.dropout(attention_output, keep_prob)
    shape = drop.get_shape()

    # Fully connected layer(dense layer)
    W = tf.Variable(
        tf.truncated_normal([shape[1].value, MAX_LABEL], stddev=0.1))
    b = tf.Variable(tf.constant(0., shape=[MAX_LABEL]))
    y_hat = tf.nn.xw_plus_b(drop, W, b)

    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=batch_y))
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

    # Accuracy metric
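The snippet ends at the accuracy comment; one plausible completion (an assumption, thresholding the sigmoid outputs at 0.5) is:

    correct = tf.equal(tf.round(tf.sigmoid(y_hat)), batch_y)   # assumed completion
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))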
Example #10
    ax = fig.add_subplot(111)

    cax = ax.matshow(df, interpolation='nearest', cmap='hot_r')
    fig.colorbar(cax)

    tick_spacing = 1
    ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))

    ax.set_xticklabels([''] + list(df.columns))
    ax.set_yticklabels([''] + list(df.index))

    plt.show()


if __name__ == '__main__':
    text1 = '今天天气怎么样?'    # "How is the weather today?"
    text2 = '天气不太好,是雨天'  # "The weather isn't great; it's a rainy day"

    model = BertModel.from_pretrained('bert-base-chinese')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    inputs = tokenizer(text=text1, text_pair=text2, return_tensors='pt')
    outputs = model(**inputs)
    sequence_outputs, cls = outputs[:2]
    outputs, p_attn = attention(query=sequence_outputs,
                                key=sequence_outputs,
                                value=sequence_outputs)

    selfattn_visual(p_attn, text=text1, text_pair=text2)
Example #11
    def process(self, x, seq_len, input_keep_prob, output_keep_prob, scope):
        """
        Args:
            x (tensor of list): shape (batch_size, sequence_length, embedding_size)
            seq_len (tensor of list): shape (batch_size, 1)
            input_keep_prob (float): dropout rate 
            output_keep_prob (float): dropout rate
            scope (string): the variable scope for this model 
        """
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            if self.num_layers != 1:
                cells = []
                for i in range(self.num_layers):
                    rnn_cell = DropoutWrapper(
                        GRUCell(self.hidden_size),
                        input_keep_prob=input_keep_prob,
                        output_keep_prob=output_keep_prob)
                    cells.append(rnn_cell)
                self.cell_fw = MultiRNNCell(cells)
            else:
                self.cell_fw = DropoutWrapper(
                    GRUCell(self.hidden_size),
                    input_keep_prob=input_keep_prob,
                    output_keep_prob=output_keep_prob)

            if self.num_layers != 1:
                cells = []
                for i in range(self.num_layers):
                    rnn_cell = DropoutWrapper(
                        GRUCell(self.hidden_size),
                        input_keep_prob=input_keep_prob,
                        output_keep_prob=output_keep_prob)
                    cells.append(rnn_cell)
                self.cell_bw = MultiRNNCell(cells)
            else:
                self.cell_bw = DropoutWrapper(
                    GRUCell(self.hidden_size),
                    input_keep_prob=input_keep_prob,
                    output_keep_prob=output_keep_prob)
            if self.dynamic:
                with tf.name_scope("dynamic-rnn-with-{}-layers".format(
                        self.num_layers)):
                    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                        inputs=x,
                        cell_fw=self.cell_fw,
                        cell_bw=self.cell_bw,
                        sequence_length=seq_len,
                        dtype=tf.float32)
                    # If no initial_state is provided, dtype must be specified
                    output_fw, output_bw = outputs
                    outputs = tf.concat([output_fw, output_bw], axis=2)
                    # shape: batch_size, sequence_length, hidden_size * 2
                    batch_size = tf.shape(outputs)[0]
                    index = tf.range(0, batch_size) * \
                        self.sequence_length + (seq_len - 1)
                    output = tf.gather(
                        tf.reshape(outputs, [-1, self.hidden_size * 2]), index)
                    # shape: batch_size, hidden_size * 2
            else:
                if self.use_attention:
                    x = tf.unstack(x, self.sequence_length, axis=1)
                    # get list (length == sequence_length) of tensors with shape: batch_size, embedding_size
                    with tf.name_scope(
                            "rnn-based-attention-with-{}-layers".format(
                                self.num_layers)):
                        outputs, _, _ = tf.nn.static_bidirectional_rnn(
                            inputs=x,
                            cell_fw=self.cell_fw,
                            cell_bw=self.cell_bw,
                            dtype=tf.float32)
                        # static_bidirectional_rnn is deprecated in newer TF releases
                        outputs = tf.stack(outputs)
                        outputs = tf.transpose(outputs, [1, 0, 2])
                        output, alpha = attention(outputs, self.attention_size)
                else:
                    x = tf.unstack(x, self.sequence_length, axis=1)
                    # get list (length == sequence_length) of tensors with shape: batch_size, embedding_size
                    with tf.name_scope("rnn-with-{}-layers".format(
                            self.num_layers)):
                        outputs, _, _ = tf.nn.static_bidirectional_rnn(
                            inputs=x,
                            cell_fw=self.cell_fw,
                            cell_bw=self.cell_bw,
                            dtype=tf.float32)
                        # static_bidirectional_rnn is deprecated in newer TF releases
                        outputs = tf.stack(outputs)
                        outputs = tf.transpose(outputs, [1, 0, 2])
                        output = tf.reduce_sum(outputs, axis=1)
        return output
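Examples #8, #9 and #11 all call an additive (Bahdanau-style) `attention(outputs, attention_size, ...)` helper that is not included in the snippets. A TF 1.x sketch of that widely used form follows; the exact signatures in the original repositories may differ, so treat it as an assumption:

import tensorflow as tf


def attention(inputs, attention_size, return_alphas=True):
    # inputs: (batch, time, hidden). Project each timestep, score it against
    # a learned context vector, and return the softmax-weighted sum.
    hidden_size = int(inputs.shape[2])
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)    # (batch, time, attention_size)
    vu = tf.tensordot(v, u_omega, axes=1)                           # (batch, time)
    alphas = tf.nn.softmax(vu)
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)  # (batch, hidden)
    return (output, alphas) if return_alphas else output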