def test_attention(self):
    inputs = tf.placeholder(dtype=tf.float32, shape=(2, 2, 3))
    output = attention(inputs)
    # global_variables_initializer replaces the deprecated initialize_all_variables
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        output_val = sess.run(
            output,
            feed_dict={inputs: np.asarray([[[1, 2, 3], [4, 5, 6]],
                                           [[7, 8, 9], [10, 11, 12]]],
                                          dtype='float32')})
        # At initialization the attention weights are uniform, so the pooled
        # output equals the mean over the time axis.
        self.assertTrue(
            (output_val == np.asarray([[2.5, 3.5, 4.5],
                                       [8.5, 9.5, 10.5]])).all(),
            'output')
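# A minimal sketch of an attention-pooling function consistent with the test
# above; this is an assumption, not the project's actual implementation. A
# trainable context vector initialized to zeros gives uniform softmax weights,
# so the initial output is the mean over the time axis, matching the expected
# values [2.5, 3.5, 4.5] and [8.5, 9.5, 10.5]. Uses the TF 1.x API like the
# surrounding snippets.
import tensorflow as tf

def attention(inputs):
    # inputs: (batch_size, time_steps, channels)
    channels = inputs.get_shape().as_list()[-1]
    context = tf.get_variable('attention_context', shape=(channels,),
                              initializer=tf.zeros_initializer())
    scores = tf.tensordot(inputs, context, axes=1)                    # (batch, time)
    weights = tf.nn.softmax(scores, axis=-1)                          # uniform at init
    return tf.reduce_sum(inputs * tf.expand_dims(weights, -1), axis=1)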
def encoder(inputs, params, is_training=True):
    # inputs: (batch_size, time_steps, channels)
    filters = list(params['filters'])[0]
    blocks = params['blocks']
    kernel_size = params['kernel_size']
    strides = params['strides']
    embedding_size = params['embedding_size']
    memory_cells = params['memory_cells']

    output = inputs
    with tf.variable_scope("encoder"):
        for l in range(blocks):
            with tf.variable_scope('block_{}'.format(l + 1)):
                output = tf.layers.conv1d(output, filters=filters,
                                          kernel_size=kernel_size, padding='same')
                output = tf.layers.batch_normalization(output, training=is_training)
                output = tf.nn.relu(output)
                output = tf.layers.max_pooling1d(output, pool_size=kernel_size,
                                                 strides=strides, padding='same')

        # Pool to one fixed-length vector per example, (batch_size, num_channels),
        # using the attention mechanism.
        output = attention(output)

        with tf.variable_scope('output_transformer'):
            output = tf.layers.dense(output, embedding_size)

        # Apply memory.
        if memory_cells > 0:
            output = read_memory(output)

        # Apply L2 norm.
        output = tf.nn.l2_normalize(output, 1, name="l2_embedding")

    return output
def encoder(inputs, params, is_training=True):
    # inputs: (batch_size, time_steps, channels)
    filters_list = list(params['filters'])
    blocks = params['blocks']
    kernel_size = params['kernel_size']
    strides = params['strides']
    embedding_size = params['embedding_size']
    memory_cells = params['memory_cells']

    output = inputs
    with tf.variable_scope("encoder"):
        for stage, filters in enumerate(filters_list):
            output = conv_and_res_block(output,
                                        kernel_size=kernel_size,
                                        filters=filters,
                                        strides=strides,
                                        stage=stage + 1,
                                        blocks=blocks,
                                        is_training=is_training)

        # Pool to one fixed-length vector per example, (batch_size, num_channels),
        # using the attention mechanism.
        output = attention(output)

        with tf.variable_scope('output_transformer'):
            output = tf.layers.dense(output, embedding_size)

        # Apply memory.
        if memory_cells > 0:
            output = read_memory(output)

        # Apply L2 norm.
        output = tf.nn.l2_normalize(output, 1, name="l2_embedding")

    return output
def self_attention(self, num_heads=1, residual='concat', queries_eq_keys=False):
    word_embs = glove(self.words, self.params['words'], self.params['glove'])
    char_embs = get_char_representations(
        self.chars, self.nchars, self.params['chars'],
        mode='lstm', training=self.training
    )
    html_embs = get_soft_html_representations(
        self.html, self.params['html_tags'],
        self.css_chars, self.css_lengths, self.params['chars'],
        training=self.training
    )

    embs = tf.concat([word_embs, char_embs, html_embs], axis=-1)
    embs = self.dropout(embs)

    output = self.lstm(embs, self.params['lstm_size'])
    output = self.dropout(output)

    return attention(
        output, output, num_heads,
        residual=residual,
        queries_eq_keys=queries_eq_keys,
        training=self.training
    )
def forward(self, query, key, value, mask=None):
    """
    query, key, value:
        shape (batch_size, sentence_len_enc, d_model) for encoder values,
        shape (batch_size, sentence_len_dec, d_model) for decoder values.
    mask:
        shape (batch_size, 1, sentence_len) for the encoder mask,
        shape (batch_size, sentence_len, sentence_len) for the decoder mask.
    """
    # Implements Figure 2 (multi-head attention).
    if mask is not None:
        # Same mask applied to all heads.
        mask = mask.unsqueeze(1)
    batch_size = query.size(0)

    # 1) Project and reshape each tensor into shape (batch_size, h, sentence_len, d_k).
    query, key, value = [
        l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        for l, x in zip(self.linears, (query, key, value))
    ]

    # 2) Apply attention on all the projected vectors in batch;
    #    x has shape (batch_size, h, sent_len, d_k).
    x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)

    # 3) Concatenate the heads and apply the final linear layer;
    #    return shape: (batch_size, sent_len, d_model).
    x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
    return self.linears[-1](x)
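# The attention() helper called above is not shown in this snippet. A minimal
# sketch of scaled dot-product attention with the signature used above (an
# assumption, following the standard softmax(QK^T / sqrt(d_k)) V formulation):
import math
import torch
import torch.nn.functional as F

def attention(query, key, value, mask=None, dropout=None):
    # query, key, value: (batch_size, h, sent_len, d_k)
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Masked positions get a large negative score before the softmax.
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn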
def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
             pos_vocab_size, pos_embedding_size, hidden_size, num_heads,
             attention_size, use_elmo=False, l2_reg_lambda=0.0):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
    self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
    self.input_text = tf.placeholder(tf.string, shape=[None], name='input_text')
    self.input_e1 = tf.placeholder(tf.int32, shape=[None], name='input_e1')
    self.input_e2 = tf.placeholder(tf.int32, shape=[None], name='input_e2')
    self.input_p1 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p1')
    self.input_p2 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p2')
    self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
    self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    if use_elmo:
        # Contextual Embedding Layer
        with tf.variable_scope("elmo-embeddings"):
            elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
            self.embedded_chars = elmo_model(self.input_text, signature="default",
                                             as_dict=True)["elmo"]
    else:
        # Word Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
            self.W_text = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25),
                name="W_text")
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_x)

    # Position Embedding Layer
    with tf.device('/cpu:0'), tf.variable_scope("position-embeddings"):
        self.W_pos = tf.get_variable("W_pos", [pos_vocab_size, pos_embedding_size],
                                     initializer=initializer())
        self.p1 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p1)[:, :tf.shape(self.embedded_chars)[1]]
        self.p2 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p2)[:, :tf.shape(self.embedded_chars)[1]]

    # Dropout for Word Embedding
    with tf.variable_scope('dropout-embeddings'):
        self.embedded_chars = tf.nn.dropout(self.embedded_chars, self.emb_dropout_keep_prob)

    # Self-Attention
    with tf.variable_scope("self-attention"):
        self.self_attn, self.self_alphas = multihead_attention(
            self.embedded_chars, self.embedded_chars,
            num_units=embedding_size, num_heads=num_heads)

    # Bidirectional LSTM
    with tf.variable_scope("bi-lstm"):
        _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell, self.rnn_dropout_keep_prob)
        _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell, self.rnn_dropout_keep_prob)
        self.rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=fw_cell, cell_bw=bw_cell,
            inputs=self.self_attn,
            sequence_length=self._length(self.input_x),
            dtype=tf.float32)
        self.rnn_outputs = tf.concat(self.rnn_outputs, axis=-1)

    # Attention
    with tf.variable_scope('attention'):
        self.attn, self.alphas, self.e1_alphas, self.e2_alphas = attention(
            self.rnn_outputs, self.input_e1, self.input_e2,
            self.p1, self.p2, attention_size=attention_size)

    # Dropout
    with tf.variable_scope('dropout'):
        self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

    # Fully connected layer
    with tf.variable_scope('output'):
        self.logits = tf.layers.dense(self.h_drop, num_classes,
                                      kernel_initializer=initializer())
        self.predictions = tf.argmax(self.logits, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.variable_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits, labels=self.input_y)
        self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2

    # Accuracy
    with tf.variable_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32),
                                       name="accuracy")
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--mode", "-m", type=str)
arg_parser.add_argument("--preprocess", "-p", type=bool, default=False)
arg_parser.add_argument("--data", "-d", type=str, default="./data")
args = arg_parser.parse_args()

mode = args.mode
if_preprocess = args.preprocess
data_dir = args.data

logging.set_verbosity(logging.INFO)

train_x, train_y, test_x, train_data, test_data = load_data(
    data_dir=data_dir, if_preprocess=if_preprocess)

logging.info("building model...")
model = attention()
restored = model.restore()

if mode == "train":
    logging.info("training...")
    model.train(train_x, train_y, epochs=100, batch_size=150)
elif mode == "evaluate":
    logging.info("evaluating...")
    if restored:
        for name, value in model.evaluate(train_x, train_y, batch_size=150):
            print("name: %s, value: %f" % (name, value))
    else:
        logging.error("error: model weights do not exist!")
elif mode == "submit":
    logging.info("predicting final result...")
    test_data[LABEL_LIST] = model.predict(test_x, batch_size=150)
def __init__(self, max_sequence_length, num_classes, pos_vocab_size, init_embed,
             hidden_size, attention_size, keep_prob, attention_lambda,
             attention_loss_type, l2_reg_lambda, use_pos_flag=True, rnn_cell="lstm"):
    # Word indices
    self.input_word = tf.placeholder(tf.int32, [None, max_sequence_length], name="input_word")
    # POS indices
    self.input_pos = tf.placeholder(tf.int32, [None, max_sequence_length], name="input_pos")
    # Sequence length of words
    self.sequence_length = tf.placeholder(tf.int32, [None], name="length")
    # Attention over x
    self.input_attention = tf.placeholder(tf.float32, [None, max_sequence_length],
                                          name="input_attention")
    # Output probability
    self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    l2_loss = tf.constant(0.0)

    # Embedding layer with initialization of words and POS tags
    with tf.name_scope("embedding"):
        W = tf.Variable(init_embed, name="W", dtype=tf.float32)
        self.embedded_chars = tf.nn.embedding_lookup(W, self.input_word)
        self.embedded_input = self.embedded_chars
    if use_pos_flag:
        with tf.name_scope("pos_embedding"):
            W_pos = tf.Variable(tf.eye(pos_vocab_size), name="W_pos", dtype=tf.float32)
            self.embedded_pos = tf.nn.embedding_lookup(W_pos, self.input_pos)
            self.embedded_input = tf.concat([self.embedded_chars, self.embedded_pos], axis=-1)

    # RNN layer + attention for words
    with tf.variable_scope("bi-rnn"):
        if rnn_cell == "gru":
            rnn_outputs, _ = bi_rnn(GRUCell(hidden_size), GRUCell(hidden_size),
                                    inputs=self.embedded_input,
                                    sequence_length=self.sequence_length,
                                    dtype=tf.float32)
        elif rnn_cell == "lstm":
            rnn_outputs, _ = bi_rnn(LSTMCell(hidden_size), LSTMCell(hidden_size),
                                    inputs=self.embedded_input,
                                    sequence_length=self.sequence_length,
                                    dtype=tf.float32)
        else:
            raise Exception("Cell type {} is not supported!".format(rnn_cell))
        attention_outputs, self.alphas = attention(rnn_outputs, attention_size,
                                                   return_alphas=True)
        drop_outputs = tf.nn.dropout(attention_outputs, keep_prob)

    # Fully connected layer taking both rnn-words and rnn-pos as inputs
    with tf.name_scope("fc-layer-1"):
        fc_dim = 10
        W = tf.Variable(tf.truncated_normal(
            [drop_outputs.get_shape()[1].value, fc_dim], stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[fc_dim]), name="b")
        fc_outputs = tf.nn.tanh(tf.nn.xw_plus_b(drop_outputs, W, b))
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
    with tf.name_scope("fc-layer-2"):
        W = tf.Variable(tf.truncated_normal(
            [fc_outputs.get_shape()[1].value, num_classes], stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        self.logits = tf.nn.xw_plus_b(fc_outputs, W, b)
        self.prob = tf.nn.softmax(self.logits)
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)

    with tf.name_scope("cross_entropy"):
        entropy_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,
                                                    logits=self.logits))

    if attention_loss_type == "encoded":
        print("Supervised attention with encoded loss.")
        att_shared_dim = 20
        # rationale input_attention: (batch_size, max_sent_len)
        # W: (max_sent_len, att_shared_dim)
        # b: (att_shared_dim,)
        # proj: (batch_size, att_shared_dim)
        ration_W = tf.Variable(tf.truncated_normal(
            [self.input_attention.get_shape()[1].value, att_shared_dim],
            stddev=0.1), name="ration_W")
        ration_b = tf.Variable(tf.constant(0.05, shape=[att_shared_dim]), name="ration_b")
        proj_ration = tf.nn.tanh(tf.nn.xw_plus_b(self.input_attention, ration_W, ration_b))
        alpha_W = tf.Variable(tf.truncated_normal(
            [self.alphas.get_shape()[1].value, att_shared_dim],
            stddev=0.1), name="alpha_W")
        alpha_b = tf.Variable(tf.constant(0.05, shape=[att_shared_dim]), name="alpha_b")
        proj_alphas = tf.nn.tanh(tf.nn.xw_plus_b(self.alphas, alpha_W, alpha_b))
        # Negative of the inner product between the two projections
        attention_loss = -1 * tf.reduce_mean(tf.multiply(proj_ration, proj_alphas))
    elif attention_loss_type == "l1":
        print("Supervised attention with L1 loss.")
        attention_loss = tf.reduce_mean(
            tf.abs(tf.subtract(tf.nn.softmax(self.input_attention), self.alphas)))
    elif attention_loss_type == "l2":
        print("Supervised attention with L2 loss.")
        attention_loss = tf.reduce_mean(
            tf.square(tf.subtract(tf.nn.softmax(self.input_attention), self.alphas)))
    else:
        print("No supervised attention.")
        attention_loss = tf.constant(0.0)

    self.loss = (entropy_loss
                 + attention_lambda * attention_loss
                 + l2_reg_lambda * l2_loss)
batch_x = tf.placeholder(tf.int32, [None, MAX_DOCUMENT_LENGTH])
batch_y = tf.placeholder(tf.float32, [None, MAX_LABEL])
keep_prob = tf.placeholder(tf.float32)

embeddings_var = tf.Variable(
    tf.random_uniform([vocab_size, EMBEDDING_SIZE], -1.0, 1.0), trainable=True)
batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_x)
print(batch_embedded.shape)  # (?, 256, 128)

cell = tf.contrib.rnn.BasicLSTMCell(HIDDEN_SIZE)
rnn_outputs, _ = tf.nn.dynamic_rnn(cell, batch_embedded, dtype=tf.float32)

# Attention
attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
drop = tf.nn.dropout(attention_output, keep_prob)
shape = drop.get_shape()

# Fully connected (dense) layer
W = tf.Variable(tf.truncated_normal([shape[1].value, MAX_LABEL], stddev=0.1))
b = tf.Variable(tf.constant(0., shape=[MAX_LABEL]))
y_hat = tf.nn.xw_plus_b(drop, W, b)

loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=batch_y))
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Accuracy metric
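# The snippet above stops at the accuracy metric. A minimal sketch (not from
# the original source) of one metric consistent with the sigmoid multi-label
# setup above: threshold the logits at 0 (probability 0.5) and compare
# element-wise with the labels. It reuses y_hat and batch_y from the graph above.
predictions = tf.cast(tf.greater(y_hat, 0.), tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, batch_y), tf.float32))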
# Tail of the attention-heatmap plotting helper (presumably selfattn_visual),
# which renders the attention-weight matrix df with matplotlib.
ax = fig.add_subplot(111)
cax = ax.matshow(df, interpolation='nearest', cmap='hot_r')
fig.colorbar(cax)

tick_spacing = 1
ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
ax.yaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
ax.set_xticklabels([''] + list(df.columns))
ax.set_yticklabels([''] + list(df.index))
plt.show()


if __name__ == '__main__':
    text1 = '今天天气怎么样?'      # "How is the weather today?"
    text2 = '天气不太好,是雨天'    # "The weather is not great; it's a rainy day."

    model = BertModel.from_pretrained('bert-base-chinese')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    inputs = tokenizer(text=text1, text_pair=text2, return_tensors='pt')
    outputs = model(**inputs)
    sequence_outputs, cls = outputs[:2]

    # Self-attention over the BERT sequence outputs, then visualize the weights.
    outputs, p_attn = attention(query=sequence_outputs,
                                key=sequence_outputs,
                                value=sequence_outputs)
    selfattn_visual(p_attn, text=text1, text_pair=text2)
def process(self, x, seq_len, input_keep_prob, output_keep_prob, scope):
    """
    Args:
        x (tensor): shape (batch_size, sequence_length, embedding_size)
        seq_len (tensor): shape (batch_size, 1)
        input_keep_prob (float): dropout keep probability for cell inputs
        output_keep_prob (float): dropout keep probability for cell outputs
        scope (string): the variable scope for this model
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Build the forward cell (stacked if num_layers > 1).
        if self.num_layers != 1:
            cells = []
            for i in range(self.num_layers):
                rnn_cell = DropoutWrapper(GRUCell(self.hidden_size),
                                          input_keep_prob=input_keep_prob,
                                          output_keep_prob=output_keep_prob)
                cells.append(rnn_cell)
            self.cell_fw = MultiRNNCell(cells)
        else:
            self.cell_fw = DropoutWrapper(GRUCell(self.hidden_size),
                                          input_keep_prob=input_keep_prob,
                                          output_keep_prob=output_keep_prob)
        # Build the backward cell (stacked if num_layers > 1).
        if self.num_layers != 1:
            cells = []
            for i in range(self.num_layers):
                rnn_cell = DropoutWrapper(GRUCell(self.hidden_size),
                                          input_keep_prob=input_keep_prob,
                                          output_keep_prob=output_keep_prob)
                cells.append(rnn_cell)
            self.cell_bw = MultiRNNCell(cells)
        else:
            self.cell_bw = DropoutWrapper(GRUCell(self.hidden_size),
                                          input_keep_prob=input_keep_prob,
                                          output_keep_prob=output_keep_prob)

        if self.dynamic:
            with tf.name_scope("dynamic-rnn-with-{}-layers".format(self.num_layers)):
                # If no initial_state is provided, dtype must be specified.
                outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    inputs=x,
                    cell_fw=self.cell_fw,
                    cell_bw=self.cell_bw,
                    sequence_length=seq_len,
                    dtype=tf.float32)
                output_fw, output_bw = outputs
                # shape: (batch_size, sequence_length, hidden_size * 2)
                outputs = tf.concat([output_fw, output_bw], axis=2)
                # Gather the output at the last valid time step of each sequence.
                batch_size = tf.shape(outputs)[0]
                index = tf.range(0, batch_size) * self.sequence_length + (seq_len - 1)
                # shape: (batch_size, hidden_size * 2)
                output = tf.gather(tf.reshape(outputs, [-1, self.hidden_size * 2]), index)
        else:
            if self.use_attention:
                # List (length == sequence_length) of tensors with
                # shape (batch_size, embedding_size).
                x = tf.unstack(x, self.sequence_length, axis=1)
                with tf.name_scope("rnn-based-attention-with-{}-layers".format(self.num_layers)):
                    # static_bidirectional_rnn will be deprecated.
                    outputs, _, _ = tf.nn.static_bidirectional_rnn(
                        inputs=x,
                        cell_fw=self.cell_fw,
                        cell_bw=self.cell_bw,
                        dtype=tf.float32)
                    outputs = tf.stack(outputs)
                    outputs = tf.transpose(outputs, [1, 0, 2])
                    output, alpha = attention(outputs, self.attention_size)
            else:
                # List (length == sequence_length) of tensors with
                # shape (batch_size, embedding_size).
                x = tf.unstack(x, self.sequence_length, axis=1)
                with tf.name_scope("rnn-with-{}-layers".format(self.num_layers)):
                    # static_bidirectional_rnn will be deprecated.
                    outputs, _, _ = tf.nn.static_bidirectional_rnn(
                        inputs=x,
                        cell_fw=self.cell_fw,
                        cell_bw=self.cell_bw,
                        dtype=tf.float32)
                    outputs = tf.stack(outputs)
                    outputs = tf.transpose(outputs, [1, 0, 2])
                    output = tf.reduce_sum(outputs, axis=1)
        return output
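# The attention(outputs, attention_size) call above (and the similar calls with
# return_alphas=True in the earlier bi-RNN snippets) relies on a helper that is
# not shown here. A minimal sketch, assuming the common additive
# (Bahdanau-style) formulation that scores each time step with a small
# feed-forward layer and returns the weighted sum plus the weights; TF 1.x API
# as in the surrounding code.
import tensorflow as tf

def attention(inputs, attention_size, return_alphas=True):
    # inputs: (batch_size, sequence_length, hidden_size)
    hidden_size = inputs.shape[2].value
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)    # (B, T, A)
    vu = tf.tensordot(v, u_omega, axes=1)                           # (B, T)
    alphas = tf.nn.softmax(vu)                                      # attention weights
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)  # (B, hidden_size)
    return (output, alphas) if return_alphas else output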