def transformer(self, num_blocks=2, num_heads=5, mid_layer='feed_forward'):
    word_embs = glove(self.words, self.params['words'], self.params['glove'])
    char_embs = get_char_representations(
        self.chars, self.nchars, self.params['chars'],
        mode='lstm', training=self.training
    )
    html_embs = get_soft_html_representations(
        self.html, self.params['html_tags'],
        self.css_chars, self.css_lengths, self.params['chars'],
        training=self.training
    )
    embs = tf.concat([word_embs, char_embs, html_embs], axis=-1)
    # embs = word_embs
    # embs += pos_embeddings(embs, 1000)
    x = self.dropout(embs)

    for i in range(num_blocks):
        output = multihead_attention(
            queries=x, keys=x, values=x,
            num_heads=num_heads, dropout_rate=0.5,
            training=self.training, causality=False
        )
        if mid_layer == 'feed_forward':
            output = tf.layers.dense(output, 450, activation=tf.nn.relu)
            output = tf.layers.dense(output, 450)
            # Residual connection
            output += x
            # Normalize
            x = normalize(output)
        elif mid_layer == 'lstm':
            # A bidirectional LSTM outputs a tensor whose last dimension is twice
            # the hidden size, so halve it (integer division) to keep the residual
            # shapes compatible.
            x = self.lstm(x, x.shape[2].value // 2,
                          var_scope='transformer_' + str(i)) + x
    return x
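# The residual block above calls a `normalize` helper that is not shown here.
# A minimal sketch, assuming it performs standard layer normalization over the
# last dimension (the real helper may differ):
import tensorflow as tf

def normalize(inputs, epsilon=1e-8, scope="ln", reuse=None):
    """Layer-normalize the last dimension of `inputs` (assumed behaviour)."""
    with tf.variable_scope(scope, reuse=reuse):
        params_shape = inputs.get_shape()[-1:]
        # Per-position mean/variance over the feature dimension
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / tf.sqrt(variance + epsilon)
        return gamma * normalized + beta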
def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
             pos_vocab_size, pos_embedding_size, hidden_size, num_heads,
             attention_size, use_elmo=False, l2_reg_lambda=0.0):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
    self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
    self.input_text = tf.placeholder(tf.string, shape=[None], name='input_text')
    self.input_e1 = tf.placeholder(tf.int32, shape=[None], name='input_e1')
    self.input_e2 = tf.placeholder(tf.int32, shape=[None], name='input_e2')
    self.input_p1 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p1')
    self.input_p2 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p2')
    self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
    self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    if use_elmo:
        # Contextual Embedding Layer
        with tf.variable_scope("elmo-embeddings"):
            elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
            self.embedded_chars = elmo_model(self.input_text, signature="default",
                                             as_dict=True)["elmo"]
    else:
        # Word Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
            self.W_text = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25),
                name="W_text")
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_x)

    # Position Embedding Layer
    with tf.device('/cpu:0'), tf.variable_scope("position-embeddings"):
        self.W_pos = tf.get_variable("W_pos", [pos_vocab_size, pos_embedding_size],
                                     initializer=initializer())
        self.p1 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p1)[:, :tf.shape(self.embedded_chars)[1]]
        self.p2 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p2)[:, :tf.shape(self.embedded_chars)[1]]

    # Dropout for Word Embedding
    with tf.variable_scope('dropout-embeddings'):
        self.embedded_chars = tf.nn.dropout(self.embedded_chars, self.emb_dropout_keep_prob)

    # Self-Attention
    with tf.variable_scope("self-attention"):
        self.self_attn, self.self_alphas = multihead_attention(
            self.embedded_chars, self.embedded_chars,
            num_units=embedding_size, num_heads=num_heads)

    # Bidirectional LSTM
    with tf.variable_scope("bi-lstm"):
        _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell, self.rnn_dropout_keep_prob)
        _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell, self.rnn_dropout_keep_prob)
        self.rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=fw_cell, cell_bw=bw_cell,
            inputs=self.self_attn,
            sequence_length=self._length(self.input_x),
            dtype=tf.float32)
        self.rnn_outputs = tf.concat(self.rnn_outputs, axis=-1)

    # Attention
    with tf.variable_scope('attention'):
        self.attn, self.alphas, self.e1_alphas, self.e2_alphas = attention(
            self.rnn_outputs, self.input_e1, self.input_e2,
            self.p1, self.p2, attention_size=attention_size)

    # Dropout
    with tf.variable_scope('dropout'):
        self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

    # Fully connected layer
    with tf.variable_scope('output'):
        self.logits = tf.layers.dense(self.h_drop, num_classes,
                                      kernel_initializer=initializer())
        self.predictions = tf.argmax(self.logits, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.variable_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits, labels=self.input_y)
        self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2

    # Accuracy
    with tf.variable_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32),
                                       name="accuracy")
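# Both models pass `self._length(self.input_x)` as `sequence_length` to
# `tf.nn.bidirectional_dynamic_rnn`, but the helper itself is not shown.
# A minimal sketch, assuming token id 0 is the padding id (that convention is an
# assumption, not confirmed by the source):
@staticmethod
def _length(seq):
    # 1 for every non-padding token, 0 for padding, summed along the time axis
    relevant = tf.sign(tf.abs(seq))
    length = tf.reduce_sum(relevant, axis=1)
    return tf.cast(length, tf.int32)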
def __init__(self, sequence_length, rw_length, num_classes, vocab_size,
             rw_vocab_size, rw_pos_vocab_size, embedding_size, pos_vocab_size,
             pos_embedding_size, hidden_size, num_heads, attention_size,
             use_elmo=False, l2_reg_lambda=0.0):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
    self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
    self.input_text = tf.placeholder(tf.string, shape=[None], name='input_text')
    self.input_e1 = tf.placeholder(tf.int32, shape=[None], name='input_e1')
    self.input_e2 = tf.placeholder(tf.int32, shape=[None], name='input_e2')
    self.input_p1 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p1')
    self.input_p2 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p2')

    # Related-word (rw) placeholders
    self.input_rw_x = tf.placeholder(tf.int32, shape=[None, rw_length], name='input_rw_x')
    self.input_rw_text = tf.placeholder(tf.string, shape=[None], name='input_rw_text')
    self.input_rw_pos_x = tf.placeholder(tf.int32, shape=[None, rw_length], name='input_rw_pos_x')
    self.input_rw_pos_text = tf.placeholder(tf.string, shape=[None], name='input_rw_pos_text')
    self.input_rw_cate = tf.placeholder(tf.float32, shape=[None, 11], name='input_rw_cate')

    self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
    self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    if use_elmo:
        # Contextual Embedding Layer
        with tf.variable_scope("elmo-embeddings"):
            elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
            self.embedded_chars = elmo_model(self.input_text, signature="default",
                                             as_dict=True)["elmo"]
            self.rw_embedding = elmo_model(self.input_rw_text, signature="default",
                                           as_dict=True)["elmo"]
            self.rw_pos_embedding = elmo_model(self.input_rw_pos_text, signature="default",
                                               as_dict=True)["elmo"]
    else:
        # Word Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
            self.W_text = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25),
                name="W_text")
            self.W_rw_text = tf.Variable(
                tf.random_uniform([rw_vocab_size, embedding_size], -0.25, 0.25),
                name="W_rw_text")
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_x)
            self.rw_embedding = tf.nn.embedding_lookup(self.W_rw_text, self.input_rw_x)

    # Position Embedding Layer
    with tf.device('/cpu:0'), tf.variable_scope("position-embeddings"):
        self.W_pos = tf.get_variable("W_pos", [pos_vocab_size, pos_embedding_size],
                                     initializer=initializer())
        self.p1 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p1)[:, :tf.shape(self.embedded_chars)[1]]
        self.p2 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p2)[:, :tf.shape(self.embedded_chars)[1]]
        self.W_rw_pos_text = tf.get_variable(
            "W_rw_pos_text", [rw_pos_vocab_size, embedding_size],
            initializer=initializer())
        self.rw_pos_embedding = tf.nn.embedding_lookup(
            self.W_rw_pos_text, self.input_rw_pos_x)

    # Dropout for Word Embedding
    with tf.variable_scope('dropout-embeddings'):
        self.embedded_chars = tf.nn.dropout(self.embedded_chars, self.emb_dropout_keep_prob)
        self.rw_embedding = tf.nn.dropout(self.rw_embedding, self.emb_dropout_keep_prob)
        self.rw_pos_embedding = tf.nn.dropout(self.rw_pos_embedding, self.emb_dropout_keep_prob)

    # Self-Attention
    with tf.variable_scope("self-attention"):
        self.self_attn, self.self_alphas = multihead_attention(
            self.embedded_chars, self.embedded_chars,
            num_units=embedding_size, num_heads=num_heads)
        self.rw_pos_self_attn, self.rw_pos_self_alpha = multihead_attention2(
            self.rw_embedding, self.embedded_chars,
            num_units=embedding_size, num_heads=num_heads)

    # Bidirectional LSTM
    with tf.variable_scope("bi-lstm"):
        _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell, self.rnn_dropout_keep_prob)
        _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell, self.rnn_dropout_keep_prob)
        self.rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=fw_cell, cell_bw=bw_cell,
            inputs=self.self_attn,
            sequence_length=self._length(self.input_x),
            dtype=tf.float32)
        self.rnn_outputs = tf.concat(self.rnn_outputs, axis=-1)

    # Multi-scale CNN over the related-word attention output
    with tf.variable_scope("rw_multi-scale-cnn"):
        self.self_attn2 = tf.reshape(
            self.rw_pos_self_attn,
            [-1, self.rw_pos_self_attn.shape[1], self.rw_pos_self_attn.shape[2], 1])
        conv1 = tf.layers.conv2d(inputs=self.self_attn2, filters=50,
                                 kernel_size=[1, self.self_attn2.shape[2]],
                                 padding="valid", activation=tf.nn.relu)
        pool1 = tf.keras.layers.GlobalMaxPooling2D()(conv1)
        conv2 = tf.layers.conv2d(inputs=self.self_attn2, filters=50,
                                 kernel_size=[2, self.self_attn2.shape[2]],
                                 padding="valid", activation=tf.nn.relu)
        pool2 = tf.keras.layers.GlobalMaxPooling2D()(conv2)
        conv3 = tf.layers.conv2d(inputs=self.self_attn2, filters=50,
                                 kernel_size=[3, self.self_attn2.shape[2]],
                                 padding="valid", activation=tf.nn.relu)
        pool3 = tf.keras.layers.GlobalMaxPooling2D()(conv3)
        conv4 = tf.layers.conv2d(inputs=self.self_attn2, filters=50,
                                 kernel_size=[4, self.self_attn2.shape[2]],
                                 padding="valid", activation=tf.nn.relu)
        pool4 = tf.keras.layers.GlobalMaxPooling2D()(conv4)
        self.rw_conv = tf.concat([pool1, pool2, pool3, pool4], axis=-1)

    # Attention
    with tf.variable_scope('attention1'):
        self.attn1, self.alphas, self.trans = attention1(
            self.rnn_outputs, self.input_e1, self.input_e2,
            self.p1, self.p2, attention_size=attention_size)

    # Dropout
    with tf.variable_scope('dropout'):
        # c = tf.concat([self.conv, self.rw_conv], axis=-1)
        self.h_drop1 = tf.nn.dropout(self.attn1, self.dropout_keep_prob)
        self.h_drop2 = tf.nn.dropout(self.rw_conv, self.dropout_keep_prob)

    # Fully connected layer
    with tf.variable_scope('output'):
        self.logits = tf.layers.dense(self.h_drop1, num_classes,
                                      kernel_initializer=initializer())
        self.logits2 = tf.layers.dense(self.h_drop2, num_classes,
                                       kernel_initializer=initializer())
        self.l = tf.add(self.logits, self.logits2)
        self.dir = tf.layers.dense(self.trans, 3, kernel_initializer=initializer())
        self.predictions = tf.argmax(self.l, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.variable_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits, labels=self.input_y)
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits2, labels=self.input_y)
        self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2 + tf.reduce_mean(losses2)

    # Accuracy
    with tf.variable_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32),
                                       name="accuracy")
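# Because the class above builds a static TF 1.x graph, training runs through a
# session and a feed dict. A hypothetical single training step is sketched below;
# the class name `RelationModel`, the optimizer, the hyperparameter values and the
# random dummy batches are all illustrative assumptions, not part of the source.
import numpy as np
import tensorflow as tf

batch, seq_len, rw_len, n_cls = 8, 90, 30, 19
model = RelationModel(sequence_length=seq_len, rw_length=rw_len, num_classes=n_cls,
                      vocab_size=20000, rw_vocab_size=5000, rw_pos_vocab_size=50,
                      embedding_size=300, pos_vocab_size=200, pos_embedding_size=50,
                      hidden_size=300, num_heads=6, attention_size=50)
train_op = tf.train.AdamOptimizer(1e-3).minimize(model.loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {
        model.input_x: np.random.randint(1, 20000, (batch, seq_len)),
        model.input_y: np.eye(n_cls)[np.random.randint(0, n_cls, batch)],
        model.input_p1: np.random.randint(0, 200, (batch, seq_len)),
        model.input_p2: np.random.randint(0, 200, (batch, seq_len)),
        model.input_e1: np.random.randint(0, seq_len, batch),
        model.input_e2: np.random.randint(0, seq_len, batch),
        model.input_rw_x: np.random.randint(1, 5000, (batch, rw_len)),
        model.input_rw_pos_x: np.random.randint(1, 50, (batch, rw_len)),
        # Keep probabilities: below 1.0 during training, 1.0 at test time
        model.emb_dropout_keep_prob: 0.7,
        model.rnn_dropout_keep_prob: 0.7,
        model.dropout_keep_prob: 0.5,
    }
    _, loss_val, acc_val = sess.run([train_op, model.loss, model.accuracy],
                                    feed_dict=feed)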
def model_fn(features, labels, mode, params):
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    x = features
    unchanged_labels = labels  # non-smoothed labels, kept for the F1 metric
    if params.label_smooth and labels is not None:
        labels = tf.cast(labels, tf.float32)
        labels = label_smoothing(labels, epsilon=params.epsilon)

    # Build embedding vectors
    vector = word_embedding(x, params.vector_path, scale=False)

    # ! Reduce the fixed word-embedding dimension to the model dimension
    if params.hidden_size != vector.get_shape().as_list()[-1]:
        # The original paper uses a fully connected layer for this projection
        with tf.variable_scope("dimension_reduction"):
            vector = tf.layers.dense(
                vector, params.hidden_size, activation=None, use_bias=False,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0))

    # Scale the word embedding
    vector = vector * (params.hidden_size ** 0.5)

    # Add positional information to the word vectors
    vector += position_embedding(x, num_units=params.hidden_size, scale=False)

    # * Applying a dropout mask to the embedding vector may not be a good idea
    vector = tf.layers.dropout(vector, rate=params.dropout_rate,
                               training=tf.convert_to_tensor(is_training))

    # Transformer attention stacks
    for i in range(params.num_attention_stacks):
        with tf.variable_scope(f"num_attention_stacks_{i + 1}"):
            # Multi-head attention
            vector = multihead_attention(
                queries=vector,
                keys=vector,
                num_units=params.hidden_size,
                num_heads=params.num_heads,
                dropout_rate=params.dropout_rate,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0),
                is_training=is_training,
                causality=False)
            # Feed forward
            vector = feedforward(
                vector,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                num_units=[2 * params.hidden_size, params.hidden_size])

    attentions = vector
    # Add an innermost dimension to mimic a single (grayscale) channel
    # (N, attention_stacks*T, C, 1)
    attentions = tf.expand_dims(attentions, -1)

    # ************************************************************
    # Attention part complete; now the CNN feature-capture part
    # ************************************************************
    logits = []
    # One inception + max-pool classifier per category
    for topic in range(params.multi_categories):
        # cnn_features: (n, 1, 1, total_filter_num)
        cnn_features = inception(
            attentions,
            filter_size_list=params.filter_size_list,
            num_filters=params.num_filters,
            hidden_size=params.hidden_size,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0),
            scope=f"category_{topic + 1}_inception")
        total_feature_num = len(params.filter_size_list) * params.num_filters
        # cnn_features: (n, total_filter_num)
        cnn_features = tf.reshape(cnn_features, (-1, total_feature_num))
        # category_logits: (n, num_sentiment)
        category_logits = dense_logits(
            cnn_features,
            params.num_sentiment,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0),
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            scope=f"category_{topic + 1}_logits",
            inner_dense_outshape=params.inner_dense_outshape,
            inner_dense_activation=tf.tanh,
            use_bias=True)
        # Append this category's logits to the list
        logits.append(category_logits)
    # logits: (n, multi_categories, num_sentiment)
    logits = tf.stack(logits, axis=1)

    # * Common part for train & eval
    if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
        gstep = tf.train.get_or_create_global_step()
        # loss: (n, multi_categories)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
        loss = tf.reduce_sum(loss, axis=1)   # (n,)
        loss = tf.reduce_mean(loss, axis=0)  # scalar
        if params.use_regularizer:
            loss_reg = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            loss += params.reg_const * loss_reg
        loss = tf.identity(loss, name="loss")

        # predictions = tf.nn.softmax(logits)
        predictions = tf.cast(
            tf.equal(tf.reduce_max(logits, axis=-1, keepdims=True), logits),
            tf.float32)
        avg_macro_f1, avg_macro_f1_update_op = average_macro_f1(
            labels=tf.cast(unchanged_labels, tf.float32),
            predictions=predictions)
        eval_metric_ops = {'avg_macro_f1': (avg_macro_f1, avg_macro_f1_update_op)}

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("f1", avg_macro_f1)
        summary_hook = tf.train.SummarySaverHook(
            save_steps=params.print_n_step,
            output_dir="./summary",
            summary_op=tf.summary.merge_all())
    else:
        loss = None
        eval_metric_ops = None

    # * Train-specific part
    if mode == tf.estimator.ModeKeys.TRAIN:
        learning_rate = tf.train.cosine_decay_restarts(
            learning_rate=params.learning_rate,
            global_step=gstep,
            first_decay_steps=params.first_decay_steps,
            t_mul=params.t_mul,
            m_mul=params.m_mul,
            alpha=params.alpha,
            name="learning_rate")
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=params.momentum)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(gradients, params.max_norm)
        train_op = optimizer.apply_gradients(zip(gradients, variables),
                                             global_step=gstep)
        # Add custom training logger
        custom_logger = _LoggerHook(loss, gstep, learning_rate, params.print_n_step)
    else:
        train_op = None

    # * Predict part
    if mode == tf.estimator.ModeKeys.PREDICT:
        # At prediction time logits has shape (multi_categories, num_sentiment);
        # pred: (multi_categories,)
        pred = tf.subtract(tf.argmax(logits, axis=-1), 2)
        predictions = {"classes": pred}
        export_outputs = {"classify": tf.estimator.export.PredictOutput(predictions)}
    else:
        predictions = None
        export_outputs = None

    training_hooks = [custom_logger, summary_hook] if is_training else None
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops,
                                      export_outputs=export_outputs,
                                      training_hooks=training_hooks)
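# A `model_fn` in this form plugs straight into the `tf.estimator` API. A minimal
# wiring sketch, assuming an HParams-style `params` object and `train_input_fn` /
# `eval_input_fn` callables defined elsewhere; every hyperparameter value below is
# illustrative, not taken from the project's real config.
import tensorflow as tf

params = tf.contrib.training.HParams(
    label_smooth=True, epsilon=0.1, vector_path="embeddings.npy",
    hidden_size=256, dropout_rate=0.1, num_attention_stacks=2, num_heads=8,
    multi_categories=20, num_sentiment=4, filter_size_list=[1, 2, 3, 4],
    num_filters=64, inner_dense_outshape=128, use_regularizer=True, reg_const=1e-4,
    learning_rate=0.01, first_decay_steps=1000, t_mul=2.0, m_mul=0.9, alpha=1e-4,
    momentum=0.9, max_norm=5.0, print_n_step=100)

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir="./model_dir",
                                   params=params)
estimator.train(input_fn=train_input_fn, steps=10000)
print(estimator.evaluate(input_fn=eval_input_fn))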