def rnn_forward(config, inputs, scope=None): with tf.variable_scope(scope or "forward"): JX, JQ = config.max_context_size, config.max_ques_size d = config.hidden_size x, x_len, q, q_len = [inputs[key] for key in ['x', 'x_len', 'q', 'q_len']] x_mask = tf.sequence_mask(x_len, JX) q_mask = tf.sequence_mask(q_len, JQ) # emb_mat = tf.get_variable('emb_mat', shape=[V, d]) emb_mat = config.emb_mat_ph if config.serve else config.emb_mat emb_mat = tf.slice(emb_mat, [2, 0], [-1, -1]) emb_mat = tf.concat([tf.get_variable('emb_mat', shape=[2, d]), emb_mat], axis=0) xx = tf.nn.embedding_lookup(emb_mat, x, name='xx') # [N, JX, d] qq = tf.nn.embedding_lookup(emb_mat, q, name='qq') # [N, JQ, d] #now process xx and qq with this new matrices with tf.variable_scope('xx-encoder'): fw_xx_cell = GRUCell(d) fw_xx_cell = DropoutWrapper(cell=fw_xx_cell, output_keep_prob=config.keep_prob) bw_xx_cell = GRUCell(d) bw_xx_cell = DropoutWrapper(cell=bw_xx_cell, output_keep_prob=config.keep_prob) outputs_xx, _ = bidirectional_dynamic_rnn( fw_xx_cell, bw_xx_cell, xx, dtype=tf.float32) with tf.variable_scope('qq-encoder'): fw_qq_cell = GRUCell(d) fw_qq_cell = DropoutWrapper(cell=fw_qq_cell, output_keep_prob=config.keep_prob) bw_qq_cell = GRUCell(d) fw_xx_cell = DropoutWrapper(cell=fw_xx_cell, output_keep_prob=config.keep_prob) outputs_qq, _ = bidirectional_dynamic_rnn( fw_qq_cell, bw_qq_cell, qq, dtype=tf.float32) # print('ACHTUNG\n',outputs_xx.shape) xx_fwbw=tf.concat(outputs_xx, 2) qq_fwbw=tf.concat(outputs_qq, 2) # q_mask=tf.concat([q_mask,q_mask],0) # x_mask=tf.concat([x_mask,x_mask],0) # q_mask_exp=tf.concat([q_mask,q_mask],2) qq_avg = tf.reduce_mean(bool_mask(qq_fwbw, q_mask, expand=True), axis=1) # [N, d] qq_avg_exp = tf.expand_dims(qq_avg, axis=1) # [N, 1, d] qq_avg_tiled = tf.tile(qq_avg_exp, [1, JX, 1]) # [N, JX, d] xq = tf.concat([xx_fwbw, qq_avg_tiled, xx_fwbw * qq_avg_tiled], axis=2) # [N, JX, 3d] xq_flat = tf.reshape(xq, [-1, 2*3*d]) # [N * JX, 3*d] # Compute logits with tf.variable_scope('start'): logits1 = exp_mask(tf.reshape(tf.layers.dense(inputs=xq_flat, units=1), [-1, JX]), x_mask) # [N, JX] yp1 = tf.argmax(logits1, axis=1) # [N] with tf.variable_scope('stop'): logits2 = exp_mask(tf.reshape(tf.layers.dense(inputs=xq_flat, units=1), [-1, JX]), x_mask) # [N, JX] yp2 = tf.argmax(logits2, axis=1) # [N] outputs = {'logits1': logits1, 'logits2': logits2, 'yp1': yp1, 'yp2': yp2} variables = {'emb_mat': emb_mat} return variables, outputs
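# The helpers bool_mask and exp_mask used above (and in the other forward
# functions below) are referenced but not defined in this file. A minimal
# sketch of the usual implementations is given here as an assumption:
# bool_mask zeroes out padded positions, and exp_mask pushes masked logits
# toward -inf so a later softmax/argmax ignores them. The names and the
# VERY_NEGATIVE_NUMBER constant are illustrative, not taken from the original.
VERY_NEGATIVE_NUMBER = -1e30


def bool_mask(val, mask, expand=False):
    """Zero out entries of `val` where `mask` is False.

    val:  float tensor, e.g. [N, JQ, 2d]
    mask: boolean tensor, e.g. [N, JQ]; with expand=True a trailing dim is
          added so it broadcasts over the feature axis.
    """
    if expand:
        mask = tf.expand_dims(mask, axis=-1)
    return val * tf.cast(mask, val.dtype)


def exp_mask(logits, mask):
    """Add a very large negative number to masked-out logits.

    logits: [N, JX] float tensor
    mask:   [N, JX] boolean tensor (True = keep)
    """
    return logits + (1.0 - tf.cast(mask, logits.dtype)) * VERY_NEGATIVE_NUMBER
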
def _rnn_layer(self, name, is_train_mode, in_sequence, in_lengths):
    with tf.variable_scope(name):
        fw_cell = tfcontrib.rnn.LSTMCell(
            self.hidden_size, state_is_tuple=True,
            initializer=tf.truncated_normal_initializer(stddev=.001))
        bw_cell = tfcontrib.rnn.LSTMCell(
            self.hidden_size, state_is_tuple=True,
            initializer=tf.truncated_normal_initializer(stddev=.001))
        if is_train_mode:
            fw_cell = tfcontrib.rnn.DropoutWrapper(
                fw_cell, output_keep_prob=self.out_keep_prob)
            bw_cell = tfcontrib.rnn.DropoutWrapper(
                bw_cell, output_keep_prob=self.out_keep_prob)
        output, _ = nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, in_sequence, sequence_length=in_lengths,
            time_major=False, dtype=tf.float32)
        output = tf.concat(output, 2)
        return output

def _projection_lstm_layer(self):
    """Bi-LSTM encoder followed by a linear projection to per-token tag logits."""
    with tf.variable_scope("projection_lstm_layer"):
        cells = {
            "fw": nn.rnn_cell.DropoutWrapper(
                nn.rnn_cell.LSTMCell(num_units=self.hidden_dim,
                                     initializer=tc.layers.xavier_initializer(),
                                     state_is_tuple=True),
                output_keep_prob=self.dropout),
            "bw": nn.rnn_cell.DropoutWrapper(
                nn.rnn_cell.LSTMCell(num_units=self.hidden_dim,
                                     initializer=tc.layers.xavier_initializer(),
                                     state_is_tuple=True),
                output_keep_prob=self.dropout)}
        outputs, state = nn.bidirectional_dynamic_rnn(
            cell_fw=cells["fw"], cell_bw=cells["bw"], inputs=self.embedding,
            sequence_length=self.length, dtype=tf.float32)
        outputs = tf.concat(outputs, axis=2)
        w = tf.get_variable("W", shape=[self.hidden_dim * 2, self.num_tags],
                            dtype=tf.float32,
                            initializer=tc.layers.xavier_initializer())
        b = tf.get_variable("b", shape=[self.num_tags], dtype=tf.float32,
                            initializer=tf.zeros_initializer())
        # xw_plus_b expects a 2-D input, so flatten the time dimension before
        # the projection and restore it afterwards
        num_steps = tf.shape(outputs)[1]
        outputs = tf.reshape(outputs, [-1, self.hidden_dim * 2])
        outputs = tf.nn.xw_plus_b(outputs, w, b)
        outputs = tf.reshape(outputs, [-1, num_steps, self.num_tags])
        return outputs

def BiLSTM_Correlation_BiLSTM(cn, cor, weights, biases):
    # BiLSTM
    lstm_fw_cell = rnn.BasicLSTMCell(n_bilstm_hidden, forget_bias=1.0)
    lstm_bw_cell = rnn.BasicLSTMCell(n_bilstm_hidden, forget_bias=1.0)
    bilstm_outputs, _ = nn.bidirectional_dynamic_rnn(
        lstm_fw_cell, lstm_bw_cell, cn, dtype=tf.float32, scope='input')
    bilstm_output = tf.matmul(tf.concat(bilstm_outputs, axis=2)[0],
                              weights['bilstm']) + biases['bilstm']

    # Attention
    lstm2_input = tf.convert_to_tensor([tf.matmul(cor, bilstm_output)])

    # BiLSTM
    lstm2_fw_cell = rnn.BasicLSTMCell(n_bilstm2_hidden, forget_bias=1.0)
    lstm2_bw_cell = rnn.BasicLSTMCell(n_bilstm2_hidden, forget_bias=1.0)
    bilstm2_outputs, _ = nn.bidirectional_dynamic_rnn(
        lstm2_fw_cell, lstm2_bw_cell, lstm2_input, dtype=tf.float32, scope='output')
    bilstm2_output = tf.matmul(tf.concat(bilstm2_outputs, axis=2)[0],
                               weights['bilstm2']) + biases['bilstm2']
    return bilstm2_output

def BiLSTM(x, x_len, n_hidden, biRnnScopeName):
    lstm_fw_cell = rnn.LSTMCell(n_hidden)
    lstm_bw_cell = rnn.LSTMCell(n_hidden)
    outputs, output_states = nn.bidirectional_dynamic_rnn(
        lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32,
        sequence_length=x_len, scope=biRnnScopeName)
    return outputs, output_states

def BiGRU(x, x_len, n_hidden, biRnnScopeName):
    gru_fw_cell = rnn.GRUCell(n_hidden)
    gru_bw_cell = rnn.GRUCell(n_hidden)
    outputs, output_states = nn.bidirectional_dynamic_rnn(
        gru_fw_cell, gru_bw_cell, x, dtype=tf.float32,
        sequence_length=x_len, scope=biRnnScopeName)
    return outputs, output_states

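# A small usage sketch for the BiLSTM/BiGRU helpers above. The imports and the
# placeholder shapes (a batch of variable-length sequences of 300-d embeddings)
# are assumptions for illustration; the helpers themselves only require a
# [batch, time, depth] input, its lengths, a hidden size, and a scope name.
import tensorflow as tf
from tensorflow import nn
from tensorflow.contrib import rnn

x = tf.placeholder(tf.float32, [None, None, 300], name='x')   # [N, T, 300]
x_len = tf.placeholder(tf.int32, [None], name='x_len')        # [N]

(out_fw, out_bw), (state_fw, state_bw) = BiLSTM(x, x_len, 128, 'bilstm_enc')
h = tf.concat([out_fw, out_bw], axis=2)                        # [N, T, 256]

(gru_fw, gru_bw), _ = BiGRU(x, x_len, 128, 'bigru_enc')
g = tf.concat([gru_fw, gru_bw], axis=2)                        # [N, T, 256]
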
def separable_lstm2(cell, num_units, inputs, seq_lengths1, seq_lengths2, scope=None):
    """Run bidirectional LSTMs first horizontally, then vertically.

    Args:
        cell: an RNN cell constructor
        num_units: number of neurons per cell
        inputs: input tensor of shape (batch_size, height, width, depth)
        seq_lengths1: sequence lengths for the horizontal (per-row) pass
        seq_lengths2: sequence lengths for the vertical (per-column) pass
        scope: optional scope name

    Returns:
        (batch_size, height, width, num_units*2) tensor
    """
    with variable_scope.variable_scope(scope, "SeparableLstm", [inputs]):
        batch_size = tf.shape(inputs)[0]
        _, height, width, depth = _shape(inputs)
        reshaped = array_ops.reshape(inputs, [batch_size * width, height, depth])
        _, states = bidirectional_dynamic_rnn(
            cell(num_units), cell(num_units), reshaped,
            sequence_length=seq_lengths1, dtype=tf.float32)
        stacked_state = array_ops.concat(states, 1)
        with variable_scope.variable_scope("vertical"):
            unpacked = array_ops.reshape(stacked_state,
                                         [batch_size, width, num_units * 2])
            _, states = bidirectional_dynamic_rnn(
                cell(num_units), cell(num_units), unpacked,
                sequence_length=seq_lengths2, dtype=tf.float32)
            return array_ops.concat(states, 1)

def BiRNN(self, x, x_lens, n_steps, n_hidden, biRnnScopeName, dropoutName):
    x = tf.nn.dropout(x, 0.5, name=dropoutName)
    # Forward direction cell
    lstm_fw_cell = rnn.LSTMCell(n_hidden)
    # Backward direction cell
    lstm_bw_cell = rnn.LSTMCell(n_hidden)
    # a scope is needed to keep the cells of different BiRNNs distinct
    outputs, output_states = nn.bidirectional_dynamic_rnn(
        lstm_fw_cell, lstm_bw_cell, x, sequence_length=x_lens,
        dtype=tf.float32, scope=biRnnScopeName)
    return outputs, output_states

def BiLSTM(x, x_len, n_hidden, biRnnScopeName):
    lstm_fw_cell = rnn.LSTMCell(n_hidden)
    lstm_bw_cell = rnn.LSTMCell(n_hidden)
    outputs, output_states = nn.bidirectional_dynamic_rnn(
        lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32,
        sequence_length=x_len, scope=biRnnScopeName)
    # output_states is an (fw, bw) tuple; each element is an LSTMStateTuple (c, h)
    return outputs, output_states

def biLSTMCell(x, x_lens, hiddenSize):
    # forward LSTM cell
    lstm_fw_cell = rnn.LSTMCell(hiddenSize)
    # backward LSTM cell
    lstm_bw_cell = rnn.LSTMCell(hiddenSize)
    # bidirectional LSTM
    outputs, output_states = nn.bidirectional_dynamic_rnn(
        lstm_fw_cell, lstm_bw_cell, x, sequence_length=x_lens, dtype=tf.float32)
    output_fw, output_bw = outputs
    # each of output_fw / output_bw is (batch_size, max_time, n_hidden);
    # concatenate along the feature axis
    return tf.concat([output_fw, output_bw], 2)

def bilstm(self, sequence, sequence_length, lstm_unit, reuse=None):
    with tf.variable_scope('BiLSTM', reuse=reuse, dtype=tf.float32):
        cell_fw = LSTMCell(num_units=lstm_unit, reuse=tf.get_variable_scope().reuse)
        cell_bw = LSTMCell(num_units=lstm_unit, reuse=tf.get_variable_scope().reuse)
        ((output_fw, output_bw), _) = bidirectional_dynamic_rnn(
            cell_fw, cell_bw, sequence, dtype=tf.float32,
            sequence_length=sequence_length)
        return tf.concat([output_fw, output_bw], axis=2)  # (batch_size, num_step, lstm_unit * 2)

def forward(self, X, is_training=False):
    if self.cell_type == CellType.Bidir_Dynamic:
        return bidirectional_dynamic_rnn(cell_fw=self.cells, cell_bw=self.cells,
                                         inputs=X, dtype=tf.float32)
    elif self.cell_type == CellType.Bidir_Static:
        X = tf.unstack(X, num=self.seq_length, axis=1)
        return static_bidirectional_rnn(cell_fw=self.cells, cell_bw=self.cells,
                                        inputs=X, dtype=tf.float32)
    elif self.cell_type == CellType.Dynamic:
        return dynamic_rnn(self.cells, X, dtype=tf.float32)
    elif self.cell_type == CellType.Static:
        X = tf.unstack(X, num=self.seq_length, axis=1)
        return static_rnn(self.cells, X, dtype=tf.float32)

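# The CellType switch in forward() above relies on an enum that is not shown
# here; a plausible definition is sketched below as an assumption. Note that
# the static branches require self.seq_length to be a Python int, since
# tf.unstack needs a statically known number of time steps.
from enum import Enum


class CellType(Enum):
    Bidir_Dynamic = 0   # tf.nn.bidirectional_dynamic_rnn
    Bidir_Static = 1    # tf.contrib.rnn.static_bidirectional_rnn
    Dynamic = 2         # tf.nn.dynamic_rnn
    Static = 3          # tf.contrib.rnn.static_rnn
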
def bidirectional_horizontal_lstm(cell, num_units, inputs, seq_lengths, scope=None):
    he_init = tf.contrib.layers.variance_scaling_initializer()
    with variable_scope.variable_scope(scope, "BiHorizontalLstm", [inputs]):
        batch_size = tf.shape(inputs)[0]
        height = _shape(inputs)[1]
        sequence = grid_to_sequence(inputs)
        forward_cell = cell(num_units)
        backward_cell = cell(num_units)
        outputs, states = bidirectional_dynamic_rnn(
            forward_cell, backward_cell, sequence,
            sequence_length=seq_lengths, time_major=True, dtype=tf.float32)
        stacked_state = tf.expand_dims(array_ops.concat(states, 1), 0)
        output = sequence_to_grid(stacked_state, batch_size, height)
        return output

def forward(self, x, computation_mode=MakiRestorable.INFERENCE_MODE):
    if self._cell_type == CellType.BIDIR_DYNAMIC:
        (outputs_f, outputs_b), (states_f, states_b) = bidirectional_dynamic_rnn(
            cell_fw=self._cells, cell_bw=self._cells, inputs=x, dtype=tf.float32)
        # Creating separate MakiTensors for `outputs_f` and `outputs_b` is not
        # supported: the algorithm that builds the computational graph does not
        # handle that case and would raise an error.
        self._cells_state = tf.concat([states_f, states_b], axis=-1)
        return tf.concat([outputs_f, outputs_b], axis=-1)
    elif self._cell_type == CellType.BIDIR_STATIC:
        x = tf.unstack(x, num=self._seq_length, axis=1)
        outputs_fb, states_f, states_b = static_bidirectional_rnn(
            cell_fw=self._cells, cell_bw=self._cells, inputs=x, dtype=tf.float32)
        # concatenate the forward and backward final states
        # (the original concatenated states_f with itself)
        self._cells_state = tf.concat([states_f, states_b], axis=-1)
        return outputs_fb
    elif self._cell_type == CellType.DYNAMIC:
        outputs, states = dynamic_rnn(self._cells, x, dtype=tf.float32)
        self._cells_state = states
        return outputs
    elif self._cell_type == CellType.STATIC:
        x = tf.unstack(x, num=self._seq_length, axis=1)
        outputs, states = static_rnn(self._cells, x, dtype=tf.float32)
        self._cells_state = states
        return tf.stack(outputs, axis=1)

fw_cells = [
    DropoutWrapper(fw_cell, output_keep_prob=keep_prob)
    for fw_cell in fw_cells
]
fw_cells = MultiRNNCell(fw_cells)

with tf.variable_scope('Backward'):
    bw_cells = [GRUCell(num_units) for _ in range(num_layers)]
    bw_cells = [
        DropoutWrapper(bw_cell, output_keep_prob=keep_prob)
        for bw_cell in bw_cells
    ]
    bw_cells = MultiRNNCell(bw_cells)

outputs, states = bidirectional_dynamic_rnn(
    fw_cells, bw_cells, rnn_input, dtype=tf.float32,
    sequence_length=sequence_length)
# NOTE:
# 'outputs' is a tensor of shape [batch_size, max_time, num_of_units]
# 'states' is an (fw, bw) pair; each element is an N-tuple of final GRU
# states, one per stacked GRUCell layer
fw_states, bw_states = states
fw_states = fw_states[-1]  # [batch_size, num_of_units]
bw_states = bw_states[-1]  # [batch_size, num_of_units]
fc_states = tf.concat([fw_states, bw_states], 1)

with tf.variable_scope('full_connected'):
    fc = tf.contrib.layers.fully_connected(fc_states, num_class,

def build_graph(training_setting):
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default():
        with tf.name_scope('inputs') as name_scope:
            X_sent = tf.placeholder(
                tf.int32, [None, training_setting['maximum_sent_length']],
                name='x_sent')
            y = tf.placeholder(tf.float32,
                               [None, training_setting['classes_num']], name='y')
            dropout = tf.placeholder(tf.float32, shape=[], name='dropout')
            pretrained_embeddings_input = tf.placeholder(
                tf.float32,
                shape=[training_setting['pretrained_vocab_length'],
                       training_setting['embedding_size']],
                name='pretrained_embeddings_ph')

        with tf.name_scope('embedding') as name_scope:
            reserved_embeddings = tf.Variable(
                tf.random_uniform([training_setting['reserved_vocab_length'],
                                   training_setting['embedding_size']], -1.0, 1.0),
                trainable=True, name='reserved_embeddings')
            if training_setting['use_pretrained_embeddings']:
                pretrained_embeddings = tf.Variable(
                    tf.random_uniform([training_setting['pretrained_vocab_length'],
                                       training_setting['embedding_size']], -1.0, 1.0),
                    trainable=False, name='pretrained_embeddings')
                assign_pretrained_embeddings = tf.assign(
                    pretrained_embeddings, pretrained_embeddings_input,
                    name='assign_pretrained_embeddings')
            else:
                pretrained_embeddings = tf.Variable(
                    tf.random_uniform([training_setting['pretrained_vocab_length'],
                                       training_setting['embedding_size']], -1.0, 1.0),
                    trainable=True, name='pretrained_embeddings')
            # map ids so that reserved ids land in even slots and pretrained ids
            # in odd slots, matching the "mod" partition strategy of the
            # two-partition embedding_lookup below
            X_sent = tf.where(
                tf.less(X_sent,
                        tf.constant(training_setting['reserved_vocab_length'])),
                X_sent * 2,
                (X_sent - training_setting['reserved_vocab_length']) * 2 + 1)
            word_embeddings_sent = tf.nn.embedding_lookup(
                [reserved_embeddings, pretrained_embeddings], X_sent,
                name='word_embeddings_sent')
            # word_embeddings_sent = tf_print(word_embeddings_sent, 'word_embeddings_sent')

        with tf.name_scope('gru_cell_sent') as name_scope:
            gru_forward_sent = rnn.DropoutWrapper(
                rnn.GRUCell(training_setting['hidden_units']),
                output_keep_prob=dropout)
            gru_backward_sent = rnn.DropoutWrapper(
                rnn.GRUCell(training_setting['hidden_units']),
                output_keep_prob=dropout)
            (gru_output_forward, gru_output_backward), _ = nn.bidirectional_dynamic_rnn(
                gru_forward_sent, gru_backward_sent, word_embeddings_sent,
                dtype=tf.float32, scope=name_scope)
            bidirectional_gru_output_sent = tf.concat(
                axis=2, values=(gru_output_forward, gru_output_backward),
                name='output_sent')
            # bidirectional_gru_output_sent1 = tf_print(bidirectional_gru_output_sent1, 'bidirectional_gru_output_sent1')

        with tf.name_scope('pooling') as name_scope:
            W = tf.Variable(
                tf.random_normal([2 * training_setting['hidden_units']],
                                 name='attention_weight'))
            b = tf.Variable(tf.random_normal([1]), name='attention_bias')
            # W = tf_print(W, 'W')
            attentions = tf.reduce_sum(
                tf.multiply(W, bidirectional_gru_output_sent), axis=2) + b
            attentions = tf.nn.softmax(attentions)
            # attentions = tf_print(attentions, 'attentions')
            expand_attentions = tf.expand_dims(attentions, 1)
            transpose_outputs = tf.transpose(bidirectional_gru_output_sent,
                                             perm=[0, 2, 1])
            attentions_output = tf.reduce_sum(
                tf.transpose(tf.multiply(expand_attentions, transpose_outputs),
                             perm=[0, 2, 1]), axis=1)
            # attentions_output = tf_print(attentions_output, 'attentions_output')

        with tf.name_scope('mlp') as name_scope:
            W_mlp = tf.Variable(
                tf.random_normal([2 * training_setting['hidden_units'],
                                  training_setting['classes_num']]))
            b_mlp = tf.Variable(tf.random_normal([training_setting['classes_num']]))
            logits = tf.matmul(attentions_output, W_mlp) + b_mlp
            probability = tf.nn.softmax(logits, name='probability')
            y_pred = tf.one_hot(tf.argmax(probability, 1),
                                depth=training_setting['classes_num'],
                                name='y_pred')

        with tf.name_scope('loss') as name_scope:
            loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y,
                                                           name='loss')

        with tf.name_scope('optimizer') as name_scope:
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=training_setting['learning_rate']).minimize(
                    loss, name='optimizer')
    return graph

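# A sketch of how the name-based graph returned by build_graph() above might be
# driven. It relies only on the name= arguments used in the graph; the
# 'inputs/...' and 'loss/...' prefixes come from the enclosing tf.name_scope
# blocks, and the batch iterator, the keep-prob value of 0.5, and the exact
# operation names are assumptions that may need adjusting (e.g. by inspecting
# graph.get_operations()) if TensorFlow uniquifies the names.
def run_training(graph, training_setting, batches, pretrained_matrix=None):
    with graph.as_default():
        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            if training_setting['use_pretrained_embeddings'] and pretrained_matrix is not None:
                sess.run('embedding/assign_pretrained_embeddings:0',
                         feed_dict={'inputs/pretrained_embeddings_ph:0': pretrained_matrix})
            for x_batch, y_batch in batches:
                _, batch_loss = sess.run(
                    ['optimizer/optimizer', 'loss/loss:0'],
                    feed_dict={'inputs/x_sent:0': x_batch,
                               'inputs/y:0': y_batch,
                               'inputs/dropout:0': 0.5})  # output_keep_prob for the GRU wrappers
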
def build_graph(training_setting):
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default():
        with tf.name_scope('inputs') as name_scope:
            sequence_length = tf.placeholder(tf.int32, [None], name='sequence_length')
            X_sent = tf.placeholder(
                tf.int32, [None, training_setting['maximum_sent_length']],
                name='x_sent')
            y_slot = tf.placeholder(
                tf.int32, [None, training_setting['maximum_sent_length']],
                name='y_slot')
            y_intent = tf.placeholder(tf.int32, [None], name='y_intent')
            # one-hot as float so it can be consumed directly by
            # softmax_cross_entropy_with_logits (the original used tf.int32)
            y_intent = tf.one_hot(y_intent, depth=training_setting['intent_num'],
                                  dtype=tf.float32)
            dropout = tf.placeholder(tf.float32, shape=[], name='dropout')
            pretrained_embeddings_input = tf.placeholder(
                tf.float32,
                shape=[training_setting['pretrained_vocab_length'],
                       training_setting['embedding_size']],
                name='pretrained_embeddings_ph')

        with tf.name_scope('embedding') as name_scope:
            reserved_embeddings = tf.Variable(
                tf.random_uniform([training_setting['reserved_vocab_length'],
                                   training_setting['embedding_size']], -1.0, 1.0),
                trainable=True, name='reserved_embeddings')
            if training_setting['use_pretrained_embeddings']:
                pretrained_embeddings = tf.Variable(
                    tf.random_uniform([training_setting['pretrained_vocab_length'],
                                       training_setting['embedding_size']], -1.0, 1.0),
                    trainable=False, name='pretrained_embeddings')
                assign_pretrained_embeddings = tf.assign(
                    pretrained_embeddings, pretrained_embeddings_input,
                    name='assign_pretrained_embeddings')
            else:
                pretrained_embeddings = tf.Variable(
                    tf.random_uniform([training_setting['pretrained_vocab_length'],
                                       training_setting['embedding_size']], -1.0, 1.0),
                    trainable=True, name='pretrained_embeddings')
            # reserved ids -> even slots, pretrained ids -> odd slots, matching
            # the "mod" partition strategy of the two-partition lookup below
            X_sent = tf.where(
                tf.less(X_sent,
                        tf.constant(training_setting['reserved_vocab_length'])),
                X_sent * 2,
                (X_sent - training_setting['reserved_vocab_length']) * 2 + 1)
            word_embeddings_sent = tf.nn.embedding_lookup(
                [reserved_embeddings, pretrained_embeddings], X_sent,
                name='word_embeddings_sent')
            # word_embeddings_sent = tf_print(word_embeddings_sent, 'word_embeddings_sent')

        with tf.name_scope('gru_cell_sent') as name_scope:
            gru_forward_sent = rnn.DropoutWrapper(
                rnn.GRUCell(training_setting['hidden_units']),
                output_keep_prob=dropout)
            gru_backward_sent = rnn.DropoutWrapper(
                rnn.GRUCell(training_setting['hidden_units']),
                output_keep_prob=dropout)
            (gru_output_forward, gru_output_backward), _ = nn.bidirectional_dynamic_rnn(
                gru_forward_sent, gru_backward_sent, word_embeddings_sent,
                dtype=tf.float32, sequence_length=sequence_length, scope=name_scope)
            bidirectional_gru_output_sent = tf.concat(
                axis=2, values=(gru_output_forward, gru_output_backward),
                name='output_sent')
            # bidirectional_gru_output_sent = tf_print(bidirectional_gru_output_sent, 'bidirectional_gru_output_sent')

        with tf.name_scope('pooling') as name_scope:
            W = tf.Variable(
                tf.random_normal([2 * training_setting['hidden_units']],
                                 name='attention_weight'))
            b = tf.Variable(tf.random_normal([1]), name='attention_bias')
            # W = tf_print(W, 'W')
            attentions = tf.reduce_sum(
                tf.multiply(W, bidirectional_gru_output_sent), axis=2) + b
            attentions = tf.nn.softmax(attentions)
            # attentions = tf_print(attentions, 'attentions')
            expand_attentions = tf.expand_dims(attentions, 1)
            transpose_outputs = tf.transpose(bidirectional_gru_output_sent,
                                             perm=[0, 2, 1])
            attentions_output = tf.reduce_sum(
                tf.transpose(tf.multiply(expand_attentions, transpose_outputs),
                             perm=[0, 2, 1]), axis=1)
            # attentions_output = tf_print(attentions_output, 'attentions_output')

        with tf.name_scope('slot') as name_scope:
            # attentions_output = tf_print(attentions_output, 'attention output')
            W_projection_slot = tf.get_variable(
                "W_projection_slot", shape=[64, training_setting['slot_num']])
            b_projection_slot = tf.get_variable(
                "b_projection_slot", shape=[training_setting['slot_num']])
            logits = []
            hidden_states_list = []
            for i in range(training_setting['maximum_sent_length']):
                feature = bidirectional_gru_output_sent[:, i, :]
                hidden_states = tf.layers.dense(feature, 64, activation=tf.nn.tanh)
                output = tf.matmul(hidden_states, W_projection_slot) + b_projection_slot
                logits.append(output)
                hidden_states_list.append(hidden_states)
            logits_slots = tf.stack(logits, axis=1)
            y_pred_slot = tf.argmax(logits_slots, axis=2, name="y_pred")

        with tf.name_scope('intent') as name_scope:
            W_mlp = tf.Variable(
                tf.random_normal([2 * training_setting['hidden_units'],
                                  training_setting['intent_num']]))
            b_mlp = tf.Variable(tf.random_normal([training_setting['intent_num']]))
            logits_intent = tf.matmul(attentions_output, W_mlp) + b_mlp
            y_pred_intent = tf.argmax(logits_intent, 1, name='y_pred')

        with tf.name_scope('loss') as name_scope:
            mask = tf.to_float(tf.not_equal(sequence_length, 0))
            loss_slot = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=y_slot, logits=logits_slots)
            loss_slot = tf.reduce_sum(loss_slot, axis=1) / training_setting['maximum_sent_length']
            loss_slot = tf.reduce_mean(loss_slot)
            loss_intent = tf.nn.softmax_cross_entropy_with_logits(
                labels=y_intent, logits=logits_intent)
            loss_intent = tf.reduce_mean(loss_intent)
            # l2_losses = tf.add_n(
            #     [tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * self.l2_lambda
            # weights_intent = tf.nn.sigmoid(tf.cast(self.global_step / 1000, dtype=tf.float32)) / 2
            loss = loss_slot + loss_intent
            loss = tf.identity(loss, name='loss')
            # loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_intent, name='loss')

        with tf.name_scope('optimizer') as name_scope:
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=training_setting['learning_rate']).minimize(
                    loss, name='optimizer')
    return graph

def attention_forward(config, inputs, scope=None): with tf.variable_scope(scope or "rnn_attention"): JX, JQ = config.max_context_size, config.max_ques_size d = config.hidden_size x, x_len, q, q_len = [inputs[key] for key in ['x', 'x_len', 'q', 'q_len']] x_mask = tf.sequence_mask(x_len, JX) # emb_mat = tf.get_variable('emb_mat', shape=[V, d]) emb_mat = config.emb_mat_ph if config.serve else config.emb_mat emb_mat = tf.slice(emb_mat, [2, 0], [-1, -1]) emb_mat = tf.concat( [tf.get_variable('emb_mat', shape=[2, d]), emb_mat], axis=0) xx = tf.nn.embedding_lookup(emb_mat, x, name='xx') # [N, JX, d] qq = tf.nn.embedding_lookup(emb_mat, q, name='qq') # [N, JQ, d] with tf.variable_scope("context_rnn"): # run context embeddings through GRU dropout = 0.1 fw_cell = GRUCell(64) bw_cell = GRUCell(64) if config.is_train: fw_cell = DropoutWrapper( fw_cell, output_keep_prob=(1.0 - dropout)) bw_cell = DropoutWrapper( bw_cell, output_keep_prob=(1.0 - dropout)) (output_fw, output_bw), _ = bidirectional_dynamic_rnn( fw_cell, bw_cell, xx, dtype=tf.float32, sequence_length=x_len) xx_rnn_toobig = tf.concat([output_fw, output_bw], axis=2) xx_rnn = tf.layers.dense(xx_rnn_toobig, 50, activation=None) with tf.variable_scope("question_rnn"): # run question embeddings through GRU dropout = 0.1 fw_cell2 = GRUCell(64) bw_cell2 = GRUCell(64) if config.is_train: fw_cell2 = DropoutWrapper( fw_cell2, output_keep_prob=(1.0 - dropout)) bw_cell2 = DropoutWrapper( bw_cell2, output_keep_prob=(1.0 - dropout)) (output_fw2, output_bw2), _ = bidirectional_dynamic_rnn( fw_cell2, bw_cell2, qq, dtype=tf.float32, sequence_length=q_len) qq_rnn_toobig = tf.concat([output_fw2, output_bw2], axis=2) qq_rnn = tf.layers.dense(qq_rnn_toobig, 50, activation=None) # equation 10 # how can i point-wise multiply xx_rnn and qq_rnn given their different sizes? xx_rnn_exp = tf.expand_dims(xx_rnn, axis=2) xx_rnn_tiled = tf.tile(xx_rnn_exp, [1, 1, JQ, 1]) qq_rnn_exp = tf.expand_dims(qq_rnn, axis=1) qq_rnn_tiled = tf.tile(qq_rnn_exp, [1, JX, 1, 1]) weights = tf.get_variable(name="weights", shape=[3*d, 1]) bScalar = tf.get_variable(name="bScalar", shape=[]) insideBrackets = tf.concat([xx_rnn_tiled, qq_rnn_tiled, tf.math.multiply( xx_rnn_tiled, qq_rnn_tiled)], axis=3) insideBracketsReshaped = tf.reshape(insideBrackets, [tf.shape(insideBrackets)[ 0] * tf.shape(insideBrackets)[1] * tf.shape(insideBrackets)[2], 3*d]) dotProductWithWeightsPlusScalar = tf.matmul( insideBracketsReshaped, weights) + bScalar dotProductWithWeightsReshaped = tf.reshape(dotProductWithWeightsPlusScalar, [tf.shape(insideBrackets)[ 0], tf.shape(insideBrackets)[1], tf.shape(insideBrackets)[2]]) p = tf.nn.softmax(dotProductWithWeightsReshaped, 2) p_exp = tf.expand_dims(p, axis=3) p_tiled = tf.tile(p_exp, [1, 1, 1, d]) # equation 9 qk_bar = tf.reduce_sum(tf.multiply(p_tiled, qq_rnn_tiled), axis=2) # plug qk_bar in place of qq_avg_tiled below xq = tf.concat([xx_rnn, qk_bar, xx_rnn * qk_bar], axis=2) # [N, JX, 3d] xq_flat = tf.reshape(xq, [-1, 3*d]) # [N * JX, 3*d] # Compute logits with tf.variable_scope('start'): logits1 = exp_mask(tf.reshape(tf.layers.dense( inputs=xq_flat, units=1), [-1, JX]), x_mask) # [N, JX] yp1 = tf.argmax(logits1, axis=1) # [N] with tf.variable_scope('stop'): logits2 = exp_mask(tf.reshape(tf.layers.dense( inputs=xq_flat, units=1), [-1, JX]), x_mask) # [N, JX] yp2 = tf.argmax(logits2, axis=1) # [N] outputs = {'logits1': logits1, 'logits2': logits2, 'yp1': yp1, 'yp2': yp2} variables = {'emb_mat': emb_mat} return variables, outputs
with graph.as_default():
    SEQUENCE_INPUT = tf.placeholder(tf.float32, shape=(None, None, 2))
    LABEL_INPUT = tf.placeholder(tf.int32, shape=(None, 1))
    LABEL_ONE_HOT = tf.one_hot(LABEL_INPUT, L_UNIQUE)
    DROPOUT_IN = tf.placeholder_with_default(1.0, ())

    batch_size = tf.shape(SEQUENCE_INPUT)[0]
    CELL_FW, CELL_FW_INIT = create_lstm(batch_size, 32, activation=tf.nn.relu)
    CELL_BW, CELL_BW_INIT = create_lstm(batch_size, 32, activation=tf.nn.relu)

    x = SEQUENCE_INPUT
    outputs, states = bidirectional_dynamic_rnn(
        cell_fw=CELL_FW, cell_bw=CELL_BW,
        initial_state_fw=CELL_FW_INIT, initial_state_bw=CELL_BW_INIT,
        dtype=tf.float32, inputs=x)
    output_fw, output_bw = outputs
    outputs = (output_fw[:, -1, :], output_bw[:, -1, :])
    x = tf.concat(outputs, -1)

    x = tf.nn.dropout(x, DROPOUT_IN)
    x = tf.contrib.layers.fully_connected(x, 256)
    x = tf.nn.dropout(x, DROPOUT_IN)
    x = tf.contrib.layers.fully_connected(x, L_UNIQUE, activation_fn=None)

    PREDICTION_TENSOR = tf.contrib.layers.softmax(x)
    PREDICTED_LABEL_T = tf.round(PREDICTION_TENSOR)
    EQUAL_T = tf.equal(PREDICTED_LABEL_T, LABEL_ONE_HOT)

def rnn_forward(config, inputs, scope=None): with tf.variable_scope(scope or "rnn"): JX, JQ = config.max_context_size, config.max_ques_size d = config.hidden_size x, x_len, q, q_len = [inputs[key] for key in ['x', 'x_len', 'q', 'q_len']] x_mask = tf.sequence_mask(x_len, JX) q_mask = tf.sequence_mask(q_len, JQ) # emb_mat = tf.get_variable('emb_mat', shape=[V, d]) emb_mat = config.emb_mat_ph if config.serve else config.emb_mat emb_mat = tf.slice(emb_mat, [2, 0], [-1, -1]) emb_mat = tf.concat( [tf.get_variable('emb_mat', shape=[2, d]), emb_mat], axis=0) xx = tf.nn.embedding_lookup(emb_mat, x, name='xx') # [N, JX, d] qq = tf.nn.embedding_lookup(emb_mat, q, name='qq') # [N, JQ, d] with tf.variable_scope("context_rnn"): # run context embeddings through GRU dropout = 0.1 fw_cell = GRUCell(64) bw_cell = GRUCell(64) if config.is_train: fw_cell = DropoutWrapper( fw_cell, output_keep_prob=(1.0 - dropout)) bw_cell = DropoutWrapper( bw_cell, output_keep_prob=(1.0 - dropout)) (output_fw, output_bw), _ = bidirectional_dynamic_rnn( fw_cell, bw_cell, xx, dtype=tf.float32) xx_rnn_toobig = tf.concat([output_fw, output_bw], axis=2) xx_rnn = tf.layers.dense(xx_rnn_toobig, 50, activation=None) with tf.variable_scope("question_rnn"): # run question embeddings through GRU dropout = 0.1 fw_cell2 = GRUCell(64) bw_cell2 = GRUCell(64) if config.is_train: fw_cell2 = DropoutWrapper( fw_cell2, output_keep_prob=(1.0 - dropout)) bw_cell2 = DropoutWrapper( bw_cell2, output_keep_prob=(1.0 - dropout)) (output_fw2, output_bw2), _ = bidirectional_dynamic_rnn( fw_cell2, bw_cell2, qq, dtype=tf.float32) qq_rnn_toobig = tf.concat([output_fw2, output_bw2], axis=2) qq_rnn = tf.layers.dense(qq_rnn_toobig, 50, activation=None) # equation 1 (averaging) qq_avg = tf.reduce_mean( bool_mask(qq_rnn, q_mask, expand=True), axis=1) # [N, d] qq_avg_exp = tf.expand_dims(qq_avg, axis=1) # [N, 1, d] qq_avg_tiled = tf.tile(qq_avg_exp, [1, JX, 1]) # [N, JX, d] xq = tf.concat([xx_rnn, qq_avg_tiled, xx_rnn * qq_avg_tiled], axis=2) # [N, JX, 3d] xq_flat = tf.reshape(xq, [-1, 3*d]) # [N * JX, 3*d] # Compute logits with tf.variable_scope('start'): logits1 = exp_mask(tf.reshape(tf.layers.dense( inputs=xq_flat, units=1), [-1, JX]), x_mask) # [N, JX] yp1 = tf.argmax(logits1, axis=1) # [N] with tf.variable_scope('stop'): logits2 = exp_mask(tf.reshape(tf.layers.dense( inputs=xq_flat, units=1), [-1, JX]), x_mask) # [N, JX] yp2 = tf.argmax(logits2, axis=1) # [N] outputs = {'logits1': logits1, 'logits2': logits2, 'yp1': yp1, 'yp2': yp2} variables = {'emb_mat': emb_mat} return variables, outputs
def attention_forward(config, inputs, scope=None): with tf.variable_scope(scope or "forward"): JX, JQ = config.max_context_size, config.max_ques_size d = config.hidden_size x, x_len, q, q_len = [inputs[key] for key in ['x', 'x_len', 'q', 'q_len']] x_mask = tf.sequence_mask(x_len, JX) q_mask = tf.sequence_mask(q_len, JQ) # emb_mat = tf.get_variable('emb_mat', shape=[V, d]) emb_mat = config.emb_mat_ph if config.serve else config.emb_mat emb_mat = tf.slice(emb_mat, [2, 0], [-1, -1]) emb_mat = tf.concat([tf.get_variable('emb_mat', shape=[2, d]), emb_mat], axis=0) xx = tf.nn.embedding_lookup(emb_mat, x, name='xx') # [N, JX, d] qq = tf.nn.embedding_lookup(emb_mat, q, name='qq') # [N, JQ, d] #now process xx and qq with this new matrices with tf.variable_scope('xx-encoder'): fw_xx_cell = GRUCell(d) fw_xx_cell = DropoutWrapper(cell=fw_xx_cell, output_keep_prob=config.keep_prob) bw_xx_cell = GRUCell(d) bw_xx_cell = DropoutWrapper(cell=bw_xx_cell, output_keep_prob=config.keep_prob) outputs_xx, _ = bidirectional_dynamic_rnn( fw_xx_cell, bw_xx_cell, xx, dtype=tf.float32) with tf.variable_scope('qq-encoder'): fw_qq_cell = GRUCell(d) fw_qq_cell = DropoutWrapper(cell=fw_qq_cell, output_keep_prob=config.keep_prob) bw_qq_cell = GRUCell(d) fw_xx_cell = DropoutWrapper(cell=fw_xx_cell, output_keep_prob=config.keep_prob) outputs_qq, _ = bidirectional_dynamic_rnn( fw_qq_cell, bw_qq_cell, qq, dtype=tf.float32) xx_fwbw=tf.concat(outputs_xx, 2 ) #[N,JX,2d] qq_fwbw=tf.concat(outputs_qq, 2) #[N,JQ,2d] qq_exp= tf.expand_dims(qq_fwbw, axis=2) # [N,JQ, 1, 2d] qq_tiled = tf.tile(qq_exp, [1,1, JX, 1]) # [N,JQ, JX, 2d] xx_exp= tf.expand_dims(xx_fwbw, axis=1) # [N, 1,JX, 2d] xx_tiled = tf.tile(xx_exp, [1,JQ, 1, 1]) # [N,JQ, JX, 2d] pre_pk= tf.concat([xx_tiled,qq_tiled, xx_tiled * qq_tiled], axis=-1) # [N,JQ,JX, 6d] pre_pk_flat=tf.reshape(pre_pk,[-1,6*d]) with tf.variable_scope('weights'): logits_p=tf.layers.dense(inputs=pre_pk_flat, units=1) print('logitsp shape:', logits_p.shape) logits_p=tf.reshape(logits_p, [-1,JQ,JX,1]) pk=tf.nn.softmax(logits_p,axis=1) #softmax along JQ print('logitsp shape after:', logits_p.shape) print('pk shape:', pk.shape) #now, get the new qs qq_rew=tf.reduce_sum(qq_tiled * pk,axis=1) #[N,JX,1]??? print('new weights shape', qq_rew.shape) #now we can resum as in previous methods xq = tf.concat([xx_fwbw, qq_rew, xx_fwbw *qq_rew ], axis=2) # [N, JX, 6d] xq_flat = tf.reshape(xq, [-1, 2*3*d]) # [N * JX, 3*d] # Compute logits with tf.variable_scope('start'): logits1 = exp_mask(tf.reshape(tf.layers.dense(inputs=xq_flat, units=1), [-1, JX]), x_mask) # [N, JX] yp1 = tf.argmax(logits1, axis=1) # [N] with tf.variable_scope('stop'): logits2 = exp_mask(tf.reshape(tf.layers.dense(inputs=xq_flat, units=1), [-1, JX]), x_mask) # [N, JX] yp2 = tf.argmax(logits2, axis=1) # [N] outputs = {'logits1': logits1, 'logits2': logits2, 'yp1': yp1, 'yp2': yp2} variables = {'emb_mat': emb_mat} return variables, outputs