def seq2seq_f(lstm_inputs, decoder_inputs, seq_length, do_decode):
    """Run a bidirectional LSTM encoder followed by an attention decoder.

    The encoder outputs are multiplied element-wise with the first
    ``seq_length`` entries of ``encoder_masks``, reshaped and concatenated
    into the attention states, and handed to ``embedding_attention_decoder``
    together with the concatenated final forward/backward encoder states.

    Args:
        lstm_inputs: inputs given to ``static_bidirectional_rnn``.
        decoder_inputs: inputs given to ``embedding_attention_decoder``.
        seq_length: how many leading ``encoder_masks`` entries to apply.
        do_decode: forwarded to the decoder as ``feed_previous``.

    Returns:
        The pair ``(outputs, attention_weights_history)`` from the decoder.
    """
    num_hidden = attn_num_layers * attn_num_hidden

    def _encoder_cell():
        # Both directions use an identically configured cell.
        return BasicLSTMCell(num_hidden, forget_bias=0.0, state_is_tuple=False)

    forward_cell = _encoder_cell()
    backward_cell = _encoder_cell()
    raw_outputs, final_state_fw, final_state_bw = \
        tf.contrib.rnn.static_bidirectional_rnn(
            forward_cell,
            backward_cell,
            lstm_inputs,
            initial_state_fw=None,
            initial_state_bw=None,
            dtype=tf.float32,
            sequence_length=None,
            scope=None)

    # Apply the masks before the outputs are used as attention memory.
    masked_outputs = [
        output * mask
        for output, mask in zip(raw_outputs, encoder_masks[:seq_length])
    ]
    # Each output carries forward + backward features, hence 2 * num_hidden.
    memory_slices = [
        array_ops.reshape(output, [-1, 1, num_hidden * 2])
        for output in masked_outputs
    ]
    attention_states = array_ops.concat(memory_slices, 1)
    initial_state = tf.concat(axis=1, values=[final_state_fw, final_state_bw])

    outputs, _unused_state, attention_weights_history = \
        embedding_attention_decoder(
            decoder_inputs,
            initial_state,
            attention_states,
            cell,
            num_symbols=target_vocab_size,
            embedding_size=target_embedding_size,
            num_heads=1,
            output_size=target_vocab_size,
            output_projection=None,
            feed_previous=do_decode,
            initial_state_attention=False,
            attn_num_hidden=attn_num_hidden)
    return outputs, attention_weights_history
def __init__(self, config):
    """Load the training data, build the vocabulary and create the variables.

    Args:
        config: configuration object; this method reads ``dim_hidden``,
            ``dim_video`` and ``learning_rate_decay_factor`` from it.
    """
    self._config = config
    self._kernel_size = 3

    self._train_data, _ = self.get_video_data()
    self._nr_training_examples = self._train_data.shape[0]
    self._train_data = self.shuffle_train_data(self._train_data)
    (self._word_to_index, self._index_to_word, self._bias_init_vector,
     self._caption_matrix, self._longest_caption) = \
        self.get_caption_dicts(self._train_data)
    self._nr_words = len(self._word_to_index)

    dim_hidden = self._config.dim_hidden
    dim_video = self._config.dim_video

    # tf.get_variable takes the variable *name* as its first positional
    # argument; the initial value therefore has to be supplied through
    # ``initializer=`` (passing it positionally together with ``name=``
    # raises "got multiple values for argument 'name'").
    self._W_emb = tf.get_variable(
        'W_emb',
        initializer=tf.random_uniform(
            [self._nr_words, dim_hidden], -0.1, 0.1))

    self._lstm = BasicLSTMCell(dim_hidden)

    self._encode_image_W = tf.get_variable(
        'encode_image_W',
        initializer=tf.random_uniform([dim_video, dim_hidden], -0.1, 0.1))
    self._encode_image_b = tf.get_variable(
        'encode_image_b',
        initializer=tf.zeros([dim_hidden]))

    self._embed_word_W = tf.get_variable(
        'embed_word_W',
        initializer=tf.random_uniform(
            [dim_hidden, self._nr_words], -0.1, 0.1))
    self._embed_word_b = tf.get_variable(
        'embed_word_b',
        initializer=tf.zeros([self._nr_words]))

    # NOTE(review): self.learning_rate is not assigned anywhere in this
    # method; presumably it is provided by the class or a base class --
    # confirm, otherwise this line raises AttributeError.
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * self._config.learning_rate_decay_factor)
def define_basic_lstm_cell(self):
    """Build the stacked LSTM cell described by ``self.config``.

    Each layer is a ``BasicLSTMCell`` of ``config.hidden_size`` units with
    ``forget_bias=0.0``. When ``self.is_training`` is set and
    ``config.keep_prob`` is below 1.0, every layer's output is wrapped in
    dropout.

    Returns:
        A ``MultiRNNCell`` with ``config.num_layers`` layers.
    """
    def make_layer():
        layer = BasicLSTMCell(self.config.hidden_size, forget_bias=0.0)
        if self.is_training and self.config.keep_prob < 1.0:
            layer = tf.nn.rnn_cell.DropoutWrapper(
                layer, output_keep_prob=self.config.keep_prob)
        return layer

    # Build a separate cell object per layer. Repeating a single instance
    # (``[cell] * n``) is rejected by TensorFlow 1.1 and silently shares
    # one set of weights between all layers from TensorFlow 1.2 on.
    layers = [make_layer() for _ in range(self.config.num_layers)]
    return tf.nn.rnn_cell.MultiRNNCell(layers)
class RecurrentController(BaseController):
    """Controller built around a single 256-unit ``BasicLSTMCell``."""

    def network_vars(self):
        """Create the LSTM cell and its zero initial state."""
        cell = BasicLSTMCell(256)
        self.lstm_cell = cell
        self.state = cell.zero_state(self.batch_size, tf.float32)

    def network_op(self, X, state):
        """Apply the LSTM cell to ``X`` (converted to a tensor) and ``state``."""
        inputs = tf.convert_to_tensor(X)
        return self.lstm_cell(inputs, state)

    def get_state(self):
        """Return the state created in ``network_vars``."""
        return self.state

    def update_state(self, new_state):
        """Return a no-op; ``new_state`` is not stored."""
        return tf.no_op()
def RNN(x, weights, biases):
    """Run an LSTM over ``x`` and project the last step's output.

    Args:
        x: input tensor; per the original author's note its shape is
            (batch_size, n_steps, n_input).
        weights: mapping whose ``'out'`` entry is the projection matrix.
        biases: mapping whose ``'out'`` entry is the projection bias.

    Returns:
        ``outputs[-1] @ weights['out'] + biases['out']``.
    """
    # The rnn function wants a list of n_steps tensors of shape
    # (batch_size, n_input): swap batch and step axes, flatten to
    # (n_steps * batch_size, n_input), then split along axis 0.
    time_major = tf.transpose(x, [1, 0, 2])
    flattened = tf.reshape(time_major, [-1, n_input])
    step_inputs = tf.split(0, n_steps, flattened)

    cell = BasicLSTMCell(n_hidden, forget_bias=1.0)
    step_outputs, _final_state = tf.nn.rnn(cell, step_inputs, dtype=tf.float32)

    # Linear activation on the output of the last step.
    projected = tf.matmul(step_outputs[-1], weights['out'])
    return projected + biases['out']
def __init__(self, features, batch_size):
    """Create an LSTM cell plus two non-trainable zero-initialised variables.

    Args:
        features: number of units of the cell and width of ``h`` / ``c``.
        batch_size: number of rows of ``h`` / ``c``.
    """
    self.cell = BasicLSTMCell(features)

    # ``h`` and ``c`` share one shape; ``h`` is created first, as before.
    state_shape = [batch_size, features]
    self.h = tf.Variable(tf.zeros(state_shape), trainable=False)
    self.c = tf.Variable(tf.zeros(state_shape), trainable=False)

    self.batch_size = batch_size
    self.features = features
def network_vars(self):
    """Create the 256-unit LSTM cell and its zero initial state."""
    cell = BasicLSTMCell(256)
    self.lstm_cell = cell
    self.state = cell.zero_state(self.batch_size, tf.float32)
def __init__(self, encoder_masks, encoder_inputs_tensor, decoder_inputs,
             target_weights, target_vocab_size, buckets,
             target_embedding_size, attn_num_layers, attn_num_hidden,
             forward_only, use_gru):
    """Create the model.

    Args:
      encoder_masks: sequence of masks; the first ``seq_length`` of them are
        multiplied element-wise with the encoder outputs.
      encoder_inputs_tensor: encoder inputs handed to ``model_with_buckets``.
      decoder_inputs: sequence of decoder inputs; the targets are these
        inputs shifted by one.
      target_weights: target weights handed to ``model_with_buckets``.
      target_vocab_size: size of the target vocabulary (used as the
        decoder's ``num_symbols`` and ``output_size``).
      buckets: bucket specification handed to ``model_with_buckets``.
      target_embedding_size: embedding size used by the decoder.
      attn_num_layers: number of decoder layers; the encoder LSTMs use
        ``attn_num_layers * attn_num_hidden`` units.
      attn_num_hidden: number of units in each decoder layer.
      forward_only: if set, the decoder feeds back its previous output
        (``feed_previous=True``).
      use_gru: if true, the decoder uses GRU cells instead of LSTM cells.
    """
    self.encoder_inputs_tensor = encoder_inputs_tensor
    self.decoder_inputs = decoder_inputs
    self.target_weights = target_weights
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.encoder_masks = encoder_masks

    # Create the internal multi-layer cell for our RNN.
    def make_decoder_cell():
        if use_gru:
            return GRUCell(attn_num_hidden)
        return BasicLSTMCell(attn_num_hidden, forget_bias=0.0,
                             state_is_tuple=False)

    if use_gru:
        print("using GRU CELL in decoder")

    if attn_num_layers > 1:
        # One cell object per layer: repeating a single instance
        # (``[cell] * n``) is rejected by TensorFlow 1.1 and shares one set
        # of weights between all layers from TensorFlow 1.2 on.
        cell = MultiRNNCell(
            [make_decoder_cell() for _ in xrange(attn_num_layers)],
            state_is_tuple=False)
    else:
        cell = make_decoder_cell()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(lstm_inputs, decoder_inputs, seq_length, do_decode):
        num_hidden = attn_num_layers * attn_num_hidden
        lstm_fw_cell = BasicLSTMCell(num_hidden, forget_bias=0.0,
                                     state_is_tuple=False)
        # Backward direction cell
        lstm_bw_cell = BasicLSTMCell(num_hidden, forget_bias=0.0,
                                     state_is_tuple=False)

        pre_encoder_inputs, output_state_fw, output_state_bw = \
            tf.contrib.rnn.static_bidirectional_rnn(
                lstm_fw_cell, lstm_bw_cell, lstm_inputs,
                initial_state_fw=None, initial_state_bw=None,
                dtype=tf.float32, sequence_length=None, scope=None)

        # Mask the encoder outputs before they become attention memory.
        encoder_inputs = [
            e * f
            for e, f in zip(pre_encoder_inputs, encoder_masks[:seq_length])
        ]
        top_states = [
            array_ops.reshape(e, [-1, 1, num_hidden * 2])
            for e in encoder_inputs
        ]
        attention_states = array_ops.concat(top_states, 1)
        initial_state = tf.concat(
            axis=1, values=[output_state_fw, output_state_bw])

        outputs, _, attention_weights_history = embedding_attention_decoder(
            decoder_inputs, initial_state, attention_states, cell,
            num_symbols=target_vocab_size,
            embedding_size=target_embedding_size,
            num_heads=1,
            output_size=target_vocab_size,
            output_projection=None,
            feed_previous=do_decode,
            initial_state_attention=False,
            attn_num_hidden=attn_num_hidden)
        return outputs, attention_weights_history

    # Our targets are decoder inputs shifted by one.
    targets = [decoder_inputs[i + 1]
               for i in xrange(len(decoder_inputs) - 1)]

    # None defaults to tf.nn.sparse_softmax_cross_entropy_with_logits.
    softmax_loss_function = None

    # Training outputs and losses. In forward-only mode the decoder feeds
    # back its own previous output; otherwise it reads decoder_inputs.
    do_decode = bool(forward_only)
    self.outputs, self.losses, self.attention_weights_histories = \
        model_with_buckets(
            encoder_inputs_tensor, decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y, z: seq2seq_f(x, y, z, do_decode),
            softmax_loss_function=softmax_loss_function)
# Look up the embedding of every question type (keys are lower-cased).
for question_type in Questions_Types:
    Questions_Types_embedding.append(
        embeddings_index[question_type.lower()])

# Question Type Encoder
# The question type embeddings e_ti are passed to the fully connected
# network to get the internal vector representation qt_i.
Topic_internal_vector = FC(Questions_Types_embedding, 7)

# Number of hidden units.
from tensorflow.python.ops.rnn_cell import BasicLSTMCell
encoder_hidden_units = 600
decoder_hidden_units = 600

# BLSTM encoder encoding the question topic.
forward_topics_encoder_cell = BasicLSTMCell(encoder_hidden_units)
backward_topics_encoder_cell = BasicLSTMCell(encoder_hidden_units)

topics_outputs, topics_final_states = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=forward_topics_encoder_cell,
    cell_bw=backward_topics_encoder_cell,
    inputs=Topics,
    dtype=tf.float32,
    time_major=True)
# Outputs of the BLSTM, one tensor per direction.
encoder_topics_fw_outputs, encoder_topics_bw_outputs = topics_outputs
# Last hidden state of the BLSTM, one per direction.
encoder_topics_fw_final_state, encoder_topics_bw_final_state = \
    topics_final_states
def _build_forward(self):
    """Build the forward graph and store ``self.logits`` and ``self.yp``.

    Steps, in order: character-CNN and word embeddings for ``self.x`` and
    ``self.q``, a position-tag embedding for ``self.tx``, a shared
    bidirectional LSTM over context and question, a tree RNN over the
    embedded ``self.tx`` inputs, and a linear layer on ``h * u`` whose
    masked output is soft-maxed.
    """
    config = self.config
    N, M, JX, JQ, VW, d, dc, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.hidden_size, \
        config.char_emb_size, config.max_word_size
    # VC was used below without ever being bound in this function; take it
    # from the config like the word vocabulary size.
    VC = config.char_vocab_size
    H = config.max_tree_height

    x_mask = self.x > 0
    q_mask = self.q > 0
    tx_mask = self.tx > 0  # [N, M, H, JX]

    with tf.variable_scope("char_emb"):
        char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
        Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
        Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]

        # Local renamed from ``filter`` so the builtin is not shadowed; the
        # variable itself keeps the name "filter".
        conv_filter = tf.get_variable(
            "filter", shape=[1, config.char_filter_height, dc, d], dtype='float')
        bias = tf.get_variable("bias", shape=[d], dtype='float')
        strides = [1, 1, 1, 1]
        Acx = tf.reshape(Acx, [-1, JX, W, dc])
        Acq = tf.reshape(Acq, [-1, JQ, W, dc])
        xxc = tf.nn.conv2d(Acx, conv_filter, strides, "VALID") + bias  # [N*M, JX, W/filter_stride, d]
        qqc = tf.nn.conv2d(Acq, conv_filter, strides, "VALID") + bias  # [N, JQ, W/filter_stride, d]
        # Max-pool over the character axis.
        xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d])
        qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d])

    with tf.variable_scope("word_emb"):
        if config.mode == 'train':
            word_emb_mat = tf.get_variable(
                "word_emb_mat", dtype='float',
                shape=[VW, config.word_emb_size],
                initializer=get_initializer(config.emb_mat))
        else:
            word_emb_mat = tf.get_variable(
                "word_emb_mat", shape=[VW, config.word_emb_size], dtype='float')
        Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
        Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]

    xx = tf.concat(3, [xxc, Ax])  # [N, M, JX, 2d]
    qq = tf.concat(2, [qqc, Aq])  # [N, JQ, 2d]
    D = d + config.word_emb_size

    with tf.variable_scope("pos_emb"):
        pos_emb_mat = tf.get_variable(
            "pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float')
        Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx)  # [N, M, H, JX, d]

    cell = BasicLSTMCell(D, state_is_tuple=True)
    cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("rnn"):
        (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
            cell, cell, xx, x_len, dtype='float', scope='start')  # [N, M, JX, 2d]
        # The question is encoded with the same weights as the context.
        tf.get_variable_scope().reuse_variables()
        (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(
            cell, cell, qq, q_len, dtype='float', scope='start')  # [N, J, d], [N, d]
        u = (fw_u + bw_u) / 2.0
        h = (fw_h + bw_h) / 2.0

    with tf.variable_scope("h"):
        no_op_cell = NoOpCell(D)
        tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max)
        initial_state = tf.reshape(h, [N*M*JX, D])  # [N*M*JX, D]
        inputs = tf.concat(4, [Atx, tf.cast(self.tx_edge_mask, 'float')])  # [N, M, H, JX, d+JX]
        inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]), [N*M*JX, H, d + JX])  # [N*M*JX, H, d+JX]
        length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N*M*JX])
        h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state)  # [N*M*JX, H, D]
        h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4])  # [N, M, H, JX, D]

    u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1)  # [N, 1, 1, 1, 4d]
    dot = linear(h * u, 1, True, squeeze=True, scope='dot')  # [N, M, H, JX]
    self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX])  # [N, M, H, JX]
    self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
def __init__(self, kwd_voc_size, *args, **kwargs):
    """Initialise the underlying ``BasicLSTMCell`` and record ``kwd_voc_size``.

    Args:
        kwd_voc_size: stored as ``self.key_words_voc_size``.
        *args: forwarded unchanged to ``BasicLSTMCell.__init__``.
        **kwargs: forwarded unchanged to ``BasicLSTMCell.__init__``.
    """
    # The base initialiser runs first; the extra attribute is set afterwards.
    BasicLSTMCell.__init__(self, *args, **kwargs)
    self.key_words_voc_size = kwd_voc_size
# One-hot encode the input ids, then split inputs and targets into one
# tensor per sequence position along axis 1.
in_onehot = tf.one_hot(in_ph, vocab_size, name="input_onehot")
inputs = tf.split(in_onehot, sequence_length, axis=1)
inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
targets = tf.split(targ_ph, sequence_length, axis=1)

# at this point, inputs is a list of length sequence_length
# each element of inputs is [batch_size,vocab_size]
# targets is a list of length sequence_length
# each element of targets is a 1D vector of length batch_size

# ------------------
# YOUR COMPUTATION GRAPH HERE

# Two-layer LSTM, each layer with its own cell object.
cell0 = BasicLSTMCell(state_dim)
cell1 = BasicLSTMCell(state_dim)
multi_cell = tf.contrib.rnn.MultiRNNCell([cell0, cell1])
initial_state = multi_cell.zero_state(batch_size, tf.float32)

#######################################################################
# and add an encoder for the decoder...
# NOTE(review): rnn_decoder is documented as returning an
# (outputs, state) pair; here the whole return value is reshaped --
# confirm whether only the outputs were meant.
seq_out = tf.contrib.legacy_seq2seq.rnn_decoder(inputs, initial_state, multi_cell)
# NOTE(review): the target shape (50, 50, 128, 1) is hard-coded;
# presumably it mirrors sequence_length / batch_size / state_dim -- verify.
seq_out = tf.reshape(seq_out, (50, 50, 128, 1))

# The same variance-scaling initializer is used for weights and biases.
initializer = tf.contrib.layers.variance_scaling_initializer()
logits = tf.contrib.layers.fully_connected(seq_out, vocab_size, activation_fn=None, weights_initializer=initializer, biases_initializer=initializer)
#logits =
def _build_forward(self):
    """Build the forward graph and store ``self.logits`` and ``self.yp``.

    Steps, in order: word-embedding lookup for ``self.x`` and ``self.q``,
    an optional highway network, a bidirectional LSTM over both inputs,
    an attention layer producing ``p0`` / ``p1``, a second bidirectional
    LSTM over each of them, max/sum pooling, and a 2-way linear layer on
    ``[g0, g1, |g0 - g1|, g0 * g1]`` followed by a softmax.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The static config sizes are replaced by the dynamic tensor shapes.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows below the trained ones.
                    word_emb_mat = tf.concat(
                        0, [word_emb_mat, self.new_emb_mat])
            print(word_emb_mat.get_shape().as_list())

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            xx = Ax
            qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # The second call reuses the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.share_lstm_weights:
            # Same scope name ('u1') plus reuse: shares the 'u1' weights.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # NOTE(review): this branch never assigns p1, yet p1 is
            # reshaped below -- confirm dynamic_att is supported here.
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0, p1 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # with tf.variable_scope("activate"):
        #     p0 = tf.nn.relu(_linear(tf.reshape(p0,[-1,1200]),300,bias=0.01,bias_start=0.0,scope='relu'))
        #     if config.share_lstm_weights:
        #         tf.get_variable_scope().reuse_variables()
        #     p1 = tf.nn.relu(_linear(tf.reshape(p1,[-1,1200]),300,bias=0.01,bias_start=0.0,scope='relu'))

        with tf.variable_scope('two_lstm'):
            # NOTE(review): the last dimension 300 is hard-coded -- confirm
            # it matches the width produced by attention_layer.
            p0 = tf.reshape(p0, [N, 1, -1, 300])
            p1 = tf.reshape(p1, [N, 1, -1, 300])
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(3, [fw_g0, bw_g0])
            q_len_new = tf.tile(tf.expand_dims(q_len, 1), [1, M])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
            # Same scope name ('g0') as the first call above.
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p1, q_len_new, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g1 = tf.concat(3, [fw_g1, bw_g1])

        # with tf.variable_scope('two_lstm_1'):
        #     (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        #     g2 = tf.concat(3, [fw_g2, bw_g2])
        #     q_len_new = tf.tile(tf.expand_dims(q_len,1),[1,M])
        #     if config.share_lstm_weights:
        #         tf.get_variable_scope().reuse_variables()
        #     (fw_g3, bw_g3), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g1, q_len_new, dtype='float', scope='g0')  # [N, M, JX, 2d]
        #     g3 = tf.concat(3, [fw_g3, bw_g3])

        # Pool each sequence: max over axis 2, then sum over axis 1.
        g0 = tf.reduce_sum(tf.reduce_max(g0, 2), 1)
        g1 = tf.reduce_sum(tf.reduce_max(g1, 2), 1)
        # Two-way logits from both vectors, their absolute difference and
        # their element-wise product.
        logits = _linear([g0, g1, tf.abs(tf.subtract(g0, g1)), g0 * g1], 2, bias=0.01, bias_start=0.0, scope='logits1')
        flat_logits2 = tf.reshape(logits, [N, 2])
        yp = tf.nn.softmax(flat_logits2)  # [N, 2]
        self.tensor_dict['g0'] = g0
        self.tensor_dict['g1'] = g1
        self.logits = flat_logits2
        self.yp = yp
def _build_forward(self):
    """Build the forward graph and store the two logit / probability pairs.

    Steps, in order: optional character-CNN and word embeddings for
    ``self.x`` and ``self.q``, an optional highway network, one or two
    bidirectional LSTM "prepro" layers, attention (``p0``), two modelling
    LSTM layers (``g0``, ``g1``), then -- depending on ``config.late`` --
    a third LSTM layer (``g2``) and two ``get_logits`` heads. The results
    are stored in ``self.logits``, ``self.logits2``, ``self.yp`` and
    ``self.yp2``.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The static config sizes are replaced by the dynamic tensor shapes.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Both settings are comma-separated integer lists.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Same scope name ("xx") plus reuse: shared filters.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat", dtype='float', shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows below the trained ones.
                    word_emb_mat = tf.concat(
                        0, [word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Character and word features are concatenated per token.
                xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # The second call reuses the variables created by the first.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.two_prepro_layers:
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
                d_cell, d_cell, u, q_len, dtype='float', scope='u2')  # [N, J, d], [N, d]
            u = tf.concat(2, [fw_u, bw_u])
        if config.share_lstm_weights:
            # Same scope names ('u1' / 'u2') plus reuse: the context is
            # encoded with the question encoder's weights.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            if config.two_prepro_layers:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, h, x_len, dtype='float', scope='u2')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            if config.two_prepro_layers:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, h, x_len, dtype='float', scope='h2')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])

        if config.late:
            # "late" variant: g2 is computed first and both heads read
            # [g1, g2, p0].
            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, tf.concat(3, [g1, p0]), x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])
            # logits2 = u_logits(config, self.is_train, tf.concat(3, [g1, a1i]), u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits2")

            logits = get_logits([g1, g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')

            if config.feed_gt:
                # While training, replace the logits by the log of self.y.
                logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                logits = tf.cond(self.is_train, lambda: logy, lambda: logits)
            if config.feed_hard:
                # Outside training, replace the logits by a one-hot argmax.
                hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1)
                hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX), [N, M, JX])  # [N, M, JX]
                logits = tf.cond(self.is_train, lambda: logits, lambda: hard_logits)

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])

            logits2 = get_logits([g1, g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
        else:
            logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
            # a1i is computed from the logits before feed_gt / feed_hard
            # may overwrite them.
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))

            if config.feed_gt:
                # While training, replace the logits by the log of self.y.
                logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                logits = tf.cond(self.is_train, lambda: logy, lambda: logits)
            if config.feed_hard:
                # Outside training, replace the logits by a one-hot argmax.
                hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1)
                hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX), [N, M, JX])  # [N, M, JX]
                logits = tf.cond(self.is_train, lambda: logits, lambda: hard_logits)

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])

            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])
            yp_aug = tf.expand_dims(yp, -1)
            g1yp = g1 * yp_aug

            # Select which first-head summary is fed to the g2 layer.
            if config.prev_mode == 'a':
                prev = a1i
            elif config.prev_mode == 'y':
                prev = yp_aug
            elif config.prev_mode == 'gy':
                prev = g1yp
            else:
                raise Exception()

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, tf.concat(3, [p0, g1, prev, g1 * prev]), x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])
            # logits2 = u_logits(config, self.is_train, tf.concat(3, [g1, a1i]), u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits2")
            logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')

            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
inputs = [tf.squeeze(input_, [1]) for input_ in inputs] targets = tf.split(1, sequence_length, targ_ph) # at this point, inputs is a list of length sequence_length # each element of inputs is [batch_size,vocab_size] # targets is a list of length sequence_length # each element of targets is a 1D vector of length batch_size # ------------------ # YOUR COMPUTATION GRAPH HERE with tf.variable_scope("Graph_") as scope: # create a BasicLSTMCell cell = BasicLSTMCell(state_dim, state_is_tuple=True) # use it to create a MultiRNNCell #tf.nn.rnn_cell.MultiRNNCell.__init__(cells, state_is_tuple=False) stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell] * 2, state_is_tuple=True) #stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell]*2) # use it to create an initial_state initial_state = stacked_lstm.zero_state(batch_size, tf.float32) #note that initial_state will be a *list* of tensors! # call seq2seq.rnn_decoder # rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, scope=None): #outputs, state = tf.nn.seq2seq.rnn_decoder(inputs, initial_state, stacked_lstm, loop_function=None, scope=None) outputs, state = tf.nn.seq2seq.rnn_decoder(inputs, initial_state, stacked_lstm, loop_function=None,
def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=True, num_samples=512,
             forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the
        model, and output dropout is disabled.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
        w = tf.get_variable("proj_w", [size, self.target_vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.target_vocab_size])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                              num_samples,
                                              self.target_vocab_size)
        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    # Dropout is a training-time regulariser: a forward-only (decoding)
    # graph keeps every activation so that its outputs are deterministic.
    output_keep_prob = tf.constant(1.0 if forward_only else 0.8)

    def make_layer():
        base_cell = BasicLSTMCell(size) if use_lstm else GRUCell(size)
        # Add dropout layer for regularization.
        return DropoutWrapper(base_cell, output_keep_prob=output_keep_prob)

    if num_layers > 1:
        # One cell object per layer; repeating a single instance is rejected
        # or turned into weight sharing by newer TensorFlow releases.
        cell = MultiRNNCell([make_layer() for _ in xrange(num_layers)])
    else:
        cell = make_layer()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.nn.seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [
        self.decoder_inputs[i + 1]
        for i in xrange(len(self.decoder_inputs) - 1)
    ]

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
            for b in xrange(len(buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[b]
                ]
    else:
        self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
def _build_forward(self):
    """Build the forward graph and publish its output tensors on ``self``.

    Stages, in order: optional character-CNN and word embeddings of the
    context (``self.cx`` / ``self.x``) and question (``self.cq`` /
    ``self.q``); an optional highway network; a bidirectional LSTM over
    question and context ("prepro"); an attention step; three further
    bidirectional LSTMs (scopes ``g0``, ``g1``, ``g2``); and two logit
    heads (scopes ``logits1``, ``logits2``).

    Reads ``self.config``, ``self.x``, ``self.q``, ``self.cx``, ``self.cq``,
    ``self.x_mask``, ``self.q_mask``, ``self.is_train`` and
    ``self.new_emb_mat``.

    Writes ``self.tensor_dict`` entries ``x, q, xx, qq, u, h, g1, g2`` and
    the attributes ``self.logits``, ``self.logits2``, ``self.yp``,
    ``self.yp2``.
    """
    config = self.config
    N = config.batch_size
    M = config.max_num_sents
    JX = config.max_sent_size
    JQ = config.max_ques_size
    VW = config.word_vocab_size
    VC = config.char_vocab_size
    W = config.max_word_size
    d = config.hidden_size

    # The static config values of JX, JQ and M are immediately replaced by
    # the dynamic sizes of the fed tensors.
    JX = tf.shape(self.x)[2]  # axis 2 of x: sentence length
    JQ = tf.shape(self.q)[1]  # axis 1 of q: question length
    M = tf.shape(self.x)[1]  # axis 1 of x: number of sentences
    # dc: per-character embedding size, dw: word embedding size,
    # dco: total number of char-CNN output channels.
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                # One dc-sized vector per character id.
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                # Map every character id to its embedding vector.
                Acx = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cq)  # [N, JQ, W, dc]
                # Flatten the leading axes so the convolution sees one
                # sequence of words per row.
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Comma-separated config strings: output channels per
                # filter group and the matching filter heights.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # The channel counts must add up to the configured
                # char-CNN output size.
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx,
                                      filter_sizes,
                                      heights,
                                      "VALID",
                                      self.is_train,
                                      config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" filters for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="qq")
                    # Restore the sentence axis dropped by the flattening
                    # above; xx and qq are now the char-CNN word features.
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(
                        qq, [-1, JQ, dco
                             ])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Training initialises the table from config.emb_mat.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra embedding rows fed via
                    # self.new_emb_mat below the trained table.
                    word_emb_mat = tf.concat(
                        [word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat,
                                            self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat,
                                            self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char-CNN and word features per token.
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Highway network; the question pass reuses the context pass's weights.
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(
        d, state_is_tuple=True)  # d hidden units
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths: number of set mask entries along the word axis.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        # fw_u_f / bw_u_f (the final states) are unpacked but not used.
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(
            [fw_u, bw_u],
            2)  # forward and backward outputs joined: last axis is 2d
        if config.share_lstm_weights:
            # The context encoder reuses the question encoder's "u1"
            # variables. Note it is given `cell`, not the wrapped `d_cell`.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u  # question encoding
        self.tensor_dict['h'] = h  # context encoding

    with tf.variable_scope("main"):
        if config.dynamic_att:
            # TODO: document what "dynamic attention" is meant to do.
            p0 = h
            # Repeat the question encoding and its mask once per sentence,
            # then fold the sentence axis into the batch axis.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            # NOTE(review): these two assignments are read as belonging to
            # this else-branch (otherwise the AttentionCell above would be
            # overwritten) - confirm against the original layout.
            cell2 = BasicLSTMCell(
                d, state_is_tuple=True)  # d hidden units
            first_cell = SwitchableDropoutWrapper(
                cell2, self.is_train, input_keep_prob=config.input_keep_prob)

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell,
            first_cell,
            inputs=p0,
            sequence_length=x_len,
            dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat([fw_g0, bw_g0], 3)

        # Each subsequent layer gets its own freshly built cell.
        cell3 = BasicLSTMCell(
            d, state_is_tuple=True)  # d hidden units
        first_cell3 = SwitchableDropoutWrapper(
            cell3, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell3, first_cell3, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)

        logits = get_logits([g1, p0],
                            d,
                            True,
                            wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask,
                            is_train=self.is_train,
                            func=config.answer_func,
                            scope='logits1')
        # Summary of g1 selected by the first logits, then repeated for
        # every context position so it can be concatenated below.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        cell4 = BasicLSTMCell(
            d, state_is_tuple=True)  # d hidden units
        first_cell4 = SwitchableDropoutWrapper(
            cell4, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            first_cell4,
            first_cell4,
            tf.concat([p0, g1, a1i, g1 * a1i], 3),
            x_len,
            dtype='float',
            scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat([fw_g2, bw_g2], 3)
        logits2 = get_logits([g2, p0],
                             d,
                             True,
                             wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask,
                             is_train=self.is_train,
                             func=config.answer_func,
                             scope='logits2')

        # Softmax over all context positions (sentence and word axes
        # flattened together), then restore the [M, JX] layout.
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def decode(self, knowledge_rep, masks, is_train, hparams):
    """Turn the fused paragraph/question representation into two
    distributions over paragraph positions.

    Three bidirectional LSTM layers (scopes ``g0``, ``g1``, ``g2``) are
    stacked on ``knowledge_rep``. The first logits come from the ``g1``
    outputs; the second logits come from ``g2``, whose input additionally
    contains a summary of ``g1`` selected by the first logits.

    :param knowledge_rep: representation produced by the encoder; used as
        the input of the first layer and as a feature of both logit heads.
    :param masks: pair ``(p_mask, q_mask)``; each is summed along axis 1 to
        obtain sequence lengths.
    :param is_train: forwarded to the dropout wrappers and ``linear_logits``.
    :param hparams: object providing ``batch_size`` and ``input_keep_prob``.
    :return: ``((yp1, flat_logits1), (yp2, flat_logits2))`` - softmax
        output and raw logits of each head, reshaped to ``[-1, JX]``.
    """
    p_mask, q_mask = masks
    p0 = knowledge_rep
    n_batch = hparams.batch_size
    keep_prob = hparams.input_keep_prob
    n_hidden = self.state_size

    def dropout_lstm():
        # A fresh LSTM cell wrapped with train-time-switchable dropout.
        lstm = BasicLSTMCell(n_hidden, state_is_tuple=True)
        return SwitchableDropoutWrapper(lstm,
                                        is_train,
                                        input_keep_prob=keep_prob)

    p_len = tf.reduce_sum(tf.cast(p_mask, 'int32'), 1)  # [N]
    # Not consumed below; still built so the graph holds the same ops.
    q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]
    JX = tf.shape(p_mask)[1]

    with tf.variable_scope("main"):
        # NOTE(review): layers g0/g1 call `_bidirectional_dynamic_rnn`
        # while g2 calls `bidirectional_dynamic_rnn` - confirm this
        # difference is intended.
        layer0 = dropout_lstm()
        g0_outputs, _ = _bidirectional_dynamic_rnn(layer0,
                                                   layer0,
                                                   p0,
                                                   p_len,
                                                   dtype='float',
                                                   scope='g0')
        g0_fw, g0_bw = g0_outputs
        g0 = tf.concat([g0_fw, g0_bw], 2)  # [N, JX, 2d]

        layer1 = dropout_lstm()
        g1_outputs, _ = _bidirectional_dynamic_rnn(layer1,
                                                   layer1,
                                                   g0,
                                                   p_len,
                                                   dtype='float',
                                                   scope='g1')
        g1_fw, g1_bw = g1_outputs
        g1 = tf.concat([g1_fw, g1_bw], 2)  # [N, JX, 2d]

        logits = linear_logits([g1, p0],
                               n_hidden,
                               0.0,
                               scope='logits1',
                               mask=p_mask,
                               is_train=is_train)

        # TODO use batch _size
        g1_fixed = tf.reshape(g1, [n_batch, JX, 2 * n_hidden])
        logits_fixed = tf.reshape(logits, [n_batch, JX])
        a1i = softsel(g1_fixed, logits_fixed)
        # Repeat the selected summary at every paragraph position.
        a1i = tf.tile(tf.expand_dims(a1i, 1), [1, JX, 1])

        flat_logits1 = tf.reshape(logits, [-1, JX])
        probs1 = tf.nn.softmax(flat_logits1)
        yp1 = tf.reshape(probs1, [-1, JX])

        layer2 = dropout_lstm()
        g2_inputs = tf.concat([p0, g1, a1i, g1 * a1i], 2)
        g2_outputs, _ = bidirectional_dynamic_rnn(layer2,
                                                  layer2,
                                                  g2_inputs,
                                                  p_len,
                                                  dtype='float',
                                                  scope='g2')
        g2_fw, g2_bw = g2_outputs
        g2 = tf.concat([g2_fw, g2_bw], 2)

        logits2 = linear_logits([g2, p0],
                                n_hidden,
                                0.0,
                                scope='logits2',
                                mask=p_mask,
                                is_train=is_train)
        flat_logits2 = tf.reshape(logits2, [-1, JX])
        probs2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(probs2, [-1, JX])

    return (yp1, flat_logits1), (yp2, flat_logits2)
def encode(self, inputs, masks, encoder_state_input, is_train, hparams):
    """Encode question and context with a shared BiLSTM and fuse them
    with a bidirectional attention layer.

    :param inputs: pair ``(context_embed, question_embed)``.
    :param masks: pair ``(p_mask, q_mask)``; each is summed along axis 1 to
        obtain the sequence lengths, so the RNNs skip masked steps.
    :param encoder_state_input: accepted for interface compatibility; this
        implementation does not use it.
    :param is_train: forwarded to the dropout wrapper and ``linear_logits``.
    :param hparams: object providing ``input_keep_prob``.
    :return: ``p0``, the concatenation along axis 2 of
        ``[h, u_a, h * u_a, h * h_a]`` where ``h`` is the context encoding,
        ``u_a`` the question summary attended per context position and
        ``h_a`` the context summary tiled over all positions.
    """
    context_embed, question_embed = inputs
    p_mask, q_mask = masks

    # hparams.batch_size is deliberately not read here: nothing below needs
    # a static batch size (a stray trailing comma used to bind it as an
    # unused 1-tuple).
    input_keep_prob = hparams.input_keep_prob

    cell = BasicLSTMCell(self.size, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(cell,
                                      is_train,
                                      input_keep_prob=input_keep_prob)

    p_len = tf.reduce_sum(tf.cast(p_mask, 'int32'), 1)  # [N]
    q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope('prepro'):
        # Question encoder; final states are not needed.
        (fw_u, bw_u), _ = bidirectional_dynamic_rnn(d_cell,
                                                    d_cell,
                                                    question_embed,
                                                    q_len,
                                                    dtype='float',
                                                    scope='u1')

        # The context encoder reuses the question encoder's "u1" weights.
        # Note it is given `cell`, not the dropout-wrapped `d_cell`.
        tf.get_variable_scope().reuse_variables()
        (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell,
                                                    cell,
                                                    context_embed,
                                                    p_len,
                                                    dtype='float',
                                                    scope='u1')

        u = tf.concat([fw_u, bw_u], 2)  # question encoding
        h = tf.concat([fw_h, bw_h], 2)  # context encoding

    # Attention Layer
    with tf.variable_scope('attention_layer'):
        JQ = tf.shape(u)[1]
        JX = tf.shape(h)[1]

        # Broadcast both encodings (and their masks) to a common
        # [N, JX, JQ, ...] grid so every context/question pair is scored.
        h_aug = tf.tile(tf.expand_dims(h, 2), [1, 1, JQ, 1])
        u_aug = tf.tile(tf.expand_dims(u, 1), [1, JX, 1, 1])
        h_mask_aug = tf.tile(tf.expand_dims(p_mask, 2), [1, 1, JQ])
        u_mask_aug = tf.tile(tf.expand_dims(q_mask, 1), [1, JX, 1])
        # A pair is valid only when both of its positions are unmasked.
        hu_mask = tf.cast(h_mask_aug, tf.bool) & tf.cast(u_mask_aug, tf.bool)

        hu_aug = h_aug * u_aug
        u_logits = linear_logits([h_aug, u_aug, hu_aug],
                                 True,
                                 scope='u_logits',
                                 mask=hu_mask,
                                 is_train=is_train)

        # Question summary per context position.
        u_a = softsel(u_aug, u_logits)
        # Context summary selected by each position's best question score,
        # then repeated for every context position.
        h_a = softsel(h, tf.reduce_max(u_logits, 2))
        h_a = tf.tile(tf.expand_dims(h_a, 1), [1, JX, 1])

        p0 = tf.concat([h, u_a, h * u_a, h * h_a], 2)

    return p0
# Cut the one-hot input along the time axis and drop the singleton time
# dimension, giving `sequence_length` tensors of [batch_size, vocab_size].
inputs = [
    tf.squeeze(step_input, [1])
    for step_input in tf.split(1, sequence_length, in_onehot)
]
# Targets are cut the same way: one piece per timestep.
targets = tf.split(1, sequence_length, targ_ph)

# ------------------
# COMPUTATION GRAPH

# One BasicLSTMCell, repeated `num_layers` times in the stack (the same
# cell object is used for every layer).
cell = BasicLSTMCell(state_dim)
layer_cells = [cell for _ in range(num_layers)]
stacked_lstm = tf.nn.rnn_cell.MultiRNNCell(layer_cells, state_is_tuple=True)

initial_state = stacked_lstm.zero_state(batch_size, tf.float32)

# Run the stacked cell over the per-timestep inputs.
outputs, final_state = tf.nn.seq2seq.rnn_decoder(inputs, initial_state,
                                                 stacked_lstm)

# Bias-free linear projection from the LSTM state to the vocabulary,
# applied to every timestep's output.
W = tf.Variable(tf.truncated_normal([state_dim, vocab_size], stddev=0.1))
logits = []
for i in outputs:
    logits.append(tf.matmul(i, W))
def __init__(self, num_units, forget_bias=1.0, input_size=None):
    """Initialise the underlying BasicLSTMCell and clear the cached
    ``matrix`` / ``bias`` attributes.

    All three arguments are passed straight through to
    ``BasicLSTMCell.__init__``.
    """
    parent_kwargs = {'forget_bias': forget_bias, 'input_size': input_size}
    BasicLSTMCell.__init__(self, num_units, **parent_kwargs)
    # Both start unset.
    self.matrix = None
    self.bias = None
def _build_forward(self):
    """Build the forward graph and publish its output tensors on ``self``.

    Stages, in order: embedding (optional char-CNN and word embeddings),
    optional highway network, contextual BiLSTM encoding ("prepro"),
    attention, further BiLSTM layers (``g0``, ``g1``, ``g2``) and two
    logit heads.

    Writes ``self.tensor_dict`` entries ``x, q, xx, qq, u, h, g1, g2`` and
    the attributes ``self.logits``, ``self.logits2``, ``self.yp``,
    ``self.yp2``.
    """
    # config holds the pre-configured hyperparameters.
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The static JX, JQ and M are replaced by the dynamic tensor sizes.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    # Embedding layer
    with tf.variable_scope("emb"):
        # Character embedding layer
        if config.use_char_emb:  # only when character embeddings are enabled
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # CNN filter parameters (comma-separated config strings)
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx,
                                      filter_sizes,
                                      heights,
                                      "VALID",
                                      self.is_train,
                                      config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" filters for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word embedding layer
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:  # when using the pre-trained word-embedding file
                    word_emb_mat = tf.concat(
                        0, [word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                # Convert the context x and the query q to word vectors.
                # embedding_lookup(params, ids) returns the rows of params
                # selected by ids.
                Ax = tf.nn.embedding_lookup(word_emb_mat,
                                            self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat,
                                            self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # With character embeddings enabled, concatenate the char
                # and word embeddings along the feature axis.
                xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # Highway network (config.highway_num_layers layers; two according to
    # the original note) producing the context and query vectors.
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            # The question pass reuses the context pass's weights.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    # SwitchableDropoutWrapper is a project-defined DropoutWrapper class.
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # reduce_sum adds up the mask along the given axis (the count of
    # non-padding entries of x and q); cast converts the mask to int32.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    # Contextual Embedding Layer: a BiLSTM is applied to X and Q separately
    # to capture the local relations between the words of each.
    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        # fw_u and bw_u are the outputs of the two LSTM directions.
        u = tf.concat(2, [fw_u, bw_u])  # [N, J, 2d]
        if config.share_lstm_weights:
            # The context encoder reuses the "u1" variables.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # Core layer: Attention Flow Layer
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # expand_dims() inserts an axis at the given position;
            # tile() repeats the tensor along the given axes.
            # An axis is first added at index 1 and then repeated M times
            # (M = number of sentences in the context) so that u and h
            # have matching dimensions.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [
                N * M, JQ, 2 * d
            ])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # g0 and g1 are built from the same cell object under different
        # scopes; g2 below uses d_cell.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])

        logits = get_logits([g1, p0],
                            d,
                            True,
                            wd=config.wd,
                            input_keep_prob=config.input_keep_prob,
                            mask=self.x_mask,
                            is_train=self.is_train,
                            func=config.answer_func,
                            scope='logits1')
        # Summary of g1 selected by the first logits, repeated for every
        # context position.
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                      tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                      [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            d_cell,
            d_cell,
            tf.concat(3, [p0, g1, a1i, g1 * a1i]),
            x_len,
            dtype='float',
            scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(3, [fw_g2, bw_g2])
        logits2 = get_logits([g2, p0],
                             d,
                             True,
                             wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             mask=self.x_mask,
                             is_train=self.is_train,
                             func=config.answer_func,
                             scope='logits2')

        # Softmax over all context positions, then restore [M, JX].
        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        yp = tf.reshape(flat_yp, [-1, M, JX])
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, M, JX])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
def __init__(self, num_units, cell_type='lstm', scope=None):
    """Create the forward and backward cells of a bidirectional RNN.

    ``cell_type == 'gru'`` selects ``GRUCell``; every other value selects
    ``BasicLSTMCell``. The two directions get separate cell instances.
    ``scope`` falls back to ``"bi_rnn"`` when it is falsy.
    """
    if cell_type == 'gru':
        cell_cls = GRUCell
    else:
        cell_cls = BasicLSTMCell

    self.cell_fw = cell_cls(num_units)
    self.cell_bw = cell_cls(num_units)
    self.scope = scope if scope else "bi_rnn"
def _build_forward(self):
    """Build the forward graph and publish its output tensors on ``self``.

    The embedding, highway, "prepro" BiLSTM and attention stages are
    followed by two BiLSTM layers (``g0``, ``g1``). The output stage then
    depends on ``config.model_name``:

    * ``"basic"``: a third BiLSTM (``g2``) and two logit heads; sets
      ``self.logits``, ``self.logits2``, ``self.yp``, ``self.yp2``.
    * ``"basic-class"``: pools ``g1`` (``maxpool``, ``sumpool`` or
      last-forward/first-backward step, chosen by ``config.classifier``)
      and applies a linear classifier; sets ``self.logits0``,
      ``self.yp0``, ``self.logits``, ``self.yp``.

    Any other ``model_name`` builds no output stage.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    # The static JX, JQ and M are replaced by the dynamic tensor sizes.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Comma-separated config strings: output channels per
                # filter group and the matching filter heights.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx,
                                      filter_sizes,
                                      heights,
                                      "VALID",
                                      self.is_train,
                                      config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" filters for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # Training initialises the table from config.emb_mat.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                if config.use_glove_for_unk:
                    # Append the extra rows fed via self.new_emb_mat.
                    word_emb_mat = tf.concat(
                        0, [word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat,
                                            self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat,
                                            self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                # Concatenate char-CNN and word features per token.
                xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network; the question pass reuses the context pass's weights
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths: number of set mask entries along the word axis.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        # fw_u_f / bw_u_f (the final states) are unpacked but not used.
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.share_lstm_weights:
            # The context encoder reuses the "u1" variables.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Repeat the question encoding and its mask once per sentence,
            # then fold the sentence axis into the batch axis.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # NOTE(review): read as applying to both branches above - confirm
        # against the original layout.
        self.p = p0
        # g0 and g1 are built from the same cell object under different
        # scopes.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float',
            scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])

    # NOTE(review): whether this scope is nested inside "main" or a sibling
    # of it cannot be told from the collapsed source - confirm, since it
    # determines the variable names.
    with tf.variable_scope("output"):
        if config.model_name == "basic":
            logits = get_logits([g1, p0], d, True, wd=config.wd, \
                                input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, \
                                func=config.answer_func, scope='logits1')
            # Summary of g1 selected by the first logits, repeated for
            # every context position.
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), \
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), \
                          [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell, d_cell, \
                                                          tf.concat(3, [p0, g1, a1i, g1 * a1i]),
                                                          x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])
            logits2 = get_logits([g2, p0], d, True, wd=config.wd, \
                                 input_keep_prob=config.input_keep_prob, mask=self.x_mask,
                                 is_train=self.is_train, func=config.answer_func, scope='logits2')

            # Softmax over all context positions, then restore [M, JX].
            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2

        elif config.model_name == "basic-class":
            # Three classes when the data directory starts with
            # 'data/snli', two otherwise.
            C = 3 if config.data_dir.startswith('data/snli') else 2
            # No third BiLSTM here: the g1 outputs are pooled directly.
            (fw_g2, bw_g2) = (fw_g1, bw_g1)
            if config.classifier == 'maxpool':
                g2 = tf.concat(3, [fw_g2, bw_g2])  # [N, M, JX, 2d]
                g2 = tf.reduce_max(g2, 2)  # [N, M, 2d]
                g2_dim = 2 * d
            elif config.classifier == 'sumpool':
                g2 = tf.concat(3, [fw_g2, bw_g2])
                g2 = tf.reduce_sum(g2, 2)
                g2_dim = 2 * d
            else:
                # Move the word axis to the front, then take index JX - 1
                # of the forward outputs and index 0 of the backward ones.
                fw_g2_ = tf.gather(tf.transpose(fw_g2, [2, 0, 1, 3]), JX - 1)
                bw_g2_ = tf.gather(tf.transpose(bw_g2, [2, 0, 1, 3]), 0)
                g2 = tf.concat(2, [fw_g2_, bw_g2_])
                g2_dim = 2 * d

            # NOTE(review): reshaping [N, M, 2d] to [N, g2_dim] (and the
            # [N, M, C] reshape below) presumably assumes M == 1 - confirm.
            g2_ = tf.reshape(g2, [N, g2_dim])

            logits0 = linear(g2_,
                             C,
                             True,
                             wd=config.wd,
                             input_keep_prob=config.input_keep_prob,
                             is_train=self.is_train,
                             scope='classifier')
            flat_yp0 = tf.nn.softmax(logits0)
            yp0 = tf.reshape(flat_yp0, [N, M, C])
            self.tensor_dict['g1'] = g1
            self.logits0 = logits0
            self.yp0 = yp0
            # Also published under the names the "basic" branch sets.
            self.logits = logits0
            self.yp = yp0
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    use_state=True,
                    num_masks=10,
                    stp=False,
                    cdna=True,
                    dna=False,
                    context_frames=2,
                    pix_distributions=None,
                    conf=None):
    """Build a convolutional LSTM video predictor with a low-dimensional
    bottleneck state.

    Despite the ``stp`` / ``cdna`` / ``dna`` switches, the body always uses
    the DNA transformation; the switches are only validated.

    Args:
      images: list of ground truth image tensors, one per timestep.
      actions: list of action tensors, one per timestep.
      states: list of ground truth state tensors; only ``states[0]`` is
        read, later states are the model's own predictions.
      iter_num: tensor of the current training iteration (for scheduled
        sampling).
      k: constant used for scheduled sampling. -1 to feed in own prediction.
      use_state: True to concatenate state and action into the encoder.
      num_masks: number of masks; must be 1, otherwise ValueError is raised
        while building the first timestep.
      stp: True to use Spatial Transformer Predictor (STP).
      cdna: True to use Convolutional Dynamic Neural Advection (CDNA).
      dna: True to use Dynamic Neural Advection (DNA).
      context_frames: number of ground truth frames to pass in before
        feeding in own predictions.
      pix_distributions: optional list of initial pixel distributions, one
        per timestep; when given, predicted distributions are returned.
      conf: optional dict-like configuration. Recognised keys: 'dna_size'
        (DNA kernel size, default 5), 'use_low_dim_lstm' and 'use_masks'
        (presence enables the feature). ``None`` is treated as empty.

    Returns:
      A 6-tuple ``(gen_images, gen_states, gen_masks, gen_pix_distrib,
      inf_low_state, pred_low_state)``. ``gen_pix_distrib`` is ``None``
      when ``pix_distributions`` is ``None``.

    Raises:
      ValueError: if the number of selected network options is not exactly
        one, or if ``num_masks`` is not 1.
    """
    # The signature defaults conf to None; treat that as "no options".
    if conf is None:
        conf = {}
    DNA_KERN_SIZE = conf['dna_size'] if 'dna_size' in conf else 5
    use_pix_distrib = pix_distributions is not None

    print('constructing network with less layers...')

    if stp + cdna + dna != 1:
        raise ValueError('More than one, or no network option specified.')
    batch_size, img_height, img_width, _ = images[0].get_shape()[0:4]
    batch_size = int(batch_size)
    lstm_func = basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images, gen_masks = [], [], []
    inf_low_state, pred_low_state = [], []
    gen_pix_distrib = []
    current_state = states[0]

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(
                tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
        feedself = False

    # LSTM state sizes and states.
    lstm_size = np.int32(np.array([16, 32, 64, 100, 10]))
    lstm_state1, lstm_state2, lstm_state3 = None, None, None

    single_lstm1 = BasicLSTMCell(lstm_size[3], state_is_tuple=True)
    single_lstm2 = BasicLSTMCell(lstm_size[4], state_is_tuple=True)
    low_dim_lstm = MultiRNNCell([single_lstm1, single_lstm2],
                                state_is_tuple=True)
    low_dim_lstm_state = low_dim_lstm.zero_state(batch_size, tf.float32)
    dim_low_state = int(lstm_size[-1])

    for t, (image, action) in enumerate(zip(images[:-1], actions[:-1])):
        print('building timestep  %s' % t)
        # Reuse variables after the first timestep.
        reuse = bool(gen_images)

        done_warm_start = len(gen_images) > context_frames - 1
        with slim.arg_scope([
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm, slim.layers.conv2d_transpose
        ],
                            reuse=reuse):

            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]
                if use_pix_distrib:
                    prev_pix_distrib = gen_pix_distrib[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, gen_images[-1],
                                              batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image
                if use_pix_distrib:
                    prev_pix_distrib = pix_distributions[t]
                    prev_pix_distrib = tf.expand_dims(prev_pix_distrib, -1)

            # Predicted state is always fed back in
            state_action = tf.concat(1, [action, current_state])

            enc0 = slim.layers.conv2d(
                prev_image,
                32,
                kernel_size=[5, 5],
                stride=2,
                scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(enc0,
                                             lstm_state1,
                                             lstm_size[0],
                                             scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            enc1 = slim.layers.conv2d(hidden1,
                                      hidden1.get_shape()[3], [3, 3],
                                      stride=2,
                                      scope='conv2')

            hidden2, lstm_state2 = lstm_func(enc1,
                                             lstm_state2,
                                             lstm_size[1],
                                             scope='state3')
            hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4')

            enc2 = slim.layers.conv2d(hidden2,
                                      hidden2.get_shape()[3], [3, 3],
                                      stride=2,
                                      scope='conv3')

            # Pass in state and action: broadcast the state/action vector
            # over enc2's spatial grid.
            smear = tf.reshape(
                state_action,
                [batch_size, 1, 1,
                 int(state_action.get_shape()[1])])
            smear = tf.tile(
                smear,
                [1, int(enc2.get_shape()[1]),
                 int(enc2.get_shape()[2]), 1])
            if use_state:
                enc2 = tf.concat(3, [enc2, smear])
            enc3 = slim.layers.conv2d(enc2,
                                      hidden2.get_shape()[3], [1, 1],
                                      stride=1,
                                      scope='conv4')

            hidden3, lstm_state3 = lstm_func(enc3,
                                             lstm_state3,
                                             lstm_size[2],
                                             scope='state5')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6')

            enc3 = slim.layers.conv2d(hidden3,
                                      16, [1, 1],
                                      stride=1,
                                      scope='conv5')

            enc3_flat = tf.reshape(enc3, [batch_size, -1])

            if 'use_low_dim_lstm' in conf:
                with tf.variable_scope('low_dim_lstm', reuse=reuse):
                    hidden4, low_dim_lstm_state = low_dim_lstm(
                        enc3_flat, low_dim_lstm_state)
                low_dim_state = hidden4
            else:
                # NOTE(review): this path yields 100 features while the
                # reshape below expects dim_low_state (10) - confirm it is
                # still meant to be usable.
                enc_fully1 = slim.layers.fully_connected(enc3_flat,
                                                         400,
                                                         scope='enc_fully1')

                enc_fully2 = slim.layers.fully_connected(enc_fully1,
                                                         100,
                                                         scope='enc_fully2')
                low_dim_state = enc_fully2

            # inferred low dimensional state:
            inf_low_state.append(low_dim_state)

            pred_low_state.append(project_fwd_lowdim(low_dim_state))

            # Broadcast the low-dimensional state over enc2's spatial grid
            # as the decoder input.
            smear = tf.reshape(low_dim_state,
                               [batch_size, 1, 1, dim_low_state])
            smear = tf.tile(
                smear,
                [1, int(enc2.get_shape()[1]),
                 int(enc2.get_shape()[2]), 1])

            enc4 = slim.layers.conv2d_transpose(smear,
                                                hidden3.get_shape()[3],
                                                3,
                                                stride=2,
                                                scope='convt1')

            enc5 = slim.layers.conv2d_transpose(enc4,
                                                enc0.get_shape()[3],
                                                3,
                                                stride=2,
                                                scope='convt2')

            enc6 = slim.layers.conv2d_transpose(
                enc5,
                16,
                3,
                stride=2,
                scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            # Using largest hidden state for predicting untied conv kernels.
            enc7 = slim.layers.conv2d_transpose(enc6,
                                                DNA_KERN_SIZE**2,
                                                1,
                                                stride=1,
                                                scope='convt4')

            # Only one mask is supported (more should be unnecessary).
            if num_masks != 1:
                raise ValueError('Only one mask is supported for DNA model.')
            transformed = [
                dna_transformation(prev_image, enc7, DNA_KERN_SIZE)
            ]

            if 'use_masks' in conf:
                masks = slim.layers.conv2d_transpose(enc6,
                                                     num_masks + 1,
                                                     1,
                                                     stride=1,
                                                     scope='convt7')
                masks = tf.reshape(
                    tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [
                        int(batch_size),
                        int(img_height),
                        int(img_width), num_masks + 1
                    ])
                mask_list = tf.split(3, num_masks + 1, masks)
                output = mask_list[0] * prev_image
                for layer, mask in zip(transformed, mask_list[1:]):
                    output += layer * mask
            else:
                mask_list = None
                # NOTE(review): `transformed` is a one-element list, so a
                # list (not a tensor) is stored in gen_images here -
                # confirm whether `transformed[0]` was intended.
                output = transformed

            gen_images.append(output)
            gen_masks.append(mask_list)

            if dna and use_pix_distrib:
                transf_distrib = [
                    dna_transformation(prev_pix_distrib, enc7, DNA_KERN_SIZE)
                ]

            if use_pix_distrib:
                # NOTE(review): needs 'use_masks' in conf (mask_list) and
                # dna=True (transf_distrib) to be set - confirm callers
                # always combine them.
                pix_distrib_output = mask_list[0] * prev_pix_distrib
                for i in range(num_masks):
                    pix_distrib_output += transf_distrib[i] * mask_list[i + 1]
                gen_pix_distrib.append(pix_distrib_output)

            # Predict the next state from the inferred low-dim state.
            state_enc1 = slim.layers.fully_connected(low_dim_state,
                                                     100,
                                                     scope='state_enc1')

            state_enc2 = slim.layers.fully_connected(state_enc1,
                                                     4,
                                                     scope='state_enc2',
                                                     activation_fn=None)
            current_state = tf.squeeze(state_enc2)
            gen_states.append(current_state)

    if use_pix_distrib:
        return (gen_images, gen_states, gen_masks, gen_pix_distrib,
                inf_low_state, pred_low_state)
    return gen_images, gen_states, gen_masks, None, inf_low_state, pred_low_state
def __init__(self, model_parameters, training_parameters, directories, **kwargs):
    """Build the RNN model as a TensorFlow computational graph.

    Args:
        model_parameters: Object whose attributes configure the graph. This
            method reads ``sequence_length``, ``input_dimension``,
            ``state_size``, ``n_layers``, ``model`` (one of ``'rnn'``,
            ``'gru'``, ``'lstm'``, ``'nas'``), ``input_keep_probability``,
            ``output_keep_probability``, ``threshold`` and ``ma_step``.
        training_parameters: Stored on the instance; not read here.
        directories: Stored on the instance; not read here.
        **kwargs: Accepted but not used by this method.

    Raises:
        Exception: If ``model_parameters.model`` names an unsupported cell.
    """
    self.model_parameters = model_parameters
    self.training_parameters = training_parameters
    self.directories = directories
    params = self.model_parameters

    # Hyperparameters are fed at run time through placeholders.
    with tf.name_scope("Parameters"):
        self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
        self.momentum = tf.placeholder(tf.float32, name="momentum")
        self.input_keep_probability = tf.placeholder(
            tf.float32, name="input_keep_probability")
        self.output_keep_probability = tf.placeholder(
            tf.float32, name="output_keep_probability")
        self.is_training = tf.placeholder(tf.bool)

    # Input, target and initial-state placeholders.
    with tf.name_scope("Input"):
        self.inputs = tf.placeholder(
            "float",
            [None, params.sequence_length, params.input_dimension],
            name='input_placeholder')
        self.targets = tf.placeholder(
            "float", [None, params.sequence_length, 1],
            name='labels_placeholder')
        self.init = tf.placeholder(
            tf.float32, shape=[None, params.state_size], name="init")

    # Recurrent part of the graph.
    with tf.name_scope("LSTMRNN_RNN"):
        cells = []
        for _ in range(params.n_layers):
            if params.model == 'rnn':
                cell = BasicRNNCell(params.state_size)
            elif params.model == 'gru':
                cell = GRUCell(params.state_size)
            elif params.model == 'lstm':
                cell = BasicLSTMCell(params.state_size, state_is_tuple=True)
            elif params.model == 'nas':
                cell = NASCell(params.state_size)
            else:
                raise Exception("model type not supported: {}".format(
                    params.model))
            # Dropout is only wired in when the configured keep probability
            # is below 1; the actual probability is fed via the placeholder.
            if params.output_keep_probability < 1.0:
                cell = DropoutWrapper(
                    cell, output_keep_prob=self.output_keep_probability)
            if params.input_keep_probability < 1.0:
                cell = DropoutWrapper(
                    cell, input_keep_prob=self.input_keep_probability)
            cells.append(cell)
        cell = MultiRNNCell(cells)
        # Unroll over time and collect the per-step outputs.
        self.outputs, self.next_state = tf.nn.dynamic_rnn(
            cell, self.inputs, dtype=tf.float32)

    # Output layer, smoothing, cost and evaluation metrics.
    with tf.name_scope("LSTMRNN_Cost"):
        # Flatten so the same output weights apply to every time step.
        self.flattened_outputs = tf.reshape(
            self.outputs, [-1, params.state_size], name="flattened_outputs")
        self.output_w = tf.Variable(
            tf.truncated_normal([params.state_size, 1], stddev=0.01),
            name="output_weights")
        self.variable_summaries(self.output_w, 'output_weights')
        self.output_b = tf.Variable(tf.constant(0.1), name="output_biases")
        # Fix: summarise the bias itself (the weights were passed here before).
        self.variable_summaries(self.output_b, 'output_biases')

        # Decision threshold and moving-average window as graph variables.
        self.decision_threshold = tf.Variable(
            params.threshold, name="decision_threshold")
        self.ma_step = tf.Variable(params.ma_step, name="ma_step")

        # Linear output layer: one value per (example, time step).
        self.logits = tf.add(
            tf.matmul(self.flattened_outputs, self.output_w),
            self.output_b, name="logits")
        # NOTE(review): logits_bn is only summarised; the predictions below
        # are built from the un-normalised logits. Confirm this is intended.
        self.logits_bn = self.batch_norm_wrapper(
            inputs=self.logits, is_training=self.is_training)
        tf.summary.histogram('logits', self.logits)
        tf.summary.histogram('logits_bn', self.logits_bn)

        self.predictions = tf.reshape(
            self.logits, [-1, params.sequence_length, 1], name="predictions")
        self.shaped_predictions = tf.reshape(
            self.predictions, [-1], name="shaped_predictions")
        # Pad the flat predictions with (ma_step - 1) copies of the last
        # value before handing them to the moving-average loop.
        self.tmp_smoothed_predictions = tf.concat(
            [
                self.shaped_predictions,
                tf.fill(
                    tf.expand_dims(self.ma_step - 1, 0),
                    self.shaped_predictions[
                        tf.shape(self.shaped_predictions)[0] - 1])
            ],
            axis=0, name="tmp_smoothed_predictions")

        # The loop body (self.ma_while_body) is defined elsewhere; the
        # accumulator starts empty, hence the [None] shape invariant.
        self.ma_loop_idx = tf.constant(0, dtype='int32')
        self.shaped_smoothed_predictions = tf.zeros([0], dtype='float32')
        _, self.shaped_smoothed_predictions = tf.while_loop(
            lambda i, _: i < tf.shape(self.shaped_predictions)[0],
            self.ma_while_body,
            [self.ma_loop_idx, self.shaped_smoothed_predictions],
            shape_invariants=[tf.TensorShape([]), tf.TensorShape([None])])
        self.smoothed_predictions = tf.reshape(
            self.shaped_smoothed_predictions,
            [-1, params.sequence_length, 1],
            name="smoothed_predictions")
        self.soft_predictions_summary = tf.summary.tensor_summary(
            "soft_predictions", self.smoothed_predictions)

        # Training objective: RMSE between smoothed predictions and targets.
        # (Sigmoid cross-entropy and RMSE on the raw predictions are not used.)
        self.cost = tf.sqrt(
            tf.reduce_mean(
                tf.squared_difference(self.smoothed_predictions,
                                      self.targets)))
        tf.summary.scalar('training_cost', self.cost)

        # Hard 0/1 labels by thresholding the smoothed predictions.
        voicing_condition = tf.greater(
            self.smoothed_predictions,
            tf.fill(tf.shape(self.smoothed_predictions),
                    self.decision_threshold),
            name="thresholding")
        self.label_predictions = tf.where(
            voicing_condition,
            tf.ones_like(self.smoothed_predictions),
            tf.zeros_like(self.smoothed_predictions),
            name="label_predictions")
        self.hard_predictions_summary = tf.summary.tensor_summary(
            "hard_predictions", self.label_predictions)
        self.correct_prediction = tf.equal(
            self.label_predictions, self.targets, name="correct_predictions")

        # Flat reference / hypothesis vectors read by the Pk loop callbacks.
        self.r = tf.reshape(self.targets, [-1])
        self.h = tf.reshape(self.label_predictions, [-1])

        # Defined outside the while loop to avoid problems.
        self.dump_one = tf.constant(1, dtype=tf.int32, shape=[])
        # Fix: the dtype used to be passed positionally, where it landed in
        # tf.Variable's `trainable` slot. These are counters, not weights.
        self.temp_pk_miss = tf.Variable(
            [0], dtype=tf.int32, trainable=False, name='temp_pk_miss')
        self.temp_pk_falsealarm = tf.Variable(
            [0], dtype=tf.int32, trainable=False, name='temp_pk_falsealarm')
        self.loop_idx = tf.constant(0, dtype=tf.int32, name='loop_idx')
        self.loop_vars = (self.loop_idx, self.temp_pk_miss,
                          self.temp_pk_falsealarm)
        # self.while_condition / self.while_body are defined elsewhere.
        _, self.all_temp_pk_miss, self.all_temp_pk_falsealarm = tf.while_loop(
            self.while_condition,
            self.while_body,
            self.loop_vars,
            shape_invariants=(self.loop_idx.get_shape(),
                              tf.TensorShape([None]),
                              tf.TensorShape([None])))

        self.pk_miss = tf.reduce_mean(
            tf.cast(self.all_temp_pk_miss, tf.float32))
        tf.summary.scalar('p_miss', self.pk_miss)
        self.pk_falsealarm = tf.reduce_mean(
            tf.cast(self.all_temp_pk_falsealarm, tf.float32))
        tf.summary.scalar('p_falsealarm', self.pk_falsealarm)
        self.pk = tf.reduce_mean(
            tf.cast(
                tf.add(self.all_temp_pk_miss, self.all_temp_pk_falsealarm),
                tf.float32),
            name='pk')
        tf.summary.scalar('pk', self.pk)

        self.accuracy = tf.reduce_mean(
            tf.cast(self.correct_prediction, tf.float32), name="accuracy")
        tf.summary.scalar('accuracy', self.accuracy)
        self.recall, self.update_op_recall = tf.metrics.recall(
            labels=self.targets,
            predictions=self.label_predictions,
            name="recall")
        tf.summary.scalar('recall', self.recall)
        self.precision, self.update_op_precision = tf.metrics.precision(
            labels=self.targets,
            predictions=self.label_predictions,
            name="precision")
        tf.summary.scalar('precision', self.precision)

    # Training ops.
    with tf.name_scope("LSTMRNN_Train"):
        # Momentum optimisation
        self.optimizer = tf.train.MomentumOptimizer(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            name="optimizer")
        self.train_step = self.optimizer.minimize(self.cost,
                                                  name="train_step")

    # tf.metrics.* create local variables, so both initializers are grouped.
    self.initializer = tf.group(tf.global_variables_initializer(),
                                tf.local_variables_initializer())
name="b_h", shape=[self._num_units], initializer=tf.contrib.layers.variance_scaling_initializer()) r_t = tf.sign(tf.matmul(inputs, W_xr) + tf.matmul(state, W_hr) + b_r) z_t = tf.sign(tf.matmul(inputs, W_xz) + tf.matmul(state, W_hz) + b_z) h_hat_t = tf.tanh( tf.matmul(inputs, W_xh) + tf.matmul(r_t * state, W_hh) + b_h) h_t = z_t * state + (1 - z_t) * h_hat_t return h_t, h_t multi_cell = MultiRNNCell( [BasicLSTMCell(state_dim) for i in range(num_layers)]) #multi_cell = MultiRNNCell([mygru(state_dim) for i in range(num_layers)]) initial_state = multi_cell.zero_state(batch_size, dtype=tf.float32) # call seq2seq.rnn_decoder outputs, final_state = rnn_decoder(inputs, initial_state, multi_cell) # transform the list of state outputs to a list of logits. # use a linear transformation. weights = tf.get_variable( name="W", shape=[state_dim, vocab_size], initializer=tf.contrib.layers.variance_scaling_initializer()) bias = tf.get_variable( name="b", shape=[vocab_size],
def _build_forward(self):
    """Assemble the forward graph: word embedding, BiLSTM encoders, attention, logits.

    Reads ``self.config``, ``self.x``, ``self.q``, ``self.x_mask``,
    ``self.q_mask`` and ``self.is_train``. Intermediate tensors are stored
    in ``self.tensor_dict``; the results are published as ``self.logits``
    and ``self.yp``. Character embeddings, the highway network and the
    second answer layer are not built by this method.
    """
    config = self.config
    (batch_size, num_sents, sent_len, ques_len, word_vocab, hidden,
     _max_word_len) = (
        config.batch_size, config.max_num_sents, config.max_sent_size,
        config.max_ques_size, config.word_vocab_size, config.hidden_size,
        config.max_word_size)
    # The configured maxima are replaced by the runtime extents of the inputs.
    sent_len = tf.shape(self.x)[2]
    ques_len = tf.shape(self.q)[1]
    num_sents = tf.shape(self.x)[1]
    _char_emb_dim, word_emb_dim, _char_out_dim = (
        config.char_emb_size, config.word_emb_size, config.char_out_size)

    with tf.variable_scope("emb"):
        print('word embedding')
        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/gpu:7"):
                emb_kwargs = dict(shape=[word_vocab, word_emb_dim],
                                  dtype='float')
                if config.mode == 'train':
                    # Only training supplies an explicit initializer.
                    emb_kwargs['initializer'] = get_initializer(config.emb_mat)
                word_emb_mat = tf.get_variable("word_emb_mat", **emb_kwargs)
            print(word_emb_mat)
            with tf.name_scope("word"):
                print('embedding lookup')
                ctx_emb = tf.nn.embedding_lookup(word_emb_mat, self.x)
                ques_emb = tf.nn.embedding_lookup(word_emb_mat, self.q)
                self.tensor_dict['x'] = ctx_emb
                self.tensor_dict['q'] = ques_emb
                print('embedding lookup ready')
            # Word embeddings are used directly as the encoder inputs.
            xx = ctx_emb
            qq = ques_emb

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq
    print('context emmbedding')

    cell = BasicLSTMCell(hidden, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths are the number of set entries in each mask.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)

    print('prepro')
    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, _fw_final), (_, _bw_final)) = \
            bidirectional_dynamic_rnn(d_cell, d_cell, qq, q_len,
                                      dtype='float32', scope='u1')
        u = tf.concat(2, [fw_u, bw_u])
        # Sharing weights means re-entering the question encoder's scope.
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            ctx_scope = 'u1'
        else:
            ctx_scope = 'h1'
        (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
            cell, cell, xx, x_len, dtype='float', scope=ctx_scope)
        h = tf.concat(3, [fw_h, bw_h])
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    print('main pro')
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Repeat the question encoding and mask once per sentence.
            tiled_u = tf.tile(tf.expand_dims(u, 1), [1, num_sents, 1, 1])
            u = tf.reshape(
                tiled_u, [batch_size * num_sents, ques_len, 2 * hidden])
            tiled_mask = tf.tile(
                tf.expand_dims(self.q_mask, 1), [1, num_sents, 1])
            q_mask = tf.reshape(
                tiled_mask, [batch_size * num_sents, ques_len])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(
                config, self.is_train, h, u,
                h_mask=self.x_mask, u_mask=self.q_mask,
                scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # Two stacked bidirectional layers, both driven by first_cell.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')
        g1 = tf.concat(3, [fw_g1, bw_g1])

        logits = get_logits(
            [g1, p0], [config.batch_size, config.max_num_sents], True,
            wd=config.wd, input_keep_prob=config.input_keep_prob,
            mask=self.x_mask, is_train=self.is_train,
            func='sigmoid', scope='logits1')

        # NOTE(review): this is True where logits < 0.5 -- confirm polarity.
        yp = tf.greater(0.5, logits)

        self.tensor_dict['g1'] = g1
        self.logits = logits
        self.yp = yp
def build_forward(self):
    """Assemble the forward graph from embeddings to start/end distributions.

    Reads ``self.config``, ``self.emb_mat``, ``self.x``, ``self.q``,
    ``self.cx``, ``self.cq``, ``self.x_mask``, ``self.q_mask`` and
    ``self.is_train``. Intermediate tensors are stored in
    ``self.tensor_dict``; the results are published as ``self.logits``,
    ``self.logits2``, ``self.yp`` and ``self.yp2``.
    """
    config = self.config
    (batch_size, num_sents, sent_len, ques_len, _word_vocab, char_vocab,
     hidden, word_len) = (
        config.batch_size, config.max_num_sents, config.max_sent_size,
        config.max_ques_size, config.word_vocab_size,
        config.char_vocab_size, config.hidden_size, config.max_word_size)
    # The configured maxima are replaced by the runtime extents of the inputs.
    sent_len = tf.shape(self.x)[2]
    ques_len = tf.shape(self.q)[1]
    num_sents = tf.shape(self.x)[1]
    char_emb_dim, _word_emb_dim, char_out_dim = (
        config.char_emb_size, config.word_emb_size, config.char_out_size)

    def dropout_lstm():
        # A fresh LSTM cell wrapped with switchable input dropout.
        return SwitchableDropoutWrapper(
            BasicLSTMCell(hidden, state_is_tuple=True),
            self.is_train,
            input_keep_prob=config.input_keep_prob)

    word_emb = tf.get_variable(
        "word_emb_mat",
        shape=[config.word_vocab_size, config.word_emb_size],
        dtype='float',
        initializer=self.emb_mat)

    with tf.variable_scope("embedding"):
        with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
            char_emb_mat = tf.get_variable(
                "char_emb_mat", shape=[char_vocab, char_emb_dim],
                dtype='float')

        with tf.variable_scope("char"), tf.device("/cpu:0"):
            # Character lookups, flattened to one row per sentence/question.
            ctx_chars = tf.nn.embedding_lookup(char_emb_mat, self.cx)
            ques_chars = tf.nn.embedding_lookup(char_emb_mat, self.cq)
            ctx_chars = tf.reshape(
                ctx_chars, [-1, sent_len, word_len, char_emb_dim])
            ques_chars = tf.reshape(
                ques_chars, [-1, ques_len, word_len, char_emb_dim])

            filter_sizes = [
                int(size) for size in config.out_channel_dims.split(',')]
            heights = [
                int(height) for height in config.filter_heights.split(',')]
            assert sum(filter_sizes) == char_out_dim, (filter_sizes,
                                                       char_out_dim)

            with tf.variable_scope("conv"):
                xx = multi_conv1d(ctx_chars, filter_sizes, heights, "VALID",
                                  self.is_train, config.keep_prob,
                                  scope="xx")
                # Shared CNN weights: reuse the context filters ("xx").
                if config.share_cnn_weights:
                    tf.get_variable_scope().reuse_variables()
                    ques_conv_scope = "xx"
                else:
                    ques_conv_scope = "qq"
                qq = multi_conv1d(ques_chars, filter_sizes, heights, "VALID",
                                  self.is_train, config.keep_prob,
                                  scope=ques_conv_scope)
                xx = tf.reshape(xx, [-1, num_sents, sent_len, char_out_dim])
                qq = tf.reshape(qq, [-1, ques_len, char_out_dim])

        with tf.name_scope("word"):
            ctx_words = tf.nn.embedding_lookup(word_emb, self.x)
            ques_words = tf.nn.embedding_lookup(word_emb, self.q)
            self.tensor_dict['x'] = ctx_words
            self.tensor_dict['q'] = ques_words

        if config.use_char_emb:
            # Character features and word embeddings are concatenated.
            xx = tf.concat([xx, ctx_words], 3)
            qq = tf.concat([qq, ques_words], 2)
        else:
            xx = ctx_words
            qq = ques_words

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    '''x_len means the length of sequences '''
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)

    with tf.variable_scope("Encoding"):
        cell = BasicLSTMCell(hidden, state_is_tuple=True)
        encoding_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        (fw_u, bw_u), ((_, _fw_final), (_, _bw_final)) = \
            bidirectional_dynamic_rnn(encoding_cell, encoding_cell, qq,
                                      q_len, dtype='float', scope='u1')
        u = tf.concat([fw_u, bw_u], 2)
        # Sharing weights means re-entering the question encoder's scope.
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            ctx_scope = 'u1'
        else:
            ctx_scope = 'h1'
        (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
            cell, cell, xx, x_len, dtype='float', scope=ctx_scope)
        h = tf.concat([fw_h, bw_h], 3)
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Repeat the question encoding and mask once per sentence.
            tiled_u = tf.tile(tf.expand_dims(u, 1), [1, num_sents, 1, 1])
            u = tf.reshape(
                tiled_u, [batch_size * num_sents, ques_len, 2 * hidden])
            tiled_mask = tf.tile(
                tf.expand_dims(self.q_mask, 1), [1, num_sents, 1])
            q_mask = tf.reshape(
                tiled_mask, [batch_size * num_sents, ques_len])
            first_cell = AttentionCell(
                cell, u, mask=q_mask, mapper='sim',
                input_keep_prob=config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(
                config, self.is_train, h, u,
                h_mask=self.x_mask, u_mask=self.q_mask,
                scope="p0", tensor_dict=self.tensor_dict)
            first_cell = dropout_lstm()

        '''the following can be simplified, by using multi-layer rnn'''
        # First modelling layer.
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')
        g0 = tf.concat([fw_g0, bw_g0], 3)

        # Second modelling layer, with its own freshly built cell.
        first_cell = dropout_lstm()
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')
        g1 = tf.concat([fw_g1, bw_g1], 3)

        logits = get_logits(
            [g1, p0], hidden, True, wd=config.wd,
            input_keep_prob=config.input_keep_prob, mask=self.x_mask,
            is_train=self.is_train, func=config.answer_func,
            scope='logits1')

        # Summary of g1 selected by the first logits, tiled over positions.
        flat_len = num_sents * sent_len
        a1i = softsel(
            tf.reshape(g1, [batch_size, flat_len, 2 * hidden]),
            tf.reshape(logits, [batch_size, flat_len]))
        a1i = tf.tile(
            tf.expand_dims(tf.expand_dims(a1i, 1), 1),
            [1, num_sents, sent_len, 1])

        M2_operate_cell = dropout_lstm()
        g2_inputs = tf.concat([p0, g1, a1i, g1 * a1i], 3)
        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
            M2_operate_cell, M2_operate_cell, g2_inputs, x_len,
            dtype='float', scope='g2')
        g2 = tf.concat([fw_g2, bw_g2], 3)

        logits2 = get_logits(
            [g2, p0], hidden, True, wd=config.wd,
            input_keep_prob=config.input_keep_prob, mask=self.x_mask,
            is_train=self.is_train, func=config.answer_func,
            scope='logits2')

        # Softmax over all context positions of each example.
        flat_logits = tf.reshape(logits, [-1, num_sents * sent_len])
        flat_yp = tf.nn.softmax(flat_logits)
        yp = tf.reshape(flat_yp, [-1, num_sents, sent_len])
        flat_logits2 = tf.reshape(logits2, [-1, num_sents * sent_len])
        flat_yp2 = tf.nn.softmax(flat_logits2)
        yp2 = tf.reshape(flat_yp2, [-1, num_sents, sent_len])

        self.tensor_dict['g1'] = g1
        self.tensor_dict['g2'] = g2

        self.logits = flat_logits
        self.logits2 = flat_logits2
        self.yp = yp
        self.yp2 = yp2
import tensorflow as tf
from tensorflow.python.ops.rnn_cell import BasicLSTMCell, MultiRNNCell

# Number of units of each LSTM layer, in the order the cells are stacked.
num_units = [128, 64]

# One BasicLSTMCell per configured size, combined into a single MultiRNNCell.
cells = [BasicLSTMCell(num_units=layer_units) for layer_units in num_units]
stacked_rnn_cell = MultiRNNCell(cells)