def build_encoder(self):
    """Inference network q(h|X)."""
    with tf.variable_scope("encoder"):
        self.l1_lin = linear(tf.expand_dims(self.x, 0), self.embed_dim, bias=True, scope="l1")
        self.l1 = tf.nn.relu(self.l1_lin)

        self.l2_lin = linear(self.l1, self.embed_dim, bias=True, scope="l2")
        self.l2 = tf.nn.relu(self.l2_lin)

        self.mu = linear(self.l2, self.h_dim, bias=True, scope="mu")
        self.log_sigma_sq = linear(self.l2, self.h_dim, bias=True, scope="log_sigma_sq")

        self.eps = tf.random_normal((1, self.h_dim), 0, 1, dtype=tf.float32)
        self.sigma = tf.sqrt(tf.exp(self.log_sigma_sq))

        # Reparameterization: h = mu + sigma * eps, with eps ~ N(0, 1).
        self.h = tf.add(self.mu, tf.mul(self.sigma, self.eps))

        _ = tf.histogram_summary("mu", self.mu)
        _ = tf.histogram_summary("sigma", self.sigma)
        _ = tf.histogram_summary("h", self.h)
        _ = tf.histogram_summary("mu + sigma", self.mu + self.sigma)
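# A minimal NumPy sketch (not from the original model) of the reparameterization
# trick used above: sampling h = mu + sigma * eps keeps the graph differentiable
# with respect to mu and sigma, since all randomness is isolated in eps.
import numpy as np

h_dim = 4
mu = np.zeros((1, h_dim))                     # stands in for the encoder mean
log_sigma_sq = np.zeros((1, h_dim))           # stands in for the encoder log-variance
eps = np.random.normal(0.0, 1.0, (1, h_dim))
sigma = np.sqrt(np.exp(log_sigma_sq))
h = mu + sigma * eps                          # same computation as tf.add(mu, tf.mul(sigma, eps))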
def attention(query):
    """Put attention masks on hidden using hidden_features and query."""
    with vs.variable_scope("Attention"):
        # Attention mask is a softmax of h_in^T * decoder_hidden.
        dec_hid = array_ops.tile(query, [1, attn_length])  # replicate query for element-wise multiplication
        dec_hid = array_ops.reshape(dec_hid, [-1, attn_length, attention_vec_size])
        attn_weight = nn_ops.softmax(math_ops.reduce_sum(attention_states * dec_hid, [2]))  # attention weights for every hidden state in the encoder
        # Now calculate the attention-weighted vector (context vector) cc.
        cc = math_ops.reduce_sum(array_ops.reshape(attn_weight, [-1, attn_length, 1, 1]) * hidden, [1, 2])  # attended hidden state
        with vs.variable_scope("AttnW1"):
            term1 = rnn_cell.linear(query, attn_size, False)
        with vs.variable_scope("AttnW2"):
            term2 = rnn_cell.linear(cc, attn_size, False)
        # environment representation
        if env:  # 2D Tensor of shape [batch_size, env_size]
            with vs.variable_scope("Environment"):
                term3 = rnn_cell.linear(math_ops.to_float(env), attn_size, False)
            h_attn = math_ops.tanh(term1 + term2 + term3)
        else:
            h_attn = math_ops.tanh(term1 + term2)
    return h_attn, attn_weight
def __call__(self, inputs, state, scope=None):
    gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope)
    with vs.variable_scope(scope or type(self).__name__):
        with vs.variable_scope("Attn2"):
            gamma_h = tanh(rnn_cell.linear(gru_out, self._num_units, True, 1.0))
        # Numerically stable softmax over the time dimension (axis 0).
        weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True)
        weights = tf.exp(weights - tf.reduce_max(weights, reduction_indices=0, keep_dims=True))
        weights = weights / (1e-6 + tf.reduce_sum(weights, reduction_indices=0, keep_dims=True))
        context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
        with vs.variable_scope("AttnConcat"):
            out = tf.nn.relu(rnn_cell.linear([context, gru_out], self._num_units, True, 1.0))
        self.attn_map = tf.squeeze(tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
        return (out, out)
def __call__(self, inputs, state, episodic_gate, scope=None):
    """Gated recurrent unit (GRU) with num_units cells."""
    with vs.variable_scope("MGRUCell"):  # "GRUCell"
        with vs.variable_scope("Gates"):  # Reset gate and update gate.
            # We start with a bias of 1.0 so as not to reset and not to update.
            r = rnn_cell.linear([inputs, state], self._num_units, True, 1.0, scope=scope)
            r = sigmoid(r)
        with vs.variable_scope("Candidate"):
            c = tanh(rnn_cell.linear([inputs, r * state], self._num_units, True))
        new_h = tf.mul(episodic_gate, c) + tf.mul((1 - episodic_gate), state)
    return new_h, new_h
def iterativeLSTM(inputs, state, num_units, forget_bias, iteration_activation, iteration_count, iteration_prob):
    # This function applies the standard LSTM calculation, plus the calculation
    # of the evidence needed to infer whether another iteration is required.  "BasicLSTM"
    # Parameters of gates are concatenated into one multiply for efficiency.
    c, h = array_ops.split(1, 2, state)
    concat = linear([inputs, h], 4 * num_units, True)
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(1, 4, concat)

    new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j)
    new_h = tanh(new_c) * sigmoid(o)

    # A new state is exposed only if the iteration gate in this unit of this
    # batch activated the extra iteration.
    new_h = (new_h + h) * iteration_activation + h * (1 - iteration_activation)
    new_c = new_c * iteration_activation + c * (1 - iteration_activation)
    new_state = array_ops.concat(1, [new_c, new_h])
    new_output = new_h * iteration_activation + inputs * (1 - iteration_activation)

    # In this approach the evidence for the iteration gate is based on the
    # inputs, which don't change over iterations, and on the state.
    # p = linear([j], num_units, True, scope="iteration_activation")
    new_iteration_activation = update_iteration_activations(iteration_activation, tf.ones(tf.shape(inputs)))
    return new_output, new_state, new_iteration_activation
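# `update_iteration_activations` is referenced above but not defined in this
# snippet. A plausible (hypothetical) sketch, assuming activations are 0/1
# indicator tensors: a unit keeps iterating only while both its previous
# activation and the new evidence are on.
def update_iteration_activations(iteration_activations, new_iteration_activations):
    # Element-wise product acts as a logical AND for 0/1 indicator tensors.
    return iteration_activations * new_iteration_activations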
def __init__(self, num_units, encoder_output, scope=None):
    self.hs = encoder_output
    with vs.variable_scope(scope or type(self).__name__):
        with vs.variable_scope("Attn1"):
            hs2d = tf.reshape(self.hs, [-1, num_units])
            phi_hs2d = tanh(rnn_cell.linear(hs2d, num_units, True, 1.0))
            self.phi_hs = tf.reshape(phi_hs2d, tf.shape(self.hs))
    super(GRUCellAttn, self).__init__(num_units)
def downscale(self, inp):
    with vs.variable_scope("Downscale"):
        inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]), [-1, 2 * self.size])
        out2d = rnn_cell.linear(inp2d, self.size, True, 1.0)
        out3d = tf.reshape(out2d, [self.batch_size, -1, self.size])
        out3d = tf.transpose(out3d, perm=[1, 0, 2])
        out = tanh(out3d)
    return out
def attention(query):
    """Point on hidden using hidden_features and query."""
    with vs.variable_scope("Attention"):
        y = rnn_cell.linear(query, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y), [2, 3])
        return s
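# A shape-only NumPy sketch (assumed toy shapes, not from the original code) of
# the additive attention score above: the projected query y broadcasts over all
# attn_length positions of hidden_features before the tanh and the reduction.
import numpy as np

batch, attn_length, attn_size = 2, 5, 3
hidden_features = np.random.randn(batch, attn_length, 1, attn_size)  # W1 * h_i, precomputed
y = np.random.randn(batch, 1, 1, attn_size)                          # W2 * query
v = np.random.randn(attn_size)
s = np.sum(v * np.tanh(hidden_features + y), axis=(2, 3))            # scores, [batch, attn_length]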
def testLinear(self):
    with self.test_session() as sess:
        with tf.variable_scope("root", initializer=tf.constant_initializer(1.0)):
            x = tf.zeros([1, 2])
            l = linear([x], 2, False)
            sess.run([tf.initialize_all_variables()])
            res = sess.run([l], {x.name: np.array([[1., 2.]])})
            self.assertAllClose(res[0], [[3.0, 3.0]])

            # Checks prevent you from accidentally creating a shared function.
            with self.assertRaises(ValueError):
                l1 = linear([x], 2, False)

            # But you can create a new one in a new scope and share the variables.
            with tf.variable_scope("l1") as new_scope:
                l1 = linear([x], 2, False)
            with tf.variable_scope(new_scope, reuse=True):
                linear([l1], 2, False)
            self.assertEqual(len(tf.trainable_variables()), 2)
def setup_label_loss(self):
    with vs.variable_scope("LabelLogistic"):
        doshape = tf.shape(self.decoder_output)
        T, batch_size = doshape[0], doshape[1]
        # decoder_output: [batch_size, time_step, cell.state_size]
        # last_state: [batch_size, cell.state_size]
        last_state = self.decoder_output[:, -1, :]
        # project to label space: [batch_size, label_size]
        logits = rnn_cell.linear(last_state, self.label_size, True, 1.0)
        self.losses = tf.nn.softmax_cross_entropy_with_logits(logits, self.label_placeholder)
        self.predictions = logits
def attention(query):
    """Put attention masks on hidden using hidden_features and query."""
    ds = []  # Results of attention reads will be stored here.
    for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
            y = rnn_cell.linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
            a = nn_ops.softmax(s)  # note: rebinds the loop variable `a`
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
    return ds
def setup_loss(self):
    with vs.variable_scope("Logistic"):
        do2d = tf.reshape(self.decoder_output, [-1, self.size])
        logits2d = rnn_cell.linear(do2d, self.vocab_size, True, 1.0)
        outputs2d = tf.nn.softmax(logits2d)
        self.outputs = tf.reshape(outputs2d, [-1, self.batch_size, self.vocab_size])

        targets_no_GO = tf.slice(self.target_tokens, [1, 0], [-1, -1])
        masks_no_GO = tf.slice(self.target_mask, [1, 0], [-1, -1])
        # It is easier to pad the target/mask than to split the decoder input,
        # since TensorFlow does not support negative indexing.
        labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
        mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
        losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(logits2d, labels1d) * tf.to_float(mask1d)
        losses2d = tf.reshape(losses1d, [-1, self.batch_size])
        self.losses = tf.reduce_sum(losses2d) / self.batch_size
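# A small NumPy illustration (toy values) of the pad-then-mask trick above:
# shifting the time-major targets up by one row drops the GO symbol, padding
# restores the original length, and the shifted mask zeroes the padded loss.
import numpy as np

targets = np.array([[0, 0],        # GO row (time-major: rows are steps)
                    [7, 8],
                    [9, 10]])
mask = np.array([[1, 1], [1, 1], [1, 1]])
labels = np.pad(targets[1:], ((0, 1), (0, 0)), mode='constant')        # [[7, 8], [9, 10], [0, 0]]
mask_shifted = np.pad(mask[1:], ((0, 1), (0, 0)), mode='constant')     # padded row gets mask 0
# the padded row's cross-entropy is multiplied by 0, so it never contributes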
def downscale(self, inp, mask):
    with vs.variable_scope("Downscale"):
        inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]), [-1, 2 * self.size])
        out2d = rnn_cell.linear(inp2d, self.size, True, 1.0)
        out3d = tf.reshape(out2d, [self.batch_size, -1, self.size])
        out3d = tf.transpose(out3d, perm=[1, 0, 2])
        out = tanh(out3d)

        # Downscale the mask as well: a merged step is valid if either of the
        # two source steps was valid.
        mask = tf.transpose(mask)
        mask = tf.reshape(mask, [-1, 2])
        mask = tf.cast(mask, tf.bool)
        mask = tf.reduce_any(mask, reduction_indices=1)
        mask = tf.to_int32(mask)
        mask = tf.reshape(mask, [self.batch_size, -1])
        mask = tf.transpose(mask)
    return out, mask
def basic_rnn_cell(inputs, state, num_units, scope=None):
    if state is None:
        if inputs is not None:
            batch_size = inputs.get_shape()[0]
            dtype = inputs.dtype
        else:
            batch_size = 0
            dtype = tf.float32
        init_output = tf.zeros(tf.pack([batch_size, num_units]), dtype=dtype)
        init_state = tf.zeros(tf.pack([batch_size, num_units]), dtype=dtype)
        init_output.set_shape([batch_size, num_units])
        init_state.set_shape([batch_size, num_units])
        return init_output, init_state
    else:
        with tf.variable_op_scope([inputs, state], scope, "BasicRNNCell"):
            output = tf.tanh(linear([inputs, state], num_units, True))
        return output, output
def batch_linear(args, output_size, bias):
    """Apply a linear map to a batch of matrices.

    Args:
        args: a 3D Tensor or a list of 3D Tensors of shape batch x n x m.
    """
    if not nest.is_sequence(args):
        args = [args]
    batch_size = args[0].get_shape().as_list()[0] or tf.shape(args[0])[0]
    flat_args = []
    for arg in args:
        m = arg.get_shape().as_list()[2]
        if not m:
            raise ValueError('batch_linear expects shape[2] of arguments: %s' % str(m))
        flat_args.append(tf.reshape(arg, [-1, m]))
    flat_output = linear(flat_args, output_size, bias)
    output = tf.reshape(flat_output, [batch_size, -1, output_size])
    return output
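# A shape-only NumPy analogue (hypothetical shapes) of batch_linear: a single
# weight matrix, shared across the batch, is applied to every row of every
# matrix by flattening the leading dimensions first.
import numpy as np

batch, n, m, output_size = 4, 7, 5, 3
arg = np.random.randn(batch, n, m)
W = np.random.randn(m, output_size)              # shared across the batch
flat = arg.reshape(-1, m)                        # [batch * n, m], like tf.reshape(arg, [-1, m])
out = (flat @ W).reshape(batch, n, output_size)  # back to [batch, n, output_size]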
def setup_generation_loss(self):
    with vs.variable_scope("Logistic"):
        doshape = tf.shape(self.decoder_output)
        T, batch_size = doshape[0], doshape[1]
        do2d = tf.reshape(self.decoder_output, [-1, self.size])
        logits2d = rnn_cell.linear(do2d, self.vocab_size, True, 1.0)
        outputs2d = tf.nn.log_softmax(logits2d)
        self.outputs = tf.reshape(outputs2d, tf.pack([T, batch_size, self.vocab_size]))

        targets_no_GO = tf.slice(self.target_tokens, [1, 0], [-1, -1])
        # It is easier to pad the target/mask than to split the decoder input,
        # since TensorFlow does not support negative indexing.
        labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
        losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(logits2d, labels1d)
        losses2d = tf.reshape(losses1d, tf.pack([T, batch_size]))
        self.losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)
def downscale(self, inp, mask):
    with vs.variable_scope("Downscale"):
        inshape = tf.shape(inp)
        T, batch_size, dim = inshape[0], inshape[1], inshape[2]
        inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]), [-1, 2 * self.size])
        out2d = rnn_cell.linear(inp2d, self.size, True, 1.0)
        out3d = tf.reshape(out2d, tf.pack((batch_size, tf.to_int32(T / 2), dim)))
        out3d = tf.transpose(out3d, perm=[1, 0, 2])
        out3d.set_shape([None, None, self.size])
        out = tanh(out3d)

        mask = tf.transpose(mask)
        mask = tf.reshape(mask, [-1, 2])
        mask = tf.cast(mask, tf.bool)
        mask = tf.reduce_any(mask, reduction_indices=1)
        mask = tf.to_int32(mask)
        mask = tf.reshape(mask, tf.pack([batch_size, -1]))
        mask = tf.transpose(mask)
    return out, mask
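# A NumPy sketch (toy shapes) of the time-halving reshape used by downscale:
# pairs of adjacent timesteps are concatenated on the feature axis, projected
# back down to `size`, and the sequence comes out half as long.
import numpy as np

T, batch, size = 6, 2, 3
inp = np.random.randn(T, batch, size)                   # time-major input
pairs = inp.transpose(1, 0, 2).reshape(-1, 2 * size)    # [batch * T/2, 2*size]
projected = pairs[:, :size]                             # stand-in for the learned linear map
out = projected.reshape(batch, T // 2, size).transpose(1, 0, 2)
assert out.shape == (T // 2, batch, size)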
def dnn(tensor_in, hidden_units, activation=nn.relu, dropout=None):
    """Creates a fully connected deep neural network subgraph.

    Args:
        tensor_in: tensor or placeholder for input features.
        hidden_units: list of counts of hidden units in each layer.
        activation: activation function between layers. Can be None.
        dropout: if not None, will add a dropout layer with the given probability.

    Returns:
        A tensor which would be a deep neural network.
    """
    with vs.variable_scope('dnn'):
        for i, n_units in enumerate(hidden_units):
            with vs.variable_scope('layer%d' % i):
                tensor_in = rnn_cell.linear(tensor_in, n_units, True)
                if activation is not None:
                    tensor_in = activation(tensor_in)
                if dropout is not None:
                    tensor_in = dropout_ops.dropout(tensor_in, prob=(1.0 - dropout))
        return tensor_in
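# Hypothetical usage of dnn() under the old TF 0.x API these snippets target;
# the placeholder shape, layer sizes, and 10-class readout are made up for
# illustration only.
x = tf.placeholder(tf.float32, [None, 128])
net = dnn(x, hidden_units=[64, 32], activation=nn.relu, dropout=0.5)
logits = rnn_cell.linear(net, 10, True)   # e.g. a 10-class readout on top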
def __init__(self, is_training, vocab_size, batch_size, num_steps, config, reuse_conv_variables=None):
    if config.topic_number > 0:
        TopicModel.__init__(self, is_training, vocab_size, batch_size, num_steps, 0, config, reuse_conv_variables)
    else:
        self.y = tf.placeholder(tf.int32, [None, num_steps])
        self.config = config

    # placeholders
    self.x = tf.placeholder(tf.int32, [None, num_steps])
    self.lm_mask = tf.placeholder(tf.float32, [None, num_steps])

    # variables
    self.lstm_word_embedding = tf.get_variable(
        "lstm_embedding", [vocab_size, config.word_embedding_size],
        trainable=config.word_embedding_update,
        initializer=tf.random_uniform_initializer(-0.5 / config.word_embedding_size,
                                                  0.5 / config.word_embedding_size))
    self.lm_softmax_w = tf.get_variable("lm_softmax_w", [config.rnn_hidden_size, vocab_size])
    if is_training and config.num_samples > 0:
        self.lm_softmax_w_t = tf.transpose(self.lm_softmax_w)
    self.lm_softmax_b = tf.get_variable("lm_softmax_b", [vocab_size],
                                        initializer=tf.constant_initializer())
    if config.topic_number > 0:
        self.gate_w = tf.get_variable("gate_w", [config.topic_embedding_size, config.rnn_hidden_size])
        self.gate_u = tf.get_variable("gate_u", [config.rnn_hidden_size, config.rnn_hidden_size])
        self.gate_b = tf.get_variable("gate_b", [config.rnn_hidden_size],
                                      initializer=tf.constant_initializer())

    # define lstm cells
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(config.rnn_hidden_size, forget_bias=1.0)
    if is_training and config.lm_keep_prob < 1.0:
        lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=config.lm_keep_prob,
                                                  seed=config.seed)
    self.cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.rnn_layer_size)

    # set initial state to all zeros
    self.initial_state = self.cell.zero_state(batch_size, tf.float32)

    # embedding lookup
    inputs = tf.nn.embedding_lookup(self.lstm_word_embedding, self.x)
    if is_training and config.lm_keep_prob < 1.0:
        inputs = tf.nn.dropout(inputs, config.lm_keep_prob, seed=config.seed)

    # transform input from [batch_size, sent_len, emb_size] into a list of
    # sent_len tensors of shape [batch_size, emb_size]
    inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_steps, inputs)]

    # run rnn and get outputs (hidden layer)
    outputs, self.state = tf.nn.rnn(self.cell, inputs, initial_state=self.initial_state)
    # reshape outputs into [batch_size*sent_len, hidden_size]
    lstm_hidden = tf.reshape(tf.concat(1, outputs), [-1, config.rnn_hidden_size])

    if config.topic_number > 0:
        # combine topic and language model hidden states with a gating unit
        z, r = array_ops.split(1, 2, linear([self.conv_hidden, lstm_hidden],
                                            2 * config.rnn_hidden_size, True, 1.0))
        z, r = tf.sigmoid(z), tf.sigmoid(r)
        c = tf.tanh(tf.matmul(self.conv_hidden, self.gate_w) +
                    tf.matmul((r * lstm_hidden), self.gate_u) + self.gate_b)
        hidden = (1 - z) * lstm_hidden + z * c
        # save z
        self.tm_weights = tf.reshape(tf.reduce_mean(z, 1), [-1, num_steps])
    else:
        hidden = lstm_hidden

    # compute masked/weighted cross-entropy and mean language model loss
    if is_training and config.num_samples > 0:
        lm_crossent = tf.nn.sampled_softmax_loss(self.lm_softmax_w_t, self.lm_softmax_b, hidden,
                                                 tf.reshape(self.y, [-1, 1]),
                                                 config.num_samples, vocab_size)
    else:
        lm_logits = tf.matmul(hidden, self.lm_softmax_w) + self.lm_softmax_b
        lm_crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(lm_logits, tf.reshape(self.y, [-1]))
    lm_crossent_m = lm_crossent * tf.reshape(self.lm_mask, [-1])
    self.lm_cost = tf.reduce_sum(lm_crossent_m) / batch_size

    # compute probs if in testing mode
    if not is_training:
        self.probs = tf.nn.softmax(lm_logits)
        return

    # run optimiser and backpropagate (clipped) gradients for lm loss
    lm_tvars = tf.trainable_variables()
    lm_grads, _ = tf.clip_by_global_norm(tf.gradients(self.lm_cost, lm_tvars), config.max_grad_norm)
    self.lm_train_op = tf.train.AdamOptimizer(config.learning_rate).apply_gradients(zip(lm_grads, lm_tvars))
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      batch_size, state_size, decoder_inputs_positions=None,
                      decoder_inputs_maps=None, output_size=None,
                      loop_function=None, dtype=dtypes.float32, scope=None):
    """RNN decoder with attention for the sequence-to-sequence model.

    Args:
        decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. Embedded inputs.
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        batch_size: the batch size must be given explicitly, since env_state is
            updated one sample at a time.
        state_size: size of the environment state.
        decoder_inputs_positions: a list of 2D Tensors of shape [batch_size, 3],
            indicating the initial position of each example in a map. Default None.
        decoder_inputs_maps: a 1D Tensor of length batch_size indicating the map. Default None.
        output_size: size of the output vectors; if None, we use cell.output_size.
        loop_function: if not None, this function will be applied to the i-th output
            in order to generate the (i+1)-th input, and decoder_inputs will be ignored,
            except for the first element ("GO" symbol). This can be used for decoding,
            but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
            Signature -- loop_function(prev, i) = next
                * prev is a 2D Tensor of shape [batch_size x cell.output_size],
                * i is an integer, the step number (when advanced control is needed),
                * next is a 2D Tensor of shape [batch_size x cell.input_size].
        dtype: The dtype to use for the RNN initial state (default: tf.float32).
        scope: VariableScope for the created subgraph; default: "attention_decoder".

    Returns:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
            [batch_size x output_size]. These represent the generated outputs.
            Output i is computed from input i (which is either the i-th element of
            decoder_inputs or loop_function(output_{i-1}, i)) as follows.
            First, we run the cell on the current decoder input or the feed from the
            previous output:
                cur_output, new_state = cell(input, prev_state).
            Then, we calculate new attention masks:
                new_attn = softmax(h_t^T * attention_states),
            the context vector as the attention-weighted sum of attention_states:
                cont_vec = weighted_sum_of(attention_states), weighted by new_attn,
            and then the attended output:
                attn_output = tanh(W1*cur_output + W2*cont_vec + W3*env_state).
            The final output for prediction is output = softmax(W*attn_output), a 1D
            Tensor of shape [num_symbols] whose items give the probability of
            predicting each symbol at the next step.
        states: The state of each decoder cell at each time-step. This is a list with
            length len(decoder_inputs) -- one item for each time-step. Each item is a
            2D Tensor of shape [batch_size x cell.state_size].

    Raises:
        ValueError: when num_heads is not positive, there are no inputs, or the
            shapes of attention_states are not set.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with vs.variable_scope(scope or "attention_decoder"):
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        mapIdx = array_ops.pack([map3.map_grid, map3.map_jelly, map3.map_one])  # map
        attention_vec_size = attn_size  # size of query
        states = [initial_state]
        # current position and environment
        position, env = None, None
        # reshape for later computation
        hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                # Attention mask is a softmax of h_in^T * decoder_hidden.
                dec_hid = array_ops.tile(query, [1, attn_length])  # replicate query for element-wise multiplication
                dec_hid = array_ops.reshape(dec_hid, [-1, attn_length, attention_vec_size])
                attn_weight = nn_ops.softmax(math_ops.reduce_sum(attention_states * dec_hid, [2]))  # attention weights for every encoder hidden state
                # Now calculate the attention-weighted vector (context vector) cc.
                cc = math_ops.reduce_sum(array_ops.reshape(attn_weight, [-1, attn_length, 1, 1]) * hidden, [1, 2])  # attended hidden state
                with vs.variable_scope("AttnW1"):
                    term1 = rnn_cell.linear(query, attn_size, False)
                with vs.variable_scope("AttnW2"):
                    term2 = rnn_cell.linear(cc, attn_size, False)
                # environment representation
                if env:  # 2D Tensor of shape [batch_size, env_size]
                    with vs.variable_scope("Environment"):
                        term3 = rnn_cell.linear(math_ops.to_float(env), attn_size, False)
                    h_attn = math_ops.tanh(term1 + term2 + term3)
                else:
                    h_attn = math_ops.tanh(term1 + term2)
            return h_attn, attn_weight

        def updateEnv(_position, _step, _mapNo):
            """Update env_state according to the current position and step.

            Args:
                _position: a 2D Tensor of shape [batch_size, 3].
                _step: a 2D Tensor of shape [batch_size, 1], where 0 --> no action,
                    1 --> move forward 1 step, 2 --> turn right, 3 --> turn left,
                    4 --> turn back.
                _mapNo: a 1D int32 Tensor of length batch_size.

            Returns:
                new_pos: a 2D Tensor of shape [batch_size, 3], the new position
                    after taking the step.
                new_env: a 2D Tensor of shape [batch_size, env_size], the
                    environment state after taking the step.
            """
            if not _mapNo:
                raise ValueError("Invalid argument mapNo in updateEnv!")
            if not _position:
                raise ValueError("Invalid argument position in updateEnv!")
            new_env = []
            new_pos = []
            # If step is None, take no step and return the environment
            # representation of each current position.
            if not _step:
                new_pos = _position
                for j in xrange(batch_size):
                    vec = array_ops.slice(
                        mapIdx,
                        array_ops.pack([_mapNo[j], _position[j, 0], _position[j, 1], _position[j, 2], 0]),
                        [1, 1, 1, 1, state_size])
                    new_env.append(array_ops.squeeze(vec))
                new_env = array_ops.reshape(array_ops.pack(new_env), [batch_size, state_size])
                return new_pos, new_env
            else:
                def f_move(ppos):  # move forward 1 step
                    return control_flow_ops.cond(
                        math_ops.equal(ppos[2], 0),
                        lambda: array_ops.pack([ppos[0], ppos[1] - 1, ppos[2]]),
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 1),
                            lambda: array_ops.pack([ppos[0] + 1, ppos[1], ppos[2]]),
                            lambda: control_flow_ops.cond(
                                math_ops.equal(ppos[2], 2),
                                lambda: array_ops.pack([ppos[0], ppos[1] + 1, ppos[2]]),
                                lambda: array_ops.pack([ppos[0] - 1, ppos[1], ppos[2]]))))

                def f_right(ppos):  # turn right
                    return control_flow_ops.cond(
                        math_ops.equal(ppos[2], 0),
                        lambda: array_ops.pack([ppos[0], ppos[1], 1]),
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 1),
                            lambda: array_ops.pack([ppos[0], ppos[1], 2]),
                            lambda: control_flow_ops.cond(
                                math_ops.equal(ppos[2], 2),
                                lambda: array_ops.pack([ppos[0], ppos[1], 3]),
                                lambda: array_ops.pack([ppos[0], ppos[1], 0]))))

                def f_left(ppos):  # turn left
                    return control_flow_ops.cond(
                        math_ops.equal(ppos[2], 0),
                        lambda: array_ops.pack([ppos[0], ppos[1], 3]),
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 1),
                            lambda: array_ops.pack([ppos[0], ppos[1], 0]),
                            lambda: control_flow_ops.cond(
                                math_ops.equal(ppos[2], 2),
                                lambda: array_ops.pack([ppos[0], ppos[1], 1]),
                                lambda: array_ops.pack([ppos[0], ppos[1], 2]))))

                def f_back(ppos):  # turn back
                    return control_flow_ops.cond(
                        math_ops.equal(ppos[2], 0),
                        lambda: array_ops.pack([ppos[0], ppos[1], 2]),
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 1),
                            lambda: array_ops.pack([ppos[0], ppos[1], 3]),
                            lambda: control_flow_ops.cond(
                                math_ops.equal(ppos[2], 2),
                                lambda: array_ops.pack([ppos[0], ppos[1], 0]),
                                lambda: array_ops.pack([ppos[0], ppos[1], 1]))))

                def ffn4(sstep, ppos):
                    return control_flow_ops.cond(
                        math_ops.equal(sstep, data_utils.turnBack_ID),
                        lambda: f_back(ppos), lambda: _position[j, :])

                def ffn3(sstep, ppos):
                    return control_flow_ops.cond(
                        math_ops.equal(sstep, data_utils.turnLeft_ID),
                        lambda: f_left(ppos), lambda: ffn4(sstep, ppos))

                def ffn2(sstep, ppos):
                    return control_flow_ops.cond(
                        math_ops.equal(sstep, data_utils.turnRight_ID),
                        lambda: f_right(ppos), lambda: ffn3(sstep, ppos))

                def ffn1(sstep, ppos):
                    return control_flow_ops.cond(
                        math_ops.equal(sstep, data_utils.moveAct_ID),
                        lambda: f_move(ppos), lambda: ffn2(sstep, ppos))

                for j in xrange(batch_size):
                    # update position
                    temp_pos = control_flow_ops.cond(
                        math_ops.equal(_step[j], data_utils.noAct_ID),
                        lambda: _position[j, :],
                        lambda: ffn1(_step[j], _position[j, :]))
                    # keep the old position if the new one falls off the map
                    new_pos.append(control_flow_ops.cond(
                        math_ops.logical_or(
                            math_ops.greater(temp_pos[0], 24),
                            math_ops.logical_or(
                                math_ops.greater(temp_pos[1], 24),
                                math_ops.logical_or(
                                    math_ops.less(temp_pos[0], 0),
                                    math_ops.less(temp_pos[1], 0)))),
                        lambda: _position[j, :], lambda: temp_pos))
                    # update env
                    new_env.append(array_ops.reshape(
                        array_ops.slice(
                            mapIdx,
                            array_ops.pack([_mapNo[j], new_pos[-1][0], new_pos[-1][1], new_pos[-1][2], 0]),
                            [1, 1, 1, 1, state_size]),
                        [state_size]))
                new_pos = array_ops.pack(new_pos)
                new_env = array_ops.pack(new_env)
                return new_pos, new_env

        outputs = []
        attentions = []
        environments = []
        positions = []
        prev = None

        if decoder_inputs_positions and decoder_inputs_maps and batch_size:
            position = decoder_inputs_positions[0]  # 2D tensor of shape [batch_size, 3]
            _, env = updateEnv(position, None, decoder_inputs_maps)

        for i in xrange(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with vs.variable_scope("loop_function", reuse=True):
                    inp = array_ops.stop_gradient(loop_function(prev, i))

            # Run the RNN.
            cur_output, new_state = cell(inp, states[-1])
            cur_output = array_ops.reshape(cur_output, [batch_size, attn_size])
            states.append(new_state)

            # Run the attention mechanism.
            h_attn, attn_weight = attention(cur_output)
            attentions.append(attn_weight)

            with vs.variable_scope("AttnOutputProjection"):
                output = rnn_cell.linear(h_attn, output_size, False)

            if loop_function is not None:
                # We do not propagate gradients over the loop function.
                prev = array_ops.stop_gradient(output)

            if decoder_inputs_positions and decoder_inputs_maps and position:
                # update pos and env
                if loop_function:
                    step = math_ops.argmax(nn_ops.softmax(prev), 1)  # step: a length-batch_size list of int32
                    position, env = updateEnv(position, step, decoder_inputs_maps)
                else:
                    if i < len(decoder_inputs_positions) - 1:
                        position = decoder_inputs_positions[i + 1]
                    _, env = updateEnv(position, None, decoder_inputs_maps)

            outputs.append(output)
            environments.append(env)
            positions.append(position)

    return outputs, states, attentions, environments, positions
def _output_project(self, output, attn, project_size):
    with tf.variable_scope("AttnOutputProjection"):
        new_output = activation(linear([output, attn], project_size, False))
    return new_output
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.

    Args:
        decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        dtype: The dtype to use for the RNN initial state (default: tf.float32).
        scope: VariableScope for the created subgraph; default: "pointer_decoder".

    Returns:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
            [batch_size x output_size]. These represent the generated outputs.
            Output i is computed from input i. First, we run the cell on a
            combination of the input and the previous attention masks:
                cell_output, new_state = cell(linear(input, prev_attn), prev_state).
            Then, we calculate new attention masks:
                new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
            and then we calculate the output:
                output = linear(cell_output, new_attn).
        states: The state of each decoder cell in each time-step. This is a list with
            length len(decoder_inputs) -- one item for each time-step. Each item is a
            2D Tensor of shape [batch_size x cell.state_size].
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = rnn_cell.linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y), [2, 3])
                return s

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)
        attns.set_shape([None, attn_size])
        inps = []
        for i in xrange(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]

            if feed_prev and i > 0:
                inp = tf.pack(decoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, order internally.
            # Merge input and previous attentions into one vector of the right size.
            x = rnn_cell.linear([inp, attns], cell.input_size, True)
            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)
            outputs.append(output)

    return outputs, states, inps
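# A NumPy sketch (assumed shapes) of the feed_prev input blend above: in a
# pointer network the "output" is a score per input position, so the next
# decoder input is the attention-weighted mix of the encoder inputs.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, attn_length, input_size = 2, 5, 3
output = np.random.randn(batch, attn_length)              # pointer scores from attention()
inputs = np.random.randn(batch, attn_length, input_size)  # stacked decoder_inputs
next_inp = np.sum(inputs * softmax(output)[..., None], axis=1)  # [batch, input_size]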
def attention_encoder(decoder_inputs, initial_state, attention_states, cell,
                      num_heads=1, output_size=None, dtype=dtypes.float32,
                      scope=None, initial_state_attention=False):
    """Encoder that receives attention from another encoder.

    Parameters
    ----------
    decoder_inputs : the second encoder's input; we call it a decoder's input.
        It should already be wrapped by add_embedding(), and is a list of
        num_steps 2D Tensors [batch_size, input_size = embed_size].
    initial_state : 2D Tensor (batch_size x cell.state_size).
    attention_states : 3D Tensor (batch_size x attn_length (seq_length) x attn_size).
    cell
    num_heads
    output_size
    dtype
    scope
    initial_state_attention

    Returns
    -------
    A tuple of the form (outputs, state), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
            shape [batch_size x output_size] containing the generated outputs.
        state: The state of each decoder cell at the final time-step. It is a
            2D Tensor of shape (batch_size x cell.state_size).
    """
    decoder_inputs = [decoder_inputs]  # in the original model this is a bucket list of inputs
    with vs.variable_scope(scope or "attention_encoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])
        hidden_features = []
        for a in xrange(num_heads):
            k = vs.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
            hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(vs.get_variable("AttnV_%d" % a, [attention_vec_size]))

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            for a in xrange(num_heads):
                with vs.variable_scope("Attention_%d" % a):
                    y = rnn_cell.linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = tf.nn.softmax(s)  # note: rebinds the loop variable `a`
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)

        state = initial_state
        # this is now iterating over time steps
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            # Merge input and previous attentions into one vector of the right size.
            x = rnn_cell.linear([inp] + attns, cell.input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with vs.variable_scope(vs.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)
            with vs.variable_scope("AttnOutputProjection"):
                output = rnn_cell.linear([cell_output] + attns, output_size, True)
            outputs.append(output)

    # we only want the last state
    return outputs, state
def local_attention(decoder_hidden_state, hidden_attn, window_size=10,
                    content_function=vinyals_kaiser, dtype=tf.float32):
    """Put local attention on hidden using the decoder hidden states and the
    hidden states of the encoder (hidden_attn).

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output of
        the recurrent layers). Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the
        recurrent layers). It has shape (?, timesteps, 1, decoder_size) so it
        is possible to apply a 1-D convolution to calculate the attention
        score more efficiently.
    window_size : int
        Size of each side of the window to use when applying local attention.
        Not relevant to global attention. Defaults to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder hidden
        states to extract their weights. Defaults to 'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of tensors. Defaults to tf.float32.

    Returns
    -------
    ds : 2-D Tensor
        Tensor representing the context vector generated after scoring the
        encoder and decoder hidden states. Has shape (?, decoder_size), i.e.,
        one context vector per batch sample.
    """
    assert content_function is not None

    sigma = window_size / 2
    denominator = sigma ** 2

    attention_vec_size = hidden_attn.get_shape()[2].value
    attn_length = hidden_attn.get_shape()[1].value
    batch_size = hidden_attn.get_shape()[0].value

    with vs.variable_scope("AttentionLocal"):
        # apply the content function to score the hidden states from the encoder
        s = content_function(hidden_attn, decoder_hidden_state)

        from tensorflow.python.ops.rnn_cell import _linear as linear

        with vs.variable_scope("WindowPrediction"):
            ht = linear([decoder_hidden_state], attention_vec_size, True)

        # get the parameters (vp)
        vp = vs.get_variable("AttnVp_%d" % 0, [attention_vec_size])

        # tanh(Wp * ht)
        tanh = math_ops.tanh(ht)
        # S * sigmoid(vp * tanh(Wp*ht)) -- this returns one number per sentence
        # in the batch, i.e., a tensor of shape batch x 1
        S = attn_length
        pt = math_ops.reduce_sum((vp * tanh), 1)
        pt = math_ops.sigmoid(pt) * S

        # keep only the integer part of the values
        pt = tf.floor(pt)
        his1 = tf.histogram_summary('local_window_predictions', pt)

        # create a tensor containing the indices representing each position of
        # the sentence -- i.e., if the sentence has 5 tokens and batch_size is
        # 3, the resulting tensor will be:
        # [[0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]]
        indices = []
        for pos in xrange(attn_length):
            indices.append(pos)
        indices = indices * batch_size
        idx = tf.convert_to_tensor(tf.to_float(indices), dtype=dtype)
        idx = tf.reshape(idx, [-1, attn_length])

        # calculate the boundaries of the attention window based on the positions
        low = pt - window_size + 1  # add one because the floor op already yields the first position
        high = pt + window_size

        # check our positions against the boundaries
        low = tf.expand_dims(low, -1)
        high = tf.expand_dims(high, -1)
        mlow = tf.to_float(idx < low)
        mhigh = tf.to_float(idx > high)

        # combine both into a pre-mask that has 0s and 1s switched --
        # i.e., at this point, True == 0 and False == 1
        m = mlow + mhigh

        # switch the 0s to 1s and the 1s to 0s,
        # correcting the values so True == 1 and False == 0
        mask = tf.to_float(tf.equal(m, 0.0))

        # switch off all the values that fall outside the window --
        # first those outside the truncated normal
        alpha = s * mask
        masked_soft = nn_ops.softmax(alpha)
        his2 = tf.histogram_summary('local_alpha_weights', alpha)

        # calculate the 'truncated normal distribution'
        pt = tf.expand_dims(pt, -1)
        numerator = -tf.pow((idx - pt), tf.convert_to_tensor(2, dtype=dtype))
        div = tf.truediv(numerator, denominator)
        e = math_ops.exp(div)  # result of the truncated normal distribution

        at = masked_soft * e

        # Now calculate the attention-weighted vector ds.
        ds = math_ops.reduce_sum(tf.expand_dims(at, -1) * hidden_attn, 1)
        his3 = tf.histogram_summary('local_attention_context', ds)

    return ds, [his1, his2, his3]
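# A NumPy sketch (toy values) of the local-attention window above: scores
# outside [pt - D + 1, pt + D] are masked off, and the surviving positions are
# damped by exp(-(i - pt)^2 / sigma^2) with sigma = D / 2, matching the
# `denominator = sigma ** 2` used in the snippet.
import numpy as np

attn_length, D = 10, 2                 # D = window_size
sigma = D / 2.0
pt = 3.0                               # predicted window centre for one sample
idx = np.arange(attn_length, dtype=float)
window = ((idx >= pt - D + 1) & (idx <= pt + D)).astype(float)
gauss = np.exp(-((idx - pt) ** 2) / sigma ** 2)
weights = window * gauss               # multiplies the masked softmax scores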
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=dtypes.float32, scope=None, initial_state_attention=False, output_projection=None, beam_size=10): """RNN decoder with attention for the sequence-to-sequence model. In this context "attention" means that, during decoding, the RNN can look up information in the additional tensor attention_states, and it does this by focusing on a few entries from the tensor. This model has proven to yield especially good results in a number of sequence-to-sequence tasks. This implementation is based on http://arxiv.org/abs/1412.7449 (see below for details). It is recommended for complex sequence-to-sequence tasks. Args: decoder_inputs: A list of 2D Tensors [batch_size x input_size]. initial_state: 2D Tensor [batch_size x cell.state_size]. attention_states: 3D Tensor [batch_size x attn_length x attn_size]. cell: rnn_cell.RNNCell defining the cell function and size. output_size: Size of the output vectors; if None, we use cell.output_size. num_heads: Number of attention heads that read from attention_states. loop_function: If not None, this function will be applied to i-th output in order to generate i+1-th input, and decoder_inputs will be ignored, except for the first element ("GO" symbol). This can be used for decoding, but also for training to emulate http://arxiv.org/abs/1506.03099. Signature -- loop_function(prev, i) = next * prev is a 2D Tensor of shape [batch_size x output_size], * i is an integer, the step number (when advanced control is needed), * next is a 2D Tensor of shape [batch_size x input_size]. dtype: The dtype to use for the RNN initial state (default: tf.float32). scope: VariableScope for the created subgraph; default: "attention_decoder". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states -- useful when we wish to resume decoding from a previously stored decoder state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors of shape [batch_size x output_size]. These represent the generated outputs. Output i is computed from input i (which is either the i-th element of decoder_inputs or loop_function(output {i-1}, i)) as follows. First, we run the cell on a combination of the input and previous attention masks: cell_output, new_state = cell(linear(input, prev_attn), prev_state). Then, we calculate new attention masks: new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) and then we calculate the output: output = linear(cell_output, new_attn). state: The state of each decoder cell the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: when num_heads is not positive, there are no inputs, shapes of attention_states are not set, or input size cannot be inferred from the input. 
""" if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError("With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = cell.output_size with variable_scope.variable_scope(scope or "attention_decoder"): batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) print("Initial_state") state_size = int(initial_state.get_shape().with_rank(2)[1]) states =[] for kk in range(1): states.append(initial_state) state = tf.reshape(tf.concat(0, states), [-1, state_size]) def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) # for c in range(ct): ds.append(array_ops.reshape(d, [-1, attn_size])) return ds outputs = [] prev = None batch_attn_size = array_ops.pack([batch_size, attn_size]) attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: attns = [] attns.append(attention(initial_state)) tmp = tf.reshape(tf.concat(0, attns), [-1, attn_size]) attns = [] attns.append(tmp) log_beam_probs, beam_path, beam_symbols = [],[],[] for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if loop_function is not None : with variable_scope.variable_scope("loop_function", reuse=True): if prev is not None: inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols) input_size = inp.get_shape().with_rank(2)[1] x = linear([inp] + attns, input_size, True) cell_output, state = cell(x, state) # Run the attention mechanism. 
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(
            variable_scope.get_variable_scope(), reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      if i == 0:
        # Replicate the state beam_size times so that every beam hypothesis
        # continues decoding from the same first-step state.
        states = []
        for kk in range(beam_size):
          states.append(state)
        state = tf.reshape(tf.concat(0, states), [-1, state_size])
        with variable_scope.variable_scope(
            variable_scope.get_variable_scope(), reuse=True):
          attns = attention(state)

      outputs.append(tf.argmax(nn_ops.xw_plus_b(
          output, output_projection[0], output_projection[1]), dimension=1))

    return (outputs, state,
            tf.reshape(tf.concat(0, beam_path), [-1, beam_size]),
            tf.reshape(tf.concat(0, beam_symbols), [-1, beam_size]))
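# At i == 0 the decoder above widens the single post-first-step state to
# beam_size copies, so every beam hypothesis continues from the same state.
# A small NumPy sketch of that `tf.concat` + `tf.reshape` bookkeeping
# (shapes are illustrative):
import numpy as np

batch_size, state_size, beam_size = 2, 4, 3
state = np.arange(batch_size * state_size, dtype=np.float32).reshape(
    batch_size, state_size)
tiled = np.reshape(np.concatenate([state] * beam_size, axis=0),
                   (-1, state_size))
assert tiled.shape == (batch_size * beam_size, state_size)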
def attention_encoder(decoder_inputs, initial_state, attention_states, cell,
                      num_heads=1, output_size=None, dtype=dtypes.float32,
                      scope=None, initial_state_attention=False):
  """Encoder that receives attention from another encoder.

  Parameters
  ----------
  decoder_inputs: the second encoder's input; we call it a decoder's input.
    It should already be wrapped by add_embedding(). It is a list of
    num_steps 2D Tensors [batch_size, input_size = embed_size].
  initial_state: 2D Tensor (batch_size x cell.state_size).
  attention_states: 3D Tensor
    (batch_size x attn_length (seq_length) x attn_size).
  cell
  num_heads
  output_size
  dtype
  scope
  initial_state_attention

  Returns
  -------
  A tuple of the form (outputs, state), where:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x output_size] containing the generated outputs.
    state: The state of each decoder cell at the final time-step.
      It is a 2D Tensor of shape (batch_size x cell.state_size).
  """
  decoder_inputs = [decoder_inputs]  # in the original model this is a bucket list of inputs
  with vs.variable_scope(scope or "attention_encoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    hidden = array_ops.reshape(attention_states,
                               [-1, attn_length, 1, attn_size])
    hidden_features = []
    for a in xrange(num_heads):
      k = vs.get_variable("AttnW_%d" % a,
                          [1, 1, attn_size, attention_vec_size])
      hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(vs.get_variable("AttnV_%d" % a, [attention_vec_size]))

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with vs.variable_scope("Attention_%d" % a):
          y = rnn_cell.linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = tf.nn.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    if initial_state_attention:
      attns = attention(initial_state)

    state = initial_state
    # This is now iterating over the time steps.
    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        vs.get_variable_scope().reuse_variables()
      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp] + attns, cell.input_size, True)
      # Run the RNN.
      cell_output, state = cell(x, state)
      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with vs.variable_scope(vs.get_variable_scope(), reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)
      with vs.variable_scope("AttnOutputProjection"):
        output = rnn_cell.linear([cell_output] + attns, output_size, True)
      outputs.append(output)
    # We only want the last state.
    return outputs, state
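# A hypothetical wiring of attention_encoder (all names and shapes below are
# assumptions for illustration, not taken from this file): one embedded step
# of the second encoder attends over the first encoder's states. This assumes
# the TF 0.x-era API used throughout this file.
import tensorflow as tf
from tensorflow.python.ops import rnn_cell

embed_size, seq_length, attn_size = 64, 15, 64
cell = rnn_cell.GRUCell(attn_size)
step_input = tf.placeholder(tf.float32, [None, embed_size])
init_state = tf.placeholder(tf.float32, [None, cell.state_size])
enc_states = tf.placeholder(tf.float32, [None, seq_length, attn_size])
outputs, final_state = attention_encoder(step_input, init_state, enc_states,
                                         cell, output_size=attn_size)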
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: size of the output vectors; if None, we use cell.output_size.
    num_heads: number of attention heads that read from attention_states.
    loop_function: if not None, this function will be applied to the i-th
      output in order to generate the (i+1)-th input, and decoder_inputs will
      be ignored, except for the first element ("GO" symbol). This can be used
      for decoding, but also for training to emulate
      http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph;
      default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of
      shape [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either the i-th element of
      decoder_inputs or loop_function(output {i-1}, i)) as follows.
      First, we run the cell on a combination of the input and previous
      attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    state: The state of each decoder cell at the final time-step.
      It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or
      shapes of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with vs.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(attention_states,
                               [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = vs.get_variable("AttnW_%d" % a,
                          [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(vs.get_variable("AttnV_%d" % a, [attention_vec_size]))

    state = initial_state

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with vs.variable_scope("Attention_%d" % a):
          y = rnn_cell.linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    if initial_state_attention:
      attns = attention(initial_state)

    for i in xrange(len(decoder_inputs)):
      if i > 0:
        vs.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with vs.variable_scope("loop_function", reuse=True):
          inp = array_ops.stop_gradient(loop_function(prev, i))
      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp] + attns, cell.input_size, True)
      # Run the RNN.
      cell_output, state = cell(x, state)
      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with vs.variable_scope(vs.get_variable_scope(), reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)
      with vs.variable_scope("AttnOutputProjection"):
        output = rnn_cell.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = array_ops.stop_gradient(output)
      outputs.append(output)
    return outputs, state
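# The attention() closure above computes softmax(v^T * tanh(W * h_j + U * s)):
# the 1x1 convolution supplies W * h_j for every encoder position j and
# linear(query, ...) supplies U * s. A self-contained single-head NumPy sketch
# of the same formula; all weights below are random placeholders, not the
# variables created above.
import numpy as np

def bahdanau_scores(keys, query, W, U, v):
  # Softmax over encoder positions j of v^T tanh(W h_j + U s).
  scores = np.tanh(keys.dot(W) + query.dot(U)).dot(v)
  e = np.exp(scores - scores.max())
  return e / e.sum()

attn_length_np, attn_size_np = 7, 5        # hypothetical sizes
keys = np.random.randn(attn_length_np, attn_size_np)
query = np.random.randn(attn_size_np)
W = np.random.randn(attn_size_np, attn_size_np)
U = np.random.randn(attn_size_np, attn_size_np)
v_np = np.random.randn(attn_size_np)
a_np = bahdanau_scores(keys, query, W, U, v_np)
context = (a_np[:, None] * keys).sum(axis=0)  # attention-weighted vector d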
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None):
  """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: size of the output vectors; if None, we use cell.output_size.
    num_heads: number of attention heads that read from attention_states.
    loop_function: if not None, this function will be applied to the i-th
      output in order to generate the (i+1)-th input, and decoder_inputs will
      be ignored, except for the first element ("GO" symbol). This can be used
      for decoding, but also for training to emulate
      http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph;
      default: "attention_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of
      shape [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either the i-th element of
      decoder_inputs or loop_function(output {i-1}, i)) as follows.
      First, we run the cell on a combination of the input and previous
      attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or
      shapes of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with vs.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(attention_states,
                               [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = vs.get_variable("AttnW_%d" % a,
                          [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(vs.get_variable("AttnV_%d" % a, [attention_vec_size]))

    states = [initial_state]

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with vs.variable_scope("Attention_%d" % a):
          y = rnn_cell.linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])

    for i in xrange(len(decoder_inputs)):
      if i > 0:
        vs.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with vs.variable_scope("loop_function", reuse=True):
          inp = array_ops.stop_gradient(loop_function(prev, i))
      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp] + attns, cell.input_size, True)
      # Run the RNN: h_i = h(W x_i, h_{i-1}). For cell = GRUCell() in
      # rnn_cell.py this is new_h = u * state + (1 - u) * c, and the cell
      # returns (new_h, new_h).
      cell_output, new_state = cell(x, states[-1])
      states.append(new_state)
      # Run the attention mechanism.
      attns = attention(new_state)
      with vs.variable_scope("AttnOutputProjection"):
        # [cell_output] + attns is a list of 2D (batch x n) Tensors; linear
        # returns a 2D Tensor of shape [batch x output_size] equal to
        # sum_i(args[i] * W[i]). The number of columns is now output_size
        # (target_vocab_size).
        output = rnn_cell.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = array_ops.stop_gradient(output)
      outputs.append(output)
    return outputs, states
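# As the AttnOutputProjection comment above notes,
# linear([cell_output] + attns, output_size, True) concatenates its arguments
# and applies one affine map, i.e. sum_i(args[i] . W_i) + b. A quick NumPy
# check of that equivalence (shapes illustrative):
import numpy as np

batch, n1, n2, out_dim = 4, 6, 6, 10
cell_out = np.random.randn(batch, n1)
attn_vec = np.random.randn(batch, n2)
W_full = np.random.randn(n1 + n2, out_dim)
b = np.zeros(out_dim)
full = np.concatenate([cell_out, attn_vec], axis=1).dot(W_full) + b
W1, W2 = W_full[:n1], W_full[n1:]          # split by argument
assert np.allclose(full, cell_out.dot(W1) + attn_vec.dot(W2) + b)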
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to the i-th
      output in order to generate the (i+1)-th input, and decoder_inputs will
      be ignored, except for the first element ("GO" symbol). This can be used
      for decoding, but also for training to emulate
      http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph;
      default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated
        outputs. Output i is computed from input i (which is either the i-th
        element of decoder_inputs or loop_function(output {i-1}, i)) as
        follows. First, we run the cell on a combination of the input and the
        previous attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or
      shapes of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(attention_states,
                               [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(variable_scope.get_variable("AttnV_%d" % a,
                                           [attention_vec_size]))

    state = initial_state

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
          y = rnn_cell.linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    if initial_state_attention:
      attns = attention(initial_state)

    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = array_ops.stop_gradient(loop_function(prev, i))
      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp] + attns, cell.input_size, True)
      # Run the RNN.
      cell_output, state = cell(x, state)
      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(
            variable_scope.get_variable_scope(), reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)
      with variable_scope.variable_scope("AttnOutputProjection"):
        output = rnn_cell.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = array_ops.stop_gradient(output)
      outputs.append(output)
    return outputs, state
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
  """RNN decoder with pointer net for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph;
      default: "pointer_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of
      shape [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from the i-th element of decoder_inputs.
      First, we run the cell on a combination of the input and previous
      attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())

  with vs.variable_scope(scope or "point_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    input_size = decoder_inputs[0].get_shape()[1].value
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(attention_states,
                               [-1, attn_length, 1, attn_size])
    attention_vec_size = attn_size  # Size of query vectors for attention.
    k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
    hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
    v = vs.get_variable("AttnV", [attention_vec_size])

    states = [initial_state]

    def attention(query):
      """Point on hidden using hidden_features and query."""
      with vs.variable_scope("Attention"):
        y = rnn_cell.linear(query, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y),
                                [2, 3])
        return s

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = array_ops.zeros(batch_attn_size, dtype=dtype)
    attns.set_shape([None, attn_size])
    inps = []
    for i in xrange(len(decoder_inputs)):
      if i > 0:
        vs.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]

      if feed_prev and i > 0:
        # Use the same inputs at inference time, ordered internally: the
        # next input is the softmax-weighted sum of all decoder inputs.
        inp = tf.pack(decoder_inputs)
        inp = tf.transpose(inp, perm=[1, 0, 2])
        inp = tf.reshape(inp, [-1, attn_length, input_size])
        inp = tf.reduce_sum(
            inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
        inp = tf.stop_gradient(inp)
        inps.append(inp)

      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp, attns], cell.input_size, True)
      # Run the RNN.
      cell_output, new_state = cell(x, states[-1])
      states.append(new_state)
      # Run the attention mechanism.
      output = attention(new_state)
      outputs.append(output)
    return outputs, states, inps
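# In the pointer decoder the per-step "output" is the raw score vector s over
# the attn_length input positions, so predictions index into the input
# sequence rather than a vocabulary. A small NumPy sketch of the readout and
# of the feed_prev-style next input (sizes illustrative):
import numpy as np

attn_length_np, input_size_np = 6, 3
scores = np.random.randn(attn_length_np)          # one step's `output`
probs = np.exp(scores - scores.max())
probs /= probs.sum()
pointer = int(np.argmax(probs))                   # selected input position
# With feed_prev=True, the next input is the softmax-weighted sum of the
# decoder inputs, matching the tf.reduce_sum(inp * softmax(output)) step.
enc_inputs = np.random.randn(attn_length_np, input_size_np)
next_inp = (probs[:, None] * enc_inputs).sum(axis=0)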