def attention(query):
    """Put attention masks on hidden using hidden_features and query."""
    ds = []  # Results of attention reads will be stored here.
    for a in xrange(num_heads):
        with tf.variable_scope("Attention_%d" % a):
            y = rnn_cell.linear(query, attention_vec_size, True)
            y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = tf.reduce_sum(v[a] * tf.tanh(hidden_features[a] + y), [2, 3])
            a = tf.nn.softmax(s)
            # Now calculate the attention-weighted vector d.
            d = tf.reduce_sum(
                tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
            ds.append(tf.reshape(d, [-1, attn_size]))
    return ds
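# A minimal standalone sketch (not part of the original module) of the same
# single-head additive attention: s = v^T * tanh(W*h_j + U*q), a = softmax(s),
# d = sum_j a_j * h_j. Written against plain TF 1.x ops; all names below
# (encoder_states, query, W_keys, U_query, v_score) and the shapes are
# hypothetical placeholders.
import tensorflow as tf

batch, attn_length, attn_size = 32, 20, 128
encoder_states = tf.placeholder(tf.float32, [batch, attn_length, attn_size])
query = tf.placeholder(tf.float32, [batch, attn_size])

W = tf.get_variable("W_keys", [attn_size, attn_size])
U = tf.get_variable("U_query", [attn_size, attn_size])
v = tf.get_variable("v_score", [attn_size])

# Project encoder states (keys) and the decoder query into a common space.
keys = tf.reshape(tf.matmul(tf.reshape(encoder_states, [-1, attn_size]), W),
                  [batch, attn_length, attn_size])
q = tf.expand_dims(tf.matmul(query, U), 1)        # [batch, 1, attn_size]

# One unnormalized score per encoder position, then softmax over positions.
scores = tf.reduce_sum(v * tf.tanh(keys + q), 2)  # [batch, attn_length]
alphas = tf.nn.softmax(scores)                    # the attention mask

# The attention read: a weighted sum of encoder states (the context vector d).
context = tf.reduce_sum(tf.expand_dims(alphas, 2) * encoder_states, 1)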
def _build_recurrent_model(self, input, state, num_units, **kwargs):
    from rnn_cell import linear
    from tensorflow.python.ops import array_ops

    # The packed state carries [c, h] concatenated along axis 1; split it back out.
    c, h = array_ops.split(state, 2, 1)
    c = tf.identity(c, name='LSTMCell/c_state')
    h = tf.identity(h, name='LSTMCell/h_state')

    # One linear map produces all four gate pre-activations at once.
    i, j, f, o = array_ops.split(linear([input, h], 4 * num_units, True), 4, 1)
    j = tf.tanh(j, name='LSTMCell/j_input')
    i = tf.sigmoid(i, name='LSTMCell/i_gate')
    f = tf.sigmoid(f, name='LSTMCell/f_gate')
    o = tf.sigmoid(o, name='LSTMCell/o_gate')

    c_ = i * j + f * c
    h_ = o * tf.tanh(c_)
    return h_, array_ops.concat([c_, h_], 1, name='LSTMCell/c_h_states')
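# A rough sketch (not from the original source) of stepping the LSTM above by
# hand: the state is a single tensor with c and h concatenated along axis 1,
# so the zero state is simply [batch_size, 2 * num_units]. Here lstm_cell is a
# hypothetical instance of the class that owns _build_recurrent_model, and the
# gate weights are shared across time steps via variable reuse.
batch_size, input_size, num_units, num_steps = 16, 50, 100, 5
step_inputs = [tf.placeholder(tf.float32, [batch_size, input_size])
               for _ in range(num_steps)]

state = tf.zeros([batch_size, 2 * num_units])  # packed [c, h]
step_outputs = []
with tf.variable_scope("lstm_demo") as vs:
    for t, x_t in enumerate(step_inputs):
        if t > 0:
            vs.reuse_variables()  # reuse the Linear weights created at t == 0
        h_t, state = lstm_cell._build_recurrent_model(x_t, state, num_units)
        step_outputs.append(h_t)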
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=tf.float32, scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    Args:
        decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        output_size: size of the output vectors; if None, we use cell.output_size.
        num_heads: number of attention heads that read from attention_states.
        loop_function: if not None, this function will be applied to i-th output
            in order to generate i+1-th input, and decoder_inputs will be ignored,
            except for the first element ("GO" symbol). This can be used for
            decoding, but also for training to emulate
            http://arxiv.org/pdf/1506.03099v2.pdf.
            Signature -- loop_function(prev, i) = next
                * prev is a 2D Tensor of shape [batch_size x cell.output_size],
                * i is an integer, the step number (when advanced control is needed),
                * next is a 2D Tensor of shape [batch_size x cell.input_size].
        dtype: The dtype to use for the RNN initial state (default: tf.float32).
        scope: VariableScope for the created subgraph; default: "attention_decoder".
        initial_state_attention: If False (default), initial attentions are zero.
            If True, initialize the attentions from the initial state and attention
            states -- useful when we wish to resume decoding from a previously
            stored decoder state and attention states.

    Returns:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
            [batch_size x output_size]. These represent the generated outputs.
            Output i is computed from input i (which is either i-th decoder_inputs
            or loop_function(output {i-1}, i)) as follows. First, we run the cell
            on a combination of the input and previous attention masks:
                cell_output, new_state = cell(linear(input, prev_attn), prev_state).
            Then, we calculate new attention masks:
                new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
            and then we calculate the output:
                output = linear(cell_output, new_attn).
        states: The state of each decoder cell in each time-step. This is a list
            with length len(decoder_inputs) -- one item for each time-step.
            Each item is a 2D Tensor of shape [batch_size x cell.state_size].

    Raises:
        ValueError: when num_heads is not positive, there are no inputs, or shapes
            of attention_states are not set.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with tf.variable_scope(scope or "attention_decoder"):
        batch_size = tf.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = tf.reshape(attention_states, [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = tf.get_variable("AttnW_%d" % a,
                                [1, 1, attn_size, attention_vec_size])
            hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(tf.get_variable("AttnV_%d" % a, [attention_vec_size]))

        states = [initial_state]

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            for a in xrange(num_heads):
                with tf.variable_scope("Attention_%d" % a):
                    y = rnn_cell.linear(query, attention_vec_size, True)
                    y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = tf.reduce_sum(
                        v[a] * tf.tanh(hidden_features[a] + y), [2, 3])
                    a = tf.nn.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = tf.reduce_sum(
                        tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                    ds.append(tf.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = tf.pack([batch_size, attn_size])
        attns = [tf.zeros(batch_attn_size, dtype=dtype)
                 for _ in xrange(num_heads)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)

        for i in xrange(len(decoder_inputs)):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with tf.variable_scope("loop_function", reuse=True):
                    inp = tf.stop_gradient(loop_function(prev, i))
            # Merge input and previous attentions into one vector of the right size.
            x = rnn_cell.linear([inp] + attns, cell.input_size, True)
            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    attns = attention(new_state)
            else:
                attns = attention(new_state)
            with tf.variable_scope("AttnOutputProjection"):
                output = rnn_cell.linear([cell_output] + attns, output_size, True)
            if loop_function is not None:
                # We do not propagate gradients over the loop function.
                prev = tf.stop_gradient(output)
            outputs.append(output)

    return outputs, states
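# A rough usage sketch for attention_decoder (not from the original source),
# assuming the legacy TF 0.x-style rnn_cell API the function is written
# against (rnn_cell.linear, cell.input_size, tf.pack). All shapes and names
# below are hypothetical.
batch_size, seq_len, attn_length, attn_size, num_units = 32, 10, 20, 128, 128

cell = rnn_cell.GRUCell(num_units)
decoder_inputs = [tf.placeholder(tf.float32, [batch_size, num_units])
                  for _ in xrange(seq_len)]
initial_state = tf.placeholder(tf.float32, [batch_size, cell.state_size])
attention_states = tf.placeholder(tf.float32,
                                  [batch_size, attn_length, attn_size])

with tf.variable_scope("decoder_demo"):
    outputs, states = attention_decoder(decoder_inputs, initial_state,
                                        attention_states, cell, num_heads=1)
# outputs: seq_len tensors of shape [batch_size x cell.output_size];
# states:  the per-step decoder states (initial_state followed by one per step).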
def __init__(self, is_training, config):
    self.xs = tf.placeholder(tf.int32, [None, config.num_steps])
    self.ys = tf.placeholder(tf.int32, [None, config.num_steps])

    embedding = tf.get_variable(
        "embedding", [config.vocab_size, config.hidden_size], dtype=tf.float32)

    if config.cell_type == 'rnn':
        print 'rnn'
        cell = rnn_cell.LegacyRNNCell(config.hidden_size)
    elif config.cell_type == 'lstm':
        print 'lstm'
        cell = rnn_cell.LegacyLSTMCell(config.hidden_size)
    else:
        print 'gru'
        cell = rnn_cell.LegacyGRUCell(config.hidden_size)

    inputs = tf.nn.embedding_lookup(embedding, self.xs)
    if is_training:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    init_h = tf.zeros([tf.shape(self.xs)[0], config.hidden_size], tf.float32)
    init_c = tf.zeros([tf.shape(self.xs)[0], config.hidden_size], tf.float32)

    input_ta = tf.TensorArray(tf.float32, config.num_steps,
                              tensor_array_name='input_array')
    output_ta = tf.TensorArray(tf.float32, config.num_steps,
                               tensor_array_name='output_array')
    # TensorArray is indexed by time, so switch to time-major before unstacking.
    input_ta = input_ta.unstack(tf.transpose(inputs, [1, 0, 2]))

    def loop_func(t, out_ta, h, c):
        inp_t = input_ta.read(t)
        cell_output, new_h, new_c = cell(inp_t, h, c)
        out_ta = out_ta.write(t, cell_output)
        return t + 1, out_ta, new_h, new_c

    time = tf.constant(0, dtype=tf.int32, name='time')
    loop_vars = (time, output_ta, init_h, init_c)
    result = tf.while_loop(lambda t, *_: t < config.num_steps,
                           loop_func, loop_vars)

    outputs = result[1].stack()
    outputs = tf.transpose(outputs, [1, 0, 2])  # Back to batch-major.
    outputs = tf.reshape(outputs, [-1, config.hidden_size])

    logits = rnn_cell.linear(outputs, config.vocab_size, True, scope='logits')
    logits = tf.reshape(
        logits, [tf.shape(self.xs)[0], config.num_steps, config.vocab_size])

    loss = tf.contrib.seq2seq.sequence_loss(
        logits, self.ys,
        tf.ones([tf.shape(self.xs)[0], config.num_steps], dtype=tf.float32))
    self.cost = loss

    optimizer = tf.train.GradientDescentOptimizer(config.learning_rate)
    # optimizer = tf.train.AdamOptimizer()
    if not config.clip:
        self.train_op = optimizer.minimize(loss)
    else:
        trainable_variables = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, trainable_variables), 5)
        self.train_op = optimizer.apply_gradients(
            zip(grads, trainable_variables))
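# A rough training-loop sketch for the constructor above (not from the
# original source). The model class name (LanguageModel), the Config values,
# and next_batch() are hypothetical stand-ins; only xs, ys, cost, and train_op
# come from the graph built above. Each batch is int32 of shape
# [batch, num_steps].
class Config(object):
    vocab_size = 10000
    hidden_size = 200
    num_steps = 20
    keep_prob = 0.9
    learning_rate = 1.0
    clip = True
    cell_type = 'lstm'

config = Config()
model = LanguageModel(is_training=True, config=config)  # hypothetical class

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for batch_xs, batch_ys in next_batch():  # hypothetical data iterator
        _, cost = sess.run([model.train_op, model.cost],
                           feed_dict={model.xs: batch_xs, model.ys: batch_ys})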
def _build_recurrent_model(self, input, state, num_units, **kwargs):
    from rnn_cell import linear

    # Vanilla RNN step: h_t = tanh(W * [x_t, h_{t-1}] + b); the hidden state
    # doubles as the cell output.
    hidden_state = tf.tanh(
        linear([input, state], num_units, True, scope='BasicRNN'),
        name='BasicRNN/hidden_state')
    return hidden_state, hidden_state