def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    if output_projection is not None:
        prev = tf.nn.xw_plus_b(
            prev, output_projection[0], output_projection[1])
    # prev now has shape [batch_size * beam_size, num_symbols].
    probs = tf.log(tf.nn.softmax(prev))
    if i > 1:
        # Add the running log-probability of each beam, then flatten so that
        # top_k searches over all beam_size * num_symbols continuations.
        probs = tf.reshape(probs + log_beam_probs[-1],
                           [-1, beam_size * num_symbols])
    best_probs, indices = tf.nn.top_k(probs, beam_size)
    indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
    best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))

    symbols = indices % num_symbols       # Which word in the vocabulary.
    beam_parent = indices // num_symbols  # Which hypothesis it came from.
    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = tf.nn.embedding_lookup(embedding, symbols)
    emb_prev = tf.reshape(emb_prev, [beam_size, embedding_size])
    if not update_embedding:
        emb_prev = tf.stop_gradient(emb_prev)
    return emb_prev
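The `%` / `//` decomposition above is the heart of the flat top-k trick: the scores for all beams are laid out end to end, so a single flat index encodes both a word and the beam it extends. A minimal NumPy sketch with made-up sizes (beam_size = 2, num_symbols = 5; none of these values come from the decoder itself):

import numpy as np

beam_size, num_symbols = 2, 5               # toy sizes, for illustration only
flat_scores = np.random.rand(beam_size * num_symbols)
indices = np.argsort(flat_scores)[-beam_size:]  # top-k over the flat vector

symbols = indices % num_symbols       # word id within the vocabulary
beam_parent = indices // num_symbols  # which beam that word extends

# e.g. flat index 7 with num_symbols = 5 decodes to word 2 extending beam 1.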
def extract_argmax_and_embed(prev, _):
    """Loop_function that extracts the symbol from prev and embeds it."""
    if output_projection is not None:
        prev = tf.nn.xw_plus_b(
            prev, output_projection[0], output_projection[1])
    prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
    emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
    return emb_prev
def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                     scope=None, output_projection=None, beam_size=10):
    """RNN decoder for the sequence-to-sequence model, with beam search.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor with shape [batch_size x cell.state_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      loop_function: If not None, this function will be applied to the i-th
        output in order to generate the (i+1)-st input, and decoder_inputs
        will be ignored, except for the first element ("GO" symbol). This can
        be used for decoding, but also for training to emulate
        http://arxiv.org/abs/1506.03099.
        Signature -- loop_function(prev, i, log_beam_probs, beam_path,
        beam_symbols) = next
          * prev is a 2D Tensor of shape [batch_size x output_size],
          * i is an integer, the step number (when advanced control is needed),
          * log_beam_probs, beam_path, beam_symbols are lists that accumulate
            the per-step beam log-probabilities, parent indices, and symbols,
          * next is a 2D Tensor of shape [batch_size x input_size].
      scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
      output_projection: (W, b) pair applied to each cell output before the
        argmax that produces the emitted symbol.
      beam_size: Number of hypotheses kept alive at each step.

    Returns:
      A tuple of the form (outputs, state, beam_path, beam_symbols), where:
        outputs: A list of the same length as decoder_inputs of 1D int64
          Tensors holding the argmax symbol of each step's projected output.
        state: The state of each cell at the final time-step; a 2D Tensor of
          shape [batch_size * beam_size x cell.state_size].
        beam_path: 2D int Tensor [steps x beam_size] of parent-hypothesis
          indices.
        beam_symbols: 2D int Tensor [steps x beam_size] of chosen symbols.
    """
    with tf.variable_scope(scope or "rnn_decoder"):
        state = initial_state
        outputs = []
        prev = None
        log_beam_probs, beam_path, beam_symbols = [], [], []
        state_size = int(initial_state.get_shape().with_rank(2)[1])

        for i, inp in enumerate(decoder_inputs):
            if loop_function is not None and prev is not None:
                with tf.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path,
                                        beam_symbols)
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            input_size = inp.get_shape().with_rank(2)[1]
            x = inp
            output, state = cell(x, state)
            if loop_function is not None:
                prev = output
            if i == 0:
                # Tile the state beam_size times so that from step 1 onward
                # each hypothesis carries its own copy of the decoder state.
                states = []
                for kk in range(beam_size):
                    states.append(state)
                state = tf.reshape(tf.concat(axis=0, values=states),
                                   [-1, state_size])
            outputs.append(tf.argmax(tf.nn.xw_plus_b(
                output, output_projection[0], output_projection[1]), axis=1))
    return (outputs, state,
            tf.reshape(tf.concat(axis=0, values=beam_path), [-1, beam_size]),
            tf.reshape(tf.concat(axis=0, values=beam_symbols),
                       [-1, beam_size]))
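beam_rnn_decoder hands back the raw beam bookkeeping rather than a finished hypothesis, so the caller still has to walk the parent pointers. A minimal backtracking sketch, assuming beam_path and beam_symbols have been fetched with sess.run into int arrays of shape [steps, beam_size], and (an assumption, since ranking conventions differ) that column 0 of the final step holds the best-scoring beam:

def backtrack(beam_path, beam_symbols, beam_index=0):
    """Follow parent pointers from the last step back to the first.

    beam_path, beam_symbols: int arrays of shape [steps, beam_size], as
    fetched from the decoder's last two return values.
    """
    symbols = []
    for step in range(len(beam_symbols) - 1, -1, -1):
        symbols.append(beam_symbols[step][beam_index])
        beam_index = beam_path[step][beam_index]  # hop to the parent beam
    return symbols[::-1]  # reverse into left-to-right order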
def loop_function(prev, _):
    if output_projection is not None:
        prev = tf.nn.xw_plus_b(
            prev, output_projection[0], output_projection[1])
    prev_symbol = tf.argmax(prev, 1)
    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
        emb_prev = tf.stop_gradient(emb_prev)
    return emb_prev
def get_cd_update(self, x):
    # Contrastive divergence (CD-1) update.
    # First, draw samples of x and h from the model distribution.
    # The sample of x:
    x_sample = self.gibbs_sample(x)
    # The sample of the hidden nodes, starting from the visible state of x.
    h = self.sample(tf.sigmoid(tf.nn.xw_plus_b(x, self.W, self.bh)))
    # The sample of the hidden nodes, starting from the visible state of
    # x_sample.
    h_sample = self.sample(
        tf.sigmoid(tf.nn.xw_plus_b(x_sample, self.W, self.bh)))

    # Next, update W, bh and bv based on the differences between the samples
    # we drew and the original values.
    lr = tf.constant(self.learning_rate, tf.float32)  # the CD learning rate
    size_bt = tf.cast(tf.shape(x)[0], tf.float32)     # batch size
    W_ = tf.multiply(lr / size_bt,
                     tf.subtract(tf.matmul(x, h, transpose_a=True),
                                 tf.matmul(x_sample, h_sample,
                                           transpose_a=True)))
    bv_ = tf.multiply(
        lr / size_bt,
        tf.reduce_sum(tf.subtract(x, x_sample), axis=0, keep_dims=True))
    bh_ = tf.multiply(
        lr / size_bt,
        tf.reduce_sum(tf.subtract(h, h_sample), axis=0, keep_dims=True))

    # When we do sess.run(update), TF will run all 3 updates.
    update = [
        self.W.assign_add(W_),
        self.bv.assign_add(bv_),
        self.bh.assign_add(bh_)
    ]
    return update
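For context on how these assign ops get used: each sess.run over them applies one CD-1 step, i.e. a weight change proportional to <v h^T>_data - <v h^T>_recon, divided by the batch size. A hedged training-loop sketch; rbm (an instance of the class this method belongs to), n_visible, and batches are all assumptions about the surrounding code, not part of it:

import tensorflow as tf

# Assumed context: `rbm` is an instance of the class defining get_cd_update,
# `n_visible` its visible-layer width, `batches` an iterable of numpy arrays.
x = tf.placeholder(tf.float32, [None, n_visible])
updates = rbm.get_cd_update(x)  # the three assign_add ops built above

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for batch in batches:
        sess.run(updates, feed_dict={x: batch})  # one CD-1 step per batch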
def beam_attention_decoder(decoder_inputs, initial_state, attention_states,
                           cell, output_size=None, num_heads=1,
                           loop_function=None, dtype=tf.float32, scope=None,
                           initial_state_attention=False,
                           output_projection=None, beam_size=10):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look
    up information in the additional tensor attention_states, and it does this
    by focusing on a few entries from the tensor. This model has proven to
    yield especially good results in a number of sequence-to-sequence tasks.
    This implementation is based on http://arxiv.org/abs/1412.7449 (see below
    for details). It is recommended for complex sequence-to-sequence tasks.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      output_size: Size of the output vectors; if None, we use
        cell.output_size.
      num_heads: Number of attention heads that read from attention_states.
      loop_function: If not None, this function will be applied to the i-th
        output in order to generate the (i+1)-st input, and decoder_inputs
        will be ignored, except for the first element ("GO" symbol). This can
        be used for decoding, but also for training to emulate
        http://arxiv.org/abs/1506.03099.
        Signature -- loop_function(prev, i, log_beam_probs, beam_path,
        beam_symbols) = next
          * prev is a 2D Tensor of shape [batch_size x output_size],
          * i is an integer, the step number (when advanced control is needed),
          * next is a 2D Tensor of shape [batch_size x input_size].
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default:
        "attention_decoder".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state and attention
        states -- useful when we wish to resume decoding from a previously
        stored decoder state and attention states.
      output_projection: (W, b) pair applied to each cell output before the
        argmax that produces the emitted symbol.
      beam_size: Number of hypotheses kept alive at each step.

    Returns:
      A tuple of the form (outputs, state, beam_path, beam_symbols), where:
        outputs: A list of the same length as decoder_inputs of 1D int64
          Tensors holding the argmax symbol of each step's projected output.
          Output i is computed from input i (which is either the i-th element
          of decoder_inputs or loop_function(output {i-1}, i)) as follows.
          First, we run the cell on a combination of the input and previous
          attention masks:
            cell_output, new_state = cell(linear(input, prev_attn), prev_state).
          Then, we calculate new attention masks:
            new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
          and then we calculate the output:
            output = linear(cell_output, new_attn).
        state: The state of each decoder cell at the final time-step; a 2D
          Tensor of shape [batch_size * beam_size x cell.state_size].
        beam_path: 2D int Tensor [steps x beam_size] of parent-hypothesis
          indices.
        beam_symbols: 2D int Tensor [steps x beam_size] of chosen symbols.

    Raises:
      ValueError: when num_heads is not positive, there are no inputs, shapes
        of attention_states are not set, or input size cannot be inferred
        from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be "
                         "known: %s" % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with tf.variable_scope(scope or "attention_decoder"):
        batch_size = tf.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution; reshape first.
        hidden = tf.reshape(attention_states, [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in range(num_heads):
            k = tf.get_variable("AttnW_%d" % a,
                                [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(tf.get_variable("AttnV_%d" % a, [attention_vec_size]))

        state_size = int(initial_state.get_shape().with_rank(2)[1])
        state = tf.reshape(initial_state, [-1, state_size])

        # `linear` is the fully connected helper used throughout TF 1.x's
        # seq2seq code (e.g. _linear in tensorflow.python.ops.rnn_cell_impl).
        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            for a in range(num_heads):
                with tf.variable_scope("Attention_%d" % a):
                    y = linear(query, attention_vec_size, True)
                    y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = tf.reduce_sum(
                        v[a] * tf.nn.tanh(hidden_features[a] + y), [2, 3])
                    mask = tf.nn.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = tf.reduce_sum(
                        tf.reshape(mask, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(tf.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = tf.stack([batch_size, attn_size])
        attns = [tf.zeros(batch_attn_size, dtype=dtype)
                 for _ in range(num_heads)]
        for a in attns:
            # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)
            attns = [tf.reshape(tf.concat(axis=0, values=attns),
                                [-1, attn_size])]

        log_beam_probs, beam_path, beam_symbols = [], [], []
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with tf.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path,
                                        beam_symbols)
            input_size = inp.get_shape().with_rank(2)[1]
            x = linear([inp] + attns, input_size, True)
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with tf.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns, output_size, True)
            if loop_function is not None:
                prev = output
            if i == 0:
                # Tile the state beam_size times and recompute attention so
                # that from step 1 onward each hypothesis carries its own
                # copy of the decoder state.
                states = []
                for kk in range(beam_size):
                    states.append(state)
                state = tf.reshape(tf.concat(axis=0, values=states),
                                   [-1, state_size])
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    attns = attention(state)
            outputs.append(tf.argmax(tf.nn.xw_plus_b(
                output, output_projection[0], output_projection[1]), axis=1))
    return (outputs, state,
            tf.reshape(tf.concat(axis=0, values=beam_path), [-1, beam_size]),
            tf.reshape(tf.concat(axis=0, values=beam_symbols),
                       [-1, beam_size]))
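Finally, a rough wiring sketch. Every tensor below other than the decoder call itself is an assumption about the surrounding model (the encoder, cell, and projection are built elsewhere), so treat this as orientation, not the repository's actual driver code:

# Hypothetical wiring; encoder outputs, cell, and projection are assumed
# to be constructed elsewhere in the model.
outputs, state, beam_path, beam_symbols = beam_attention_decoder(
    decoder_inputs,        # list of [batch_size x input_size] tensors
    encoder_final_state,   # [batch_size x cell.state_size]
    attention_states,      # [batch_size x attn_length x attn_size]
    cell,
    output_size=num_symbols,
    loop_function=loop_function,   # the beam-search loop_function above
    output_projection=output_projection,
    beam_size=10)
# beam_path and beam_symbols can then be backtracked, as sketched after
# beam_rnn_decoder, to recover the highest-scoring hypothesis.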