def _output_form_split(cell_output, decoder_states, output_size, initializer):
    with vs.variable_scope("cell_output_linear", initializer=initializer):
        cell_output_ = cells.linear([cell_output], output_size, True)
    with vs.variable_scope("decoder_states_linear", initializer=initializer):
        ds_ = cells.linear([decoder_states], output_size, True)
    output = cell_output_ + ds_
    return output
def _output_form_concat(cell_output, decoder_states, output_size, initializer):
    with vs.variable_scope("output_form", initializer=initializer):
        output = cells.linear([cell_output] + [decoder_states], output_size,
                              True)
    return output
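
# The two output forms above are closely related: projecting each input with
# its own matrix and summing is the same affine map as projecting the
# concatenation with the stacked matrix. The sketch below (added for
# illustration, not part of the original module; plain NumPy, all names
# hypothetical) checks that equivalence, assuming a single shared bias.
def _sketch_output_forms():
    import numpy as np
    rng = np.random.RandomState(0)
    batch, d1, d2, out = 4, 6, 5, 3
    cell_output = rng.randn(batch, d1)
    decoder_states = rng.randn(batch, d2)
    W1 = rng.randn(d1, out)  # stands in for "cell_output_linear"
    W2 = rng.randn(d2, out)  # stands in for "decoder_states_linear"
    b = rng.randn(out)
    # split form: W1.h + W2.s + b
    split = cell_output.dot(W1) + decoder_states.dot(W2) + b
    # concat form: [h; s] . [W1; W2] + b
    concat = np.concatenate([cell_output, decoder_states], axis=1).dot(
        np.concatenate([W1, W2], axis=0)) + b
    assert np.allclose(split, concat)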
def score_combined(decoder_current,   # h_t
                   decoder_previous,  # h_i
                   reuse_variables=False,
                   dtype=tf.float32):
    """
    Applies a score function of the form

        v . (W1 . hi + W2 . hs)

    where W1 and W2 are weight matrices, v is a vector of parameters, hi is
    one of the decoder hidden states and hs is the current hidden state at
    timestep t.

    The function performs a 1-by-1 convolution to calculate W1 . hi, uses a
    ``linear`` cell (see cells.linear for the documentation) for the W2 . hs
    step, and broadcasts the latter into the result of the convolution via
    addition. A reduce_sum over axis=[2, 3] then yields one score per
    previous state.

    Args:
        decoder_current: 2-D Tensor representing the current hidden state at
            timestep t.
        decoder_previous: Tensor representing the hidden states of the
            decoder up to the current timestep.
        reuse_variables: whether to reuse the variables within the scope.
        dtype: dtype of the variables created within the scope.

    Returns:
        scores: the decoder hidden states' scores after applying the content
            function.
    """
    with tf.variable_scope("score_salton_combined") as scope:
        if reuse_variables:
            scope.reuse_variables()

        _, output_size = get_2d_tensor_shapes(decoder_current)

        decoder_current = cells.linear(
            [decoder_current], output_size, bias=False, dtype=dtype)
        # reshape to 4-D so the addition below broadcasts over the timestep
        # dimension instead of mixing batch elements
        decoder_current = tf.reshape(decoder_current, [-1, 1, 1, output_size])

        decoder_previous, attn_dim = reshape_attention(decoder_previous)

        # we first get the correct weight matrix
        ws = tf.get_variable("AttnDecWs", [1, 1, attn_dim, attn_dim],
                             dtype=dtype)

        # we apply a small convolution to the decoder states - it is more
        # efficient than performing a recurrent matrix * matrix multiplication
        hidden_features = convolve(decoder_previous, ws)
        hidden_features = hidden_features + decoder_current

        # we then get the vector v that will be used on the second
        # multiplication op.
        vs = tf.get_variable("AttnDecVs", [attn_dim], dtype=dtype)

        scores = tf.reduce_sum((vs * tf.tanh(hidden_features)), [2, 3])

    return scores
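
# A minimal NumPy sketch of the score computed above (added for
# illustration; not part of the original module, and all names here are
# hypothetical). It scores every previous decoder state h_i against the
# current state h_t via v . tanh(W1.h_i + W2.h_t), which is what the 1-by-1
# convolution plus the broadcast addition implement.
def _sketch_score_combined():
    import numpy as np
    rng = np.random.RandomState(0)
    batch, steps, dim = 2, 7, 4
    h_t = rng.randn(batch, dim)            # current decoder state
    h_prev = rng.randn(batch, steps, dim)  # decoder states so far
    W1 = rng.randn(dim, dim)               # role of the [1, 1, dim, dim] conv
    W2 = rng.randn(dim, dim)               # role of cells.linear
    v = rng.randn(dim)
    # one score per previous state: [batch, steps]
    scores = np.tanh(h_prev.dot(W1) + h_t.dot(W2)[:, None, :]).dot(v)
    # same value, computed step by step for one element
    ref = np.tanh(h_prev[0, 0].dot(W1) + h_t[0].dot(W2)).dot(v)
    assert scores.shape == (batch, steps) and np.isclose(scores[0, 0], ref)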
def mod_vinyals_kayser(hidden, decoder_hidden_state, initializer=None):
    # size of decoder layers
    attention_vec_size = hidden.get_shape()[3].value

    with vs.variable_scope("mod_vinyals_kayser", initializer=initializer):
        k = vs.get_variable("AttnW_%d" % 0,
                            [1, 1, attention_vec_size, 1],
                            initializer=initializer)
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")

        y = cells.linear(decoder_hidden_state, 1, True)
        y = array_ops.reshape(y, [-1, 1, 1, 1])

        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(math_ops.tanh(hidden_features + y), [2, 3])

    return s
def bahdanau_nmt(hidden, decoder_previous_state, initializer=None):
    # size of decoder layers
    attention_vec_size = hidden.get_shape()[3].value

    with vs.variable_scope("bahdanau_nmt", initializer=initializer):
        # here we calculate the U_a * h_i part of the attention alignment
        # with a 1-by-1 convolution over the encoder states
        k = vs.get_variable("AttnW_%d" % 0,
                            [1, 1, attention_vec_size, attention_vec_size],
                            initializer=initializer)
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")

        va = vs.get_variable("AttnV_%d" % 0, [attention_vec_size],
                             initializer=initializer)

        # W_a * s_i-1: project the previous decoder state into the attention
        # space, so the reshape below is valid even when the decoder size
        # differs from attention_vec_size
        y = cells.linear(decoder_previous_state, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])

        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(va * math_ops.tanh(hidden_features + y),
                                [2, 3])

    return s
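
# The additive (Bahdanau) scores above rely on the fact that a 1-by-1
# convolution over [batch, steps, 1, dim] with a [1, 1, dim, dim] filter is
# just a matmul applied at every timestep. The NumPy sketch below (added for
# illustration; hypothetical names, not part of the original module) spells
# that out.
def _sketch_bahdanau_score():
    import numpy as np
    rng = np.random.RandomState(0)
    batch, steps, dim = 2, 5, 4
    h = rng.randn(batch, steps, dim)  # encoder states (squeezed "hidden")
    s_prev = rng.randn(batch, dim)    # previous decoder state
    Wa = rng.randn(dim, dim)          # role of the conv filter k
    Ua = rng.randn(dim, dim)          # role of cells.linear
    va = rng.randn(dim)
    hidden_features = h.dot(Wa)               # per-timestep matmul == 1x1 conv
    y = s_prev.dot(Ua)[:, None, :]            # broadcast over timesteps
    e = np.tanh(hidden_features + y).dot(va)  # e_ti, shape [batch, steps]
    assert e.shape == (batch, steps)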
def attention_decoder_output(decoder_inputs, initial_state, attention_states,
                             cell, num_symbols,
                             attention_f=global_attention,
                             window_size=10,
                             content_function=vinyals_kaiser,
                             decoder_attention_f=decoder_type_2,
                             combine_inp_attn=False,
                             input_feeding=False,
                             dropout=None,
                             initializer=None,
                             decoder_states=None,
                             step_num=None,
                             dtype=tf.float32,
                             scope=None):
    """
    Helper function implementing a RNN decoder with global, local or hybrid
    attention for the sequence-to-sequence model. In addition to attending
    over the encoder states, it attends over the decoder's own previous
    states before the output projection.

    Parameters
    ----------
    decoder_inputs: list
        a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: tensor
        2D Tensor [batch_size x cell.state_size] representing the initial
        state of the decoder.
    attention_states: tensor
        3D Tensor [batch_size x attn_length x attn_size] representing the
        encoder hidden states to attend over.
    cell: RNNCell
        rnn_cell.RNNCell defining the cell function and size.
    num_symbols: int
        size of the target vocabulary used to embed decoder_inputs.
    attention_f: function
        function indicating which type of attention to use. Default to
        global_attention.
    window_size: int
        size of the window to apply on local attention. Default to 10.
    content_function: function
        content function to score the decoder hidden states and encoder
        hidden states. Default to vinyals_kaiser.
    decoder_attention_f: function
        function used to attend over the decoder's own previous states
        before the output projection. Default to decoder_type_2.
    combine_inp_attn: boolean
        whether to merge the input and the previous attention into one
        vector before running the cell. Default to False.
    input_feeding: boolean
        whether or not to use the input feeding approach by Luong et al.,
        2015. Default to False.
    dropout: float
        if not None, input_keep_prob is set to (1.0 - dropout) on each of
        the decoder's cells.
    initializer: function
        initializer for the variables created within the decoder's scope.
        Defaults to a uniform initializer in [-0.1, 0.1].
    decoder_states: tensor
        previously accumulated decoder hidden states, used when decoding
        step by step; if None, the states are accumulated internally.
    step_num: tensor
        current decoding step, forwarded to decoder_output_attention when
        decoder_states is given.
    dtype:
        The dtype to use for the RNN initial state (default: tf.float32).
    scope:
        VariableScope for the created subgraph; default: "attention_decoder".

    Returns
    -------
    outputs:
        A list of the same length as decoder_inputs of 2D Tensors of shape
        [batch_size x output_size] representing the generated outputs.
        Output i is computed as follows: first, we run the cell,
        cell_output, new_state = cell(x, prev_state); then we compute the
        context vector ct with attention_f and the attended summary ht_hat
        of the decoder's own states with decoder_output_attention; finally,
        output = tanh(linear([ct, ht_hat])).
    cell_state:
        The state of the decoder cell after the last time-step.
    cell_outputs:
        4D Tensor [batch_size x timesteps x 1 x attn_size] containing the
        decoder hidden states produced so far.
    """
    assert attention_f is not None

    output_size = cell.output_size

    if dropout is not None:
        for c in cell._cells:
            c.input_keep_prob = 1.0 - dropout

    if initializer is None:
        initializer = tf.random_uniform_initializer(minval=-0.1, maxval=0.1,
                                                    seed=_SEED)

    with vs.variable_scope(scope or "attention_decoder",
                           initializer=initializer):
        emb_inp = _embed_inputs(decoder_inputs, num_symbols, cell.input_size,
                                input_feeding=input_feeding)

        batch = array_ops.shape(emb_inp[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
        # before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        cell_state = initial_state
        outputs = []

        batch_attn_size = array_ops.pack([batch, attn_size])

        # initial attention state
        ct = array_ops.zeros(batch_attn_size, dtype=dtype)
        ct.set_shape([None, attn_size])

        if decoder_states is None:
            cell_outputs = []
        else:
            cell_outputs = decoder_states

        for i in xrange(len(emb_inp)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            if input_feeding:
                # if using input_feeding, concatenate previous attention with
                # input to layers
                inp = array_ops.concat(1, [emb_inp[i], ct])
            else:
                inp = emb_inp[i]

            if combine_inp_attn:
                # Merge input and previous attentions into one vector of the
                # right size.
                x = cells.linear([inp] + [ct], cell.input_size, True)
            else:
                x = inp

            # Run the RNN.
            cell_output, new_state = cell(x, cell_state)
            cell_state = new_state

            if decoder_states is None:
                cell_outputs.append(cell_output)
            else:
                reshaped = tf.reshape(cell_output, [-1, 1, 1, attn_size])
                decoder_states = tf.concat(1, [decoder_states, reshaped])

            if content_function is mod_bahdanau:
                dt = cell_outputs[-2]
            else:
                dt = cell_output

            ct = attention_f(decoder_hidden_state=dt,
                             hidden_attn=hidden,
                             initializer=initializer,
                             window_size=window_size,
                             content_function=content_function,
                             dtype=dtype)

            with vs.variable_scope("AttnOutputProjection",
                                   initializer=initializer):
                if decoder_states is None:
                    shape1 = len(cell_outputs)
                    top_states = [tf.reshape(o, [-1, 1, attn_size])
                                  for o in cell_outputs]
                    output_attention_states = tf.concat(1, top_states)
                    decoder_hidden = array_ops.reshape(
                        output_attention_states, [-1, shape1, 1, attn_size])
                    ht_hat = decoder_output_attention(
                        decoder_hidden, attn_size, decoder_attention_f,
                        initializer=initializer)
                else:
                    decoder_hidden = decoder_states
                    ht_hat = decoder_output_attention(
                        decoder_hidden, attn_size, decoder_attention_f,
                        initializer=initializer, step_num=step_num)

                output = cells.linear([ct] + [ht_hat], output_size, True)
                output = tf.tanh(output)

            outputs.append(output)

        if decoder_states is None:
            cell_outs = [tf.reshape(o, [-1, 1, 1, attn_size])
                         for o in cell_outputs]
            cell_outputs = tf.concat(1, cell_outs)
        else:
            cell_outputs = decoder_states

    return outputs, cell_state, cell_outputs
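
# At each step the decoder output above is tanh(linear([ct, ht_hat])):
# cells.linear concatenates the context vector and the attended summary of
# the decoder's own states, then applies one affine map. A NumPy stand-in
# (illustrative only; not part of the original module):
def _sketch_attn_output_projection():
    import numpy as np
    rng = np.random.RandomState(0)
    batch, attn_size, output_size = 2, 4, 3
    ct = rng.randn(batch, attn_size)      # attention context at step t
    ht_hat = rng.randn(batch, attn_size)  # attended decoder-state summary
    W = rng.randn(2 * attn_size, output_size)
    b = rng.randn(output_size)
    output = np.tanh(np.concatenate([ct, ht_hat], axis=1).dot(W) + b)
    assert output.shape == (batch, output_size)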
def hybrid_attention(decoder_hidden_state, hidden_attn, initializer,
                     window_size=10, content_function=vinyals_kaiser,
                     dtype=tf.float32):
    """Put hybrid attention (mix of global and local attention) on hidden
    using decoder hidden states and the hidden states of encoder
    (hidden_attn).

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output
        of the recurrent layers). Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the
        recurrent layers). It has shape (?, timesteps, 1, decoder_size) so
        it is possible to apply a 1-D convolution to calculate the attention
        score more efficiently.
    initializer : function
        Function to use when initializing variables within the variables
        context.
    window_size : int
        Size of each side of the window to use when applying local
        attention. Not relevant to global attention. Default to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder
        hidden states to extract their weights. Default to 'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of tensors. Default to tf.float32.

    Returns
    -------
    attns : 2-D Tensor
        Tensor representing the context vector generated after scoring the
        encoder and decoder hidden states. Has shape (?, decoder_size),
        i.e., one context vector per batch sample.
    """
    assert content_function is not None

    attention_vec_size = hidden_attn.get_shape()[3].value

    local_attn = local_attention(decoder_hidden_state=decoder_hidden_state,
                                 hidden_attn=hidden_attn,
                                 content_function=content_function,
                                 window_size=window_size,
                                 initializer=initializer,
                                 dtype=dtype)

    global_attn = global_attention(decoder_hidden_state=decoder_hidden_state,
                                   hidden_attn=hidden_attn,
                                   content_function=content_function,
                                   window_size=window_size,
                                   initializer=initializer,
                                   dtype=dtype)

    with vs.variable_scope("FeedbackGate_%d" % 0, initializer=initializer):
        y = cells.linear(decoder_hidden_state, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])

        vb = vs.get_variable("FeedbackVb_%d" % 0, [attention_vec_size],
                             initializer=initializer)

        # beta = sigmoid(vb . tanh(Wp * ht))
        tanh = math_ops.tanh(y)
        beta = math_ops.sigmoid(math_ops.reduce_sum((vb * tanh), [2, 3]))

        _ = tf.histogram_summary('hybrid_beta_weights', beta)

        attns = beta * global_attn + (1 - beta) * local_attn

    return attns
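
# The gate above mixes the two context vectors with one weight per batch
# sample, beta = sigmoid(vb . tanh(Wp.ht)). A NumPy sketch (illustrative
# only; hypothetical names, not part of the original module):
def _sketch_hybrid_gate():
    import numpy as np
    rng = np.random.RandomState(0)
    batch, dim = 2, 4
    h_t = rng.randn(batch, dim)
    global_attn = rng.randn(batch, dim)
    local_attn = rng.randn(batch, dim)
    Wp = rng.randn(dim, dim)
    vb = rng.randn(dim)
    # scalar gate per sample, then a convex combination of the two contexts
    beta = 1.0 / (1.0 + np.exp(-np.tanh(h_t.dot(Wp)).dot(vb)))  # [batch]
    attns = beta[:, None] * global_attn + (1 - beta[:, None]) * local_attn
    assert attns.shape == (batch, dim)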
def local_attention(decoder_hidden_state, hidden_attn, initializer,
                    window_size=10, content_function=vinyals_kaiser,
                    dtype=tf.float32):
    """Put local attention on hidden using decoder hidden states and the
    hidden states of encoder (hidden_attn).

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output
        of the recurrent layers). Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the
        recurrent layers). It has shape (?, timesteps, 1, decoder_size) so
        it is possible to apply a 1-D convolution to calculate the attention
        score more efficiently.
    initializer : function
        Function to use when initializing variables within the variables
        context.
    window_size : int
        Size of each side of the window to use when applying local
        attention. Not relevant to global attention. Default to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder
        hidden states to extract their weights. Default to 'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of tensors. Default to tf.float32.

    Returns
    -------
    ds : 2-D Tensor
        Tensor representing the context vector generated after scoring the
        encoder and decoder hidden states. Has shape (?, decoder_size),
        i.e., one context vector per batch sample.
    """
    assert content_function is not None

    sigma = window_size / 2
    denominator = sigma ** 2

    attention_vec_size = hidden_attn.get_shape()[3].value
    attn_length = hidden_attn.get_shape()[1].value
    batch_size = array_ops.shape(hidden_attn)[0]

    with vs.variable_scope("AttentionLocal", initializer=initializer):
        # apply content function to score the hidden states from the encoder
        s = content_function(hidden_attn, decoder_hidden_state)

        with vs.variable_scope("WindowPrediction", initializer=initializer):
            ht = cells.linear([decoder_hidden_state], attention_vec_size,
                              True)
            # reshape to 4-D (as in hybrid_attention) so the reduce_sum over
            # axes [2, 3] below is well-defined
            ht = array_ops.reshape(ht, [-1, 1, 1, attention_vec_size])

        # get the parameters (vp)
        vp = vs.get_variable("AttnVp_%d" % 0, [attention_vec_size],
                             initializer=initializer)

        # tanh(Wp*ht)
        tanh = math_ops.tanh(ht)

        # S * sigmoid(vp * tanh(Wp*ht)) - this returns a number for each
        # sentence in the batch - i.e., a tensor of shape batch x 1
        S = attn_length
        pt = math_ops.reduce_sum((vp * tanh), [2, 3])
        pt = math_ops.sigmoid(pt) * S

        # now we keep only the integer part of the values
        pt = tf.floor(pt)

        _ = tf.histogram_summary('local_window_predictions', pt)

        # we now create a tensor containing the indices representing each
        # position of the sentence - i.e., if the sentence contains 5 tokens
        # and batch_size is 3, the resulting tensor will be:
        # [[0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]]
        # (tf.tile replaces the original list repetition, which does not
        # work when batch_size is a tensor)
        indices = math_ops.cast(tf.range(attn_length), dtype)
        idx = tf.tile(indices, array_ops.pack([batch_size]))
        idx = tf.reshape(idx, [-1, attn_length])

        # here we calculate the boundaries of the attention window based on
        # the predicted positions; we add one to the lower bound because the
        # floor op already generates the first position
        low = pt - window_size + 1
        high = pt + window_size

        # here we check our positions against the boundaries
        mlow = tf.to_float(idx < low)
        mhigh = tf.to_float(idx > high)

        # now we combine both into a pre-mask that has 0s and 1s switched -
        # i.e., at this point, True == 0 and False == 1
        m = mlow + mhigh

        # here we switch the 0s to 1s and the 1s to 0s, so that True == 1
        # and False == 0
        mask = tf.to_float(tf.equal(m, 0.0))

        # here we switch off all the values that fall outside the window -
        # first those outside the truncated normal
        alpha = s * mask
        masked_soft = nn_ops.softmax(alpha)

        _ = tf.histogram_summary('local_alpha_weights', alpha)

        # here we calculate the 'truncated normal distribution' centred on pt
        numerator = -tf.pow((idx - pt), tf.convert_to_tensor(2, dtype=dtype))
        div = tf.truediv(numerator, denominator)
        # result of the truncated normal distribution
        e = math_ops.exp(div)

        at = masked_soft * e

        # Now calculate the attention-weighted vector d.
        d = math_ops.reduce_sum(
            array_ops.reshape(at, [-1, attn_length, 1, 1]) * hidden_attn,
            [1, 2])
        ds = array_ops.reshape(d, [-1, attention_vec_size])

        _ = tf.histogram_summary('local_attention_context', ds)

    return ds
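
# The windowing logic above in miniature: predict a centre pt per sample,
# zero out positions outside [pt - D + 1, pt + D], and reweight the window
# with exp(-(i - pt)^2 / sigma^2), using the same sigma**2 denominator as
# the function above. NumPy sketch (illustrative only; not part of the
# original module):
def _sketch_local_window():
    import numpy as np
    attn_length, window_size = 10, 2
    pt = np.array([[3.0], [7.0]])  # predicted window centres, [batch, 1]
    idx = np.tile(np.arange(attn_length, dtype=float), (2, 1))
    low, high = pt - window_size + 1, pt + window_size
    mask = ((idx >= low) & (idx <= high)).astype(float)
    sigma = window_size / 2.0
    weights = np.exp(-((idx - pt) ** 2) / sigma ** 2) * mask
    # positions outside the window contribute nothing; the centre gets 1.0
    assert weights[0, 0] == 0.0 and weights[0, 3] == 1.0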
def attention_decoder_nmt(decoder_inputs, initial_state, attention_states,
                          cell, num_symbols,
                          attention_f=global_attention,
                          window_size=10,
                          content_function=vinyals_kaiser,
                          decoder_attention_f=decoder_type_2,
                          combine_inp_attn=False,
                          input_feeding=False,
                          dropout=None,
                          initializer=None,
                          dtype=tf.float32,
                          scope=None):
    """
    Helper function implementing a RNN decoder with global, local or hybrid
    attention for the sequence-to-sequence model.

    Parameters
    ----------
    decoder_inputs: list
        a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: tensor
        2D Tensor [batch_size x (number of decoder layers *
        hidden_layer_size * 2)] if LSTM or [batch_size x (number of decoder
        layers * hidden_layer_size)] if GRU, representing the initial state
        (usually, we take the states of the encoder) to be used when running
        the decoder. The '2' in the LSTM formula means that we have to set
        both the hidden state and the cell state.
    attention_states: tensor
        3D tensor [batch_size x attn_length (time) x attn_size
        (hidden_layer_size)] representing the encoder hidden states that
        will be used to derive the context (attention) vector.
    cell: RNNCell
        rnn_cell.RNNCell defining the cell function and size.
    num_symbols: int
        size of the target vocabulary used to embed decoder_inputs.
    attention_f: function
        function indicating which type of attention to use. Default to
        global_attention.
    window_size: int
        size of the window to apply on local attention. Default to 10.
    content_function: function
        content function to score the decoder hidden states and encoder
        hidden states. Default to vinyals_kaiser.
    decoder_attention_f: function
        not used by this decoder; kept for API compatibility with
        attention_decoder_output.
    combine_inp_attn: boolean
        whether to merge the input and the previous attention into one
        vector before running the cell. Default to False.
    input_feeding: boolean
        whether to use the "input feeding approach" proposed by Luong et al.
        (2015). Default to False.
    dropout: float
        if not None, input_keep_prob is set to (1.0 - dropout) on the cell.
    initializer: function
        initializer for the variables created within the decoder's scope.
        Defaults to a uniform initializer in [-0.1, 0.1].
    dtype:
        The dtype to use for the RNN initial state (default: tf.float32).
    scope:
        VariableScope for the created subgraph; default:
        "embedding_attention_decoder".

    Returns
    -------
    outputs:
        A list of the same length as decoder_inputs of 2D Tensors of shape
        [batch_size x output_size] representing the generated outputs.
        Output i is computed as tanh(W1 * cell_output + W2 * ct + W3 * x),
        i.e., the sum of separate projections of the cell output, the
        context vector and the (possibly fed-back) input.
    cell_states:
        The state of the decoder cell after the last time-step.
""" assert attention_f is not None output_size = cell.output_size if dropout is not None: cell.input_keep_prob = 1.0 - dropout if initializer is None: initializer = tf.random_uniform_initializer(minval=-0.1, maxval=0.1, seed=_SEED) with vs.variable_scope(scope or "embedding_attention_decoder", initializer=initializer): emb_inp = _embed_inputs(decoder_inputs, num_symbols, cell.input_size, input_feeding=input_feeding) batch = array_ops.shape(emb_inp[0])[0] # Needed for reshaping. attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size]) cell_states = initial_state initial_state_decoder = tf.zeros_like(initial_state) initial_state_decoder.set_shape( [None, initial_state.get_shape()[1].value]) cell_outputs = [initial_state_decoder] outputs = [] batch_attn_size = array_ops.pack([batch, attn_size]) # initial attention state ct = array_ops.zeros(batch_attn_size, dtype=dtype) ct.set_shape([None, attn_size]) for i in xrange(len(emb_inp)): if i > 0: vs.get_variable_scope().reuse_variables() if input_feeding: # if using input_feeding, concatenate previous attention with input to layers inp = array_ops.concat(1, [emb_inp[i], ct]) else: inp = emb_inp[i] if combine_inp_attn: # Merge input and previous attentions into one vector of the right size. x = cells.linear([inp] + [ct], cell.input_size, True) else: x = inp dt = cell_outputs[-1] ct = attention_f(decoder_hidden_state=dt, hidden_attn=hidden, initializer=initializer, window_size=window_size, content_function=content_function, dtype=dtype) # Run the RNN. cell_output, new_state = cell(x, cell_states, context=ct) cell_states = new_state cell_outputs.append(cell_output) # with vs.variable_scope("AttnOutputProjection", initializer=initializer): with vs.variable_scope("AttnOutputProjection_logit_lstm", initializer=initializer): # if we pass a list of tensors, linear will first concatenate them over axis 1 logit_lstm = cells.linear([cell_output], output_size, True) with vs.variable_scope("AttnOutputProjection_logit_ctx", initializer=initializer): # if we pass a list of tensors, linear will first concatenate them over axis 1 logit_ctx = cells.linear([ct], output_size, True) with vs.variable_scope("AttnOutputProjection_logit_emb", initializer=initializer): # if we pass a list of tensors, linear will first concatenate them over axis 1 logit_prev = cells.linear([x], output_size, True) # if we pass a list of tensors, linear will first concatenate them over axis 1 output = tf.tanh(logit_lstm + logit_prev + logit_ctx) outputs.append(output) return outputs, cell_states
def _output_form_single(decoder_states, output_size, initializer):
    with vs.variable_scope("decoder_states_linear", initializer=initializer):
        output = cells.linear([decoder_states], output_size, True)
    return output