def non_force():
    """Unroll the decoder with `raw_rnn` and return its outputs transposed.

    Every input (`decoder_cell`, `input_tensor`, `init_tensor`,
    `embedding_tensor`, `batch_size`, `embedding_dim`, `args`,
    `sequence_length`, `dense`, `sentence_size`) is a free variable resolved
    from the surrounding scope; this function takes no arguments.

    Returns:
        The stacked per-step outputs with the first two axes swapped
        (permutation ``[1, 0, 2]``).
    """
    # Width handed to the loop-function builder: embedding plus encoder
    # hidden state plus attention output.
    step_input_dim = (embedding_dim + args.encoder_hidden_dim +
                      args.attention_output_dim)
    step_fn = loop_fn_build(
        input_tensor, init_tensor, embedding_tensor, decoder_cell,
        batch_size, step_input_dim, sequence_length, dense, sentence_size)

    # Only the emitted outputs are used; final state and loop state are
    # discarded.
    emitted_ta, _final_state, _ = raw_rnn(decoder_cell, step_fn)
    return tf.transpose(emitted_ta.stack(), [1, 0, 2])
def my_dynamic_rnn(self, cell, sequence_length, inputs, initial_state):
    """Unroll `cell` over `inputs` with `raw_rnn`, starting at `initial_state`.

    Args:
        cell: RNN cell handed to `raw_rnn`.
        sequence_length: Per-example lengths compared against the step
            counter (the original comment marks the result as `[batch]`).
        inputs: Tensor unstacked along axis 0 into per-step slices; axis 1 is
            read as the batch size and axis 2 as the feature size.
        initial_state: Cell state used for the first step (per the original
            comment, the final state of the encoder).

    Returns:
        A pair `(outputs, final_state)`: the stacked emitted outputs
        (original comment: `[seq_len, batch, hidden_state]`) and the final
        state returned by `raw_rnn`.
    """
    dynamic_shape = tf.shape(inputs)
    max_seq_len = self.trainingManager.configs.max_seq_len_decoder
    batch_size = dynamic_shape[1]
    input_features = inputs.shape[2]

    # One TensorArray entry per time step.
    inputs_ta = tf.TensorArray(dtype=tf.float32,
                               size=max_seq_len).unstack(inputs)

    def loop_fn(cur_time, cur_cell_output, cur_cell_state, cur_loop_state):
        # `cur_cell_output` is None only on the call made before step 0; that
        # is where the initial state is injected. Afterwards the cell state is
        # passed through unchanged.
        if cur_cell_output is None:
            state_for_next_step = initial_state
        else:
            state_for_next_step = cur_cell_state

        # Per-example "done" flags (0-based step counter vs. lengths).
        done_flags = (cur_time >= sequence_length)
        # Scalar: true once every example is done, i.e. the loop runs as long
        # as the longest sequence in the batch.
        all_done = tf.reduce_all(done_flags)

        def _padding_input():
            # Reading past the end is not possible, so feed zeros with a
            # fully specified [batch, input_features] shape instead.
            return tf.zeros([batch_size, input_features], dtype=tf.float32)

        def _current_input():
            return inputs_ta.read(cur_time)

        input_for_next_step = tf.cond(all_done, _padding_input,
                                      _current_input)

        # The cell output is emitted as-is and no loop state is carried.
        return (done_flags, input_for_next_step, state_for_next_step,
                cur_cell_output, None)

    emitted_ta, final_state, _ = raw_rnn(cell, loop_fn)
    return emitted_ta.stack(), final_state
def decode(self, decoder_inp, seq_len, encoder_hidden_states, final_state,
           seq_len_inp):
    """Decode with `self.cell` plus additive attention, unrolled by `raw_rnn`.

    Args:
        decoder_inp: Decoder input passed to `self.prepare_decoder_input`,
            which returns the embedded inputs and an optional loop function.
        seq_len: Per-example output lengths compared against the step counter.
        encoder_hidden_states: Encoder outputs; axis 1 is used as the
            attention length and axis 2 as the attention size.
        final_state: Encoder final state, used as the decoder's initial state.
        seq_len_inp: Encoder input lengths, turned into a mask over the
            attention weights.

    Returns:
        The result of `concat()` on the TensorArray of per-step projected
        outputs produced by `raw_rnn`.
    """
    embedded_inputs, feed_previous_fn = self.prepare_decoder_input(
        decoder_inp)

    # Per-step slices of the embedded decoder input, read inside the loop.
    step_inputs_ta = tf.TensorArray(
        size=self.max_output, dtype=tf.float32).unstack(embedded_inputs)

    num_examples = tf.shape(embedded_inputs)[1]
    embed_dim = embedded_inputs.get_shape()[2].value

    # NOTE(review): the extent of this variable scope was reconstructed from
    # whitespace-collapsed source; confirm that the `raw_rnn` call belongs
    # inside it, since that affects variable name prefixes.
    with variable_scope.variable_scope("attention_decoder"):
        enc_steps = tf.shape(encoder_hidden_states)[1]
        enc_dim = encoder_hidden_states.get_shape()[2].value

        # W1 * h_t is computed as a 1x1 convolution, which needs a 4-D input.
        enc_states_4d = tf.expand_dims(encoder_hidden_states, 2)
        attention_vec_size = 64
        attn_w = variable_scope.get_variable(
            "AttnW", [1, 1, enc_dim, attention_vec_size])
        enc_features = nn_ops.conv2d(enc_states_4d, attn_w, [1, 1, 1, 1],
                                     "SAME")
        attn_v = variable_scope.get_variable("AttnV", [attention_vec_size])

        # Zero-valued context vector and attention weights for step 0.
        init_context = array_ops.zeros(
            array_ops.stack([num_examples, enc_dim]), dtype=tf.float32)
        init_context.set_shape([None, enc_dim])
        init_weights = array_ops.zeros(
            array_ops.stack([num_examples, enc_steps, 1, 1]),
            dtype=tf.float32)

        enc_mask = tf.sequence_mask(tf.cast(seq_len_inp, tf.int32),
                                    dtype=tf.float32)

        def attn_loop_function(time, cell_output, state, loop_state):
            def attention(query, prev_alpha):
                """Return (context vector, attention weights) for `query`."""
                with variable_scope.variable_scope("Attention"):
                    query_proj = linear(query, attention_vec_size, True)
                    query_proj = array_ops.reshape(
                        query_proj, [-1, 1, 1, attention_vec_size])
                    scores = math_ops.reduce_sum(
                        attn_v * math_ops.tanh(enc_features + query_proj),
                        [2, 3])
                    # Mask the softmax output, then renormalise so the
                    # remaining weights sum to (almost exactly) one.
                    masked = nn_ops.softmax(scores) * enc_mask
                    mass = tf.reduce_sum(masked, reduction_indices=[1],
                                         keep_dims=True) + 1e-12
                    denom = tf.tile(mass,
                                    tf.stack([1, tf.shape(masked)[1]]))
                    weights = masked / denom
                    weights = tf.expand_dims(weights, 2)
                    weights = tf.expand_dims(weights, 3)
                    # Attention-weighted sum of the encoder states.
                    context = math_ops.reduce_sum(weights * enc_states_4d,
                                                  [1, 2])
                    context = array_ops.reshape(context, [-1, enc_dim])
                return (context, weights)

            elements_finished = (time >= seq_len)
            finished = tf.reduce_all(elements_finished)

            if cell_output is None:
                # Call made before the first step: seed the state with the
                # encoder's final state and the attention with zeros.
                return (elements_finished, step_inputs_ta.read(time),
                        final_state, None, (init_context, init_weights))

            context, weights = attention(cell_output, loop_state[1])
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output, context],
                                self.cell.output_size, True)

            # When a loop function is supplied it replaces the decoder inputs.
            if feed_previous_fn is not None:
                simple_input = feed_previous_fn(output)
            else:
                simple_input = tf.cond(
                    finished,
                    lambda: tf.zeros([num_examples, embed_dim],
                                     dtype=tf.float32),
                    lambda: step_inputs_ta.read(time))

            # Merge the input and the attention context into one vector of
            # the input's own width.
            input_size = simple_input.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size")
            with variable_scope.variable_scope("InputProjection"):
                next_input = linear([simple_input, context], input_size,
                                    True)

            return (elements_finished, next_input, state, output,
                    (context, weights))

        emitted_ta, _state, _ = rnn.raw_rnn(self.cell, attn_loop_function)
        # Join the per-step outputs into a single tensor.
        return emitted_ta.concat()
def dynamic_rnn_decoder(cell, decoder_fn, inputs=None, sequence_length=None,
                        parallel_iterations=None, swap_memory=False,
                        time_major=False, scope=None, name=None):
    """Dynamic RNN decoder for a sequence-to-sequence model.

    The decoder is specified by an RNNCell and a decoder function and is
    unrolled with `raw_rnn`. It has two modes, training and inference, and
    expects a separate `decoder_fn` for each.

    In both modes `cell` performs the computation at every time step and
    `decoder_fn` models early stopping, output, state, next input and context.

    Training (`inputs` is not None): at every time step a slice of `inputs`
    is passed to `decoder_fn`, which may modify it and returns the input for
    the next step.

    Inference (`inputs` is None): the next input is produced solely by
    `decoder_fn`.

    Args:
        cell: An instance of RNNCell.
        decoder_fn: A function called as
            `decoder_fn(time, cell_state, cell_input, cell_output,
            context_state)` that returns
            `(done, next_cell_state, next_cell_input, emit_output,
            next_context_state)`. Examples can be found in decoder_fn.py.
        inputs: The inputs for decoding (embedded format). If
            `time_major == False` (default) the shape is
            `[batch_size, max_time, depth]`; if `time_major == True` it is
            `[max_time, batch_size, depth]`.
        sequence_length: (optional) An int32/int64 vector sized
            `[batch_size]`. Only consulted when `decoder_fn` returns `None`
            for `done`. If it is None while `inputs` is given, every sequence
            is assumed to span the full `max_time` of `inputs`.
        parallel_iterations: Forwarded to `raw_rnn`.
        swap_memory: Forwarded to `raw_rnn`.
        time_major: Layout of `inputs` and `outputs`, see above. With the
            default `False` the inputs are transposed on the way in and the
            outputs on the way out.
        scope: VariableScope for the `raw_rnn`; defaults to None.
        name: NameScope for the decoder; defaults to "dynamic_rnn_decoder".

    Returns:
        A tuple `(outputs, final_state, final_context_state)`:
            outputs: The stacked emitted outputs, batch-major unless
                `time_major` is True.
            final_state: The final state returned by `raw_rnn`.
            final_context_state: The context state returned by the final call
                to `decoder_fn`, or None if `decoder_fn` never produced one.

    Raises:
        ValueError: If `inputs` is not None and has fewer than three
            dimensions.
    """
    with ops.name_scope(name, "dynamic_rnn_decoder", [
            cell, decoder_fn, inputs, sequence_length, parallel_iterations,
            swap_memory, time_major, scope
    ]):
        if inputs is not None:
            inputs = ops.convert_to_tensor(inputs)

            # The transpose with perm=[1, 0, 2] and the [2] shape lookup
            # below both need rank 3, so reject lower ranks up front with a
            # clear error instead of failing later.
            input_rank = inputs.get_shape().ndims
            if input_rank is not None and input_rank < 3:
                raise ValueError(
                    "Inputs must have at least three dimensions")

            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            input_depth = int(inputs.get_shape()[2])
            batch_depth = inputs.get_shape()[1].value
            max_time = inputs.get_shape()[0].value
            if max_time is None:
                max_time = array_ops.shape(inputs)[0]

            if sequence_length is None:
                # Documented fallback: treat every sequence as spanning the
                # whole time axis of `inputs`.
                if batch_depth is not None:
                    length_batch = batch_depth
                else:
                    length_batch = array_ops.shape(inputs)[1]
                sequence_length = array_ops.fill(
                    array_ops.stack([length_batch]), max_time)

            # One TensorArray entry per time step.
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            inputs_ta = inputs_ta.unstack(inputs)

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_state is None:
                # First call, made by raw_rnn before the while loop.
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            else:
                # Subsequent calls, inside the while loop, after the cell ran.
                if isinstance(loop_state, tuple):
                    (done, context_state) = loop_state
                else:
                    done = loop_state
                    context_state = None

            if inputs is not None:  # training
                if cell_state is None:
                    next_cell_input = inputs_ta.read(0)
                else:
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        batch_size = array_ops.shape(done)[0]
                    # Index `max_time` is past the end of the TensorArray, so
                    # feed zeros on that step instead of reading.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input,
                                                  cell_output, context_state)
            else:  # inference
                # The next input comes from decoder_fn alone.
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # decoder_fn may leave the stopping decision to the lengths.
            if next_done is None:
                next_done = time >= sequence_length

            # The loop state carries `done`, plus the context when present.
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state, emit_output,
                    next_loop_state)

        outputs_ta, final_state, final_loop_state = rnn.raw_rnn(
            cell, loop_fn, parallel_iterations=parallel_iterations,
            swap_memory=swap_memory, scope=scope)
        outputs = outputs_ta.stack()

        # A tuple loop state means decoder_fn supplied a context state.
        if isinstance(final_loop_state, tuple):
            final_context_state = final_loop_state[1]
        else:
            final_context_state = None

        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, final_state, final_context_state
def decode(self, decoder_inputs, seq_len, encoder_hidden_states, final_state,
           seq_len_inp):
    """Run the attention decoder over `decoder_inputs` with `raw_rnn`.

    Args:
        decoder_inputs: Time major decoder IDs (TxB per the original
            docstring); ground truth during training and placeholder values
            at test time. Passed through `self.prepare_decoder_input`.
        seq_len: Output sequence length for each example in the minibatch;
            compared against the step counter to stop the loop.
        encoder_hidden_states: Encoder RNN output (BxTxH per the original
            docstring); axis 1 is used as the attention length and axis 2 as
            the attention size.
        final_state: Final hidden state of the encoder RNN, used to
            initialise the decoder state.
        seq_len_inp: Encoder input lengths, used to mask attention weights
            that fall on padding.

    Returns:
        The result of `concat()` on the TensorArray of per-step projected
        outputs produced by `raw_rnn`.
    """
    embedded_inputs, feed_previous_fn = self.prepare_decoder_input(
        decoder_inputs)
    projection_size = self.cell.output_size

    # NOTE(review): the extent of this variable scope was reconstructed from
    # whitespace-collapsed source; confirm that the `raw_rnn` call belongs
    # inside it, since that affects variable name prefixes.
    with variable_scope.variable_scope("attention_decoder"):
        num_examples = array_ops.shape(embedded_inputs)[1]
        embed_dim = embedded_inputs.get_shape()[2].value
        enc_steps = tf.shape(encoder_hidden_states)[1]
        enc_dim = encoder_hidden_states.get_shape()[2].value

        # W1 * h_t is computed as a 1x1 convolution, which needs a 4-D input.
        enc_states_4d = tf.expand_dims(encoder_hidden_states, 2)
        attention_vec_size = 64
        attn_w = variable_scope.get_variable(
            "AttnW", [1, 1, enc_dim, attention_vec_size])
        enc_features = nn_ops.conv2d(enc_states_4d, attn_w, [1, 1, 1, 1],
                                     "SAME")
        attn_v = variable_scope.get_variable("AttnV", [attention_vec_size])

        if self.use_conv:
            # Extra filters applied to the previous attention weights.
            conv_filter = variable_scope.get_variable(
                "AttnF",
                [self.conv_filter_width, 1, 1, self.conv_num_channels])
            conv_proj = variable_scope.get_variable(
                "AttnU",
                [1, 1, self.conv_num_channels, attention_vec_size])

        # Zero-valued context vector and attention weights for step 0.
        init_context = array_ops.zeros(
            array_ops.stack([num_examples, enc_dim]), dtype=tf.float32)
        init_context.set_shape([None, enc_dim])
        init_weights = array_ops.zeros(
            array_ops.stack([num_examples, enc_steps, 1, 1]),
            dtype=tf.float32)

        # Assumes a time-major arrangement of the embedded inputs.
        step_inputs_ta = tf.TensorArray(size=400, dtype=tf.float32,
                                        dynamic_size=True)
        step_inputs_ta = step_inputs_ta.unstack(embedded_inputs)

        enc_mask = tf.sequence_mask(tf.cast(seq_len_inp, tf.int32),
                                    dtype=tf.float32)

        def raw_loop_function(time, cell_output, state, loop_state):
            def attention(query, prev_alpha):
                """Return (context vector, attention weights) for `query`."""
                with variable_scope.variable_scope("Attention"):
                    query_proj = linear(query, attention_vec_size, True)
                    query_proj = array_ops.reshape(
                        query_proj, [-1, 1, 1, attention_vec_size])

                    if self.use_conv:
                        # Features derived from the previous step's weights.
                        conv_features = nn_ops.conv2d(
                            prev_alpha, conv_filter, [1, 1, 1, 1], "SAME")
                        location_features = nn_ops.conv2d(
                            conv_features, conv_proj, [1, 1, 1, 1], "SAME")
                        energy = math_ops.tanh(
                            enc_features + query_proj + location_features)
                    else:
                        energy = math_ops.tanh(enc_features + query_proj)
                    scores = math_ops.reduce_sum(attn_v * energy, [2, 3])

                    # Mask the softmax output, then renormalise so the
                    # remaining weights sum to (almost exactly) one.
                    masked = nn_ops.softmax(scores) * enc_mask
                    mass = tf.reduce_sum(masked, reduction_indices=[1],
                                         keep_dims=True) + 1e-12
                    denom = tf.tile(mass,
                                    tf.stack([1, tf.shape(masked)[1]]))
                    weights = masked / denom
                    weights = tf.expand_dims(weights, 2)
                    weights = tf.expand_dims(weights, 3)
                    # Attention-weighted sum of the encoder states.
                    context = math_ops.reduce_sum(weights * enc_states_4d,
                                                  [1, 2])
                    context = array_ops.reshape(context, [-1, enc_dim])
                return (context, weights)

            elements_finished = (time >= seq_len)
            finished = tf.reduce_all(elements_finished)

            if cell_output is None:
                # Call made before the first step: seed the state with the
                # encoder's final state and the attention with zeros.
                return (elements_finished, step_inputs_ta.read(time),
                        final_state, None, (init_context, init_weights))

            context, weights = attention(cell_output, loop_state[1])
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output, context], projection_size,
                                True)

            # When a loop function is supplied it replaces the decoder inputs.
            if feed_previous_fn is not None:
                simple_input = feed_previous_fn(output)
            else:
                simple_input = tf.cond(
                    finished,
                    lambda: tf.zeros([num_examples, embed_dim],
                                     dtype=tf.float32),
                    lambda: step_inputs_ta.read(time))

            # Merge the input and the attention context into one vector of
            # the input's own width.
            input_size = simple_input.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size")
            with variable_scope.variable_scope("InputProjection"):
                next_input = linear([simple_input, context], input_size,
                                    True)

            return (elements_finished, next_input, state, output,
                    (context, weights))

        emitted_ta, _state, _ = rnn.raw_rnn(self.cell, raw_loop_function)
        return emitted_ta.concat()
def dynamic_rnn_decoder(cell, decoder_fn, inputs=None, sequence_length=None,
                        parallel_iterations=None, swap_memory=False,
                        time_major=False, scope=None, name=None):
    """Unroll `cell` with `raw_rnn`, delegating per-step logic to `decoder_fn`.

    With `inputs` given (training), each step's slice of `inputs` is passed
    to `decoder_fn`; with `inputs=None` (inference), `decoder_fn` alone
    produces the next input.

    Args:
        cell: An instance of RNNCell.
        decoder_fn: Called as `decoder_fn(time, cell_state, cell_input,
            cell_output, context_state)`; returns `(done, next_cell_state,
            next_cell_input, emit_output, next_context_state)`.
        inputs: Optional decoder inputs, `[batch, time, depth]`, or
            `[time, batch, depth]` when `time_major` is True.
        sequence_length: Optional `[batch]` length vector, used only when
            `decoder_fn` returns `None` for `done`. When omitted while
            `inputs` is given, every sequence is assumed to span the full
            time axis of `inputs`.
        parallel_iterations: Forwarded to `raw_rnn`.
        swap_memory: Forwarded to `raw_rnn`.
        time_major: Whether `inputs` and the returned outputs are time-major.
        scope: VariableScope for the `raw_rnn`.
        name: NameScope for the decoder; defaults to "dynamic_rnn_decoder".

    Returns:
        A tuple `(outputs, final_state, final_context_state)`;
        `final_context_state` is None unless `decoder_fn` produced one.

    Raises:
        ValueError: If `inputs` is not None and has fewer than three
            dimensions.
    """
    scope_values = [cell, decoder_fn, inputs, sequence_length,
                    parallel_iterations, swap_memory, time_major, scope]
    with ops.name_scope(name, "dynamic_rnn_decoder", scope_values):
        if inputs is not None:
            inputs = ops.convert_to_tensor(inputs)

            # Rank 3 is required by the transpose (perm=[1, 0, 2]) and by the
            # [2] shape lookup below; fail early with a clear message.
            static_rank = inputs.get_shape().ndims
            if static_rank is not None and static_rank < 3:
                raise ValueError(
                    "Inputs must have at least three dimensions")

            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            input_depth = int(inputs.get_shape()[2])
            batch_depth = inputs.get_shape()[1].value
            max_time = inputs.get_shape()[0].value
            if max_time is None:
                max_time = array_ops.shape(inputs)[0]

            if sequence_length is None:
                # Fall back to "every sequence uses the full time axis" so
                # the `time >= sequence_length` test below has an operand.
                if batch_depth is not None:
                    length_batch = batch_depth
                else:
                    length_batch = array_ops.shape(inputs)[1]
                sequence_length = array_ops.fill(
                    array_ops.stack([length_batch]), max_time)

            # One TensorArray entry per time step.
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            inputs_ta = inputs_ta.unstack(inputs)

        def loop_fn(time, cell_output, cell_state, loop_state):
            first_call = cell_state is None
            if first_call:
                # raw_rnn calls once before the while loop with no output
                # and no loop state.
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            elif isinstance(loop_state, tuple):
                (done, context_state) = loop_state
            else:
                done = loop_state
                context_state = None

            if inputs is not None:  # training
                if first_call:
                    next_cell_input = inputs_ta.read(0)
                else:
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        batch_size = array_ops.shape(done)[0]
                    # Index `max_time` is past the end of the TensorArray, so
                    # feed zeros on that step instead of reading.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input,
                                                  cell_output, context_state)
            else:  # inference: decoder_fn produces the next input itself
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # decoder_fn may leave the stopping decision to the lengths.
            if next_done is None:
                next_done = time >= sequence_length

            # The loop state carries `done`, plus the context when present.
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state, emit_output,
                    next_loop_state)

        outputs_ta, final_state, final_loop_state = rnn.raw_rnn(
            cell, loop_fn, parallel_iterations=parallel_iterations,
            swap_memory=swap_memory, scope=scope)
        outputs = outputs_ta.stack()

        # A tuple loop state means decoder_fn supplied a context state.
        if isinstance(final_loop_state, tuple):
            final_context_state = final_loop_state[1]
        else:
            final_context_state = None

        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, final_state, final_context_state
def inference(self, inputs, masks):
    """Build the attention-weighted LSTM classifier graph.

    At each step the current input slice is multiplied by attention weights
    and summed over axis 2 before being fed to the cell. The weights come
    from a variable on the first step and from the previous cell output
    afterwards.

    Args:
        inputs: [batch_size, max_step, ...] with a statically known shape;
            reshaped to [batch_size, max_step, feature, in_channels].
        masks: [batch_size, max_step]; summed over the last axis to obtain
            the per-example sequence length. Also stored on `self.masks`.

    Returns:
        The per-step class scores after a final transpose with
        perm=[1, 0, 2] (original comment: [batch_size, max_step, num_class]).
    """
    self.masks = masks
    num_class = self.num_class
    hidden_size = self.hidden_size
    num_layer = self.num_layer
    keep_prob = self.keep_prob

    seq_length = tf.reduce_sum(tf.cast(masks, tf.int32), -1)

    # Static shape: [batch_size, max_step, ..., in_channels]
    static_dims = inputs.get_shape().as_list()
    batch_size, num_step, in_channel = (static_dims[0], static_dims[1],
                                        static_dims[-1])

    # Collapse everything between the step and channel axes into "feature".
    inputs = tf.reshape(inputs, [batch_size, num_step, -1, in_channel])
    reshaped_dims = inputs.get_shape()
    feature_size = reshaped_dims[2].value
    _channel_size = reshaped_dims[3].value

    def loop_fn(time, cell_output, loop_state):
        # The cell output is emitted unchanged (None before the first step).
        emit_output = cell_output

        if cell_output is None:
            # time == 0: the first attention weights are a variable.
            attend_weights = variable_with_weight_decay(
                'attend_weights', [batch_size, feature_size, 1],
                stddev=0.01, wd=0.0)
        else:
            # Later steps derive the attention from the previous output.
            attend_logits = affine_transform(cell_output, feature_size,
                                             scope_name='attend')
            attend_weights = tf.expand_dims(tf.nn.softmax(attend_logits),
                                            -1)

        elements_finished = (time >= seq_length)
        finished = tf.reduce_all(elements_finished)

        def _zero_input():
            return tf.zeros([batch_size, feature_size], dtype=tf.float32)

        def _attended_input():
            # `step_inputs_ta` is bound further down, before raw_rnn runs.
            return tf.reduce_sum(step_inputs_ta.read(time) * attend_weights,
                                 2)

        next_input = tf.cond(finished, _zero_input, _attended_input)
        return (elements_finished, next_input, emit_output, None)

    # LSTM layer, with dropout on its output when keep_prob < 1.
    layer_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple=True)
    if keep_prob < 1:
        layer_cell = tf.nn.rnn_cell.DropoutWrapper(
            layer_cell, output_keep_prob=keep_prob)
    # NOTE(review): the same cell object is reused for every layer; confirm
    # this is acceptable for the TensorFlow version in use.
    cell = tf.nn.rnn_cell.MultiRNNCell([layer_cell] * num_layer,
                                       state_is_tuple=True)

    # [batch_size, max_step, ...] -> [max_step, batch_size, ...]
    inputs = tf.transpose(inputs, perm=[1, 0, 2, 3])
    step_inputs_ta = tf.TensorArray(dtype=tf.float32, size=num_step)
    step_inputs_ta = step_inputs_ta.unpack(inputs)

    initial_state = cell.zero_state(batch_size, tf.float32)

    emitted_ta, _final_state, _ = raw_rnn(cell, loop_fn, initial_state)
    # Original comment: [max_step, batch_size, hidden_size]
    hidden_seq = tf.tanh(emitted_ta.pack())

    class_scores = affine_transform(hidden_seq, num_class,
                                    scope_name="softmax_linear")
    return tf.transpose(class_scores, perm=[1, 0, 2])
def dynamic_rnn_decoder(
        cell,  # multi-layer RNNCell (per the original author's note)
        decoder_fn,  # turns each step's output into the next step's input
        inputs=None,  # training only: embedded responses (see docstring)
        sequence_length=None,  # training only: response length vector
        parallel_iterations=None,  # forwarded to raw_rnn
        swap_memory=False,  # forwarded to raw_rnn
        time_major=False,  # original note: False in the experiments
        scope=None,  # original note: "decoder_rnn"
        name=None):  # name scope; defaults to "dynamic_rnn_decoder"
    """Dynamic RNN decoder for a seq2seq model.

    `dynamic_rnn_decoder` resembles `tf.python.ops.rnn.dynamic_rnn` in that it
    makes no assumption about the sequence length or batch size of the input.

    It has two modes, training and inference, and expects a separate
    `decoder_fn` for each. In both modes `cell` does the computation at every
    time step, and `decoder_fn` models early stopping, state, next input and
    context.

    Training (`inputs` is not None): at every time step a slice of `inputs`
    is passed to `decoder_fn`, which may modify it and returns the input for
    the next step.

    Inference (`inputs` is None): the next input is produced solely by
    `decoder_fn`.

    Args:
        cell: An instance of RNNCell.
        decoder_fn: A function taking time, cell state, cell input, cell
            output and context state. It returns an early-stopping vector,
            cell state, next input, cell output and context state.
        inputs: The inputs for decoding, in embedded form. Per the original
            author's note these are the embedded responses,
            `[batch_size, decoder_len, 600]` (300 for the word embedding plus
            3 x 100 for three entity embeddings).
            If `time_major == False` (default), this must be a `Tensor` of
            shape `[batch_size, max_time, depth]`.
            If `time_major == True`, this must be a `Tensor` of shape
            `[max_time, batch_size, depth]`.
            The input to `cell` at each time step is `[batch_size, depth]`.
        sequence_length: (optional) An int32/int64 vector sized
            `[batch_size]`. Only consulted when `decoder_fn` returns `None`
            for the stopping vector. If it is None while `inputs` is given,
            every sequence is assumed to span the full `max_time` of
            `inputs`.
        parallel_iterations: Forwarded to `raw_rnn`.
        swap_memory: Forwarded to `raw_rnn`.
        time_major: The shape format of the `inputs` and `outputs` Tensors.
            If true they are `[max_time, batch_size, depth]`, otherwise
            `[batch_size, max_time, depth]`. With the default `False` the
            inputs are transposed on the way in and the outputs on the way
            out.
        scope: VariableScope for the `raw_rnn`; defaults to None.
        name: NameScope for the decoder; defaults to "dynamic_rnn_decoder".

    Returns:
        A tuple `(outputs, final_state, final_context_state)`:
            outputs: The stacked emitted outputs, batch-major unless
                `time_major` is True.
            final_state: The final state returned by `raw_rnn`.
            final_context_state: The context state returned by the final call
                to `decoder_fn`, or None if `decoder_fn` never produced one.
                Useful when the context state holds data needed after the
                graph has run, e.g. a TensorArray of decoded outputs or
                alignments.

    Raises:
        ValueError: If `inputs` is not None and has fewer than three
            dimensions.
    """
    with ops.name_scope(name, "dynamic_rnn_decoder", [
            cell, decoder_fn, inputs, sequence_length, parallel_iterations,
            swap_memory, time_major, scope
    ]):
        if inputs is not None:
            # Convert the inputs to a tensor.
            inputs = ops.convert_to_tensor(inputs)

            # The transpose with perm=[1, 0, 2] and the [2] shape lookup
            # below both need rank 3, so reject lower ranks up front with a
            # clear error instead of failing later.
            input_rank = inputs.get_shape().ndims
            if input_rank is not None and input_rank < 3:
                raise ValueError(
                    "Inputs must have at least three dimensions")

            # Batch-major inputs are transposed to time-major.
            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            input_depth = int(inputs.get_shape()[2])  # feature size
            batch_depth = inputs.get_shape()[1].value  # static batch size
            max_time = inputs.get_shape()[0].value  # static decoder length
            if max_time is None:
                max_time = array_ops.shape(inputs)[0]

            if sequence_length is None:
                # Documented fallback: treat every sequence as spanning the
                # whole time axis of `inputs`.
                if batch_depth is not None:
                    length_batch = batch_depth
                else:
                    length_batch = array_ops.shape(inputs)[1]
                sequence_length = array_ops.fill(
                    array_ops.stack([length_batch]), max_time)

            # Store the decoder inputs in a TensorArray with one
            # [batch_size, input_depth] entry per time step.
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            inputs_ta = inputs_ta.unstack(inputs)

        def loop_fn(time, cell_output, cell_state, loop_state):
            """Callback that `raw_rnn` invokes between adjacent time steps.

            `raw_rnn` first calls it once, before the loop, to obtain the
            input for the first step. After each cell step it is called
            again as `loop_fn(t, cell_output, cell_state, loop_state)` and
            must return `(finished, next_input, next_state, emit_output,
            loop_state)`; `raw_rnn` feeds `next_input` and `next_state` to
            the cell and collects `emit_output` per step.

            Args:
                time: Index of the step about to run, starting at 0.
                cell_output: Output of the previous step (None on the first
                    call).
                cell_state: State of the cell after the previous step (None
                    on the first call).
                loop_state: Whether each sequence had finished after the
                    previous step, optionally paired with a context state
                    (e.g. a TensorArray of alignments).
            """
            if cell_state is None:
                # First call, before decoding starts: neither an output nor
                # a loop state may exist yet.
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            else:
                # Subsequent calls.
                if isinstance(loop_state, tuple):
                    (done, context_state) = loop_state
                else:
                    done = loop_state
                    context_state = None

            if inputs is not None:  # training
                if cell_state is None:
                    # Before the first step (original note: the first column
                    # is always GO_ID).
                    next_cell_input = inputs_ta.read(0)
                else:
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        # `done` marks per example whether the loop ended.
                        batch_size = array_ops.shape(done)[0]
                    # When time == max_time there is nothing left to read, so
                    # the next input is a [batch_size, input_depth] matrix of
                    # zeros; otherwise it is the slice for this time step.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input,
                                                  cell_output, context_state)
            else:  # inference
                # The next input comes from decoder_fn alone.
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # In training decoder_fn returns None for `next_done`; a sequence
            # is then done once time >= its length.
            if next_done is None:
                next_done = time >= sequence_length

            # The loop state carries `done`, plus the context (e.g.
            # alignments) when decoder_fn produced one.
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state, emit_output,
                    next_loop_state)

        # Run raw_rnn.
        outputs_ta, final_state, final_loop_state = rnn.raw_rnn(
            cell, loop_fn, parallel_iterations=parallel_iterations,
            swap_memory=swap_memory, scope=scope)
        outputs = outputs_ta.stack()

        # A tuple loop state means decoder_fn supplied a context state.
        if isinstance(final_loop_state, tuple):
            final_context_state = final_loop_state[1]
        else:
            final_context_state = None

        # Transpose back when the caller works batch-major.
        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, final_state, final_context_state
def dynamic_rnn_decoder(cell, decoder_fn, inputs=None, sequence_length=None,
                        parallel_iterations=None, swap_memory=False,
                        time_major=False, scope=None, name=None):
    """Dynamic RNN decoder for a sequence-to-sequence model.

    The decoder is specified by an RNNCell and a decoder function. It is
    similar to `tf.python.ops.rnn.dynamic_rnn` in that it makes no
    assumptions about the sequence length or batch size of the input.

    There are two modes, training and inference, and the caller is expected
    to supply a separate `decoder_fn` for each. In both modes `cell` performs
    the computation at every time step (through `raw_rnn`) and `decoder_fn`
    models early stopping, output, state, next input and context.

    Training: `inputs` is supplied. At every time step a slice of `inputs`
    is fed to `decoder_fn`, which may modify it and returns the input for the
    next time step.

    Inference: `inputs` is `None` and the next input comes solely from
    `decoder_fn`, which must then also report the `done` vector itself.

    Args:
      cell: An instance of RNNCell.
      decoder_fn: A function called as
        `decoder_fn(time, cell_state, cell_input, cell_output, context_state)`
        that returns `(done, cell_state, next_input, emit_output,
        context_state)`. `done` may be `None`, in which case
        `time >= sequence_length` is used.
      inputs: The inputs for decoding (embedded format), rank 3.
        If `time_major == False` (default): `[batch_size, max_time, depth]`.
        If `time_major == True`: `[max_time, batch_size, depth]`.
      sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
        If `inputs` is not None and `sequence_length` is None, every
        sequence is assumed to span the full `max_time` steps.
      parallel_iterations: Forwarded to `raw_rnn`.
      swap_memory: Forwarded to `raw_rnn`.
      time_major: Layout of `inputs` and of the returned `outputs` (see
        `inputs`). Batch-major data is transposed on the way in and out.
      scope: VariableScope for the `raw_rnn`; defaults to None.
      name: NameScope for the decoder; defaults to "dynamic_rnn_decoder".

    Returns:
      A pair `(outputs, state)` where `outputs` is the stacked emit output
      (`[batch_size, max_time, ...]` unless `time_major`, in which case
      `[max_time, batch_size, ...]`) and `state` is the final cell state
      returned by `raw_rnn`.

    Raises:
      ValueError: if `inputs` is not None and has fewer than three
        dimensions.
    """
    with ops.name_scope(name, "dynamic_rnn_decoder",
                        [cell, decoder_fn, inputs, sequence_length,
                         parallel_iterations, swap_memory, time_major, scope]):
        if inputs is not None:
            # Convert to tensor
            inputs = ops.convert_to_tensor(inputs)

            # Test input dimensions. Everything below indexes axis 2 and
            # transposes with a rank-3 permutation, so rank 3 is the real
            # minimum (the old `< 2` guard let rank-2 inputs through to an
            # obscure failure further down).
            if inputs.get_shape().ndims is not None and (
                    inputs.get_shape().ndims < 3):
                raise ValueError("Inputs must have at least three dimensions")

            # Setup of RNN (dimensions, sizes, length, initial state, dtype)
            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            # Get data input information
            input_depth = int(inputs.get_shape()[2])
            batch_depth = inputs.get_shape()[1].value
            max_time = inputs.get_shape()[0].value
            if max_time is None:
                max_time = array_ops.shape(inputs)[0]

            if sequence_length is None:
                # Documented default: treat every sequence as full length.
                # Without this, `time >= sequence_length` below fails on None.
                sequence_length = array_ops.fill(
                    [array_ops.shape(inputs)[1]], max_time)

            # Setup decoder inputs as TensorArray
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            # `unstack` replaces the deprecated `unpack`.
            inputs_ta = inputs_ta.unstack(inputs)

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_state is None:
                # first call, before while loop (in raw_rnn)
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            else:
                # subsequent calls, inside while loop, after cell execution
                if isinstance(loop_state, tuple):
                    (done, context_state) = loop_state
                else:
                    done = loop_state
                    context_state = None

            # call decoder function
            if inputs is not None:  # training
                # get next_cell_input
                if cell_state is None:
                    next_cell_input = inputs_ta.read(0)
                else:
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        batch_size = array_ops.shape(done)[0]
                    # Reading index `max_time` would be out of bounds, so
                    # feed zeros on the step after the last real input.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input,
                                                  cell_output, context_state)
            else:  # inference
                # next_cell_input is obtained through decoder_fn
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # check if we are done
            if next_done is None:  # training
                next_done = time >= sequence_length

            # build next_loop_state
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state,
                    emit_output, next_loop_state)

        # Run raw_rnn function
        outputs_ta, state, _ = rnn.raw_rnn(
            cell, loop_fn, parallel_iterations=parallel_iterations,
            swap_memory=swap_memory, scope=scope)
        # `stack` replaces the deprecated `pack`.
        outputs = outputs_ta.stack()

        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, state
def policy_gradient_pointer_attention_decoder(
        cell, scope, memory, decoder_inputs, initial_state, enc_padding_mask,
        prev_coverage=None,
        # tokens
        UNK_token=0, start_tokens=None, embeddings=None, vocab_size=50000,
        num_source_OOVs=None, enc_batch_extended_vocab=None,
        # some flags
        reinforce=False, pointer_gen=True, use_coverage=False,
        debug_mode=False,
        # for decoding
        initial_state_attention=False):
    """Pointer-generator attention decoder built on `raw_rnn`.

    At every step after the first, the loop computes attention over `memory`,
    projects `[cell_output, context]` through `output_kernel`, computes
    `p_gen`, and combines the vocabulary and attention distributions via
    `_calc_final_dist`. With `reinforce=True` (and
    `initial_state_attention=False`) the next input is the embedding of a
    token sampled from that final distribution; otherwise it is read from
    `decoder_inputs` (teacher forcing).

    Args:
      cell: RNN cell passed to `raw_rnn`. Its state is indexed with `[-1]`
        and the result is read via `.c` / `.h`.
      scope: Object exposing `Attention` and `Pointer` variable scopes.
      memory: Encoder states; axis 0 is read as batch size and axis 2 as the
        attention size (both must be statically known).
      decoder_inputs: Batch-major decoder inputs; axis 1 is read as
        `max_time` and axis 2 as the input size.
      initial_state: Cell state used at time 0.
      enc_padding_mask: Multiplied into the attention softmax before
        re-normalisation.
      prev_coverage: Not supported yet; must be None.
      UNK_token: Id substituted for sampled ids that fall outside the
        fixed vocabulary.
      start_tokens: Token ids used as the step-0 choice when `reinforce`.
      embeddings: Embedding matrix indexed by sampled token ids.
      vocab_size: Size of the fixed vocabulary / logits layer.
      num_source_OOVs: Forwarded to `_calc_final_dist`.
      enc_batch_extended_vocab: Forwarded to `_calc_final_dist`.
      reinforce: Sample the next input instead of reading `decoder_inputs`.
      pointer_gen: Whether to compute `p_gen`.
      use_coverage: Not supported yet; must be False.
      debug_mode: Use the distribution mode instead of sampling.
      initial_state_attention: Run attention once at time 0 from the initial
        state's last-layer `h`.

    Returns:
      Tuple `(final_dists, final_cell_state, attn_dists, p_gens, coverage,
      sampled_tokens, decoder_outputs_ta, debug_variables,
      final_loop_state)`. The three stacked histories are transposed to
      batch-major; `sampled_tokens` is None unless `reinforce`.

    Raises:
      NotImplementedError: if `use_coverage` is set or `prev_coverage` is
        given.
      ValueError: if `reinforce` is set without `embeddings` and
        `start_tokens`.
    """
    # some todo's
    # Compare against None explicitly: evaluating a tensor's truth value
    # raises TypeError in graph mode instead of the intended error.
    if use_coverage or prev_coverage is not None:
        raise NotImplementedError

    if reinforce and ((embeddings is None) or (start_tokens is None)):
        raise ValueError("when using reinforce, "
                         "please provide embeddings and start_tokens")

    print("TODO: Using tf.where to replace tf.cond in next_cell_input")
    print("change sampled_tokens not include <start>?")

    # input data
    max_time = decoder_inputs.get_shape()[1].value
    attn_size = memory.get_shape()[2].value
    batch_size = memory.get_shape()[0].value
    input_size = decoder_inputs.get_shape()[2].value
    # every sequence is decoded for the full max_time steps
    sequence_length = array_ops.tile([max_time], [batch_size])

    inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
    # TensorArray will unstack first dimension
    inputs_ta = inputs_ta.unstack(tf.transpose(decoder_inputs, perm=[1, 0, 2]))

    # NOTE(review): the collapsed source does not show which statements sit
    # inside this variable scope; confirm against the original layout.
    with variable_scope.variable_scope(scope.Attention):
        # layers
        # To calculate attention, we calculate
        #   v^T tanh(W_h h_i + W_s s_t + b_attn)
        # where h_i is an encoder state, and s_t a decoder state.
        # attn_vec_size is the length of the vectors v, b_attn,
        # (W_h h_i) and (W_s s_t).
        # We set it to be equal to the size of the encoder states.
        attention_vec_size = attn_size

        # memory kernel maps encoder hidden states into memory
        memory_kernel = core_layers.Dense(
            units=attention_vec_size, use_bias=False, name="memory_kernel")
        # query kernel maps decoder hidden state into query
        query_kernel = core_layers.Dense(
            units=attention_vec_size, use_bias=True, name="query_kernel")
        # input kernel maps [next_input, context] into the cell input
        input_kernel = core_layers.Dense(
            units=input_size, use_bias=True, name="input_kernel")
        # pgen_kernel maps states into p_gen
        pgen_kernel = core_layers.Dense(
            units=1, activation=tf.sigmoid, use_bias=True,
            name="pgen_kernel")
        # output_kernel maps cell_outputs into final cell outputs
        output_kernel = core_layers.Dense(
            units=cell.output_size, use_bias=True, name="output_kernel")
        # coverage kernels transforms coverage vector
        coverage_kernel = core_layers.Dense(
            units=attention_vec_size, use_bias=False,
            name="coverage_kernel")
        # logits_kernel maps attention outputs into vocabulary logits
        logits_kernel = core_layers.Dense(
            units=vocab_size, use_bias=True, name="logits_kernel")

        # Get the weight matrix W_h and apply it to each encoder state to
        # get (W_h h_i), the encoder features
        processed_memory = memory_kernel(memory)

    def masked_attention(score):
        """Softmax + enc_padding_mask + re-normalize"""
        # take softmax. shape (batch_size, attn_length)
        attn_dist = nn_ops.softmax(score)
        attn_dist *= enc_padding_mask
        # shape (batch_size)
        masked_sums = math_ops.reduce_sum(attn_dist, axis=1)
        # re-normalize
        return attn_dist / array_ops.reshape(masked_sums, [-1, 1])

    def _compute_attention(cell_output, coverage=None):
        """Return (context, alignments, coverage) for one decoder step."""
        # Pass the decoder state through a linear layer
        # (this is W_s s_t + b_attn in the paper)
        # shape (batch_size, attention_vec_size)
        processed_query = control_flow_ops.cond(
            # i.e. None or not set
            _is_zero_matrix(coverage),
            # v^T tanh(W_h h_i + W_s s_t + b_attn)
            true_fn=lambda: query_kernel(cell_output),
            # v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
            false_fn=lambda: (query_kernel(cell_output) +
                              coverage_kernel(coverage)))

        score = attention_utils._bahdanau_score(
            processed_query=processed_query,
            keys=processed_memory,
            normalize=False)

        # Calculate attention distribution
        alignments = masked_attention(score)

        if use_coverage:
            # update coverage
            coverage = coverage + alignments

        # Reshape from [batch_size, memory_time]
        # to [batch_size, 1, memory_time]
        expanded_alignments = array_ops.expand_dims(alignments, 1)
        # Context is the inner product of alignments and values along the
        # memory time dimension: [batch_size, 1, memory_time] x
        # [batch_size, memory_time, memory_size] gives
        # [batch_size, 1, memory_size]; the singleton dim is squeezed out.
        context = math_ops.matmul(expanded_alignments, memory)
        context = array_ops.squeeze(context, [1])

        return context, alignments, coverage

    def loop_fn(loop_time, cell_output, cell_state, loop_state):
        if cell_output is None:  # time == 0
            final_dist = None
            emit_output = final_dist  # == None for time == 0
            next_cell_state = initial_state  # encoder last states
            coverage = (array_ops.zeros([batch_size, attn_size])
                        if prev_coverage is None else prev_coverage)

            # context vector will initially be zeros
            # Ensure the second shape of attention vectors is set.
            context_vector = array_ops.zeros([batch_size, attn_size])
            context_vector.set_shape([None, attn_size])

            if initial_state_attention:
                with variable_scope.variable_scope(
                        scope.Attention, reuse=tf.AUTO_REUSE):
                    # true in decode mode
                    # Re-calculate the context vector from the previous
                    # step so that we can pass it through a linear layer
                    # with this step's input to get a modified version of
                    # the input in decode mode, this is what updates the
                    # coverage vector
                    context_vector, _, coverage = _compute_attention(
                        cell_output=next_cell_state[-1].h,
                        coverage=coverage)

            # all TensorArrays for recording sequences
            outputs_history = tensor_array_ops.TensorArray(
                dtype=tf.float32, size=0, dynamic_size=True)
            alignments_history = tensor_array_ops.TensorArray(
                dtype=tf.float32, size=0, dynamic_size=True)
            p_gens_history = tensor_array_ops.TensorArray(
                dtype=tf.float32, size=0, dynamic_size=True)
            coverages_history = tensor_array_ops.TensorArray(
                dtype=tf.float32, size=0, dynamic_size=True)
            sampled_tokens_history = tensor_array_ops.TensorArray(
                dtype=tf.int32, size=0, dynamic_size=True)
            # mostly used in debugging
            logits_history = tensor_array_ops.TensorArray(
                dtype=tf.float32, size=0, dynamic_size=True)
            vocab_dists_history = tensor_array_ops.TensorArray(
                dtype=tf.float32, size=0, dynamic_size=True)
            final_dists_history = tensor_array_ops.TensorArray(
                dtype=tf.float32, size=0, dynamic_size=True)

        else:
            # normal workflow:
            #   decoder_inputs = input_kernel(inputs; context)
            #   cell_output, states = cell(decoder_inputs, states)
            #   context, att_dist, coverage = attention(states, coverage)
            #   p_gen = pgen_kernel(...)
            #   cell_outputs = output_kernel(cell_output, context)
            # since raw_rnn encapsulates the cell call, we do this instead:
            #   context, att_dist, coverage = attention(states, coverage)
            #   p_gen = pgen_kernel(...)
            #   cell_outputs = output_kernel(cell_output, context)
            #   next_inputs = input_kernel(inputs; context)

            # no change
            next_cell_state = cell_state
            # get the cell state of last layer's cell
            last_layer_state = cell_state[-1]

            # cell_input is the input that produced this step's cell_output
            (sampled_tokens_history, outputs_history,
             alignments_history, p_gens_history,
             coverages_history, logits_history,
             vocab_dists_history, final_dists_history,
             coverage, cell_input) = loop_state

            # Run the attention mechanism.
            with variable_scope.variable_scope(
                    scope.Attention, reuse=tf.AUTO_REUSE):
                context_vector, attn_dist, coverage = _compute_attention(
                    cell_output=cell_output, coverage=coverage)

                # Concatenate the cell_output (= decoder state)
                # and the context vector, and pass them through
                # a linear layer. This is V[s_t, h*_t] + b in the paper
                attention_output = output_kernel(
                    array_ops.concat([cell_output, context_vector], -1))

            # update attention and cell_outputs
            # (index is loop_time - 1 because cell_output belongs to the
            # previous step)
            outputs_history = outputs_history.write(
                loop_time - 1, attention_output)
            alignments_history = alignments_history.write(
                loop_time - 1, attn_dist)
            coverages_history = coverages_history.write(
                loop_time - 1, coverage)

            # Calculate p_gen
            # NOTE(review): with pointer_gen=False, `p_gen` is never bound
            # and the `_calc_final_dist` call below fails with a NameError.
            if pointer_gen:
                with variable_scope.variable_scope(scope.Pointer):
                    p_gen = pgen_kernel(array_ops.concat([
                        context_vector, last_layer_state.c,
                        last_layer_state.h, cell_input], -1))
                # update p_gens_history distributions
                p_gens_history = p_gens_history.write(
                    loop_time - 1, p_gen)

            # distribution
            logits = logits_kernel(attention_output)
            vocab_dist = nn_ops.softmax(logits)
            final_dist = _calc_final_dist(
                vocab_dist=vocab_dist,
                attn_dist=attn_dist,
                p_gen=p_gen,
                batch_size=batch_size,
                vocab_size=vocab_size,
                num_source_OOVs=num_source_OOVs,
                enc_batch_extended_vocab=enc_batch_extended_vocab)

            # raw_rnn requires `emit_output` to have same
            # shape with cell.output_size
            # thus we have to output attention_output
            # but not the final_distribution
            emit_output = attention_output

            # save these for debugging
            logits_history = logits_history.write(
                loop_time - 1, logits)
            vocab_dists_history = vocab_dists_history.write(
                loop_time - 1, vocab_dist)
            final_dists_history = final_dists_history.write(
                loop_time - 1, final_dist)

        # generic
        elements_finished = (loop_time >= sequence_length)
        finished = math_ops.reduce_all(elements_finished)

        if reinforce and not initial_state_attention:
            # TODO: Google's reference code also stops generation once the
            # sampled token is STOP (logical_or with the time limit) and
            # uses prev_elements_finished; neither is done here yet.
            if cell_output is None:  # time == 0
                # when time == 0, use start_tokens
                tf.logging.info("Running RLModel")
                chosen_outputs = start_tokens
            else:
                def _multinomial_sample(probs):
                    # tf.multinomial only samples from
                    # logits (unnormalized probability)
                    # here we only have normalized probability
                    # thus we use distributions.Categorical
                    dist = categorical.Categorical(probs=probs)
                    # use argmax during debugging
                    if not debug_mode:
                        sampled_tokens = dist.sample()
                    else:
                        sampled_tokens = dist.mode()

                    # since final_dist = vocab_dist + copy_dist
                    # sampled_tokens can have index out-of vocab_dist
                    # in this case we cast them into UNK.
                    # Valid ids are 0..vocab_size-1, so id == vocab_size is
                    # already out of vocabulary: use >= rather than >.
                    UNKs = array_ops.ones_like(sampled_tokens) * UNK_token
                    sampled_tokens = array_ops.where(
                        math_ops.greater_equal(sampled_tokens, vocab_size),
                        UNKs, sampled_tokens, name="sampled_tokens")
                    return sampled_tokens

                # otherwise, do the sampling in sequence_length
                chosen_outputs = tf.to_int32(array_ops.where(
                    elements_finished,
                    array_ops.zeros([batch_size], dtype=tf.int32),
                    _multinomial_sample(final_dist)))

                sampled_tokens_history = sampled_tokens_history.write(
                    loop_time - 1, chosen_outputs)

            next_input = array_ops.gather(embeddings, chosen_outputs)
        else:
            # Teacher forcing: feed zeros once every sequence is finished,
            # because inputs_ta has no entry at index max_time.
            next_input = control_flow_ops.cond(
                finished,
                lambda: array_ops.zeros(
                    [batch_size, input_size], dtype=tf.float32),
                lambda: inputs_ta.read(loop_time))

        with variable_scope.variable_scope(scope.Attention):
            # next inputs = input_kernel(inp; context)
            next_cell_input = input_kernel(
                array_ops.concat([next_input, context_vector], -1))

        next_loop_state = (
            sampled_tokens_history, outputs_history,
            alignments_history, p_gens_history,
            coverages_history, logits_history,
            vocab_dists_history, final_dists_history,
            coverage, next_cell_input)

        return (elements_finished, next_cell_input,
                next_cell_state, emit_output, next_loop_state)

    with tf.variable_scope("policy"):
        (decoder_outputs_ta, final_cell_state,
         final_loop_state) = rnn_ops.raw_rnn(cell=cell, loop_fn=loop_fn)

    (sampled_tokens_history, outputs_history,
     alignments_history, p_gens_history,
     coverages_history, logits_history,
     vocab_dists_history, final_dists_history,
     coverage, cell_input) = final_loop_state

    # [time, batch, num_units] to [batch, time, num_units]
    final_dists = array_ops.transpose(
        final_dists_history.stack(), perm=[1, 0, 2])
    attn_dists = array_ops.transpose(
        alignments_history.stack(), perm=[1, 0, 2])
    p_gens = array_ops.transpose(
        p_gens_history.stack(), perm=[1, 0, 2])

    sampled_tokens = None
    if reinforce:
        # [time, batch] to [batch, time]
        sampled_tokens = array_ops.transpose(
            sampled_tokens_history.stack(), perm=(1, 0))

    # If using coverage, reshape it
    if coverage is not None:
        coverage = array_ops.reshape(coverage, [batch_size, -1])

    # used in debugging
    debug_variables = {
        "memory_kernel": memory_kernel,
        "query_kernel": query_kernel,
        "input_kernel": input_kernel,
        "pgen_kernel": pgen_kernel,
        "output_kernel": output_kernel,
        "coverage_kernel": coverage_kernel,
        "logits_kernel": logits_kernel,
        "memory": memory,
        "processed_memory": processed_memory}

    return (final_dists, final_cell_state, attn_dists, p_gens, coverage,
            sampled_tokens, decoder_outputs_ta, debug_variables,
            final_loop_state)
def dynamic_rnn_decoder(cell, decoder_fn, inputs=None, sequence_length=None,
                        parallel_iterations=None, swap_memory=False,
                        time_major=False, scope=None, name=None):
    """Dynamic RNN decoder for a sequence-to-sequence model.

    The decoder is specified by an RNNCell and a decoder function. It is
    similar to `tf.python.ops.rnn.dynamic_rnn`, but places no restriction on
    the sequence length or batch size of the input.

    There are two modes, training and inference, each with its own
    `decoder_fn`. In both modes `cell` performs the per-step computation
    (through `raw_rnn`) and `decoder_fn` controls early stopping, output,
    state, next input and context.

    Training: `inputs` is supplied. At every time step the current slice of
    `inputs` is fed to `decoder_fn`, which may update it and returns the
    input for the next time step.

    Inference: `inputs` is `None` and the next input comes solely from
    `decoder_fn`, which must then also report the `done` vector itself.

    Args:
      cell: An instance of RNNCell.
      decoder_fn: A function called as
        `decoder_fn(time, cell_state, cell_input, cell_output, context_state)`
        that returns `(done, cell_state, next_input, emit_output,
        context_state)`. `done` may be `None`, in which case
        `time >= sequence_length` is used.
      inputs: The inputs for decoding (embedded format), rank 3.
        If `time_major == False` (default): `[batch_size, max_time, depth]`.
        If `time_major == True`: `[max_time, batch_size, depth]`.
      sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
        If `inputs` is not None and `sequence_length` is None, every
        sequence is assumed to span the full `max_time` steps.
      parallel_iterations: Forwarded to `raw_rnn`.
      swap_memory: Forwarded to `raw_rnn`.
      time_major: Layout of `inputs` and of the returned `outputs` (see
        `inputs`). Batch-major data is transposed on the way in and out.
      scope: VariableScope for the `raw_rnn`; defaults to None.
      name: NameScope for the decoder; defaults to "dynamic_rnn_decoder".

    Returns:
      A tuple `(outputs, final_state, final_context_state)` where:
        outputs: the stacked emit output, `[batch_size, max_time, ...]`
          unless `time_major`, in which case `[max_time, batch_size, ...]`.
        final_state: the final cell state returned by `raw_rnn`.
        final_context_state: the context state returned by the final call
          to `decoder_fn`, or None if `decoder_fn` never produced one. This
          is useful when the context state carries data needed after the
          loop, e.g. a TensorArray of decoded outputs from a stochastic
          `decoder_fn`.

    Raises:
      ValueError: if `inputs` is not None and has fewer than three
        dimensions.
    """
    with ops.name_scope(name, "dynamic_rnn_decoder", [
            cell, decoder_fn, inputs, sequence_length, parallel_iterations,
            swap_memory, time_major, scope
    ]):
        if inputs is not None:
            # Convert to tensor
            inputs = ops.convert_to_tensor(inputs)

            # Test input dimensions. Everything below indexes axis 2 and
            # transposes with a rank-3 permutation, so rank 3 is the real
            # minimum (the old `< 2` guard let rank-2 inputs through to an
            # obscure failure further down).
            if inputs.get_shape().ndims is not None and (
                    inputs.get_shape().ndims < 3):
                raise ValueError("Inputs must have at least three dimensions")

            # Setup of RNN (dimensions, sizes, length, initial state, dtype)
            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            # Get data input information
            input_depth = int(inputs.get_shape()[2])
            batch_depth = inputs.get_shape()[1].value
            max_time = inputs.get_shape()[0].value
            if max_time is None:
                max_time = array_ops.shape(inputs)[0]

            if sequence_length is None:
                # Documented default: treat every sequence as full length.
                # Without this, `time >= sequence_length` below fails on None.
                sequence_length = array_ops.fill(
                    [array_ops.shape(inputs)[1]], max_time)

            # Setup decoder inputs as TensorArray
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            inputs_ta = inputs_ta.unstack(inputs)

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_state is None:
                # first call, before while loop (in raw_rnn)
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            else:
                # subsequent calls, inside while loop, after cell execution
                if isinstance(loop_state, tuple):
                    (done, context_state) = loop_state
                else:
                    done = loop_state
                    context_state = None

            # call decoder function
            if inputs is not None:  # training
                # get next_cell_input
                if cell_state is None:
                    next_cell_input = inputs_ta.read(0)
                else:
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        batch_size = array_ops.shape(done)[0]
                    # Reading index `max_time` would be out of bounds, so
                    # feed zeros on the step after the last real input.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input,
                                                  cell_output, context_state)
            else:  # inference
                # next_cell_input is obtained through decoder_fn
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # check if we are done
            if next_done is None:  # training
                next_done = time >= sequence_length

            # build next_loop_state
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state,
                    emit_output, next_loop_state)

        # Run raw_rnn function
        outputs_ta, final_state, final_loop_state = rnn.raw_rnn(
            cell, loop_fn, parallel_iterations=parallel_iterations,
            swap_memory=swap_memory, scope=scope)
        outputs = outputs_ta.stack()

        # Get final context_state, if generated by user
        if isinstance(final_loop_state, tuple):
            final_context_state = final_loop_state[1]
        else:
            final_context_state = None

        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, final_state, final_context_state
# Minimal raw_rnn example using the early three-argument loop_fn API
# (loop_fn(time, cell_output, loop_state), initial state passed to raw_rnn).
# `batch_size`, `max_time`, `input_depth`, `num_units`, `inputs`, `labels`,
# `raw_rnn` and `kits` are expected to be defined by surrounding code.

# One length per batch element: `time >= sequence_length` must yield a
# [batch_size] vector, so the placeholder cannot be [batch_size, max_time].
sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32)
inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
inputs_ta = inputs_ta.unpack(inputs)


def loop_fn(time, cell_output, loop_state):
    """Feed inputs_ta[time] to the cell until every sequence is finished."""
    emit_output = cell_output  # == None for time == 0
    elements_finished = (time >= sequence_length)
    finished = tf.reduce_all(elements_finished)
    # Once all sequences are done there is nothing left to read, so feed
    # zeros instead of indexing past the end of inputs_ta.
    next_input = tf.cond(
        finished,
        lambda: tf.zeros([batch_size, input_depth], dtype=tf.float32),
        lambda: inputs_ta.read(time))
    next_loop_state = None
    return (elements_finished, next_input, emit_output, next_loop_state)


cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True)
initial_state = cell.zero_state(batch_size, tf.float32)
outputs_ta, final_state, _ = raw_rnn(cell, loop_fn, initial_state)
outputs = outputs_ta.pack()
loss_op = kits.loss(outputs, labels)

# (leftover pdb breakpoint removed so the script can run unattended)
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
# NOTE(review): no feed_dict is passed although placeholders are defined
# above; confirm how `sequence_length` (and `inputs`, if a placeholder) is fed.
sess.run(loss_op)
def decode(self, decoder_inp, seq_len, encoder_hidden_states, final_state,
           seq_len_inp):
    """Decode an output sequence with `self.cell` driven by `raw_rnn`.

    The decoder input is prepared by `self.prepare_decoder_input`, which
    also returns an optional loop function mapping the previous cell output
    to the next input. During training the next input is the ground-truth
    step, or, when a loop function exists, a per-step random choice between
    ground truth and the loop function's prediction (scheduled sampling,
    https://arxiv.org/abs/1506.03099) controlled by `self.samp_prob`.
    Outside training the loop function's prediction is always fed back.

    `encoder_hidden_states` and `seq_len_inp` are accepted but not used
    here.

    Returns:
      The emitted outputs joined with `TensorArray.concat()`, i.e. the
      per-step outputs concatenated along their first axis (not stacked
      into a separate time axis).
    """
    embedded_inputs, feed_previous_fn = self.prepare_decoder_input(
        decoder_inp)

    # A TensorArray lets the loop read one time step at a time.
    step_inputs = tf.TensorArray(
        size=self.max_output, dtype=tf.float32).unstack(embedded_inputs)

    n_batch = tf.shape(embedded_inputs)[1]
    n_emb = embedded_inputs.get_shape()[2].value

    with variable_scope.variable_scope("rnn_decoder"):

        def simple_loop_function(time, cell_output, state, loop_state):
            # Per-example and global "finished" flags.
            per_example_done = time >= tf.cast(seq_len, tf.int32)
            all_done = tf.reduce_all(per_example_done)

            if cell_output is None:
                # Step 0: start from the encoder's final state and read the
                # first decoder input (the <GO> tag); nothing to emit yet.
                return (per_example_done, step_inputs.read(time),
                        final_state, None, None)

            def zero_input():
                return tf.zeros([n_batch, n_emb], dtype=tf.float32)

            def ground_truth_input():
                return step_inputs.read(time)

            def predicted_input():
                return feed_previous_fn(cell_output)

            if not self.isTraining:
                # Evaluation: always feed back the previous prediction.
                step_input = predicted_input()
            elif feed_previous_fn is None:
                # Plain teacher forcing until every sequence is finished.
                step_input = tf.cond(all_done, zero_input,
                                     ground_truth_input)
            else:
                # Scheduled sampling: draw once per step, outside the cond.
                coin = tf.random_uniform([])
                step_input = tf.cond(
                    all_done,
                    zero_input,
                    lambda: tf.cond(
                        tf.greater_equal(coin, self.samp_prob),
                        ground_truth_input,
                        predicted_input))

            return (per_example_done, step_input, state, cell_output, None)

        # emitted_ta holds one entry per executed time step.
        emitted_ta, _, _ = rnn.raw_rnn(self.cell, simple_loop_function)

    return emitted_ta.concat()
def my_attentive(self, cell, sequence_length, inputs, encoder_final_state,
                 memory_fw, memory_bw):
    """Run `cell` with `raw_rnn`, attending over two memories at each step.

    From the second step on, `self.attention_step` is called once per
    memory with the current cell state. Each resulting context vector is
    projected by a dense layer to half the input feature size and
    concatenated onto the next decoder input; the unprojected context
    vectors are concatenated onto the cell output to form the emitted
    output. The attention weights of each step are recorded (zeroed for
    sequences that already ended).

    Args:
      cell: RNN cell passed to `raw_rnn`.
      sequence_length: Per-example lengths compared against the loop time.
      inputs: Time-major decoder inputs; axis 0 is unstacked into steps,
        axis 1 is used as batch size and axis 2 as feature size.
      encoder_final_state: Cell state used at time 0.
      memory_fw: Memory handed to `self.attention_step` (forward).
      memory_bw: Memory handed to `self.attention_step` (backward).

    Returns:
      `(outputs, final_state, loop)`: the stacked emitted outputs, the final
      cell state, and the stacked attention-weight history (one
      forward/backward pair per step).
    """
    n_steps = self.trainingManager.configs.max_seq_len_decoder
    n_batch = tf.shape(inputs)[1]
    n_features = inputs.shape[2]
    half_features = n_features // 2

    step_inputs = tf.TensorArray(
        dtype=tf.float32, size=n_steps).unstack(inputs)
    weights_ta = tf.TensorArray(size=n_steps, dtype=tf.float32)

    def _mask_finished(weights, still_running):
        # Keep weights only for sequences that have not ended yet.
        return tf.where(still_running, weights, tf.zeros_like(weights))

    def loop_fn(cur_time, cur_cell_output, cur_cell_state, cur_loop_state):
        if cur_cell_output is None:  # time == 0
            # The emit placeholder fixes the per-example emit size (batch
            # dimension deliberately omitted); it is twice the cell output
            # size to make room for the two context vectors.
            emit = tf.zeros([cell.output_size * 2], dtype=tf.float32)
            # No attention yet: use zero context vectors for the first input.
            ctx_fw = tf.zeros([n_batch, half_features], dtype=tf.float32)
            ctx_bw = tf.zeros([n_batch, half_features], dtype=tf.float32)
            state_out = encoder_final_state
            loop_state_out = weights_ta
        else:
            state_out = cur_cell_state
            raw_ctx_fw, weights_fw = self.attention_step(
                cur_cell_state, memory_fw)
            raw_ctx_bw, weights_bw = self.attention_step(
                cur_cell_state, memory_bw)
            # Project each context vector to half the input feature size.
            ctx_fw = tf.layers.dense(raw_ctx_fw, units=half_features)
            ctx_bw = tf.layers.dense(raw_ctx_bw, units=half_features)
            emit = tf.concat(
                (cur_cell_output, raw_ctx_fw, raw_ctx_bw), axis=1)
            # cur_cell_output belongs to the previous step, hence "- 1".
            prev_step = cur_time - 1
            still_running = prev_step < sequence_length
            loop_state_out = cur_loop_state.write(
                prev_step,
                (_mask_finished(weights_fw, still_running),
                 _mask_finished(weights_bw, still_running)))

        done = cur_time >= sequence_length
        all_done = tf.reduce_all(done)

        def _padding_input():
            # Nothing left to read once every sequence is finished.
            return tf.zeros([n_batch, n_features * 2], dtype=tf.float32)

        def _real_input():
            return tf.concat(
                (step_inputs.read(cur_time), ctx_fw, ctx_bw), axis=1)

        next_input = tf.cond(all_done, _padding_input, _real_input)
        return done, next_input, state_out, emit, loop_state_out

    outputs_ta, final_state, loop_ta = raw_rnn(
        cell, loop_fn, swap_memory=True)
    return outputs_ta.stack(), final_state, loop_ta.stack()
def my_attentive_concat_memory(self, cell, sequence_length, inputs,
                               encoder_final_state, memory):
    """Unroll ``cell`` with ``raw_rnn`` using dot-product attention.

    At every step after the first, ``memory`` is scored against the last
    element of the cell state, the scores are soft-maxed over axis 0 and
    used to build a weighted sum of ``memory``.  That context vector is
    concatenated to the cell output to form the emitted output, and a dense
    projection of it (``input_features`` units) is concatenated to the next
    decoder input.

    Args:
        cell: RNN cell handed to ``raw_rnn``; ``cell.output_size`` is read to
            size the emitted output.
        sequence_length: Compared element-wise with the time counter to
            decide which sequences are finished (original notes: ``[batch]``).
        inputs: Time-major decoder inputs.  Axis 0 is unstacked into time
            steps, the dynamic size of axis 1 is used as the batch size and
            the static size of axis 2 as ``input_features``.
        encoder_final_state: Used as the initial cell state.
        memory: Tensor attended over (original notes:
            ``[seq_len_encoder, batch, hidden]``).

    Returns:
        A tuple ``(outputs, final_state, loop)``: the stacked emitted
        outputs, the final cell state returned by ``raw_rnn`` and the stacked
        per-step attention weights.
    """
    dynamic_shape = tf.shape(inputs)
    max_steps = self.trainingManager.configs.max_seq_len_decoder
    batch_size = dynamic_shape[1]
    input_features = inputs.shape[2]

    # One TensorArray slot per time step of the decoder inputs.
    step_inputs = tf.TensorArray(dtype=tf.float32, size=max_steps)
    step_inputs = step_inputs.unstack(inputs)
    # Collects the attention weights; threaded through raw_rnn as loop state.
    weights_ta = tf.TensorArray(size=max_steps, dtype=tf.float32)

    def loop_fn(time, cell_output, cell_state, loop_state):
        # Values used on the very first call (cell_output is None).  The
        # emitted value has no batch dimension: on the first call it only
        # describes the per-example output size.
        # NOTE(review): `output_size * 2` assumes the context vector is as
        # wide as the cell output - confirm against the shape of `memory`.
        emit = tf.zeros([cell.output_size * 2], dtype=tf.float32)
        context = tf.zeros([batch_size, input_features], dtype=tf.float32)

        if cell_output is not None:
            # Regular step: `time` is one ahead of the output being handled,
            # hence the `time - 1` below.
            next_state = cell_state

            # Score every memory slot against the last state element
            # (original notes: the top layer of a stacked cell).
            scores = tf.reduce_sum(tf.multiply(memory, cell_state[-1]),
                                   axis=2)
            # Normalise over axis 0, then swap the two axes (original
            # notes: result is [batch, seq_len_encoder]).
            weights = tf.transpose(tf.nn.softmax(scores, axis=0))

            # Reverse the axes of memory so it broadcasts against `weights`,
            # weight it, restore the axis order and sum over axis 0.
            memory_reversed = tf.transpose(memory, [2, 1, 0])
            weighted = tf.multiply(memory_reversed, weights)
            raw_context = tf.reduce_sum(tf.transpose(weighted, [2, 1, 0]),
                                        axis=0)

            # Projected to `input_features` units before being fed back.
            context = tf.layers.dense(raw_context, units=input_features)
            # The emitted output carries the unprojected context.
            emit = tf.concat((cell_output, raw_context), axis=1)

            # Zero the weights of sequences that have already ended.
            masked = tf.where(time - 1 < sequence_length, weights,
                              tf.zeros_like(weights))
            next_loop_state = loop_state.write(time - 1, masked)
        else:
            # First call: seed the recurrence.
            next_state = encoder_final_state
            next_loop_state = weights_ta

        # `time` is zero based here: finished once it reaches the length.
        finished = (time >= sequence_length)
        # Scalar; true only when every sequence in the batch is finished.
        all_done = tf.reduce_all(finished)

        def _padding_input():
            # Nothing can be read past the end, so feed zeros of the same
            # static shape as a real input.
            return tf.zeros([batch_size, input_features * 2],
                            dtype=tf.float32)

        def _real_input():
            # Current decoder input followed by the projected context.
            return tf.concat((step_inputs.read(time), context), axis=1)

        next_input = tf.cond(all_done, _padding_input, _real_input)
        return finished, next_input, next_state, emit, next_loop_state

    outputs_ta, final_state, loop_ta = raw_rnn(cell, loop_fn,
                                               swap_memory=True)
    # NOTE(review): given the transpose above, the stacked weights are
    # presumably [decoder_time, batch, seq_len_encoder] - confirm.
    return outputs_ta.stack(), final_state, loop_ta.stack()
def rnn_decoder_attention(cell, num_attention_units, attention_inputs,
                          decoder_inputs, initial_state, decoder_length,
                          decoder_fn, attention_length=None,
                          weight_initializer=None, encoder_projection=None,
                          parallel_iterations=None, swap_memory=False,
                          time_major=False, scope=None):
    """Dynamic RNN decoder with attention for a sequence-to-sequence model.

    The decoder is specified by the RNNCell `cell` and is unrolled
    dynamically with `rnn.raw_rnn`, so it makes no assumption about the
    sequence length of its inputs or about how many steps it can decode.
    This allows `attention_inputs` and `decoder_inputs` to have `None` as
    their sequence-length dimension.

    Two decoders sharing the same variables are built:

    * a training decoder, which is fed all of `attention_inputs` and one
      slice of `decoder_inputs` at every time step;
    * an evaluation decoder, which reads `decoder_inputs` only at
      `time == 0` (the 'start-of-sequence' symbol, see Bahdanau et al.,
      2014, https://arxiv.org/abs/1409.0473) and afterwards feeds
      `decoder_fn(state)` back as the next input.

    `initial_state` is used to initialise the decoder RNN.  By default it is
    passed through a fully connected layer with a tanh non-linearity, which
    allows the encoder and decoder to have a different number of units.

    `decoder_length` is necessary because it determines how many time steps
    are decoded for each sample.
    TODO: Could make it optional for training.

    `attention_length` is used for masking the alpha values computed over
    `attention_inputs`.  If set to None (default) no mask is applied.

    Extensions of interest:
        - Support time_major=True for attention_input (not using conv2D)
        - Look into rnn.raw_rnn so we don't need to handle zero states
        - Make 'alpha' usable
        - Don't use decoder_inputs for evaluation
        - Make an attention class to allow custom attention functions
        - Multi-layered decoder
        - Beam search

    Args:
        cell: An instance of RNNCell.
        num_attention_units: The number of units used for attention.
        attention_inputs: The encoded inputs.  The input used to attend over
            at every time step, must be of size
            `[batch_size, seq_len, features]`.
        decoder_inputs: The inputs for decoding (embedded format).
            If `time_major == False` (default), this must be a `Tensor` of
            shape `[batch_size, max_time, ...]`.
            If `time_major == True`, this must be a `Tensor` of shape
            `[max_time, batch_size, ...]`.
            The input to `cell` at each time step will be a `Tensor` with
            dimensions `[batch_size, ...]`.
        initial_state: An initial state for the decoder's RNN.  Must be
            `[batch_size, num_features]`, where `num_features` does not have
            to match `cell.state_size`, as a projection is performed at the
            beginning of the decoding.
        decoder_length: An int32/int64 vector sized `[batch_size]`.
        decoder_fn: A function that takes a state and returns an embedding.
            Here is an example of a `decoder_fn`:

                def decoder_fn(embeddings, weight, bias):
                    def dec_fn(state):
                        prev = tf.matmul(state, weight) + bias
                        return tf.gather(embeddings, tf.argmax(prev, 1))
                    return dec_fn

        attention_length: (optional) An int32/int64 vector sized
            `[batch_size]`.
        weight_initializer: (optional) An initializer used for attention.
            Defaults to a truncated normal with `stddev=0.1`.
        encoder_projection: (optional) Given that the encoder might have a
            different size than the decoder, we project the initial state as
            described in Bahdanau, 2014 (https://arxiv.org/abs/1409.0473).
            Called as `encoder_projection(initial_state, cell.output_size)`;
            defaults to `fully_connected` with `activation_fn=tanh`.
        parallel_iterations: (Default: 32). The number of iterations to run
            in parallel.  Those operations which do not have any temporal
            dependency and can be run in parallel, will be.  This parameter
            trades off time for space.  Values >> 1 use more memory but take
            less time, while smaller values use less memory but computations
            take longer.  Forwarded to `rnn.raw_rnn`.
        swap_memory: Transparently swap the tensors produced in forward
            inference but needed for back prop from GPU to CPU.  This allows
            training RNNs which would typically not fit on a single GPU,
            with very minimal (or no) performance penalty.  Forwarded to
            `rnn.raw_rnn`.
        time_major: The shape format of the `inputs` and `outputs` Tensors.
            If true, these `Tensors` must be shaped
            `[max_time, batch_size, depth]`.  If false, these `Tensors` must
            be shaped `[batch_size, max_time, depth]`.  Using
            `time_major = True` is a bit more efficient because it avoids
            transposes at the beginning and end of the RNN calculation.
            However, most TensorFlow data is batch-major, so by default this
            function accepts input and emits output in batch-major form.
        scope: VariableScope for the created subgraph; defaults to
            "decoder".

    Returns:
        A pair (outputs_train, outputs_eval) where:
            outputs_train/eval: the RNN output `Tensor`.
                If time_major == False (default), this will be a `Tensor`
                shaped `[batch_size, max_time, cell.output_size]`.
                If time_major == True, this will be a `Tensor` shaped
                `[max_time, batch_size, cell.output_size]`.
        NOTICE: outputs_train is commonly used for calculating loss.

    Raises:
        #TODO Put up some raises
    """
    # NOTE(review): `concat(axis, values)`, `TensorArray.pack/unpack` and
    # `keep_dims` below look like the pre-1.0 TensorFlow spellings - confirm
    # the targeted TensorFlow version before upgrading them.
    with vs.variable_scope(scope or "decoder") as varscope:
        # Project initial_state as described in Bahdanau et al. 2014
        # https://arxiv.org/abs/1409.0473
        if encoder_projection is None:
            encoder_projection = partial(fully_connected,
                                         activation_fn=math_ops.tanh)
        state = encoder_projection(initial_state, cell.output_size)

        # Setup of RNN (dimensions, sizes, length, initial state, dtype)
        # Setup dtype
        dtype = state.dtype
        if not time_major:
            # [batch, seq, features] -> [seq, batch, features]
            decoder_inputs = array_ops.transpose(decoder_inputs,
                                                 perm=[1, 0, 2])

        # Get data input information
        batch_size = array_ops.shape(decoder_inputs)[1]
        attention_input_depth = int(attention_inputs.get_shape()[2])
        decoder_input_depth = int(decoder_inputs.get_shape()[2])
        attention_max_length = array_ops.shape(attention_inputs)[1]

        # Setup decoder inputs as TensorArray
        decoder_inputs_ta = tensor_array_ops.TensorArray(
            dtype, size=0, dynamic_size=True)
        decoder_inputs_ta = decoder_inputs_ta.unpack(decoder_inputs)

        # Single-argument print() calls emit the same text on Python 2 and 3.
        print("attention_input_depth, %s" % (attention_input_depth,))
        print("decoder_input_depth, %s" % (decoder_input_depth,))

        # Setup attention weight
        if weight_initializer is None:
            weight_initializer = init_ops.truncated_normal_initializer(
                stddev=0.1)
        with vs.variable_scope("attention"):
            v_a = vs.get_variable('v_a',
                                  shape=[num_attention_units],
                                  initializer=weight_initializer)
            W_a = vs.get_variable(
                'W_a',
                shape=[cell.output_size, num_attention_units],
                initializer=weight_initializer)

        # Encode attention_inputs for attention.  This part does not depend
        # on the decoder state, so it is computed once, outside the loop.
        hidden = array_ops.reshape(
            attention_inputs,
            [-1, attention_max_length, 1, attention_input_depth])
        part1 = conv2d(hidden, num_attention_units, (1, 1))
        part1 = array_ops.squeeze(part1, [2])  # Squeeze out the third dimension

        def context_fn(state, inp):
            """Return (`inp` concatenated with the attention context, alpha)."""
            with vs.variable_scope("attention"):
                part2 = math_ops.matmul(state, W_a)  # [batch, attn_units]
                part2 = array_ops.expand_dims(part2, 1)  # [batch, 1, attn_units]
                cmb_attn = part1 + part2  # [batch, seq, attn_units]
                e = math_ops.reduce_sum(
                    v_a * math_ops.tanh(cmb_attn), [2])  # [batch, seq]
                alpha = nn.softmax(e)
                # Mask, then renormalise so the kept weights sum to one.
                if attention_length is not None:
                    alpha = math_ops.to_float(mask(attention_length)) * alpha
                    alpha = alpha / math_ops.reduce_sum(
                        alpha, [1], keep_dims=True)
                # [batch, features]
                context = math_ops.reduce_sum(
                    array_ops.expand_dims(alpha, 2) * attention_inputs, [1])
                context.set_shape([None, attention_input_depth])
                con = array_ops.concat(1, (inp, context))
                print("con, %s" % (con.get_shape(),))
                return con, alpha

        # loop function train
        def loop_fn_train(time, cell_output, cell_state, loop_state):
            print("@@@TRAIN@@@")
            emit_output = cell_output
            if cell_output is None:
                next_cell_state = state  # Use projection of prev encoder state
            else:
                next_cell_state = cell_state
            elements_finished = (time >= decoder_length)  # TODO handle seq_len=None
            finished = math_ops.reduce_all(elements_finished)
            next_input, _ = control_flow_ops.cond(
                finished,
                # Handle zero states
                lambda: (array_ops.zeros(
                    [batch_size, decoder_input_depth + attention_input_depth],
                    dtype=dtype),
                         array_ops.zeros([batch_size, attention_max_length],
                                         dtype=dtype)),
                # Read data and calculate attention
                lambda: context_fn(next_cell_state,
                                   decoder_inputs_ta.read(time)))
            # it loses its shape at some point
            next_input.set_shape(
                [None, decoder_input_depth + attention_input_depth])
            next_loop_state = None
            return (elements_finished, next_input, next_cell_state,
                    emit_output, next_loop_state)

        # loop function eval
        def loop_fn_eval(time, cell_output, cell_state, loop_state):
            print("@@@EVAL@@@")
            emit_output = cell_output
            if cell_output is None:
                next_cell_state = state
            else:
                next_cell_state = cell_state
            elements_finished = (time >= decoder_length)  # TODO handle seq_len=None
            finished = math_ops.reduce_all(elements_finished)
            varscope.reuse_variables()
            next_input, _ = control_flow_ops.cond(
                finished,
                # Handle zero states
                lambda: (array_ops.zeros(
                    [batch_size, decoder_input_depth + attention_input_depth],
                    dtype=dtype),
                         array_ops.zeros([batch_size, attention_max_length],
                                         dtype=dtype)),
                # Read data and calculate attention: after the first step the
                # previous state is decoded and fed back instead of reading
                # decoder_inputs.
                lambda: control_flow_ops.cond(
                    math_ops.greater(time, 0),
                    lambda: context_fn(next_cell_state,
                                       decoder_fn(next_cell_state)),
                    lambda: context_fn(next_cell_state,
                                       decoder_inputs_ta.read(0))))
            # next_input loses its shape at some point
            next_input.set_shape(
                [None, decoder_input_depth + attention_input_depth])
            next_loop_state = None
            print("next_input, %s" % (next_input.get_shape(),))
            return (elements_finished, next_input, next_cell_state,
                    emit_output, next_loop_state)

        # Run raw_rnn function.  parallel_iterations and swap_memory are
        # forwarded so the documented parameters actually take effect; with
        # their defaults this is the same call as passing nothing.
        outputs_ta_train, _, _ = rnn.raw_rnn(
            cell, loop_fn_train,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory)
        # The eval decoder shares every variable with the train decoder.
        varscope.reuse_variables()
        outputs_ta_eval, _, _ = rnn.raw_rnn(
            cell, loop_fn_eval,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory)
        outputs_train = outputs_ta_train.pack()
        outputs_eval = outputs_ta_eval.pack()
        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs_train = array_ops.transpose(outputs_train, perm=[1, 0, 2])
            outputs_eval = array_ops.transpose(outputs_eval, perm=[1, 0, 2])
        return outputs_train, outputs_eval