def get_next_inputs():
    """ Retrieves the inputs for the next time step """
    inputs_next_step = sample_ids
    inputs_emb_next_step = self._input_layer(self._embedding_fn(inputs_next_step))      # [bat, beam, in_sz]

    # Applying mask
    # inputs_one_hot: (batch, beam, 1, VOC, 1)
    # mask_t: (batch, 1, 1, VOC, VOC)
    # next_mask: (batch, beam, VOC)
    inputs_one_hot = array_ops.one_hot(inputs_next_step, self.vocab_size)[:, :, None, :, None]
    mask_t = sparse_ops.sparse_tensor_to_dense(_slice_mask(self._mask,
                                                           [-1, next_time, -1, -1],
                                                           time_major=self._time_major))[:, None, :, :, :]
    mask_t.set_shape([None, 1, 1, self.vocab_size, self.vocab_size])
    next_mask = math_ops.reduce_sum(inputs_one_hot * mask_t, axis=[2, 3])
    next_mask = gen_math_ops.minimum(next_mask, 1.)

    # Prevents this branch from executing eagerly
    with ops.control_dependencies([inputs_emb_next_step, next_mask]):
        return MaskedInputs(inputs=array_ops.identity(inputs_emb_next_step),
                            mask=array_ops.identity(next_mask))
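# A minimal NumPy sketch (hypothetical batch/vocab sizes, beam dimension dropped)
# of the masking math above: multiplying a one-hot of the previous token against
# the (VOC, VOC) mask and reduce-summing selects the row of allowed next tokens,
# i.e. next_mask[b] = min(mask_t[b, token[b], :], 1).
import numpy as np

vocab_size = 4
token = np.array([2, 0])                                                # (batch,)
mask_t = np.random.RandomState(0).randint(0, 2, (2, vocab_size, vocab_size)).astype(np.float32)

inputs_one_hot = np.eye(vocab_size, dtype=np.float32)[token]            # (batch, VOC)
next_mask = np.minimum(np.sum(inputs_one_hot[:, :, None] * mask_t, axis=1), 1.)
assert (next_mask == np.minimum(mask_t[np.arange(2), token], 1.)).all()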
def get_next_inputs():
    """ Retrieves the inputs for the next time step """

    def get_training_inputs():
        """ Selecting training inputs """
        read_op = self._input_tas.read(next_time)
        with ops.control_dependencies([read_op]):
            return array_ops.identity(read_op)

    def get_sample_inputs():
        """ Selecting greedy/sample inputs """
        return sample_ids

    inputs_next_step = control_flow_ops.case(
        [(gen_math_ops.equal(self._decoder_type, TRAINING_DECODER), get_training_inputs),
         (gen_math_ops.equal(self._decoder_type, GREEDY_DECODER), get_sample_inputs),
         (gen_math_ops.equal(self._decoder_type, SAMPLE_DECODER), get_sample_inputs)],
        default=get_training_inputs)
    inputs_emb_next_step = self._input_layer(self._order_embedding_fn(inputs_next_step))
    candidate_next_step = self._candidate_tas.read(next_time)
    candidate_emb_next_step = self._candidate_embedding_fn(candidate_next_step)

    # Prevents this branch from executing eagerly
    with ops.control_dependencies([inputs_emb_next_step, candidate_next_step, candidate_emb_next_step]):
        return CandidateInputs(inputs=array_ops.identity(inputs_emb_next_step),
                               candidates=array_ops.identity(candidate_next_step),
                               candidates_emb=array_ops.identity(candidate_emb_next_step))
def get_zero_memory_and_attn():
    """ Time = 0, we don't concatenate to memory and attention is all 0. """
    next_memory = state.memory
    next_attention = array_ops.zeros([batch_size, self._attention_layer_size], dtype=inputs.dtype)
    with ops.control_dependencies([next_memory, next_attention]):
        return array_ops.identity(next_memory), array_ops.identity(next_attention)
def get_next_inputs():
    """ Retrieves the inputs for the next time step """
    inputs_next_step = sample_ids
    inputs_emb_next_step = self._input_layer(self._order_embedding_fn(inputs_next_step))
    candidate_next_step = self._candidate_tas.read(next_time)
    candidate_emb_next_step = self._candidate_embedding_fn(candidate_next_step)

    # Prevents this branch from executing eagerly
    with ops.control_dependencies([inputs_emb_next_step, candidate_next_step, candidate_emb_next_step]):
        return CandidateInputs(inputs=array_ops.identity(inputs_emb_next_step),
                               candidates=array_ops.identity(candidate_next_step),
                               candidates_emb=array_ops.identity(candidate_emb_next_step))
def get_next_memory_and_attn():
    """ Gets the next memory and attention """
    next_memory = array_ops.concat([state.memory,                                             # [b, t, mem_size]
                                    array_ops.expand_dims(self._input_fn(inputs), axis=1)],
                                   axis=1)
    next_attention = self._compute_attention(inputs, next_memory)
    with ops.control_dependencies([next_memory, next_attention]):
        return array_ops.identity(next_memory), array_ops.identity(next_attention)
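# A minimal NumPy sketch (hypothetical sizes) of the memory update above: each
# step appends the projected input as one extra memory slot, so after t steps
# the memory has shape (batch, t, mem_size) and attention is recomputed over
# the grown memory.
import numpy as np

batch_size, mem_size = 2, 4
memory = np.zeros((batch_size, 0, mem_size), dtype=np.float32)      # empty memory at t=0
for _ in range(3):
    new_slot = np.random.rand(batch_size, 1, mem_size).astype(np.float32)
    memory = np.concatenate([memory, new_slot], axis=1)
assert memory.shape == (batch_size, 3, mem_size)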
def sample():
    """ Sampling """
    logits = outputs if self._softmax_temperature is None else outputs / self._softmax_temperature
    sample_id_sampler = categorical.Categorical(logits=logits)
    sample_op = sample_id_sampler.sample(seed=self._seed)
    with ops.control_dependencies([sample_op]):
        return array_ops.identity(sample_op)
def greedy():
    """ Selecting greedy """
    argmax_op = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
    with ops.control_dependencies([argmax_op]):
        return array_ops.identity(argmax_op)
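# A minimal standalone sketch (public TensorFlow 1.x API, 1.13+, graph mode,
# hypothetical shapes) of the greedy/sample branch pattern above: each branch
# builds its op, then returns an identity under a control dependency, which is
# what the source's comments describe as preventing the branch from executing
# eagerly before tf.cond selects it.
import tensorflow as tf

logits = tf.compat.v1.placeholder(tf.float32, [None, 5])
use_greedy = tf.compat.v1.placeholder(tf.bool, [])

def _greedy():
    argmax_op = tf.argmax(logits, axis=-1, output_type=tf.int32)
    with tf.control_dependencies([argmax_op]):
        return tf.identity(argmax_op)

def _sample():
    sample_op = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)[:, 0]
    with tf.control_dependencies([sample_op]):
        return tf.identity(sample_op)

sample_ids = tf.cond(use_greedy, _greedy, _sample)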
def _zero_grad():
    """ Returns a zeroed-out gradient """
    zero_values = array_ops.zeros_like(grad.values)
    with ops.control_dependencies([zero_values]):
        return ops.IndexedSlices(values=array_ops.identity(zero_values),
                                 indices=math_ops.cast(grad.indices, dtypes.int64),
                                 dense_shape=math_ops.cast(grad.dense_shape, dtypes.int64))
def _take_grad():
    """ Computes the gradient from the accumulator """
    avg_grad = grad_accum.take_indexed_slices_grad(num_required=1)
    with ops.control_dependencies([avg_grad]):
        return ops.IndexedSlices(values=array_ops.identity(avg_grad.values),
                                 indices=avg_grad.indices,
                                 dense_shape=avg_grad.dense_shape)
def greedy():
    """ Selecting greedy """
    argmax_id = math_ops.cast(math_ops.argmax(cell_outputs, axis=-1), dtypes.int32)
    nb_candidate = array_ops.shape(candidate)[1]
    candidate_ids = math_ops.reduce_sum(array_ops.one_hot(argmax_id, nb_candidate, dtype=dtypes.int32) * candidate,
                                        axis=-1)
    with ops.control_dependencies([candidate_ids]):
        return array_ops.identity(candidate_ids)
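# A minimal NumPy sketch (hypothetical values) of the candidate selection above:
# reduce_sum(one_hot(argmax_id, nb_candidate) * candidate, axis=-1) picks
# candidate[b, argmax_id[b]] for each batch row, i.e. a batched gather.
import numpy as np

candidate = np.array([[11, 12, 13],
                      [21, 22, 23]], dtype=np.int32)    # (batch, nb_candidate)
argmax_id = np.array([2, 0])                            # (batch,)

one_hot = np.eye(candidate.shape[1], dtype=np.int32)[argmax_id]
candidate_ids = np.sum(one_hot * candidate, axis=-1)    # [13, 21]
assert (candidate_ids == candidate[np.arange(2), argmax_id]).all()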
def get_next_inputs():
    """ Retrieves the inputs for the next time step """

    def get_training_inputs():
        """ Selecting training inputs """
        read_op = self._input_tas.read(next_time)
        with ops.control_dependencies([read_op]):
            return array_ops.identity(read_op)

    def get_sample_inputs():
        """ Selecting greedy/sample inputs """
        return sample_ids

    inputs_next_step = control_flow_ops.case(
        [(gen_math_ops.equal(self._decoder_type, TRAINING_DECODER), get_training_inputs),
         (gen_math_ops.equal(self._decoder_type, GREEDY_DECODER), get_sample_inputs),
         (gen_math_ops.equal(self._decoder_type, SAMPLE_DECODER), get_sample_inputs)],
        default=get_training_inputs)
    inputs_emb_next_step = self._input_layer(self._embedding_fn(inputs_next_step))

    # Applying mask
    # inputs_one_hot: (b, 1, VOC, 1)
    # mask_t: (b, 1, VOC, VOC)
    # next_mask: (b, VOC) -- DenseTensor
    inputs_one_hot = array_ops.one_hot(inputs_next_step, self.vocab_size)[:, None, :, None]
    mask_t = _slice_mask(self._mask, [-1, next_time, -1, -1], time_major=self._time_major)
    next_mask = sparse_ops.sparse_reduce_sum(inputs_one_hot * mask_t, axis=[1, 2])
    next_mask = gen_math_ops.minimum(next_mask, 1.)
    next_mask.set_shape([None, self.vocab_size])

    # Prevents this branch from executing eagerly
    with ops.control_dependencies([inputs_emb_next_step, next_mask]):
        return MaskedInputs(inputs=array_ops.identity(inputs_emb_next_step),
                            mask=array_ops.identity(next_mask))
def sample():
    """ Sampling """
    logits = cell_outputs if self._softmax_temperature is None else cell_outputs / self._softmax_temperature
    sample_id_sampler = categorical.Categorical(logits=logits)
    sample_ids = sample_id_sampler.sample(seed=self._seed)
    nb_candidate = array_ops.shape(candidate)[1]
    reduce_op = math_ops.reduce_sum(array_ops.one_hot(sample_ids, nb_candidate, dtype=dtypes.int32) * candidate,
                                    axis=-1)
    with ops.control_dependencies([reduce_op]):
        return array_ops.identity(reduce_op)
def zero_state(self, batch_size, dtype):
    """ Return an initial (zero) state tuple for this `AttentionWrapper`.

        :param batch_size: `0D` integer tensor: the batch size.
        :param dtype: The internal state data type.
        :return: An `AttentionWrapperState` tuple containing zeroed-out tensors and, possibly,
                 empty `TensorArray` objects.
    """
    with ops.name_scope(type(self).__name__ + 'ZeroState', values=[batch_size]):
        if self._initial_cell_state is not None:
            cell_state = self._initial_cell_state
        else:
            cell_state = self._cell.zero_state(batch_size, dtype)
        error_message = (
            'When calling zero_state of AttentionWrapper %s: ' % self._base_name
            + 'Non-matching batch sizes between the memory (encoder output) and the requested batch '
            'size. Are you using the BeamSearchDecoder? If so, make sure your encoder output has been '
            'tiled to beam_width via tf.contrib.seq2seq.tile_batch, and the batch_size= argument '
            'passed to zero_state is batch_size * beam_width.')
        with ops.control_dependencies(self._batch_size_checks(batch_size, error_message)):
            cell_state = nest.map_structure(lambda state: array_ops.identity(state, name='checked_cell_state'),
                                            cell_state)
        initial_alignments = [attention_mechanism.initial_alignments(batch_size, dtype)
                              for attention_mechanism in self._attention_mechanisms]
        return AttentionWrapperState(
            cell_state=cell_state,
            time=array_ops.zeros([], dtype=dtypes.int32),
            attention=_zero_state_tensors(self._attention_layer_size, batch_size, dtype),
            alignments=self._item_or_tuple(initial_alignments),
            attention_state=self._item_or_tuple(
                attention_mechanism.initial_state(batch_size, dtype)
                for attention_mechanism in self._attention_mechanisms),
            alignment_history=self._item_or_tuple(
                tensor_array_ops.TensorArray(dtype, size=0, dynamic_size=True, element_shape=alignment.shape)
                if self._alignment_history else ()
                for alignment in initial_alignments))
def get_dropout_mask():
    """ Computes the dropout mask """
    # random_tensor = uniform [keep_probs, 1.0 + keep_probs)
    random_tensor = keep_probs
    random_tensor += seeded_random(seeds,
                                   offset=offset if offset is not None else seeded_dropout.offset,
                                   shape=noise_shape[1:],
                                   dtype=inputs.dtype,
                                   seed=seed)

    # 0. if [keep_probs, 1.0) and 1. if [1.0, 1.0 + keep_probs)
    binary_tensor = gen_math_ops.floor(random_tensor)
    ret = math_ops.divide(inputs, keep_probs) * binary_tensor
    ret.set_shape(inputs.get_shape())

    # Setting control flow ops to avoid computing this function if not required
    with ops.control_dependencies([ret]):
        return array_ops.identity(ret)
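# A minimal NumPy sketch (hypothetical shapes, seeding elided) of the
# inverted-dropout math above: floor(keep_probs + U[0, 1)) is 1 with
# probability keep_probs, and surviving units are scaled by 1 / keep_probs
# so the expected activation is unchanged.
import numpy as np

rng = np.random.RandomState(0)
inputs = rng.randn(4, 8).astype(np.float32)
keep_probs = 0.75

random_tensor = keep_probs + rng.uniform(size=inputs.shape)     # [keep_probs, 1 + keep_probs)
binary_tensor = np.floor(random_tensor)                         # 0. or 1.
ret = inputs / keep_probs * binary_tensor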
def _take_grad():
    """ Computes the gradient from the accumulator """
    avg_grad = grad_accum.take_grad(num_required=1)
    with ops.control_dependencies([avg_grad]):
        return array_ops.identity(avg_grad)
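# A minimal standalone sketch (TensorFlow 1.x, hypothetical shapes) of the
# accumulator pattern that _take_grad/_zero_grad plug into: gradients are
# pushed into a tf.ConditionalAccumulator, and tf.cond picks between taking
# the averaged gradient and returning zeros.
import tensorflow as tf

grad = tf.ones([3])
grad_accum = tf.ConditionalAccumulator(dtype=tf.float32, shape=[3])
apply_op = grad_accum.apply_grad(grad, local_step=0)
should_take = tf.compat.v1.placeholder(tf.bool, [])

def _take_grad():
    avg_grad = grad_accum.take_grad(num_required=1)
    with tf.control_dependencies([avg_grad]):
        return tf.identity(avg_grad)

def _zero_grad():
    zero_like_grad = tf.zeros_like(grad)
    with tf.control_dependencies([zero_like_grad]):
        return tf.identity(zero_like_grad)

with tf.control_dependencies([apply_op]):
    maybe_grad = tf.cond(should_take, _take_grad, _zero_grad)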
def get_next_alignments():
    """ Returns the next alignments """
    next_align = self._alignments_ta.read(next_time)
    with ops.control_dependencies([next_align]):
        return array_ops.identity(next_align)
def call(self, inputs, state):
    """ Performs a step of attention-wrapped RNN.

        1) Mix the `inputs` and previous step's `attention` output via `cell_input_fn`.
        2) Call the wrapped `cell` with this input and its previous state.
        3) Score the cell's output with `attention_mechanism`.
        4) Calculate the alignments by passing the score through the `normalizer`.
        5) Calculate the context vector as the inner product between the alignments and the
           attention_mechanism's values (memory).
        6) Calculate the attention output by concatenating the cell output and context through the
           attention layer (a linear layer with `attention_layer_size` outputs).

        :param inputs: (Possibly nested tuple of) Tensor, the input at this time step.
        :param state: An instance of `AttentionWrapperState` containing tensors from the previous time step.
        :return: A tuple `(attention_or_cell_output, next_state)`, where:
                 - `attention_or_cell_output` depending on `output_attention`.
                 - `next_state` is an instance of `AttentionWrapperState` containing the state calculated
                   at this time step.
    """
    # pylint: disable=arguments-differ
    if not isinstance(state, AttentionWrapperState):
        raise TypeError('Expected state to be instance of AttentionWrapperState. Received %s instead. '
                        % type(state))

    # Step 1: Calculate the true inputs to the cell based on the previous attention value.
    cell_inputs = self._cell_input_fn(inputs, state.attention)
    cell_state = state.cell_state
    cell_output, next_cell_state = self._cell(cell_inputs, cell_state)

    cell_batch_size = tensor_shape.dimension_value(cell_output.shape[0]) or array_ops.shape(cell_output)[0]
    error_message = (
        'When applying AttentionWrapper %s: ' % self.name
        + 'Non-matching batch sizes between the memory (encoder output) and the query (decoder output). '
        'Are you using the BeamSearchDecoder? You may need to tile your memory input via the '
        'tf.contrib.seq2seq.tile_batch function with argument multiple=beam_width.')

    with variable_scope.variable_scope(self._name_or_scope, 'AttentionWrapper', [inputs, state]):
        with ops.control_dependencies(self._batch_size_checks(cell_batch_size, error_message)):
            cell_output = array_ops.identity(cell_output, name='checked_cell_output')

        if self._is_multi:
            previous_attention_state = state.attention_state
            previous_alignment_history = state.alignment_history
        else:
            previous_attention_state = [state.attention_state]
            previous_alignment_history = [state.alignment_history]

        # Computing attention
        all_alignments = []
        all_attentions = []
        all_attention_states = []
        maybe_all_histories = []
        for i, attention_mechanism in enumerate(self._attention_mechanisms):
            attention, alignments, next_attention_state = \
                _compute_attention(attention_mechanism,
                                   cell_output,
                                   previous_attention_state[i],
                                   self._attention_layers[i] if self._attention_layers else None)
            alignment_history = previous_alignment_history[i].write(state.time, alignments) \
                if self._alignment_history else ()

            all_attention_states.append(next_attention_state)
            all_alignments.append(alignments)
            all_attentions.append(attention)
            maybe_all_histories.append(alignment_history)

        # Building next state
        attention = array_ops.concat(all_attentions, 1)
        next_state = AttentionWrapperState(time=state.time + 1,
                                           cell_state=next_cell_state,
                                           attention=attention,
                                           attention_state=self._item_or_tuple(all_attention_states),
                                           alignments=self._item_or_tuple(all_alignments),
                                           alignment_history=self._item_or_tuple(maybe_all_histories))

        # Returning
        if self._output_attention:
            return attention, next_state
        return cell_output, next_state
def __init__(self,
             cell,
             attention_mechanism,
             attention_layer_size=None,
             alignment_history=False,
             cell_input_fn=None,
             output_attention=True,
             initial_cell_state=None,
             name_or_scope='AttentionWrapper',
             attention_layer=None):
    """ Construct the `AttentionWrapper`.

        :param cell: An instance of `RNNCell`.
        :param attention_mechanism: A list of `AttentionMechanism` instances or a single instance.
        :param attention_layer_size: A list of Python integers or a single Python integer,
                                     the depth of the attention (output) layer(s).
        :param alignment_history: Python boolean, whether to store alignment history from all time steps
                                  in the final output state.
        :param cell_input_fn: (optional) A `callable`. The default is: concat([inputs, attention], axis=-1).
        :param output_attention: Python bool. If `True` (default), the output at each time step is the
                                 attention value.
        :param initial_cell_state: The initial state value to use for the cell when the user calls
                                   `zero_state()`.
        :param name_or_scope: String or VariableScope to use when creating ops.
        :param attention_layer: A list of `tf.layers.Layer` instances or a single `tf.layers.Layer`
                                instance taking the context and cell output as inputs to generate
                                attention at each time step. If None (default), use the context as
                                attention at each time step.

        **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in `AttentionWrapper`,
        then you must ensure that:

        - The encoder output has been tiled to `beam_width` via `tf.contrib.seq2seq.tile_batch`
          (NOT `tf.tile`).
        - The `batch_size` argument passed to the `zero_state` method of this wrapper is equal to
          `true_batch_size * beam_width`.
        - The initial state created with `zero_state` above contains a `cell_state` value containing
          properly tiled final state from the encoder.
    """
    # pylint: disable=too-many-arguments
    self._name_or_scope = name_or_scope
    with variable_scope.variable_scope(name_or_scope, 'AttentionWrapper'):
        super(AttentionWrapper, self).__init__()
        rnn_cell_impl.assert_like_rnncell("cell", cell)

        # Attention mechanism
        if isinstance(attention_mechanism, (list, tuple)):
            self._is_multi = True
            attention_mechanisms = attention_mechanism
            for attn_mechanism in attention_mechanisms:
                if not isinstance(attn_mechanism, AttentionMechanism):
                    raise TypeError('attention_mechanism must contain only instances of '
                                    'AttentionMechanism, saw type: %s' % type(attn_mechanism).__name__)
        else:
            self._is_multi = False
            if not isinstance(attention_mechanism, AttentionMechanism):
                raise TypeError('attention_mechanism must be an AttentionMechanism or list of multiple '
                                'AttentionMechanism instances, saw type: %s'
                                % type(attention_mechanism).__name__)
            attention_mechanisms = (attention_mechanism,)

        # Cell input function
        if cell_input_fn is None:
            cell_input_fn = lambda inputs, attention: array_ops.concat([inputs, attention], -1)
        elif not callable(cell_input_fn):
            raise TypeError('cell_input_fn must be callable, saw type: %s' % type(cell_input_fn).__name__)

        # Attention layer size
        if attention_layer_size is not None and attention_layer is not None:
            raise ValueError('Only one of attention_layer_size and attention_layer should be set')

        if attention_layer_size is not None:
            attention_layer_sizes = tuple(attention_layer_size
                                          if isinstance(attention_layer_size, (list, tuple))
                                          else (attention_layer_size,))
            if len(attention_layer_sizes) != len(attention_mechanisms):
                raise ValueError('If provided, attention_layer_size must contain exactly one integer per '
                                 'attention_mechanism, saw: %d vs %d'
                                 % (len(attention_layer_sizes), len(attention_mechanisms)))
            self._attention_layers = tuple(core.Dense(attention_layer_size,
                                                      name='attention_layer',
                                                      use_bias=False,
                                                      dtype=attention_mechanisms[i].dtype)
                                           for i, attention_layer_size in enumerate(attention_layer_sizes))
            self._attention_layer_size = sum(attention_layer_sizes)

        elif attention_layer is not None:
            self._attention_layers = tuple(attention_layer
                                           if isinstance(attention_layer, (list, tuple))
                                           else (attention_layer,))
            if len(self._attention_layers) != len(attention_mechanisms):
                raise ValueError('If provided, attention_layer must contain exactly one layer per '
                                 'attention_mechanism, saw: %d vs %d'
                                 % (len(self._attention_layers), len(attention_mechanisms)))
            self._attention_layer_size = \
                sum(tensor_shape.dimension_value(
                    layer.compute_output_shape(
                        [None, cell.output_size
                         + tensor_shape.dimension_value(mechanism.values.shape[-1])])[-1])
                    for layer, mechanism in zip(self._attention_layers, attention_mechanisms))

        else:
            self._attention_layers = None
            self._attention_layer_size = sum(tensor_shape.dimension_value(attention_mechanism.values.shape[-1])
                                             for attention_mechanism in attention_mechanisms)

        self._cell = cell
        self._attention_mechanisms = attention_mechanisms
        self._cell_input_fn = cell_input_fn
        self._output_attention = output_attention
        self._alignment_history = alignment_history

        if initial_cell_state is None:
            self._initial_cell_state = None
        else:
            final_state_tensor = nest.flatten(initial_cell_state)[-1]
            state_batch_size = (tensor_shape.dimension_value(final_state_tensor.shape[0])
                                or array_ops.shape(final_state_tensor)[0])
            error_message = (
                'When constructing AttentionWrapper %s: ' % self._base_name
                + 'Non-matching batch sizes between the memory (encoder output) and initial_cell_state. '
                'Are you using the BeamSearchDecoder? You may need to tile your initial state via the '
                'tf.contrib.seq2seq.tile_batch function with argument multiple=beam_width.')
            with ops.control_dependencies(self._batch_size_checks(state_batch_size, error_message)):
                self._initial_cell_state = \
                    nest.map_structure(lambda state: array_ops.identity(state, name='check_initial_cell_state'),
                                       initial_cell_state)
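# A minimal usage sketch (TensorFlow 1.x, hypothetical sizes; encoder outputs
# are stubbed with zeros) for a wrapper configured like the one above, using
# the stock tf.contrib.seq2seq pieces as stand-ins for illustration.
import tensorflow as tf

batch_size, src_len, num_units = 32, 20, 256
encoder_outputs = tf.zeros([batch_size, src_len, num_units])
src_lengths = tf.fill([batch_size], src_len)

cell = tf.nn.rnn_cell.LSTMCell(num_units)
attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units=num_units,
                                                        memory=encoder_outputs,
                                                        memory_sequence_length=src_lengths)
attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell,
                                                attention_mechanism,
                                                attention_layer_size=num_units,
                                                alignment_history=False)
initial_state = attn_cell.zero_state(batch_size, tf.float32)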
def training():
    """ Selecting training / teacher forcing """
    fill_op = gen_array_ops.fill([array_ops.shape(outputs)[0]], -1)
    with ops.control_dependencies([fill_op]):
        return array_ops.identity(fill_op)
def get_training_inputs():
    """ Selecting training inputs """
    read_op = self._input_tas.read(next_time)
    with ops.control_dependencies([read_op]):
        return array_ops.identity(read_op)
def _zero_grad():
    """ Returns a zeroed-out gradient """
    zero_like_grad = array_ops.zeros_like(grad)
    with ops.control_dependencies([zero_like_grad]):
        return array_ops.identity(zero_like_grad)
def start_inputs():
    """ Returns the GO_ID initial input """
    embed_op = self._embedding_fn(self._start_inputs)
    with ops.control_dependencies([embed_op]):
        return array_ops.identity(embed_op)
def training_inputs():
    """ Returns the training initial input """
    embed_op = self._embedding_fn(self._input_tas.read(0))
    with ops.control_dependencies([embed_op]):
        return array_ops.identity(embed_op)