def get_next_inputs():
    """ Builds the masked inputs of the next time step from the sampled ids.

        Uses `self`, `sample_ids` and `next_time` from the enclosing scope.

        :return: A MaskedInputs holding the embedded next inputs and the next mask (values capped at 1.)
    """
    next_ids = sample_ids

    # Embedding the sampled ids, then passing them through the input layer
    embedded_ids = self._embedding_fn(next_ids)
    next_inputs_emb = self._input_layer(embedded_ids)                       # [bat, beam, in_sz]

    # Applying mask - the one-hot of each sampled id picks its row in the mask of the next time step
    # one_hot_ids:  (batch, beam, 1, VOC,   1)
    # dense_mask:   (batch,    1, 1, VOC, VOC)
    # next_mask:    (batch, beam, VOC)
    one_hot_ids = array_ops.one_hot(next_ids, self.vocab_size)
    one_hot_ids = one_hot_ids[:, :, None, :, None]

    sparse_mask = _slice_mask(self._mask,
                              [-1, next_time, -1, -1],
                              time_major=self._time_major)
    dense_mask = sparse_ops.sparse_tensor_to_dense(sparse_mask)
    dense_mask = dense_mask[:, None, :, :, :]
    dense_mask.set_shape([None, 1, 1, self.vocab_size, self.vocab_size])

    summed_mask = math_ops.reduce_sum(one_hot_ids * dense_mask, axis=[2, 3])
    next_mask = gen_math_ops.minimum(summed_mask, 1.)

    # Identities under control dependencies - Prevents this branch from executing eagerly
    with ops.control_dependencies([next_inputs_emb, next_mask]):
        gated_inputs = array_ops.identity(next_inputs_emb)
        gated_mask = array_ops.identity(next_mask)
        return MaskedInputs(inputs=gated_inputs, mask=gated_mask)
def get_next_inputs():
    """ Builds the masked inputs of the next time step according to the decoder type.

        Uses `self`, `sample_ids` and `next_time` from the enclosing scope.

        :return: A MaskedInputs holding the embedded next inputs and the next mask (values capped at 1.)
    """
    def get_training_inputs():
        """ Reads the inputs of the next time step from the input tensor array """
        next_step_read = self._input_tas.read(next_time)
        with ops.control_dependencies([next_step_read]):
            return array_ops.identity(next_step_read)

    def get_sample_inputs():
        """ Uses the ids that were just selected (greedy / sample decoders) """
        return sample_ids

    # Decoder type -> function providing the next ids (training inputs are also the default)
    branch_per_decoder = [(TRAINING_DECODER, get_training_inputs),
                          (GREEDY_DECODER, get_sample_inputs),
                          (SAMPLE_DECODER, get_sample_inputs)]
    pred_fn_pairs = [(gen_math_ops.equal(self._decoder_type, decoder_type), branch_fn)
                     for decoder_type, branch_fn in branch_per_decoder]
    next_ids = control_flow_ops.case(pred_fn_pairs, default=get_training_inputs)

    # Embedding the next ids, then passing them through the input layer
    embedded_ids = self._embedding_fn(next_ids)
    next_inputs_emb = self._input_layer(embedded_ids)

    # Applying mask
    # one_hot_ids:  (b, 1, VOC,   1)
    # sparse_mask:  (b, 1, VOC, VOC)
    # next_mask:    (b, VOC)              -- DenseTensor
    one_hot_ids = array_ops.one_hot(next_ids, self.vocab_size)
    one_hot_ids = one_hot_ids[:, None, :, None]
    sparse_mask = _slice_mask(self._mask,
                              [-1, next_time, -1, -1],
                              time_major=self._time_major)
    summed_mask = sparse_ops.sparse_reduce_sum(one_hot_ids * sparse_mask, axis=[1, 2])
    next_mask = gen_math_ops.minimum(summed_mask, 1.)
    next_mask.set_shape([None, self.vocab_size])

    # Identities under control dependencies - Prevents this branch from executing eagerly
    with ops.control_dependencies([next_inputs_emb, next_mask]):
        gated_inputs = array_ops.identity(next_inputs_emb)
        gated_mask = array_ops.identity(next_mask)
        return MaskedInputs(inputs=gated_inputs, mask=gated_mask)
def _compute_attention(self, query, memory):
    """ Computes the attention for the Bahdanau attention mechanism.

        The memory is masked past the sequence length before computing the keys, the scores are masked the
        same way before being converted to alignments, and the alignments are used to weight the memory.

        :param query: The query (inputs) to use to compute attention. Size [b, input_size]
        :param memory: The memory (previous outputs) used to compute attention [b, time_step, memory_size]
        :return: The attention. Size [b, attn_size]
    """
    def _passthrough(tensor):
        """ Used in place of a projection layer when no projection is needed """
        return tensor

    assert len(memory.shape) == 3, 'Memory needs to be [batch, time, memory_size]'
    memory_time = array_ops.shape(memory)[1]
    memory_size = memory.shape[2]
    num_units = self._num_units
    assert self._memory_size == memory_size, 'Expected mem size of %s - Got %s' % (self._memory_size,
                                                                                    memory_size)

    # Query layer - Always projecting the query to num_units
    query_layer = core.Dense(num_units, name='query_layer', use_bias=False, dtype=self._dtype)

    # Memory layer - Only projecting if the memory is not already of size num_units
    if memory_size != num_units:
        memory_layer = core.Dense(num_units, name='memory_layer', use_bias=False, dtype=self._dtype)
    else:
        memory_layer = _passthrough

    # Attention layer - Only projecting if an attention size different from the memory size is requested
    if self._attention_layer_size is not None and memory_size != self._attention_layer_size:
        attn_layer = core.Dense(self._attention_layer_size,
                                name='attn_layer',
                                use_bias=False,
                                dtype=self._dtype)
    else:
        attn_layer = _passthrough

    # Masking memory - Zeroing the time steps past the (clipped) sequence length
    sequence_length = gen_math_ops.minimum(memory_time, self._sequence_length)
    time_mask = array_ops.sequence_mask(sequence_length, maxlen=memory_time, dtype=dtypes.float32)
    time_mask = time_mask[..., None]
    masked_memory = memory * time_mask
    keys = memory_layer(masked_memory)

    # Computing scores
    projected_query = query_layer(query)
    scores = _bahdanau_score(projected_query, keys, self._normalize)

    # Getting alignments
    masked_scores = _maybe_mask_score(scores, sequence_length, self._score_mask_value)
    alignments = self._wrapped_probability_fn(masked_scores, None)         # [batch, time]

    # Getting attention - Note that the alignments weight the unmasked memory
    alignments_3d = array_ops.expand_dims(alignments, 1)                    # [batch, 1, time]
    context_vector = math_ops.matmul(alignments_3d, memory)                 # [batch, 1, memory_size]
    context_vector = array_ops.squeeze(context_vector, [1])                 # [batch, memory_size]

    # Returning attention                                                   # [batch, attn_size]
    return attn_layer(context_vector)
def seeded_dropout(inputs, seeds, keep_probs, offset=None, noise_shape=None, seed=None, name=None):
    """ Applies dropout using a deterministic, per-item mask.

        Each item in the batch has its own seed, from which its mask is deterministically computed.
        Kept elements are scaled up by `1 / keep_probs`, dropped elements are set to `0`, so that the
        expected sum is unchanged.

        By default, each element is kept or dropped independently. If `noise_shape` is specified, it must be
        broadcastable to the shape of `inputs`, and only dimensions with `noise_shape[i] == shape(inputs)[i]`
        will make independent decisions. For example, if `shape(inputs) = [k, l, m, n]` and
        `noise_shape = [k, 1, 1, n]`, each batch and channel component will be kept independently and each
        row and column will be kept or not kept together.

        Scalars and integer tensors are returned untouched.

        :param inputs: A floating point tensor.
        :param seeds: A tensor representing the seed for each item in the batch. (Size: (batch,))
        :param keep_probs: A scalar or vector of size (batch,). The probability that each element is kept.
        :param offset: Integer. Alternative offset to apply to compute the deterministic mask (e.g. in a loop).
                       When None, the function-level `seeded_dropout.offset` counter is advanced and used.
        :param noise_shape: A 1-D `Tensor` of type `int32`, represents the shape for randomly generated
                            keep/drop flags.
        :param seed: A Python integer. Used to create a default seed for the operation.
        :param name: A name for this operation (optional).
        :return: A Tensor of the same shape as `inputs`.
        :raises ValueError: If `inputs` is neither floating point nor integer, if a Python float `keep_probs`
                            is outside (0, 1], or if executing eagerly.
    """
    # No explicit offset - Advancing the counter stored on the function itself
    # (`seeded_dropout.offset` is presumably initialised next to this definition - TODO confirm)
    if offset is None:
        seeded_dropout.offset += 40555607

    # A scalar is likely the 'time' attribute of a state, and integers can safely be ignored
    # Neither should be masked
    if not inputs.shape or inputs.dtype.is_integer:
        return inputs

    with ops.name_scope(name, 'seeded_dropout', [inputs]):
        inputs = ops.convert_to_tensor(inputs, name='x')
        if not inputs.dtype.is_floating:
            raise ValueError('Expected a floating point tensor. Got a %s tensor instead.' % inputs.dtype)

        # Python floats can be validated (and short-circuited) without building any op
        if isinstance(keep_probs, float):
            if not 0 < keep_probs <= 1:
                raise ValueError('keep_probs must be a scalar tensor or a float in the range (0, 1], got %g'
                                 % keep_probs)
            if keep_probs == 1:
                return inputs

        # Not supported in eager mode
        if context.executing_eagerly():
            raise ValueError('This function is not supported in eager mode.')

        # Converting to a tensor clipped to [0, 1], with one probability per batch item and
        # trailing dimensions of size 1 so that it broadcasts against the inputs
        # NOTE(review): a probability clipped to exactly 0 is used as a divisor below - confirm callers
        # never provide it
        probs = ops.convert_to_tensor(keep_probs, dtype=inputs.dtype, name='keep_probs')
        probs = gen_math_ops.minimum(1., probs)
        probs = gen_math_ops.maximum(0., probs)
        broadcast_shape = [-1] + [1] * (len(inputs.shape) - 1)
        probs = gen_array_ops.reshape(probs, broadcast_shape)
        nothing_to_drop = math_ops.reduce_all(gen_math_ops.equal(probs, 1.))

        # Computing noise shape
        full_noise_shape = nn_ops._get_noise_shape(inputs, noise_shape)     # pylint: disable=protected-access

        def keep_inputs():
            """ Every keep probability is 1 - Returning the inputs as-is """
            return inputs

        def get_dropout_mask():
            """ Computes the dropout mask and applies it to the inputs """
            effective_offset = seeded_dropout.offset if offset is None else offset
            noise = seeded_random(seeds,
                                  offset=effective_offset,
                                  shape=full_noise_shape[1:],
                                  dtype=inputs.dtype,
                                  seed=seed)

            # random_tensor = uniform [keep_probs, 1.0 + keep_probs)
            random_tensor = probs + noise

            # 0. if [keep_probs, 1.0) and 1. if [1.0, 1.0 + keep_prob)
            keep_flags = gen_math_ops.floor(random_tensor)
            dropped = math_ops.divide(inputs, probs) * keep_flags
            dropped.set_shape(inputs.get_shape())

            # Setting control flow ops to avoid computing this function if not required
            with ops.control_dependencies([dropped]):
                return array_ops.identity(dropped)

        # Only computing the mask if at least one item has a keep probability below 1
        return control_flow_ops.cond(nothing_to_drop, true_fn=keep_inputs, false_fn=get_dropout_mask)
def _convert_to_probs_tensor(keep_probs):
    """ Converts keep probabilities to a tensor clipped to [0, 1] and reshaped to [-1, 1]

        :param keep_probs: The keep probabilities (anything convertible to a tensor)
        :return: The clipped probabilities, as a tensor with a trailing dimension of size 1
    """
    as_tensor = ops.convert_to_tensor(keep_probs)

    # Clipping - Upper bound first, then lower bound
    capped = gen_math_ops.minimum(1., as_tensor)
    clipped = gen_math_ops.maximum(0., capped)

    # One probability per row, so that it can broadcast over the last dimension
    return gen_array_ops.reshape(clipped, [-1, 1])
def _step(self, inputs, past_attns, time, feeder_cell, feeder_state):
    """ Performs the block operation on n-layers

        The position embedding (and, if a feeder cell is provided, its output) is added to the inputs,
        the result goes through every layer, and is finally normalized.

        :param inputs: The tensor inputs (embedding of each word) - [batch, seq_len, emb_size]
        :param past_attns: The past attentions - [batch, nb_layers, 2, nb_heads, past_length, emb_size // nb_heads]
        :param time: A tensor representing the current time step
        :param feeder_cell: None or A feeder cell that returns a RNN cell output to use for conditioning
        :param feeder_state: None or the initial state of the feeder cell
        :return: A tuple consisting of:
                    1) The cell outputs - [batch, seq_len, emb_size]
                    2) The present attention - [batch, nb_layers, 2, nb_heads, seq_len, emb_size // nb_heads]
                    3) The new state of the feeder cell
    """
    with variable_scope.variable_scope(self._scope, default_name='step'):
        nb_past_steps = array_ops.shape(past_attns)[-2]     # How many past attention steps we have
        seq_len = array_ops.shape(inputs)[-2]               # How many steps are we computing for the current time
        emb_size = inputs.shape[-1].value                   # The size of the embedding
        assert emb_size == self._emb_size, 'Expected an embedding size of %d' % self._emb_size

        # 1) The inputs are already the word embedding of each token
        assert inputs.shape.ndims == 3, 'Expected [batch, seq_len, emb_size]'      # [bz, seq, emb]
        hidden = inputs

        # 2) Computing the position embedding of each token
        # If we know the context was padded, the effective past length is the context length + nb of time steps
        if self._past_seq_lengths is None:
            first_positions = gen_array_ops.fill([self._batch_size, 1], value=nb_past_steps)   # [bz, 1]
        else:
            effective_past = gen_math_ops.minimum(nb_past_steps, self._past_seq_lengths + time)
            first_positions = effective_past[:, None]                                           # [bz, 1]
        step_offsets = math_ops.range(seq_len)[None, :]                                         # [1, seq_len]
        positions = gen_math_ops.add(first_positions, step_offsets)                             # [batch, seq_len]

        # Positions are capped at the last entry of the position embedding
        positions = gen_math_ops.minimum(self._position_emb_size - 1, positions)                # [batch, seq_len]
        position_emb = self._position_embedding_fn(positions)                                   # [bz, seq, emb]
        hidden = hidden + position_emb

        # 3) If we have a feeder cell, we also need to condition the hidden state on it.
        next_feeder_state = feeder_state
        if feeder_cell is not None:
            assert feeder_state is not None, 'A feeder state is required if a feeder cell is provided.'
            assert inputs.shape[1].value == 1, 'The seq dimension must be 1 to use a feeder_cell'
            squeezed_inputs = array_ops.squeeze(inputs, axis=1)
            feeder_outputs, next_feeder_state = feeder_cell(squeezed_inputs, feeder_state)

            # Projecting the feeder output to the embedding size if the sizes differ
            feeder_emb = feeder_outputs                                                         # [bz, feeder_sz]
            if feeder_outputs.shape[-1].value != emb_size:
                projection = core.Dense(emb_size, activation=None, name='h_feed')
                feeder_emb = projection(feeder_emb)                                             # [bz, emb]
            feeder_emb = gen_array_ops.tile(feeder_emb[:, None, :], [1, seq_len, 1])            # [bz, seq, emb]
            hidden = hidden + feeder_emb

        # Transformer - Each layer receives its own slice of the past attentions
        layer_pasts = array_ops.unstack(past_attns, axis=1)         # list of [batch, 2, heads, past_len, head_sz]
        assert len(layer_pasts) == self._nb_layers, \
            'Expected the past attention to have %d layers.' % self._nb_layers

        layer_presents = []
        for layer_ix, layer_past in enumerate(layer_pasts):
            hidden, layer_present = self._block(hidden, layer_past, 'layer.%d' % layer_ix)
            layer_presents.append(layer_present)
        presents = array_ops.stack(layer_presents, axis=1)

        # Normalizing and returning
        cell_outputs = self._norm(hidden, 'norm_h')                                             # [batch, seq, emb]
        return cell_outputs, presents, next_feeder_state