Example #1
0
        def get_next_inputs():
            """ Builds the embedded inputs and the token mask to use at the next time step.
                Reads `sample_ids` and `next_time` from the enclosing scope.
                :return: A MaskedInputs holding the projected embeddings of `sample_ids` and the next mask.
            """
            next_ids = sample_ids

            # Embedding the ids, then projecting them through the input layer - [bat, beam, in_sz]
            embedded_ids = self._embedding_fn(next_ids)
            next_inputs = self._input_layer(embedded_ids)

            # Shapes used to compute the mask
            # one_hot_ids:      (batch, beam,   1, VOC,   1)
            # dense_mask_t:     (batch,    1,   1, VOC, VOC)
            # allowed_mask:     (batch, beam, VOC)
            one_hot_ids = array_ops.one_hot(next_ids, self.vocab_size)
            one_hot_ids = one_hot_ids[:, :, None, :, None]

            # Slicing the (sparse) mask at `next_time` and densifying it
            sparse_mask_t = _slice_mask(self._mask,
                                        [-1, next_time, -1, -1],
                                        time_major=self._time_major)
            dense_mask_t = sparse_ops.sparse_tensor_to_dense(sparse_mask_t)
            dense_mask_t = dense_mask_t[:, None, :, :, :]
            dense_mask_t.set_shape(
                [None, 1, 1, self.vocab_size, self.vocab_size])

            # Keeping the mask row of the selected id, and capping the result at 1.
            allowed_mask = math_ops.reduce_sum(one_hot_ids * dense_mask_t,
                                               axis=[2, 3])
            allowed_mask = gen_math_ops.minimum(allowed_mask, 1.)

            # Wrapping the outputs in identity ops created under control dependencies
            # (prevents this branch from executing eagerly)
            with ops.control_dependencies([next_inputs, allowed_mask]):
                masked_inputs = MaskedInputs(
                    inputs=array_ops.identity(next_inputs),
                    mask=array_ops.identity(allowed_mask))
            return masked_inputs
Example #2
0
            def get_next_inputs():
                """ Builds the embedded inputs and the token mask to use at the next time step.
                    The ids are selected according to `self._decoder_type`; `sample_ids` and `next_time`
                    are read from the enclosing scope.
                    :return: A MaskedInputs holding the projected embeddings of the selected ids and the next mask.
                """
                def get_training_inputs():
                    """ Reads the inputs stored at `next_time` in `self._input_tas` """
                    stored_inputs = self._input_tas.read(next_time)
                    with ops.control_dependencies([stored_inputs]):
                        training_inputs = array_ops.identity(stored_inputs)
                    return training_inputs

                def get_sample_inputs():
                    """ Uses the ids that were just sampled (greedy and sample decoders) """
                    return sample_ids

                # One (predicate, branch) pair per decoder type - Training inputs are also the default
                decoder_type = self._decoder_type
                branches = [
                    (gen_math_ops.equal(decoder_type, TRAINING_DECODER),
                     get_training_inputs),
                    (gen_math_ops.equal(decoder_type, GREEDY_DECODER),
                     get_sample_inputs),
                    (gen_math_ops.equal(decoder_type, SAMPLE_DECODER),
                     get_sample_inputs)]
                next_ids = control_flow_ops.case(branches,
                                                 default=get_training_inputs)

                # Embedding the ids, then projecting them through the input layer
                embedded_ids = self._embedding_fn(next_ids)
                next_inputs = self._input_layer(embedded_ids)

                # Shapes used to compute the mask
                # one_hot_ids:      (b, 1, VOC, 1)
                # sparse_mask_t:    (b, 1, VOC, VOC)
                # allowed_mask:     (b, VOC)        -- DenseTensor
                one_hot_ids = array_ops.one_hot(next_ids, self.vocab_size)
                one_hot_ids = one_hot_ids[:, None, :, None]
                sparse_mask_t = _slice_mask(self._mask,
                                            [-1, next_time, -1, -1],
                                            time_major=self._time_major)

                # Keeping the mask row of the selected id, and capping the result at 1.
                selected_mask = one_hot_ids * sparse_mask_t
                allowed_mask = sparse_ops.sparse_reduce_sum(selected_mask,
                                                            axis=[1, 2])
                allowed_mask = gen_math_ops.minimum(allowed_mask, 1.)
                allowed_mask.set_shape([None, self.vocab_size])

                # Wrapping the outputs in identity ops created under control dependencies
                # (prevents this branch from executing eagerly)
                with ops.control_dependencies([next_inputs, allowed_mask]):
                    masked_inputs = MaskedInputs(
                        inputs=array_ops.identity(next_inputs),
                        mask=array_ops.identity(allowed_mask))
                return masked_inputs
Example #3
0
    def _compute_attention(self, query, memory):
        """ Computes the Bahdanau attention of a query over the memory.
            :param query: The query (inputs) to use to compute attention. Size [b, input_size]
            :param memory: The memory (previous outputs) used to compute attention [b, time_step, memory_size]
            :return: The attention. Size [b, attn_size]
        """
        def passthrough(tensor):
            """ Stands in for a projection layer when no resizing is required """
            return tensor

        # Validating the memory
        assert len(
            memory.shape) == 3, 'Memory needs to be [batch, time, memory_size]'
        memory_time = array_ops.shape(memory)[1]
        memory_size = memory.shape[2]
        num_units = self._num_units
        assert self._memory_size == memory_size, 'Expected mem size of %s - Got %s' % (
            self._memory_size, memory_size)

        # The query is always projected to `num_units`
        query_layer = core.Dense(num_units,
                                 name='query_layer',
                                 use_bias=False,
                                 dtype=self._dtype)

        # The memory is only projected if its size differs from `num_units`
        if memory_size != num_units:
            memory_layer = core.Dense(num_units,
                                      name='memory_layer',
                                      use_bias=False,
                                      dtype=self._dtype)
        else:
            memory_layer = passthrough

        # The context is only projected if an attention size was set and differs from the memory size
        attn_layer_size = self._attention_layer_size
        if attn_layer_size is not None and memory_size != attn_layer_size:
            attn_layer = core.Dense(attn_layer_size,
                                    name='attn_layer',
                                    use_bias=False,
                                    dtype=self._dtype)
        else:
            attn_layer = passthrough

        # Zeroing the memory beyond the (capped) sequence length before computing the keys
        sequence_length = gen_math_ops.minimum(memory_time,
                                               self._sequence_length)
        time_mask = array_ops.sequence_mask(sequence_length,
                                            maxlen=memory_time,
                                            dtype=dtypes.float32)
        time_mask = time_mask[..., None]
        values = memory * time_mask
        keys = memory_layer(values)

        # Scoring the projected query against the keys
        processed_query = query_layer(query)
        scores = _bahdanau_score(processed_query, keys, self._normalize)

        # Masking the scores and converting them to alignments - [batch, time]
        masked_scores = _maybe_mask_score(scores, sequence_length,
                                          self._score_mask_value)
        alignments = self._wrapped_probability_fn(masked_scores, None)

        # Weighting the memory by the alignments
        # [batch, 1, time] x [batch, time, memory_size] -> [batch, 1, memory_size] -> [batch, memory_size]
        alignments_3d = array_ops.expand_dims(alignments, 1)
        context = math_ops.matmul(alignments_3d, memory)
        context = array_ops.squeeze(context, [1])

        # Projecting the context (if required) - [batch, attn_size]
        return attn_layer(context)
Example #4
0
def seeded_dropout(inputs,
                   seeds,
                   keep_probs,
                   offset=None,
                   noise_shape=None,
                   seed=None,
                   name=None):
    """ Computes dropout (with a deterministic mask).
        Every item in the batch has a deterministic seed to compute the deterministic mask

        With probability `keep_probs`, outputs the input element scaled up by `1 / keep_probs`, otherwise outputs `0`.
        The scaling is so that the expected sum is unchanged.

        By default, each element is kept or dropped independently. If `noise_shape` is specified, it must be
        broadcastable to the shape of `inputs`, and only dimensions with `noise_shape[i] == shape(inputs)[i]` will
        make independent decisions.

        For example, if `shape(inputs) = [k, l, m, n]` and `noise_shape = [k, 1, 1, n]`, each batch and channel
        component will be kept independently and each row and column will be kept or not kept together.

        Scalar inputs, inputs of unknown rank, and integer inputs are returned unchanged.

        :param inputs: A floating point tensor.
        :param seeds: A tensor representing the seed for each item in the batch. (Size: (batch,))
        :param keep_probs: A scalar or vector of size (batch,). The probability that each element is kept.
        :param offset: Integer. Alternative offset to apply to compute the deterministic mask (e.g. in a loop).
                       If None, the function-level `seeded_dropout.offset` counter is advanced and used instead.
        :param noise_shape: A 1-D `Tensor` of type `int32`, represents the shape for randomly generated keep/drop flags.
        :param seed: A Python integer. Used to create a default seed for the operation.
        :param name: A name for this operation (optional).
        :return: A Tensor of the same shape as `inputs`.
        :raises ValueError: If `inputs` is neither an integer nor a floating point tensor, if `keep_probs` is a
                            Python float outside of (0, 1], or if called in eager mode.
    """
    # NOTE(review): `seeded_dropout.offset` is a function attribute that is not initialized in this block.
    # It is presumably set right after this definition - TODO confirm.
    if offset is None:
        seeded_dropout.offset += 40555607

    # If inputs is a scalar, this is likely the 'time' attribute in a state, we don't want to mask it
    # Same thing for integers - We can safely ignore them
    # A TensorShape is truthy whenever its rank is known (even for rank 0), so `not inputs.shape` only
    # catches unknown ranks (or an empty tuple). The rank must be checked explicitly to detect scalars.
    is_scalar = getattr(inputs.shape, 'ndims', None) == 0
    if not inputs.shape or is_scalar or inputs.dtype.is_integer:
        return inputs

    with ops.name_scope(name, 'seeded_dropout', [inputs]):
        inputs = ops.convert_to_tensor(inputs, name='x')
        if not inputs.dtype.is_floating:
            raise ValueError(
                'Expected a floating point tensor. Got a %s tensor instead.' %
                inputs.dtype)
        if isinstance(keep_probs, float) and not 0 < keep_probs <= 1:
            raise ValueError(
                'keep_probs must be a scalar tensor or a float in the range (0, 1], got %g'
                % keep_probs)

        # Early return if nothing needs to be dropped.
        if isinstance(keep_probs, float) and keep_probs == 1:
            return inputs

        # Not supported in eager mode
        if context.executing_eagerly():
            raise ValueError('This function is not supported in eager mode.')

        # Converting to tensor, clamping to [0, 1] and reshaping to [-1, 1, ..., 1] to broadcast against inputs
        # NOTE(review): a non-float `keep_probs` of 0 is clamped but not rejected, so the division in
        # get_dropout_mask() divides by 0 for those items - confirm callers never pass 0.
        keep_probs = ops.convert_to_tensor(keep_probs,
                                           dtype=inputs.dtype,
                                           name='keep_probs')
        keep_probs = gen_math_ops.maximum(0.,
                                          gen_math_ops.minimum(1., keep_probs))
        keep_probs = gen_array_ops.reshape(keep_probs, [-1] + [1] *
                                           (len(inputs.shape) - 1))
        all_keep_probs_are_one = math_ops.reduce_all(
            gen_math_ops.equal(keep_probs, 1.))

        # Computing noise shape
        noise_shape = nn_ops._get_noise_shape(inputs, noise_shape)  # pylint: disable=protected-access

        def get_dropout_mask():
            """ Computes the dropout mask and applies it to the inputs """
            # random_tensor = uniform [keep_probs, 1.0 + keep_probs)
            # The leading (batch) dimension is dropped from the noise shape, since `seeds` has one seed per item
            random_tensor = keep_probs
            random_tensor += seeded_random(
                seeds,
                offset=offset if offset is not None else seeded_dropout.offset,
                shape=noise_shape[1:],
                dtype=inputs.dtype,
                seed=seed)

            # 0. if [keep_probs, 1.0) and 1. if [1.0, 1.0 + keep_prob)
            binary_tensor = gen_math_ops.floor(random_tensor)
            ret = math_ops.divide(inputs, keep_probs) * binary_tensor
            ret.set_shape(inputs.get_shape())

            # Setting control flow ops to avoid computing this function if not required
            with ops.control_dependencies([ret]):
                return array_ops.identity(ret)

        # Returning the inputs untouched if every keep_probs is 1., otherwise the masked inputs
        return control_flow_ops.cond(all_keep_probs_are_one,
                                     true_fn=lambda: inputs,
                                     false_fn=get_dropout_mask)
Example #5
0
 def _convert_to_probs_tensor(keep_probs):
     """ Converts `keep_probs` to a tensor clamped to [0, 1] and reshaped to [-1, 1] """
     as_tensor = ops.convert_to_tensor(keep_probs)

     # Capping at 1. first, then flooring at 0.
     capped = gen_math_ops.minimum(1., as_tensor)
     clamped = gen_math_ops.maximum(0., capped)

     # One row per probability, so that it can broadcast over a second dimension
     return gen_array_ops.reshape(clamped, [-1, 1])
Example #6
0
    def _step(self, inputs, past_attns, time, feeder_cell, feeder_state):
        """ Runs the inputs through the n-layer block stack for the current step
            :param inputs: The tensor inputs (embedding of each word) - [batch, seq_len, emb_size]
            :param past_attns: The past attentions - [batch, nb_layers, 2, nb_heads, past_length, emb_size // nb_heads]
            :param time: A tensor representing the current time step
            :param feeder_cell: None or A feeder cell that returns a RNN cell output to use for conditioning
            :param feeder_state: None or the initial state of the feeder cell
            :return: A tuple consisting of:
                        1) The cell outputs - [batch, seq_len, emb_size]
                        2) The present attention - [batch, nb_layers, 2, nb_heads, seq_len, emb_size // nb_heads]
                        3) The new state of the feeder cell
        """
        with variable_scope.variable_scope(self._scope, default_name='step'):
            # Number of past attention steps, number of steps computed now, and size of the embedding
            nb_past_steps = array_ops.shape(past_attns)[-2]
            seq_len = array_ops.shape(inputs)[-2]
            emb_size = inputs.shape[-1].value
            assert emb_size == self._emb_size, 'Expected an embedding size of %d' % self._emb_size

            # 1) The inputs are already the word embedding of each token - [bz, seq, emb]
            assert inputs.shape.ndims == 3, 'Expected [batch, seq_len, emb_size]'
            hidden = inputs

            # 2) Adding the position embedding of each token
            # Without known context lengths, every item starts at the number of past steps.
            # Otherwise (padded context), the start is capped at the context length + nb of time steps.
            if self._past_seq_lengths is None:
                start_positions = gen_array_ops.fill(
                    [self._batch_size, 1], value=nb_past_steps)  # [bz, 1]
            else:
                effective_past = gen_math_ops.minimum(
                    nb_past_steps, self._past_seq_lengths + time)
                start_positions = effective_past[:, None]  # [bz, 1]
            step_offsets = math_ops.range(seq_len)[None, :]  # [1, seq_len]
            positions = gen_math_ops.add(start_positions,
                                         step_offsets)  # [batch, seq_len]

            # Positions are capped at the last available position embedding
            positions = gen_math_ops.minimum(self._position_emb_size - 1,
                                             positions)  # [batch, seq_len]
            position_emb = self._position_embedding_fn(
                positions)  # [bz, seq, emb]
            hidden = hidden + position_emb

            # 3) Conditioning on the output of the feeder cell (if there is one)
            next_feeder_state = feeder_state
            if feeder_cell is not None:
                assert feeder_state is not None, 'A feeder state is required if a feeder cell is provided.'
                assert inputs.shape[
                    1].value == 1, 'The seq dimension must be 1 to use a feeder_cell'
                squeezed_inputs = array_ops.squeeze(inputs, axis=1)
                feeder_outputs, next_feeder_state = feeder_cell(
                    squeezed_inputs, feeder_state)
                conditioning = feeder_outputs  # [bz, feeder_sz]

                # Projecting to the embedding size only when the sizes differ
                if feeder_outputs.shape[-1].value != emb_size:
                    projection = core.Dense(emb_size,
                                            activation=None,
                                            name='h_feed')
                    conditioning = projection(conditioning)  # [bz, emb]
                conditioning = gen_array_ops.tile(
                    conditioning[:, None, :],
                    [1, seq_len, 1])  # [bz, seq, emb]
                hidden = hidden + conditioning

            # Transformer - One block per layer, each consuming its own past attention
            layer_pasts = array_ops.unstack(
                past_attns,
                axis=1)  # list of [batch, 2, heads, past_len, head_sz]
            assert len(
                layer_pasts
            ) == self._nb_layers, 'Expected the past attention to have %d layers.' % self._nb_layers

            layer_presents = []
            for layer_ix, layer_past in enumerate(layer_pasts):
                hidden, layer_present = self._block(hidden, layer_past,
                                                    'layer.%d' % layer_ix)
                layer_presents.append(layer_present)
            presents = array_ops.stack(layer_presents, axis=1)

            # Normalizing and returning
            cell_outputs = self._norm(hidden, 'norm_h')  # [batch, seq, emb]
            return cell_outputs, presents, next_feeder_state