Example #1
    def initialize(self, name=None):
        """ Performs helper initialization (to get initial state) """
        with ops.name_scope(name, 'CustomHelperInitialize'):
            finished = gen_math_ops.equal(0, self._sequence_length)
            all_finished = math_ops.reduce_all(finished)
            initial_candidates = self._candidate_tas.read(0)

            def training_inputs():
                """ Returns the training initial input """
                embed_op = self._order_embedding_fn(self._input_tas.read(0))
                with ops.control_dependencies([embed_op]):
                    return array_ops.identity(embed_op)

            def start_inputs():
                """ Returns the GO_ID initial input """
                embed_op = self._order_embedding_fn(self._start_inputs)
                with ops.control_dependencies([embed_op]):
                    return array_ops.identity(embed_op)

            # Getting initial inputs
            initial_inputs = control_flow_ops.case(
                [(gen_math_ops.equal(self._decoder_type, TRAINING_DECODER), training_inputs),
                 (gen_math_ops.equal(self._decoder_type, GREEDY_DECODER), start_inputs),
                 (gen_math_ops.equal(self._decoder_type, SAMPLE_DECODER), start_inputs)],
                default=training_inputs)

            next_inputs = \
                control_flow_ops.cond(all_finished,
                                      lambda: self._zero_inputs,
                                      lambda: CandidateInputs(
                                          inputs=self._input_layer(initial_inputs),
                                          candidates=initial_candidates,
                                          candidates_emb=self._candidate_embedding_fn(initial_candidates)))
            return (finished, next_inputs)
Example #2
def seeded_random(seeds, offset, shape, dtype, seed=None, name=None):
    """ Outputs random values from a uniform distribution.
        The random values are deterministic given a seed.

        :param seeds: A vector of seeds (Size: [batch,]) - If 0, defaults to seed attr, then graph seed, then random.
        :param offset: Integer to add to the seed to get a deterministic mask.
        :param shape: The shape required for each seed (e.g. [3, 5] with a batch of 10 will return [10, 3, 5]).
        :param dtype: The type of the output: `float16`, `bfloat16`, `float32`, or `float64`.
        :param seed: A Python integer. Used to create a default seed for the operation.
        :param name: A name for the operation (optional).
        :return: A tensor of the specified shape filled with deterministic random values.
    """
    if dtype not in (dtypes.float16, dtypes.bfloat16, dtypes.float32,
                     dtypes.float64):
        raise ValueError('Invalid dtype %r' % dtype)
    with ops.name_scope(name, 'seeded_random', [shape]):
        seeds = ops.convert_to_tensor(seeds, dtype=dtypes.int32, name='seeds')
        shape = ops.convert_to_tensor(shape, dtype=dtypes.int32, name='shape')
        offset = ops.convert_to_tensor(offset,
                                       dtype=dtypes.int32,
                                       name='offset')
        size = math_ops.reduce_prod(shape)
        graph_seed, op_seed = random_seed.get_seed(seed)
        matrix_output = SEEDED_RANDOM_SO.seeded_random(seeds,
                                                       offset,
                                                       size,
                                                       seed=graph_seed,
                                                       seed2=op_seed)
        output = gen_array_ops.reshape(
            matrix_output, array_ops.concat([(-1, ), shape], axis=0))
        return math_ops.cast(output, dtype)
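A minimal usage sketch of the same seeds-to-deterministic-values contract using only stock TensorFlow (the `SEEDED_RANDOM_SO` custom op above is not reproduced here); `stateless_seeded_random` is a hypothetical stand-in built on `tf.random.stateless_uniform`:

import tensorflow as tf

def stateless_seeded_random(seeds, offset, shape, dtype=tf.float32):
    """ Deterministic uniform values per batch item, keyed on (seed + offset). """
    seeds = tf.convert_to_tensor(seeds, dtype=tf.int32)                            # (batch,)

    def _one_item(seed):
        return tf.random.stateless_uniform(shape, seed=tf.stack([seed + offset, 0]), dtype=dtype)

    return tf.map_fn(_one_item, seeds, fn_output_signature=dtype)                  # (batch,) + shape

noise = stateless_seeded_random(seeds=[7, 7, 42], offset=3, shape=[2, 5])
# noise[0] == noise[1] (same seed), and re-running the call gives identical values.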
Example #3
    def next_inputs(self, time, outputs, state, sample_ids, name=None):
        """ Computes the next inputs at a time step """
        with ops.name_scope(name, 'CustomHelperNextInputs',
                            [time, outputs, state, sample_ids]):
            next_time = time + 1
            finished = (next_time >= self._sequence_length)
            all_finished = math_ops.reduce_all(finished)

            def get_next_inputs():
                """ Retrieves the inputs for the next time step """
                def get_training_inputs():
                    """ Selecting training inputs """
                    read_op = self._input_tas.read(next_time)
                    with ops.control_dependencies([read_op]):
                        return array_ops.identity(read_op)

                def get_sample_inputs():
                    """ Selecting greedy/sample inputs """
                    return sample_ids

                inputs_next_step = control_flow_ops.case(
                    [(gen_math_ops.equal(self._decoder_type, TRAINING_DECODER),
                      get_training_inputs),
                     (gen_math_ops.equal(self._decoder_type,
                                         GREEDY_DECODER), get_sample_inputs),
                     (gen_math_ops.equal(self._decoder_type,
                                         SAMPLE_DECODER), get_sample_inputs)],
                    default=get_training_inputs)
                inputs_emb_next_step = self._input_layer(
                    self._embedding_fn(inputs_next_step))

                # Applying mask
                # inputs_one_hot:   (b, 1, VOC, 1)
                # mask_t:           (b, 1, VOC, VOC)
                # next_mask:        (b, VOC)        -- DenseTensor
                inputs_one_hot = array_ops.one_hot(inputs_next_step,
                                                   self.vocab_size)[:, None, :,
                                                                    None]
                mask_t = _slice_mask(self._mask, [-1, next_time, -1, -1],
                                     time_major=self._time_major)
                next_mask = sparse_ops.sparse_reduce_sum(inputs_one_hot *
                                                         mask_t,
                                                         axis=[1, 2])
                next_mask = gen_math_ops.minimum(next_mask, 1.)
                next_mask.set_shape([None, self.vocab_size])

                # Prevents this branch from executing eagerly
                with ops.control_dependencies(
                    [inputs_emb_next_step, next_mask]):
                    return MaskedInputs(
                        inputs=array_ops.identity(inputs_emb_next_step),
                        mask=array_ops.identity(next_mask))

            next_inputs = control_flow_ops.cond(
                all_finished,
                true_fn=lambda: self._zero_inputs,
                false_fn=get_next_inputs)

            # Returning
            return (finished, next_inputs, state)
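The mask reduction above can be checked on small dense tensors. A sketch with hypothetical sizes (and a dense stand-in for the `SparseTensor` mask) showing that the one-hot selector picks, for each batch item, the (VOC,) row of allowed next tokens:

import tensorflow as tf

batch, voc = 2, 4
inputs_next_step = tf.constant([1, 3])                                  # (b,) current token ids
mask_t = tf.cast(tf.random.uniform([batch, 1, voc, voc]) > 0.5,         # (b, 1, VOC, VOC) dense stand-in
                 tf.float32)

inputs_one_hot = tf.one_hot(inputs_next_step, voc)[:, None, :, None]    # (b, 1, VOC, 1)
next_mask = tf.reduce_sum(inputs_one_hot * mask_t, axis=[1, 2])         # (b, VOC)
next_mask = tf.minimum(next_mask, 1.)
# next_mask[i] equals mask_t[i, 0, inputs_next_step[i], :], i.e. the allowed next tokens for item i.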
Example #4
    def _update_distributed_as_chief(self, version_step=None):
        """ Performs the gradient averaging, updates the variables, and the global step
            :param version_step: A variable that represents the model's version
            :return: The update operation to run

            Note: This method is called by the chief when synchronization is required.
        """
        # Creating sync_token queue
        with ops.device(self._global_step.device), ops.name_scope(''):
            self._sync_token_queue = data_flow_ops.FIFOQueue(
                capacity=-1,
                dtypes=self._global_step.dtype.base_dtype,
                shapes=(),
                name='sync_token_q',
                shared_name='sync_token_q')

            # Applying grads, then adding tokens to queue
            with ops.control_dependencies([self._apply_grad_op]):
                tokens = gen_array_ops.fill([self._num_workers],
                                            self._global_step)
                sync_op = self._sync_token_queue.enqueue_many((tokens, ))

                # Waiting for token in queue (sync point)
                with ops.control_dependencies([sync_op]):
                    token = self._sync_token_queue.dequeue()
                    update_ops = [state_ops.assign(self._local_step, token)]

                    # Increasing version step
                    if version_step is not None:
                        update_ops += [state_ops.assign_add(version_step, 1)]

                    # Returning
                    return control_flow_ops.group(*update_ops)
Example #5
    def sample(self, time, outputs, state, name=None):
        """ Samples the id for the next time step (or -1 for teacher forcing) """
        with ops.name_scope(name, 'CustomHelperSample',
                            [time, outputs, state]):

            def training():
                """ Selecting training / teacher forcing """
                fill_op = gen_array_ops.fill([array_ops.shape(outputs)[0]], -1)
                with ops.control_dependencies([fill_op]):
                    return array_ops.identity(fill_op)

            def greedy():
                """ Selecting greedy """
                argmax_op = math_ops.argmax(outputs,
                                            axis=-1,
                                            output_type=dtypes.int32)
                with ops.control_dependencies([argmax_op]):
                    return array_ops.identity(argmax_op)

            def sample():
                """ Sampling """
                logits = outputs if self._softmax_temperature is None else outputs / self._softmax_temperature
                sample_id_sampler = categorical.Categorical(logits=logits)
                sample_op = sample_id_sampler.sample(seed=self._seed)
                with ops.control_dependencies([sample_op]):
                    return array_ops.identity(sample_op)

            return control_flow_ops.case(
                [(gen_math_ops.equal(self._decoder_type,
                                     TRAINING_DECODER), training),
                 (gen_math_ops.equal(self._decoder_type,
                                     GREEDY_DECODER), greedy),
                 (gen_math_ops.equal(self._decoder_type,
                                     SAMPLE_DECODER), sample)],
                default=training)
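The branch selection used by these helpers is a plain `tf.case` on a scalar decoder type. A minimal standalone sketch, assuming the integer encoding of the decoder constants:

import tensorflow as tf

TRAINING_DECODER, GREEDY_DECODER, SAMPLE_DECODER = 0, 1, 2      # assumed integer encoding
decoder_type = tf.constant(GREEDY_DECODER, dtype=tf.uint8)

sample_ids = tf.case(
    [(tf.equal(decoder_type, TRAINING_DECODER), lambda: tf.fill([4], -1)),       # teacher forcing
     (tf.equal(decoder_type, GREEDY_DECODER), lambda: tf.constant([3, 1, 4, 1])),
     (tf.equal(decoder_type, SAMPLE_DECODER), lambda: tf.constant([2, 7, 1, 8]))],
    default=lambda: tf.fill([4], -1))
# sample_ids == [3, 1, 4, 1] because decoder_type == GREEDY_DECODER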
Example #6
    def zero_state(self, batch_size, dtype):
        """ Return an initial (zero) state tuple for this `IdentityCell`.
            :param batch_size: `0D` integer tensor: the batch size.
            :param dtype: The internal state data type.
            :return: A zeroed out scalar representing the initial state of the cell.
        """
        with ops.name_scope(type(self).__name__ + 'ZeroState',
                            values=[batch_size]):
            return array_ops.zeros([], dtype=dtypes.int32)
Example #7
    def zero_state(self, batch_size, dtype):
        """ Return an initial (zero) state tuple for this cell.
            :param batch_size: `0D` integer tensor: the batch size.
            :param dtype: The internal state data type.
            :return: A tuple containing zeroed out tensors and, possibly, empty TA objects.
        """
        with ops.name_scope(type(self).__name__ + 'ZeroState',
                            values=[batch_size]):
            return self._cell.zero_state(batch_size, dtype)
Example #8
    def zero_state(self, batch_size, dtype):
        """ Return an initial (zero) state tuple for this `ArrayConcatWrapper`.
            :param batch_size: `0D` integer tensor: the batch size.
            :param dtype: The internal state data type.
            :return: An `ArrayConcatWrapperState` tuple containing zeroed out tensors and, possibly, empty TA objects.
        """
        with ops.name_scope(type(self).__name__ + 'ZeroState',
                            values=[batch_size]):
            return ArrayConcatWrapperState(
                cell_state=self._cell.zero_state(batch_size, dtype),
                time=array_ops.zeros([], dtype=dtypes.int32))
Example #9
    def __init__(self, decoder_type, inputs, order_embedding, candidate_embedding, sequence_length, candidates,
                 input_layer=None, time_major=False, softmax_temperature=None, seed=None, name=None):
        """ Constructor
            :param decoder_type: An uint8 representing TRAINING_DECODER, GREEDY_DECODER, or SAMPLE_DECODER
            :param inputs: The decoder input (b, dec_len)
            :param order_embedding: The order embedding vector
            :param candidate_embedding: The candidate embedding vector
            :param sequence_length: The length of each input (b,)
            :param candidates: The candidates at each time step -- Size: (b, nb_cand, max_candidates)
            :param input_layer: Optional. A layer to apply on the inputs
            :param time_major: If true indicates that the first dimension is time, otherwise it is batch size
            :param softmax_temperature: Optional. Softmax temperature. None, scalar, or size: (batch_size,)
            :param seed: Optional. The sampling seed
            :param name: Optional scope name.
        """
        # pylint: disable=too-many-arguments
        with ops.name_scope(name, "CustomHelper", [inputs, sequence_length, order_embedding, candidate_embedding]):
            inputs = ops.convert_to_tensor(inputs, name="inputs")
            candidates = ops.convert_to_tensor(candidates, name="candidates")
            self._inputs = inputs
            self._order_embedding_fn = _get_embedding_fn(order_embedding)
            self._candidate_embedding_fn = _get_embedding_fn(candidate_embedding)
            if not time_major:
                inputs = nest.map_structure(_transpose_batch_time, inputs)
                candidates = nest.map_structure(_transpose_batch_time, candidates)
            self._input_tas = nest.map_structure(_unstack_ta, inputs)
            self._candidate_tas = nest.map_structure(_unstack_ta, candidates)
            self._decoder_type = decoder_type
            self._sequence_length = ops.convert_to_tensor(sequence_length, name="sequence_length")
            if self._sequence_length.get_shape().ndims != 1:
                raise ValueError("Expected vector for sequence_length. Shape: %s" % self._sequence_length.get_shape())
            self._input_layer = input_layer if input_layer is not None else lambda x: x
            self._batch_size = array_ops.size(sequence_length)
            self._start_inputs = gen_array_ops.fill([self._batch_size], GO_ID)
            self._softmax_temperature = softmax_temperature
            self._seed = seed

            # Computing the zero inputs (returned once all sequences are finished)
            self._zero_inputs = CandidateInputs(
                inputs=array_ops.zeros_like(self._input_layer(self._order_embedding_fn(self._start_inputs))),
                candidates=array_ops.zeros_like(candidates[0, :]),
                candidates_emb=array_ops.zeros_like(self._candidate_embedding_fn(candidates[0, :])))

            # Preventing div by zero
            # Adding an extra dim to the matrix, so we can broadcast with the outputs shape
            if softmax_temperature is not None:
                self._softmax_temperature = gen_math_ops.maximum(1e-10, self._softmax_temperature)
                if self._softmax_temperature.get_shape().ndims == 1:
                    self._softmax_temperature = self._softmax_temperature[:, None]
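`_transpose_batch_time` and `_unstack_ta` are project helpers; the assumption here is the usual pattern of moving the time axis first and unstacking into a `TensorArray` so that step `t` can be read with `.read(t)`. A small sketch of that pattern:

import tensorflow as tf

inputs = tf.random.uniform([3, 5], maxval=10, dtype=tf.int32)    # (batch=3, dec_len=5) token ids
inputs_tm = tf.transpose(inputs, [1, 0])                         # (dec_len, batch): time-major

input_tas = tf.TensorArray(dtype=inputs_tm.dtype,
                           size=tf.shape(inputs_tm)[0]).unstack(inputs_tm)
step_0 = input_tas.read(0)                                       # (batch,) ids for the first step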
Example #10
    def step(self, time, inputs, state, name=None):
        """ Performs a decoding step
            :param time: scalar `int32` tensor.
            :param inputs: A (structure of) input tensors.  (** This is a CandidateInputs tuple **)
            :param state: A (structure of) state tensors and TensorArrays.
            :param name: Name scope for any created operations.
            :return: (outputs, next_state, next_inputs, finished)
        """
        assert isinstance(
            inputs,
            CandidateInputs), 'The inputs must be of type "CandidateInputs"'
        with ops.name_scope(name, "BasicDecoderStep", (time, inputs, state)):
            inputs, candidates, candidates_emb = inputs.inputs, inputs.candidates, inputs.candidates_emb
            cell_outputs, cell_state = self._cell(inputs, state)
            cell_state_output = cell_outputs  # Corresponds to cell_state.h (before output layer)
            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)

            # Adding a bias dimension, then computing candidate logits and masking PAD_IDs
            cell_outputs = array_ops.pad(cell_outputs, [(0, 0), (0, 1)],
                                         constant_values=1.)
            cell_outputs = math_ops.reduce_sum(cell_outputs[:, None, :] *
                                               candidates_emb,
                                               axis=-1)
            output_mask = math_ops.cast(gen_math_ops.greater(candidates, 0),
                                        dtypes.float32)
            cell_outputs = gen_math_ops.add(cell_outputs, (1. - output_mask) *
                                            LARGE_NEGATIVE)

            # Sampling and computing next inputs
            sample_ids = self._helper.sample(time=time,
                                             outputs=(cell_outputs,
                                                      candidates),
                                             state=cell_state)
            (finished, next_inputs,
             next_state) = self._helper.next_inputs(time=time,
                                                    outputs=cell_outputs,
                                                    state=cell_state,
                                                    sample_ids=sample_ids)
        if self.extract_state:
            outputs = BasicDecoderWithStateOutput(cell_outputs,
                                                  cell_state_output,
                                                  sample_ids)
        else:
            outputs = seq2seq.BasicDecoderOutput(cell_outputs, sample_ids)
        return outputs, next_state, next_inputs, finished
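The candidate-logit computation pads the cell output with a constant 1 so the last column of each candidate embedding acts as a bias term, then takes a per-candidate dot product. A sketch with hypothetical sizes:

import tensorflow as tf

batch, hidden, nb_cand = 2, 3, 4
cell_outputs = tf.random.normal([batch, hidden])                 # (b, h)
candidates_emb = tf.random.normal([batch, nb_cand, hidden + 1])  # (b, nb_cand, h + 1)

padded = tf.pad(cell_outputs, [(0, 0), (0, 1)], constant_values=1.)    # (b, h + 1)
logits = tf.reduce_sum(padded[:, None, :] * candidates_emb, axis=-1)   # (b, nb_cand)
# logits[i, c] = dot(cell_outputs[i], candidates_emb[i, c, :h]) + candidates_emb[i, c, h]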
Example #11
    def zero_state(self, batch_size, dtype):
        """ Return an initial (zero) state tuple for this `SelfAttentionWrapper`.
            :param batch_size: `0D` integer tensor: the batch size.
            :param dtype: The internal state data type.
            :return: A `SelfAttentionWrapperState` tuple containing zeroed out tensors.
        """
        with ops.name_scope(type(self).__name__ + 'ZeroState',
                            values=[batch_size]):
            # Using batch_size * 0, rather than just 0, to keep the time dimension dynamic
            initial_cell_state = self._cell.zero_state(batch_size, dtype)
            initial_memory = array_ops.zeros(
                [batch_size, batch_size * 0, self._memory_size],
                dtype=self._dtype)
            return SelfAttentionWrapperState(cell_state=initial_cell_state,
                                             time=array_ops.zeros([], dtype=dtypes.int32),
                                             memory=initial_memory)
Example #12
    def zero_state(self, batch_size, dtype):
        """ Return an initial (zero) state tuple for this `AttentionWrapper`.
            :param batch_size: `0D` integer tensor: the batch size.
            :param dtype: The internal state data type.
            :return: An `AttentionWrapperState` tuple containing zeroed out tensors and, possibly, empty `TensorArrays`.
        """
        with ops.name_scope(type(self).__name__ + 'ZeroState',
                            values=[batch_size]):
            if self._initial_cell_state is not None:
                cell_state = self._initial_cell_state
            else:
                cell_state = self._cell.zero_state(batch_size, dtype)

            error_message = (
                'When calling zero_state of AttentionWrapper %s: ' %
                self._base_name +
                'Non-matching batch sizes between the memory (encoder output) and the requested batch '
                'size. Are you using the BeamSearchDecoder? If so, make sure your encoder output has been '
                'tiled to beam_width via tf.contrib.seq2seq.tile_batch, and the batch_size= argument '
                'passed to zero_state is batch_size * beam_width.')
            with ops.control_dependencies(
                    self._batch_size_checks(batch_size, error_message)):
                cell_state = nest.map_structure(
                    lambda state: array_ops.identity(
                        state, name='checked_cell_state'), cell_state)
            initial_alignments = [
                attention_mechanism.initial_alignments(batch_size, dtype)
                for attention_mechanism in self._attention_mechanisms
            ]
            return AttentionWrapperState(
                cell_state=cell_state,
                time=array_ops.zeros([], dtype=dtypes.int32),
                attention=_zero_state_tensors(self._attention_layer_size,
                                              batch_size, dtype),
                alignments=self._item_or_tuple(initial_alignments),
                attention_state=self._item_or_tuple(
                    attention_mechanism.initial_state(batch_size, dtype)
                    for attention_mechanism in self._attention_mechanisms),
                alignment_history=self._item_or_tuple(
                    tensor_array_ops.TensorArray(dtype,
                                                 size=0,
                                                 dynamic_size=True,
                                                 element_shape=alignment.shape)
                    if self._alignment_history else ()
                    for alignment in initial_alignments))
Example #13
    def zero_state(self, batch_size, dtype):
        """ Return an initial (zero) state tuple for this `IdentityCell`.
            :param batch_size: `0D` integer tensor: the batch size.
            :param dtype: The internal state data type.
            :return: A zeroed out scalar representing the initial state of the cell.
        """
        with ops.name_scope(type(self).__name__ + 'ZeroState',
                            values=[batch_size]):
            if self._feeder_cell is None:
                feeder_init_state = array_ops.zeros([], dtype=dtype)
            elif self._feeder_init_state is not None:
                feeder_init_state = self._feeder_init_state
            else:
                feeder_init_state = self._feeder_cell.zero_state(
                    batch_size, dtype)

            # Empty past attentions
            if self._past_attns is None:
                head_size = self._emb_size // self._nb_heads
                past_attns_shape = [
                    batch_size, self._nb_layers, 2, self._nb_heads,
                    0 * batch_size, head_size
                ]
                self._past_attns = array_ops.zeros(past_attns_shape,
                                                   dtype=dtypes.float32)

            # No Context - Returning a zero past attention
            if self._context is None:
                return TransformerCellState(past_attentions=self._past_attns,
                                            feeder_state=feeder_init_state,
                                            time=array_ops.zeros(
                                                [], dtype=dtypes.int32))

            # Context provided - Computing attention by running a single block step
            _, present_attns, _ = self._step(
                inputs=self._context_word_embedding_fn(self._context),
                past_attns=self._past_attns,
                time=0,
                feeder_cell=None,
                feeder_state=None)
            return TransformerCellState(past_attentions=present_attns,
                                        feeder_state=feeder_init_state,
                                        time=array_ops.zeros(
                                            [], dtype=dtypes.int32))
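The `past_attns` buffer appears to be a per-layer key/value cache of shape (batch, nb_layers, 2, nb_heads, time, head_size) that starts with an empty time axis; presumably each decoding step concatenates the new keys/values along that axis (this is an assumption about `_step`, which is not shown here). A shape-only sketch:

import tensorflow as tf

batch, nb_layers, nb_heads, head_size = 2, 4, 8, 16
past_attns = tf.zeros([batch, nb_layers, 2, nb_heads, 0, head_size])              # empty cache (time = 0)
present_attns = tf.random.normal([batch, nb_layers, 2, nb_heads, 1, head_size])   # K/V for one new step
past_attns = tf.concat([past_attns, present_attns], axis=-2)                      # time axis grows to 1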
Example #14
    def _update_standalone(self, version_step=None):
        """ Performs the gradient averaging, updates the variables, and the global step
            :param version_step: A variable that represents the model's version
            :return: The update operation to run

            Note: This method is called when there are no workers (no synchronization)
        """
        with ops.device(self._global_step.device), ops.name_scope(''):
            with ops.control_dependencies([self._apply_grad_op]):
                update_ops = [
                    state_ops.assign(self._local_step, self._global_step)
                ]

                # Increasing version step
                if version_step is not None:
                    update_ops += [state_ops.assign_add(version_step, 1)]

                # Returning
                return control_flow_ops.group(*update_ops)
Example #15
    def _update_distributed_as_worker(self):
        """ Waits for the chief to apply the accumulated gradients, then updates the local step
            :return: The update operation to run

            Note: This method is called by a worker when synchronization is required.
        """
        # Creating sync_token queue
        with ops.device(self._global_step.device), ops.name_scope(''):
            self._sync_token_queue = data_flow_ops.FIFOQueue(
                capacity=-1,
                dtypes=self._global_step.dtype.base_dtype,
                shapes=(),
                name='sync_token_q',
                shared_name='sync_token_q')

            # Waiting for token in queue (sync point)
            token = self._sync_token_queue.dequeue()
            return state_ops.assign(self._local_step, token)
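Examples #4 and #15 together form a token barrier: the chief enqueues `num_workers` copies of the global step once the averaged gradients have been applied, and every replica blocks on a dequeue before continuing. A minimal sketch of the queue pattern (names and sizes are arbitrary):

import tensorflow as tf

num_workers = 3
sync_token_q = tf.queue.FIFOQueue(capacity=-1, dtypes=tf.int64, shapes=(),
                                  shared_name='sync_token_q')

# Chief: once the averaged gradients have been applied, release one token per worker.
global_step = tf.constant(42, dtype=tf.int64)
sync_op = sync_token_q.enqueue_many((tf.fill([num_workers], global_step),))

# Every replica blocks here until a token is available, then adopts the
# chief's global step as its local step.
token = sync_token_q.dequeue()     # == 42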
Example #16
    def next_inputs(self, time, outputs, state, sample_ids, name=None):
        """ Computes the next inputs at a time step """
        with ops.name_scope(name, 'CustomHelperNextInputs', [time, outputs, state, sample_ids]):
            next_time = time + 1
            finished = (next_time >= self._sequence_length)
            all_finished = math_ops.reduce_all(finished)

            def get_next_inputs():
                """ Retrieves the inputs for the next time step """
                def get_training_inputs():
                    """ Selecting training inputs """
                    read_op = self._input_tas.read(next_time)
                    with ops.control_dependencies([read_op]):
                        return array_ops.identity(read_op)

                def get_sample_inputs():
                    """ Selecting greedy/sample inputs """
                    return sample_ids

                inputs_next_step = control_flow_ops.case(
                    [(gen_math_ops.equal(self._decoder_type, TRAINING_DECODER), get_training_inputs),
                     (gen_math_ops.equal(self._decoder_type, GREEDY_DECODER), get_sample_inputs),
                     (gen_math_ops.equal(self._decoder_type, SAMPLE_DECODER), get_sample_inputs)],
                    default=get_training_inputs)
                inputs_emb_next_step = self._input_layer(self._order_embedding_fn(inputs_next_step))
                candidate_next_step = self._candidate_tas.read(next_time)
                candidate_emb_next_step = self._candidate_embedding_fn(candidate_next_step)

                # Prevents this branch from executing eagerly
                with ops.control_dependencies([inputs_emb_next_step, candidate_next_step, candidate_emb_next_step]):
                    return CandidateInputs(inputs=array_ops.identity(inputs_emb_next_step),
                                           candidates=array_ops.identity(candidate_next_step),
                                           candidates_emb=array_ops.identity(candidate_emb_next_step))

            next_inputs = control_flow_ops.cond(all_finished,
                                                true_fn=lambda: self._zero_inputs,
                                                false_fn=get_next_inputs)

            # Returning
            return (finished, next_inputs, state)
Example #17
    def sample(self, time, outputs, state, name=None):
        """ Samples the id for the next time step (or -1 for teacher forcing)
            Note: outputs is a tuple of (cell_outputs, candidate)
        """
        cell_outputs, candidate = outputs

        with ops.name_scope(name, 'CustomHelperSample', [time, outputs, state]):

            def training():
                """ Selecting training / teacher forcing """
                fill_op = gen_array_ops.fill([array_ops.shape(cell_outputs)[0]], -1)
                with ops.control_dependencies([fill_op]):
                    return array_ops.identity(fill_op)

            def greedy():
                """ Selecting greedy """
                argmax_id = math_ops.cast(math_ops.argmax(cell_outputs, axis=-1), dtypes.int32)
                nb_candidate = array_ops.shape(candidate)[1]
                candidate_ids = \
                    math_ops.reduce_sum(array_ops.one_hot(argmax_id, nb_candidate, dtype=dtypes.int32) * candidate,
                                        axis=-1)
                with ops.control_dependencies([candidate_ids]):
                    return array_ops.identity(candidate_ids)

            def sample():
                """ Sampling """
                logits = cell_outputs if self._softmax_temperature is None else cell_outputs / self._softmax_temperature
                sample_id_sampler = categorical.Categorical(logits=logits)
                sample_ids = sample_id_sampler.sample(seed=self._seed)
                nb_candidate = array_ops.shape(candidate)[1]
                reduce_op = math_ops.reduce_sum(array_ops.one_hot(sample_ids,
                                                                  nb_candidate,
                                                                  dtype=dtypes.int32) * candidate, axis=-1)
                with ops.control_dependencies([reduce_op]):
                    return array_ops.identity(reduce_op)

            return control_flow_ops.case([(gen_math_ops.equal(self._decoder_type, TRAINING_DECODER), training),
                                          (gen_math_ops.equal(self._decoder_type, GREEDY_DECODER), greedy),
                                          (gen_math_ops.equal(self._decoder_type, SAMPLE_DECODER), sample)],
                                         default=training)
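Both `greedy()` and `sample()` map an index over the candidate axis back to a vocabulary id with a one-hot reduction rather than a gather; the two are equivalent, as this sketch shows:

import tensorflow as tf

candidate = tf.constant([[11, 12, 13],       # (b, nb_candidate) vocabulary ids
                         [21, 22, 23]])
argmax_id = tf.constant([2, 0])              # (b,) index into the candidate axis

nb_candidate = tf.shape(candidate)[1]
via_one_hot = tf.reduce_sum(tf.one_hot(argmax_id, nb_candidate, dtype=tf.int32) * candidate, axis=-1)
via_gather = tf.gather(candidate, argmax_id, batch_dims=1)
# via_one_hot == via_gather == [13, 21]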
Example #18
    def step(self, time, inputs, state, name=None):
        """ Performs a decoding step
            :param time: scalar `int32` tensor.
            :param inputs: A (structure of) input tensors.  (** This is a MaskedInputs tuple **)
            :param state: A (structure of) state tensors and TensorArrays.
            :param name: Name scope for any created operations.
            :return: (outputs, next_state, next_inputs, finished)
        """
        assert isinstance(
            inputs,
            (MaskedInputs, ops.Tensor)), 'Expected "MaskedInputs" or a Tensor.'
        with ops.name_scope(name, "BasicDecoderStep", (time, inputs, state)):
            inputs, output_mask = inputs, None
            if isinstance(inputs, MaskedInputs):
                inputs, output_mask = inputs.inputs, inputs.mask
            cell_outputs, cell_state = self._cell(inputs, state)
            cell_state_output = cell_outputs  # Corresponds to cell_state.h (before output layer)
            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)
            if output_mask is not None:
                cell_outputs = gen_math_ops.add(
                    cell_outputs, (1. - output_mask) * LARGE_NEGATIVE)
            sample_ids = self._helper.sample(time=time,
                                             outputs=cell_outputs,
                                             state=cell_state)
            (finished, next_inputs,
             next_state) = self._helper.next_inputs(time=time,
                                                    outputs=cell_outputs,
                                                    state=cell_state,
                                                    sample_ids=sample_ids)
        if self.extract_state:
            outputs = BasicDecoderWithStateOutput(cell_outputs,
                                                  cell_state_output,
                                                  sample_ids)
        else:
            outputs = seq2seq.BasicDecoderOutput(cell_outputs, sample_ids)
        return outputs, next_state, next_inputs, finished
Example #19
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """ Accumulates gradients for the variables and stores them in the accumulator

            Note: This does not update the variables, it just accumulates the gradients

            :param grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients()
            :param global_step: Optional variable to increment by one after the variables have been updated.
            :param name: Optional name for the returned op. Defaults to the name passed to the optimizer constructor.
            :return: The training operation to be run by each replica

            Raises - ValueError if grads_and_vars is empty
                   - ValueError if global_step is not provided
                   - ValueError if update() has already been called
        """
        # Making sure grads_and_var and global_step are provided
        if not grads_and_vars:
            raise ValueError('Must supply at least one variable.')
        if not global_step:
            raise ValueError('You must provide a global_step variable')
        if self._finalized:
            raise ValueError(
                'The optimizer has been finalized. You cannot call this method after update().'
            )

        train_ops = []
        accumulated_grad = []
        var_list = []
        chief_init_ops = []
        self._global_step = global_step

        # Colocating local step to prevent it from being placed on the parameter server
        local_anchor = gen_control_flow_ops.no_op()
        with ops.device(local_anchor.device):
            self._local_step = variable_scope.variable(
                initial_value=0,
                trainable=False,
                collections=[ops.GraphKeys.LOCAL_VARIABLES],
                dtype=global_step.dtype.base_dtype,
                name='sync_rep_local_step')

        # Setting initial step
        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   global_step)
        chief_init_ops += [self.local_step_init_op]

        with ops.name_scope(None, self._name):

            # Creating accumulators
            for grad, var in grads_and_vars:
                var_list += [var]
                with ops.device(var.device):

                    # No gradient - Pass-Through
                    if grad is None:
                        accumulated_grad += [None]
                        continue

                    # Sparse Variable - Accumulating over the dense shape
                    elif isinstance(grad, ops.IndexedSlices):
                        grad_accum = self._create_sparse_accumulator(var, grad)
                        train_ops += [
                            grad_accum.apply_indexed_slices_grad(
                                grad, local_step=self._local_step)
                        ]
                        accumulated_grad += [
                            self._take_sparse_grad(grad_accum, grad)
                        ]
                        chief_init_ops += [
                            grad_accum.set_global_step(global_step,
                                                       name='SetGlobalStep')
                        ]

                    # Dense Variable
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = self._create_dense_accumulator(var, grad)
                        train_ops += [
                            grad_accum.apply_grad(grad,
                                                  local_step=self._local_step)
                        ]
                        accumulated_grad += [
                            self._take_dense_grad(grad_accum, grad)
                        ]
                        chief_init_ops += [
                            grad_accum.set_global_step(global_step,
                                                       name='SetGlobalStep')
                        ]

                    # Unknown
                    else:
                        raise RuntimeError('Unsupported gradient type.')

            # Building update_op
            with ops.device(self._global_step.device), ops.name_scope(''):
                accumulated_grad = [
                    ensure_finite(gradient) for gradient in accumulated_grad
                ]
                if self._max_gradient_norm:
                    accumulated_grad, _ = clip_ops.clip_by_global_norm(
                        accumulated_grad, self._max_gradient_norm)
                self._apply_grad_op = self._optimizer.apply_gradients(
                    zip(accumulated_grad, var_list), global_step)

            # Building chief init ops
            self.chief_init_op = control_flow_ops.group(*chief_init_ops)

            # Building train_op
            return control_flow_ops.group(*train_ops)
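`_create_dense_accumulator` and `_take_dense_grad` are project helpers; the underlying primitive is presumably TensorFlow's `ConditionalAccumulator`, which only releases an averaged gradient once enough replicas have contributed. A minimal graph-mode sketch of that primitive:

import tensorflow as tf
tf1 = tf.compat.v1
tf1.disable_eager_execution()

accum = tf1.ConditionalAccumulator(dtype=tf.float32, shape=(), shared_name='grad_accum')
apply_ops = [accum.apply_grad(tf.constant(g), local_step=0) for g in (1.0, 3.0)]
mean_grad = accum.take_grad(num_required=2)    # available only once 2 gradients have been applied

with tf1.Session() as sess:
    sess.run(apply_ops)
    print(sess.run(mean_grad))                 # 2.0 == (1.0 + 3.0) / 2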
Example #20
def seeded_dropout(inputs,
                   seeds,
                   keep_probs,
                   offset=None,
                   noise_shape=None,
                   seed=None,
                   name=None):
    """ Computes dropout (with a deterministic mask).
        Every item in the batch has a deterministic seed to compute the deterministic mask

        With probability `keep_probs`, outputs the input element scaled up by `1 / keep_probs`, otherwise outputs `0`.
        The scaling is so that the expected sum is unchanged.

        By default, each element is kept or dropped independently. If `noise_shape` is specified, it must be
        broadcastable to the shape of `x`, and only dimensions with `noise_shape[i] == shape(x)[i]` will make
        independent decisions.

        For example, if `shape(x) = [k, l, m, n]` and `noise_shape = [k, 1, 1, n]`, each batch and channel component
        will be kept independently and each row and column will be kept or not kept together.

        :param inputs: A floating point tensor.
        :param seeds: A tensor representing the seed for each item in the batch. (Size: (batch,))
        :param keep_probs: A scalar or vector of size (batch,). The probability that each element is kept.
        :param offset: Integer. Alternative offset to apply to compute the deterministic mask (e.g. in a loop).
        :param noise_shape: A 1-D `Tensor` of type `int32`, represents the shape for randomly generated keep/drop flags.
        :param seed: A Python integer. Used to create a default seed for the operation.
        :param name: A name for this operation (optional).
        :return: A Tensor of the same shape of `x`.
    """
    if offset is None:
        seeded_dropout.offset += 40555607

    # If inputs is a scalar, it is likely the 'time' attribute in a state, so we don't want to mask it.
    # Integer tensors can also safely be ignored.
    if not inputs.shape or inputs.dtype.is_integer:
        return inputs

    with ops.name_scope(name, 'seeded_dropout', [inputs]):
        inputs = ops.convert_to_tensor(inputs, name='x')
        if not inputs.dtype.is_floating:
            raise ValueError(
                'Expected a floating point tensor. Got a %s tensor instead.' %
                inputs.dtype)
        if isinstance(keep_probs, float) and not 0 < keep_probs <= 1:
            raise ValueError(
                'keep_probs must be a scalar tensor or a float in the range (0, 1], got %g'
                % keep_probs)

        # Early return if nothing needs to be dropped.
        if isinstance(keep_probs, float) and keep_probs == 1:
            return inputs

        # Not supported in eager mode
        if context.executing_eagerly():
            raise ValueError('This function is not supported in eager mode.')

        # Converting to tensor
        keep_probs = ops.convert_to_tensor(keep_probs,
                                           dtype=inputs.dtype,
                                           name='keep_probs')
        keep_probs = gen_math_ops.maximum(0.,
                                          gen_math_ops.minimum(1., keep_probs))
        keep_probs = gen_array_ops.reshape(keep_probs, [-1] + [1] *
                                           (len(inputs.shape) - 1))
        all_keep_probs_are_one = math_ops.reduce_all(
            gen_math_ops.equal(keep_probs, 1.))

        # Computing noise shape
        noise_shape = nn_ops._get_noise_shape(inputs, noise_shape)  # pylint: disable=protected-access

        def get_dropout_mask():
            """ Computes the dropout mask """
            # random_tensor = uniform [keep_probs, 1.0 + keep_probs)
            random_tensor = keep_probs
            random_tensor += seeded_random(
                seeds,
                offset=offset if offset is not None else seeded_dropout.offset,
                shape=noise_shape[1:],
                dtype=inputs.dtype,
                seed=seed)

            # 0. if in [keep_probs, 1.0) and 1. if in [1.0, 1.0 + keep_probs)
            binary_tensor = gen_math_ops.floor(random_tensor)
            ret = math_ops.divide(inputs, keep_probs) * binary_tensor
            ret.set_shape(inputs.get_shape())

            # Setting control flow ops to avoid computing this function if not required
            with ops.control_dependencies([ret]):
                return array_ops.identity(ret)

        # Returning the dropout mask
        return control_flow_ops.cond(all_keep_probs_are_one,
                                     true_fn=lambda: inputs,
                                     false_fn=get_dropout_mask)
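The mask built in `get_dropout_mask` is standard inverted dropout: add uniform noise in [0, 1) to `keep_probs`, floor the result to get a 0/1 mask, and rescale kept elements by `1 / keep_probs`. A small sketch of just that arithmetic:

import tensorflow as tf

keep_probs = tf.constant([[0.8], [0.5]])                            # (batch, 1), broadcast over features
inputs = tf.ones([2, 6])

random_tensor = keep_probs + tf.random.uniform(tf.shape(inputs))    # in [keep_probs, 1 + keep_probs)
binary_mask = tf.floor(random_tensor)                               # 1. with prob keep_probs, else 0.
dropped = (inputs / keep_probs) * binary_mask
# E[dropped] == inputs: kept elements are scaled up by 1 / keep_probs.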
Example #21
    def __init__(self,
                 decoder_type,
                 inputs,
                 embedding,
                 sequence_length,
                 mask,
                 input_layer=None,
                 time_major=False,
                 softmax_temperature=None,
                 seed=None,
                 name=None):
        """ Constructor
            :param decoder_type: An uint8 representing TRAINING_DECODER, GREEDY_DECODER, or SAMPLE_DECODER
            :param inputs: The decoder input (b, dec_len)
            :param embedding: The embedding vector
            :param sequence_length: The length of each input (b,)
            :param mask: [SparseTensor] Mask to apply at each time step -- Size: (b, dec_len, vocab_size, vocab_size)
            :param input_layer: Optional. A layer to apply on the inputs
            :param time_major: If true indicates that the first dimension is time, otherwise it is batch size
            :param softmax_temperature: Optional. Softmax temperature. None or size: (batch_size,)
            :param seed: Optional. The sampling seed
            :param name: Optional scope name.
        """
        # pylint: disable=too-many-arguments
        with ops.name_scope(name, "CustomHelper",
                            [inputs, sequence_length, embedding]):
            assert isinstance(mask,
                              SparseTensor), 'The mask must be a SparseTensor'
            inputs = ops.convert_to_tensor(inputs, name="inputs")
            self._inputs = inputs
            self._mask = mask
            self._time_major = time_major
            self._embedding_fn = embedding if callable(
                embedding) else lambda ids: embedding_lookup(embedding, ids)
            if not time_major:
                inputs = nest.map_structure(_transpose_batch_time, inputs)
            self._input_tas = nest.map_structure(_unstack_ta, inputs)
            self._decoder_type = decoder_type
            self._sequence_length = ops.convert_to_tensor(
                sequence_length, name="sequence_length")
            if self._sequence_length.get_shape().ndims != 1:
                raise ValueError(
                    "Expected vector for sequence_length. Shape: %s" %
                    self._sequence_length.get_shape())
            self._input_layer = input_layer if callable(
                input_layer) else lambda x: x
            self._batch_size = array_ops.size(sequence_length)
            self._start_inputs = gen_array_ops.fill([self._batch_size], GO_ID)
            self._softmax_temperature = softmax_temperature
            self._seed = seed
            self.vocab_size = VOCABULARY_SIZE
            self._zero_inputs = \
                MaskedInputs(inputs=array_ops.zeros_like(self._input_layer(self._embedding_fn(self._start_inputs))),
                             mask=_slice_mask(self._mask,
                                              slicing=[-1, 0, GO_ID, -1],
                                              squeeze=True,
                                              time_major=self._time_major))

            # Preventing div by zero
            # Adding an extra dim to the matrix, so we can broadcast with the outputs shape
            if softmax_temperature is not None:
                self._softmax_temperature = gen_math_ops.maximum(
                    1e-10, self._softmax_temperature)
                if self._softmax_temperature.get_shape().ndims == 1:
                    self._softmax_temperature = self._softmax_temperature[:,
                                                                          None]