    def _update_distributed_as_chief(self, version_step=None):
        """ Performs the gradient averaging, then updates the variables and the global step.
            :param version_step: A variable that represents the model's version
            :return: The update operation to run

            Note: This method is called by the chief when synchronization is required.
        """
        # Creating sync_token queue
        with ops.device(self._global_step.device), ops.name_scope(''):
            self._sync_token_queue = data_flow_ops.FIFOQueue(
                capacity=-1,  # a negative capacity means the queue is unbounded
                dtypes=self._global_step.dtype.base_dtype,
                shapes=(),
                name='sync_token_q',
                shared_name='sync_token_q')

            # Applying grads, then adding tokens to queue
            with ops.control_dependencies([self._apply_grad_op]):
                tokens = gen_array_ops.fill([self._num_workers],
                                            self._global_step)
                sync_op = self._sync_token_queue.enqueue_many((tokens, ))

                # Waiting for token in queue (sync point)
                with ops.control_dependencies([sync_op]):
                    token = self._sync_token_queue.dequeue()
                    update_ops = [state_ops.assign(self._local_step, token)]

                    # Increasing version step
                    if version_step is not None:
                        update_ops += [state_ops.assign_add(version_step, 1)]

                    # Returning
                    return control_flow_ops.group(*update_ops)
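A minimal, self-contained sketch of the same sync-token pattern, written against plain tf.compat.v1 ops (the names below are illustrative, not this repository's API): the chief enqueues one token per worker once the averaged gradients have been applied, and each worker blocks on dequeue() until its token is available.

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

num_workers = 4
global_step = tf.train.get_or_create_global_step()

# A negative capacity makes the queue unbounded, matching 'sync_token_q' above.
token_queue = tf.FIFOQueue(capacity=-1,
                           dtypes=global_step.dtype.base_dtype,
                           shapes=(),
                           shared_name='sync_token_q')

# Chief side: a stand-in for the real apply-gradients op, then token release.
apply_grad_op = tf.assign_add(global_step, 1)
with tf.control_dependencies([apply_grad_op]):
    tokens = tf.fill([num_workers], global_step)
    release_workers = token_queue.enqueue_many((tokens,))

# Worker side: blocks until the chief has released a token for this round.
wait_for_chief = token_queue.dequeue()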
Example #2
 def sample():
     """ Sampling """
     logits = outputs if self._softmax_temperature is None else outputs / self._softmax_temperature
     sample_id_sampler = categorical.Categorical(logits=logits)
     sample_op = sample_id_sampler.sample(seed=self._seed)
     with ops.control_dependencies([sample_op]):
         return array_ops.identity(sample_op)
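The snippet above defers to the internal categorical distribution; a roughly equivalent stand-alone sketch using the public tf.random.categorical op (the logits and temperature here are made up for illustration):

import tensorflow as tf

logits = tf.constant([[2.0, 1.0, 0.1]])      # [batch, vocab]
temperature = 0.7                            # < 1 sharpens, > 1 flattens

scaled_logits = logits / temperature
sample_ids = tf.random.categorical(scaled_logits, num_samples=1, seed=42)  # [batch, 1]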
Example #3
        def get_next_inputs():
            """ Retrieves the inputs for the next time step """
            inputs_next_step = sample_ids
            inputs_emb_next_step = self._input_layer(
                self._embedding_fn(inputs_next_step))  # [bat, beam, in_sz]

            # Applying mask
            # inputs_one_hot:   (batch, beam,   1, VOC,   1)
            # mask_t:           (batch,    1,   1, VOC, VOC)
            # next_mask:        (batch, beam, VOC)
            inputs_one_hot = array_ops.one_hot(inputs_next_step,
                                               self.vocab_size)[:, :, None, :, None]
            mask_t = sparse_ops.sparse_tensor_to_dense(
                _slice_mask(self._mask, [-1, next_time, -1, -1],
                            time_major=self._time_major))[:, None, :, :, :]
            mask_t.set_shape([None, 1, 1, self.vocab_size, self.vocab_size])
            next_mask = math_ops.reduce_sum(inputs_one_hot * mask_t,
                                            axis=[2, 3])
            next_mask = gen_math_ops.minimum(next_mask, 1.)

            # Prevents this branch from executing eagerly
            with ops.control_dependencies([inputs_emb_next_step, next_mask]):
                return MaskedInputs(
                    inputs=array_ops.identity(inputs_emb_next_step),
                    mask=array_ops.identity(next_mask))
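For intuition, a small NumPy sketch of the same one-hot/mask contraction with toy sizes (batch=1, beam=2, VOC=3); all names here are illustrative:

import numpy as np

VOC = 3
prev_token = np.array([[0, 2]])                              # (batch, beam)
mask = np.random.randint(0, 2, (1, 1, 1, VOC, VOC)).astype(np.float32)

inputs_one_hot = np.eye(VOC, dtype=np.float32)[prev_token]   # (batch, beam, VOC)
inputs_one_hot = inputs_one_hot[:, :, None, :, None]         # (batch, beam, 1, VOC, 1)

# Broadcasting against mask (batch, 1, 1, VOC, VOC) and summing over the
# singleton and previous-token axes picks, per beam, the mask row of the
# token that was just emitted.
next_mask = (inputs_one_hot * mask).sum(axis=(2, 3))         # (batch, beam, VOC)
next_mask = np.minimum(next_mask, 1.)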
Example #4
            def get_next_inputs():
                """ Retrieves the inputs for the next time step """
                def get_training_inputs():
                    """ Selecting training inputs """
                    read_op = self._input_tas.read(next_time)
                    with ops.control_dependencies([read_op]):
                        return array_ops.identity(read_op)

                def get_sample_inputs():
                    """ Selecting greedy/sample inputs """
                    return sample_ids

                inputs_next_step = control_flow_ops.case(
                    [(gen_math_ops.equal(self._decoder_type, TRAINING_DECODER), get_training_inputs),
                     (gen_math_ops.equal(self._decoder_type, GREEDY_DECODER), get_sample_inputs),
                     (gen_math_ops.equal(self._decoder_type, SAMPLE_DECODER), get_sample_inputs)],
                    default=get_training_inputs)
                inputs_emb_next_step = self._input_layer(self._order_embedding_fn(inputs_next_step))
                candidate_next_step = self._candidate_tas.read(next_time)
                candidate_emb_next_step = self._candidate_embedding_fn(candidate_next_step)

                # Prevents this branch from executing eagerly
                with ops.control_dependencies([inputs_emb_next_step, candidate_next_step, candidate_emb_next_step]):
                    return CandidateInputs(inputs=array_ops.identity(inputs_emb_next_step),
                                           candidates=array_ops.identity(candidate_next_step),
                                           candidates_emb=array_ops.identity(candidate_emb_next_step))
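The decoder-type dispatch above relies on control_flow_ops.case; a tiny self-contained sketch of the same pattern with the public tf.case (the constants and placeholder are stand-ins):

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

TRAINING_DECODER, GREEDY_DECODER, SAMPLE_DECODER = 0, 1, 2
decoder_type = tf.placeholder(tf.int32, [])

# Exactly one branch runs; unmatched predicates fall through to `default`.
next_inputs = tf.case(
    [(tf.equal(decoder_type, TRAINING_DECODER), lambda: tf.constant('teacher')),
     (tf.equal(decoder_type, GREEDY_DECODER), lambda: tf.constant('greedy')),
     (tf.equal(decoder_type, SAMPLE_DECODER), lambda: tf.constant('sample'))],
    default=lambda: tf.constant('teacher'))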
Example #5
 def greedy():
     """ Selecting greedy """
     argmax_op = math_ops.argmax(outputs,
                                 axis=-1,
                                 output_type=dtypes.int32)
     with ops.control_dependencies([argmax_op]):
         return array_ops.identity(argmax_op)
Example #6
 def _zero_grad():
     """ Returns a zeroed-out gradient """
     zero_values = array_ops.zeros_like(grad.values)
     with ops.control_dependencies([zero_values]):
         return ops.IndexedSlices(
             values=array_ops.identity(zero_values),
             indices=math_ops.cast(grad.indices, dtypes.int64),
             dense_shape=math_ops.cast(grad.dense_shape, dtypes.int64))
Example #7
 def _take_grad():
     """ Computes the gradient from the accumulator """
     avg_grad = grad_accum.take_indexed_slices_grad(num_required=1)
     with ops.control_dependencies([avg_grad]):
         return ops.IndexedSlices(values=array_ops.identity(
             avg_grad.values),
                                  indices=avg_grad.indices,
                                  dense_shape=avg_grad.dense_shape)
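Both _zero_grad and _take_grad are branches over a gradient accumulator; a minimal dense sketch of that pattern with tf.compat.v1's ConditionalAccumulator (the gradient and shapes are placeholders, not the repository's):

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

grad = tf.ones([3])                                    # stand-in gradient
accum = tf.ConditionalAccumulator(dtype=tf.float32, shape=[3])
apply_op = accum.apply_grad(grad, local_step=0)

def _take_grad():
    avg_grad = accum.take_grad(num_required=1)         # mean of accumulated grads
    with tf.control_dependencies([avg_grad]):
        return tf.identity(avg_grad)

def _zero_grad():
    return tf.zeros_like(grad)

# Only take from the accumulator when something was actually accumulated.
with tf.control_dependencies([apply_op]):
    maybe_grad = tf.cond(accum.num_accumulated() > 0, _take_grad, _zero_grad)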
Example #8
 def get_zero_memory_and_attn():
     """ Time = 0, we don't concatenate to memory and attention is all 0. """
     next_memory = state.memory
     next_attention = array_ops.zeros(
         [batch_size, self._attention_layer_size], dtype=inputs.dtype)
     with ops.control_dependencies([next_memory, next_attention]):
         return array_ops.identity(next_memory), array_ops.identity(
             next_attention)
Example #9
 def greedy():
     """ Selecting greedy """
     argmax_id = math_ops.cast(math_ops.argmax(cell_outputs, axis=-1), dtypes.int32)
     nb_candidate = array_ops.shape(candidate)[1]
     candidate_ids = \
         math_ops.reduce_sum(array_ops.one_hot(argmax_id, nb_candidate, dtype=dtypes.int32) * candidate,
                             axis=-1)
     with ops.control_dependencies([candidate_ids]):
         return array_ops.identity(candidate_ids)
Example #10
 def sample():
     """ Sampling """
     logits = cell_outputs if self._softmax_temperature is None else cell_outputs / self._softmax_temperature
     sample_id_sampler = categorical.Categorical(logits=logits)
     sample_ids = sample_id_sampler.sample(seed=self._seed)
     nb_candidate = array_ops.shape(candidate)[1]
     reduce_op = math_ops.reduce_sum(array_ops.one_hot(sample_ids,
                                                       nb_candidate,
                                                       dtype=dtypes.int32) * candidate, axis=-1)
     with ops.control_dependencies([reduce_op]):
         return array_ops.identity(reduce_op)
Example #11
 def get_next_memory_and_attn():
     """ Gets the next memory and attention """
     next_memory = array_ops.concat(
         [
             state.memory,  # [b, t, mem_size]
             array_ops.expand_dims(self._input_fn(inputs), axis=1)
         ],
         axis=1)
     next_attention = self._compute_attention(inputs, next_memory)
     with ops.control_dependencies([next_memory, next_attention]):
         return array_ops.identity(next_memory), array_ops.identity(
             next_attention)
Example #12
        def get_next_inputs():
            """ Retrieves the inputs for the next time step """
            inputs_next_step = sample_ids
            inputs_emb_next_step = self._input_layer(self._order_embedding_fn(inputs_next_step))
            candidate_next_step = self._candidate_tas.read(next_time)
            candidate_emb_next_step = self._candidate_embedding_fn(candidate_next_step)

            # Prevents this branch from executing eagerly
            with ops.control_dependencies([inputs_emb_next_step, candidate_next_step, candidate_emb_next_step]):
                return CandidateInputs(inputs=array_ops.identity(inputs_emb_next_step),
                                       candidates=array_ops.identity(candidate_next_step),
                                       candidates_emb=array_ops.identity(candidate_emb_next_step))
Example #13
    def zero_state(self, batch_size, dtype):
        """ Return an initial (zero) state tuple for this `AttentionWrapper`.
            :param batch_size: `0D` integer tensor: the batch size.
            :param dtype: The internal state data type.
            :return: `AttentionWrapperState` tuple containing zeroed-out tensors and, possibly, empty `TensorArrays`.
        """
        with ops.name_scope(type(self).__name__ + 'ZeroState',
                            values=[batch_size]):
            if self._initial_cell_state is not None:
                cell_state = self._initial_cell_state
            else:
                cell_state = self._cell.zero_state(batch_size, dtype)

            error_message = (
                'When calling zero_state of AttentionWrapper %s: ' %
                self._base_name +
                'Non-matching batch sizes between the memory (encoder output) and the requested batch '
                'size. Are you using the BeamSearchDecoder? If so, make sure your encoder output has been '
                'tiled to beam_width via tf.contrib.seq2seq.tile_batch, and the batch_size= argument '
                'passed to zero_state is batch_size * beam_width.')
            with ops.control_dependencies(
                    self._batch_size_checks(batch_size, error_message)):
                cell_state = nest.map_structure(
                    lambda state: array_ops.identity(
                        state, name='checked_cell_state'), cell_state)
            initial_alignments = [
                attention_mechanism.initial_alignments(batch_size, dtype)
                for attention_mechanism in self._attention_mechanisms
            ]
            return AttentionWrapperState(
                cell_state=cell_state,
                time=array_ops.zeros([], dtype=dtypes.int32),
                attention=_zero_state_tensors(self._attention_layer_size,
                                              batch_size, dtype),
                alignments=self._item_or_tuple(initial_alignments),
                attention_state=self._item_or_tuple(
                    attention_mechanism.initial_state(batch_size, dtype)
                    for attention_mechanism in self._attention_mechanisms),
                alignment_history=self._item_or_tuple(
                    tensor_array_ops.TensorArray(dtype,
                                                 size=0,
                                                 dynamic_size=True,
                                                 element_shape=alignment.shape)
                    if self._alignment_history else ()
                    for alignment in initial_alignments))
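A hedged usage sketch of the beam-search contract spelled out in the error message above, written against TF 1.x tf.contrib.seq2seq (the encoder output and sizes are made up):

import tensorflow as tf  # TF 1.x

beam_width = 5
encoder_outputs = tf.placeholder(tf.float32, [None, 20, 128])
batch_size = tf.shape(encoder_outputs)[0]

# Memory is tiled with tile_batch (NOT tf.tile), and zero_state is requested
# for batch_size * beam_width, as the error message demands.
tiled_memory = tf.contrib.seq2seq.tile_batch(encoder_outputs, multiplier=beam_width)
attention = tf.contrib.seq2seq.LuongAttention(num_units=128, memory=tiled_memory)
cell = tf.contrib.seq2seq.AttentionWrapper(tf.nn.rnn_cell.LSTMCell(128), attention)
initial_state = cell.zero_state(batch_size * beam_width, tf.float32)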
Example #14
            def get_next_inputs():
                """ Retrieves the inputs for the next time step """
                def get_training_inputs():
                    """ Selecting training inputs """
                    read_op = self._input_tas.read(next_time)
                    with ops.control_dependencies([read_op]):
                        return array_ops.identity(read_op)

                def get_sample_inputs():
                    """ Selecting greedy/sample inputs """
                    return sample_ids

                inputs_next_step = control_flow_ops.case(
                    [(gen_math_ops.equal(self._decoder_type, TRAINING_DECODER),
                      get_training_inputs),
                     (gen_math_ops.equal(self._decoder_type,
                                         GREEDY_DECODER), get_sample_inputs),
                     (gen_math_ops.equal(self._decoder_type,
                                         SAMPLE_DECODER), get_sample_inputs)],
                    default=get_training_inputs)
                inputs_emb_next_step = self._input_layer(
                    self._embedding_fn(inputs_next_step))

                # Applying mask
                # inputs_one_hot:   (b, 1, VOC, 1)
                # mask_t:           (b, 1, VOC, VOC)
                # next_mask:        (b, VOC)        -- DenseTensor
                inputs_one_hot = array_ops.one_hot(inputs_next_step,
                                                   self.vocab_size)[:, None, :, None]
                mask_t = _slice_mask(self._mask, [-1, next_time, -1, -1],
                                     time_major=self._time_major)
                next_mask = sparse_ops.sparse_reduce_sum(inputs_one_hot * mask_t,
                                                         axis=[1, 2])
                next_mask = gen_math_ops.minimum(next_mask, 1.)
                next_mask.set_shape([None, self.vocab_size])

                # Prevents this branch from executing eagerly
                with ops.control_dependencies(
                    [inputs_emb_next_step, next_mask]):
                    return MaskedInputs(
                        inputs=array_ops.identity(inputs_emb_next_step),
                        mask=array_ops.identity(next_mask))
Example #15
        def get_dropout_mask():
            """ Computes the dropout mask """
            # random_tensor = uniform [keep_probs, 1.0 + keep_probs)
            random_tensor = keep_probs
            random_tensor += seeded_random(
                seeds,
                offset=offset if offset is not None else seeded_dropout.offset,
                shape=noise_shape[1:],
                dtype=inputs.dtype,
                seed=seed)

            # 0. if in [keep_probs, 1.0) and 1. if in [1.0, 1.0 + keep_probs)
            binary_tensor = gen_math_ops.floor(random_tensor)
            ret = math_ops.divide(inputs, keep_probs) * binary_tensor
            ret.set_shape(inputs.get_shape())

            # Setting control flow ops to avoid computing this function if not required
            with ops.control_dependencies([ret]):
                return array_ops.identity(ret)
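The floor trick above is standard inverted dropout; a tiny NumPy sketch of the same arithmetic (all names illustrative):

import numpy as np

keep_prob = 0.8
x = np.ones((2, 4), dtype=np.float32)

# Uniform noise in [keep_prob, 1 + keep_prob) floors to 1 with probability
# keep_prob and to 0 otherwise, yielding the dropout mask in one pass.
random_tensor = keep_prob + np.random.uniform(size=x.shape)
binary_mask = np.floor(random_tensor)

# Dividing by keep_prob keeps E[dropped] == x (inverted dropout).
dropped = (x / keep_prob) * binary_mask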
Example #16
    def _update_standalone(self, version_step=None):
        """ Performs the gradient averaging, then updates the variables and the global step.
            :param version_step: A variable that represents the model's version
            :return: The update operation to run

            Note: This method is called when there are no workers (no synchronization)
        """
        with ops.device(self._global_step.device), ops.name_scope(''):
            with ops.control_dependencies([self._apply_grad_op]):
                update_ops = [
                    state_ops.assign(self._local_step, self._global_step)
                ]

                # Increasing version step
                if version_step is not None:
                    update_ops += [state_ops.assign_add(version_step, 1)]

                # Returning
                return control_flow_ops.group(*update_ops)
Example #17
    def call(self, inputs, state):
        """ Performs a step of attention-wrapped RNN.

            1) Mix the `inputs` and previous step's `attention` output via `cell_input_fn`.
            2) Call the wrapped `cell` with this input and its previous state.
            3) Score the cell's output with `attention_mechanism`.
            4) Calculate the alignments by passing the score through the `normalizer`.
            5) Calculate the context vector as the inner product between the alignments and the attention_mechanism's
               values (memory).
            6) Calculate the attention output by concatenating the cell output and context through the attention
               layer (a linear layer with `attention_layer_size` outputs).

            :param inputs: (Possibly nested tuple of) Tensor, the input at this time step.
            :param state: An instance of `AttentionWrapperState` containing tensors from the previous time step.
            :return: A tuple `(attention_or_cell_output, next_state)`, where:
            - `attention_or_cell_output`: the attention value if `output_attention` is `True`, otherwise the cell output.
            - `next_state` is an instance of `AttentionWrapperState` containing the state calculated at this time step.
        """
        # pylint: disable=arguments-differ
        if not isinstance(state, AttentionWrapperState):
            raise TypeError(
                'Expected state to be an instance of AttentionWrapperState. Received %s instead. '
                % type(state))

        # Step 1: Calculate the true inputs to the cell based on the previous attention value.
        cell_inputs = self._cell_input_fn(inputs, state.attention)
        cell_state = state.cell_state
        cell_output, next_cell_state = self._cell(cell_inputs, cell_state)

        cell_batch_size = tensor_shape.dimension_value(
            cell_output.shape[0]) or array_ops.shape(cell_output)[0]
        error_message = (
            'When applying AttentionWrapper %s: ' % self.name +
            'Non-matching batch sizes between '
            'the memory (encoder output) and the query (decoder output). Are you using the '
            'BeamSearchDecoder? You may need to tile your memory input via the tf.contrib.seq2seq.'
            'tile_batch function with argument multiple=beam_width.')

        with variable_scope.variable_scope(self._name_or_scope,
                                           'AttentionWrapper',
                                           [inputs, state]):

            with ops.control_dependencies(
                    self._batch_size_checks(cell_batch_size, error_message)):
                cell_output = array_ops.identity(cell_output,
                                                 name='checked_cell_output')

            if self._is_multi:
                previous_attention_state = state.attention_state
                previous_alignment_history = state.alignment_history
            else:
                previous_attention_state = [state.attention_state]
                previous_alignment_history = [state.alignment_history]

            # Computing attention
            all_alignments = []
            all_attentions = []
            all_attention_states = []
            maybe_all_histories = []
            for i, attention_mechanism in enumerate(
                    self._attention_mechanisms):
                attention, alignments, next_attention_state = _compute_attention(
                    attention_mechanism, cell_output,
                    previous_attention_state[i], self._attention_layers[i]
                    if self._attention_layers else None)
                alignment_history = previous_alignment_history[i].write(
                    state.time, alignments) if self._alignment_history else ()

                all_attention_states.append(next_attention_state)
                all_alignments.append(alignments)
                all_attentions.append(attention)
                maybe_all_histories.append(alignment_history)

            # Building next state
            attention = array_ops.concat(all_attentions, 1)
            next_state = AttentionWrapperState(
                time=state.time + 1,
                cell_state=next_cell_state,
                attention=attention,
                attention_state=self._item_or_tuple(all_attention_states),
                alignments=self._item_or_tuple(all_alignments),
                alignment_history=self._item_or_tuple(maybe_all_histories))

            # Returning
            if self._output_attention:
                return attention, next_state
            return cell_output, next_state
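Steps 3-5 of the docstring (score, normalize, context) condense to a few ops; a compact sketch for a single Luong-style (dot-product) mechanism, with illustrative shapes and names:

import tensorflow as tf

batch, time, units = 2, 5, 8
memory = tf.random.normal([batch, time, units])        # encoder outputs (values)
cell_output = tf.random.normal([batch, units])         # decoder query

score = tf.matmul(memory, cell_output[:, :, None])[:, :, 0]   # step 3: (batch, time)
alignments = tf.nn.softmax(score, axis=-1)                    # step 4: normalizer
context = tf.reduce_sum(alignments[:, :, None] * memory, 1)   # step 5: (batch, units)
attention_input = tf.concat([cell_output, context], -1)       # fed to the attention layer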
Example #18
 def get_next_alignments():
     """ Returns the next alignments """
     next_align = self._alignments_ta.read(next_time)
     with ops.control_dependencies([next_align]):
         return array_ops.identity(next_align)
Example #19
    def __init__(self,
                 cell,
                 attention_mechanism,
                 attention_layer_size=None,
                 alignment_history=False,
                 cell_input_fn=None,
                 output_attention=True,
                 initial_cell_state=None,
                 name_or_scope='AttentionWrapper',
                 attention_layer=None):
        """ Construct the `AttentionWrapper`.

            :param cell: An instance of `RNNCell`.
            :param attention_mechanism: A list of `AttentionMechanism` instances or a single instance.
            :param attention_layer_size: A list of Python integers or a single Python integer,
                                         the depth of the attention (output) layer(s).
            :param alignment_history: Python boolean, whether to store alignment history from all time steps in the
                                      final output state.
            :param cell_input_fn: (optional) A `callable`. The default is: concat([inputs, attention], axis=-1)
            :param output_attention: Python bool. If `True` (default), the output at each time step is the attn value.
            :param initial_cell_state: The initial state value to use for the cell when the user calls `zero_state()`.
            :param name_or_scope: String or VariableScope to use when creating ops.
            :param attention_layer: A list of `tf.layers.Layer` instances or a single `tf.layers.Layer` instance taking
                                    the context and cell output as inputs to generate attention at each time step.
                                    If None (default), use the context as attention at each time step.

            **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in `AttentionWrapper`,
                     then you must ensure that:

            - The encoder output has been tiled to `beam_width` via `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
            - The `batch_size` argument passed to the `zero_state` method of this wrapper is equal to
                `true_batch_size * beam_width`.
            - The initial state created with `zero_state` above contains a `cell_state` value containing properly
                tiled final state from the encoder.
        """
        # pylint: disable=too-many-arguments
        self._name_or_scope = name_or_scope
        with variable_scope.variable_scope(name_or_scope, 'AttentionWrapper'):
            super(AttentionWrapper, self).__init__()
            rnn_cell_impl.assert_like_rnncell("cell", cell)

            # Attention mechanism
            if isinstance(attention_mechanism, (list, tuple)):
                self._is_multi = True
                attention_mechanisms = attention_mechanism
                for attn_mechanism in attention_mechanisms:
                    if not isinstance(attn_mechanism, AttentionMechanism):
                        raise TypeError(
                            'attention_mechanism must contain only instances of AttentionMechanism, saw '
                            'type: %s' % type(attn_mechanism).__name__)
            else:
                self._is_multi = False
                if not isinstance(attention_mechanism, AttentionMechanism):
                    raise TypeError(
                        'attention_mechanism must be an AttentionMechanism or list of multiple '
                        'AttentionMechanism instances, saw type: %s' %
                        type(attention_mechanism).__name__)
                attention_mechanisms = (attention_mechanism, )

            # Cell input function
            if cell_input_fn is None:
                cell_input_fn = lambda inputs, attention: array_ops.concat(
                    [inputs, attention], -1)
            else:
                if not callable(cell_input_fn):
                    raise TypeError(
                        'cell_input_fn must be callable, saw type: %s' %
                        type(cell_input_fn).__name__)

            # Attention layer size
            if attention_layer_size is not None and attention_layer is not None:
                raise ValueError(
                    'Only one of attention_layer_size and attention_layer should be set'
                )

            if attention_layer_size is not None:
                attention_layer_sizes = tuple(
                    attention_layer_size if isinstance(attention_layer_size, (
                        list, tuple)) else (attention_layer_size, ))
                if len(attention_layer_sizes) != len(attention_mechanisms):
                    raise ValueError(
                        'If provided, attention_layer_size must contain exactly one integer per '
                        'attention_mechanism, saw: %d vs %d' %
                        (len(attention_layer_sizes),
                         len(attention_mechanisms)))
                self._attention_layers = tuple(
                    core.Dense(attention_layer_size,
                               name='attention_layer',
                               use_bias=False,
                               dtype=attention_mechanisms[i].dtype) for i,
                    attention_layer_size in enumerate(attention_layer_sizes))
                self._attention_layer_size = sum(attention_layer_sizes)

            elif attention_layer is not None:
                self._attention_layers = tuple(attention_layer if isinstance(
                    attention_layer, (list, tuple)) else (attention_layer, ))
                if len(self._attention_layers) != len(attention_mechanisms):
                    raise ValueError(
                        'If provided, attention_layer must contain exactly one layer per '
                        'attention_mechanism, saw: %d vs %d' %
                        (len(self._attention_layers),
                         len(attention_mechanisms)))
                self._attention_layer_size = \
                    sum(tensor_shape.dimension_value(
                        layer.compute_output_shape([None,
                                                    cell.output_size
                                                    + tensor_shape.dimension_value(mechanism.values.shape[-1])])[-1])
                        for layer, mechanism in zip(self._attention_layers, attention_mechanisms))
            else:
                self._attention_layers = None
                self._attention_layer_size = sum(
                    tensor_shape.dimension_value(
                        attention_mechanism.values.shape[-1])
                    for attention_mechanism in attention_mechanisms)

            self._cell = cell
            self._attention_mechanisms = attention_mechanisms
            self._cell_input_fn = cell_input_fn
            self._output_attention = output_attention
            self._alignment_history = alignment_history

            if initial_cell_state is None:
                self._initial_cell_state = None
            else:
                final_state_tensor = nest.flatten(initial_cell_state)[-1]
                state_batch_size = (tensor_shape.dimension_value(
                    final_state_tensor.shape[0])
                                    or array_ops.shape(final_state_tensor)[0])
                error_message = (
                    'When constructing AttentionWrapper %s: ' % self._base_name
                    +
                    'Non-matching batch sizes between the memory (encoder output) and initial_cell_state. '
                    'Are you using the BeamSearchDecoder? You may need to tile your initial state via the '
                    'tf.contrib.seq2seq.tile_batch function with argument multiple=beam_width.'
                )

                with ops.control_dependencies(
                        self._batch_size_checks(state_batch_size,
                                                error_message)):
                    self._initial_cell_state = \
                        nest.map_structure(lambda state: array_ops.identity(state, name='check_initial_cell_state'),
                                           initial_cell_state)
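A hedged construction sketch against the signature documented above; the cell, mechanism, and sizes are illustrative, and it assumes a mechanism satisfying this module's AttentionMechanism interface (e.g. one built like TF 1.x's contrib mechanisms):

import tensorflow as tf  # TF 1.x

memory = tf.placeholder(tf.float32, [None, 30, 256])      # encoder outputs
mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=256, memory=memory)

wrapped_cell = AttentionWrapper(
    cell=tf.nn.rnn_cell.GRUCell(256),
    attention_mechanism=mechanism,
    attention_layer_size=128,        # Dense(128) over [cell_output; context]
    alignment_history=True,          # keep per-step alignments in a TensorArray
    output_attention=True)           # emit the attention vector at each step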
Example #20
 def training():
     """ Selecting training / teacher forcing """
     fill_op = gen_array_ops.fill([array_ops.shape(outputs)[0]], -1)
     with ops.control_dependencies([fill_op]):
         return array_ops.identity(fill_op)
Example #21
 def _take_grad():
     """ Computes the gradient from the accumulator """
     avg_grad = grad_accum.take_grad(num_required=1)
     with ops.control_dependencies([avg_grad]):
         return array_ops.identity(avg_grad)
Example #22
 def get_training_inputs():
     """ Selecting training inputs """
     read_op = self._input_tas.read(next_time)
     with ops.control_dependencies([read_op]):
         return array_ops.identity(read_op)
Example #23
 def _zero_grad():
     """ Returns a zeroed-out gradient """
     zero_like_grad = array_ops.zeros_like(grad)
     with ops.control_dependencies([zero_like_grad]):
         return array_ops.identity(zero_like_grad)
Example #24
 def start_inputs():
     """ Returns the GO_ID initial input """
     embed_op = self._embedding_fn(self._start_inputs)
     with ops.control_dependencies([embed_op]):
         return array_ops.identity(embed_op)
Example #25
 def training_inputs():
     """ Returns the training initial input """
     embed_op = self._embedding_fn(self._input_tas.read(0))
     with ops.control_dependencies([embed_op]):
         return array_ops.identity(embed_op)