Beispiel #1
0
 def non_force():
     """Run the decoder through `raw_rnn` and return its stacked outputs.

     The loop function comes from `loop_fn_build`; judging by this
     function's name it presumably feeds the decoder its own predictions
     instead of ground-truth inputs -- confirm against `loop_fn_build`.

     Returns:
         The emitted outputs stacked into one tensor with the first two axes
         swapped (presumably [time, batch, depth] -> [batch, time, depth]).
     """
     # Size argument handed to loop_fn_build: the sum of the embedding,
     # encoder-hidden and attention-output widths.
     step_input_dim = (embedding_dim + args.encoder_hidden_dim +
                       args.attention_output_dim)
     decoder_loop_fn = loop_fn_build(input_tensor, init_tensor,
                                     embedding_tensor, decoder_cell,
                                     batch_size, step_input_dim,
                                     sequence_length, dense, sentence_size)
     emitted_ta, _, _ = raw_rnn(decoder_cell, decoder_loop_fn)
     # stack() joins the per-step tensors along a new leading axis; the
     # transpose then swaps that axis with the next one.
     return tf.transpose(emitted_ta.stack(), [1, 0, 2])
Beispiel #2
0
    def my_dynamic_rnn(self, cell, sequence_length, inputs, initial_state):
        """Unroll `cell` over `inputs` with `raw_rnn`, one time step per slot.

        Args:
            cell: RNN cell handed to `raw_rnn`.
            sequence_length: Per-example lengths; an example counts as
                finished once the loop time reaches its length.
            inputs: Input tensor that is unstacked along its first axis. The
                batch size is read from axis 1 and the feature width from
                axis 2, i.e. the layout is [time, batch, features].
            initial_state: State fed to the cell on the first step (the
                original author used the encoder's final state here).

        Returns:
            A tuple `(outputs, final_state)` where `outputs` is the stacked
            emit TensorArray and `final_state` is the state returned by
            `raw_rnn`.
        """
        dynamic_shape = tf.shape(inputs)
        max_seq_len = self.trainingManager.configs.max_seq_len_decoder
        batch_size = dynamic_shape[1]
        input_features = inputs.shape[2]

        # One TensorArray slot per time step, filled by splitting `inputs`
        # along its leading axis.
        step_inputs = tf.TensorArray(dtype=tf.float32, size=max_seq_len)
        step_inputs = step_inputs.unstack(inputs)

        def loop_fn(cur_time, cur_cell_output, cur_cell_state,
                    cur_loop_state):
            # raw_rnn passes cell_output=None on its very first call; that is
            # the only time the externally supplied state is used.
            is_first_call = cur_cell_output is None
            next_state = initial_state if is_first_call else cur_cell_state

            # Per-example "finished" flags (time is 0-based here).
            finished = cur_time >= sequence_length
            # The loop only stops reading inputs once *every* example is
            # finished, i.e. at the longest sequence in the batch.
            all_finished = tf.reduce_all(finished)

            def _padding_input():
                # Nothing left to read: feed zeros with a fully specified
                # [batch, input_features] shape.
                return tf.zeros([batch_size, input_features],
                                dtype=tf.float32)

            def _read_input():
                return step_inputs.read(cur_time)

            next_input = tf.cond(all_finished, _padding_input, _read_input)

            # The cell output is emitted unchanged and no loop state is kept.
            return finished, next_input, next_state, cur_cell_output, None

        emitted_ta, final_state, _ = raw_rnn(cell, loop_fn)
        # Stacked emit tensor is [seq_len, batch, hidden_state].
        return emitted_ta.stack(), final_state
Beispiel #3
0
    def decode(self, decoder_inp, seq_len, encoder_hidden_states, final_state,
               seq_len_inp):
        """Run `self.cell` under `raw_rnn` with attention over encoder states.

        At every step after the first, the cell output is used as the query
        for an additive attention over `encoder_hidden_states`; the resulting
        context vector is combined with the cell output to form the emitted
        output, and with the next input to form the next cell input.

        Args:
            decoder_inp: Passed to `self.prepare_decoder_input`, which returns
                the decoder inputs and an optional loop function.
            seq_len: Compared with the loop time to mark batch entries as
                finished.
            encoder_hidden_states: Encoder outputs to attend over. Axis 1 is
                used as the attention length and axis 2 (which must be
                statically known) as the attention size.
            final_state: Used as the initial state of `self.cell`.
            seq_len_inp: Lengths used to build the attention mask with
                `tf.sequence_mask`.

        Returns:
            The emitted outputs joined with `TensorArray.concat()`.
        """
        # First prepare the decoder input - embed the input and obtain the
        # relevant loop function.
        decoder_inputs, loop_function = self.prepare_decoder_input(decoder_inp)

        # TensorArray is used to do dynamic looping over decoder input; the
        # inputs are split along their first axis, one entry per time step.
        inputs_ta = tf.TensorArray(size=self.max_output, dtype=tf.float32)
        inputs_ta = inputs_ta.unstack(decoder_inputs)

        # Batch size is dynamic (axis 1); the embedding size is read from the
        # static shape (axis 2).
        batch_size = tf.shape(decoder_inputs)[1]
        embedding_size = decoder_inputs.get_shape()[2].value

        with variable_scope.variable_scope("attention_decoder"):
            attn_length = tf.shape(encoder_hidden_states)[1]
            attn_size = encoder_hidden_states.get_shape()[2].value

            # To calculate W1 * h_t we use a 1-by-1 convolution, need to
            # reshape before.
            hidden = tf.expand_dims(encoder_hidden_states, 2)

            # Width of the space in which query and encoder features are
            # compared.
            attention_vec_size = 64

            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size])
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable("AttnV", [attention_vec_size])

            # Initial attention context: zeros of shape [batch, attn_size].
            batch_attn_size = array_ops.stack([batch_size, attn_size])
            attn = array_ops.zeros(batch_attn_size, dtype=tf.float32)
            attn.set_shape([None, attn_size])

            # Initial attention weights: zeros of shape
            # [batch, attn_length, 1, 1].
            batch_alpha_size = array_ops.stack([batch_size, attn_length, 1, 1])
            alpha = array_ops.zeros(batch_alpha_size, dtype=tf.float32)

            # Float mask built from the input lengths; it is multiplied into
            # the attention weights below.
            attn_mask = tf.sequence_mask(tf.cast(seq_len_inp, tf.int32),
                                         dtype=tf.float32)

            def attn_loop_function(time, cell_output, state, loop_state):
                """Loop function handed to `raw_rnn`."""
                def attention(query, prev_alpha):
                    """Calculate attention weights.

                    `prev_alpha` is accepted but not used in this variant.
                    Returns a `(context, weights)` tuple.
                    """
                    with variable_scope.variable_scope("Attention"):
                        # Project the query and reshape it so it broadcasts
                        # against `hidden_features`.
                        y = linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y,
                                              [-1, 1, 1, attention_vec_size])
                        # Additive attention score, reduced over the last two
                        # axes.
                        s = math_ops.reduce_sum(
                            v * math_ops.tanh(hidden_features + y), [2, 3])

                        # Mask the softmax output and renormalise so the
                        # remaining weights sum to one; the 1e-12 guards
                        # against division by zero.
                        alpha = nn_ops.softmax(s) * attn_mask
                        sum_vec = tf.reduce_sum(alpha,
                                                reduction_indices=[1],
                                                keep_dims=True) + 1e-12
                        norm_term = tf.tile(sum_vec,
                                            tf.stack([1, tf.shape(alpha)[1]]))
                        alpha = alpha / norm_term
                        # Restore the two trailing singleton axes so the
                        # weights broadcast against `hidden`.
                        alpha = tf.expand_dims(alpha, 2)
                        alpha = tf.expand_dims(alpha, 3)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(alpha * hidden, [1, 2])
                        d = array_ops.reshape(d, [-1, attn_size])

                    return tuple([d, alpha])

                # Per-example finished flags, and a scalar that is true once
                # every example is finished.
                elements_finished = (time >= seq_len)
                finished = tf.reduce_all(elements_finished)

                if cell_output is None:
                    # First call: start from the supplied state, the zero
                    # attention context/weights, and the raw first input
                    # (no input projection is applied on this step).
                    next_state = final_state
                    output = None
                    loop_state = tuple([attn, alpha])
                    next_input = inputs_ta.read(time)
                else:
                    next_state = state
                    # loop_state becomes (context, weights) for this step.
                    loop_state = attention(cell_output, loop_state[1])
                    with variable_scope.variable_scope("AttnOutputProjection"):
                        output = linear([cell_output, loop_state[0]],
                                        self.cell.output_size, True)

                    # If loop_function is set, we use it instead of
                    # decoder_inputs.
                    if loop_function is not None:
                        simple_input = loop_function(output)
                    else:
                        # Feed zeros once all examples are finished, since
                        # there is nothing left to read.
                        simple_input = tf.cond(
                            finished,
                            lambda: tf.zeros([batch_size, embedding_size],
                                             dtype=tf.float32),
                            lambda: inputs_ta.read(time))

                    # Merge input and previous attentions into one vector of
                    # the right size.
                    input_size = simple_input.get_shape().with_rank(2)[1]
                    if input_size.value is None:
                        raise ValueError("Could not infer input size")
                    with variable_scope.variable_scope("InputProjection"):
                        next_input = linear([simple_input, loop_state[0]],
                                            input_size, True)

                return (elements_finished, next_input, next_state, output,
                        loop_state)

        # outputs is a TensorArray with one entry per executed time step;
        # each entry is the projected output of width self.cell.output_size.
        outputs, state, _ = rnn.raw_rnn(self.cell, attn_loop_function)
        # NOTE(review): concat() joins the per-step tensors along their first
        # axis, so the result is presumably [T*B, |V|] rather than a rank-3
        # TxBx|V| tensor -- confirm against callers.
        outputs = outputs.concat()
        return outputs
Beispiel #4
0
def dynamic_rnn_decoder(cell,
                        decoder_fn,
                        inputs=None,
                        sequence_length=None,
                        parallel_iterations=None,
                        swap_memory=False,
                        time_major=False,
                        scope=None,
                        name=None):
    """ Dynamic RNN decoder for a sequence-to-sequence model specified by
    RNNCell and decoder function.
    The `dynamic_rnn_decoder` is similar to the `tf.python.ops.rnn.dynamic_rnn`
    as the decoder does not make any assumptions of sequence length and batch
    size of the input.
    The `dynamic_rnn_decoder` has two modes: training or inference and expects
    the user to create separate functions for each.
    Under both training and inference, both `cell` and `decoder_fn` are expected,
    where `cell` performs computation at every timestep using `raw_rnn`, and
    `decoder_fn` allows modeling of early stopping, output, state, and next
    input and context.
    When training the user is expected to supply `inputs`. At every time step a
    slice of the supplied input is fed to the `decoder_fn`, which modifies and
    returns the input for the next time step.
    `sequence_length` is needed whenever `decoder_fn` returns `None` as its
    early stopping vector, because the finished flags are then computed as
    `time >= sequence_length`.
    Under inference `inputs` is expected to be `None` and the input is inferred
    solely from the `decoder_fn`.
    Args:
      cell: An instance of RNNCell.
      decoder_fn: A function that takes time, cell state, cell input,
        cell output and context state. It returns an early stopping vector,
        cell state, next input, cell output and context state.
        Examples of decoder_fn can be found in the decoder_fn.py file.
      inputs: The inputs for decoding (embedded format).
        If `time_major == False` (default), this must be a `Tensor` of shape:
          `[batch_size, max_time, depth]`.
        If `time_major == True`, this must be a `Tensor` of shape:
          `[max_time, batch_size, depth]`.
        The input to `cell` at each time step will be a `Tensor` with dimensions
          `[batch_size, depth]`.
      sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
        It is not inferred from `inputs`; it must be supplied if `decoder_fn`
        returns `None` for the early stopping vector.
      parallel_iterations: (Default: 32).  The number of iterations to run in
        parallel.  Those operations which do not have any temporal dependency
        and can be run in parallel, will be.  This parameter trades off
        time for space.  Values >> 1 use more memory but take less time,
        while smaller values use less memory but computations take longer.
      swap_memory: Transparently swap the tensors produced in forward inference
        but needed for back prop from GPU to CPU.  This allows training RNNs
        which would typically not fit on a single GPU, with very minimal (or no)
        performance penalty.
      time_major: The shape format of the `inputs` and `outputs` Tensors.
        If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
        If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
        Using `time_major = True` is a bit more efficient because it avoids
        transposes at the beginning and end of the RNN calculation.  However,
        most TensorFlow data is batch-major, so by default this function
        accepts input and emits output in batch-major form.
      scope: VariableScope for the `raw_rnn`;
        defaults to None.
      name: NameScope for the decoder;
        defaults to "dynamic_rnn_decoder"
    Returns:
      A tuple (outputs, final_state, final_context_state) where:
        outputs: the RNN output 'Tensor'.
          If time_major == False (default), this will be a `Tensor` shaped:
            `[batch_size, max_time, cell.output_size]`.
          If time_major == True, this will be a `Tensor` shaped:
            `[max_time, batch_size, cell.output_size]`.
        final_state: The final state and will be shaped
          `[batch_size, cell.state_size]`.
        final_context_state: The context state returned by the final call
          to decoder_fn. This is useful if the context state maintains internal
          data which is required after the graph is run.
          For example, one way to diversify the inference output is to use
          a stochastic decoder_fn, in which case one would want to store the
          decoded outputs, not just the RNN outputs. This can be done by
          maintaining a TensorArray in context_state and storing the decoded
          output of each iteration therein.
    Raises:
      ValueError: if inputs is not None and its statically known rank is
        less than three.
    """
    with ops.name_scope(name, "dynamic_rnn_decoder", [
            cell, decoder_fn, inputs, sequence_length, parallel_iterations,
            swap_memory, time_major, scope
    ]):
        if inputs is not None:
            # Convert to tensor
            inputs = ops.convert_to_tensor(inputs)

            # Test input dimensions. Everything below indexes axis 2 and
            # transposes with a three-element permutation, so a known rank
            # below three can never work; reject it up front with a clear
            # error instead of failing later inside transpose/get_shape.
            if inputs.get_shape().ndims is not None and (
                    inputs.get_shape().ndims < 3):
                raise ValueError("Inputs must have at least three dimensions")
            # Setup of RNN (dimensions, sizes, length, initial state, dtype)
            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            # Get data input information
            input_depth = int(inputs.get_shape()[2])
            batch_depth = inputs.get_shape()[1].value
            max_time = inputs.get_shape()[0].value
            if max_time is None:
                # Fall back to the dynamic length when it is not static.
                max_time = array_ops.shape(inputs)[0]
            # Setup decoder inputs as TensorArray
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            inputs_ta = inputs_ta.unstack(inputs)

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_state is None:  # first call, before while loop (in raw_rnn)
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            else:  # subsequent calls, inside while loop, after cell execution
                # loop_state carries the previous done vector, optionally
                # paired with the user's context state.
                if isinstance(loop_state, tuple):
                    (done, context_state) = loop_state
                else:
                    done = loop_state
                    context_state = None

            # call decoder function
            if inputs is not None:  # training
                # get next_cell_input
                if cell_state is None:
                    next_cell_input = inputs_ta.read(0)
                else:
                    # Prefer the static batch size; otherwise take it from
                    # the previous done vector.
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        batch_size = array_ops.shape(done)[0]
                    # Reading at index max_time would be out of range, so
                    # feed zeros on that final step instead.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input, cell_output,
                                                  context_state)
            else:  # inference
                # next_cell_input is obtained through decoder_fn
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # check if we are done; decoder_fn may leave this to us, in
            # which case sequence_length must have been supplied.
            if next_done is None:  # training
                next_done = time >= sequence_length

            # build next_loop_state
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state, emit_output,
                    next_loop_state)

        # Run raw_rnn function
        outputs_ta, final_state, final_loop_state = rnn.raw_rnn(
            cell,
            loop_fn,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory,
            scope=scope)
        outputs = outputs_ta.stack()

        # Get final context_state, if generated by user
        if isinstance(final_loop_state, tuple):
            final_context_state = final_loop_state[1]
        else:
            final_context_state = None

        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, final_state, final_context_state
Beispiel #5
0
    def decode(self, decoder_inputs, seq_len, encoder_hidden_states,
               final_state, seq_len_inp):
        """Run `self.cell` under `raw_rnn` with attention over encoder states.

        At every step after the first, the cell output is used as the query
        for an additive attention over `encoder_hidden_states`. When
        `self.use_conv` is set, features convolved from the previous step's
        attention weights are added to the attention score as well.

        Args:
            decoder_inputs: Time major decoder IDs, TxB, that contain ground
                truth during training and are dummy value holders at test
                time. Passed through `self.prepare_decoder_input` first.
            seq_len: Output sequence length for each input in minibatch.
                Compared with the loop time to mark entries as finished.
            encoder_hidden_states: Batch major output, BxTxH of encoder RNN,
                attended over at every step.
            final_state: Final hidden state of encoder RNN, used as the
                initial state of the decoder cell.
            seq_len_inp: Input lengths used to mask the attention weights
                corresponding to padding symbols.
        Returns:
            The emitted outputs joined with `TensorArray.concat()`.
            NOTE(review): earlier documentation described this as time major
            TxBx|V|; `concat()` joins along the first axis, so confirm the
            actual shape against callers.
        """
        decoder_inputs, loop_function = self.prepare_decoder_input(
            decoder_inputs)

        output_size = self.cell.output_size

        with variable_scope.variable_scope("attention_decoder"):
            # Batch size is dynamic (axis 1); the embedding size is read from
            # the static shape (axis 2).
            batch_size = array_ops.shape(decoder_inputs)[1]
            embedding_size = decoder_inputs.get_shape()[2].value

            attn_length = tf.shape(encoder_hidden_states)[1]
            attn_size = encoder_hidden_states.get_shape()[2].value

            # To calculate W1 * h_t we use a 1-by-1 convolution, need to
            # reshape before.
            hidden = tf.expand_dims(encoder_hidden_states, 2)

            # Width of the space in which query and encoder features are
            # compared.
            attention_vec_size = 64

            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size])
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable("AttnV", [attention_vec_size])
            if self.use_conv:
                # F convolves the previous attention weights along the
                # attention-length axis; U maps the resulting channels to
                # attention_vec_size.
                F = variable_scope.get_variable(
                    "AttnF",
                    [self.conv_filter_width, 1, 1, self.conv_num_channels])
                U = variable_scope.get_variable(
                    "AttnU",
                    [1, 1, self.conv_num_channels, attention_vec_size])

            # Initial attention context: zeros of shape [batch, attn_size].
            batch_attn_size = array_ops.stack([batch_size, attn_size])
            attn = array_ops.zeros(batch_attn_size, dtype=tf.float32)
            attn.set_shape([None, attn_size])

            # Initial attention weights: zeros of shape
            # [batch, attn_length, 1, 1].
            batch_alpha_size = array_ops.stack([batch_size, attn_length, 1, 1])
            alpha = array_ops.zeros(batch_alpha_size, dtype=tf.float32)

            # Assumes Time major arrangement. The array starts with 400 slots
            # and may grow because dynamic_size is enabled.
            inputs_ta = tf.TensorArray(size=400,
                                       dtype=tf.float32,
                                       dynamic_size=True)
            inputs_ta = inputs_ta.unstack(decoder_inputs)

            # Float mask built from the input lengths; it is multiplied into
            # the attention weights below.
            attn_mask = tf.sequence_mask(tf.cast(seq_len_inp, tf.int32),
                                         dtype=tf.float32)

            def raw_loop_function(time, cell_output, state, loop_state):
                """Loop function handed to `raw_rnn`."""
                def attention(query, prev_alpha):
                    """Calculate attention weights.

                    Returns a `(context, weights)` tuple. `prev_alpha` is only
                    read when `self.use_conv` is set.
                    """
                    with variable_scope.variable_scope("Attention"):
                        # Project the query and reshape it so it broadcasts
                        # against `hidden_features`.
                        y = linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y,
                                              [-1, 1, 1, attention_vec_size])
                        if self.use_conv:
                            # Add features derived from the previous
                            # attention weights to the score.
                            conv_features = nn_ops.conv2d(
                                prev_alpha, F, [1, 1, 1, 1], "SAME")
                            feat_reshape = nn_ops.conv2d(
                                conv_features, U, [1, 1, 1, 1], "SAME")
                            s = math_ops.reduce_sum(
                                v * math_ops.tanh(hidden_features + y +
                                                  feat_reshape), [2, 3])
                        else:
                            s = math_ops.reduce_sum(
                                v * math_ops.tanh(hidden_features + y), [2, 3])

                        # Mask the softmax output and renormalise so the
                        # remaining weights sum to one; the 1e-12 guards
                        # against division by zero.
                        alpha = nn_ops.softmax(s) * attn_mask
                        sum_vec = tf.reduce_sum(alpha,
                                                reduction_indices=[1],
                                                keep_dims=True) + 1e-12
                        norm_term = tf.tile(sum_vec,
                                            tf.stack([1, tf.shape(alpha)[1]]))
                        alpha = alpha / norm_term
                        # Restore the two trailing singleton axes so the
                        # weights broadcast against `hidden`.
                        alpha = tf.expand_dims(alpha, 2)
                        alpha = tf.expand_dims(alpha, 3)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(alpha * hidden, [1, 2])
                        d = array_ops.reshape(d, [-1, attn_size])

                    return tuple([d, alpha])

                # Per-example finished flags, and a scalar that is true once
                # every example is finished.
                elements_finished = (time >= seq_len)
                finished = tf.reduce_all(elements_finished)

                if cell_output is None:
                    # First call: start from the supplied state, the zero
                    # attention context/weights, and the raw first input
                    # (no input projection is applied on this step).
                    next_state = final_state
                    output = None
                    loop_state = tuple([attn, alpha])
                    next_input = inputs_ta.read(time)
                else:
                    next_state = state
                    # loop_state becomes (context, weights) for this step.
                    loop_state = attention(cell_output, loop_state[1])
                    with variable_scope.variable_scope("AttnOutputProjection"):
                        output = linear([cell_output, loop_state[0]],
                                        output_size, True)

                    # If loop_function is set, we use it instead of
                    # decoder_inputs.
                    if loop_function is not None:
                        simple_input = loop_function(output)
                    else:
                        # Feed zeros once all examples are finished, since
                        # there is nothing left to read.
                        simple_input = tf.cond(
                            finished,
                            lambda: tf.zeros([batch_size, embedding_size],
                                             dtype=tf.float32),
                            lambda: inputs_ta.read(time))

                    # Merge input and previous attentions into one vector of
                    # the right size.
                    input_size = simple_input.get_shape().with_rank(2)[1]
                    if input_size.value is None:
                        raise ValueError("Could not infer input size")
                    with variable_scope.variable_scope("InputProjection"):
                        next_input = linear([simple_input, loop_state[0]],
                                            input_size, True)

                return (elements_finished, next_input, next_state, output,
                        loop_state)

        # The final cell state and final loop state are not used.
        outputs, state, _ = rnn.raw_rnn(self.cell, raw_loop_function)
        return outputs.concat()
Beispiel #6
0
def dynamic_rnn_decoder(cell,
                        decoder_fn,
                        inputs=None,
                        sequence_length=None,
                        parallel_iterations=None,
                        swap_memory=False,
                        time_major=False,
                        scope=None,
                        name=None):
    """Dynamic RNN decoder driven by an RNNCell and a decoder function.

    `cell` is stepped with `raw_rnn`; `decoder_fn` is called once per step to
    decide the finished flags, the next cell state, the next cell input, the
    emitted output and an optional context state.

    When `inputs` is given (training), a time slice of it is handed to
    `decoder_fn` at every step. When `inputs` is None (inference),
    `decoder_fn` receives None and must produce the next input itself.

    Args:
      cell: An instance of RNNCell.
      decoder_fn: Called as `decoder_fn(time, cell_state, cell_input,
        cell_output, context_state)`; returns `(done, next_cell_state,
        next_cell_input, emit_output, next_context_state)`. `done` may be
        None, in which case `time >= sequence_length` is used.
      inputs: Optional rank-3 decoder inputs, `[batch, time, depth]` when
        `time_major` is False and `[time, batch, depth]` otherwise. The depth
        must be statically known.
      sequence_length: Per-example lengths. Only read when `decoder_fn`
        returns None for `done`; it is not inferred from `inputs`.
      parallel_iterations: Forwarded to `raw_rnn`.
      swap_memory: Forwarded to `raw_rnn`.
      time_major: Layout of `inputs` and of the returned outputs.
      scope: VariableScope forwarded to `raw_rnn`.
      name: Name scope for the decoder; defaults to "dynamic_rnn_decoder".

    Returns:
      A tuple `(outputs, final_state, final_context_state)`. `outputs` is the
      stacked emit tensor (transposed back to batch-major unless
      `time_major`), `final_state` is the state returned by `raw_rnn`, and
      `final_context_state` is the last context state produced by
      `decoder_fn`, or None if it never produced one.

    Raises:
      ValueError: if `inputs` is not None and its statically known rank is
        less than three.
    """
    with ops.name_scope(name, "dynamic_rnn_decoder", [
            cell, decoder_fn, inputs, sequence_length, parallel_iterations,
            swap_memory, time_major, scope
    ]):
        if inputs is not None:
            # Convert to tensor
            inputs = ops.convert_to_tensor(inputs)

            # Test input dimensions. Everything below indexes axis 2 and
            # transposes with a three-element permutation, so a known rank
            # below three can never work; reject it up front with a clear
            # error instead of failing later inside transpose/get_shape.
            if inputs.get_shape().ndims is not None and (
                    inputs.get_shape().ndims < 3):
                raise ValueError("Inputs must have at least three dimensions")
            # Setup of RNN (dimensions, sizes, length, initial state, dtype)
            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            # Get data input information
            input_depth = int(inputs.get_shape()[2])
            batch_depth = inputs.get_shape()[1].value
            max_time = inputs.get_shape()[0].value
            if max_time is None:
                # Fall back to the dynamic length when it is not static.
                max_time = array_ops.shape(inputs)[0]
            # Setup decoder inputs as TensorArray
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            inputs_ta = inputs_ta.unstack(inputs)

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_state is None:  # first call, before while loop (in raw_rnn)
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            else:  # subsequent calls, inside while loop, after cell execution
                # loop_state carries the previous done vector, optionally
                # paired with the user's context state.
                if isinstance(loop_state, tuple):
                    (done, context_state) = loop_state
                else:
                    done = loop_state
                    context_state = None

            # call decoder function
            if inputs is not None:  # training
                # get next_cell_input
                if cell_state is None:
                    next_cell_input = inputs_ta.read(0)
                else:
                    # Prefer the static batch size; otherwise take it from
                    # the previous done vector.
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        batch_size = array_ops.shape(done)[0]
                    # Reading at index max_time would be out of range, so
                    # feed zeros on that final step instead.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input, cell_output,
                                                  context_state)
            else:  # inference
                # next_cell_input is obtained through decoder_fn
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # check if we are done; decoder_fn may leave this to us, in
            # which case sequence_length must have been supplied.
            if next_done is None:  # training
                next_done = time >= sequence_length

            # build next_loop_state
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state, emit_output,
                    next_loop_state)

        # Run raw_rnn function
        outputs_ta, final_state, final_loop_state = rnn.raw_rnn(
            cell,
            loop_fn,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory,
            scope=scope)
        outputs = outputs_ta.stack()

        # Get final context_state, if generated by user
        if isinstance(final_loop_state, tuple):
            final_context_state = final_loop_state[1]
        else:
            final_context_state = None

        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, final_state, final_context_state
Beispiel #7
0
    def inference(self, inputs, masks):
        """Build the attention-weighted LSTM graph and return class scores.

        At each step the input features are weighted by attention weights
        derived from the previous cell output (a variable on the first step)
        and summed over the last axis before being fed to the stacked LSTM.

        Args:
            inputs: [batch_size, max_step, ...]
            masks: [batch_size, max_step]

        Returns:
            Scores of shape [batch_size, max_step, num_class].
        """
        self.masks = masks
        n_classes = self.num_class
        n_hidden = self.hidden_size
        n_layers = self.num_layer
        dropout_keep = self.keep_prob
        # Per-example length = number of set mask entries.
        lengths = tf.reduce_sum(tf.cast(masks, tf.int32), -1)

        # Static shape: [batch_size, max_step, ..., in_channels]
        static_shape = inputs.get_shape().as_list()
        batch_size = static_shape[0]
        num_step = static_shape[1]
        in_channel = static_shape[-1]

        # Collapse everything between the step axis and the channel axis:
        # [batch_size, max_step, feature, in_channels]
        inputs = tf.reshape(inputs, [batch_size, num_step, -1, in_channel])
        feature_size = inputs.get_shape()[2].value

        def loop_fn(time, cell_output, loop_state):
            if cell_output is None:
                # time == 0: no previous output yet, so the first attention
                # weights come from a variable.
                weights = variable_with_weight_decay(
                    'attend_weights', [batch_size, feature_size, 1],
                    stddev=0.01, wd=0.0)
            else:
                # Attention derived from the previous cell output.
                scores = affine_transform(cell_output, feature_size,
                                          scope_name='attend')
                weights = tf.expand_dims(tf.nn.softmax(scores), -1)

            done = time >= lengths
            all_done = tf.reduce_all(done)

            def _zero_input():
                return tf.zeros([batch_size, feature_size], dtype=tf.float32)

            def _attended_input():
                return tf.reduce_sum(inputs_ta.read(time) * weights, 2)

            # Feed zeros once every example is finished; otherwise the
            # attention-weighted sum over the channel axis.
            next_input = tf.cond(all_done, _zero_input, _attended_input)

            # The cell output (None on the first call) is emitted as-is and
            # no loop state is carried.
            return done, next_input, cell_output, None

        # LSTM cell, optionally wrapped with output dropout, then stacked.
        cell_unit = tf.nn.rnn_cell.LSTMCell(n_hidden, state_is_tuple=True)
        if dropout_keep < 1:
            cell_unit = tf.nn.rnn_cell.DropoutWrapper(
                cell_unit, output_keep_prob=dropout_keep)
        stacked_cell = tf.nn.rnn_cell.MultiRNNCell([cell_unit] * n_layers,
                                                   state_is_tuple=True)

        # [batch_size, max_step, ...] -> [max_step, batch_size, ...], then
        # one TensorArray entry per step.
        inputs = tf.transpose(inputs, perm=[1, 0, 2, 3])
        inputs_ta = tf.TensorArray(dtype=tf.float32,
                                   size=num_step).unpack(inputs)

        initial_state = stacked_cell.zero_state(batch_size, tf.float32)
        outputs_ta, final_state, _ = raw_rnn(stacked_cell, loop_fn,
                                             initial_state)

        # [max_step, batch_size, hidden_size] -> tanh -> linear projection
        # to [max_step, batch_size, num_class].
        hidden_seq = tf.tanh(outputs_ta.pack())
        scores = affine_transform(hidden_seq, n_classes,
                                  scope_name="softmax_linear")
        # Back to batch-major: [batch_size, max_step, num_class]
        return tf.transpose(scores, perm=[1, 0, 2])
Beispiel #8
0
def dynamic_rnn_decoder(
        cell,  # the (multi-layer) RNNCell stepped by raw_rnn
        decoder_fn,  # turns each step's output into the next step's input
        inputs=None,  # training only: embedded responses, [batch_size, decoder_len, depth]
        sequence_length=None,  # training only: per-example response lengths
        parallel_iterations=None,  # forwarded unchanged to raw_rnn
        swap_memory=False,  # forwarded unchanged to raw_rnn
        time_major=False,  # True if inputs/outputs are [time, batch, depth]
        scope=None,  # variable scope handed to raw_rnn
        name=None):  # name scope, defaults to "dynamic_rnn_decoder"
    """Dynamic RNN decoder for a sequence-to-sequence model.

    Similar to `tf.python.ops.rnn.dynamic_rnn`, in that the decoder makes no
    assumption about the sequence length or batch size of the input.

    The decoder has two modes, training and inference, and the caller is
    expected to build a separate `decoder_fn` for each. `cell` and
    `decoder_fn` are required in both modes: `cell` is the RNN run at every
    time step and `decoder_fn` models early stopping, state, next input and
    context.

    During training the caller supplies `inputs`. At every time step a slice
    of `inputs` is handed to `decoder_fn`, which may modify it and returns the
    input for the next step. During inference `inputs` is expected to be None
    and the next input comes from `decoder_fn` alone.

    Args:
        cell: An instance of RNNCell.
        decoder_fn: A function taking (time, cell state, cell input,
            cell output, context state) and returning (early-stopping vector,
            cell state, next input, cell output, context state).
        inputs: The inputs for decoding (embedded format).
            If `time_major == False` (default), this must be a `Tensor` of
            shape `[batch_size, max_time, depth]`.
            If `time_major == True`, this must be a `Tensor` of shape
            `[max_time, batch_size, depth]`.
            The input to `cell` at each time step is `[batch_size, depth]`.
        sequence_length: (optional) An int32/int64 vector sized
            `[batch_size]`. Only consulted when `decoder_fn` returns None as
            its early-stopping vector. If `inputs` is not None and
            `sequence_length` is None, every sequence is assumed to run for
            the full `max_time` steps of `inputs`.
        parallel_iterations: Number of loop iterations raw_rnn may run in
            parallel. Larger values use more memory but less time.
        swap_memory: Whether raw_rnn may swap tensors produced in the forward
            pass but needed for back-prop from GPU to CPU.
        time_major: The shape format of the `inputs` and `outputs` Tensors.
            Time-major avoids a transpose at the start and at the end, but
            most TensorFlow data is batch-major, hence the default.
        scope: VariableScope for the `raw_rnn`; defaults to None.
        name: NameScope for the decoder; defaults to "dynamic_rnn_decoder".

    Returns:
        A tuple (outputs, final_state, final_context_state) where:

            outputs: the RNN output `Tensor`, shaped
                `[batch_size, max_time, cell.output_size]` when
                `time_major == False` (default) and
                `[max_time, batch_size, cell.output_size]` otherwise.
            final_state: the final cell state returned by raw_rnn.
            final_context_state: the context state returned by the last call
                to `decoder_fn`, or None if `decoder_fn` kept no context
                state. Useful for carrying per-step data out of the loop,
                e.g. a TensorArray of decoded outputs (or alignments).

    Raises:
        ValueError: if `inputs` is not None and has fewer than three
            dimensions, or if `decoder_fn` returns no early-stopping vector
            while neither `sequence_length` nor `inputs` was supplied.
    """
    with ops.name_scope(name, "dynamic_rnn_decoder", [
            cell, decoder_fn, inputs, sequence_length, parallel_iterations,
            swap_memory, time_major, scope
    ]):
        if inputs is not None:
            inputs = ops.convert_to_tensor(inputs)
            # Both the [1, 0, 2] transpose and the `[2]` depth lookup below
            # need a rank-3 tensor, so reject anything smaller up front.
            if inputs.get_shape().ndims is not None and (
                    inputs.get_shape().ndims < 3):
                raise ValueError("Inputs must have at least three dimensions")

            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            input_depth = int(inputs.get_shape()[2])  # feature depth
            batch_depth = inputs.get_shape()[1].value  # static batch size or None
            max_time = inputs.get_shape()[0].value  # static decoder length or None
            if max_time is None:
                max_time = array_ops.shape(inputs)[0]

            # One TensorArray element per time step, each [batch_size, depth].
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            inputs_ta = inputs_ta.unstack(inputs)

            if sequence_length is None:
                # Documented fallback: treat every sequence as full length.
                if batch_depth is not None:
                    inferred_batch = batch_depth
                else:
                    inferred_batch = array_ops.shape(inputs)[1]
                sequence_length = array_ops.fill([inferred_batch], max_time)

        def loop_fn(time, cell_output, cell_state, loop_state):
            """Glue between raw_rnn and decoder_fn, called between steps.

            Call flow:
            1. Before the first step raw_rnn calls loop_fn once (with
               cell_output, cell_state and loop_state all None) to obtain
               the first cell input.
            2. raw_rnn runs (output, cell_state) = cell(next_input, state).
            3. Before the next step loop_fn is called again with that output
               and state and must return
               (finished, next_input, next_cell_state, emit_output,
               next_loop_state).
            4. raw_rnn stores emit_output per time step and threads
               next_loop_state through to the following call.

            Args:
                time: index of the step about to run, starting at 0.
                cell_output: output of the previous step (None on first call).
                cell_state: state of the previous step (None on first call).
                loop_state: either the previous `done` vector, or a tuple
                    (done, context_state) when decoder_fn keeps a context
                    state.
            """
            # First call, made before any decoding step has run.
            if cell_state is None:
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            # Subsequent calls, after a cell step has executed.
            else:
                if isinstance(loop_state, tuple):
                    (done, context_state) = loop_state
                else:
                    done = loop_state
                    context_state = None

            # Training: `inputs` was supplied, so read the next slice from it.
            if inputs is not None:
                if cell_state is None:
                    next_cell_input = inputs_ta.read(0)
                else:
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        # `done` is the per-example finished vector, so its
                        # length is the dynamic batch size.
                        batch_size = array_ops.shape(done)[0]

                    # Reading index max_time would be out of range, so feed
                    # an all-zeros input once the inputs are exhausted.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))

                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input, cell_output,
                                                  context_state)
            # Inference: the next input comes from decoder_fn alone.
            else:
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # decoder_fn may leave the stopping decision to us (it returns
            # None); fall back to comparing against the sequence lengths.
            if next_done is None:
                if sequence_length is None:
                    raise ValueError(
                        "sequence_length must be provided when decoder_fn "
                        "does not return a done vector and inputs is None")
                next_done = time >= sequence_length

            # Only wrap the loop state in a tuple when there is a context
            # state to carry.
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state, emit_output,
                    next_loop_state)

        # Run raw_rnn function
        outputs_ta, final_state, final_loop_state = rnn.raw_rnn(
            cell,
            loop_fn,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory,
            scope=scope)
        outputs = outputs_ta.stack()

        # A tuple loop state means decoder_fn kept a context state.
        if isinstance(final_loop_state, tuple):
            final_context_state = final_loop_state[1]
        else:
            final_context_state = None

        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, final_state, final_context_state
Beispiel #9
0
def dynamic_rnn_decoder(cell, decoder_fn, inputs=None, sequence_length=None,
                        parallel_iterations=None, swap_memory=False,
                        time_major=False, scope=None, name=None):
  """ Dynamic RNN decoder for a sequence-to-sequence model specified by
  RNNCell and decoder function.

  The `dynamic_rnn_decoder` is similar to the `tf.python.ops.rnn.dynamic_rnn`
  as the decoder does not make any assumptions of sequence length and batch
  size of the input.

  The `dynamic_rnn_decoder` has two modes: training or inference and expects
  the user to create separate functions for each.

  Under both training and inference `cell` and `decoder_fn` are expected.
  The `cell` performs computation at every timestep using the `raw_rnn` and
  the `decoder_fn` allows modelling of early stopping, output, state, and next
  input and context.

  When training the user is expected to supply `inputs`. At every time step a
  slice of the supplied input is fed to the `decoder_fn`, which modifies and
  returns the input for the next time step.

  Under inference `inputs` is expected to be `None` and the input is inferred
  solely from the `decoder_fn`.

  Args:
    cell: An instance of RNNCell.
    decoder_fn: A function that takes time, cell state, cell input,
      cell output and context state. It returns an early stopping vector,
      cell state, next input, cell output and context state.
    inputs: The inputs for decoding (embedded format).

      If `time_major == False` (default), this must be a `Tensor` of shape:
        `[batch_size, max_time, depth]`.

      If `time_major == True`, this must be a `Tensor` of shape:
        `[max_time, batch_size, depth]`.

      The input to `cell` at each time step will be a `Tensor` with dimensions
        `[batch_size, depth]`.
    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
      Only consulted when `decoder_fn` returns None as its early stopping
      vector. If `inputs` is not None and `sequence_length` is None, every
      sequence is assumed to run for the full `max_time` steps of `inputs`.
    parallel_iterations: The number of iterations `raw_rnn` may run in
      parallel.  Those operations which do not have any temporal dependency
      and can be run in parallel, will be.  This parameter trades off
      time for space.  Values >> 1 use more memory but take less time,
      while smaller values use less memory but computations take longer.
    swap_memory: Transparently swap the tensors produced in forward inference
      but needed for back prop from GPU to CPU.  This allows training RNNs
      which would typically not fit on a single GPU, with very minimal (or no)
      performance penalty.
    time_major: The shape format of the `inputs` and `outputs` Tensors.
      If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
      If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
      Using `time_major = True` is a bit more efficient because it avoids
      transposes at the beginning and end of the RNN calculation.  However,
      most TensorFlow data is batch-major, so by default this function
      accepts input and emits output in batch-major form.
    scope: VariableScope for the `raw_rnn`;
      defaults to None.
    name: NameScope for the decoder;
      defaults to "dynamic_rnn_decoder"

  Returns:
    A pair (outputs, state) where:

      outputs: the RNN output 'Tensor'.

        If time_major == False (default), this will be a `Tensor` shaped:
          `[batch_size, max_time, cell.output_size]`.

        If time_major == True, this will be a `Tensor` shaped:
          `[max_time, batch_size, cell.output_size]`.

      state: The final cell state returned by `raw_rnn`.

  Raises:
    ValueError: if inputs is not None and has less than three dimensions, or
      if `decoder_fn` returns no early stopping vector while neither
      `sequence_length` nor `inputs` was supplied.
  """
  with ops.name_scope(name, "dynamic_rnn_decoder",
                      [cell, decoder_fn, inputs, sequence_length,
                       parallel_iterations, swap_memory, time_major, scope]):
    if inputs is not None:
      # Convert to tensor
      inputs = ops.convert_to_tensor(inputs)

      # Both the [1, 0, 2] transpose and the `[2]` depth lookup below need a
      # rank-3 tensor, so reject anything smaller up front.
      if inputs.get_shape().ndims is not None and (
          inputs.get_shape().ndims < 3):
        raise ValueError("Inputs must have at least three dimensions")
      # Setup of RNN (dimensions, sizes, length, initial state, dtype)
      if not time_major:
        # [batch, seq, features] -> [seq, batch, features]
        inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

      dtype = inputs.dtype
      # Get data input information (static sizes may be None)
      input_depth = int(inputs.get_shape()[2])
      batch_depth = inputs.get_shape()[1].value
      max_time = inputs.get_shape()[0].value
      if max_time is None:
        max_time = array_ops.shape(inputs)[0]
      # Setup decoder inputs as TensorArray, one element per time step
      inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
      inputs_ta = inputs_ta.unpack(inputs)

      if sequence_length is None:
        # Documented fallback: treat every sequence as full length.
        if batch_depth is not None:
          inferred_batch = batch_depth
        else:
          inferred_batch = array_ops.shape(inputs)[1]
        sequence_length = array_ops.fill([inferred_batch], max_time)

    def loop_fn(time, cell_output, cell_state, loop_state):
      """Adapts `decoder_fn` to the `loop_fn` signature used by `raw_rnn`.

      `loop_state` carries either the previous `done` vector, or a tuple
      `(done, context_state)` when `decoder_fn` keeps a context state.
      """
      if cell_state is None:  # first call, before while loop (in raw_rnn)
        if cell_output is not None:
          raise ValueError("Expected cell_output to be None when cell_state "
                           "is None, but saw: %s" % cell_output)
        if loop_state is not None:
          raise ValueError("Expected loop_state to be None when cell_state "
                           "is None, but saw: %s" % loop_state)
        context_state = None
      else:  # subsequent calls, inside while loop, after cell execution
        if isinstance(loop_state, tuple):
          (done, context_state) = loop_state
        else:
          done = loop_state
          context_state = None

      # call decoder function
      if inputs is not None:  # training
        # get next_cell_input
        if cell_state is None:
          next_cell_input = inputs_ta.read(0)
        else:
          if batch_depth is not None:
            batch_size = batch_depth
          else:
            # `done` has one entry per example, so its length is the
            # dynamic batch size.
            batch_size = array_ops.shape(done)[0]
          # Reading index max_time would be out of range, so feed zeros
          # once the inputs are exhausted.
          next_cell_input = control_flow_ops.cond(
              math_ops.equal(time, max_time),
              lambda: array_ops.zeros([batch_size, input_depth], dtype=dtype),
              lambda: inputs_ta.read(time))
        (next_done, next_cell_state, next_cell_input, emit_output,
         next_context_state) = decoder_fn(time, cell_state, next_cell_input,
                                          cell_output, context_state)
      else:  # inference
        # next_cell_input is obtained through decoder_fn
        (next_done, next_cell_state, next_cell_input, emit_output,
         next_context_state) = decoder_fn(time, cell_state, None, cell_output,
                                          context_state)

      # check if we are done; decoder_fn may leave that decision to us
      if next_done is None:  # training
        if sequence_length is None:
          raise ValueError("sequence_length must be provided when decoder_fn "
                           "does not return a done vector and inputs is None")
        next_done = time >= sequence_length

      # build next_loop_state; only use a tuple when a context state exists
      if next_context_state is None:
        next_loop_state = next_done
      else:
        next_loop_state = (next_done, next_context_state)

      return (next_done, next_cell_input, next_cell_state,
              emit_output, next_loop_state)

    # Run raw_rnn function
    outputs_ta, state, _ = rnn.raw_rnn(
        cell, loop_fn, parallel_iterations=parallel_iterations,
        swap_memory=swap_memory, scope=scope)
    outputs = outputs_ta.pack()

    if not time_major:
      # [seq, batch, features] -> [batch, seq, features]
      outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
    return outputs, state
def policy_gradient_pointer_attention_decoder(
        cell,
        scope,
        memory,
        decoder_inputs,
        initial_state,
        enc_padding_mask,
        prev_coverage=None,
        # tokens
        UNK_token=0,
        start_tokens=None,
        embeddings=None,
        vocab_size=50000,
        num_source_OOVs=None,
        enc_batch_extended_vocab=None,
        # some flags
        reinforce=False,
        pointer_gen=True,
        use_coverage=False,
        debug_mode=False,
        # for decoding
        initial_state_attention=False):
    """Pointer-generator attention decoder built on `raw_rnn`.

    At every step the decoder attends over `memory`, mixes the context
    vector into the cell output and the next cell input, computes the
    generation probability `p_gen`, and combines the vocabulary and
    attention distributions through `_calc_final_dist`. With `reinforce`
    set (and `initial_state_attention` unset) the next input is the
    embedding of a token sampled from that final distribution; otherwise
    the next input is read from `decoder_inputs` (teacher forcing).

    Args:
        cell: RNN cell run by `raw_rnn`; `cell.output_size` sizes the
            attention output layer.
        scope: object exposing `Attention` and `Pointer` variable scopes.
        memory: encoder states. Static shape is read as
            [batch_size, attn_length, attn_size].
        decoder_inputs: batch-major decoder inputs. Static shape is read as
            [batch_size, max_time, input_size].
        initial_state: cell state used for the first step. With
            `initial_state_attention`, `initial_state[-1].h` is used as the
            first attention query.
        enc_padding_mask: multiplied element-wise into the attention
            distribution before re-normalising (presumably
            [batch_size, attn_length] with zeros on padding - TODO confirm).
        prev_coverage: not supported; any non-None value raises.
        UNK_token: id substituted for sampled ids outside the vocabulary.
        start_tokens: token ids fed at time 0 when `reinforce` is set.
        embeddings: embedding matrix used to look up sampled tokens.
        vocab_size: size of the fixed vocabulary (units of the logits layer).
        num_source_OOVs: forwarded to `_calc_final_dist`.
        enc_batch_extended_vocab: forwarded to `_calc_final_dist`.
        reinforce: sample the next token instead of reading
            `decoder_inputs`.
        pointer_gen: must be True; the non-pointer path is not implemented.
        use_coverage: not supported; True raises.
        debug_mode: take the mode of the distribution instead of sampling.
        initial_state_attention: compute an initial context vector from
            `initial_state` before the first step.

    Returns:
        A tuple (final_dists, final_cell_state, attn_dists, p_gens, coverage,
        sampled_tokens, decoder_outputs_ta, debug_variables, final_loop_state).
        `final_dists`, `attn_dists` and `p_gens` are batch-major stacks of
        the per-step values; `sampled_tokens` is None unless `reinforce`.

    Raises:
        NotImplementedError: if coverage is requested (`use_coverage` or a
            non-None `prev_coverage`) or `pointer_gen` is False.
        ValueError: if `reinforce` is set without `embeddings` and
            `start_tokens`.
    """

    # some todo's
    # if initial_state_attention:
    #     raise NotImplementedError
    # `prev_coverage` may be a tensor, whose truth value must not be taken,
    # so test it against None explicitly.
    if use_coverage or prev_coverage is not None:
        raise NotImplementedError

    if reinforce and ((embeddings is None) or (start_tokens is None)):
        raise ValueError("when using reinforce, "
            "please provide embeddings and start_tokens")

    # Without pointer_gen no p_gen is ever computed, yet the final
    # distribution below needs one; fail early instead of part-way through
    # graph construction.
    if not pointer_gen:
        raise NotImplementedError

    print("TODO: Using tf.where to replace tf.cond in next_cell_input")
    print("change sampled_tokens not include <start>?")

    # input data (all sizes are taken from static shapes)
    max_time = decoder_inputs.get_shape()[1].value
    attn_size = memory.get_shape()[2].value
    batch_size = memory.get_shape()[0].value
    input_size = decoder_inputs.get_shape()[2].value
    # every sequence is decoded for the full max_time steps
    sequence_length = array_ops.tile([max_time], [batch_size])
    inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
    # TensorArray will unstack first dimension, so go time-major first
    inputs_ta = inputs_ta.unstack(tf.transpose(decoder_inputs, perm=[1, 0, 2]))

    with variable_scope.variable_scope(scope.Attention):
        # layers
        # To calculate attention, we calculate
        #   v^T tanh(W_h h_i + W_s s_t + b_attn)
        # where h_i is an encoder state, and s_t a decoder state.
        # attn_vec_size is the length of the vectors v, b_attn, (W_h h_i) and (W_s s_t).
        # We set it to be equal to the size of the encoder states.
        attention_vec_size = attn_size
        # memory kernel maps encoder hidden states into memory
        memory_kernel = core_layers.Dense(
            units=attention_vec_size,
            use_bias=False, name="memory_kernel")
        # query kernel maps decoder hidden state into query
        query_kernel = core_layers.Dense(
            units=attention_vec_size,
            use_bias=True, name="query_kernel")
        # input kernel maps [next input; context vector] into the cell input
        input_kernel = core_layers.Dense(
            units=input_size,
            use_bias=True, name="input_kernel")
        # pgen_kernel maps states into p_gen
        pgen_kernel = core_layers.Dense(
            units=1, activation=tf.sigmoid,
            use_bias=True, name="pgen_kernel")
        # output_kernel maps [cell output; context vector] into the
        # final cell outputs
        output_kernel = core_layers.Dense(
            units=cell.output_size,
            use_bias=True, name="output_kernel")
        # coverage kernels transforms coverage vector
        coverage_kernel = core_layers.Dense(
            units=attention_vec_size,
            use_bias=False, name="coverage_kernel")
        # logits_kernel maps the attention output into vocabulary logits
        logits_kernel = core_layers.Dense(
            units=vocab_size,
            use_bias=True, name="logits_kernel")

        # Get the weight matrix W_h and apply it to each encoder state to get
        # (W_h h_i), the encoder features
        # shape (batch_size, attn_length, attention_vec_size)
        processed_memory = memory_kernel(memory)

        def masked_attention(score):
            """Softmax + enc_padding_mask + re-normalize"""
            # take softmax. shape (batch_size, attn_length)
            attn_dist = nn_ops.softmax(score)
            attn_dist *= enc_padding_mask
            # shape (batch_size)
            masked_sums = math_ops.reduce_sum(attn_dist, axis=1)
            # re-normalize
            return attn_dist / array_ops.reshape(masked_sums, [-1, 1])

        def _compute_attention(cell_output, coverage=None):
            """Return (context, alignments, coverage) for one decoder step."""
            # Pass the decoder state through a linear layer
            # (this is W_s s_t + b_attn in the paper)
            # shape (batch_size, attention_vec_size)
            processed_query = control_flow_ops.cond(
                # i.e. None or not set
                _is_zero_matrix(coverage),
                # v^T tanh(W_h h_i + W_s s_t + b_attn)
                true_fn=lambda: query_kernel(cell_output),
                # v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                false_fn=lambda: (query_kernel(cell_output) +
                                  coverage_kernel(coverage)))

            score = attention_utils._bahdanau_score(
                processed_query=processed_query,
                keys=processed_memory,
                normalize=False)

            # Calculate attention distribution
            alignments = masked_attention(score)

            if use_coverage:
                # update coverage
                coverage = coverage + alignments

            # Reshape from [batch_size, memory_time]
            # to [batch_size, 1, memory_time]
            expanded_alignments = array_ops.expand_dims(alignments, 1)
            # Context is the inner product of alignments and values along the
            # memory time dimension.
            # alignments shape is
            #   [batch_size, 1, memory_time]
            # memory shape is
            #   [batch_size, memory_time, memory_size]
            # the batched matmul is over memory_time, so the output shape is
            #   [batch_size, 1, memory_size].
            # we then squeeze out the singleton dim.
            context = math_ops.matmul(expanded_alignments, memory)
            context = array_ops.squeeze(context, [1])

            return context, alignments, coverage

        def loop_fn(loop_time, cell_output, cell_state, loop_state):
            """raw_rnn loop function: attention, sampling and next input."""
            if cell_output is None:  # time == 0
                final_dist = None
                emit_output = final_dist  # == None for time == 0
                next_cell_state = initial_state  # encoder last states
                # NOTE(review): alignments are [batch_size, memory_time], so
                # coverage presumably should be too; harmless while coverage
                # is rejected above - confirm before enabling coverage.
                coverage = (array_ops.zeros([batch_size, attn_size])
                            if prev_coverage is None else prev_coverage)

                # context vector will initially be zeros
                # Ensure the second shape of attention vectors is set.
                context_vector = array_ops.zeros([batch_size, attn_size])
                context_vector.set_shape([None, attn_size])

                if initial_state_attention:
                    with variable_scope.variable_scope(
                            scope.Attention, reuse=tf.AUTO_REUSE):
                        # true in decode mode
                        # Re-calculate the context vector from the previous
                        # step so that we can pass it through a linear layer
                        # with this step's input to get a modified version of
                        # the input in decode mode, this is what updates the
                        # coverage vector
                        context_vector, _, coverage = _compute_attention(
                            cell_output=next_cell_state[-1].h,
                            coverage=coverage)

                # all TensorArrays for recording sequences
                outputs_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                alignments_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                p_gens_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                coverages_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                sampled_tokens_history = tensor_array_ops.TensorArray(
                    dtype=tf.int32, size=0, dynamic_size=True)

                # mostly used in debugging
                logits_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                vocab_dists_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)
                final_dists_history = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=0, dynamic_size=True)

            else:
                # normal workflow:
                # decoder_inputs = input_kernel(inputs; context)
                # cell_output, states = cell(decoder_inputs, states)
                # context, att_dist, coverage = attention(states, coverage)
                # p_gen = pgen_kernel(...)
                # cell_outputs = output_kernel(cell_output, context)

                # since raw-rnn encapsulates cell call
                # we do this:
                # context, att_dist, coverage = attention(states, coverage)
                # p_gen = pgen_kernel(...)
                # cell_outputs = output_kernel(cell_output, context)
                # next_inputs = input_kernel(inputs; context) --> changed

                # no change
                next_cell_state = cell_state

                # get the cell state of last layer's cell
                last_layer_state = cell_state[-1]

                # cell_input is the input that produced this cell_output
                (sampled_tokens_history,
                 outputs_history, alignments_history, p_gens_history,
                 coverages_history, logits_history, vocab_dists_history,
                 final_dists_history, coverage, cell_input) = loop_state

                # Run the attention mechanism.
                with variable_scope.variable_scope(
                        scope.Attention, reuse=tf.AUTO_REUSE):
                    context_vector, attn_dist, coverage = _compute_attention(
                        cell_output=cell_output, coverage=coverage)

                    # Concatenate the cell_output (= decoder state)
                    # and the context vector, and pass them through
                    # a linear layer. This is V[s_t, h*_t] + b in the paper
                    attention_output = output_kernel(
                        array_ops.concat([cell_output, context_vector], -1))

                    # update attention and cell_outputs; this call handles
                    # the output of step loop_time - 1
                    outputs_history = outputs_history.write(
                        loop_time - 1, attention_output)
                    alignments_history = alignments_history.write(
                        loop_time - 1, attn_dist)
                    coverages_history = coverages_history.write(
                        loop_time - 1, coverage)

                # Calculate p_gen
                if pointer_gen:
                    with variable_scope.variable_scope(scope.Pointer):
                        p_gen = pgen_kernel(array_ops.concat([
                            context_vector, last_layer_state.c,
                            last_layer_state.h, cell_input], -1))
                        # update p_gens_history distributions
                        p_gens_history = p_gens_history.write(
                            loop_time - 1, p_gen)

                # distribution
                logits = logits_kernel(attention_output)
                vocab_dist = nn_ops.softmax(logits)

                final_dist = _calc_final_dist(
                    vocab_dist=vocab_dist,
                    attn_dist=attn_dist,
                    p_gen=p_gen,
                    batch_size=batch_size,
                    vocab_size=vocab_size,
                    num_source_OOVs=num_source_OOVs,
                    enc_batch_extended_vocab=enc_batch_extended_vocab)

                # raw_rnn requires `emit_output` to have same
                # shape with cell.output_size
                # thus we have to output attention_output
                # but not the final_distribution
                emit_output = attention_output

                # save these for debugging
                logits_history = logits_history.write(
                    loop_time - 1, logits)
                vocab_dists_history = vocab_dists_history.write(
                    loop_time - 1, vocab_dist)
                final_dists_history = final_dists_history.write(
                    loop_time - 1, final_dist)

            # generic
            elements_finished = (loop_time >= sequence_length)
            finished = math_ops.reduce_all(elements_finished)

            if reinforce and not initial_state_attention:
                # see Google's code
                # elements_finished = tf.logical_or(
                #     tf.equal(chosen_outputs, misc.BF_EOS_INT),
                #     loop_time >= global_config.timestep_limit)
                # they have this logical_or to stop
                # generation when sampled STOP
                # I am ignoring this for now, but probably
                # look back on this later?

                # also, Google used prev_elements_finished
                # but I used elements_finished, is that correct?

                if cell_output is None:  # time == 0
                    # when time == 0, use start_tokens
                    tf.logging.info("Running RLModel")
                    chosen_outputs = start_tokens
                else:
                    def _multinomial_sample(probs):
                        # tf.multinomial only samples from
                        # logits (unnormalized probability)
                        # here we only have normalized probability
                        # thus we use distributions.Categorical
                        dist = categorical.Categorical(probs=probs)

                        # use argmax during debugging
                        if not debug_mode:
                            sampled_tokens = dist.sample()
                        else:
                            sampled_tokens = dist.mode()

                        # since final_dist = vocab_dist + copy_dist
                        # sampled_tokens can have index out-of vocab_dist
                        # in this case we cast them into UNK.
                        # Valid vocabulary ids are 0 .. vocab_size - 1, so
                        # id == vocab_size is already out of vocabulary.
                        UNKs = array_ops.ones_like(sampled_tokens) * UNK_token
                        sampled_tokens = array_ops.where(
                            math_ops.greater_equal(sampled_tokens, vocab_size),
                            UNKs, sampled_tokens, name="sampled_tokens")

                        return sampled_tokens

                    # otherwise, do the sampling in sequence_length;
                    # finished elements get token 0
                    chosen_outputs = tf.to_int32(array_ops.where(
                        elements_finished,
                        array_ops.zeros([batch_size], dtype=tf.int32),
                        _multinomial_sample(final_dist)))

                    sampled_tokens_history = sampled_tokens_history.write(
                        loop_time - 1, chosen_outputs)

                next_input = array_ops.gather(embeddings, chosen_outputs)
            else:
                # teacher forcing: read the next ground-truth input, or
                # zeros once every sequence has finished
                next_input = control_flow_ops.cond(
                    finished,
                    lambda: array_ops.zeros(
                        [batch_size, input_size], dtype=tf.float32),
                    lambda: inputs_ta.read(loop_time))

            with variable_scope.variable_scope(scope.Attention):
                # next inputs = input_kernel(inp; context)
                next_cell_input = input_kernel(
                    array_ops.concat([next_input, context_vector], -1))

            next_loop_state = (
                sampled_tokens_history,
                outputs_history, alignments_history, p_gens_history,
                coverages_history, logits_history, vocab_dists_history,
                final_dists_history, coverage, next_cell_input)

            return (elements_finished, next_cell_input, next_cell_state,
                    emit_output, next_loop_state)


        with tf.variable_scope("policy"):
            (decoder_outputs_ta,
             final_cell_state,
             final_loop_state) = rnn_ops.raw_rnn(
                cell=cell, loop_fn=loop_fn)

            (sampled_tokens_history,
             outputs_history, alignments_history, p_gens_history,
             coverages_history, logits_history, vocab_dists_history,
             final_dists_history, coverage, cell_input) = final_loop_state

        # [time, batch, num_units] to [batch, time, num_units]
        final_dists = array_ops.transpose(
            final_dists_history.stack(), perm=[1, 0, 2])
        attn_dists = array_ops.transpose(
            alignments_history.stack(), perm=[1, 0, 2])
        p_gens = array_ops.transpose(
            p_gens_history.stack(), perm=[1, 0, 2])

        sampled_tokens = None
        if reinforce:
            # [time, batch] to [batch, time]
            sampled_tokens = array_ops.transpose(
                sampled_tokens_history.stack(), perm=(1, 0))

        # Flatten the final coverage to [batch_size, -1]
        if coverage is not None:
            coverage = array_ops.reshape(coverage, [batch_size, -1])

        # used in debugging
        debug_variables = {
            "memory_kernel": memory_kernel,
            "query_kernel": query_kernel,
            "input_kernel": input_kernel,
            "pgen_kernel": pgen_kernel,
            "output_kernel": output_kernel,
            "coverage_kernel": coverage_kernel,
            "logits_kernel": logits_kernel,
            "memory": memory,
            "processed_memory": processed_memory}

        return (final_dists, final_cell_state, attn_dists, p_gens,
                coverage, sampled_tokens, decoder_outputs_ta,
                debug_variables, final_loop_state)
Beispiel #11
0
def dynamic_rnn_decoder(cell,
                        decoder_fn,
                        inputs=None,
                        sequence_length=None,
                        parallel_iterations=None,
                        swap_memory=False,
                        time_major=False,
                        scope=None,
                        name=None):
    """Dynamic RNN decoder for a sequence-to-sequence model specified by `cell`.

    `dynamic_rnn_decoder` is similar to `tf.python.ops.rnn.dynamic_rnn`, but it
    makes no assumption about the sequence length or batch size of the input.

    The decoder has two separate modes: training and inference.

    In both modes `cell` and `decoder_fn` are required. `cell` performs the
    per-step computation inside `raw_rnn`; `decoder_fn` controls early
    stopping, the emitted output, the state, the next input and the context.

    When training, `inputs` must be provided. At every time step the matching
    slice of `inputs` is passed to `decoder_fn`, which may update it and
    returns the input for the next time step.

    `sequence_length` drives the dynamic unrolling during training. If it is
    omitted while `inputs` is given, every sequence is assumed to span the
    whole time axis of `inputs`.

    At inference time `inputs` should be `None`; the next input then comes
    from the output of `decoder_fn`, and `sequence_length` is not needed as
    long as `decoder_fn` returns its own "done" vector.

    Args:
        cell: An instance of RNNCell.
        decoder_fn: A function that takes time, cell state, cell input,
            cell output and context state. It returns a early stopping vector,
            cell state, next input, cell output and context state.
            Examples of decoder_fn can be found in the decoder_fn.py folder.
        inputs: The inputs for decoding (embedded format).

            If `time_major == False` (default), this must be a `Tensor` of shape:
                `[batch_size, max_time, ...]`.

            If `time_major == True`, this must be a `Tensor` of shape:
                `[max_time, batch_size, ...]`.

            The input to `cell` at each time step will be a `Tensor` with dimensions
                `[batch_size, ...]`.

        sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
            if `inputs` is not None and `sequence_length` is None it is inferred
            from the `inputs` as the maximal possible sequence length.
        parallel_iterations: (Default: 32).    The number of iterations to run in
            parallel.    Those operations which do not have any temporal dependency
            and can be run in parallel, will be.    This parameter trades off
            time for space.    Values >> 1 use more memory but take less time,
            while smaller values use less memory but computations take longer.
        swap_memory: Transparently swap the tensors produced in forward inference
            but needed for back prop from GPU to CPU.    This allows training RNNs
            which would typically not fit on a single GPU, with very minimal (or no)
            performance penalty.
        time_major: The shape format of the `inputs` and `outputs` Tensors.
            If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
            If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
            Using `time_major = True` is a bit more efficient because it avoids
            transposes at the beginning and end of the RNN calculation.    However,
            most TensorFlow data is batch-major, so by default this function
            accepts input and emits output in batch-major form.
        scope: VariableScope for the `raw_rnn`;
            defaults to None.
        name: NameScope for the decoder;
            defaults to "dynamic_rnn_decoder"

    Returns:
        A tuple (outputs, final_state, final_context_state) where:

            outputs: the RNN output 'Tensor'.

                If time_major == False (default), this will be a `Tensor` shaped:
                    `[batch_size, max_time, cell.output_size]`.

                If time_major == True, this will be a `Tensor` shaped:
                    `[max_time, batch_size, cell.output_size]`.

            final_state: The final state and will be shaped
                `[batch_size, cell.state_size]`.

            final_context_state: The context state returned by the final call
                to decoder_fn. This is useful if the context state maintains internal
                data which is required after the graph is run.
                For example, one way to diversify the inference output is to use
                a stochastic decoder_fn, in which case one would want to store the
                decoded outputs, not just the RNN outputs. This can be done by
                maintaining a TensorArray in context_state and storing the decoded
                output of each iteration therein.

    Raises:
        ValueError: if inputs is not None and has less than three dimensions,
            or if `decoder_fn` returns no "done" vector while no
            `sequence_length` is available to derive one.
    """
    with ops.name_scope(name, "dynamic_rnn_decoder", [
            cell, decoder_fn, inputs, sequence_length, parallel_iterations,
            swap_memory, time_major, scope
    ]):
        if inputs is not None:
            # Convert to a tensor
            inputs = ops.convert_to_tensor(inputs)

            # Check the input rank. Three dimensions are required because the
            # code below transposes with perm=[1, 0, 2] and reads dimension 2.
            if inputs.get_shape().ndims is not None and (
                    inputs.get_shape().ndims < 3):
                raise ValueError("Inputs must have at least three dimensions")
            # Setup of RNN (dimensions, sizes, length, initial state, dtype)
            if not time_major:
                # [batch, seq, features] -> [seq, batch, features]
                inputs = array_ops.transpose(inputs, perm=[1, 0, 2])

            dtype = inputs.dtype
            # Get data input information
            input_depth = int(inputs.get_shape()[2])
            batch_depth = inputs.get_shape()[1].value
            max_time = inputs.get_shape()[0].value
            if max_time is None:
                max_time = array_ops.shape(inputs)[0]
            # Setup decoder inputs as TensorArray
            inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
            inputs_ta = inputs_ta.unstack(inputs)

            # Documented fallback: without explicit lengths every sequence is
            # taken to cover the full time axis of `inputs`.
            if sequence_length is None:
                sequence_length = array_ops.fill(
                    [array_ops.shape(inputs)[1]], max_time)

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_state is None:  # first call, before while loop (in raw_rnn)
                if cell_output is not None:
                    raise ValueError(
                        "Expected cell_output to be None when cell_state "
                        "is None, but saw: %s" % cell_output)
                if loop_state is not None:
                    raise ValueError(
                        "Expected loop_state to be None when cell_state "
                        "is None, but saw: %s" % loop_state)
                context_state = None
            else:  # subsequent calls, inside while loop, after cell excution
                if isinstance(loop_state, tuple):
                    (done, context_state) = loop_state
                else:
                    done = loop_state
                    context_state = None

            # call decoder function
            if inputs is not None:  # training
                # get next_cell_input
                if cell_state is None:
                    next_cell_input = inputs_ta.read(0)
                else:
                    if batch_depth is not None:
                        batch_size = batch_depth
                    else:
                        batch_size = array_ops.shape(done)[0]
                    # Reading past the last slot is invalid, so feed zeros once
                    # time has reached max_time.
                    next_cell_input = control_flow_ops.cond(
                        math_ops.equal(time, max_time),
                        lambda: array_ops.zeros([batch_size, input_depth],
                                                dtype=dtype),
                        lambda: inputs_ta.read(time))
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state,
                                                  next_cell_input, cell_output,
                                                  context_state)
            else:  # inference
                # next_cell_input is obtained through decoder_fn
                (next_done, next_cell_state, next_cell_input, emit_output,
                 next_context_state) = decoder_fn(time, cell_state, None,
                                                  cell_output, context_state)

            # check if we are done
            if next_done is None:  # training
                if sequence_length is None:
                    raise ValueError(
                        "sequence_length must be provided when decoder_fn "
                        "does not return a done vector and inputs is None")
                next_done = time >= sequence_length

            # build next_loop_state
            if next_context_state is None:
                next_loop_state = next_done
            else:
                next_loop_state = (next_done, next_context_state)

            return (next_done, next_cell_input, next_cell_state, emit_output,
                    next_loop_state)

        # Run raw_rnn function
        outputs_ta, final_state, final_loop_state = rnn.raw_rnn(
            cell,
            loop_fn,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory,
            scope=scope)
        outputs = outputs_ta.stack()

        # Get final context_state, if generated by user
        if isinstance(final_loop_state, tuple):
            final_context_state = final_loop_state[1]
        else:
            final_context_state = None

        if not time_major:
            # [seq, batch, features] -> [batch, seq, features]
            outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
        return outputs, final_state, final_context_state
Beispiel #12
0
# Minimal raw_rnn example: feed a time-major input through an LSTM cell and
# evaluate a loss on the packed outputs.
# NOTE(review): the 3-argument loop_fn, raw_rnn(cell, loop_fn, initial_state),
# TensorArray.unpack/pack and tf.initialize_all_variables all look like an
# early TensorFlow API -- confirm the targeted TF version before modernising.
# NOTE(review): batch_size, max_time, input_depth, num_units, inputs, labels,
# kits and raw_rnn are expected to be defined earlier in the original script.

# One length per batch element: `time >= sequence_length` must yield the
# [batch_size] "finished" vector that loop_fn returns to raw_rnn.
sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32)
inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
inputs_ta = inputs_ta.unpack(inputs)


def loop_fn(time, cell_output, loop_state):
    """Return (elements_finished, next_input, emit_output, next_loop_state).

    Emits the cell output unchanged, marks a batch element finished once
    `time` reaches its sequence length, and feeds zeros instead of reading
    `inputs_ta` once every element is finished.
    """
    emit_output = cell_output  # == None for time == 0
    elements_finished = (time >= sequence_length)
    finished = tf.reduce_all(elements_finished)
    next_input = tf.cond(
        finished,
        lambda: tf.zeros([batch_size, input_depth], dtype=tf.float32),
        lambda: inputs_ta.read(time))
    next_loop_state = None
    return (elements_finished, next_input, emit_output, next_loop_state)


cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True)
initial_state = cell.zero_state(batch_size, tf.float32)
outputs_ta, final_state, _ = raw_rnn(cell, loop_fn, initial_state)
outputs = outputs_ta.pack()

loss_op = kits.loss(outputs, labels)

# The leftover interactive debugger breakpoint was removed here so the script
# can run unattended.
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

sess.run(loss_op)
Beispiel #13
0
    def decode(self, decoder_inp, seq_len, encoder_hidden_states, final_state,
               seq_len_inp):
        """Decode an output sequence with `self.cell` driven by `raw_rnn`.

        Args:
            decoder_inp: Raw decoder input; it is embedded through
                `self.prepare_decoder_input`.
            seq_len: Per-example output lengths; cast to int32 and compared
                with the current time step to decide which examples are done.
            encoder_hidden_states: Not used by this method.
            final_state: Final encoder state, used as the initial decoder
                state at time step 0.
            seq_len_inp: Not used by this method.

        Returns:
            The result of `concat()` on the output TensorArray produced by
            `raw_rnn`.
        """
        # Embed the input; the second value is the callable that maps a cell
        # output to the next step's input (it may be None).
        decoder_inputs, loop_function = self.prepare_decoder_input(decoder_inp)

        # A TensorArray lets the loop body read one time step at a time.
        step_inputs = tf.TensorArray(size=self.max_output, dtype=tf.float32)
        step_inputs = step_inputs.unstack(decoder_inputs)

        batch_size = tf.shape(decoder_inputs)[1]
        emb_size = decoder_inputs.get_shape()[2].value

        # NOTE(review): this scope only surrounds the definition of the loop
        # function; the raw_rnn call below happens after the scope has exited.
        with variable_scope.variable_scope("rnn_decoder"):

            def step_fn(time, cell_output, state, loop_state):
                # Per-example flag: has this sequence been fully processed?
                done_mask = (time >= tf.cast(seq_len, tf.int32))
                # Scalar flag: have all sequences been fully processed?
                all_done = tf.reduce_all(done_mask)

                if cell_output is None:
                    # Time step 0: start from the encoder's final state and
                    # read the first input (the <GO> tag); nothing is emitted.
                    return (done_mask, step_inputs.read(time), final_state,
                            None, None)

                def zero_input():
                    return tf.zeros([batch_size, emb_size], dtype=tf.float32)

                def gold_input():
                    return step_inputs.read(time)

                if not self.isTraining:
                    # Evaluation: feed back the previous step's output.
                    next_input = loop_function(cell_output)
                elif loop_function is None:
                    # Plain teacher forcing until every sequence is finished.
                    next_input = tf.cond(all_done, zero_input, gold_input)
                else:
                    # Scheduled sampling
                    # https://arxiv.org/abs/1506.03099
                    random_prob = tf.random_uniform([])

                    def sampled_or_gold():
                        return tf.cond(
                            tf.greater_equal(random_prob, self.samp_prob),
                            gold_input,
                            lambda: loop_function(cell_output))

                    next_input = tf.cond(all_done, zero_input,
                                         sampled_or_gold)

                return (done_mask, next_input, state, cell_output, None)

        # The TensorArray holds one entry per executed time step.
        outputs_ta, _final_cell_state, _ = rnn.raw_rnn(self.cell, step_fn)
        # NOTE(review): concat() joins the per-step tensors along their first
        # axis, so the result is presumably [T * B, |V|] rather than
        # [T, B, |V|] -- confirm against callers.
        return outputs_ta.concat()
Beispiel #14
0
    def my_attentive(self, cell, sequence_length, inputs, encoder_final_state,
                     memory_fw, memory_bw):
        """Run `cell` through `raw_rnn` with attention over two memories.

        At every step after the first, `self.attention_step` is applied to the
        current cell state once per memory. The two resulting context vectors
        are concatenated onto the cell output that is emitted, and a dense
        projection of each is concatenated onto the next step's input.

        Args:
            cell: RNN cell handed to `raw_rnn`; `cell.output_size * 2` is used
                as the size of the emitted output.
            sequence_length: Per-example decoder lengths, compared against the
                current time step.
            inputs: Decoder inputs. Axis 0 is unstacked into per-step inputs,
                axis 1 is used as the batch size and axis 2 as the feature
                size.
            encoder_final_state: Initial state for the decoder cell.
            memory_fw: Memory passed to `self.attention_step` (forward).
            memory_bw: Memory passed to `self.attention_step` (backward).

        Returns:
            A tuple `(outputs, final_state, loop)`: the stacked emitted
            outputs, the final cell state and the stacked attention weights
            recorded in the loop state.
        """
        dynamic_shape = tf.shape(inputs)
        max_seq_len = self.trainingManager.configs.max_seq_len_decoder
        batch_size = dynamic_shape[1]
        input_features = inputs.shape[2]

        # One TensorArray slot per time step, each holding [batch, features].
        step_inputs = tf.TensorArray(dtype=tf.float32, size=max_seq_len)
        step_inputs = step_inputs.unstack(inputs)

        # Attention weights recorded per step; threaded through the loop state.
        attention_weights = tf.TensorArray(size=max_seq_len, dtype=tf.float32)

        def loop_fn(step, cell_out, cell_state, weights_ta):
            # `cell_out` is None on the very first call (step 0).
            # Declares the emitted output size without the batch dimension:
            # cell output plus both context vectors.
            emit = tf.zeros([cell.output_size * 2], dtype=tf.float32)

            half_features = input_features // 2
            # Zero context for the first step: [batch, input_features / 2].
            ctx_fw = tf.zeros([batch_size, half_features], dtype=tf.float32)
            ctx_bw = tf.zeros([batch_size, half_features], dtype=tf.float32)

            if cell_out is None:
                # Initialisation: start from the encoder state and an empty
                # attention-weight array.
                next_state = encoder_final_state
                next_loop_state = attention_weights
            else:
                # From here on `step` is one ahead of the output being handled.
                next_state = cell_state

                raw_ctx_fw, weights_fw = self.attention_step(
                    cell_state, memory_fw)
                raw_ctx_bw, weights_bw = self.attention_step(
                    cell_state, memory_bw)

                # Project each context down to half the input feature size so
                # that input + both contexts is 2 * input_features wide.
                ctx_fw = tf.layers.dense(raw_ctx_fw, units=half_features)
                ctx_bw = tf.layers.dense(raw_ctx_bw, units=half_features)

                emit = tf.concat((cell_out, raw_ctx_fw, raw_ctx_bw), axis=1)

                # Zero the recorded weights of examples that already finished.
                still_running = (step - 1 < sequence_length)
                write_index = step - 1
                masked_fw = tf.where(still_running, weights_fw,
                                     tf.zeros_like(weights_fw))
                masked_bw = tf.where(still_running, weights_bw,
                                     tf.zeros_like(weights_bw))
                # Stored as a (forward, backward) pair.
                next_loop_state = weights_ta.write(write_index,
                                                   (masked_fw, masked_bw))

            # Shared logic: this comparison is 0-based on `step`.
            finished = (step >= sequence_length)  # [batch]
            # Scalar: true once the longest sequence in the batch is done.
            all_finished = tf.reduce_all(finished)

            def padding_input():
                # Reading `step_inputs` is not possible past the end, so feed
                # zeros of the full input width [batch, 2 * input_features].
                return tf.zeros([batch_size, input_features * 2],
                                dtype=tf.float32)

            def real_input():
                # Current input with both projected context vectors appended.
                return tf.concat((step_inputs.read(step), ctx_fw, ctx_bw),
                                 axis=1)

            next_input = tf.cond(all_finished, padding_input, real_input)

            return finished, next_input, next_state, emit, next_loop_state

        outputs_ta, final_state, loop_ta = raw_rnn(cell,
                                                   loop_fn,
                                                   swap_memory=True)
        # Stacked along a new leading (time) axis.
        outputs = outputs_ta.stack()
        loop = loop_ta.stack()
        return outputs, final_state, loop
Beispiel #15
0
    def my_attentive_concat_memory(self, cell, sequence_length, inputs,
                                   encoder_final_state, memory):
        """Run `cell` through `raw_rnn` with dot-product attention on `memory`.

        At every step after the first, the last element of the cell state is
        multiplied with `memory`, the scores are soft-maxed over axis 0 and
        used to build a context vector. The context is concatenated onto the
        emitted cell output, and a dense projection of it is concatenated onto
        the next step's input.

        Args:
            cell: RNN cell handed to `raw_rnn`; `cell.output_size * 2` is used
                as the size of the emitted output.
            sequence_length: Per-example decoder lengths, compared against the
                current time step.
            inputs: Decoder inputs. Axis 0 is unstacked into per-step inputs,
                axis 1 is used as the batch size and axis 2 as the feature
                size.
            encoder_final_state: Initial state for the decoder cell.
            memory: Encoder memory attended over.
                NOTE(review): presumably [seq_len_encoder, batch, hidden],
                going by the reductions below -- confirm against the caller.

        Returns:
            A tuple `(outputs, final_state, loop)`: the stacked emitted
            outputs, the final cell state and the stacked attention weights
            recorded in the loop state.
        """
        dynamic_shape = tf.shape(inputs)
        max_seq_len = self.trainingManager.configs.max_seq_len_decoder
        batch_size = dynamic_shape[1]
        input_features = inputs.shape[2]

        # One TensorArray slot per time step, each holding [batch, features].
        step_inputs = tf.TensorArray(dtype=tf.float32, size=max_seq_len)
        step_inputs = step_inputs.unstack(inputs)

        # Attention weights recorded per step; threaded through the loop state.
        attention_weights = tf.TensorArray(size=max_seq_len, dtype=tf.float32)

        def loop_fn(step, cell_out, cell_state, weights_ta):
            # Declares the emitted output size without the batch dimension.
            emit = tf.zeros([cell.output_size * 2], dtype=tf.float32)

            # Zero context for the first step: [batch, input_features].
            context_feed = tf.zeros([batch_size, input_features],
                                    dtype=tf.float32)

            if cell_out is None:
                # Initialisation: start from the encoder state and an empty
                # attention-weight array.
                next_state = encoder_final_state
                next_loop_state = attention_weights
            else:
                # From here on `step` is one ahead of the output being handled.
                next_state = cell_state

                # Dot product of every memory slot with the top cell state.
                scores = tf.reduce_sum(
                    tf.multiply(memory, cell_state[-1]), axis=2)
                # Normalise over axis 0, then swap the two axes.
                weights = tf.transpose(tf.nn.softmax(scores, axis=0))

                # Reverse the memory axes so `weights` broadcasts over the
                # trailing two, then restore the original axis order.
                memory_t = tf.transpose(memory, [2, 1, 0])
                weighted = tf.transpose(tf.multiply(memory_t, weights),
                                        [2, 1, 0])
                # Weighted sum over axis 0 gives the context vector.
                context = tf.reduce_sum(weighted, axis=0)

                # Project the context to the input feature size so that
                # input + context is 2 * input_features wide.
                context_feed = tf.layers.dense(context, units=input_features)

                emit = tf.concat((cell_out, context), axis=1)

                # Zero the recorded weights of examples that already finished.
                masked = tf.where(step - 1 < sequence_length, weights,
                                  tf.zeros_like(weights))
                next_loop_state = weights_ta.write(step - 1, masked)

            # Shared logic: this comparison is 0-based on `step`.
            finished = (step >= sequence_length)  # [batch]
            # Scalar: true once the longest sequence in the batch is done.
            all_finished = tf.reduce_all(finished)

            def padding_input():
                # Reading `step_inputs` is not possible past the end, so feed
                # zeros of the full input width [batch, 2 * input_features].
                return tf.zeros([batch_size, input_features * 2],
                                dtype=tf.float32)

            def real_input():
                # Current input with the projected context vector appended.
                return tf.concat((step_inputs.read(step), context_feed),
                                 axis=1)

            next_input = tf.cond(all_finished, padding_input, real_input)

            return finished, next_input, next_state, emit, next_loop_state

        outputs_ta, final_state, loop_ta = raw_rnn(cell,
                                                   loop_fn,
                                                   swap_memory=True)
        # Stacked along a new leading (time) axis.
        outputs = outputs_ta.stack()
        # NOTE(review): each entry is the transposed softmax, so this is
        # presumably [seq_len_decoder, batch, seq_len_encoder] -- confirm.
        loop = loop_ta.stack()
        return outputs, final_state, loop
Beispiel #16
0
def rnn_decoder_attention(cell,
                          num_attention_units,
                          attention_inputs,
                          decoder_inputs,
                          initial_state,
                          decoder_length,
                          decoder_fn,
                          attention_length=None,
                          weight_initializer=None,
                          encoder_projection=None,
                          parallel_iterations=None,
                          swap_memory=False,
                          time_major=False,
                          scope=None):
    """ Dynamic RNN decoder with attention for a sequence-to-sequence model
	specified by RNNCell 'cell'.

	The 'rnn_decoder_attention' is similar to the
	'tf.python.ops.rnn.dynamic_rnn'. As the decoder does not make any
	assumptions of sequence length of the input or how many steps it can decode,
	since 'dynamic_rnn_decoder' uses dynamic unrolling. This allows
	'attention_inputs' and 'decoder_inputs' to have [None] in the sequence
	length of the decoder inputs.

	The parameters attention_inputs and  decoder_inputs are nessesary for both
	training and evaluation. During training all of attention_inputs and a slice
	of decoder_inputs is feed at every timestep. During evaluation
	decoder_inputs it is only feed at time==0, as the decoder needs the
	'start-of-sequence' symbol, known from Bahdanau et al., 2014
	https://arxiv.org/abs/1409.0473, at the beginning of decoding.

	The parameter  initial_state is used to initialize the decoder RNN.
	As default a linear transformation with a tf.nn.tanh linearity is used.
	By a linear transformation we can have different number of units between
	the encoder and decoder.

	The parameter sequence length is nessesary as it determines how many
	timesteps to decode for each sample. TODO: Could make it optional for
	training.

	The parameter attention_length is used for masking the alpha values
	computes over the attention_input. Is set to None (default) no mask is
	computed.

	Extensions of interest:
	- Support time_major=True for attention_input (not using conv2D)
	- Look into rnn.raw_rnn so we don't need to handle zero states
	- Make 'alpha' usable
	- Don't use decoder_inputs for evaluation
	- Make a attention class to allow custom attention functions
	- Multi-layered decoder
	- Beam search

	Args:
	  cell: An instance of RNNCell.
	  num_attention_units: The number of units used for attention.
	  attention_inputs: The encoded inputs.
		The input used to attend over at every timestep, must be of size
		[batch_size, seq_len, features]
	  decoder_inputs: The inputs for decoding (embedded format).
		If `time_major == False` (default), this must be a `Tensor` of shape:
		  `[batch_size, max_time, ...]`.
		If `time_major == True`, this must be a `Tensor` of shape:
		  `[max_time, batch_size, ...]`.
		The input to `cell` at each time step will be a `Tensor` with dimensions
		  `[batch_size, ...]`.
	  initial_state: An initial state for the decoder's RNN.
		Must be [batch_size, num_features], where num_features does not have to
		match the cell.state_size. As a projection is performed at the beginning
		of the decoding.
	  decoder_length: An int32/int64 vector sized `[batch_size]`.
	  decoder_fn: A function that takes a state and returns an embedding.
		Here is an example of a `decoder_fn`:
		def decoder_fn(embeddings, weight, bias):
		  def dec_fn(state):
			prev = tf.matmul(state, weight) + bias
			return tf.gather(embeddings, tf.argmax(prev, 1))
		  return dec_fn
	  encoder_projection: (optional) given that the encoder might have a
		different size than the decoder, we project the intial state as
		described in Bahdanau, 2014 (https://arxiv.org/abs/1409.0473).
		The optional `encoder_projection` is a
		`tf.contrib.layers.fully_connected` with
		`activation_fn=tf.python.ops.nn.tanh`.
	  weight_initializer: (optional) An initializer used for attention.
	  attention_length: (optional) An int32/int64 vector sized `[batch_size]`.
	  parallel_iterations: (Default: 32).  The number of iterations to run in
		parallel.  Those operations which do not have any temporal dependency
		and can be run in parallel, will be.  This parameter trades off
		time for space.  Values >> 1 use more memory but take less time,
		while smaller values use less memory but computations take longer.
	  swap_memory: Transparently swap the tensors produced in forward inference
		but needed for back prop from GPU to CPU.  This allows training RNNs
		which would typically not fit on a single GPU, with very minimal (or no)
		performance penalty.
	  time_major: The shape format of the `inputs` and `outputs` Tensors.
		If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
		If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
		Using `time_major = True` is a bit more efficient because it avoids
		transposes at the beginning and end of the RNN calculation.  However,
		most TensorFlow data is batch-major, so by default this function
		accepts input and emits output in batch-major form.
	  scope: VariableScope for the created subgraph;
		defaults to "decoder_attention".
	Returns:
	  A pair (outputs_train, outputs_eval) where:
		outputs_train/eval: the RNN output 'Tensor'
		  If time_major == False (default), this will be a `Tensor` shaped:
			`[batch_size, max_time, cell.output_size]`.
		  If time_major == True, this will be a `Tensor` shaped:
			`[max_time, batch_size, cell.output_size]`.
		NOTICE: output_train is commonly used for calculating loss.
	Raises:
	  #TODO Put up some raises
	"""

    with vs.variable_scope(scope or "decoder") as varscope:
        # Project initial_state as described in Bahdanau et al. 2014
        # https://arxiv.org/abs/1409.0473
        if encoder_projection is None:
            encoder_projection = partial(fully_connected,
                                         activation_fn=math_ops.tanh)
        state = encoder_projection(initial_state, cell.output_size)
        # Setup of RNN (dimensions, sizes, length, initial state, dtype)
        # Setup dtype
        dtype = state.dtype
        if not time_major:
            # [batch, seq, features] -> [seq, batch, features]
            decoder_inputs = array_ops.transpose(decoder_inputs,
                                                 perm=[1, 0, 2])
        # Get data input information
        batch_size = array_ops.shape(decoder_inputs)[1]
        attention_input_depth = int(attention_inputs.get_shape()[2])
        decoder_input_depth = int(decoder_inputs.get_shape()[2])
        attention_max_length = array_ops.shape(attention_inputs)[1]
        # Setup decoder inputs as TensorArray
        decoder_inputs_ta = tensor_array_ops.TensorArray(dtype,
                                                         size=0,
                                                         dynamic_size=True)
        decoder_inputs_ta = decoder_inputs_ta.unpack(decoder_inputs)

        print "attention_input_depth,", attention_input_depth
        print "decoder_input_depth,", decoder_input_depth
        # Setup attention weight
        if weight_initializer is None:
            weight_initializer = init_ops.truncated_normal_initializer(
                stddev=0.1)
        with vs.variable_scope("attention") as attnscope:
            v_a = vs.get_variable('v_a',
                                  shape=[num_attention_units],
                                  initializer=weight_initializer)
            W_a = vs.get_variable(
                'W_a',
                shape=[cell.output_size, num_attention_units],
                initializer=weight_initializer)

        # Encode attention_inputs for attention
        hidden = array_ops.reshape(
            attention_inputs,
            [-1, attention_max_length, 1, attention_input_depth])
        part1 = conv2d(hidden, num_attention_units, (1, 1))
        part1 = array_ops.squeeze(part1,
                                  [2])  # Squeeze out the third dimension

        def context_fn(state, inp):
            with vs.variable_scope("attention") as attnscope:
                part2 = math_ops.matmul(state, W_a)  # [batch, attn_units]
                part2 = array_ops.expand_dims(part2,
                                              1)  # [batch, 1, attn_units]
                cmb_attn = part1 + part2  # [batch, seq, attn_units]
                e = math_ops.reduce_sum(v_a * math_ops.tanh(cmb_attn),
                                        [2])  # [batch, seq]
                alpha = nn.softmax(e)
                # Mask
                if attention_length is not None:
                    alpha = math_ops.to_float(mask(attention_length)) * alpha
                    alpha = alpha / math_ops.reduce_sum(alpha, [1],
                                                        keep_dims=True)
                # [batch, features]
                context = math_ops.reduce_sum(
                    array_ops.expand_dims(alpha, 2) * attention_inputs, [1])
                context.set_shape([None, attention_input_depth])
                con = array_ops.concat(1, (inp, context))
                print "con,", con.get_shape()
                return con, alpha

        # loop function train
        def loop_fn_train(time, cell_output, cell_state, loop_state):
            print "@@@TRAIN@@@"
            emit_output = cell_output
            if cell_output is None:
                next_cell_state = state  # Use projection of prev encoder state
            else:
                next_cell_state = cell_state
            elements_finished = (time >= decoder_length
                                 )  # TODO handle seq_len=None
            finished = math_ops.reduce_all(elements_finished)

            next_input, _ = control_flow_ops.cond(
                finished,
                # Handle zero states
                lambda: (array_ops.zeros(
                    [batch_size, decoder_input_depth + attention_input_depth],
                    dtype=dtype),
                         array_ops.zeros([batch_size, attention_max_length],
                                         dtype=dtype)),
                # Read data and calculate attention
                lambda: context_fn(next_cell_state, decoder_inputs_ta.read(time
                                                                           )))
            next_input.set_shape([
                None, decoder_input_depth + attention_input_depth
            ])  # it loses its shape at some point
            next_loop_state = None
            return (elements_finished, next_input, next_cell_state,
                    emit_output, next_loop_state)

        # loop function eval
        def loop_fn_eval(time, cell_output, cell_state, loop_state):
            print "@@@EVAL@@@"
            emit_output = cell_output
            if cell_output is None:
                next_cell_state = state
            else:
                next_cell_state = cell_state
            elements_finished = (time >= decoder_length
                                 )  # TODO handle seq_len=None
            finished = math_ops.reduce_all(elements_finished)
            varscope.reuse_variables()
            next_input, _ = control_flow_ops.cond(
                finished,
                # Handle zero states
                lambda: (array_ops.zeros(
                    [batch_size, decoder_input_depth + attention_input_depth],
                    dtype=dtype),
                         array_ops.zeros([batch_size, attention_max_length],
                                         dtype=dtype)),
                # Read data and calculate attention
                lambda: control_flow_ops.cond(
                    math_ops.greater(time, 0), lambda: context_fn(
                        next_cell_state, decoder_fn(next_cell_state)), lambda:
                    context_fn(next_cell_state, decoder_inputs_ta.read(0))))
            # next_input loses its shape at some point
            next_input.set_shape(
                [None, decoder_input_depth + attention_input_depth])
            next_loop_state = None
            print "next_input,", next_input.get_shape()
            return (elements_finished, next_input, next_cell_state,
                    emit_output, next_loop_state)

        # Run raw_rnn function
        outputs_ta_train, _, _ = rnn.raw_rnn(cell, loop_fn_train)
        varscope.reuse_variables()
        outputs_ta_eval, _, _ = rnn.raw_rnn(cell, loop_fn_eval)
        outputs_train = outputs_ta_train.pack()
        outputs_eval = outputs_ta_eval.pack()
        if not time_major:
            outputs_train = array_ops.transpose(outputs_train, perm=[1, 0, 2])
            outputs_eval = array_ops.transpose(outputs_eval, perm=[1, 0, 2])
        return outputs_train, outputs_eval