Code Example #1
def _output_form_split(cell_output, decoder_states, output_size, initializer):

    with vs.variable_scope("cell_output_linear", initializer=initializer):
        cell_output_ = cells.linear([cell_output], output_size, True)

    with vs.variable_scope("decoder_states_linear", initializer=initializer):
        ds_ = cells.linear([decoder_states], output_size, True)

    output = cell_output_ + ds_

    return output
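The helpers above depend on the project-specific `cells.linear`. As a rough guide, the sketch below (NumPy, illustrative shapes only) assumes `cells.linear([x], n, True)` behaves like a single affine map x @ W + b with one weight matrix per variable scope, and reproduces the "split" output form: two separate projections followed by an element-wise sum.

# Minimal NumPy sketch of the "split" output form, assuming cells.linear
# acts as an affine map x @ W + b (an assumption; see the note above).
import numpy as np

rng = np.random.default_rng(0)

def linear(x, output_size, bias=True):
    # stand-in for cells.linear: a fresh weight matrix per call (per scope)
    w = rng.standard_normal((x.shape[-1], output_size))
    b = rng.standard_normal(output_size) if bias else 0.0
    return x @ w + b

cell_output = rng.standard_normal((4, 32))     # [batch, hidden]
decoder_states = rng.standard_normal((4, 32))  # [batch, hidden]

# two separate projections, then an element-wise sum, as in _output_form_split
output = linear(cell_output, 64) + linear(decoder_states, 64)
print(output.shape)  # (4, 64)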
Code Example #2
File: lm_ops.py  Project: chagge/attentive_lm
def _output_form_split(cell_output, decoder_states, output_size, initializer):

    with vs.variable_scope("cell_output_linear", initializer=initializer):
        cell_output_ = cells.linear([cell_output], output_size, True)

    with vs.variable_scope("decoder_states_linear", initializer=initializer):
        ds_ = cells.linear([decoder_states], output_size, True)

    output = cell_output_ + ds_

    return output
Code Example #3
def _output_form_concat(cell_output, decoder_states, output_size, initializer):

    with vs.variable_scope("output_form", initializer=initializer):
        output = cells.linear([cell_output] + [decoder_states], output_size,
                              True)

    return output
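For comparison with the split form above, a small NumPy sketch (shapes are illustrative assumptions) shows that applying one affine map to the concatenation [cell_output ; decoder_states] is the same as projecting each part with its own block of the weight matrix and summing, i.e. the split form with a single shared bias.

# Sketch: concat-then-linear versus per-part projections (NumPy).
import numpy as np

rng = np.random.default_rng(1)
cell_output = rng.standard_normal((4, 32))
decoder_states = rng.standard_normal((4, 32))

W = rng.standard_normal((64, 16))   # acts on the concatenated features
b = rng.standard_normal(16)

concat_form = np.concatenate([cell_output, decoder_states], axis=1) @ W + b
split_form = cell_output @ W[:32] + decoder_states @ W[32:] + b

print(np.allclose(concat_form, split_form))  # True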
Code Example #4
File: attention.py  Project: giancds/attentive_lm
def score_combined(decoder_current,  # h_t
                   decoder_previous,  # h_i
                   reuse_variables=False,
                   dtype=tf.float32):
  """ Applies a score function of the form

              v.(W1.hi + W2.hs)

  where W is a weight matrix, v is a vector of parameters, hi is one
  of each of the decoder hidden states and hs is the current hidden state at
  timestep t.

  The function performs a 1-by-1 convolution to calculate W.hi  and performs
  the W2.hs step using a ``linear'' cell (see cells.linear for the
  documentation)  and broadcasted into the result of W1.hi (encoder_hiddens)
  via multiplication step.  After this step a reduce_sum is performed over
  axis=[2,3] so the correct results are obtained.

  Args:
      decoder_current: not used
      attn_size: the size of the attention vectors
      encoder_hiddens: 3-D Tensor [batch_size, timestep, hidden_dim]. It
          represents the hidden sattes of the decoder up to the current
          timestep.
      current_hidden: Tensor, representing the current hidden state at
          timestep t

  Returns:
      beta: decoder hidden states after applying the content function

  """
  with tf.variable_scope("score_salton_combined") as scope:
    if reuse_variables:
      scope.reuse_variables()

    _, output_size = get_2d_tensor_shapes(decoder_current)

    decoder_current = cells.linear(
      [decoder_current], output_size, bias=False, dtype=dtype)

    #
    decoder_previous, attn_dim = reshape_attention(decoder_previous)

    # we first get the correct weight matrix
    ws = tf.get_variable("AttnDecWs", [1, 1, attn_dim, attn_dim], dtype=dtype)

    # we apply a small convolution to the decoder states - it is more
    # efficient than performing a recurrent matrix * matrix
    hidden_features = convolve(decoder_previous, ws)

    hidden_features = hidden_features + decoder_current

    # we then get the vector v that will be used on the second
    # multiplication op.
    vs = tf.get_variable("AttnDecVs", [attn_dim], dtype=dtype)

    scores = tf.reduce_sum((vs * tf.tanh(hidden_features)), [2, 3])

  return scores
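The implementation above relies on project-specific helpers (get_2d_tensor_shapes, reshape_attention, convolve, cells.linear). A minimal NumPy sketch of the intended score, v . tanh(W1.hi + W2.ht), with assumed shapes:

# Illustrative NumPy sketch of the combined score above (assumed shapes).
import numpy as np

rng = np.random.default_rng(2)
batch, time, dim = 4, 7, 32

h_i = rng.standard_normal((batch, time, dim))   # previous decoder states
h_t = rng.standard_normal((batch, dim))         # current hidden state

W1 = rng.standard_normal((dim, dim))            # plays the role of the 1-by-1 conv
W2 = rng.standard_normal((dim, dim))            # plays the role of cells.linear
v = rng.standard_normal(dim)

features = h_i @ W1 + (h_t @ W2)[:, None, :]    # broadcast over the time axis
scores = np.tanh(features) @ v                  # one score per (batch, timestep)
print(scores.shape)                             # (4, 7)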
Code Example #5
File: content_functions.py  Project: chagge/tsf_nmt
def mod_vinyals_kayser(hidden, decoder_hidden_state, initializer=None):

    # size of decoder layers
    attention_vec_size = hidden.get_shape()[3].value

    with vs.variable_scope("mod_vinyals_kayser", initializer=initializer):

        k = vs.get_variable("AttnW_%d" % 0, [1, 1, attention_vec_size, 1], initializer=initializer)
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")

        y = cells.linear(decoder_hidden_state, 1, True)
        y = array_ops.reshape(y, [-1, 1, 1, 1])

        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(math_ops.tanh(hidden_features + y), [2, 3])

    return s
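A note on the conv2d call above: with a kernel of shape [1, 1, C_in, C_out] over a [batch, time, 1, C_in] tensor, the 1-by-1 convolution is just a per-timestep matrix multiply by k[0, 0]. A NumPy sketch with assumed toy sizes:

# Sketch: 1-by-1 conv2d over [batch, time, 1, C_in] == per-timestep matmul.
import numpy as np

rng = np.random.default_rng(3)
batch, time, c_in, c_out = 4, 7, 32, 32

hidden = rng.standard_normal((batch, time, 1, c_in))
k = rng.standard_normal((1, 1, c_in, c_out))

conv_1x1 = np.tensordot(hidden, k[0, 0], axes=([3], [0]))  # [batch, time, 1, c_out]
matmul = hidden.reshape(batch * time, c_in) @ k[0, 0]
print(np.allclose(conv_1x1.reshape(-1, c_out), matmul))    # True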
Code Example #6
File: content_functions.py  Project: chagge/tsf_nmt
def bahdanau_nmt(hidden, decoder_previous_state, initializer=None):
    # size of decoder layers
    attention_vec_size = hidden.get_shape()[3].value
    decoder_size = decoder_previous_state.get_shape()[1].value

    with vs.variable_scope("bahdanau_nmt", initializer=initializer):
        # here we calculate the U_a * h_j part of the attention alignment (the W_a * s_i-1 part is computed below)
        k = vs.get_variable("AttnW_%d" % 0, [1, 1, attention_vec_size, attention_vec_size], initializer=initializer)
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        va = vs.get_variable("AttnV_%d" % 0, [attention_vec_size], initializer=initializer)

        y = cells.linear(decoder_previous_state, decoder_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])

        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(va * math_ops.tanh(hidden_features + y), [2, 3])

    return s
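A NumPy sketch of the additive (Bahdanau-style) score computed above, with assumed shapes: the projected previous decoder state y is broadcast over the encoder positions and v_a * tanh(W.h + y) is reduced over the channel axes [2, 3].

# Sketch of the Bahdanau-style score with the 4-D layout used above (NumPy).
import numpy as np

rng = np.random.default_rng(4)
batch, attn_length, attn_size = 4, 7, 32

hidden = rng.standard_normal((batch, attn_length, 1, attn_size))  # annotations h_j
W = rng.standard_normal((attn_size, attn_size))
hidden_features = np.tensordot(hidden, W, axes=([3], [0]))        # 1-by-1 conv
y = rng.standard_normal((batch, 1, 1, attn_size))                 # projected s_(i-1)
va = rng.standard_normal(attn_size)

s = np.sum(va * np.tanh(hidden_features + y), axis=(2, 3))        # [batch, attn_length]
print(s.shape)  # (4, 7)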
Code Example #7
def mod_vinyals_kayser(hidden, decoder_hidden_state, initializer=None):

    # size of decoder layers
    attention_vec_size = hidden.get_shape()[3].value

    with vs.variable_scope("mod_vinyals_kayser", initializer=initializer):

        k = vs.get_variable("AttnW_%d" % 0, [1, 1, attention_vec_size, 1],
                            initializer=initializer)
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")

        y = cells.linear(decoder_hidden_state, 1, True)
        y = array_ops.reshape(y, [-1, 1, 1, 1])

        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(math_ops.tanh(hidden_features + y), [2, 3])

    return s
Code Example #8
def bahdanau_nmt(hidden, decoder_previous_state, initializer=None):
    # size of decoder layers
    attention_vec_size = hidden.get_shape()[3].value
    decoder_size = decoder_previous_state.get_shape()[1].value

    with vs.variable_scope("bahdanau_nmt", initializer=initializer):
        # here we calculate the U_a * h_j part of the attention alignment (the W_a * s_i-1 part is computed below)
        k = vs.get_variable("AttnW_%d" % 0,
                            [1, 1, attention_vec_size, attention_vec_size],
                            initializer=initializer)
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        va = vs.get_variable("AttnV_%d" % 0, [attention_vec_size],
                             initializer=initializer)

        y = cells.linear(decoder_previous_state, decoder_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])

        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(va * math_ops.tanh(hidden_features + y),
                                [2, 3])

    return s
Code Example #9
File: decoders.py  Project: yeab/tsf_nmt
def attention_decoder_output(decoder_inputs,
                             initial_state,
                             attention_states,
                             cell,
                             num_symbols,
                             attention_f=global_attention,
                             window_size=10,
                             content_function=vinyals_kaiser,
                             decoder_attention_f=decoder_type_2,
                             combine_inp_attn=False,
                             input_feeding=False,
                             dropout=None,
                             initializer=None,
                             decoder_states=None,
                             step_num=None,
                             dtype=tf.float32,
                             scope=None):
    """

    Helper function implementing an RNN decoder with global, local or hybrid attention for the
        sequence-to-sequence model.

    Parameters
    ----------

    decoder_inputs: list
            a list of 2D Tensors [batch_size x cell.input_size].

    initial_state: tensor
            2D Tensor [batch_size x cell.state_size] representing the initial state (usually the final
                encoder state) used to start the decoder.

    attention_states: tensor
            3D Tensor [batch_size x attn_length x attn_size] representing the encoder hidden states
                that will be used to derive the context (attention) vector.

    cell: RNNCell
            rnn_cell.RNNCell defining the cell function and size.

    batch_size: int
            batch size when training the model

    attention_f: function
            function indicating which type of attention to use. Default to global_attention.

    output_size: int
            size of the output vectors; if None, we use cell.output_size.

    loop_function:
            if not None, this function will be applied to i-th output
                in order to generate i+1-th input, and decoder_inputs will be ignored,
                except for the first element ("GO" symbol). This can be used for decoding,
                but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
            Signature -- loop_function(prev, i) = next
                * prev is a 2D Tensor of shape [batch_size x cell.output_size],
                * i is an integer, the step number (when advanced control is needed),
                * next is a 2D Tensor of shape [batch_size x cell.input_size].

    window_size: int
            size of the window to apply on local attention

    input_feeding: boolean
            whether or not to use the input feeding approach by Luong et al., 2015.

    content_function: function
            content function used to score the encoder and decoder hidden states. Default to vinyals_kaiser.

    dtype:
            The dtype to use for the RNN initial state (default: tf.float32).

    scope:
            VariableScope for the created subgraph; default: "attention_decoder".

    Returns
    -------

    outputs:
            A list of the same length as decoder_inputs of 2D Tensors of shape
                [batch_size x output_size]. These represent the generated outputs.
                Output i is computed from input i (which is either i-th decoder_inputs or
                loop_function(output {i-1}, i)) as follows. First, we run the cell
                on a combination of the input and previous attention masks:
                    cell_output, new_state = cell(linear(input, prev_attn), prev_state).
                Then, we calculate new attention masks:
                    new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
                and then we calculate the output:
                    output = linear(cell_output, new_attn).

    states:
            The state of each decoder cell in each time-step. This is a list
                with length len(decoder_inputs) -- one item for each time-step.
                Each item is a 2D Tensor of shape [batch_size x cell.state_size].

    """
    assert attention_f is not None

    output_size = cell.output_size

    if dropout is not None:

        for c in cell._cells:
            c.input_keep_prob = 1.0 - dropout

    if initializer is None:
        initializer = tf.random_uniform_initializer(minval=-0.1,
                                                    maxval=0.1,
                                                    seed=_SEED)

    with vs.variable_scope(scope or "attention_decoder",
                           initializer=initializer):

        emb_inp = _embed_inputs(decoder_inputs,
                                num_symbols,
                                cell.input_size,
                                input_feeding=input_feeding)

        batch = array_ops.shape(emb_inp[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        cell_state = initial_state

        outputs = []
        batch_attn_size = array_ops.pack([batch, attn_size])

        # initial attention state
        ct = array_ops.zeros(batch_attn_size, dtype=dtype)
        ct.set_shape([None, attn_size])

        if decoder_states is None:
            cell_outputs = []
        else:
            cell_outputs = decoder_states

        for i in xrange(len(emb_inp)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            if input_feeding:
                # if using input_feeding, concatenate previous attention with input to layers
                inp = array_ops.concat(1, [emb_inp[i], ct])
            else:
                inp = emb_inp[i]

            if combine_inp_attn:
                # Merge input and previous attentions into one vector of the right size.
                x = cells.linear([inp] + [ct], cell.input_size, True)
            else:
                x = inp

            # Run the RNN.
            cell_output, new_state = cell(x, cell_state)
            cell_state = new_state

            if decoder_states is None:

                # states.append(new_state)  # new_state = dt#
                cell_outputs.append(cell_output)

            else:
                reshaped = tf.reshape(cell_output, [-1, 1, 1, attn_size])
                decoder_states = tf.concat(1, [decoder_states, reshaped])

            # dt = new_state
            if content_function is mod_bahdanau:
                dt = cell_outputs[-2]
            else:
                dt = cell_output

            ct = attention_f(decoder_hidden_state=dt,
                             hidden_attn=hidden,
                             initializer=initializer,
                             window_size=window_size,
                             content_function=content_function,
                             dtype=dtype)

            with vs.variable_scope("AttnOutputProjection",
                                   initializer=initializer):

                if decoder_states is None:

                    shape1 = len(cell_outputs)

                    top_states = [
                        tf.reshape(o, [-1, 1, attn_size]) for o in cell_outputs
                    ]

                    output_attention_states = tf.concat(1, top_states)

                    decoder_hidden = array_ops.reshape(
                        output_attention_states, [-1, shape1, 1, attn_size])

                    ht_hat = decoder_output_attention(decoder_hidden,
                                                      attn_size,
                                                      decoder_attention_f,
                                                      initializer=initializer)
                else:

                    decoder_hidden = decoder_states

                    ht_hat = decoder_output_attention(decoder_hidden,
                                                      attn_size,
                                                      decoder_attention_f,
                                                      initializer=initializer,
                                                      step_num=step_num)

                output = cells.linear([ct] + [ht_hat], output_size, True)

                output = tf.tanh(output)

            outputs.append(output)

    if decoder_states is None:

        cell_outs = [
            tf.reshape(o, [-1, 1, 1, attn_size]) for o in cell_outputs
        ]

        cell_outputs = tf.concat(1, cell_outs)

    else:

        cell_outputs = decoder_states

    return outputs, cell_state, cell_outputs
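At the end of each loop iteration above, the context vector ct and the attention over previous decoder outputs ht_hat are combined into the timestep output. A minimal NumPy sketch of that projection, with assumed sizes and a plain affine map standing in for cells.linear:

# Sketch of the per-timestep output projection: tanh(linear([ct ; ht_hat])).
import numpy as np

rng = np.random.default_rng(5)
batch, attn_size, output_size = 4, 32, 32

ct = rng.standard_normal((batch, attn_size))       # attention context
ht_hat = rng.standard_normal((batch, attn_size))   # decoder-output attention

W = rng.standard_normal((2 * attn_size, output_size))
b = rng.standard_normal(output_size)

output = np.tanh(np.concatenate([ct, ht_hat], axis=1) @ W + b)
print(output.shape)  # (4, 32)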
Code Example #10
File: attention.py  Project: yeab/tsf_nmt
def hybrid_attention(decoder_hidden_state,
                     hidden_attn,
                     initializer,
                     window_size=10,
                     content_function=vinyals_kaiser,
                     dtype=tf.float32):
    """Put hybrid attention (mix of global and local attention) on hidden using decoder hidden states
    and the hidden states of encoder (hidden_attn).

        Parameters
        ----------
        decoder_hidden_state : 2-D Tensor
            Tensor representing the current hidden state of the decoder (output of the recurrent layers).
            Shape is (?, decoder_size).
        hidden_attn : 4-D Tensor
            Tensor representing the hidden states of the encoder (output of the recurrent layers). It has
            shape (?, timesteps, 1, decoder_size) so it is possible to apply a 1-D convolution to calculate
            the attention score more efficiently.
        initializer : function
            Function to use when initializing variables within the variables context.
        window_size : int
            Size of each side of the window to use when applying local attention. Not relevant to global
            attention. Default to 10.
        content_function : function
            Content function to score the decoder hidden states and encoder hidden states to extract their
            weights. Default to 'vinyals_kaiser'.
        dtype : tensorflow dtype
            Type of tensors. Default to tf.float32

        Returns
        -------
        ds : 2-D Tensor
            Tensor representing the context vector generated after scoring the encoder and decoder hidden
            states. Has shape (?, decoder_size), i.e., one context vector per batch sample.

    """
    assert content_function is not None

    attention_vec_size = hidden_attn.get_shape()[3].value

    local_attn = local_attention(decoder_hidden_state=decoder_hidden_state,
                                 hidden_attn=hidden_attn,
                                 content_function=content_function,
                                 window_size=window_size,
                                 initializer=initializer,
                                 dtype=dtype)

    global_attn = global_attention(decoder_hidden_state=decoder_hidden_state,
                                   hidden_attn=hidden_attn,
                                   content_function=content_function,
                                   window_size=window_size,
                                   initializer=initializer,
                                   dtype=dtype)

    with vs.variable_scope("FeedbackGate_%d" % 0, initializer=initializer):
        y = cells.linear(decoder_hidden_state, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])

        vb = vs.get_variable("FeedbackVb_%d" % 0, [attention_vec_size],
                             initializer=initializer)

        # tanh(Wp*ht)
        tanh = math_ops.tanh(y)
        beta = math_ops.sigmoid(math_ops.reduce_sum((vb * tanh), [2, 3]))

        _ = tf.histogram_summary('hybrid_beta_weights', beta)

        attns = beta * global_attn + (1 - beta) * local_attn

    return attns
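The feedback gate above produces one scalar beta per batch element and mixes the two context vectors. A NumPy sketch with assumed shapes:

# Sketch of the hybrid gate: attns = beta * global + (1 - beta) * local.
import numpy as np

rng = np.random.default_rng(6)
batch, size = 4, 32

h_t = rng.standard_normal((batch, size))            # decoder hidden state
global_attn = rng.standard_normal((batch, size))
local_attn = rng.standard_normal((batch, size))

Wp = rng.standard_normal((size, size))
vb = rng.standard_normal(size)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

beta = sigmoid(np.tanh(h_t @ Wp) @ vb)[:, None]     # [batch, 1]
attns = beta * global_attn + (1.0 - beta) * local_attn
print(attns.shape)  # (4, 32)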
Code Example #11
File: attention.py  Project: yeab/tsf_nmt
def local_attention(decoder_hidden_state,
                    hidden_attn,
                    initializer,
                    window_size=10,
                    content_function=vinyals_kaiser,
                    dtype=tf.float32):
    """Put local attention on hidden using decoder hidden states and the hidden states of encoder (hidden_attn).

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output of the recurrent layers).
        Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the recurrent layers). It has
        shape (?, timesteps, 1, decoder_size) so it is possible to apply a 1-D convolution to calculate
        the attention score more efficiently.
    initializer : function
        Function to use when initializing variables within the variables context.
    window_size : int
        Size of each side of the window to use when applying local attention. Not relevant to global
        attention. Default to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder hidden states to extract their
        weights. Default to 'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of tensors. Default to tf.float32

    Returns
    -------
    ds : 2-D Tensor
        Tensor representing the context vector generated after scoring the encoder and decoder hidden
        states. Has shape (?, decoder_size), i.e., one context vector per batch sample.

    """
    assert content_function is not None
    sigma = window_size / 2
    denominator = sigma**2

    attention_vec_size = hidden_attn.get_shape()[3].value
    attn_length = hidden_attn.get_shape()[1].value

    batch_size = array_ops.shape(hidden_attn)[0]

    with vs.variable_scope("AttentionLocal", initializer=initializer):

        # apply content function to score the hidden states from the encoder
        s = content_function(hidden_attn, decoder_hidden_state)

        with vs.variable_scope("WindowPrediction", initializer=initializer):
            ht = cells.linear([decoder_hidden_state], attention_vec_size, True)

        # get the parameters (vp)
        vp = vs.get_variable("AttnVp_%d" % 0, [attention_vec_size],
                             initializer=initializer)

        # tanh(Wp*ht)
        tanh = math_ops.tanh(ht)
        # S * sigmoid(vp * tanh(Wp*ht))  - this is going to return a number
        # for each sentence in the batch - i.e., a tensor of shape batch x 1
        S = attn_length
        pt = math_ops.reduce_sum((vp * tanh), [2, 3])
        pt = math_ops.sigmoid(pt) * S

        # now we get only the integer part of the values
        pt = tf.floor(pt)

        _ = tf.histogram_summary('local_window_predictions', pt)

        # we now create a tensor containing the indices representing each position
        # of the sentence - i.e., if the sentence contains 5 tokens and batch_size is 3,
        # the resulting tensor will be:
        # [[0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]]
        #
        indices = []
        for pos in xrange(attn_length):
            indices.append(pos)
        indices = indices * batch_size
        idx = tf.convert_to_tensor(tf.to_float(indices), dtype=dtype)
        idx = tf.reshape(idx, [-1, attn_length])

        # here we calculate the boundaries of the attention window based on the positions
        low = pt - window_size + 1  # we add one because the floor op already generates the first position
        high = pt + window_size

        # here we check our positions against the boundaries
        mlow = tf.to_float(idx < low)
        mhigh = tf.to_float(idx > high)

        # now we combine both into a pre-mask that has 0s and 1s switched
        # i.e, at this point, True == 0 and False == 1
        m = mlow + mhigh  # batch_size

        # here we switch the 0s to 1s and the 1s to 0s
        # we correct the values so True == 1 and False == 0
        mask = tf.to_float(tf.equal(m, 0.0))

        # here we switch off all the values that fall outside the window
        # first we switch off those in the truncated normal
        alpha = s * mask
        masked_soft = nn_ops.softmax(alpha)

        _ = tf.histogram_summary('local_alpha_weights', alpha)

        # here we calculate the 'truncated normal distribution'
        numerator = -tf.pow((idx - pt), tf.convert_to_tensor(2, dtype=dtype))
        div = tf.truediv(numerator, denominator)
        e = math_ops.exp(div)  # result of the truncated normal distribution

        at = masked_soft * e

        # Now calculate the attention-weighted vector d.
        d = math_ops.reduce_sum(
            array_ops.reshape(at, [-1, attn_length, 1, 1]) * hidden_attn,
            [1, 2])
        ds = array_ops.reshape(d, [-1, attention_vec_size])

    _ = tf.histogram_summary('local_attention_context', ds)

    return ds
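The window machinery above can be hard to follow inside the graph code. A NumPy sketch with toy sizes (an illustration, not the graph code itself): predict a centre pt per sentence, zero out content scores outside [pt - D + 1, pt + D], softmax the masked scores, and reweight them with a Gaussian centred on pt (sigma = D / 2), as in the function above.

# Toy NumPy sketch of the local-attention window and Gaussian reweighting.
import numpy as np

rng = np.random.default_rng(7)
batch, attn_length, window_size = 3, 10, 4
sigma = window_size / 2.0

scores = rng.standard_normal((batch, attn_length))            # content scores s
pt = np.floor(rng.uniform(0, attn_length, size=(batch, 1)))   # predicted centres

idx = np.tile(np.arange(attn_length, dtype=float), (batch, 1))
low, high = pt - window_size + 1, pt + window_size
mask = ((idx >= low) & (idx <= high)).astype(float)           # 1 inside the window

alpha = scores * mask
masked_soft = np.exp(alpha) / np.exp(alpha).sum(axis=1, keepdims=True)
gauss = np.exp(-((idx - pt) ** 2) / sigma ** 2)

at = masked_soft * gauss                                      # final local weights
print(at.shape)  # (3, 10)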
Code Example #12
File: decoders.py  Project: yeab/tsf_nmt
def attention_decoder_nmt(decoder_inputs,
                          initial_state,
                          attention_states,
                          cell,
                          num_symbols,
                          attention_f=global_attention,
                          window_size=10,
                          content_function=vinyals_kaiser,
                          decoder_attention_f=decoder_type_2,
                          combine_inp_attn=False,
                          input_feeding=False,
                          dropout=None,
                          initializer=None,
                          dtype=tf.float32,
                          scope=None):
    """

    Helper function implementing an RNN decoder with global, local or hybrid attention for the
        sequence-to-sequence model.

    Parameters
    ----------

    decoder_inputs: list
            a list of 2D Tensors [batch_size x cell.input_size].

    initial_state: tensor
            2d Tensor [batch_size x (number of decoder layers * hidden_layer_size * 2)] if LSTM or
            [batch_size x (number of decoder layers * hidden_layer_size)] if GRU representing the initial
            state (usually, we take the states of the encoder) to be used when running the decoder. The '2' in
                the LSTM case means that we have to set both the hidden state and the cell state.

    attention_states: tensor
            3D tensor [batch_size x attn_length (time) x attn_size (hidden_layer_size)] representing the encoder
                hidden states that will be used to derive the context (attention) vector.

    cell: RNNCell
            rnn_cell.RNNCell defining the cell function and size.

    batch_size: tensor
            tensor representing the batch size used when training the model

    attention_f: function
            function indicating which type of attention to use. Default to global_attention.

    loop_function:
            if not None, this function will be applied to i-th output
                in order to generate i+1-th input, and decoder_inputs will be ignored,
                except for the first element ("GO" symbol). This can be used for decoding,
                but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
            Signature -- loop_function(prev, i) = next
                * prev is a 2D Tensor of shape [batch_size x cell.output_size],
                * i is an integer, the step number (when advanced control is needed),
                * next is a 2D Tensor of shape [batch_size x cell.input_size].

    window_size: int
            size of the window to apply on local attention. Defaults to 10.

    input_feeding : boolean
            Flag indicating whether to use the "input feeding approach" proposed by Luong et al. (2015).
                Default to False.

    content_function: function
            content function used to score the encoder and decoder hidden states. Default to vinyals_kaiser.

    dtype:
            The dtype to use for the RNN initial state (default: tf.float32).

    scope:
            VariableScope for the created subgraph; default: "attention_decoder".

    Returns
    -------

    outputs:
            A list of the same length as decoder_inputs of 2D Tensors of shape
                [batch_size x output_size]. These represent the generated outputs.
                Output i is computed from input i (which is either i-th decoder_inputs or
                loop_function(output {i-1}, i)) as follows. First, we run the cell
                on a combination of the input and previous attention masks:
                    cell_output, new_state = cell(linear(input, prev_attn), prev_state).
                Then, we calculate new attention masks:
                    new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
                and then we calculate the output:
                    output = linear(cell_output, new_attn).

    states:
            The state of each decoder cell in each time-step. This is a list
                with length len(decoder_inputs) -- one item for each time-step.
                Each item is a 2D Tensor of shape [batch_size x cell.state_size].

    """
    assert attention_f is not None

    output_size = cell.output_size

    if dropout is not None:

        cell.input_keep_prob = 1.0 - dropout

    if initializer is None:
        initializer = tf.random_uniform_initializer(minval=-0.1,
                                                    maxval=0.1,
                                                    seed=_SEED)

    with vs.variable_scope(scope or "embedding_attention_decoder",
                           initializer=initializer):
        emb_inp = _embed_inputs(decoder_inputs,
                                num_symbols,
                                cell.input_size,
                                input_feeding=input_feeding)

        batch = array_ops.shape(emb_inp[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        cell_states = initial_state
        initial_state_decoder = tf.zeros_like(initial_state)
        initial_state_decoder.set_shape(
            [None, initial_state.get_shape()[1].value])
        cell_outputs = [initial_state_decoder]
        outputs = []
        batch_attn_size = array_ops.pack([batch, attn_size])

        # initial attention state
        ct = array_ops.zeros(batch_attn_size, dtype=dtype)
        ct.set_shape([None, attn_size])

        for i in xrange(len(emb_inp)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            if input_feeding:
                # if using input_feeding, concatenate previous attention with input to layers
                inp = array_ops.concat(1, [emb_inp[i], ct])
            else:
                inp = emb_inp[i]

            if combine_inp_attn:
                # Merge input and previous attentions into one vector of the right size.
                x = cells.linear([inp] + [ct], cell.input_size, True)
            else:
                x = inp

            dt = cell_outputs[-1]

            ct = attention_f(decoder_hidden_state=dt,
                             hidden_attn=hidden,
                             initializer=initializer,
                             window_size=window_size,
                             content_function=content_function,
                             dtype=dtype)

            # Run the RNN.
            cell_output, new_state = cell(x, cell_states, context=ct)
            cell_states = new_state
            cell_outputs.append(cell_output)

            #
            with vs.variable_scope("AttnOutputProjection",
                                   initializer=initializer):

                with vs.variable_scope("AttnOutputProjection_logit_lstm",
                                       initializer=initializer):

                    # if we pass a list of tensors, linear will first concatenate them over axis 1
                    logit_lstm = cells.linear([cell_output], output_size, True)

                with vs.variable_scope("AttnOutputProjection_logit_ctx",
                                       initializer=initializer):

                    # if we pass a list of tensors, linear will first concatenate them over axis 1
                    logit_ctx = cells.linear([ct], output_size, True)

                with vs.variable_scope("AttnOutputProjection_logit_emb",
                                       initializer=initializer):

                    # if we pass a list of tensors, linear will first concatenate them over axis 1
                    logit_prev = cells.linear([x], output_size, True)

                # if we pass a list of tensors, linear will first concatenate them over axis 1
                output = tf.tanh(logit_lstm + logit_prev + logit_ctx)

            outputs.append(output)

    return outputs, cell_states
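The output layer of this decoder sums three separate projections (of the RNN output, the attention context and the embedded input) before the tanh. A minimal NumPy sketch with assumed sizes, where linear stands in for cells.linear:

# Sketch of the three-way output combination: tanh(W1.h + W2.ct + W3.x).
import numpy as np

rng = np.random.default_rng(8)
batch, size = 4, 32

cell_output = rng.standard_normal((batch, size))
ct = rng.standard_normal((batch, size))
x = rng.standard_normal((batch, size))

def linear(v, out):
    # stand-in for cells.linear: affine map with a fresh weight matrix per call
    return v @ rng.standard_normal((v.shape[1], out)) + rng.standard_normal(out)

output = np.tanh(linear(cell_output, size) + linear(ct, size) + linear(x, size))
print(output.shape)  # (4, 32)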
Code Example #13
def _output_form_single(decoder_states, output_size, initializer):

    with vs.variable_scope("decoder_states_linear", initializer=initializer):
        output = cells.linear([decoder_states], output_size, True)

    return output
Code Example #14
File: decoders.py  Project: chagge/tsf_nmt
def attention_decoder_output(decoder_inputs, initial_state, attention_states, cell, num_symbols,
                             attention_f=global_attention, window_size=10, content_function=vinyals_kaiser,
                             decoder_attention_f=decoder_type_2, combine_inp_attn=False, input_feeding=False,
                             dropout=None, initializer=None, decoder_states=None, step_num=None,
                             dtype=tf.float32, scope=None):
    """

    Helper function implementing an RNN decoder with global, local or hybrid attention for the
        sequence-to-sequence model.

    Parameters
    ----------

    decoder_inputs: list
            a list of 2D Tensors [batch_size x cell.input_size].

    initial_state: tensor
            2D Tensor [batch_size x cell.state_size] representing the initial state (usually the final
                encoder state) used to start the decoder.

    attention_states: tensor
            3D Tensor [batch_size x attn_length x attn_size] representing the encoder hidden states
                that will be used to derive the context (attention) vector.

    cell: RNNCell
            rnn_cell.RNNCell defining the cell function and size.

    batch_size: int
            batch size when training the model

    attention_f: function
            function indicating which type of attention to use. Default to global_attention.

    output_size: int
            size of the output vectors; if None, we use cell.output_size.

    loop_function:
            if not None, this function will be applied to i-th output
                in order to generate i+1-th input, and decoder_inputs will be ignored,
                except for the first element ("GO" symbol). This can be used for decoding,
                but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
            Signature -- loop_function(prev, i) = next
                * prev is a 2D Tensor of shape [batch_size x cell.output_size],
                * i is an integer, the step number (when advanced control is needed),
                * next is a 2D Tensor of shape [batch_size x cell.input_size].

    window_size: int
            size of the window to apply on local attention

    input_feeding: boolean
            whether or not to use the input feeding approach by Luong et al., 2015.

    content_function: function
            content function used to score the encoder and decoder hidden states. Default to vinyals_kaiser.

    dtype:
            The dtype to use for the RNN initial state (default: tf.float32).

    scope:
            VariableScope for the created subgraph; default: "attention_decoder".

    Returns
    -------

    outputs:
            A list of the same length as decoder_inputs of 2D Tensors of shape
                [batch_size x output_size]. These represent the generated outputs.
                Output i is computed from input i (which is either i-th decoder_inputs or
                loop_function(output {i-1}, i)) as follows. First, we run the cell
                on a combination of the input and previous attention masks:
                    cell_output, new_state = cell(linear(input, prev_attn), prev_state).
                Then, we calculate new attention masks:
                    new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
                and then we calculate the output:
                    output = linear(cell_output, new_attn).

    states:
            The state of each decoder cell in each time-step. This is a list
                with length len(decoder_inputs) -- one item for each time-step.
                Each item is a 2D Tensor of shape [batch_size x cell.state_size].

    """
    assert attention_f is not None

    output_size = cell.output_size

    if dropout is not None:

        for c in cell._cells:
            c.input_keep_prob = 1.0 - dropout

    if initializer is None:
        initializer = tf.random_uniform_initializer(minval=-0.1, maxval=0.1, seed=_SEED)

    with vs.variable_scope(scope or "attention_decoder", initializer=initializer):

        emb_inp = _embed_inputs(decoder_inputs, num_symbols, cell.input_size, input_feeding=input_feeding)

        batch = array_ops.shape(emb_inp[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])

        cell_state = initial_state

        outputs = []
        batch_attn_size = array_ops.pack([batch, attn_size])

        # initial attention state
        ct = array_ops.zeros(batch_attn_size, dtype=dtype)
        ct.set_shape([None, attn_size])

        if decoder_states is None:
            cell_outputs = []
        else:
            cell_outputs = decoder_states

        for i in xrange(len(emb_inp)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            if input_feeding:
                # if using input_feeding, concatenate previous attention with input to layers
                inp = array_ops.concat(1, [emb_inp[i], ct])
            else:
                inp = emb_inp[i]

            if combine_inp_attn:
                # Merge input and previous attentions into one vector of the right size.
                x = cells.linear([inp] + [ct], cell.input_size, True)
            else:
                x = inp

            # Run the RNN.
            cell_output, new_state = cell(x, cell_state)
            cell_state = new_state

            if decoder_states is None:

                # states.append(new_state)  # new_state = dt#
                cell_outputs.append(cell_output)

            else:
                reshaped = tf.reshape(cell_output, [-1, 1, 1, attn_size])
                decoder_states = tf.concat(1, [decoder_states, reshaped])

            # dt = new_state
            if content_function is mod_bahdanau:
                dt = cell_outputs[-2]
            else:
                dt = cell_output

            ct = attention_f(decoder_hidden_state=dt, hidden_attn=hidden,
                             initializer=initializer, window_size=window_size,
                             content_function=content_function, dtype=dtype)

            with vs.variable_scope("AttnOutputProjection", initializer=initializer):

                if decoder_states is None:

                    shape1 = len(cell_outputs)

                    top_states = [tf.reshape(o, [-1, 1, attn_size]) for o in cell_outputs]

                    output_attention_states = tf.concat(1, top_states)

                    decoder_hidden = array_ops.reshape(output_attention_states, [-1, shape1, 1, attn_size])

                    ht_hat = decoder_output_attention(decoder_hidden,
                                                      attn_size,
                                                      decoder_attention_f,
                                                      initializer=initializer)
                else:

                    decoder_hidden = decoder_states

                    ht_hat = decoder_output_attention(decoder_hidden,
                                                      attn_size,
                                                      decoder_attention_f,
                                                      initializer=initializer,
                                                      step_num=step_num)

                output = cells.linear([ct] + [ht_hat], output_size, True)

                output = tf.tanh(output)

            outputs.append(output)

    if decoder_states is None:

        cell_outs = [tf.reshape(o, [-1, 1, 1, attn_size]) for o in cell_outputs]

        cell_outputs = tf.concat(1, cell_outs)

    else:

        cell_outputs = decoder_states

    return outputs, cell_state, cell_outputs
Code Example #15
File: attention.py  Project: chagge/tsf_nmt
def hybrid_attention(decoder_hidden_state, hidden_attn, initializer, window_size=10,
                     content_function=vinyals_kaiser, dtype=tf.float32):
    """Put hybrid attention (mix of global and local attention) on hidden using decoder hidden states
    and the hidden states of encoder (hidden_attn).

        Parameters
        ----------
        decoder_hidden_state : 2-D Tensor
            Tensor representing the current hidden state of the decoder (output of the recurrent layers).
            Shape is (?, decoder_size).
        hidden_attn : 4-D Tensor
            Tensor representing the hidden states of the encoder (output of the recurrent layers). It has
            shape (?, timesteps, 1, decoder_size) so it is possible to apply a 1-D convolution to calculate
            the attention score more efficiently.
        initializer : function
            Function to use when initializing variables within the variables context.
        window_size : int
            Size of each side of the window to use when applying local attention. Not relevant to global
            attention. Default to 10.
        content_function : function
            Content function to score the decoder hidden states and encoder hidden states to extract their
            weights. Default to 'vinyals_kaiser'.
        dtype : tensorflow dtype
            Type of tensors. Default to tf.float32

        Returns
        -------
        ds : 2-D Tensor
            Tensor representing the context vector generated after scoring the encoder and decoder hidden
            states. Has shape (?, decoder_size), i.e., one context vector per batch sample.

    """
    assert content_function is not None

    attention_vec_size = hidden_attn.get_shape()[3].value

    local_attn = local_attention(decoder_hidden_state=decoder_hidden_state,
                                 hidden_attn=hidden_attn,
                                 content_function=content_function,
                                 window_size=window_size, initializer=initializer, dtype=dtype)

    global_attn = global_attention(decoder_hidden_state=decoder_hidden_state,
                                   hidden_attn=hidden_attn,
                                   content_function=content_function,
                                   window_size=window_size, initializer=initializer, dtype=dtype)

    with vs.variable_scope("FeedbackGate_%d" % 0, initializer=initializer):
        y = cells.linear(decoder_hidden_state, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])

        vb = vs.get_variable("FeedbackVb_%d" % 0, [attention_vec_size], initializer=initializer)

        # tanh(Wp*ht)
        tanh = math_ops.tanh(y)
        beta = math_ops.sigmoid(math_ops.reduce_sum((vb * tanh), [2, 3]))

        _ = tf.histogram_summary('hybrid_beta_weights', beta)

        attns = beta * global_attn + (1 - beta) * local_attn

    return attns
Code Example #16
File: lm_ops.py  Project: chagge/attentive_lm
def _output_form_single(decoder_states, output_size, initializer):

    with vs.variable_scope("decoder_states_linear", initializer=initializer):
        output = cells.linear([decoder_states], output_size, True)

    return output
Code Example #17
File: lm_ops.py  Project: chagge/attentive_lm
def _output_form_concat(cell_output, decoder_states, output_size, initializer):

    with vs.variable_scope("output_form", initializer=initializer):
        output = cells.linear([cell_output] + [decoder_states], output_size, True)

    return output
Code Example #18
File: decoders.py  Project: chagge/tsf_nmt
def attention_decoder_nmt(decoder_inputs, initial_state, attention_states, cell, num_symbols,
                          attention_f=global_attention, window_size=10, content_function=vinyals_kaiser,
                          decoder_attention_f=decoder_type_2, combine_inp_attn=False, input_feeding=False,
                          dropout=None, initializer=None, dtype=tf.float32, scope=None):
    """

    Helper function implementing an RNN decoder with global, local or hybrid attention for the
        sequence-to-sequence model.

    Parameters
    ----------

    decoder_inputs: list
            a list of 2D Tensors [batch_size x cell.input_size].

    initial_state: tensor
            2d Tensor [batch_size x (number of decoder layers * hidden_layer_size * 2)] if LSTM or
            [batch_size x (number of decoder layers * hidden_layer_size)] if GRU representing the initial
            state (usually, we take the states of the encoder) to be used when running the decoder. The '2' in
                the LSTM case means that we have to set both the hidden state and the cell state.

    attention_states: tensor
            3D tensor [batch_size x attn_length (time) x attn_size (hidden_layer_size)] representing the encoder
                hidden states that will be used to derive the context (attention) vector.

    cell: RNNCell
            rnn_cell.RNNCell defining the cell function and size.

    batch_size: tensor
            tensor representing the batch size used when training the model

    attention_f: function
            function indicating which type of attention to use. Default to global_attention.

    loop_function:
            if not None, this function will be applied to i-th output
                in order to generate i+1-th input, and decoder_inputs will be ignored,
                except for the first element ("GO" symbol). This can be used for decoding,
                but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
            Signature -- loop_function(prev, i) = next
                * prev is a 2D Tensor of shape [batch_size x cell.output_size],
                * i is an integer, the step number (when advanced control is needed),
                * next is a 2D Tensor of shape [batch_size x cell.input_size].

    window_size: int
            size of the window to apply on local attention. Defaults to 10.

    input_feeding : boolean
            Flag indicating whether to use the "input feeding approach" proposed by Luong et al. (2015).
                Default to False.

    content_function: function
            content function used to score the encoder and decoder hidden states. Default to vinyals_kaiser.

    dtype:
            The dtype to use for the RNN initial state (default: tf.float32).

    scope:
            VariableScope for the created subgraph; default: "attention_decoder".

    Returns
    -------

    outputs:
            A list of the same length as decoder_inputs of 2D Tensors of shape
                [batch_size x output_size]. These represent the generated outputs.
                Output i is computed from input i (which is either i-th decoder_inputs or
                loop_function(output {i-1}, i)) as follows. First, we run the cell
                on a combination of the input and previous attention masks:
                    cell_output, new_state = cell(linear(input, prev_attn), prev_state).
                Then, we calculate new attention masks:
                    new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
                and then we calculate the output:
                    output = linear(cell_output, new_attn).

    states:
            The state of each decoder cell in each time-step. This is a list
                with length len(decoder_inputs) -- one item for each time-step.
                Each item is a 2D Tensor of shape [batch_size x cell.state_size].

    """
    assert attention_f is not None

    output_size = cell.output_size

    if dropout is not None:

        cell.input_keep_prob = 1.0 - dropout

    if initializer is None:
        initializer = tf.random_uniform_initializer(minval=-0.1, maxval=0.1, seed=_SEED)

    with vs.variable_scope(scope or "embedding_attention_decoder", initializer=initializer):
        emb_inp = _embed_inputs(decoder_inputs, num_symbols, cell.input_size, input_feeding=input_feeding)

        batch = array_ops.shape(emb_inp[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])

        cell_states = initial_state
        initial_state_decoder = tf.zeros_like(initial_state)
        initial_state_decoder.set_shape([None, initial_state.get_shape()[1].value])
        cell_outputs = [initial_state_decoder]
        outputs = []
        batch_attn_size = array_ops.pack([batch, attn_size])

        # initial attention state
        ct = array_ops.zeros(batch_attn_size, dtype=dtype)
        ct.set_shape([None, attn_size])

        for i in xrange(len(emb_inp)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            if input_feeding:
                # if using input_feeding, concatenate previous attention with input to layers
                inp = array_ops.concat(1, [emb_inp[i], ct])
            else:
                inp = emb_inp[i]

            if combine_inp_attn:
                # Merge input and previous attentions into one vector of the right size.
                x = cells.linear([inp] + [ct], cell.input_size, True)
            else:
                x = inp

            dt = cell_outputs[-1]

            ct = attention_f(decoder_hidden_state=dt, hidden_attn=hidden,
                             initializer=initializer, window_size=window_size,
                             content_function=content_function, dtype=dtype)

            # Run the RNN.
            cell_output, new_state = cell(x, cell_states, context=ct)
            cell_states = new_state
            cell_outputs.append(cell_output)

            #
            with vs.variable_scope("AttnOutputProjection", initializer=initializer):

                with vs.variable_scope("AttnOutputProjection_logit_lstm", initializer=initializer):

                    # if we pass a list of tensors, linear will first concatenate them over axis 1
                    logit_lstm = cells.linear([cell_output], output_size, True)

                with vs.variable_scope("AttnOutputProjection_logit_ctx", initializer=initializer):

                    # if we pass a list of tensors, linear will first concatenate them over axis 1
                    logit_ctx = cells.linear([ct], output_size, True)

                with vs.variable_scope("AttnOutputProjection_logit_emb", initializer=initializer):

                    # if we pass a list of tensors, linear will first concatenate them over axis 1
                    logit_prev = cells.linear([x], output_size, True)

                # if we pass a list of tensors, linear will first concatenate them over axis 1
                output = tf.tanh(logit_lstm + logit_prev + logit_ctx)

            outputs.append(output)

    return outputs, cell_states
Code Example #19
File: attention.py  Project: oakfeiwang/attentive_lm
def score_combined(
        decoder_current,  # h_t
        decoder_previous,  # h_i
        reuse_variables=False,
        dtype=tf.float32):
    """ Applies a score function of the form

              v.(W1.hi + W2.hs)

  where W is a weight matrix, v is a vector of parameters, hi is one
  of each of the decoder hidden states and hs is the current hidden state at
  timestep t.

  The function performs a 1-by-1 convolution to calculate W.hi  and performs
  the W2.hs step using a ``linear'' cell (see cells.linear for the
  documentation)  and broadcasted into the result of W1.hi (encoder_hiddens)
  via multiplication step.  After this step a reduce_sum is performed over
  axis=[2,3] so the correct results are obtained.

  Args:
      decoder_current: not used
      attn_size: the size of the attention vectors
      encoder_hiddens: 3-D Tensor [batch_size, timestep, hidden_dim]. It
          represents the hidden sattes of the decoder up to the current
          timestep.
      current_hidden: Tensor, representing the current hidden state at
          timestep t

  Returns:
      beta: decoder hidden states after applying the content function

  """
    with tf.variable_scope("score_salton_combined") as scope:
        if reuse_variables:
            scope.reuse_variables()

        _, output_size = get_2d_tensor_shapes(decoder_current)

        decoder_current = cells.linear([decoder_current],
                                       output_size,
                                       bias=False,
                                       dtype=dtype)

        #
        decoder_previous, attn_dim = reshape_attention(decoder_previous)

        # we first get the correct weight matrix
        ws = tf.get_variable("AttnDecWs", [1, 1, attn_dim, attn_dim],
                             dtype=dtype)

        # we apply a small convolution to the decoder states - it is more
        # efficient than performing a recurrent matrix * matrix
        hidden_features = convolve(decoder_previous, ws)

        hidden_features = hidden_features + decoder_current

        # we then get the vector v that will be used on the second
        # multiplication op.
        vs = tf.get_variable("AttnDecVs", [attn_dim], dtype=dtype)

        scores = tf.reduce_sum((vs * tf.tanh(hidden_features)), [2, 3])

    return scores
Code Example #20
File: attention.py  Project: chagge/tsf_nmt
def local_attention(decoder_hidden_state, hidden_attn, initializer, window_size=10,
                    content_function=vinyals_kaiser, dtype=tf.float32):
    """Put local attention on hidden using decoder hidden states and the hidden states of encoder (hidden_attn).

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output of the recurrent layers).
        Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the recurrent layers). It has
        shape (?, timesteps, 1, decoder_size) so it is possible to apply a 1-D convolution to calculate
        the attention score more efficiently.
    initializer : function
        Function to use when initializing variables within the variables context.
    window_size : int
        Size of each side of the window to use when applying local attention. Not relevant to global
        attention. Default to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder hidden states to extract their
        weights. Default to 'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of tensors. Default to tf.float32

    Returns
    -------
    ds : 2-D Tensor
        Tensor representing the context vector generated after scoring the encoder and decoder hidden
        states. Has shape (?, decoder_size), i.e., one context vector per batch sample.

    """
    assert content_function is not None
    sigma = window_size / 2
    denominator = sigma ** 2

    attention_vec_size = hidden_attn.get_shape()[3].value
    attn_length = hidden_attn.get_shape()[1].value

    batch_size = array_ops.shape(hidden_attn)[0]

    with vs.variable_scope("AttentionLocal", initializer=initializer):

        # apply content function to score the hidden states from the encoder
        s = content_function(hidden_attn, decoder_hidden_state)

        with vs.variable_scope("WindowPrediction", initializer=initializer):
            ht = cells.linear([decoder_hidden_state], attention_vec_size, True)

        # get the parameters (vp)
        vp = vs.get_variable("AttnVp_%d" % 0, [attention_vec_size], initializer=initializer)

        # tanh(Wp*ht)
        tanh = math_ops.tanh(ht)
        # S * sigmoid(vp * tanh(Wp*ht))  - this is going to return a number
        # for each sentence in the batch - i.e., a tensor of shape batch x 1
        S = attn_length
        pt = math_ops.reduce_sum((vp * tanh), [2, 3])
        pt = math_ops.sigmoid(pt) * S

        # now we get only the integer part of the values
        pt = tf.floor(pt)

        _ = tf.histogram_summary('local_window_predictions', pt)

        # we now create a tensor containing the indices representing each position
        # of the sentence - i.e., if the sentence contains 5 tokens and batch_size is 3,
        # the resulting tensor will be:
        # [[0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]]
        #
        indices = []
        for pos in xrange(attn_length):
            indices.append(pos)
        indices = indices * batch_size
        idx = tf.convert_to_tensor(tf.to_float(indices), dtype=dtype)
        idx = tf.reshape(idx, [-1, attn_length])

        # here we calculate the boundaries of the attention window based on the positions
        low = pt - window_size + 1  # we add one because the floor op already generates the first position
        high = pt + window_size

        # here we check our positions against the boundaries
        mlow = tf.to_float(idx < low)
        mhigh = tf.to_float(idx > high)

        # now we combine both into a pre-mask that has 0s and 1s switched
        # i.e, at this point, True == 0 and False == 1
        m = mlow + mhigh  # batch_size

        # here we switch the 0s to 1s and the 1s to 0s
        # we correct the values so True == 1 and False == 0
        mask = tf.to_float(tf.equal(m, 0.0))

        # here we switch off all the values that fall outside the window
        # first we switch off those in the truncated normal
        alpha = s * mask
        masked_soft = nn_ops.softmax(alpha)

        _ = tf.histogram_summary('local_alpha_weights', alpha)

        # here we calculate the 'truncated normal distribution'
        numerator = -tf.pow((idx - pt), tf.convert_to_tensor(2, dtype=dtype))
        div = tf.truediv(numerator, denominator)
        e = math_ops.exp(div)  # result of the truncated normal distribution

        at = masked_soft * e

        # Now calculate the attention-weighted vector d.
        d = math_ops.reduce_sum(
                array_ops.reshape(at, [-1, attn_length, 1, 1]) * hidden_attn,
                [1, 2])
        ds = array_ops.reshape(d, [-1, attention_vec_size])

    _ = tf.histogram_summary('local_attention_context', ds)

    return ds