Example #1
 def test_crf_with_loss_op(self, num_tags, num_words):
     model = CNNModelHelper(name='external')
     embeddings_dim = 200
     embeddings = np.random.randn(num_words,
                                  embeddings_dim).astype(np.float32)
     transitions = np.random.uniform(low=-1,
                                     high=1,
                                     size=(num_tags + 2,
                                           num_tags + 2)).astype(np.float32)
     labels = np.random.randint(num_tags, size=(num_words)).astype(np.int64)
     embeddings_blob, labels_blob, transitions_blob = (
         model.net.AddExternalInputs('embeddings_blob', 'labels_blob',
                                     'crf_transitions'))
     workspace.FeedBlob(str(embeddings_blob), embeddings)
     workspace.FeedBlob(str(labels_blob), labels)
     workspace.FeedBlob(str(transitions_blob), transitions)
     predictions_blob = model.FC(
         embeddings_blob,
         "fc_0",
         embeddings_dim,
         num_tags,
         ('UniformFill', {'min': -1.0, 'max': 1.0}),
         ('UniformFill', {'min': -1.0, 'max': 1.0}),
     )
     crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob)
     crf_loss = crf_layer.crf_loss(predictions_blob, labels_blob)
     model.net.AddGradientOperators([crf_loss])
     workspace.RunNetOnce(model.param_init_net)
     workspace.RunNetOnce(model.net)
     loss = workspace.FetchBlob(str(crf_loss))
     predictions = workspace.FetchBlob(str(predictions_blob))
     np.testing.assert_allclose(
         loss,
         self._compute_loss_manual(predictions, num_tags, labels,
                                   transitions),
         atol=0.001,
         rtol=0.001,
         err_msg='CRF LOSS is not matching the reference')
Example #2
def MILSTM(model,
           input_blob,
           seq_lengths,
           initial_states,
           dim_in,
           dim_out,
           scope,
           outputs_with_grads=(0, ),
           memory_optimization=False,
           forget_bias=0.0):
    '''
    Adds the MI (multiplicative integration) flavor of the standard LSTM
    recurrent network operator to a model.
    See https://arxiv.org/pdf/1606.06630.pdf

    model: CNNModelHelper object that new operators will be added to

    input_blob: the input sequence in T x N x D format,
    where T is the sequence length, N the batch size and D the input dimension

    seq_lengths: blob containing the sequence lengths, which is passed to
    the LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are the inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of the output blobs which will
    receive an external error gradient during backpropagation

    memory_optimization: if enabled, the LSTM step is recomputed on the
                   backward pass so that forward activations don't need to be
                   stored for each timestep. Saves memory at the cost of extra
                   computation.
    '''
    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(input_blob,
                          s('i2h'),
                          dim_in=dim_in,
                          dim_out=4 * dim_out,
                          axis=2)
    """ the step net """
    step_model = CNNModelHelper(name='milstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs('input_t', 'timestep',
                                               'cell_t_prev', 'hidden_t_prev'))
    # hU^T
    # Shape: [1, batch_size, 4 * hidden_size]
    prev_t = step_model.FC(hidden_t_prev,
                           s('prev_t'),
                           dim_in=dim_out,
                           dim_out=4 * dim_out,
                           axis=2)
    # defining MI parameters
    alpha = step_model.param_init_net.ConstantFill([], [s('alpha')],
                                                   shape=[4 * dim_out],
                                                   value=1.0)
    beta1 = step_model.param_init_net.ConstantFill([], [s('beta1')],
                                                   shape=[4 * dim_out],
                                                   value=1.0)
    beta2 = step_model.param_init_net.ConstantFill([], [s('beta2')],
                                                   shape=[4 * dim_out],
                                                   value=1.0)
    b = step_model.param_init_net.ConstantFill([], [s('b')],
                                               shape=[4 * dim_out],
                                               value=0.0)
    model.params.extend([alpha, beta1, beta2, b])
    # alpha * (xW^T * hU^T)
    # Shape: [1, batch_size, 4 * hidden_size]
    alpha_tdash = step_model.net.Mul([prev_t, input_t], s('alpha_tdash'))
    # Shape: [batch_size, 4 * hidden_size]
    alpha_tdash_rs, _ = step_model.net.Reshape(
        alpha_tdash,
        [s('alpha_tdash_rs'), s('alpha_tdash_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    alpha_t = step_model.net.Mul([alpha_tdash_rs, alpha],
                                 s('alpha_t'),
                                 broadcast=1,
                                 use_grad_hack=1)
    # beta1 * hU^T
    # Shape: [batch_size, 4 * hidden_size]
    prev_t_rs, _ = step_model.net.Reshape(
        prev_t,
        [s('prev_t_rs'), s('prev_t_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    beta1_t = step_model.net.Mul([prev_t_rs, beta1],
                                 s('beta1_t'),
                                 broadcast=1,
                                 use_grad_hack=1)
    # beta2 * xW^T
    # Shape: [batch_size, 4 * hidden_size]
    input_t_rs, _ = step_model.net.Reshape(
        input_t,
        [s('input_t_rs'), s('input_t_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    beta2_t = step_model.net.Mul([input_t_rs, beta2],
                                 s('beta2_t'),
                                 broadcast=1,
                                 use_grad_hack=1)
    # Add 'em all up
    gates_tdash = step_model.net.Sum([alpha_t, beta1_t, beta2_t],
                                     s('gates_tdash'))
    gates_t = step_model.net.Add([gates_tdash, b],
                                 s('gates_t'),
                                 broadcast=1,
                                 use_grad_hack=1)
    # Shape: [1, batch_size, 4 * hidden_size]
    gates_t_rs, _ = step_model.net.Reshape(
        gates_t,
        [s('gates_t_rs'), s('gates_t_old_shape')],
        shape=[1, -1, 4 * dim_out],
    )

    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t_rs, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
        forget_bias=forget_bias,
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)
    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=[gates_t] if memory_optimization else None)
    return output, last_output, all_states, last_state
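Below is a minimal usage sketch for MILSTM (not part of the original example). The blob names, dimensions, and the assumption that MILSTM and its dependency recurrent_net are available in the current module (e.g. caffe2.python.recurrent in older Caffe2 releases) are illustrative.

import numpy as np
from caffe2.python import workspace
from caffe2.python.cnn import CNNModelHelper

# Illustrative sizes: sequence length, batch size, input dim, hidden dim.
T, N, D, H = 5, 2, 16, 32
model = CNNModelHelper(name='milstm_example')
input_blob, seq_lengths, hidden_init, cell_init = model.net.AddExternalInputs(
    'input_blob', 'seq_lengths', 'hidden_init', 'cell_init')
output, last_output, all_states, last_state = MILSTM(
    model, input_blob, seq_lengths, (hidden_init, cell_init),
    dim_in=D, dim_out=H, scope='milstm')
# Feed concrete values for the external inputs before running the nets.
workspace.FeedBlob(str(input_blob), np.random.randn(T, N, D).astype(np.float32))
workspace.FeedBlob(str(seq_lengths), np.full(N, T, dtype=np.int32))
workspace.FeedBlob(str(hidden_init), np.zeros((1, N, H), dtype=np.float32))
workspace.FeedBlob(str(cell_init), np.zeros((1, N, H), dtype=np.float32))
workspace.RunNetOnce(model.param_init_net)
workspace.RunNetOnce(model.net)
print(workspace.FetchBlob(str(output)).shape)  # expected: (T, N, H)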
Example #3
def LSTMWithAttention(
    model,
    decoder_inputs,
    decoder_input_lengths,
    initial_decoder_hidden_state,
    initial_decoder_cell_state,
    initial_attention_weighted_encoder_context,
    encoder_output_dim,
    encoder_outputs,
    decoder_input_dim,
    decoder_state_dim,
    scope,
    attention_type=AttentionType.Regular,
    outputs_with_grads=(0, 4),
    weighted_encoder_outputs=None,
    lstm_memory_optimization=False,
    attention_memory_optimization=False,
    forget_bias=0.0,
):
    '''
    Adds an LSTM with an attention mechanism to a model.

    The implementation is based on https://arxiv.org/abs/1409.0473, with
    a small difference in the order in which the new attention context and
    the new hidden state are computed, similar to
    https://arxiv.org/abs/1508.04025.

    The model uses encoder-decoder naming conventions,
    where the decoder is the sequence the op iterates over,
    while computing the attention context over the encoder.

    model: CNNModelHelper object that new operators will be added to

    decoder_inputs: the input sequence in T x N x D format,
    where T is the sequence length, N the batch size and D the input dimension

    decoder_input_lengths: blob containing the sequence lengths,
    which is passed to the LSTMUnit operator

    initial_decoder_hidden_state: initial hidden state of the LSTM

    initial_decoder_cell_state: initial cell state of the LSTM

    initial_attention_weighted_encoder_context: initial attention context

    encoder_output_dim: dimension of the encoder outputs

    encoder_outputs: the sequence over which the attention context is computed
    at every iteration

    decoder_input_dim: input dimension (the last dimension of decoder_inputs)

    decoder_state_dim: size of the hidden states of the LSTM

    attention_type: one of AttentionType.Regular, AttentionType.Recurrent.
    Determines which type of attention mechanism to use.

    outputs_with_grads: position indices of the output blobs which will
    receive an external error gradient during backpropagation

    weighted_encoder_outputs: encoder outputs used to compute the attention
    weights. In the basic case this is just a linear transformation of the
    encoder outputs (the default, when weighted_encoder_outputs is None).
    However, it can be something more complicated, such as a separate
    encoder network (for example, in the case of a convolutional encoder).

    lstm_memory_optimization: recompute LSTM activations on the backward pass,
                 so their values don't need to be stored during the forward pass

    attention_memory_optimization: recompute attention for the backward pass
    '''
    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    decoder_inputs = model.FC(
        decoder_inputs,
        s('i2h'),
        dim_in=decoder_input_dim,
        dim_out=4 * decoder_state_dim,
        axis=2,
    )
    # [batch_size, encoder_output_dim, encoder_length]
    encoder_outputs_transposed = model.Transpose(
        encoder_outputs,
        s('encoder_outputs_transposed'),
        axes=[1, 2, 0],
    )
    if weighted_encoder_outputs is None:
        weighted_encoder_outputs = model.FC(
            encoder_outputs,
            s('weighted_encoder_outputs'),
            dim_in=encoder_output_dim,
            dim_out=encoder_output_dim,
            axis=2,
        )
    step_model = CNNModelHelper(
        name='lstm_with_attention_cell',
        param_model=model,
    )
    (
        input_t,
        timestep,
        cell_t_prev,
        hidden_t_prev,
        attention_weighted_encoder_context_t_prev,
    ) = (step_model.net.AddScopedExternalInputs(
        'input_t',
        'timestep',
        'cell_t_prev',
        'hidden_t_prev',
        'attention_weighted_encoder_context_t_prev',
    ))
    step_model.net.AddExternalInputs(encoder_outputs_transposed,
                                     weighted_encoder_outputs)

    gates_concatenated_input_t, _ = step_model.net.Concat(
        [hidden_t_prev, attention_weighted_encoder_context_t_prev],
        [
            s('gates_concatenated_input_t'),
            s('_gates_concatenated_input_t_concat_dims'),
        ],
        axis=2,
    )
    gates_t = step_model.FC(
        gates_concatenated_input_t,
        s('gates_t'),
        dim_in=decoder_state_dim + encoder_output_dim,
        dim_out=4 * decoder_state_dim,
        axis=2,
    )
    step_model.net.Sum([gates_t, input_t], gates_t)

    hidden_t_intermediate, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, decoder_input_lengths, timestep],
        ['hidden_t_intermediate', s('cell_t')],
        forget_bias=forget_bias,
    )
    if attention_type == AttentionType.Recurrent:
        attention_weighted_encoder_context_t, _, attention_blobs = apply_recurrent_attention(
            model=step_model,
            encoder_output_dim=encoder_output_dim,
            encoder_outputs_transposed=encoder_outputs_transposed,
            weighted_encoder_outputs=weighted_encoder_outputs,
            decoder_hidden_state_t=hidden_t_intermediate,
            decoder_hidden_state_dim=decoder_state_dim,
            scope=scope,
            attention_weighted_encoder_context_t_prev=(
                attention_weighted_encoder_context_t_prev),
        )
    else:
        attention_weighted_encoder_context_t, _, attention_blobs = apply_regular_attention(
            model=step_model,
            encoder_output_dim=encoder_output_dim,
            encoder_outputs_transposed=encoder_outputs_transposed,
            weighted_encoder_outputs=weighted_encoder_outputs,
            decoder_hidden_state_t=hidden_t_intermediate,
            decoder_hidden_state_dim=decoder_state_dim,
            scope=scope,
        )
    hidden_t = step_model.Copy(hidden_t_intermediate, s('hidden_t'))
    step_model.net.AddExternalOutputs(
        cell_t,
        hidden_t,
        attention_weighted_encoder_context_t,
    )
    recompute_blobs = []
    if attention_memory_optimization:
        recompute_blobs.extend(attention_blobs)
    if lstm_memory_optimization:
        recompute_blobs.extend([gates_t])

    return recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[
            (input_t, decoder_inputs),
        ],
        initial_cell_inputs=[
            (hidden_t_prev, initial_decoder_hidden_state),
            (cell_t_prev, initial_decoder_cell_state),
            (
                attention_weighted_encoder_context_t_prev,
                initial_attention_weighted_encoder_context,
            ),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
            attention_weighted_encoder_context_t_prev: (
                attention_weighted_encoder_context_t),
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=recompute_blobs,
    )
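A minimal usage sketch for LSTMWithAttention follows (not part of the original example); all blob names and dimensions are illustrative assumptions. As in the source above, the function simply returns the outputs of recurrent_net, so the result is left unpacked here.

import numpy as np
from caffe2.python import workspace
from caffe2.python.cnn import CNNModelHelper

# Illustrative sizes: decoder/encoder lengths, batch size, decoder input dim,
# decoder state dim, encoder output dim.
T_dec, T_enc, N, D_in, H, E = 4, 6, 2, 8, 16, 12
model = CNNModelHelper(name='attention_example')
(decoder_inputs, decoder_input_lengths, hidden_init, cell_init,
 attention_init, encoder_outputs) = model.net.AddExternalInputs(
     'decoder_inputs', 'decoder_input_lengths', 'hidden_init', 'cell_init',
     'attention_init', 'encoder_outputs')
results = LSTMWithAttention(
    model,
    decoder_inputs,
    decoder_input_lengths,
    hidden_init,
    cell_init,
    attention_init,
    encoder_output_dim=E,
    encoder_outputs=encoder_outputs,
    decoder_input_dim=D_in,
    decoder_state_dim=H,
    scope='attention_decoder',
)
# Feed concrete values for the external inputs before running the nets.
workspace.FeedBlob(str(decoder_inputs),
                   np.random.randn(T_dec, N, D_in).astype(np.float32))
workspace.FeedBlob(str(decoder_input_lengths),
                   np.full(N, T_dec, dtype=np.int32))
workspace.FeedBlob(str(hidden_init), np.zeros((1, N, H), dtype=np.float32))
workspace.FeedBlob(str(cell_init), np.zeros((1, N, H), dtype=np.float32))
workspace.FeedBlob(str(attention_init), np.zeros((1, N, E), dtype=np.float32))
workspace.FeedBlob(str(encoder_outputs),
                   np.random.randn(T_enc, N, E).astype(np.float32))
workspace.RunNetOnce(model.param_init_net)
workspace.RunNetOnce(model.net)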
Example #4
def LSTM(model,
         input_blob,
         seq_lengths,
         initial_states,
         dim_in,
         dim_out,
         scope,
         outputs_with_grads=(0, ),
         return_params=False,
         memory_optimization=False,
         forget_bias=0.0):
    '''
    Adds a standard LSTM recurrent network operator to a model.

    model: CNNModelHelper object that new operators will be added to

    input_blob: the input sequence in T x N x D format,
    where T is the sequence length, N the batch size and D the input dimension

    seq_lengths: blob containing the sequence lengths, which is passed to
    the LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are the inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of the output blobs which will
    receive an external error gradient during backpropagation

    return_params: if True, also returns a dictionary of the LSTM's parameters

    memory_optimization: if enabled, the LSTM step is recomputed on the
                   backward pass so that forward activations don't need to be
                   stored for each timestep. Saves memory at the cost of extra
                   computation.
    '''
    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(input_blob,
                          s('i2h'),
                          dim_in=dim_in,
                          dim_out=4 * dim_out,
                          axis=2)
    """ the step net """
    step_model = CNNModelHelper(name='lstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs('input_t', 'timestep',
                                               'cell_t_prev', 'hidden_t_prev'))
    gates_t = step_model.FC(hidden_t_prev,
                            s('gates_t'),
                            dim_in=dim_out,
                            dim_out=4 * dim_out,
                            axis=2)
    step_model.net.Sum([gates_t, input_t], gates_t)
    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
        forget_bias=forget_bias,
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)
    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=[gates_t] if memory_optimization else None)
    if return_params:
        params = {
            'input': {
                'weights': input_blob + "_w",
                'biases': input_blob + '_b'
            },
            'recurrent': {
                'weights': gates_t + "_w",
                'biases': gates_t + '_b'
            }
        }
        return output, last_output, all_states, last_state, params
    else:
        return output, last_output, all_states, last_state
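A short sketch of the return_params option (not part of the original example; blob names and dimensions are illustrative). It shows how the returned dictionary exposes the blob names of the input-to-hidden and recurrent weights and biases.

from caffe2.python.cnn import CNNModelHelper

model = CNNModelHelper(name='lstm_example')
input_blob, seq_lengths, hidden_init, cell_init = model.net.AddExternalInputs(
    'input_blob', 'seq_lengths', 'hidden_init', 'cell_init')
output, last_output, all_states, last_state, params = LSTM(
    model, input_blob, seq_lengths, (hidden_init, cell_init),
    dim_in=16, dim_out=32, scope='lstm', return_params=True)
# params['input'] holds the i2h weight/bias blob names and params['recurrent']
# the hidden-to-gates weight/bias blob names, e.g. for custom initialization.
print(params['input']['weights'], params['recurrent']['weights'])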
Example #5
def LSTM(model,
         input_blob,
         seq_lengths,
         initial_states,
         dim_in,
         dim_out,
         scope,
         outputs_with_grads=(0, )):
    '''
    Adds a standard LSTM recurrent network operator to a model.

    model: CNNModelHelper object that new operators will be added to

    input_blob: the input sequence in T x N x D format,
    where T is the sequence length, N the batch size and D the input dimension

    seq_lengths: blob containing the sequence lengths, which is passed to
    the LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are the inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of the output blobs which will
    receive an external error gradient during backpropagation
    '''
    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(input_blob,
                          s('i2h'),
                          dim_in=dim_in,
                          dim_out=4 * dim_out,
                          axis=2)
    """ the step net """
    step_model = CNNModelHelper(name='lstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs('input_t', 'timestep',
                                               'cell_t_prev', 'hidden_t_prev'))
    gates_t = step_model.FC(hidden_t_prev,
                            s('gates_t'),
                            dim_in=dim_out,
                            dim_out=4 * dim_out,
                            axis=2)
    step_model.net.Sum([gates_t, input_t], gates_t)
    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)
    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
    )
    return output, last_output, all_states, last_state
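Because the per-timestep output keeps the same T x N x D layout as the input, two of these LSTM layers can be stacked by feeding the first layer's output sequence as the second layer's input. A brief sketch (illustrative names and dimensions, not part of the original example):

from caffe2.python.cnn import CNNModelHelper

model = CNNModelHelper(name='stacked_lstm_example')
(input_blob, seq_lengths, hidden_init_0, cell_init_0,
 hidden_init_1, cell_init_1) = model.net.AddExternalInputs(
     'input_blob', 'seq_lengths', 'hidden_init_0', 'cell_init_0',
     'hidden_init_1', 'cell_init_1')
# First layer: input dim 16 -> hidden dim 32.
output_0, _, _, _ = LSTM(model, input_blob, seq_lengths,
                         (hidden_init_0, cell_init_0),
                         dim_in=16, dim_out=32, scope='lstm_0')
# Second layer consumes the first layer's hidden-state sequence.
output_1, last_output_1, _, _ = LSTM(model, output_0, seq_lengths,
                                     (hidden_init_1, cell_init_1),
                                     dim_in=32, dim_out=32, scope='lstm_1')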