# Example 1
def _test_activation_coverage(act_type):
    """Build an activation-based coverage model and check the updated coverage.

    Asserts the output shape and that, per batch row, the number of source
    positions whose coverage is close to act_type(0) equals the source length.
    """
    cov_config = sockeye.coverage.CoverageConfig(type=act_type,
                                                 max_fertility=2,
                                                 num_hidden=2,
                                                 layer_normalization=False)
    enc_hidden, dec_hidden, seq_len, batch = 5, 5, 10, 4

    # Symbolic inputs of the coverage graph (shapes noted alongside).
    source = mx.sym.Variable("source")                      # (batch, seq_len, enc_hidden)
    source_length = mx.sym.Variable("source_length")        # (batch,)
    prev_hidden = mx.sym.Variable("prev_hidden")            # (batch, dec_hidden)
    prev_coverage = mx.sym.Variable("prev_coverage")        # (batch, seq_len, num_hidden)
    attention_scores = mx.sym.Variable("attention_scores")  # (batch, seq_len)

    shapes = {"source": (batch, seq_len, enc_hidden),
              "source_length": (batch,),
              "prev_hidden": (batch, dec_hidden),
              "attention_scores": (batch, seq_len),
              "prev_coverage": (batch, seq_len, cov_config.num_hidden)}

    # Random inputs, drawn in the same order as the original test; the
    # attention scores are normalized into a single distribution.
    data = {"source": gaussian_vector(shape=shapes["source"]),
            "source_length": integer_vector(shape=shapes["source_length"],
                                            max_value=seq_len),
            "prev_hidden": gaussian_vector(shape=shapes["prev_hidden"]),
            "prev_coverage": gaussian_vector(shape=shapes["prev_coverage"]),
            "attention_scores": uniform_vector(shape=shapes["attention_scores"])}
    data["attention_scores"] = data["attention_scores"] / np.sum(data["attention_scores"])

    coverage = sockeye.coverage.get_coverage(cov_config)
    coverage_func = coverage.on(source, source_length, seq_len)
    updated = coverage_func(prev_hidden, attention_scores, prev_coverage)

    executor = updated.simple_bind(ctx=mx.cpu(), **shapes)
    for name, values in data.items():
        executor.arg_dict[name][:] = values
    new_coverage = executor.forward()[0].asnumpy()

    assert new_coverage.shape == shapes["prev_coverage"]
    # The activation applied to a zero input; positions whose coverage has at
    # least one hidden unit close to this value are counted per batch row.
    baseline = mx.nd.Activation(mx.nd.zeros((1, 1)), act_type=act_type).asnumpy()
    matches = np.sum(np.isclose(new_coverage, baseline, atol=1.e-6), axis=2) != 0
    assert (np.sum(matches, axis=1) == data["source_length"]).all()
# Example 2
def test_coverage_attention(attention_coverage_type,
                            attention_coverage_num_hidden,
                            batch_size=3,
                            encoder_num_hidden=2,
                            decoder_num_hidden=2):
    """Run one coverage-attention step and check output shapes and probabilities.

    With a fresh attention state, each valid source position is expected to
    receive probability 1/length.
    """
    seq_len = 10
    # Symbolic graph inputs.
    source = mx.sym.Variable("source")                # (batch, seq_len, enc_hidden)
    source_length = mx.sym.Variable("source_length")  # (batch,)

    cov_cfg = sockeye.coverage.CoverageConfig(type=attention_coverage_type,
                                              num_hidden=attention_coverage_num_hidden,
                                              layer_normalization=False)
    att_cfg = sockeye.rnn_attention.AttentionConfig(type="coverage",
                                                    num_hidden=5,
                                                    input_previous_word=False,
                                                    source_num_hidden=encoder_num_hidden,
                                                    query_num_hidden=decoder_num_hidden,
                                                    layer_normalization=False,
                                                    config_coverage=cov_cfg)
    attention = sockeye.rnn_attention.get_attention(att_cfg, max_seq_len=seq_len)

    # Build one attention step on top of the initial state.
    state = attention.get_initial_state(source_length, seq_len)
    step = attention.on(source, source_length, seq_len)
    att_input = attention.make_input(0, mx.sym.Variable("word_vec_prev"),
                                     mx.sym.Variable("decoder_state"))
    state = step(att_input, state)
    outputs = mx.sym.Group([state.context, state.probs, state.dynamic_source])

    src_shape = (batch_size, seq_len, encoder_num_hidden)
    len_shape = (batch_size,)
    dec_shape = (batch_size, decoder_num_hidden)

    executor = outputs.simple_bind(ctx=mx.cpu(),
                                   source=src_shape,
                                   source_length=len_shape,
                                   decoder_state=dec_shape)

    lengths = integer_vector(shape=len_shape, max_value=seq_len)
    executor.arg_dict["source"][:] = gaussian_vector(shape=src_shape)
    executor.arg_dict["source_length"][:] = lengths
    executor.arg_dict["decoder_state"][:] = gaussian_vector(shape=dec_shape)

    context, probs, dynamic_source = (out.asnumpy() for out in executor.forward())

    # One uniform 1/length value per batch row, broadcast over positions.
    expected = (1. / lengths).reshape((batch_size, 1))

    assert context.shape == (batch_size, encoder_num_hidden)
    assert probs.shape == (batch_size, seq_len)
    assert dynamic_source.shape == (batch_size, seq_len, attention_coverage_num_hidden)
    assert (np.sum(np.isclose(probs, expected), axis=1) == lengths).all()
# Example 3
def _test_gru_coverage():
    """Build a GRU coverage model and check the shape and sparsity of its update.

    Asserts that, per batch row, the number of source positions where some
    hidden unit of the new coverage differs from 1 equals the source length.
    """
    config = sockeye.coverage.CoverageConfig(type="gru",
                                             num_hidden=2,
                                             layer_normalization=False)
    enc_hidden, dec_hidden, seq_len, batch = 5, 5, 10, 4

    # Graph inputs (shapes noted inline).
    src = mx.sym.Variable("source")              # (batch, seq_len, enc_hidden)
    src_len = mx.sym.Variable("source_length")   # (batch,)
    hid_prev = mx.sym.Variable("prev_hidden")    # (batch, dec_hidden)
    cov_prev = mx.sym.Variable("prev_coverage")  # (batch, seq_len, num_hidden)
    att = mx.sym.Variable("attention_scores")    # (batch, seq_len)

    src_shape = (batch, seq_len, enc_hidden)
    len_shape = (batch,)
    hid_shape = (batch, dec_hidden)
    att_shape = (batch, seq_len)
    cov_shape = (batch, seq_len, config.num_hidden)

    src_data = gaussian_vector(shape=src_shape)
    len_data = integer_vector(shape=len_shape, max_value=seq_len)
    hid_data = gaussian_vector(shape=hid_shape)
    cov_data = gaussian_vector(shape=cov_shape)
    att_data = uniform_vector(shape=att_shape)
    att_data = att_data / np.sum(att_data)  # normalize scores to sum to one

    model = sockeye.coverage.get_coverage(config)
    step = model.on(src, src_len, seq_len)
    new_cov_sym = step(hid_prev, att, cov_prev)

    executor = new_cov_sym.simple_bind(ctx=mx.cpu(),
                                       source=src_shape,
                                       source_length=len_shape,
                                       prev_hidden=hid_shape,
                                       prev_coverage=cov_shape,
                                       attention_scores=att_shape)
    executor.arg_dict["source"][:] = src_data
    executor.arg_dict["source_length"][:] = len_data
    executor.arg_dict["prev_hidden"][:] = hid_data
    executor.arg_dict["prev_coverage"][:] = cov_data
    executor.arg_dict["attention_scores"][:] = att_data

    new_cov = executor.forward()[0].asnumpy()
    assert new_cov.shape == cov_shape
    # Count positions where at least one hidden unit differs from 1; that
    # count must match each row's source length.
    assert (np.sum(np.sum(new_cov != 1, axis=2) != 0, axis=1) == len_data).all()
# Example 4
def _test_activation_coverage(act_type):
    """Check an activation-based coverage model's output shape and per-row counts.

    Asserts that, per batch row, the number of positions whose coverage is
    close to act_type(0) in at least one hidden unit equals the source length.
    """
    cfg = sockeye.coverage.CoverageConfig(type=act_type, num_hidden=2,
                                          layer_normalization=False)
    n_enc, n_dec, seq_len, batch = 5, 5, 10, 4

    # One symbolic variable per named graph input.
    names = ["source", "source_length", "prev_hidden", "prev_coverage",
             "attention_scores"]
    syms = {name: mx.sym.Variable(name) for name in names}

    shapes = {
        "source": (batch, seq_len, n_enc),
        "source_length": (batch,),
        "prev_hidden": (batch, n_dec),
        "attention_scores": (batch, seq_len),
        "prev_coverage": (batch, seq_len, cfg.num_hidden),
    }

    # Random input data; attention scores are normalized afterwards.
    feed = {
        "source": gaussian_vector(shape=shapes["source"]),
        "source_length": integer_vector(shape=shapes["source_length"],
                                        max_value=seq_len),
        "prev_hidden": gaussian_vector(shape=shapes["prev_hidden"]),
        "prev_coverage": gaussian_vector(shape=shapes["prev_coverage"]),
        "attention_scores": uniform_vector(shape=shapes["attention_scores"]),
    }
    feed["attention_scores"] = feed["attention_scores"] / np.sum(feed["attention_scores"])

    coverage_func = sockeye.coverage.get_coverage(cfg).on(
        syms["source"], syms["source_length"], seq_len)
    updated = coverage_func(syms["prev_hidden"], syms["attention_scores"],
                            syms["prev_coverage"])

    executor = updated.simple_bind(ctx=mx.cpu(), **shapes)
    for name in names:
        executor.arg_dict[name][:] = feed[name]
    out = executor.forward()[0].asnumpy()

    assert out.shape == shapes["prev_coverage"]
    # act_type applied to a zero input; count positions matching it per row.
    baseline = mx.nd.Activation(mx.nd.zeros((1, 1)), act_type=act_type).asnumpy()
    assert (np.sum(np.sum(np.isclose(out, baseline, atol=1.e-6), axis=2) != 0,
                   axis=1) == feed["source_length"]).all()
# Example 5
def _test_gru_coverage():
    """Check a GRU coverage model's output shape and per-row update counts.

    Asserts that, per batch row, the number of positions with any hidden unit
    different from 1 equals the source length.
    """
    cfg = sockeye.coverage.CoverageConfig(type="gru", num_hidden=2,
                                          layer_normalization=False)
    encoder_hidden, decoder_hidden, seq_len, batch = 5, 5, 10, 4

    shapes = {
        "source": (batch, seq_len, encoder_hidden),
        "source_length": (batch,),
        "prev_hidden": (batch, decoder_hidden),
        "attention_scores": (batch, seq_len),
        "prev_coverage": (batch, seq_len, cfg.num_hidden),
    }
    variables = {name: mx.sym.Variable(name) for name in shapes}

    # Random data drawn in the same order as the original test; the attention
    # scores are then normalized into a single distribution.
    feed = {
        "source": gaussian_vector(shape=shapes["source"]),
        "source_length": integer_vector(shape=shapes["source_length"],
                                        max_value=seq_len),
        "prev_hidden": gaussian_vector(shape=shapes["prev_hidden"]),
        "prev_coverage": gaussian_vector(shape=shapes["prev_coverage"]),
        "attention_scores": uniform_vector(shape=shapes["attention_scores"]),
    }
    feed["attention_scores"] = feed["attention_scores"] / np.sum(feed["attention_scores"])
    lengths = feed["source_length"]

    step = sockeye.coverage.get_coverage(cfg).on(variables["source"],
                                                 variables["source_length"],
                                                 seq_len)
    updated = step(variables["prev_hidden"], variables["attention_scores"],
                   variables["prev_coverage"])

    executor = updated.simple_bind(ctx=mx.cpu(), **shapes)
    for name, array in feed.items():
        executor.arg_dict[name][:] = array
    new_coverage = executor.forward()[0].asnumpy()

    assert new_coverage.shape == shapes["prev_coverage"]
    # Count positions where at least one hidden unit differs from 1.
    assert (np.sum(np.sum(new_coverage != 1, axis=2) != 0, axis=1) == lengths).all()
# Example 6
def test_step(cell_type, context_gating,
              num_embed=2,
              encoder_num_hidden=5,
              decoder_num_hidden=5):
    """Run a single RecurrentDecoder._step with coverage attention and verify output shapes.

    :param cell_type: RNN cell type; must be C.GRU_TYPE or C.LSTM_TYPE, otherwise ValueError.
    :param context_gating: Forwarded to RecurrentDecoderConfig.
    :param num_embed: Width of the previous-word embedding input.
    :param encoder_num_hidden: Hidden size of the symbolic encoder output.
    :param decoder_num_hidden: Hidden size of the decoder RNN and its states.
    """

    # NOTE(review): vocab_size is never used below; the trailing comma is harmless.
    vocab_size, batch_size, source_seq_len = 10, 10, 7,

    # (batch_size, source_seq_len, encoder_num_hidden)
    source = mx.sym.Variable("source")
    source_shape = (batch_size, source_seq_len, encoder_num_hidden)
    # (batch_size,)
    source_length = mx.sym.Variable("source_length")
    source_length_shape = (batch_size,)
    # (batch_size, num_embed)
    word_vec_prev = mx.sym.Variable("word_vec_prev")
    word_vec_prev_shape = (batch_size, num_embed)
    # (batch_size, decoder_num_hidden)
    hidden_prev = mx.sym.Variable("hidden_prev")
    hidden_prev_shape = (batch_size, decoder_num_hidden)
    # List(mx.sym.Symbol(batch_size, decoder_num_hidden)
    states_shape = (batch_size, decoder_num_hidden)

    # Coverage-based attention over the source with a tanh coverage model.
    config_coverage = sockeye.coverage.CoverageConfig(type="tanh",
                                                      num_hidden=2,
                                                      layer_normalization=False)
    config_attention = sockeye.rnn_attention.AttentionConfig(type="coverage",
                                                             num_hidden=2,
                                                             input_previous_word=False,
                                                             source_num_hidden=decoder_num_hidden,
                                                             query_num_hidden=decoder_num_hidden,
                                                             layer_normalization=False,
                                                             config_coverage=config_coverage)
    attention = sockeye.rnn_attention.get_attention(config_attention, max_seq_len=source_seq_len)
    attention_state = attention.get_initial_state(source_length, source_seq_len)
    attention_func = attention.on(source, source_length, source_seq_len)

    config_rnn = sockeye.rnn.RNNConfig(cell_type=cell_type,
                                       num_hidden=decoder_num_hidden,
                                       num_layers=1,
                                       dropout_inputs=0.,
                                       dropout_states=0.,
                                       residual=False,
                                       forget_bias=0.)

    config_decoder = sockeye.decoder.RecurrentDecoderConfig(max_seq_len_source=source_seq_len,
                                                            rnn_config=config_rnn,
                                                            attention_config=config_attention,
                                                            context_gating=context_gating)

    decoder = sockeye.decoder.RecurrentDecoder(config=config_decoder)

    # One random state symbol per layer; LSTM gets two per layer
    # (presumably hidden + cell state — confirm against sockeye.rnn).
    if cell_type == C.GRU_TYPE:
        layer_states = [gaussian_vector(shape=states_shape, return_symbol=True) for _ in range(config_rnn.num_layers)]
    elif cell_type == C.LSTM_TYPE:
        layer_states = [gaussian_vector(shape=states_shape, return_symbol=True) for _ in range(config_rnn.num_layers*2)]
    else:
        raise ValueError

    state, attention_state = decoder._step(word_vec_prev=word_vec_prev,
                                           state=sockeye.decoder.RecurrentDecoderState(hidden_prev, layer_states),
                                           attention_func=attention_func,
                                           attention_state=attention_state)
    sym = mx.sym.Group([state.hidden, attention_state.probs, attention_state.dynamic_source])

    executor = sym.simple_bind(ctx=mx.cpu(),
                               source=source_shape,
                               source_length=source_length_shape,
                               word_vec_prev=word_vec_prev_shape,
                               hidden_prev=hidden_prev_shape)
    executor.arg_dict["source"][:] = gaussian_vector(source_shape)
    executor.arg_dict["source_length"][:] = integer_vector(source_length_shape, source_seq_len)
    executor.arg_dict["word_vec_prev"][:] = gaussian_vector(word_vec_prev_shape)
    executor.arg_dict["hidden_prev"][:] = gaussian_vector(hidden_prev_shape)
    # NOTE(review): arg_dict normally maps argument names to NDArrays; binding a
    # list of state *symbols* here looks like a no-op for the executor — confirm.
    executor.arg_dict["states"] = layer_states
    hidden_result, attention_probs_result, attention_dynamic_source_result = executor.forward()

    assert hidden_result.shape == hidden_prev_shape
    assert attention_probs_result.shape == (batch_size, source_seq_len)
    assert attention_dynamic_source_result.shape == (batch_size, source_seq_len, config_coverage.num_hidden)
# Example 7
def test_step(cell_type,
              context_gating,
              num_embed=2,
              encoder_num_hidden=5,
              decoder_num_hidden=5):
    """Run a single RecurrentDecoder._step with coverage attention and verify output shapes.

    :param cell_type: RNN cell type; must be C.GRU_TYPE or C.LSTM_TYPE, otherwise ValueError.
    :param context_gating: Forwarded to RecurrentDecoderConfig.
    :param num_embed: Width of the previous-word embedding input.
    :param encoder_num_hidden: Hidden size of the symbolic encoder output.
    :param decoder_num_hidden: Hidden size of the decoder RNN and its states.
    """

    # NOTE(review): vocab_size is never used below; the trailing comma is harmless.
    vocab_size, batch_size, source_seq_len = 10, 10, 7,

    # (batch_size, source_seq_len, encoder_num_hidden)
    source = mx.sym.Variable("source")
    source_shape = (batch_size, source_seq_len, encoder_num_hidden)
    # (batch_size,)
    source_length = mx.sym.Variable("source_length")
    source_length_shape = (batch_size, )
    # (batch_size, num_embed)
    word_vec_prev = mx.sym.Variable("word_vec_prev")
    word_vec_prev_shape = (batch_size, num_embed)
    # (batch_size, decoder_num_hidden)
    hidden_prev = mx.sym.Variable("hidden_prev")
    hidden_prev_shape = (batch_size, decoder_num_hidden)
    # List(mx.sym.Symbol(batch_size, decoder_num_hidden)
    states_shape = (batch_size, decoder_num_hidden)

    # Coverage-based attention over the source with a tanh coverage model.
    config_coverage = sockeye.coverage.CoverageConfig(
        type="tanh", num_hidden=2, layer_normalization=False)
    config_attention = sockeye.rnn_attention.AttentionConfig(
        type="coverage",
        num_hidden=2,
        input_previous_word=False,
        source_num_hidden=decoder_num_hidden,
        query_num_hidden=decoder_num_hidden,
        layer_normalization=False,
        config_coverage=config_coverage)
    attention = sockeye.rnn_attention.get_attention(config_attention,
                                                    max_seq_len=source_seq_len)
    attention_state = attention.get_initial_state(source_length,
                                                  source_seq_len)
    attention_func = attention.on(source, source_length, source_seq_len)

    config_rnn = sockeye.rnn.RNNConfig(cell_type=cell_type,
                                       num_hidden=decoder_num_hidden,
                                       num_layers=1,
                                       dropout_inputs=0.,
                                       dropout_states=0.,
                                       residual=False,
                                       forget_bias=0.)

    config_decoder = sockeye.decoder.RecurrentDecoderConfig(
        max_seq_len_source=source_seq_len,
        rnn_config=config_rnn,
        attention_config=config_attention,
        context_gating=context_gating)

    decoder = sockeye.decoder.RecurrentDecoder(config=config_decoder)

    # One random state symbol per layer; LSTM gets two per layer
    # (presumably hidden + cell state — confirm against sockeye.rnn).
    if cell_type == C.GRU_TYPE:
        layer_states = [
            gaussian_vector(shape=states_shape, return_symbol=True)
            for _ in range(config_rnn.num_layers)
        ]
    elif cell_type == C.LSTM_TYPE:
        layer_states = [
            gaussian_vector(shape=states_shape, return_symbol=True)
            for _ in range(config_rnn.num_layers * 2)
        ]
    else:
        raise ValueError

    state, attention_state = decoder._step(
        word_vec_prev=word_vec_prev,
        state=sockeye.decoder.RecurrentDecoderState(hidden_prev, layer_states),
        attention_func=attention_func,
        attention_state=attention_state)
    sym = mx.sym.Group(
        [state.hidden, attention_state.probs, attention_state.dynamic_source])

    executor = sym.simple_bind(ctx=mx.cpu(),
                               source=source_shape,
                               source_length=source_length_shape,
                               word_vec_prev=word_vec_prev_shape,
                               hidden_prev=hidden_prev_shape)
    executor.arg_dict["source"][:] = gaussian_vector(source_shape)
    executor.arg_dict["source_length"][:] = integer_vector(
        source_length_shape, source_seq_len)
    executor.arg_dict["word_vec_prev"][:] = gaussian_vector(
        word_vec_prev_shape)
    executor.arg_dict["hidden_prev"][:] = gaussian_vector(hidden_prev_shape)
    # NOTE(review): arg_dict normally maps argument names to NDArrays; binding a
    # list of state *symbols* here looks like a no-op for the executor — confirm.
    executor.arg_dict["states"] = layer_states
    hidden_result, attention_probs_result, attention_dynamic_source_result = executor.forward(
    )

    assert hidden_result.shape == hidden_prev_shape
    assert attention_probs_result.shape == (batch_size, source_seq_len)
    assert attention_dynamic_source_result.shape == (
        batch_size, source_seq_len, config_coverage.num_hidden)
# Example 8
def test_coverage_attention(attention_coverage_type,
                            attention_coverage_num_hidden,
                            batch_size=3,
                            encoder_num_hidden=2,
                            decoder_num_hidden=2):
    """One coverage-attention step: check output shapes and initial probabilities.

    From the initial attention state, each row's valid positions are expected
    to carry probability 1/length.
    """
    seq_len = 10
    source = mx.sym.Variable("source")                # (batch, seq_len, enc_hidden)
    source_length = mx.sym.Variable("source_length")  # (batch,)

    coverage_cfg = sockeye.coverage.CoverageConfig(
        type=attention_coverage_type,
        num_hidden=attention_coverage_num_hidden,
        layer_normalization=False)
    attention_cfg = sockeye.rnn_attention.AttentionConfig(
        type="coverage",
        num_hidden=5,
        input_previous_word=False,
        source_num_hidden=encoder_num_hidden,
        query_num_hidden=decoder_num_hidden,
        layer_normalization=False,
        config_coverage=coverage_cfg)
    attention = sockeye.rnn_attention.get_attention(attention_cfg,
                                                    max_seq_len=seq_len)

    # Apply one attention step to the initial state.
    state = attention.get_initial_state(source_length, seq_len)
    attend = attention.on(source, source_length, seq_len)
    state = attend(
        attention.make_input(0, mx.sym.Variable("word_vec_prev"),
                             mx.sym.Variable("decoder_state")),
        state)
    outputs = mx.sym.Group([state.context, state.probs, state.dynamic_source])

    shapes = {
        "source": (batch_size, seq_len, encoder_num_hidden),
        "source_length": (batch_size,),
        "decoder_state": (batch_size, decoder_num_hidden),
    }
    executor = outputs.simple_bind(ctx=mx.cpu(), **shapes)

    lengths = integer_vector(shape=shapes["source_length"], max_value=seq_len)
    executor.arg_dict["source"][:] = gaussian_vector(shape=shapes["source"])
    executor.arg_dict["source_length"][:] = lengths
    executor.arg_dict["decoder_state"][:] = gaussian_vector(
        shape=shapes["decoder_state"])

    context, probs, dyn_source = [o.asnumpy() for o in executor.forward()]

    # One 1/length value per batch row, broadcast over positions.
    uniform = (1. / lengths).reshape((batch_size, 1))

    assert context.shape == (batch_size, encoder_num_hidden)
    assert probs.shape == (batch_size, seq_len)
    assert dyn_source.shape == (batch_size, seq_len,
                                attention_coverage_num_hidden)
    assert (np.sum(np.isclose(probs, uniform), axis=1) == lengths).all()