# Example no. 1
def test_openai_gpt2():
    """Check the TF GPT-2 re-implementation against HuggingFace's PyTorch GPT2Model."""
    from transformers import GPT2Model, GPT2Tokenizer

    text = "Here is some text to encode"
    # Reference hidden states from the HuggingFace PyTorch model.
    hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    hf_model = GPT2Model.from_pretrained("gpt2", return_dict=True)
    hf_outputs = hf_model(**hf_tokenizer([text], return_tensors="pt"))

    lm_task = build_task({
        "class": "lm",
        "params": {
            "data_pipeline.class": "GPT2DataPipeline",
            "max_len": 50,
            "begin_of_sentence": "eos"
        }
    })

    tf_model = lm_task.build_model(get_hyper_parameters("gpt2_117m"))
    restore_checkpoint_if_possible_v2(tf_model, "117M", model_name="OpenAIGPT2")
    token_ids = lm_task._data_pipeline.process(text)
    feeds = {
        "trg_input": tf.convert_to_tensor([token_ids], tf.int64),
        "trg_length": tf.convert_to_tensor([len(token_ids)], tf.int64)
    }
    _, init_state = tf_model.get_symbols_to_logits_fn(feeds, is_training=False, is_inference=False)
    hidden = tf_model.get_decoder_output(init_state["decoder_input"],
                                         cache=init_state["decoder_internal_cache"],
                                         is_training=False)
    # Last position is dropped so lengths line up with the HF reference.
    assert_equal_numpy(hf_outputs.last_hidden_state.detach().numpy(), hidden[:, :-1].numpy(), 5e-4)
def test_multiheadself_attention():
    """TF vs PT multi-head self-attention with a padding bias must agree after weight sync."""
    seq_len = 4
    heads = 2
    units = 4
    out_depth = 3
    tf_layer = MultiHeadSelfAttention(num_heads=heads, num_units=units,
                                      output_depth=out_depth, attention_dropout_rate=0.)
    pt_layer = PTMultiHeadSelfAttention(input_depth=units, num_heads=heads, num_units=units,
                                        output_depth=out_depth, attention_dropout_rate=0.)

    q = numpy.random.rand(1, seq_len, units)
    b = numpy.random.rand(1, seq_len)
    tf_q = tf.convert_to_tensor(q, dtype=tf.float32)
    tf_b = tf.convert_to_tensor(b, dtype=tf.float32)
    pt_q = torch.FloatTensor(q)
    pt_b = torch.FloatTensor(b)
    # Trigger variable creation on both sides before copying weights.
    _ = tf_layer(tf_q)
    _ = pt_layer(pt_q)
    for sub_name in ("_qkv_transform_layer", "_output_transform_layer"):
        pt_sub = getattr(pt_layer, sub_name)
        tf_sub = getattr(tf_layer, sub_name)
        pt_sub._kernel.data = torch.FloatTensor(tf_sub._kernel.numpy())
        pt_sub._bias.data = torch.FloatTensor(tf_sub._bias.numpy())
    assert_equal_numpy(tf_layer(tf_q, bias=tf_b, is_training=False).numpy(),
                       pt_layer(pt_q, bias=pt_b, is_training=False).detach().numpy())
# Example no. 3
def test_subsampler():
    """TF vs PT audio conv subsampling layer, with and without layer norm.

    The original duplicated the entire weight-copy sequence between the two
    configurations; it is factored into a single helper here.
    """

    def _sync_weights(pt_layer, tf_layer, with_norm):
        """Copy TF weights into the PT layer so both compute identical outputs."""
        pt_convs = (pt_layer._conv_layer1, pt_layer._conv_layer2)
        for pt_conv, tf_conv in zip(pt_convs, tf_layer._conv_layers):
            # TF conv kernels are (H, W, in, out); torch expects (out, in, H, W).
            pt_conv.weight.data = torch.FloatTensor(
                tf_conv.kernel.numpy().transpose((3, 2, 0, 1)))
            pt_conv.bias.data = torch.FloatTensor(tf_conv.bias.numpy())
        if with_norm:
            pt_norms = (pt_layer._norm_layer1, pt_layer._norm_layer2)
            for pt_norm, tf_norm in zip(pt_norms, tf_layer._norm_layers):
                # TF LayerNorm stores gamma/beta; torch uses weight/bias.
                pt_norm.weight.data = torch.FloatTensor(tf_norm.gamma.numpy())
                pt_norm.bias.data = torch.FloatTensor(tf_norm.beta.numpy())
        # Dense kernels only need a 2-D transpose (TF is (in, out), torch (out, in)).
        pt_layer._dense_layer.weight.data = torch.FloatTensor(
            tf_layer._dense_layer.kernel.numpy().transpose())
        pt_layer._dense_layer.bias.data = torch.FloatTensor(
            tf_layer._dense_layer.bias.numpy())

    inp = numpy.random.rand(1, 19, 80, 1)
    pt_inp = torch.FloatTensor(inp)
    tf_inp = tf.convert_to_tensor(inp, tf.float32)
    # with layer norm
    tf_layer = TFAudioConvSubsamplingLayer(40)
    pt_layer = AudioConvSubsamplingLayer(40, input_dimension=80)
    _ = tf_layer(tf_inp)  # build variables before copying weights
    _ = pt_layer(pt_inp)
    _sync_weights(pt_layer, tf_layer, with_norm=True)
    assert_equal_numpy(
        pt_layer(pt_inp).detach().numpy(),
        tf_layer(tf_inp).numpy(), 5e-5)

    # without layer norm
    tf_layer = TFAudioConvSubsamplingLayer(40, layer_norm=False)
    pt_layer = AudioConvSubsamplingLayer(40,
                                         input_dimension=80,
                                         layer_norm=False)
    _ = tf_layer(tf_inp)
    _ = pt_layer(pt_inp)
    _sync_weights(pt_layer, tf_layer, with_norm=False)
    assert_equal_numpy(
        pt_layer(pt_inp).detach().numpy(),
        tf_layer(tf_inp).numpy(), 1e-6)
# Example no. 4
def test_emb():
    """Shared-weight word embedding: TF and PT agree in embedding and linear modes."""
    dim, vocab = 5, 10
    tf_emb = WordEmbeddingSharedWeights(dim, vocab, True)
    pt_emb = PTWordEmbeddingSharedWeights(dim, vocab, True)
    ids_2d = numpy.random.randint(0, 9, [2, 5])
    ids_1d = numpy.random.randint(0, 9, [3, ])
    feat_2d = numpy.random.rand(2, 5)
    feat_3d = numpy.random.rand(2, 4, 5)
    tf_ids_2d = tf.convert_to_tensor(ids_2d, tf.int32)
    tf_ids_1d = tf.convert_to_tensor(ids_1d, tf.int32)
    tf_feat_2d = tf.convert_to_tensor(feat_2d, tf.float32)
    tf_feat_3d = tf.convert_to_tensor(feat_3d, tf.float32)
    pt_ids_2d = torch.IntTensor(ids_2d)
    pt_ids_1d = torch.IntTensor(ids_1d)
    pt_feat_2d = torch.FloatTensor(feat_2d)
    pt_feat_3d = torch.FloatTensor(feat_3d)
    # Build both modules, then mirror TF weights into the PT module.
    _ = tf_emb(tf_feat_2d, mode="linear")
    _ = pt_emb(pt_feat_2d, mode="linear")
    pt_emb._shared_weights.data = torch.Tensor(tf_emb._shared_weights.numpy())
    pt_emb._bias.data = torch.Tensor(tf_emb._bias.numpy())
    checks = [
        (tf_emb(tf_feat_2d, mode="linear"), pt_emb(pt_feat_2d, mode="linear")),
        (tf_emb(tf_feat_3d, mode="linear"), pt_emb(pt_feat_3d, mode="linear")),
        (tf_emb(tf_ids_2d), pt_emb(pt_ids_2d)),
        (tf_emb(tf_ids_1d), pt_emb(pt_ids_1d)),
    ]
    for tf_out, pt_out in checks:
        assert_equal_numpy(tf_out.numpy(), pt_out.detach().numpy())
# Example no. 5
def test_ffn():
    """Transformer feed-forward network: TF and PT outputs match after weight copy."""
    sample = numpy.random.rand(3, 5)
    tf_x = tf.convert_to_tensor(sample, tf.float32)
    pt_x = torch.FloatTensor(sample)
    tf_ffn = TransformerFFN(7, 11, 0.1)
    expected = tf_ffn(tf_x, is_training=False)
    pt_ffn = PTTransformerFFN(5, 7, 11, 0.1)
    _ = pt_ffn(pt_x, is_training=False)  # build before assigning weights
    for pt_dense, tf_conv in ((pt_ffn._dense1, tf_ffn._conv1),
                              (pt_ffn._dense2, tf_ffn._conv2)):
        # TF kernels are (in, out); torch Linear stores (out, in).
        pt_dense.weight.data = torch.FloatTensor(
            tf_conv.kernel.numpy().transpose([1, 0]))
        pt_dense.bias.data = torch.FloatTensor(tf_conv.bias.numpy())
    actual = pt_ffn(pt_x, is_training=False)
    assert_equal_numpy(actual.detach().numpy(), expected.numpy())
# Example no. 6
def test_multihead_dense():
    """Multi-head dense layer parity for both non-output and output transforms."""
    heads = 3
    out_sizes = (6, 12)
    in_size = 6
    sample = numpy.random.randn(2, 3, in_size)
    pt_x = torch.FloatTensor(sample)
    tf_x = tf.convert_to_tensor(sample, dtype=tf.float32)
    tf_layer = MultiHeadDenseLayer(out_sizes,
                                   heads,
                                   use_bias=True,
                                   is_output_transform=False,
                                   name="nonoutput_transform")
    pt_layer = PTMultiHeadDenseLayer(in_size,
                                     out_sizes,
                                     heads,
                                     use_bias=True,
                                     is_output_transform=False)
    _ = pt_layer(pt_x)  # build before copying weights
    tf_out = tf_layer(tf_x)
    pt_layer._kernel.data = torch.FloatTensor(tf_layer._kernel.numpy())
    pt_layer._bias.data = torch.FloatTensor(tf_layer._bias.numpy())

    # The non-output transform yields one tensor per entry of out_sizes.
    for expected, actual in zip(tf_out, pt_layer(pt_x)):
        assert_equal_numpy(expected.numpy(), actual.detach().numpy())

    per_head = 5
    out_size = 6
    sample = numpy.random.randn(1, 2, heads, per_head)
    tf_x = tf.convert_to_tensor(sample)
    pt_x = torch.FloatTensor(sample)
    tf_layer = MultiHeadDenseLayer(out_size,
                                   heads,
                                   use_bias=True,
                                   is_output_transform=True,
                                   name="output_transform")
    pt_layer = PTMultiHeadDenseLayer(heads * per_head,
                                     out_size,
                                     heads,
                                     use_bias=True,
                                     is_output_transform=True)
    tf_out = tf_layer(tf_x)
    _ = pt_layer(pt_x)
    pt_layer._kernel.data = torch.FloatTensor(tf_layer._kernel.numpy())
    pt_layer._bias.data = torch.FloatTensor(tf_layer._bias.numpy())
    assert_equal_numpy(tf_out.numpy(), pt_layer(pt_x).detach().numpy())
# Example no. 7
def test_incremental_encode():
    """Incremental (streaming) encoding must reproduce the full-sequence pass."""
    steps = 5
    inputs = tf.random.normal([1, steps, 8])
    inputs_padding = tf.convert_to_tensor([[0., 0., 0., 0., 0.]], dtype=tf.float32)
    encoder = TransformerEncoder(
        num_layers=2,
        hidden_size=8,
        num_attention_heads=2,
        filter_size=20,
        attention_monotonic=True,
    )
    full_outputs = encoder(inputs, inputs_padding, is_training=False)

    # A single incremental call over the whole sequence should equal the full pass.
    one_shot, _ = encoder.incremental_encode(inputs, {}, time=0)
    assert_equal_numpy(full_outputs.numpy(), one_shot.numpy(), 1e-5)

    # Three chunks (2 + 1 + 2 steps) stitched together should also match.
    chunk0, cache = encoder.incremental_encode(inputs[:, :2], {}, time=0)
    chunk1, cache = encoder.incremental_encode(inputs[:, 2], cache, time=2)
    chunk2, cache = encoder.incremental_encode(inputs[:, 3:], cache, time=3)
    assert_equal_numpy(
        full_outputs.numpy(),
        tf.concat([chunk0, chunk1, chunk2], axis=1), 1e-5)
# Example no. 8
def test_position_embedding():
    """Sinusoidal position-embedding wrapper parity between TF and PT."""
    tf_signal = PositionEmbeddingWrapper.add_sinusoids_timing_signal(
        tf.zeros([1, 10, 10]), None)
    pt_signal = PTPositionEmbeddingWrapper.add_sinusoids_timing_signal(
        torch.zeros(1, 10, 10), None)
    assert_equal_numpy(tf_signal.numpy(), pt_signal.detach().numpy())

    dim, vocab = 5, 10
    tf_emb = WordEmbeddingSharedWeights(dim, vocab, False)
    pt_emb = PTWordEmbeddingSharedWeights(dim, vocab, False)
    ids_2d = numpy.random.randint(0, 9, [2, 5])
    ids_1d = numpy.random.randint(0, 9, [3, ])
    feat_2d = numpy.random.rand(2, 5)
    feat_3d = numpy.random.rand(2, 4, 5)
    tf_ids_2d = tf.convert_to_tensor(ids_2d, tf.int32)
    tf_ids_1d = tf.convert_to_tensor(ids_1d, tf.int32)
    tf_feat_2d = tf.convert_to_tensor(feat_2d, tf.float32)
    tf_feat_3d = tf.convert_to_tensor(feat_3d, tf.float32)
    pt_ids_2d = torch.IntTensor(ids_2d)
    pt_ids_1d = torch.IntTensor(ids_1d)
    pt_feat_2d = torch.FloatTensor(feat_2d)
    pt_feat_3d = torch.FloatTensor(feat_3d)
    # Build both embeddings, then share the TF weights with the PT module.
    _ = tf_emb(tf_feat_2d, mode="linear")
    _ = pt_emb(pt_feat_2d, mode="linear")
    pt_emb._shared_weights.data = torch.Tensor(tf_emb._shared_weights.numpy())
    tf_posemb = PositionEmbeddingWrapper("sinusoids", tf_emb)
    pt_posemb = PTPositionEmbeddingWrapper("sinusoids", pt_emb)
    checks = [
        (tf_posemb(tf_feat_2d, mode="linear"), pt_posemb(pt_feat_2d, mode="linear")),
        (tf_posemb(tf_feat_3d, mode="linear"), pt_posemb(pt_feat_3d, mode="linear")),
        (tf_posemb(tf_ids_2d), pt_posemb(pt_ids_2d)),
        (tf_posemb(tf_ids_1d, time=5), pt_posemb(pt_ids_1d, time=5)),
    ]
    for tf_out, pt_out in checks:
        assert_equal_numpy(tf_out.numpy(), pt_out.detach().numpy())
def test_multiheadself_attention_under_dec():
    """Self-attention with a decoding cache (single query step) matches across frameworks."""
    heads = 2
    units = 4
    out_depth = 3
    tf_layer = MultiHeadSelfAttention(num_heads=heads, num_units=units,
                                      output_depth=out_depth, attention_dropout_rate=0.)
    pt_layer = PTMultiHeadSelfAttention(input_depth=units, num_heads=heads, num_units=units,
                                        output_depth=out_depth, attention_dropout_rate=0.)

    q = numpy.random.rand(1, 1, units)
    tf_q = tf.convert_to_tensor(q, dtype=tf.float32)
    pt_q = torch.FloatTensor(q)
    # Build variables first, then mirror the TF weights into PT.
    _ = tf_layer(tf_q)
    _ = pt_layer(pt_q)
    for sub_name in ("_qkv_transform_layer", "_output_transform_layer"):
        pt_sub = getattr(pt_layer, sub_name)
        tf_sub = getattr(tf_layer, sub_name)
        pt_sub._kernel.data = torch.FloatTensor(tf_sub._kernel.numpy())
        pt_sub._bias.data = torch.FloatTensor(tf_sub._bias.numpy())

    # Fixed cache of two previously-decoded positions, reshaped to per-head layout.
    raw_cache = {
        "keys": numpy.array([[[-0.46546218, -1.0054358, 0.42906007, -1.6854379],
                              [1.078194, 1.1625745, -0.25033495, -1.980812]]]),
        "values": numpy.array([[[-1.2360295, 0.69050753, -1.8204833, 0.23788007],
                                [2.3751693, -1.8772833, -0.2574517, 1.3010416]]]), }
    head_shape = [1, 2, heads, units // heads]
    tf_cache = {k: tf.reshape(tf.convert_to_tensor(v, dtype=tf.float32), head_shape)
                for k, v in raw_cache.items()}
    pt_cache = {k: torch.reshape(torch.FloatTensor(v), head_shape)
                for k, v in raw_cache.items()}
    assert_equal_numpy(tf_layer(tf_q, cache=tf_cache, is_training=False).numpy(),
                       pt_layer(pt_q, cache=pt_cache, is_training=False).detach().numpy())
# Example no. 10
def test_prepost():
    """Pre/post-processing wrapper parity after syncing the norm parameters."""
    def identity_layer(x, *args, **kwargs):
        # The wrapped layer is a pass-through so only the wrapper logic is exercised.
        _ = args
        _ = kwargs
        return x

    tf_wrapper = PrePostProcessingWrapper(identity_layer,
                                          dropout_rate=0.1,
                                          name="lpp")
    pt_wrapper = PTPrePostProcessingWrapper(identity_layer,
                                            norm_shape=3,
                                            dropout_rate=0.1)
    sample = numpy.array([[1, 2, 3.]])
    tf_x = tf.convert_to_tensor(sample, tf.float32)
    pt_x = torch.FloatTensor(sample)
    expected = tf_wrapper(tf_x, is_training=False)
    _ = pt_wrapper(pt_x, is_training=False)  # build before copying norm params
    pt_wrapper._norm_layer.weight.data = torch.FloatTensor(
        tf_wrapper._norm_layer.gamma.numpy())
    pt_wrapper._norm_layer.bias.data = torch.FloatTensor(
        tf_wrapper._norm_layer.beta.numpy())
    assert_equal_numpy(
        expected.numpy(),
        pt_wrapper(pt_x, is_training=False).detach().numpy())
# Example no. 11
def test_transformer_decoder_prenorm():
    """Pre-norm Transformer decoder parity: full pass and step-wise inference cache.

    The original listed ~80 near-identical weight-assignment statements; they are
    factored into three local helpers below.
    """

    def _copy_norm(pt_norm, tf_norm):
        # TF LayerNorm stores gamma/beta; torch uses weight/bias.
        pt_norm.weight.data = torch.FloatTensor(tf_norm.gamma.numpy())
        pt_norm.bias.data = torch.FloatTensor(tf_norm.beta.numpy())

    def _copy_att(pt_wrapper, tf_wrapper, transform_names):
        # Copy every projection sub-layer of an attention block, then its norm.
        for name in transform_names:
            pt_t = getattr(pt_wrapper._layer, name)
            tf_t = getattr(tf_wrapper._layer, name)
            pt_t._kernel.data = torch.FloatTensor(tf_t._kernel.numpy())
            pt_t._bias.data = torch.FloatTensor(tf_t._bias.numpy())
        _copy_norm(pt_wrapper._norm_layer, tf_wrapper._norm_layer)

    def _copy_ffn(pt_wrapper, tf_wrapper):
        # TF kernels are (in, out); torch Linear stores (out, in).
        for pt_dense, tf_conv in ((pt_wrapper._layer._dense1, tf_wrapper._layer._conv1),
                                  (pt_wrapper._layer._dense2, tf_wrapper._layer._conv2)):
            pt_dense.weight.data = torch.FloatTensor(
                tf_conv.kernel.numpy().transpose([1, 0]))
            pt_dense.bias.data = torch.FloatTensor(tf_conv.bias.numpy())
        _copy_norm(pt_wrapper._norm_layer, tf_wrapper._norm_layer)

    dmodel = 4
    num_layers = 1
    num_self_attention_heads = 2
    hidden_size = dmodel
    filter_size = 16
    self_attention_dropout_rate = 0.1
    ffn_dropout_rate = 0.1
    layer_postprocess_dropout_rate = 0.1

    tf_decoder = TFTransformerDecoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)
    pt_decoder = TransformerDecoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)

    inputs = [[[-0.37282175, 0.62301564, -2.0221813, -0.00875833],
               [0.31516594, -1.117763, -1.0697726, 0.80373234],
               [-0.717022, 0.3300997, -0.44306225, 1.550383],
               [-1.5516962, 0.6025011, 1.8262954, 0.42469704]],
              [[-0.98617625, 2.2856202, -1.3063533, 0.4174998],
               [1.5724765, 1.2201295, 1.1479746, 0.7810888],
               [0.8343642, -1.073388, 1.2718492, -0.7290778],
               [-1.4126722, 1.8000795, -2.118672, -0.1366007]]]
    input_padding = [[0, 0, 0, 0], [0, 0, 1., 1.]]
    decoder_input = [
        [[8.6675537e-01, 2.2135425e-01, 1.4054185e+00, -4.2268831e-01],
         [1.9606155e+00, -1.8318410e+00, -1.8158482e+00, -3.7030798e-01],
         [-1.1357157e-03, 5.5629879e-01, 6.6107117e-02, -1.7330967e+00]],
        [[-1.1870812e+00, -5.4499257e-01, -8.6622888e-01, -7.4098641e-01],
         [2.2233427e-01, 5.3582352e-01, 3.0567116e-01, 1.0201423e-01],
         [-1.8053315e+00, 7.2125041e-01, 1.0072237e+00, -2.0333264e+00]]
    ]
    tf_inp = tf.convert_to_tensor(inputs, dtype=tf.float32)
    pt_inp = torch.FloatTensor(inputs)
    tf_inppad = tf.convert_to_tensor(input_padding, dtype=tf.float32)
    pt_inppad = torch.FloatTensor(input_padding)
    tf_decinp = tf.convert_to_tensor(decoder_input, dtype=tf.float32)
    pt_decinp = torch.FloatTensor(decoder_input)
    # Run both decoders once to build their variables before copying weights.
    tf_cache = tf_decoder.create_decoding_internal_cache(tf_inp,
                                                         tf_inppad,
                                                         is_inference=False)
    _ = tf_decoder(tf_decinp, tf_cache, is_training=False)
    pt_cache = pt_decoder.create_decoding_internal_cache(pt_inp,
                                                         pt_inppad,
                                                         is_inference=False)
    _ = pt_decoder(pt_decinp, pt_cache, is_training=False)

    tf_layer0 = tf_decoder._stacking_layers[0]
    pt_layer0 = pt_decoder._stacking_layers[0]
    _copy_norm(pt_decoder._output_norm_layer, tf_decoder._output_norm_layer)
    # PT indexes sub-layers positionally: 0 = self-att, 1 = cross-att, 2 = FFN.
    _copy_att(pt_layer0[0], tf_layer0._selfatt_layer,
              ("_qkv_transform_layer", "_output_transform_layer"))
    _copy_att(pt_layer0[1], tf_layer0._crossatt_layer,
              ("_q_transform_layer", "_kv_transform_layer", "_output_transform_layer"))
    _copy_ffn(pt_layer0[2], tf_layer0._ffn_layer)

    assert_equal_numpy(
        tf_decoder(tf_decinp, tf_cache, is_training=False).numpy(),
        pt_decoder(pt_decinp, pt_cache, is_training=False).detach().numpy(),
        5e-5)

    # for inference
    tf_cache = tf_decoder.create_decoding_internal_cache(tf_inp,
                                                         tf_inppad,
                                                         is_inference=True)
    pt_cache = pt_decoder.create_decoding_internal_cache(pt_inp,
                                                         pt_inppad,
                                                         is_inference=True)
    decoder_input = [[
        1.9606155e+00, -1.8318410e+00, -1.8158482e+00, -3.7030798e-01
    ], [-1.1357157e-03, 5.5629879e-01, 6.6107117e-02, -1.7330967e+00]]
    tf_decinp = tf.convert_to_tensor(decoder_input, tf.float32)
    pt_decinp = torch.FloatTensor(decoder_input)
    assert_equal_numpy(
        tf_decoder(tf_decinp, tf_cache, is_training=False).numpy(),
        pt_decoder(pt_decinp, pt_cache, is_training=False).detach().numpy(),
        5e-5)
    # The populated self-attention caches must match as well.
    for key in ("keys", "values"):
        assert_equal_numpy(
            tf_cache["decoding_states"]["layer_0"]["self_attention"][key].numpy(),
            pt_cache["decoding_states"]["layer_0"]["self_attention"][key].detach().numpy(),
            5e-5)
# Example no. 12
def test_transformer_encoder_prenorm():
    """Pre-norm Transformer encoder parity between TF and PT implementations.

    The original repeated the same weight-assignment pattern many times;
    factored into three local helpers here.
    """

    def _copy_norm(pt_norm, tf_norm):
        # TF LayerNorm stores gamma/beta; torch uses weight/bias.
        pt_norm.weight.data = torch.FloatTensor(tf_norm.gamma.numpy())
        pt_norm.bias.data = torch.FloatTensor(tf_norm.beta.numpy())

    def _copy_att(pt_wrapper, tf_wrapper):
        # Copy qkv and output projections of the self-attention block, then its norm.
        for name in ("_qkv_transform_layer", "_output_transform_layer"):
            pt_t = getattr(pt_wrapper._layer, name)
            tf_t = getattr(tf_wrapper._layer, name)
            pt_t._kernel.data = torch.FloatTensor(tf_t._kernel.numpy())
            pt_t._bias.data = torch.FloatTensor(tf_t._bias.numpy())
        _copy_norm(pt_wrapper._norm_layer, tf_wrapper._norm_layer)

    def _copy_ffn(pt_wrapper, tf_wrapper):
        # TF kernels are (in, out); torch Linear stores (out, in).
        for pt_dense, tf_conv in ((pt_wrapper._layer._dense1, tf_wrapper._layer._conv1),
                                  (pt_wrapper._layer._dense2, tf_wrapper._layer._conv2)):
            pt_dense.weight.data = torch.FloatTensor(
                tf_conv.kernel.numpy().transpose([1, 0]))
            pt_dense.bias.data = torch.FloatTensor(tf_conv.bias.numpy())
        _copy_norm(pt_wrapper._norm_layer, tf_wrapper._norm_layer)

    # batch_size = 2
    # max_len = 4
    dmodel = 4
    num_layers = 1
    num_self_attention_heads = 2
    hidden_size = dmodel
    filter_size = 16
    self_attention_dropout_rate = 0.1
    ffn_dropout_rate = 0.1
    layer_postprocess_dropout_rate = 0.1

    tf_encoder = TFTransformerEncoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)
    pt_encoder = TransformerEncoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)

    inputs = [[[-0.37282175, 0.62301564, -2.0221813, -0.00875833],
               [0.31516594, -1.117763, -1.0697726, 0.80373234],
               [-0.717022, 0.3300997, -0.44306225, 1.550383],
               [-1.5516962, 0.6025011, 1.8262954, 0.42469704]],
              [[-0.98617625, 2.2856202, -1.3063533, 0.4174998],
               [1.5724765, 1.2201295, 1.1479746, 0.7810888],
               [0.8343642, -1.073388, 1.2718492, -0.7290778],
               [-1.4126722, 1.8000795, -2.118672, -0.1366007]]]
    input_padding = [[0, 0, 0, 0], [0, 0, 1., 1.]]
    tf_inp = tf.convert_to_tensor(inputs, dtype=tf.float32)
    pt_inp = torch.FloatTensor(inputs)
    tf_inppad = tf.convert_to_tensor(input_padding, dtype=tf.float32)
    pt_inppad = torch.FloatTensor(input_padding)
    # Run both encoders once to build their variables before copying weights.
    _ = tf_encoder(tf_inp, tf_inppad, is_training=False)
    _ = pt_encoder(pt_inp, pt_inppad, is_training=False)

    _copy_norm(pt_encoder._output_norm_layer, tf_encoder._output_norm_layer)
    # Sub-layer 0 is self-attention, sub-layer 1 is the FFN.
    _copy_att(pt_encoder._stacking_layers[0][0], tf_encoder._stacking_layers[0][0])
    _copy_ffn(pt_encoder._stacking_layers[0][1], tf_encoder._stacking_layers[0][1])

    assert_equal_numpy(
        tf_encoder(tf_inp, tf_inppad, is_training=False).numpy(),
        pt_encoder(pt_inp, pt_inppad, is_training=False).detach().numpy(),
        5e-6)
# Example no. 13
def test_transformer_encoder():
    """Pin the output of a 1-layer TransformerEncoder to a known-good reference.

    The encoder is built with fixed hyper-parameters, its attention/FFN kernels
    are overwritten with hard-coded values, and the forward pass (dropout
    disabled via is_training=False) is compared against a precomputed output
    tensor.
    """
    # batch_size = 2
    # max_len = 4
    dmodel = 4
    num_layers = 1
    num_self_attention_heads = 2
    hidden_size = dmodel
    filter_size = 16
    self_attention_dropout_rate = 0.1
    ffn_dropout_rate = 0.1
    layer_postprocess_dropout_rate = 0.1

    encoder = TransformerEncoder(
        num_layers=num_layers,
        num_attention_heads=num_self_attention_heads,
        hidden_size=hidden_size,
        filter_size=filter_size,
        attention_dropout_rate=self_attention_dropout_rate,
        ffn_dropout_rate=ffn_dropout_rate,
        layer_postprocess_dropout_rate=layer_postprocess_dropout_rate)
    # Fixed input: shape (2, 4, 4); second batch element has its last two
    # positions padded out via input_padding (1. == padded).
    inputs = tf.convert_to_tensor(
        [[[-0.37282175, 0.62301564, -2.0221813, -0.00875833],
          [0.31516594, -1.117763, -1.0697726, 0.80373234],
          [-0.717022, 0.3300997, -0.44306225, 1.550383],
          [-1.5516962, 0.6025011, 1.8262954, 0.42469704]],
         [[-0.98617625, 2.2856202, -1.3063533, 0.4174998],
          [1.5724765, 1.2201295, 1.1479746, 0.7810888],
          [0.8343642, -1.073388, 1.2718492, -0.7290778],
          [-1.4126722, 1.8000795, -2.118672, -0.1366007]]],
        dtype=tf.float32)
    input_padding = tf.convert_to_tensor([[0, 0, 0, 0], [0, 0, 1., 1.]],
                                         dtype=tf.float32)
    # One forward pass to force variable creation before assigning weights.
    _ = encoder(inputs, input_padding, is_training=False)
    # Overwrite the randomly-initialized kernels with fixed values, matched by
    # variable-name substring, so the expected output below is deterministic.
    for w in encoder.trainable_weights:
        if "layer_0/self_attention_prepost_wrapper/self_attention/output_transform/kernel" in w.name:
            tf.compat.v1.assign(
                w,
                tf.convert_to_tensor(
                    [[-0.04742211, -0.42928827, -0.54485893, -0.7514334],
                     [0.3391741, 0.61141425, -0.23809844, 0.27043575],
                     [-0.7315594, 0.8002729, -0.2958873, 0.698168],
                     [-0.59683925, -0.38270262, -0.59893274, -0.4040773]],
                    dtype=tf.float32))
        elif "layer_0/self_attention_prepost_wrapper/self_attention/qkv_transform/kernel" in w.name:
            # Fused Q/K/V projection: (dmodel, 3 * dmodel).
            tf.compat.v1.assign(
                w,
                tf.convert_to_tensor(
                    [[
                        0.5907243, -0.5555184, 0.5612393, -0.2724994,
                        0.23405826, 0.38096863, -0.02200276, -0.26264596,
                        0.36556423, 0.10351193, -0.1946517, 0.60423344
                    ],
                     [
                         0.16057128, -0.4464733, 0.32041794, -0.30858415,
                         0.26626736, 0.579398, -0.19076341, 0.1072132,
                         -0.43820834, 0.05253071, 0.08801651, -0.4995584
                     ],
                     [
                         -0.48593724, 0.1275987, 0.15794194, -0.4632662,
                         0.54038125, -0.45666856, -0.16076824, 0.43855423,
                         0.32468224, -0.1931965, -0.42853987, 0.2411524
                     ],
                     [
                         -0.32923162, -0.06395793, 0.33392805, -0.46701026,
                         -0.06507087, -0.61020637, 0.545703, -0.23786944,
                         -0.2854141, -0.1698403, -0.1244911, 0.40745395
                     ]],
                    dtype=tf.float32))
        elif "layer_0/ffn_prepost_wrapper/ffn/dense1/kernel" in w.name:
            # FFN expansion: (dmodel, filter_size) == (4, 16).
            tf.compat.v1.assign(
                w,
                tf.convert_to_tensor(
                    [[
                        -0.14616564, 0.30248666, 0.5319947, 0.5002098,
                        0.2705282, -0.21612385, -0.3336154, 0.03436899,
                        0.26958936, 0.26834202, 0.0843057, -0.50728637,
                        0.19995207, -0.3930181, -0.4985036, 0.33232063
                    ],
                     [
                         -0.04522616, -0.20491397, -0.19712418, 0.18106508,
                         0.33636385, 0.4030161, -0.30252987, 0.11853886,
                         0.2238034, 0.3744824, -0.28127617, -0.03388816,
                         0.32239246, -0.25639355, 0.02382994, 0.34818083
                     ],
                     [
                         0.4456296, -0.48834273, -0.26576972, 0.28717202,
                         0.02354515, -0.2434513, -0.26277977, -0.05434859,
                         0.09830189, 0.08207488, -0.28704825, -0.19418713,
                         0.47731507, 0.14538354, -0.3832153, -0.5143249
                     ],
                     [
                         0.33276683, -0.248025, -0.13612089, -0.15473047,
                         0.33012676, -0.39191568, -0.32679468, 0.52579904,
                         -0.17942387, -0.39317977, 0.13891649, -0.17397407,
                         -0.19002154, 0.05117792, 0.34706026, 0.11179692
                     ]],
                    dtype=tf.float32))
        elif "layer_0/ffn_prepost_wrapper/ffn/dense2/kernel" in w.name:
            # FFN contraction: (filter_size, dmodel) == (16, 4).
            tf.compat.v1.assign(
                w,
                tf.convert_to_tensor(
                    [[0.18234771, 0.23902518, 0.4304248, -0.05616844],
                     [-0.01435661, 0.11098373, 0.5370636, -0.5271752],
                     [-0.3239155, 0.5083337, 0.43396413, -0.47642848],
                     [0.31562793, -0.04991594, 0.530545, -0.51263183],
                     [0.10357869, 0.2883237, 0.16929054, 0.18414849],
                     [-0.30361128, -0.2045235, 0.05544132, 0.22116774],
                     [0.05548936, -0.11504656, 0.13726586, -0.13652831],
                     [0.5011635, 0.45315623, -0.35243145, 0.17173672],
                     [-0.52015716, 0.42873853, -0.09965438, -0.45107275],
                     [0.00233686, 0.2797522, 0.2702785, 0.33721972],
                     [0.10216439, -0.14768293, -0.5122431, -0.3882924],
                     [-0.44032216, -0.09983957, -0.41019306, -0.26434696],
                     [0.50977015, -0.18238857, 0.54663074, 0.05787665],
                     [0.3197481, -0.45845133, -0.14075449, -0.33339915],
                     [0.10717738, 0.28995162, 0.47179937, 0.01342988],
                     [0.37111026, -0.31352338, 0.37098122, 0.3895113]],
                    dtype=tf.float32))

    # Compare against the precomputed reference output for these exact weights.
    assert_equal_numpy(
        encoder(inputs, input_padding, is_training=False).numpy(),
        numpy.array([[[-0.2709918, 0.95230484, -1.5212451, 0.83993214],
                      [0.7688386, -0.69726187, -1.2441225, 1.1725458],
                      [-1.1408244, 0.57164305, -0.76654106, 1.3357224],
                      [-1.5286305, 0.23827001, 1.267273, 0.02308742]],
                     [[-1.0156152, 1.4036102, -0.8733843, 0.48538923],
                      [-0.60578734, 0.23574206, 1.5095922, -1.1395471],
                      [0.53838307, -0.7913252, 1.3617758, -1.1088338],
                      [-0.8927619, 1.3975127, -1.001557, 0.49680638]]]))
Esempio n. 14
0
def test_st():
    """Check PyTorch vs TensorFlow SpeechTransformer parity.

    Builds a toy speech-transformer in both frameworks, copies every TF
    weight into the corresponding torch parameter (transposing kernels
    where the frameworks' layouts differ), then asserts that a forward
    pass on the same fake audio batch agrees to within 5e-6.
    """
    params = copy.deepcopy(
        get_hyper_parameters("speech_transformer_toy")["model.params"])
    for key in ("modality.source.dim", "modality.target.dim",
                "modality.source.timing", "modality.target.timing"):
        params[key] = None
    params["encoder.num_layers"] = 1
    params["decoder.num_layers"] = 1

    src_vocab_meta = dict(audio_feature_dim=80, audio_feature_channels=1)
    trg_vocab_meta = dict(vocab_size=5, eos_id=4, bos_id=3, unk_id=2)

    fake_audio = numpy.random.rand(1, 11, 80, 1)
    pt_inps = {
        "src": torch.FloatTensor(fake_audio),
        "src_length": torch.LongTensor([11]),
        "trg_input": torch.LongTensor([[3, 0, 1]]),
    }
    tf_inps = {
        "src": tf.convert_to_tensor(fake_audio, tf.float32),
        "src_length": tf.convert_to_tensor([11], tf.int32),
        "trg_input": tf.convert_to_tensor([[3, 0, 1]], tf.int32),
    }

    pt_model: SpeechTransformer = build_model(
        {
            "model.class": "speech_transformer",
            "params": params
        },
        src_meta=src_vocab_meta,
        trg_meta=trg_vocab_meta)
    tf_model: TFSpeechTransformer = build_tf_model(
        {
            "model.class": "speech_transformer",
            "params": params
        },
        src_meta=src_vocab_meta,
        trg_meta=trg_vocab_meta)

    def assign(pt_param, tf_var, perm=None):
        # Copy one TF variable into a torch parameter, optionally permuting
        # axes to convert between the frameworks' kernel layouts.
        arr = tf_var.numpy()
        if perm is not None:
            arr = arr.transpose(perm)
        pt_param.data = torch.FloatTensor(arr)

    def assign_norm(pt_norm, tf_norm):
        # LayerNorm: TF gamma/beta map to torch weight/bias.
        assign(pt_norm.weight, tf_norm.gamma)
        assign(pt_norm.bias, tf_norm.beta)

    def assign_attention(pt_wrapper, tf_wrapper, projection_names):
        # Copy each named projection sub-layer (kernel + bias) plus the
        # wrapper's pre/post-process LayerNorm.
        for name in projection_names:
            pt_proj = getattr(pt_wrapper._layer, name)
            tf_proj = getattr(tf_wrapper._layer, name)
            assign(pt_proj._kernel, tf_proj._kernel)
            assign(pt_proj._bias, tf_proj._bias)
        assign_norm(pt_wrapper._norm_layer, tf_wrapper._norm_layer)

    def assign_ffn(pt_wrapper, tf_wrapper):
        # TF implements the FFN with 1-D convs; torch uses dense layers, so
        # the conv kernels are transposed into dense weight layout.
        assign(pt_wrapper._layer._dense1.weight,
               tf_wrapper._layer._conv1.kernel, (1, 0))
        assign(pt_wrapper._layer._dense1.bias, tf_wrapper._layer._conv1.bias)
        assign(pt_wrapper._layer._dense2.weight,
               tf_wrapper._layer._conv2.kernel, (1, 0))
        assign(pt_wrapper._layer._dense2.bias, tf_wrapper._layer._conv2.bias)
        assign_norm(pt_wrapper._norm_layer, tf_wrapper._norm_layer)

    # Audio (source) modality: two conv blocks + two norms + output dense.
    pt_src_emb = pt_model._src_modality.embedding_layer
    tf_src_emb = tf_model._src_modality.embedding_layer
    assign(pt_src_emb._conv_layer1.weight, tf_src_emb._conv_layers[0].kernel,
           (3, 2, 0, 1))  # HWIO -> OIHW
    assign(pt_src_emb._conv_layer1.bias, tf_src_emb._conv_layers[0].bias)
    assign(pt_src_emb._conv_layer2.weight, tf_src_emb._conv_layers[1].kernel,
           (3, 2, 0, 1))
    assign(pt_src_emb._conv_layer2.bias, tf_src_emb._conv_layers[1].bias)
    assign_norm(pt_src_emb._norm_layer1, tf_src_emb._norm_layers[0])
    assign_norm(pt_src_emb._norm_layer2, tf_src_emb._norm_layers[1])
    assign(pt_src_emb._dense_layer.weight, tf_src_emb._dense_layer.kernel,
           (1, 0))
    assign(pt_src_emb._dense_layer.bias, tf_src_emb._dense_layer.bias)

    # Token (target) modality embedding.
    pt_trg_emb = pt_model._trg_modality.embedding_layer
    tf_trg_emb = tf_model._trg_modality.embedding_layer
    assign(pt_trg_emb._shared_weights, tf_trg_emb._shared_weights)
    assign(pt_trg_emb._bias, tf_trg_emb._bias)

    # Encoder: output norm, self-attention wrapper [0], FFN wrapper [1].
    assign_norm(pt_model._encoder._output_norm_layer,
                tf_model._encoder._output_norm_layer)
    pt_enc = pt_model._encoder._stacking_layers[0]
    tf_enc = tf_model._encoder._stacking_layers[0]
    assign_attention(pt_enc[0], tf_enc._selfatt_layer,
                     ("_qkv_transform_layer", "_output_transform_layer"))
    assign_ffn(pt_enc[1], tf_enc._ffn_layer)

    # Decoder: output norm, self-attention [0], cross-attention [1], FFN [2].
    assign_norm(pt_model._decoder._output_norm_layer,
                tf_model._decoder._output_norm_layer)
    pt_dec = pt_model._decoder._stacking_layers[0]
    tf_dec = tf_model._decoder._stacking_layers[0]
    assign_attention(pt_dec[0], tf_dec._selfatt_layer,
                     ("_qkv_transform_layer", "_output_transform_layer"))
    assign_attention(pt_dec[1], tf_dec._crossatt_layer,
                     ("_q_transform_layer", "_kv_transform_layer",
                      "_output_transform_layer"))
    assign_ffn(pt_dec[2], tf_dec._ffn_layer)

    assert_equal_numpy(
        tf_model(tf_inps, is_training=False).numpy(),
        pt_model(pt_inps, is_training=False).detach().numpy(), 5e-6)
Esempio n. 15
0
def test_lower_triangle_attention_bias():
    """The TF and PyTorch lower-triangle attention biases must match exactly."""
    size = 5
    tf_bias = lower_triangle_attention_bias(size).numpy()
    pt_bias = pt_lower_triangle_attention_bias(size).detach().numpy()
    assert_equal_numpy(tf_bias, pt_bias)
Esempio n. 16
0
def test_seq2seq():
    """Check PyTorch vs TensorFlow Transformer (seq2seq) parity.

    Builds a toy transformer in both frameworks, copies every TF weight
    into the corresponding torch parameter (transposing FFN conv kernels
    into dense layout), then asserts a forward pass on the same batch
    agrees to within 5e-6.
    """
    params = copy.deepcopy(
        get_hyper_parameters("transformer_toy")["model.params"])
    for key in ("modality.source.dim", "modality.target.dim",
                "modality.source.timing", "modality.target.timing"):
        params[key] = None
    params["encoder.num_layers"] = 1
    params["decoder.num_layers"] = 1

    src_vocab_meta = dict(vocab_size=8, eos_id=7, bos_id=6, unk_id=5)
    trg_vocab_meta = dict(vocab_size=5, eos_id=4, bos_id=3, unk_id=2)

    pt_inps = {
        "src": torch.LongTensor([[0, 1, 1, 7], [1, 7, 7, 7]]),
        "src_padding": torch.FloatTensor([[0, 0, 0, 0.], [0, 0, 1, 1.]]),
        "trg_input": torch.LongTensor([[3, 0, 1], [3, 2, 4]]),
        "trg": torch.LongTensor([[0, 1, 4], [2, 4, 4]]),
        "trg_padding": torch.FloatTensor([[0, 0, 0.], [0, 0, 1.]]),
    }
    tf_inps = {
        "src":
        tf.convert_to_tensor([[0, 1, 1, 7], [1, 7, 7, 7]], tf.int64),
        "src_padding":
        tf.convert_to_tensor([[0, 0, 0, 0.], [0, 0, 1, 1.]], tf.float32),
        "trg_input":
        tf.convert_to_tensor([[3, 0, 1], [3, 2, 4]], tf.int32),
        "trg":
        tf.convert_to_tensor([[0, 1, 4], [2, 4, 4]], tf.int32),
        "trg_padding":
        tf.convert_to_tensor([[0, 0, 0.], [0, 0, 1.]], tf.float32),
    }

    pt_model: Transformer = build_pt_model(
        {
            "model.class": "transformer",
            "params": params
        },
        src_meta=src_vocab_meta,
        trg_meta=trg_vocab_meta)
    tf_model: TFTransformer = build_model(
        {
            "model.class": "transformer",
            "params": params
        },
        src_meta=src_vocab_meta,
        trg_meta=trg_vocab_meta)

    def assign(pt_param, tf_var, perm=None):
        # Copy one TF variable into a torch parameter, optionally permuting
        # axes to convert between the frameworks' kernel layouts.
        arr = tf_var.numpy()
        if perm is not None:
            arr = arr.transpose(perm)
        pt_param.data = torch.FloatTensor(arr)

    def assign_norm(pt_norm, tf_norm):
        # LayerNorm: TF gamma/beta map to torch weight/bias.
        assign(pt_norm.weight, tf_norm.gamma)
        assign(pt_norm.bias, tf_norm.beta)

    def assign_attention(pt_wrapper, tf_wrapper, projection_names):
        # Copy each named projection sub-layer (kernel + bias) plus the
        # wrapper's pre/post-process LayerNorm.
        for name in projection_names:
            pt_proj = getattr(pt_wrapper._layer, name)
            tf_proj = getattr(tf_wrapper._layer, name)
            assign(pt_proj._kernel, tf_proj._kernel)
            assign(pt_proj._bias, tf_proj._bias)
        assign_norm(pt_wrapper._norm_layer, tf_wrapper._norm_layer)

    def assign_ffn(pt_wrapper, tf_wrapper):
        # TF implements the FFN with 1-D convs; torch uses dense layers, so
        # the conv kernels are transposed into dense weight layout.
        assign(pt_wrapper._layer._dense1.weight,
               tf_wrapper._layer._conv1.kernel, (1, 0))
        assign(pt_wrapper._layer._dense1.bias, tf_wrapper._layer._conv1.bias)
        assign(pt_wrapper._layer._dense2.weight,
               tf_wrapper._layer._conv2.kernel, (1, 0))
        assign(pt_wrapper._layer._dense2.bias, tf_wrapper._layer._conv2.bias)
        assign_norm(pt_wrapper._norm_layer, tf_wrapper._norm_layer)

    # Token embeddings (source + target, target also has an output bias).
    assign(pt_model._src_modality.embedding_layer._shared_weights,
           tf_model._src_modality.embedding_layer._shared_weights)
    assign(pt_model._trg_modality.embedding_layer._shared_weights,
           tf_model._trg_modality.embedding_layer._shared_weights)
    assign(pt_model._trg_modality.embedding_layer._bias,
           tf_model._trg_modality.embedding_layer._bias)

    # Encoder: output norm, self-attention wrapper [0], FFN wrapper [1].
    assign_norm(pt_model._encoder._output_norm_layer,
                tf_model._encoder._output_norm_layer)
    pt_enc = pt_model._encoder._stacking_layers[0]
    tf_enc = tf_model._encoder._stacking_layers[0]
    assign_attention(pt_enc[0], tf_enc[0],
                     ("_qkv_transform_layer", "_output_transform_layer"))
    assign_ffn(pt_enc[1], tf_enc[1])

    # Decoder: output norm, self-attention [0], cross-attention [1], FFN [2].
    assign_norm(pt_model._decoder._output_norm_layer,
                tf_model._decoder._output_norm_layer)
    pt_dec = pt_model._decoder._stacking_layers[0]
    tf_dec = tf_model._decoder._stacking_layers[0]
    assign_attention(pt_dec[0], tf_dec[0],
                     ("_qkv_transform_layer", "_output_transform_layer"))
    assign_attention(pt_dec[1], tf_dec[1],
                     ("_q_transform_layer", "_kv_transform_layer",
                      "_output_transform_layer"))
    assign_ffn(pt_dec[2], tf_dec[2])

    assert_equal_numpy(
        tf_model(tf_inps, is_training=False).numpy(),
        pt_model(pt_inps, is_training=False).detach().numpy(), 5e-6)