# Imports assumed for these snippets (GluonNLP 1.x numpy-based API; exact
# module paths may differ between versions).
import tempfile

import numpy as np
import mxnet as mx
from numpy.testing import assert_allclose

from gluonnlp.models.roberta import RobertaModel, RobertaForMLM, get_pretrained_roberta
from gluonnlp.loss import LabelSmoothCrossEntropyLoss


def test_roberta(model_name):
    # test from pretrained
    with tempfile.TemporaryDirectory() as root:
        cfg, tokenizer, params_path, mlm_params_path =\
            get_pretrained_roberta(model_name, load_backbone=True, load_mlm=True, root=root)
        assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
        # test backbone
        roberta_model = RobertaModel.from_cfg(cfg)
        roberta_model.load_parameters(params_path)
        roberta_model.hybridize()
        # test mlm model: load the full MLM checkpoint when one is available
        roberta_mlm_model = RobertaForMLM(cfg)
        if mlm_params_path is not None:
            roberta_mlm_model.load_parameters(mlm_params_path)
        # re-create the MLM model and load only the backbone weights to cover
        # the second initialization path
        roberta_mlm_model = RobertaForMLM(cfg)
        roberta_mlm_model.backbone_model.load_parameters(params_path)

        # test forward
        batch_size = 3
        seq_length = 32
        vocab_size = len(tokenizer.vocab)
        input_ids = mx.np.array(
            np.random.randint(
                2,
                vocab_size,
                (batch_size, seq_length)
            ),
            dtype=np.int32
        )
        valid_length = mx.np.array(
            np.random.randint(
                seq_length // 2,
                seq_length,
                (batch_size,)
            ),
            dtype=np.int32
        )
        roberta_model(input_ids, valid_length)
        mx.npx.waitall()
        # test backward
        label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size)
        with mx.autograd.record():
            contextual_embeddings, pooled_out = roberta_model(input_ids, valid_length)
            loss = label_smooth_loss(contextual_embeddings, input_ids)
            loss.backward()
        mx.npx.waitall()
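

# A minimal sketch, assuming pytest is available and using GluonNLP's
# list_pretrained_roberta registry, of how the smoke test above could be
# parametrized over the released checkpoints; the wrapper name here is
# illustrative (upstream, the decorator likely sits directly on test_roberta).
import pytest
from gluonnlp.models.roberta import list_pretrained_roberta


@pytest.mark.parametrize('model_name', list_pretrained_roberta())
def test_roberta_pretrained(model_name):
    test_roberta(model_name)

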
def test_robert_small_config(compute_layout):
    cfg = RobertaModel.get_cfg()
    cfg.defrost()
    cfg.MODEL.vocab_size = 1000
    cfg.MODEL.num_layers = 2
    cfg.MODEL.hidden_size = 128
    cfg.MODEL.num_heads = 2
    cfg.MODEL.compute_layout = compute_layout
    cfg.freeze()

    # Clone the config with time-major (TN) layout; the base config keeps the
    # default batch-major (NT) layout
    cfg_tn = cfg.clone()
    cfg_tn.defrost()
    cfg_tn.MODEL.layout = 'TN'
    cfg_tn.freeze()

    batch_size = 4
    sequence_length = 16
    num_mask = 3
    inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
    valid_length = mx.np.random.randint(3, sequence_length, (batch_size, ))
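    # masked positions are drawn from [0, 3) so that every mask falls inside
    # the valid region (valid_length is at least 3)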
    masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))

    roberta_model = RobertaModel.from_cfg(cfg)
    roberta_model.initialize()
    roberta_model.hybridize()
    contextual_embeddings, pooled_out = roberta_model(inputs, valid_length)
    roberta_model_tn = RobertaModel.from_cfg(cfg_tn)
    roberta_model_tn.share_parameters(roberta_model.collect_params())
    roberta_model_tn.hybridize()
    contextual_embeddings_tn, pooled_out_tn = roberta_model_tn(
        inputs.T, valid_length)
    assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1),
                    contextual_embeddings.asnumpy(), 1E-4, 1E-4)
    assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4)

    # Test for RobertaForMLM
    roberta_mlm_model = RobertaForMLM(cfg)
    roberta_mlm_model.initialize()
    roberta_mlm_model.hybridize()
    contextual_embedding, pooled_out, mlm_scores = roberta_mlm_model(
        inputs, valid_length, masked_positions)
    roberta_mlm_model_tn = RobertaForMLM(cfg_tn)
    roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params())
    roberta_mlm_model_tn.hybridize()
    contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\
        roberta_mlm_model_tn(inputs.T, valid_length.T, masked_positions)
    assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                    contextual_embedding.asnumpy(), 1E-4, 1E-4)
    assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4)
    assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4)
def convert_params(fairseq_model, gluon_cfg, ctx):
    fairseq_params = fairseq_model.state_dict()
    fairseq_prefix = 'model.encoder.'
    gluon_prefix = 'backbone_model.'
    print('converting {} params'.format(gluon_prefix))

    gluon_model = RobertaForMLM(backbone_cfg=gluon_cfg)
    # output all hidden states for testing
    gluon_model.backbone_model._output_all_encodings = True
    gluon_model.backbone_model.encoder._output_all_encodings = True

    gluon_model.initialize(ctx=ctx)
    gluon_model.hybridize()
    gluon_params = gluon_model.collect_params()
    num_layers = gluon_cfg.MODEL.num_layers
    for layer_id in range(num_layers):
        fs_atten_prefix = \
            '{}sentence_encoder.layers.{}.self_attn.' \
            .format(fairseq_prefix, layer_id)
        fs_q_weight = fairseq_params[fs_atten_prefix +
                                     'q_proj.weight'].cpu().numpy()
        fs_k_weight = fairseq_params[fs_atten_prefix +
                                     'k_proj.weight'].cpu().numpy()
        fs_v_weight = fairseq_params[fs_atten_prefix +
                                     'v_proj.weight'].cpu().numpy()
        fs_q_bias = fairseq_params[fs_atten_prefix +
                                   'q_proj.bias'].cpu().numpy()
        fs_k_bias = fairseq_params[fs_atten_prefix +
                                   'k_proj.bias'].cpu().numpy()
        fs_v_bias = fairseq_params[fs_atten_prefix +
                                   'v_proj.bias'].cpu().numpy()
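        # GluonNLP fuses the query/key/value projections into a single
        # attn_qkv layer, so the separate fairseq q/k/v tensors are
        # concatenated along the output dimension below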
        gl_qkv_prefix = \
            '{}encoder.all_layers.{}.attn_qkv.' \
            .format(gluon_prefix, layer_id)
        gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
        gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
        gl_qkv_weight.set_data(
            np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight], axis=0))
        gl_qkv_bias.set_data(
            np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))

        for k, v in [('self_attn.out_proj.weight', 'attention_proj.weight'),
                     ('self_attn.out_proj.bias', 'attention_proj.bias'),
                     ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
                     ('self_attn_layer_norm.bias', 'layer_norm.beta'),
                     ('fc1.weight', 'ffn.ffn_1.weight'),
                     ('fc1.bias', 'ffn.ffn_1.bias'),
                     ('fc2.weight', 'ffn.ffn_2.weight'),
                     ('fc2.bias', 'ffn.ffn_2.bias'),
                     ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
                     ('final_layer_norm.bias', 'ffn.layer_norm.beta')]:
            fs_name = '{}sentence_encoder.layers.{}.{}' \
                      .format(fairseq_prefix, layer_id, k)
            gl_name = '{}encoder.all_layers.{}.{}' \
                      .format(gluon_prefix, layer_id, v)
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    for k, v in [
        ('sentence_encoder.embed_tokens.weight', 'word_embed.weight'),
        ('sentence_encoder.emb_layer_norm.weight', 'embed_ln.gamma'),
        ('sentence_encoder.emb_layer_norm.bias', 'embed_ln.beta'),
    ]:
        fs_name = fairseq_prefix + k
        gl_name = gluon_prefix + v
        gluon_params[gl_name].set_data(fairseq_params[fs_name].cpu().numpy())

    # position embedding weight: fairseq reserves the first (padding_idx + 1)
    # rows of embed_positions for the padding offset, so they are skipped here
    padding_idx = fairseq_model.task.dictionary.pad_index
    fs_pos_embed_name = fairseq_prefix + 'sentence_encoder.embed_positions.weight'
    gl_pos_embed_name = gluon_prefix + 'pos_embed._embed.weight'
    gluon_params[gl_pos_embed_name].set_data(
        fairseq_params[fs_pos_embed_name].cpu().numpy()[padding_idx + 1:, :])

    for k, v in [('lm_head.dense.weight', 'mlm_decoder.0.weight'),
                 ('lm_head.dense.bias', 'mlm_decoder.0.bias'),
                 ('lm_head.layer_norm.weight', 'mlm_decoder.2.gamma'),
                 ('lm_head.layer_norm.bias', 'mlm_decoder.2.beta'),
                 ('lm_head.bias', 'mlm_decoder.3.bias')]:
        fs_name = fairseq_prefix + k
        gluon_params[v].set_data(fairseq_params[fs_name].cpu().numpy())
    # sanity check that the checkpoint ties the word embedding and the LM head
    # weight (i.e. untie_weight=False)
    assert np.array_equal(
        fairseq_params[fairseq_prefix +
                       'sentence_encoder.embed_tokens.weight'].cpu().numpy(),
        fairseq_params[fairseq_prefix + 'lm_head.weight'].cpu().numpy())
    return gluon_model
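

# A hedged usage sketch for convert_params. The checkpoint path is
# illustrative, and gluon_cfg is assumed to be a RobertaModel config that
# matches the fairseq architecture (e.g. built with RobertaModel.get_cfg()
# plus defrost()/freeze() edits, as in test_robert_small_config above).
if __name__ == '__main__':
    from fairseq.models.roberta import RobertaModel as FairseqRobertaModel

    fairseq_model = FairseqRobertaModel.from_pretrained(
        '/path/to/roberta.base', checkpoint_file='model.pt')
    gluon_cfg = ...  # config matching the fairseq checkpoint (assumed given)
    gluon_model = convert_params(fairseq_model, gluon_cfg, ctx=mx.cpu())
    gluon_model.save_parameters('gluon_roberta_mlm.params')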