def test_roberta(model_name):
    # test from pretrained
    with tempfile.TemporaryDirectory() as root:
        cfg, tokenizer, params_path, mlm_params_path =\
            get_pretrained_roberta(model_name, load_backbone=True, load_mlm=True, root=root)
        assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
        # test backbone
        roberta_model = RobertaModel.from_cfg(cfg)
        roberta_model.load_parameters(params_path)
        roberta_model.hybridize()
        # test mlm model
        roberta_mlm_model = RobertaForMLM(cfg)
        if mlm_params_path is not None:
            roberta_mlm_model.load_parameters(mlm_params_path)
        roberta_mlm_model = RobertaForMLM(cfg)
        roberta_mlm_model.backbone_model.load_parameters(params_path)

        # test forward
        batch_size = 3
        seq_length = 32
        vocab_size = len(tokenizer.vocab)
        input_ids = mx.np.array(
            np.random.randint(
                2,
                vocab_size,
                (batch_size, seq_length)
            ),
            dtype=np.int32
        )
        valid_length = mx.np.array(
            np.random.randint(
                seq_length // 2,
                seq_length,
                (batch_size,)
            ),
            dtype=np.int32
        )
        roberta_model(input_ids, valid_length)
        mx.npx.waitall()
        # test backward
        label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size)
        with mx.autograd.record():
            contextual_embeddings, pooled_out = roberta_model(input_ids, valid_length)
            loss = label_smooth_loss(contextual_embeddings, input_ids)
            loss.backward()
        mx.npx.waitall()
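
# --- Added sketch (not in the original file) --------------------------------------
# A minimal way to exercise test_roberta() for a single checkpoint. It assumes that
# `list_pretrained_roberta` is importable from the same module as
# `get_pretrained_roberta` and that 'fairseq_roberta_base' is one of the registered
# model names (both assumptions); downloading the weights needs network access.
def run_single_pretrained_check(model_name='fairseq_roberta_base'):
    from gluonnlp.models.roberta import list_pretrained_roberta
    # guard against an unknown key before the (slow) download starts
    assert model_name in list_pretrained_roberta()
    test_roberta(model_name)
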
def test_robert_small_config(compute_layout):
    cfg = RobertaModel.get_cfg()
    cfg.defrost()
    cfg.MODEL.vocab_size = 1000
    cfg.MODEL.num_layers = 2
    cfg.MODEL.hidden_size = 128
    cfg.MODEL.num_heads = 2
    cfg.MODEL.compute_layout = compute_layout
    cfg.freeze()

    # Generate TN layout
    cfg_tn = cfg.clone()
    cfg_tn.defrost()
    cfg_tn.MODEL.layout = 'TN'
    cfg_tn.freeze()

    batch_size = 4
    sequence_length = 16
    num_mask = 3
    inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
    valid_length = mx.np.random.randint(3, sequence_length, (batch_size,))
    masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))

    roberta_model = RobertaModel.from_cfg(cfg)
    roberta_model.initialize()
    roberta_model.hybridize()
    contextual_embeddings, pooled_out = roberta_model(inputs, valid_length)
    roberta_model_tn = RobertaModel.from_cfg(cfg_tn)
    roberta_model_tn.share_parameters(roberta_model.collect_params())
    roberta_model_tn.hybridize()
    contextual_embeddings_tn, pooled_out_tn = roberta_model_tn(inputs.T, valid_length)
    assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1),
                    contextual_embeddings.asnumpy(), 1E-4, 1E-4)
    assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4)

    # Test for RobertaForMLM
    roberta_mlm_model = RobertaForMLM(cfg)
    roberta_mlm_model.initialize()
    roberta_mlm_model.hybridize()
    contextual_embedding, pooled_out, mlm_scores = roberta_mlm_model(
        inputs, valid_length, masked_positions)
    roberta_mlm_model_tn = RobertaForMLM(cfg_tn)
    roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params())
    roberta_mlm_model_tn.hybridize()
    contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\
        roberta_mlm_model_tn(inputs.T, valid_length.T, masked_positions)
    assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                    contextual_embedding.asnumpy(), 1E-4, 1E-4)
    assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4)
    assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4)
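
# --- Added sketch (not in the original file) --------------------------------------
# How the layout test above is typically driven: parametrize over the compute layouts
# so that the 'auto' dispatch as well as the explicit 'NT' and 'TN' code paths are all
# exercised. The parameter values are assumptions inferred from the
# cfg.MODEL.compute_layout usage in test_robert_small_config().
import pytest

@pytest.mark.parametrize('alt_compute_layout', ['auto', 'NT', 'TN'])
def test_robert_small_config_all_layouts(alt_compute_layout):
    test_robert_small_config(alt_compute_layout)
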
def convert_params(fairseq_model, gluon_cfg, ctx):
    fairseq_params = fairseq_model.state_dict()
    fairseq_prefix = 'model.encoder.'
    gluon_prefix = 'backbone_model.'
    print('converting {} params'.format(gluon_prefix))

    gluon_model = RobertaForMLM(backbone_cfg=gluon_cfg)
    # output all hidden states for testing
    gluon_model.backbone_model._output_all_encodings = True
    gluon_model.backbone_model.encoder._output_all_encodings = True
    gluon_model.initialize(ctx=ctx)
    gluon_model.hybridize()
    gluon_params = gluon_model.collect_params()
    num_layers = gluon_cfg.MODEL.num_layers

    # convert the per-layer self-attention and FFN parameters
    for layer_id in range(num_layers):
        fs_atten_prefix = \
            '{}sentence_encoder.layers.{}.self_attn.' \
            .format(fairseq_prefix, layer_id)
        fs_q_weight = fairseq_params[fs_atten_prefix + 'q_proj.weight'].cpu().numpy()
        fs_k_weight = fairseq_params[fs_atten_prefix + 'k_proj.weight'].cpu().numpy()
        fs_v_weight = fairseq_params[fs_atten_prefix + 'v_proj.weight'].cpu().numpy()
        fs_q_bias = fairseq_params[fs_atten_prefix + 'q_proj.bias'].cpu().numpy()
        fs_k_bias = fairseq_params[fs_atten_prefix + 'k_proj.bias'].cpu().numpy()
        fs_v_bias = fairseq_params[fs_atten_prefix + 'v_proj.bias'].cpu().numpy()
        gl_qkv_prefix = \
            '{}encoder.all_layers.{}.attn_qkv.' \
            .format(gluon_prefix, layer_id)
        gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
        gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
        # fuse fairseq's separate Q/K/V projections into the single attn_qkv parameter
        gl_qkv_weight.set_data(
            np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight], axis=0))
        gl_qkv_bias.set_data(
            np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))

        for k, v in [('self_attn.out_proj.weight', 'attention_proj.weight'),
                     ('self_attn.out_proj.bias', 'attention_proj.bias'),
                     ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
                     ('self_attn_layer_norm.bias', 'layer_norm.beta'),
                     ('fc1.weight', 'ffn.ffn_1.weight'),
                     ('fc1.bias', 'ffn.ffn_1.bias'),
                     ('fc2.weight', 'ffn.ffn_2.weight'),
                     ('fc2.bias', 'ffn.ffn_2.bias'),
                     ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
                     ('final_layer_norm.bias', 'ffn.layer_norm.beta')]:
            fs_name = '{}sentence_encoder.layers.{}.{}' \
                      .format(fairseq_prefix, layer_id, k)
            gl_name = '{}encoder.all_layers.{}.{}' \
                      .format(gluon_prefix, layer_id, v)
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    # word embedding and embedding layer norm
    for k, v in [
        ('sentence_encoder.embed_tokens.weight', 'word_embed.weight'),
        ('sentence_encoder.emb_layer_norm.weight', 'embed_ln.gamma'),
        ('sentence_encoder.emb_layer_norm.bias', 'embed_ln.beta'),
    ]:
        fs_name = fairseq_prefix + k
        gl_name = gluon_prefix + v
        gluon_params[gl_name].set_data(fairseq_params[fs_name].cpu().numpy())

    # position embed weight: fairseq reserves the first (padding_idx + 1) rows,
    # which are dropped when copying into the Gluon positional embedding
    padding_idx = fairseq_model.task.dictionary.pad_index
    fs_pos_embed_name = fairseq_prefix + 'sentence_encoder.embed_positions.weight'
    gl_pos_embed_name = gluon_prefix + 'pos_embed._embed.weight'
    gluon_params[gl_pos_embed_name].set_data(
        fairseq_params[fs_pos_embed_name].cpu().numpy()[padding_idx + 1:, :])

    # MLM decoder head
    for k, v in [('lm_head.dense.weight', 'mlm_decoder.0.weight'),
                 ('lm_head.dense.bias', 'mlm_decoder.0.bias'),
                 ('lm_head.layer_norm.weight', 'mlm_decoder.2.gamma'),
                 ('lm_head.layer_norm.bias', 'mlm_decoder.2.beta'),
                 ('lm_head.bias', 'mlm_decoder.3.bias')]:
        fs_name = fairseq_prefix + k
        gluon_params[v].set_data(fairseq_params[fs_name].cpu().numpy())

    # assert untie=False, i.e. the MLM output projection shares the word embedding weight
    assert np.array_equal(
        fairseq_params[fairseq_prefix + 'sentence_encoder.embed_tokens.weight'].cpu().numpy(),
        fairseq_params[fairseq_prefix + 'lm_head.weight'].cpu().numpy())
    return gluon_model
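
# --- Added usage sketch (not in the original script) -------------------------------
# Rough outline of how convert_params() is driven: load a fairseq RoBERTa checkpoint
# through fairseq's hub interface and pass it to the converter together with a Gluon
# config whose architecture matches the checkpoint. The checkpoint directory, the
# 'fairseq_roberta_base' config key, and the output file name are all assumptions.
import mxnet as mx
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel

def convert_roberta_base(checkpoint_dir='./roberta.base',
                         out_path='fairseq_roberta_base.params'):
    fairseq_model = FairseqRobertaModel.from_pretrained(checkpoint_dir,
                                                        checkpoint_file='model.pt')
    # assumed: get_cfg() accepts a predefined key matching the fairseq architecture
    gluon_cfg = RobertaModel.get_cfg('fairseq_roberta_base')
    gluon_model = convert_params(fairseq_model, gluon_cfg, ctx=mx.cpu())
    gluon_model.save_parameters(out_path)
    return gluon_model
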