Code Example #1
File: test_models_bart.py  Project: zheyuye/gluon-nlp
def test_bart_cfg(cfg_key):
    cfg = BartModel.get_cfg(cfg_key)
    cfg.defrost()
    cfg.MODEL.vocab_size = 32
    cfg.freeze()
    model = BartModel.from_cfg(cfg)
    model.initialize()
    model.hybridize()
    cfg.defrost()
    cfg.MODEL.layout = 'TN'
    cfg.freeze()
    model_tn = BartModel.from_cfg(cfg)
    model_tn.share_parameters(model.collect_params())
    model_tn.hybridize()
    mx.npx.waitall()
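
Code Example #1 takes cfg_key as an argument, so it is presumably driven by a pytest parametrization over the registered BART configurations. Below is a minimal sketch of the surrounding scaffolding, assuming the gluonnlp.models.bart import path and a list_pretrained_bart() helper (both assumptions, not verified against the project):

import mxnet as mx
import pytest
from gluonnlp.models.bart import BartModel, list_pretrained_bart

mx.npx.set_np()  # numpy-compatible mode expected by initialize()/hybridize()

# the decorator that would supply cfg_key to the test above
@pytest.mark.parametrize('cfg_key', list_pretrained_bart())
def test_bart_cfg(cfg_key):
    ...  # body as shown in Code Example #1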
Code Example #2
def convert_params(fairseq_model, gluon_cfg, ctx):
    fairseq_params = fairseq_model.state_dict()
    # build the Gluon BART model that will receive the converted parameters (pooler not needed)
    gluon_model = BartModel.from_cfg(gluon_cfg, use_pooler=False)
    gluon_model.initialize(ctx=ctx)
    gluon_model.hybridize()
    gluon_params = gluon_model.collect_params()
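    # track every Gluon parameter name; names are removed as they are copied so
    # that any parameter left unconverted can be detected at the end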
    all_keys = set(gluon_params.keys())

    def convert_attention(num_layers,
                          fairseq_prefix,
                          gluon_prefix,
                          fairseq_attn_prefix='self_attn',
                          gluon_attn_prefix='attn_qkv'):
        for layer_id in range(num_layers):
            fs_atten_prefix = \
                '{}.layers.{}.{}.' \
                .format(fairseq_prefix, layer_id, fairseq_attn_prefix)
            fs_q_weight = fairseq_params[fs_atten_prefix +
                                         'q_proj.weight'].cpu().numpy()
            fs_k_weight = fairseq_params[fs_atten_prefix +
                                         'k_proj.weight'].cpu().numpy()
            fs_v_weight = fairseq_params[fs_atten_prefix +
                                         'v_proj.weight'].cpu().numpy()
            fs_q_bias = fairseq_params[fs_atten_prefix +
                                       'q_proj.bias'].cpu().numpy()
            fs_k_bias = fairseq_params[fs_atten_prefix +
                                       'k_proj.bias'].cpu().numpy()
            fs_v_bias = fairseq_params[fs_atten_prefix +
                                       'v_proj.bias'].cpu().numpy()
            gl_qkv_prefix = \
                '{}.layers.{}.{}.' \
                .format(gluon_prefix, layer_id, gluon_attn_prefix)
            gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
            gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
            all_keys.remove(gl_qkv_prefix + 'weight')
            all_keys.remove(gl_qkv_prefix + 'bias')
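            # fairseq keeps separate Q/K/V projections; Gluon uses a single fused
            # attn_qkv parameter, so stack them along the output dimension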
            gl_qkv_weight.set_data(
                np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight],
                               axis=0))
            gl_qkv_bias.set_data(
                np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))

    def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
        # convert the feed-forward sublayer of each layer (shared between encoder and decoder)
        for layer_id in range(num_layers):
            for k, v in [('fc1.weight', 'ffn.ffn_1.weight'),
                         ('fc1.bias', 'ffn.ffn_1.bias'),
                         ('fc2.weight', 'ffn.ffn_2.weight'),
                         ('fc2.bias', 'ffn.ffn_2.bias'),
                         ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
                         ('final_layer_norm.bias', 'ffn.layer_norm.beta')]:
                fs_name = '{}.layers.{}.{}' \
                          .format(fairseq_prefix, layer_id, k)
                gl_name = '{}.layers.{}.{}' \
                          .format(gluon_prefix, layer_id, v)
                all_keys.remove(gl_name)
                gluon_params[gl_name].set_data(
                    fairseq_params[fs_name].cpu().numpy())

    print('converting embedding params')
    padding_idx = fairseq_model.task.dictionary.pad_index
    for fs_name, gl_name in [
        ('model.encoder.embed_tokens.weight', 'src_embed_layer.weight'),
        ('model.encoder.embed_positions.weight',
         'src_pos_embed_layer._embed.weight'),
        ('model.encoder.layernorm_embedding.weight', 'encoder.ln_data.gamma'),
        ('model.encoder.layernorm_embedding.bias', 'encoder.ln_data.beta'),
        ('model.decoder.embed_tokens.weight', 'tgt_embed_layer.weight'),
        ('model.decoder.embed_positions.weight',
         'tgt_pos_embed_layer._embed.weight'),
        ('model.decoder.layernorm_embedding.weight', 'decoder.ln_data.gamma'),
        ('model.decoder.layernorm_embedding.bias', 'decoder.ln_data.beta'),
            # final projection in decoder
        ('model.decoder.output_projection.weight', 'tgt_final_layer.weight'),
    ]:
        all_keys.remove(gl_name)
        if 'embed_positions' in fs_name:
            # fairseq's learned positional embeddings reserve the first padding_idx + 1
            # rows as an offset; drop them so the table aligns with Gluon's embedding
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy()[padding_idx + 1:, :])
        else:
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    print('converting encoder params')
    encoder_num_layers = gluon_cfg.MODEL.ENCODER.num_layers
    convert_attention(encoder_num_layers, 'model.encoder', 'encoder')
    convert_ffn(encoder_num_layers, 'model.encoder', 'encoder')
    for layer_id in range(encoder_num_layers):
        for k, v in [
            ('self_attn.out_proj.weight', 'attention_proj.weight'),
            ('self_attn.out_proj.bias', 'attention_proj.bias'),
            ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
            ('self_attn_layer_norm.bias', 'layer_norm.beta'),
        ]:
            fs_name = 'model.encoder.layers.{}.{}' \
                      .format(layer_id, k)
            gl_name = 'encoder.layers.{}.{}' \
                      .format(layer_id, v)
            all_keys.remove(gl_name)
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    print('converting decoder params')
    decoder_num_layers = gluon_cfg.MODEL.DECODER.num_layers
    convert_attention(decoder_num_layers,
                      'model.decoder',
                      'decoder',
                      gluon_attn_prefix='attn_in_qkv')
    convert_ffn(decoder_num_layers, 'model.decoder', 'decoder')

    for layer_id in range(decoder_num_layers):
        for k, v in [
            ('self_attn.out_proj.weight', 'proj_in.weight'),
            ('self_attn.out_proj.bias', 'proj_in.bias'),
            ('self_attn_layer_norm.weight', 'ln_in.gamma'),
            ('self_attn_layer_norm.bias', 'ln_in.beta'),
            ('encoder_attn.out_proj.weight', 'proj_inter.weight'),
            ('encoder_attn.out_proj.bias', 'proj_inter.bias'),
            ('encoder_attn_layer_norm.weight', 'ln_inter.gamma'),
            ('encoder_attn_layer_norm.bias', 'ln_inter.beta'),
            ('encoder_attn.q_proj.weight', 'attn_inter_q.weight'),
            ('encoder_attn.q_proj.bias', 'attn_inter_q.bias'),
            ('encoder_attn.k_proj.weight', 'attn_inter_k.weight'),
            ('encoder_attn.k_proj.bias', 'attn_inter_k.bias'),
            ('encoder_attn.v_proj.weight', 'attn_inter_v.weight'),
            ('encoder_attn.v_proj.bias', 'attn_inter_v.bias'),
        ]:
            fs_name = 'model.decoder.layers.{}.{}' \
                      .format(layer_id, k)
            gl_name = 'decoder.layers.{}.{}' \
                      .format(layer_id, v)
            all_keys.remove(gl_name)
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    assert len(all_keys) == 0, \
        'parameters missing from fairseq checkpoint: {}'.format(all_keys)

    # verify that the decoder input embedding and output projection share weights
    # (share_decoder_input_output_embed)
    assert np.array_equal(
        fairseq_params['model.decoder.embed_tokens.weight'].cpu().numpy(),
        fairseq_params['model.decoder.output_projection.weight'].cpu().numpy())
    return gluon_model
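
For context, here is a minimal sketch of how convert_params might be driven end to end. It assumes fairseq's BARTModel.from_pretrained loader, that numpy (as np) and BartModel are already imported by the conversion script, and that the Gluon config has been adjusted beforehand to match the checkpoint; the paths and cfg below are placeholders, not the project's actual conversion entry point:

import mxnet as mx
from fairseq.models.bart import BARTModel

mx.npx.set_np()
ctx = mx.cpu()

# load the source fairseq checkpoint (directory and file names are placeholders)
fairseq_model = BARTModel.from_pretrained('bart.base', checkpoint_file='model.pt')
fairseq_model.eval()

# a Gluon config whose sizes match the fairseq checkpoint (assumed to be prepared elsewhere)
gluon_cfg = BartModel.get_cfg()

gluon_model = convert_params(fairseq_model, gluon_cfg, ctx)
gluon_model.save_parameters('gluon_bart.params')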