Example #1
# Imports assumed for this snippet (the original excerpt omits them; the
# paths follow recent GluonNLP layouts).
import numpy as np
import numpy.testing as npt
import mxnet as mx
from gluonnlp.models.bart import BartModel
from gluonnlp.utils.testing import verify_backbone_fp16

def test_bart_cfg(cfg_key, ctx):
    cfg = BartModel.get_cfg(cfg_key)
    cfg.defrost()
    cfg.MODEL.vocab_size = 32
    cfg.freeze()

    cfg_tn = cfg.clone()
    cfg_tn.defrost()
    cfg_tn.MODEL.layout = 'TN'
    cfg_tn.freeze()

    batch_size = 4
    src_length = 32
    tgt_length = 16

    with ctx:
        src_data = mx.np.random.randint(0,
                                        cfg.MODEL.vocab_size,
                                        (batch_size, src_length),
                                        dtype=np.int32)
        src_valid_length = mx.np.random.randint(src_length // 2,
                                                src_length, (batch_size, ),
                                                dtype=np.int32)
        tgt_data = mx.np.random.randint(0,
                                        cfg.MODEL.vocab_size,
                                        (batch_size, tgt_length),
                                        dtype=np.int32)
        tgt_valid_length = mx.np.random.randint(tgt_length // 2,
                                                tgt_length, (batch_size, ),
                                                dtype=np.int32)
        model = BartModel.from_cfg(cfg, extract_feature=True)
        model.initialize()
        model.hybridize()

        contextual_embedding, pooled_output = model(src_data, src_valid_length,
                                                    tgt_data, tgt_valid_length)
        model_tn = BartModel.from_cfg(cfg_tn, extract_feature=True)
        model_tn.share_parameters(model.collect_params())
        model_tn.hybridize()
        contextual_embedding_tn, pooled_out_tn = model_tn(
            src_data.T, src_valid_length, tgt_data.T, tgt_valid_length)
        npt.assert_allclose(
            contextual_embedding.asnumpy(),
            np.transpose(contextual_embedding_tn.asnumpy(), (1, 0, 2)), 5E-3,
            5E-3)
        npt.assert_allclose(pooled_out_tn.asnumpy(), pooled_output.asnumpy(),
                            5E-3, 5E-3)
        mx.npx.waitall()

        # Verify Float16
        if ctx.device_type == 'gpu':
            verify_backbone_fp16(model_cls=BartModel,
                                 cfg=cfg,
                                 ctx=ctx,
                                 inputs=[
                                     src_data, src_valid_length, tgt_data,
                                     tgt_valid_length
                                 ])
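
This test checks that the default 'NT' (batch, time) layout and the 'TN' (time, batch) layout produce matching outputs once the inputs and outputs are transposed. In the original test suite the function is driven by a pytest parametrization over config keys and a device fixture; the sketch below shows one hedged way to invoke it directly. The config key 'fairseq_bart_base' and the CPU context are assumptions, not taken from the snippet.

# Hypothetical driver; the real test suite supplies its own cfg_key list
# and device fixture via pytest.
import mxnet as mx
import pytest

@pytest.mark.parametrize('cfg_key', ['fairseq_bart_base'])
def test_bart_cfg_on_cpu(cfg_key):
    test_bart_cfg(cfg_key, mx.cpu())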
Example #2
# Imports assumed for this snippet (the original excerpt omits them).
import tempfile
from gluonnlp.models.bart import BartModel, list_pretrained_bart, get_pretrained_bart

def test_bart(model_name):
    # test from pretrained
    assert len(list_pretrained_bart()) > 0
    with tempfile.TemporaryDirectory() as root:
        cfg, tokenizer, params_path, _ =\
            get_pretrained_bart(model_name, load_backbone=True, root=root)
        assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
        # test standard bart encoder and decoder
        bart_model = BartModel.from_cfg(cfg)
        bart_model.load_parameters(params_path)
        # test bart encoder and decoder with pooler
        bart_model_with_pooler = BartModel.from_cfg(
            cfg, use_pooler=True, classifier_activation=False)
        bart_model_with_pooler.load_parameters(params_path)
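
Once loaded, the checkpoints can be exercised with the same call signature used in Example #1: source token ids, source valid lengths, target token ids, and target valid lengths. The following is a minimal hedged smoke-test sketch that reuses `cfg` and `bart_model` from the snippet above with random ids; the structure of the returned outputs is not shown, since it depends on the configuration.

# Hypothetical smoke test; shapes follow the (batch, sequence) layout used
# elsewhere in these examples.
import mxnet as mx
import numpy as np

batch_size, src_length, tgt_length = 2, 8, 6
src_data = mx.np.random.randint(0, cfg.MODEL.vocab_size,
                                (batch_size, src_length), dtype=np.int32)
src_valid_length = mx.np.full((batch_size,), src_length, dtype=np.int32)
tgt_data = mx.np.random.randint(0, cfg.MODEL.vocab_size,
                                (batch_size, tgt_length), dtype=np.int32)
tgt_valid_length = mx.np.full((batch_size,), tgt_length, dtype=np.int32)
out = bart_model(src_data, src_valid_length, tgt_data, tgt_valid_length)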
Example #3
# Imports assumed for this snippet (the original excerpt omits them).
import mxnet as mx
from gluonnlp.models.bart import BartModel

def test_bart_cfg(cfg_key):
    cfg = BartModel.get_cfg(cfg_key)
    cfg.defrost()
    cfg.MODEL.vocab_size = 32
    cfg.freeze()
    model = BartModel.from_cfg(cfg)
    model.initialize()
    model.hybridize()
    cfg.defrost()
    cfg.MODEL.layout = 'TN'
    cfg.freeze()
    model_tn = BartModel.from_cfg(cfg)
    model_tn.share_parameters(model.collect_params())
    model_tn.hybridize()
    mx.npx.waitall()
Example #4
def convert_fairseq_model(args):
    # Note: convert_vocab, convert_config, convert_params, test_model and
    # naming_convention are helpers defined elsewhere in the same conversion
    # script; fairseq_BARTModel refers to fairseq's pretrained BART class.
    if not args.save_dir:
        args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    fairseq_bart = fairseq_BARTModel.from_pretrained(
        args.fairseq_model_path, checkpoint_file='model.pt')
    vocab_size = convert_vocab(args, fairseq_bart)
    gluon_cfg = convert_config(fairseq_bart.args, vocab_size,
                               BartModel.get_cfg().clone())
    with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
        of.write(gluon_cfg.dump())

    ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
    gluon_bart = convert_params(fairseq_bart, gluon_cfg, ctx)
    if args.test:
        test_model(fairseq_bart, gluon_bart, args.gpu)

    gluon_bart.save_parameters(os.path.join(args.save_dir, 'model.params'),
                               deduplicate=True)
    logging.info('Converted the fairseq BART model in {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model.params')))

    logging.info('Conversion finished!')
    logging.info('Statistics:')
    old_names = os.listdir(args.save_dir)
    for old_name in old_names:
        new_name, long_hash = naming_convention(args.save_dir, old_name)
        old_path = os.path.join(args.save_dir, old_name)
        new_path = os.path.join(args.save_dir, new_name)
        shutil.move(old_path, new_path)
        file_size = os.path.getsize(new_path)
        logging.info('\t{}/{} {} {}'.format(args.save_dir, new_name, long_hash,
                                            file_size))
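
A conversion script like this is normally driven from the command line. The wrapper below is a hypothetical sketch, not the original script's parser, but the attribute names it defines (fairseq_model_path, save_dir, gpu, test) match exactly what convert_fairseq_model reads above.

# Hypothetical CLI wrapper for convert_fairseq_model.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description='Convert a fairseq BART checkpoint to GluonNLP parameters.')
    parser.add_argument('--fairseq_model_path', required=True,
                        help='Directory containing the fairseq model.pt')
    parser.add_argument('--save_dir', default=None,
                        help='Output directory (defaults to <model>_gluon)')
    parser.add_argument('--gpu', type=int, default=None,
                        help='GPU id to use; CPU if omitted')
    parser.add_argument('--test', action='store_true',
                        help='Numerically compare the converted model')
    return parser.parse_args()

if __name__ == '__main__':
    convert_fairseq_model(parse_args())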
Example #5
def convert_params(fairseq_model, gluon_cfg, ctx):
    fairseq_params = fairseq_model.state_dict()
    # build the Gluon BART model (without pooler) from the converted config
    gluon_model = BartModel.from_cfg(gluon_cfg, use_pooler=False)
    gluon_model.initialize(ctx=ctx)
    gluon_model.hybridize()
    gluon_params = gluon_model.collect_params()
    # track the Gluon parameters that still need to be assigned
    all_keys = set(gluon_params.keys())

    def convert_attention(num_layers,
                          fairseq_prefix,
                          gluon_prefix,
                          fairseq_attn_prefix='self_attn',
                          gluon_attn_prefix='attn_qkv'):
        for layer_id in range(num_layers):
            fs_atten_prefix = \
                '{}.layers.{}.{}.' \
                .format(fairseq_prefix, layer_id, fairseq_attn_prefix)
            fs_q_weight = fairseq_params[fs_atten_prefix +
                                         'q_proj.weight'].cpu().numpy()
            fs_k_weight = fairseq_params[fs_atten_prefix +
                                         'k_proj.weight'].cpu().numpy()
            fs_v_weight = fairseq_params[fs_atten_prefix +
                                         'v_proj.weight'].cpu().numpy()
            fs_q_bias = fairseq_params[fs_atten_prefix +
                                       'q_proj.bias'].cpu().numpy()
            fs_k_bias = fairseq_params[fs_atten_prefix +
                                       'k_proj.bias'].cpu().numpy()
            fs_v_bias = fairseq_params[fs_atten_prefix +
                                       'v_proj.bias'].cpu().numpy()
            gl_qkv_prefix = \
                '{}.layers.{}.{}.' \
                .format(gluon_prefix, layer_id, gluon_attn_prefix)
            gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
            gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
            all_keys.remove(gl_qkv_prefix + 'weight')
            all_keys.remove(gl_qkv_prefix + 'bias')
            gl_qkv_weight.set_data(
                np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight],
                               axis=0))
            gl_qkv_bias.set_data(
                np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))

    def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
        # convert the feed-forward (FFN) sub-layer parameters
        for layer_id in range(num_layers):
            for k, v in [('fc1.weight', 'ffn.ffn_1.weight'),
                         ('fc1.bias', 'ffn.ffn_1.bias'),
                         ('fc2.weight', 'ffn.ffn_2.weight'),
                         ('fc2.bias', 'ffn.ffn_2.bias'),
                         ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
                         ('final_layer_norm.bias', 'ffn.layer_norm.beta')]:
                fs_name = '{}.layers.{}.{}' \
                          .format(fairseq_prefix, layer_id, k)
                gl_name = '{}.layers.{}.{}' \
                          .format(gluon_prefix, layer_id, v)
                all_keys.remove(gl_name)
                gluon_params[gl_name].set_data(
                    fairseq_params[fs_name].cpu().numpy())

    print('converting embedding params')
    padding_idx = fairseq_model.task.dictionary.pad_index
    for fs_name, gl_name in [
        ('model.encoder.embed_tokens.weight', 'src_embed_layer.weight'),
        ('model.encoder.embed_positions.weight',
         'src_pos_embed_layer._embed.weight'),
        ('model.encoder.layernorm_embedding.weight', 'encoder.ln_data.gamma'),
        ('model.encoder.layernorm_embedding.bias', 'encoder.ln_data.beta'),
        ('model.decoder.embed_tokens.weight', 'tgt_embed_layer.weight'),
        ('model.decoder.embed_positions.weight',
         'tgt_pos_embed_layer._embed.weight'),
        ('model.decoder.layernorm_embedding.weight', 'decoder.ln_data.gamma'),
        ('model.decoder.layernorm_embedding.bias', 'decoder.ln_data.beta'),
            # final projection in decoder
        ('model.decoder.output_projection.weight', 'tgt_final_layer.weight'),
    ]:
        all_keys.remove(gl_name)
        if 'embed_positions' in fs_name:
            # position embed weight
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy()[padding_idx + 1:, :])
        else:
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    print('converting encoder params')
    encoder_num_layers = gluon_cfg.MODEL.ENCODER.num_layers
    convert_attention(encoder_num_layers, 'model.encoder', 'encoder')
    convert_ffn(encoder_num_layers, 'model.encoder', 'encoder')
    for layer_id in range(encoder_num_layers):
        for k, v in [
            ('self_attn.out_proj.weight', 'attention_proj.weight'),
            ('self_attn.out_proj.bias', 'attention_proj.bias'),
            ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
            ('self_attn_layer_norm.bias', 'layer_norm.beta'),
        ]:
            fs_name = 'model.encoder.layers.{}.{}' \
                      .format(layer_id, k)
            gl_name = 'encoder.layers.{}.{}' \
                      .format(layer_id, v)
            all_keys.remove(gl_name)
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    print('converting decoder params')
    decoder_num_layers = gluon_cfg.MODEL.DECODER.num_layers
    convert_attention(decoder_num_layers,
                      'model.decoder',
                      'decoder',
                      gluon_attn_prefix='attn_in_qkv')
    convert_ffn(decoder_num_layers, 'model.decoder', 'decoder')

    for layer_id in range(decoder_num_layers):
        for k, v in [
            ('self_attn.out_proj.weight', 'proj_in.weight'),
            ('self_attn.out_proj.bias', 'proj_in.bias'),
            ('self_attn_layer_norm.weight', 'ln_in.gamma'),
            ('self_attn_layer_norm.bias', 'ln_in.beta'),
            ('encoder_attn.out_proj.weight', 'proj_inter.weight'),
            ('encoder_attn.out_proj.bias', 'proj_inter.bias'),
            ('encoder_attn_layer_norm.weight', 'ln_inter.gamma'),
            ('encoder_attn_layer_norm.bias', 'ln_inter.beta'),
            ('encoder_attn.q_proj.weight', 'attn_inter_q.weight'),
            ('encoder_attn.q_proj.bias', 'attn_inter_q.bias'),
            ('encoder_attn.k_proj.weight', 'attn_inter_k.weight'),
            ('encoder_attn.k_proj.bias', 'attn_inter_k.bias'),
            ('encoder_attn.v_proj.weight', 'attn_inter_v.weight'),
            ('encoder_attn.v_proj.bias', 'attn_inter_v.bias'),
        ]:
            fs_name = 'model.decoder.layers.{}.{}' \
                      .format(layer_id, k)
            gl_name = 'decoder.layers.{}.{}' \
                      .format(layer_id, v)
            all_keys.remove(gl_name)
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    assert len(all_keys) == 0, \
        'some Gluon parameters were not assigned from the fairseq checkpoint: {}'.format(all_keys)

    # check parameter sharing when share_decoder_input_output_embed is true
    assert np.array_equal(
        fairseq_params['model.decoder.embed_tokens.weight'].cpu().numpy(),
        fairseq_params['model.decoder.output_projection.weight'].cpu().numpy())
    return gluon_model
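
The attention conversion in convert_attention relies on fairseq storing separate q/k/v projection matrices while the Gluon layer uses a single fused attn_qkv parameter, so the three matrices are concatenated along the output axis. The following standalone sketch only illustrates that shape relationship; the unit size is arbitrary.

# Illustrative only: fairseq keeps q/k/v projections as three (units, units)
# matrices; the fused attn_qkv weight stacks them along axis 0.
import numpy as np

units = 8
fs_q, fs_k, fs_v = (np.random.randn(units, units) for _ in range(3))
fused_weight = np.concatenate([fs_q, fs_k, fs_v], axis=0)
assert fused_weight.shape == (3 * units, units)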