def test_mobilebert_get_pretrained(model_name):
    """Check that the pretrained MobileBERT artifacts for ``model_name`` load.

    The artifacts are requested from ``get_pretrained_mobilebert`` with a
    temporary directory as ``root``, so nothing outlives the test.  The test
    then verifies that

    * the vocabulary size in the config matches the tokenizer vocabulary,
    * the backbone checkpoint loads into ``MobileBertModel``,
    * the MLM checkpoint (when one is returned) loads into
      ``MobileBertForPretrain``,
    * the backbone checkpoint loads into the ``backbone_model`` of a fresh
      ``MobileBertForPretrain``.
    """
    with tempfile.TemporaryDirectory() as cache_root:
        pretrained = get_pretrained_mobilebert(
            model_name, load_backbone=True, load_mlm=True, root=cache_root)
        cfg, tokenizer, backbone_params_path, mlm_params_path = pretrained
        assert cfg.MODEL.vocab_size == len(tokenizer.vocab)

        # Backbone checkpoint -> standalone backbone model.
        backbone = MobileBertModel.from_cfg(cfg)
        backbone.load_parameters(backbone_params_path)

        # Full pretraining checkpoint -> pretraining model (only if available).
        pretrain_model = MobileBertForPretrain(cfg)
        if mlm_params_path is not None:
            pretrain_model.load_parameters(mlm_params_path)

        # Backbone checkpoint -> backbone of a freshly built pretraining model.
        pretrain_model = MobileBertForPretrain(cfg)
        pretrain_model.backbone_model.load_parameters(backbone_params_path)
def test_mobilebert_model_small_cfg(compute_layout, ctx):
    """Check that MobileBERT models agree between the default and 'TN' layouts.

    A small config (2 layers, hidden size 128, 2 heads, vocabulary of 100) is
    built, and a copy with ``MODEL.layout = 'TN'`` is derived from it.  For
    ``MobileBertModel``, ``MobileBertForMLM`` and ``MobileBertForPretrain`` a
    model is created per config, the 'TN' model shares the parameters of the
    other one, and the outputs are compared (the 'TN' model is fed transposed
    inputs and its contextual embedding is swapped back on axes 0 and 1).

    Parameters
    ----------
    compute_layout
        Value written to ``cfg.MODEL.compute_layout``.
    ctx
        Context manager the whole test runs under; its ``device_type``
        attribute is read to decide whether the fp16 section applies
        (presumably an MXNet context fixture -- confirm against conftest).
    """
    with ctx:
        cfg = MobileBertModel.get_cfg()
        cfg.defrost()
        cfg.MODEL.vocab_size = 100
        cfg.MODEL.num_layers = 2
        cfg.MODEL.hidden_size = 128
        cfg.MODEL.num_heads = 2
        cfg.MODEL.compute_layout = compute_layout
        cfg.freeze()

        # Generate TN layout
        cfg_tn = cfg.clone()
        cfg_tn.defrost()
        cfg_tn.MODEL.layout = 'TN'
        cfg_tn.freeze()

        batch_size = 4
        sequence_length = 16
        num_mask = 3
        inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
        token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length))
        valid_length = mx.np.random.randint(3, sequence_length, (batch_size, ))
        # Masked positions are drawn from [0, 3), i.e. below the smallest
        # possible valid_length drawn above.
        masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))

        # Test for MobileBertModel (backbone)
        mobile_bert_model = MobileBertModel.from_cfg(cfg)
        mobile_bert_model.initialize()
        mobile_bert_model.hybridize()
        mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn)
        mobile_bert_model_tn.share_parameters(
            mobile_bert_model.collect_params())
        mobile_bert_model_tn.hybridize()
        contextual_embedding, pooled_out = mobile_bert_model(
            inputs, token_types, valid_length)
        contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(
            inputs.T, token_types.T, valid_length)
        assert_allclose(contextual_embedding.asnumpy(),
                        np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                        1E-3, 1E-3)
        assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3,
                        1E-3)

        # Test for MobileBertForMLM
        mobile_bert_mlm_model = MobileBertForMLM(cfg)
        mobile_bert_mlm_model.initialize()
        mobile_bert_mlm_model.hybridize()
        mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn)
        mobile_bert_mlm_model_tn.share_parameters(
            mobile_bert_mlm_model.collect_params())
        # Hybridize the TN *MLM* model, so that it is exercised the same way
        # as its counterpart (and as the backbone / pretrain sections do).
        mobile_bert_mlm_model_tn.hybridize()
        contextual_embedding, pooled_out, mlm_score = mobile_bert_mlm_model(
            inputs, token_types, valid_length, masked_positions)
        contextual_embedding_tn, pooled_out_tn, mlm_score_tn =\
            mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions)
        assert_allclose(contextual_embedding.asnumpy(),
                        np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                        1E-3, 1E-3)
        assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3,
                        1E-3)
        assert_allclose(mlm_score_tn.asnumpy(), mlm_score.asnumpy(), 1E-3,
                        1E-3)

        # Test for MobileBertForPretrain
        mobile_bert_pretrain_model = MobileBertForPretrain(cfg)
        mobile_bert_pretrain_model.initialize()
        mobile_bert_pretrain_model.hybridize()
        mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn)
        mobile_bert_pretrain_model_tn.share_parameters(
            mobile_bert_pretrain_model.collect_params())
        mobile_bert_pretrain_model_tn.hybridize()
        contextual_embedding, pooled_out, nsp_score, mlm_score =\
            mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions)
        contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_score_tn = \
            mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions)
        assert_allclose(contextual_embedding.asnumpy(),
                        np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                        1E-3, 1E-3)
        assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3,
                        1E-3)
        assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-3,
                        1E-3)
        assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-3,
                        1E-3)

        # Test for fp16
        if ctx.device_type == 'gpu':
            # NOTE: pytest.skip() raises, so the fp16 verification below is
            # never reached; presumably it is kept to show what to re-enable
            # once the NaN problem named in the skip message is resolved.
            pytest.skip('MobileBERT will have nan values in FP16 mode.')
            verify_backbone_fp16(model_cls=MobileBertModel,
                                 cfg=cfg,
                                 ctx=ctx,
                                 inputs=[inputs, token_types, valid_length])
Example #3
0
def convert_tf_model(model_dir, save_dir, test_conversion, gpu, mobilebert_dir):
    """Convert a TensorFlow MobileBERT checkpoint into Gluon parameter files.

    The function writes ``model.yml`` and ``vocab.json`` to ``save_dir``,
    copies the TensorFlow weights into a ``MobileBertForPretrain`` model,
    saves ``model.params`` (backbone only) and ``model_mlm.params`` (full
    model), and finally renames every entry of ``save_dir`` according to
    ``naming_convention``.

    Parameters
    ----------
    model_dir
        Directory with the TensorFlow assets; the checkpoint is read from
        ``<model_dir>/mobilebert_variables.ckpt``.
    save_dir
        Output directory; created when it does not exist yet.
    test_conversion
        When truthy, the backbone outputs of the converted Gluon model are
        compared with the outputs of the TensorFlow model on random inputs.
    gpu
        GPU index handed to ``mx.gpu``; ``None`` selects ``mx.cpu()``.
    mobilebert_dir
        Directory appended to ``sys.path`` so that ``mobilebert.modeling``
        can be imported.

    Raises
    ------
    AssertionError
        If the checkpoint values differ from the values read back from the
        TensorFlow graph, if some Gluon parameter received no value, or if
        ``test_conversion`` is set and the outputs do not match.
    """
    ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)

    cfg, json_cfg_path, vocab_path = convert_tf_assets(model_dir)
    with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
        of.write(cfg.dump())
    new_vocab = HuggingFaceWordPieceTokenizer(
        vocab_file=vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        cls_token='[CLS]',
        sep_token='[SEP]',
        mask_token='[MASK]',
        lowercase=True).vocab
    new_vocab.save(os.path.join(save_dir, 'vocab.json'))

    # test input data
    batch_size = 3
    seq_length = 32
    num_mask = 5
    input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
    valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
    # input_mask[i, j] is True for the first valid_length[i] positions of row i.
    input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
        < np.expand_dims(valid_length, 1)
    segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
    # Positions stay below seq_length // 2, the smallest possible valid_length.
    mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))

    tf_input_ids = tf.constant(input_ids, dtype=np.int32)
    tf_input_mask = tf.constant(input_mask, dtype=np.int32)
    tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)

    init_checkpoint = os.path.join(model_dir, 'mobilebert_variables.ckpt')
    tf_params = read_tf_checkpoint(init_checkpoint)
    # Parameter names of the TensorFlow checkpoint, sorted, with the optimizer
    # slots ('adam_m' / 'adam_v') and 'global_step' filtered out.
    tf_names = [name for name in sorted(tf_params.keys())
                if not name.endswith(('adam_m', 'adam_v')) and name != 'global_step']

    # Make the reference implementation importable; do not grow sys.path again
    # when the function is called repeatedly with the same directory.
    if mobilebert_dir not in sys.path:
        sys.path.append(mobilebert_dir)
    from mobilebert import modeling

    tf_bert_config = modeling.BertConfig.from_json_file(json_cfg_path)
    bert_model = modeling.BertModel(
        config=tf_bert_config,
        is_training=False,
        input_ids=tf_input_ids,
        input_mask=tf_input_mask,
        token_type_ids=tf_segment_ids,
        use_one_hot_embeddings=False)
    tvars = tf.trainable_variables()
    assignment_map, _ = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Variable names end with ':0', e.g.
        # 'MobileBert/embeddings/word_embeddings:0'; strip that suffix.
        backbone_params = {v.name.split(":")[0]: v.read_value() for v in tvars}
        backbone_params = sess.run(backbone_params)
        tf_token_outputs_np = {
            'pooled_output': sess.run(bert_model.get_pooled_output()),
            'sequence_output': sess.run(bert_model.get_sequence_output()),
        }

    # The following part only ensures that the backbone parameters read back
    # from the graph equal the values stored in the checkpoint.
    for k in backbone_params:
        assert_allclose(tf_params[k], backbone_params[k])

    # Build gluon model and initialize
    gluon_pretrain_model = MobileBertForPretrain(cfg)
    gluon_pretrain_model.initialize(ctx=ctx)
    gluon_pretrain_model.hybridize()

    # prepare test data
    mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
    mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
    mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
    mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)

    has_mlm = True
    name_map = get_name_map(tf_names, cfg.MODEL.num_stacked_ffn)
    # go through the gluon model to infer the shape of parameters
    model = gluon_pretrain_model
    contextual_embedding, pooled_output, nsp_score, mlm_scores = \
        model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
    # replace tensorflow parameter names with gluon parameter names
    mx_params = model.collect_params()
    # Gluon parameters that have not received a value yet.
    all_keys = set(mx_params.keys())
    for (src_name, dst_name) in name_map.items():
        tf_param_val = tf_params[src_name]
        if dst_name is None:
            # No Gluon counterpart for this TensorFlow parameter.
            continue
        all_keys.remove(dst_name)
        if src_name.endswith('kernel'):
            # 'kernel' weights are stored transposed relative to the Gluon ones.
            mx_params[dst_name].set_data(tf_param_val.T)
        else:
            mx_params[dst_name].set_data(tf_param_val)

    if has_mlm:
        # 'embedding_table.weight' is shared with word_embed.weight
        all_keys.remove('embedding_table.weight')
    assert len(all_keys) == 0, 'parameters missing from tensorflow checkpoint'

    # test conversion results for backbone model
    if test_conversion:
        tf_contextual_embedding = tf_token_outputs_np['sequence_output']
        tf_pooled_output = tf_token_outputs_np['pooled_output']
        contextual_embedding, pooled_output = model.backbone_model(
            mx_input_ids, mx_token_types, mx_valid_length)
        assert_allclose(pooled_output.asnumpy(), tf_pooled_output, 1E-2, 1E-2)
        # Only the first valid_length[i] positions of each sample are compared.
        for i in range(batch_size):
            ele_valid_length = valid_length[i]
            assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
                            tf_contextual_embedding[i, :ele_valid_length, :], 1E-2, 1E-2)
    model.backbone_model.save_parameters(os.path.join(save_dir, 'model.params'), deduplicate=True)
    logging.info('Convert the backbone model in {} to {}/{}'.format(model_dir, save_dir, 'model.params'))
    model.save_parameters(os.path.join(save_dir, 'model_mlm.params'), deduplicate=True)
    logging.info('Convert the MLM and NSP model in {} to {}/{}'.format(model_dir,
                                                                       save_dir, 'model_mlm.params'))

    logging.info('Conversion finished!')
    logging.info('Statistics:')

    # NOTE: every entry of save_dir is renamed, including any that were there
    # before this function ran.
    old_names = os.listdir(save_dir)
    for old_name in old_names:
        new_name, long_hash = naming_convention(save_dir, old_name)
        old_path = os.path.join(save_dir, old_name)
        new_path = os.path.join(save_dir, new_name)
        shutil.move(old_path, new_path)
        file_size = os.path.getsize(new_path)
        logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
def test_mobilebert_model_small_cfg(compute_layout):
    """Check that MobileBERT models agree between the default and 'TN' layouts.

    A small config (2 layers, hidden size 128, 2 heads, vocabulary of 100) is
    built, and a copy with ``MODEL.layout = 'TN'`` is derived from it.  For
    ``MobileBertModel``, ``MobileBertForMLM`` and ``MobileBertForPretrain`` a
    model is created per config, the 'TN' model shares the parameters of the
    other one, and the outputs are compared (the 'TN' model is fed transposed
    inputs and its contextual embedding is swapped back on axes 0 and 1).

    Parameters
    ----------
    compute_layout
        Value written to ``cfg.MODEL.compute_layout``.
    """
    cfg = MobileBertModel.get_cfg()
    cfg.defrost()
    cfg.MODEL.vocab_size = 100
    cfg.MODEL.num_layers = 2
    cfg.MODEL.hidden_size = 128
    cfg.MODEL.num_heads = 2
    cfg.MODEL.compute_layout = compute_layout
    cfg.freeze()

    # Generate TN layout
    cfg_tn = cfg.clone()
    cfg_tn.defrost()
    cfg_tn.MODEL.layout = 'TN'
    cfg_tn.freeze()

    batch_size = 4
    sequence_length = 16
    num_mask = 3
    inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
    token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length))
    valid_length = mx.np.random.randint(3, sequence_length, (batch_size, ))
    # Masked positions are drawn from [0, 3), i.e. below the smallest possible
    # valid_length drawn above.
    masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))

    # Test for MobileBertModel (backbone)
    mobile_bert_model = MobileBertModel.from_cfg(cfg)
    mobile_bert_model.initialize()
    mobile_bert_model.hybridize()
    mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn)
    mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params())
    mobile_bert_model_tn.hybridize()
    contextual_embedding, pooled_out = mobile_bert_model(
        inputs, token_types, valid_length)
    contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(
        inputs.T, token_types.T, valid_length)
    assert_allclose(contextual_embedding.asnumpy(),
                    np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), 1E-4,
                    1E-4)
    assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4)

    # Test for MobileBertForMLM
    mobile_bert_mlm_model = MobileBertForMLM(cfg)
    mobile_bert_mlm_model.initialize()
    mobile_bert_mlm_model.hybridize()
    mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn)
    mobile_bert_mlm_model_tn.share_parameters(
        mobile_bert_mlm_model.collect_params())
    # Hybridize the TN *MLM* model, so that it is exercised the same way as
    # its counterpart (and as the backbone / pretrain sections do).
    mobile_bert_mlm_model_tn.hybridize()
    contextual_embedding, pooled_out, mlm_scores = mobile_bert_mlm_model(
        inputs, token_types, valid_length, masked_positions)
    contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\
        mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions)
    assert_allclose(contextual_embedding.asnumpy(),
                    np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), 1E-4,
                    1E-4)
    assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4)
    assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4)

    # Test for MobileBertForPretrain
    mobile_bert_pretrain_model = MobileBertForPretrain(cfg)
    mobile_bert_pretrain_model.initialize()
    mobile_bert_pretrain_model.hybridize()
    mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn)
    mobile_bert_pretrain_model_tn.share_parameters(
        mobile_bert_pretrain_model.collect_params())
    mobile_bert_pretrain_model_tn.hybridize()
    contextual_embedding, pooled_out, nsp_score, mlm_scores =\
        mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions)
    contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \
        mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions)
    assert_allclose(contextual_embedding.asnumpy(),
                    np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), 1E-4,
                    1E-4)
    assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4)
    assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4)
    assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-4, 1E-4)