import tempfile

import mxnet as mx
import numpy as np
from numpy.testing import assert_allclose

from gluonnlp.loss import LabelSmoothCrossEntropyLoss
from gluonnlp.models.roberta import RobertaModel, RobertaForMLM, get_pretrained_roberta
from gluonnlp.utils.testing import verify_backbone_fp16


def test_roberta_small_config(compute_layout, ctx):
    with ctx:
        # Build a deliberately small RoBERTa config so the test runs quickly.
        cfg = RobertaModel.get_cfg()
        cfg.defrost()
        cfg.MODEL.vocab_size = 1000
        cfg.MODEL.num_layers = 2
        cfg.MODEL.hidden_size = 128
        cfg.MODEL.num_heads = 2
        cfg.MODEL.compute_layout = compute_layout
        cfg.freeze()

        # Generate the time-major (TN) layout counterpart of the config.
        cfg_tn = cfg.clone()
        cfg_tn.defrost()
        cfg_tn.MODEL.layout = 'TN'
        cfg_tn.freeze()

        batch_size = 4
        sequence_length = 16
        num_mask = 3
        inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
        valid_length = mx.np.random.randint(3, sequence_length, (batch_size,))
        masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))

        # The NT (batch-major) and TN (time-major) models share parameters,
        # so their outputs must agree up to a transpose of the time axis.
        roberta_model = RobertaModel.from_cfg(cfg)
        roberta_model.initialize()
        roberta_model.hybridize()
        contextual_embeddings, pooled_out = roberta_model(inputs, valid_length)
        roberta_model_tn = RobertaModel.from_cfg(cfg_tn)
        roberta_model_tn.share_parameters(roberta_model.collect_params())
        roberta_model_tn.hybridize()
        contextual_embeddings_tn, pooled_out_tn = roberta_model_tn(inputs.T, valid_length)
        assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1),
                        contextual_embeddings.asnumpy(), 1E-3, 1E-3)
        assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3)

        # Test for RobertaForMLM
        roberta_mlm_model = RobertaForMLM(cfg)
        roberta_mlm_model.initialize()
        roberta_mlm_model.hybridize()
        contextual_embedding, pooled_out, mlm_score = \
            roberta_mlm_model(inputs, valid_length, masked_positions)
        roberta_mlm_model_tn = RobertaForMLM(cfg_tn)
        roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params())
        roberta_mlm_model_tn.hybridize()
        # valid_length is 1-D, so only the token inputs need transposing.
        contextual_embedding_tn, pooled_out_tn, mlm_score_tn = \
            roberta_mlm_model_tn(inputs.T, valid_length, masked_positions)
        assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                        contextual_embedding.asnumpy(), 1E-3, 1E-3)
        assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3)
        assert_allclose(mlm_score_tn.asnumpy(), mlm_score.asnumpy(), 1E-3, 1E-3)

        # Test fp16 inference, which is only supported on GPU.
        if ctx.device_type == 'gpu':
            verify_backbone_fp16(model_cls=RobertaModel, cfg=cfg, ctx=ctx,
                                 inputs=[inputs, valid_length])
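
# A minimal smoke-test driver for running the layout test outside pytest.
# This is a sketch, not part of the test suite: 'auto', 'NT' and 'TN' are the
# values the test is written to accept for `compute_layout`, and mx.cpu()
# stands in for the pytest-provided `ctx` fixture (an assumption about how
# the fixture is normally supplied).
def run_small_config_smoke_test():
    for layout in ['auto', 'NT', 'TN']:
        test_roberta_small_config(layout, mx.cpu())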

def test_roberta(model_name):
    # test from pretrained
    with tempfile.TemporaryDirectory() as root:
        cfg, tokenizer, params_path, mlm_params_path = \
            get_pretrained_roberta(model_name, load_backbone=True,
                                   load_mlm=True, root=root)
        assert cfg.MODEL.vocab_size == len(tokenizer.vocab)

        # test backbone
        roberta_model = RobertaModel.from_cfg(cfg)
        roberta_model.load_parameters(params_path)
        roberta_model.hybridize()

        # test mlm model: first load the full MLM checkpoint (if available),
        # then rebuild the model and load only the backbone weights, so that
        # both loading paths are exercised.
        roberta_mlm_model = RobertaForMLM(cfg)
        if mlm_params_path is not None:
            roberta_mlm_model.load_parameters(mlm_params_path)
        roberta_mlm_model = RobertaForMLM(cfg)
        roberta_mlm_model.backbone_model.load_parameters(params_path)

        # test forward
        batch_size = 3
        seq_length = 32
        vocab_size = len(tokenizer.vocab)
        input_ids = mx.np.array(
            np.random.randint(2, vocab_size, (batch_size, seq_length)),
            dtype=np.int32)
        valid_length = mx.np.array(
            np.random.randint(seq_length // 2, seq_length, (batch_size,)),
            dtype=np.int32)
        roberta_model(input_ids, valid_length)
        mx.npx.waitall()

        # test backward: run a dummy loss through the backbone to make sure
        # gradients can be computed.
        label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size)
        with mx.autograd.record():
            contextual_embeddings, pooled_out = roberta_model(input_ids, valid_length)
            loss = label_smooth_loss(contextual_embeddings, input_ids)
            loss.backward()
        mx.npx.waitall()
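
# A sketch of driving the pretrained-model test for every published
# checkpoint.  `list_pretrained_roberta()` is the GluonNLP helper that
# enumerates the available model names (e.g. 'fairseq_roberta_base'); which
# names it returns depends on the installed GluonNLP version.
def run_pretrained_smoke_test():
    from gluonnlp.models.roberta import list_pretrained_roberta
    for model_name in list_pretrained_roberta():
        test_roberta(model_name)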

import logging
import os
import shutil

from fairseq.models.roberta import RobertaModel as fairseq_RobertaModel

# convert_vocab, convert_config, convert_params, test_model and
# naming_convention are helpers defined elsewhere in this conversion script.


def convert_fairseq_model(args):
    if not args.save_dir:
        args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # Load the fairseq checkpoint and convert vocab, config and parameters.
    fairseq_roberta = fairseq_RobertaModel.from_pretrained(
        args.fairseq_model_path, checkpoint_file='model.pt')
    vocab_size = convert_vocab(args, fairseq_roberta)
    gluon_cfg = convert_config(fairseq_roberta.args, vocab_size,
                               RobertaModel.get_cfg().clone())
    with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
        of.write(gluon_cfg.dump())

    ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
    gluon_roberta = convert_params(fairseq_roberta, gluon_cfg, ctx)
    if args.test:
        test_model(fairseq_roberta, gluon_roberta, args.gpu)

    # Save the full MLM model as well as the backbone-only parameters.
    gluon_roberta.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'),
                                  deduplicate=True)
    logging.info('Converted the RoBERTa MLM model in {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model_mlm.params')))
    gluon_roberta.backbone_model.save_parameters(
        os.path.join(args.save_dir, 'model.params'), deduplicate=True)
    logging.info('Converted the RoBERTa backbone model in {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model.params')))

    logging.info('Conversion finished!')
    logging.info('Statistics:')
    # Rename the saved files following the repository naming convention and
    # log each file's hash and size.
    old_names = os.listdir(args.save_dir)
    for old_name in old_names:
        new_name, long_hash = naming_convention(args.save_dir, old_name)
        old_path = os.path.join(args.save_dir, old_name)
        new_path = os.path.join(args.save_dir, new_name)
        shutil.move(old_path, new_path)
        file_size = os.path.getsize(new_path)
        logging.info('\t{}/{} {} {}'.format(args.save_dir, new_name,
                                            long_hash, file_size))
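
# A minimal argument-parser sketch for driving convert_fairseq_model() from
# the command line.  The flag names mirror the attributes the function reads
# (fairseq_model_path, save_dir, gpu, test); the help strings and defaults
# are assumptions, not the script's canonical CLI.
def parse_args():
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert a fairseq RoBERTa checkpoint to GluonNLP.')
    parser.add_argument('--fairseq_model_path', required=True,
                        help='Directory containing the fairseq model.pt')
    parser.add_argument('--save_dir', default=None,
                        help='Output directory (defaults to <model>_gluon)')
    parser.add_argument('--gpu', type=int, default=None,
                        help='GPU id to use; runs on CPU when omitted')
    parser.add_argument('--test', action='store_true',
                        help='Numerically compare fairseq and Gluon outputs')
    return parser.parse_args()


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    convert_fairseq_model(parse_args())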