def convert(local_rank, rank, world_size, args):
    app_state = AppState()
    app_state.data_parallel_rank = 0
    num_nodes = world_size // args.gpus_per_node

    # on BCP the Trainer needs the TorchElastic environment plugin; elsewhere the default environment is fine
    if args.bcp:
        trainer = Trainer(
            devices=args.gpus_per_node, num_nodes=num_nodes, accelerator='gpu', plugins=[TorchElasticEnvironment()]
        )
    else:
        trainer = Trainer(devices=args.gpus_per_node, num_nodes=num_nodes, accelerator='gpu')

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    # set up the tensor/pipeline model parallel groups so each rank loads its own checkpoint shard
    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )
    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    # inject the model parallel rank into the checkpoint path
    checkpoint_path = inject_model_parallel_rank(os.path.join(args.checkpoint_folder, args.checkpoint_name))

    logging.info(
        f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} '
        f'for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}'
    )

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer
        )
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'nmt':
        model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    else:
        raise ValueError(f'Unsupported model_type: {args.model_type}')
    model._save_restore_connector = NLPSaveRestoreConnector()

    # make sure every rank has finished loading before writing the .nemo file
    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)
    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
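# Illustrative only: a minimal, hypothetical driver for convert() above, not part of the original script.
# It assumes the process is launched with torch.distributed-style environment variables
# (RANK, LOCAL_RANK, WORLD_SIZE) and that distributed initialization happens elsewhere (e.g. inside the
# Trainer); the argument names simply mirror the attributes convert() reads from `args`.
import argparse
import os


def _example_main():
    parser = argparse.ArgumentParser(description='Convert a TP/PP-sharded Megatron checkpoint to a .nemo file')
    parser.add_argument('--checkpoint_folder', type=str, required=True)
    parser.add_argument('--checkpoint_name', type=str, required=True)
    parser.add_argument('--hparams_file', type=str, default=None)
    parser.add_argument('--nemo_file_path', type=str, required=True)
    parser.add_argument('--model_type', type=str, default='gpt', choices=['gpt', 'bert', 't5', 'nmt'])
    parser.add_argument('--tensor_model_parallel_size', type=int, default=1)
    parser.add_argument('--pipeline_model_parallel_size', type=int, default=1)
    parser.add_argument('--gpus_per_node', type=int, default=1)
    parser.add_argument('--bcp', action='store_true')
    args = parser.parse_args()

    local_rank = int(os.environ.get('LOCAL_RANK', 0))
    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))
    convert(local_rank, rank, world_size, args)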
def __init__(self, cfg: DictConfig, trainer: Trainer):
    app_state = AppState()

    if not app_state._is_megatron_initialized:
        logging.info(
            "Initializing megatron since it hasn't been initialized by the model. "
            "This is normal if you are using a NeMo model with Megatron dataloaders."
        )
        app_state.global_rank = trainer.global_rank
        app_state.world_size = trainer.world_size
        app_state.tensor_model_parallel_size = 1
        app_state.tensor_model_parallel_rank = trainer.global_rank
        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
            seed=self.cfg.get('seed', 1234),
        )

    try:
        from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

        # only rank 0 compiles the C++ dataset helpers; the other ranks wait at the barrier below
        if is_global_rank_zero():
            compile_helper()
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            torch.distributed.barrier()

        # importing helpers fails if the compilation did not succeed
        from nemo.collections.nlp.data.language_modeling.megatron import helpers

        logging.info('Megatron dataset helper compiled successfully.')
    except ImportError:
        raise ImportError(
            'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
        )
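# Illustrative only: the compile-once-then-synchronize pattern used above, shown in isolation.
# `build_native_extension` is a hypothetical stand-in for compile_helper(); the point is that exactly one
# process runs the compilation while every other process waits at a barrier before importing the result.
import torch


def _example_compile_once_per_job(build_native_extension):
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        # only the first rank builds, so concurrent compilers don't race on the same files
        if torch.distributed.get_rank() == 0:
            build_native_extension()
        # all ranks (including rank 0) meet here, so later imports see a finished build
        torch.distributed.barrier()
    else:
        # single-process case: just build directly
        build_native_extension()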
def convert(local_rank, rank, world_size, args):
    app_state = AppState()
    app_state.data_parallel_rank = 0

    tensor_model_parallel_size = args.tensor_model_parallel_size
    num_nodes = world_size // args.gpus_per_node
    pipeline_model_parallel_size = world_size // args.tensor_model_parallel_size
    assert args.pipeline_model_parallel_size == pipeline_model_parallel_size

    trainer = Trainer(devices=args.gpus_per_node, accelerator='gpu', num_nodes=num_nodes)

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )
    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    pipeline_rank = rank // tensor_model_parallel_size
    tensor_rank = app_state.tensor_model_parallel_rank
    assert pipeline_rank == app_state.pipeline_model_parallel_rank

    # Megatron-LM checkpoints are laid out by model parallel rank:
    # mp_rank_XX when only tensor parallelism is used, mp_rank_XX_YYY when pipeline parallelism is used too.
    if tensor_model_parallel_size is not None and tensor_model_parallel_size > 1 and pipeline_model_parallel_size == 1:
        checkpoint_path = os.path.join(args.checkpoint_folder, f'mp_rank_{tensor_rank:02d}', args.checkpoint_name)
    elif tensor_model_parallel_size is not None and pipeline_model_parallel_size > 1:
        checkpoint_path = os.path.join(
            args.checkpoint_folder, f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}', args.checkpoint_name
        )
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder, args.checkpoint_name)
    logging.info(f"loading checkpoint {checkpoint_path}")

    if args.model_type == 'gpt':
        # this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have the _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronGPTModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    elif args.model_type == 'bert':
        # this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have the _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronBertModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    else:
        raise NotImplementedError("{} is not supported".format(args.model_type))

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    if args.output_ckpt_file_path:
        filepath = args.output_ckpt_file_path
        base_dir = pathlib.Path(filepath).parent
        filename_str = pathlib.Path(filepath).name
        suffix = '.ckpt'
        # fill in the optional {consumed}/{steps} placeholders in the output filename template
        content = {}
        if consumed is not None:
            content['consumed'] = consumed
        else:
            content['consumed'] = 0
        if steps is not None:
            content['steps'] = steps
        else:
            content['steps'] = 0
        filename = filename_str.format(**content) + suffix
        checkpoint_path_output = inject_model_parallel_rank(os.path.join(base_dir, filename))
        trainer.accelerator.training_type_plugin.checkpoint_io.save_checkpoint(checkpoint, checkpoint_path_output)
        logging.info(f'NeMo model checkpoint files saved to: {args.output_ckpt_file_path}')

    if args.nemo_file_path:
        if args.model_type == 'gpt':
            model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer)
        elif args.model_type == 'bert':
            model = load_model(MegatronBertModel, checkpoint, strict=False, trainer=trainer)
        else:
            raise NotImplementedError("{} is not supported".format(args.model_type))

        # verify that the tensor parallel and pipeline parallel rank ids match what was used for loading
        assert app_state.data_parallel_size == 1
        assert app_state.tensor_model_parallel_size == tensor_model_parallel_size
        assert app_state.tensor_model_parallel_rank == tensor_rank
        assert app_state.pipeline_model_parallel_size == pipeline_model_parallel_size
        assert app_state.pipeline_model_parallel_rank == pipeline_rank

        model._save_restore_connector = NLPSaveRestoreConnector()
        model.save_to(args.nemo_file_path)
        logging.info(f'NeMo model saved to: {args.nemo_file_path}')
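# Illustrative only: a hypothetical sketch of how a translator dict such as name_translate could be applied
# to Megatron-LM state_dict keys. The actual renaming is done inside this script's load_from_checkpoint
# helper; this just shows the substring-substitution idea ('.attention.' -> '.self_attention.', etc.).
def _example_rename_keys(state_dict, translator):
    renamed = {}
    for key, tensor in state_dict.items():
        new_key = key
        # apply every substring substitution from the translator dict
        for old, new in translator.items():
            new_key = new_key.replace(old, new)
        renamed[new_key] = tensor
    return renamed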