def __init__(self, model_name, config, vocab_file, model_parallel_size=None, model_parallel_rank=None):
    super().__init__()

    self._model_parallel_size = model_parallel_size
    self._model_parallel_rank = model_parallel_rank
    self._restore_path = None
    self._app_state = None
    self._model_name = model_name

    if not os.path.exists(vocab_file):
        raise ValueError(f'Vocab file not found at {vocab_file}')

    # convert config to a dictionary
    if isinstance(config, DictConfig):
        config = OmegaConf.to_container(config)
    config["vocab_file"] = vocab_file
    config['tokenizer_type'] = 'BertWordPieceLowerCase'
    config['lazy_mpu_init'] = True
    config['onnx_safe'] = True

    # if 'model_parallel_size' in config:
    if self._model_parallel_size is not None:
        app_state = AppState()
        self._app_state = app_state

        # must be set for model-parallel megatron-lm
        os.environ["WORLD_SIZE"] = str(app_state.world_size)
        os.environ["RANK"] = str(self._model_parallel_rank)
        extra_args_provider = self._update_megatron_args(tensor_model_parallel_size=self._model_parallel_size)
    else:
        extra_args_provider = self._update_megatron_args()

    # configure globals for megatron
    set_pipeline_model_parallel_rank(0)  # pipeline model parallelism not implemented in NeMo
    set_pipeline_model_parallel_world_size(1)  # pipeline model parallelism not implemented in NeMo

    # Initialize the part of Megatron global state that is needed for its constructor.
    # We set the 'lazy_mpu_init' flag so that Megatron performs only the initialization that does not
    # depend on DDP being initialized yet (and we don't want Megatron to initialize DDP itself either),
    # and returns a hook for us to call after PTL has initialized torch.distributed
    # (or, if there is no PTL, e.g. for inference, we initialize torch.distributed ourselves).
    # We call and clear this hook on the first call to forward().
    self._lazy_init_fn = initialize_megatron(
        extra_args_provider=extra_args_provider, args_defaults=config, ignore_unknown_args=True
    )

    # read Megatron arguments back
    args = get_args()
    logging.info(f'Megatron-lm argparse args: {args}')

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func, num_tokentypes=2, add_pooler=False
    )

    self.config = OmegaConf.create(config)
    # key used for checkpoints
    self._hidden_size = self.language_model.hidden_size
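# Hedged sketch, not part of the original source: _update_megatron_args is referenced
# above but not shown here. One plausible shape, following the same extra_args_provider
# pattern as the inline _update_model_parallel_arg closure in the older constructor
# below, is to return a callable that overrides defaults on Megatron-LM's argparse
# parser. The keyword-argument interface is an assumption for illustration.
def _update_megatron_args(self, **default_overrides):
    def extra_args_provider(parser):
        # apply any overrides (e.g. tensor_model_parallel_size) as argparse defaults
        parser.set_defaults(**default_overrides)
        return parser

    return extra_args_provider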
def model_setup(yaml_list=None, param_dict=None, clear_data=True, inference=False):
    from megatron.neox_arguments import NeoXArgs
    from megatron.mpu import destroy_model_parallel
    from megatron import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    destroy_model_parallel()  # mpu model parallel contains remaining global vars
    if clear_data and (
        not torch.distributed.is_initialized()
        or torch.distributed.get_world_size() == 1
        or torch.distributed.get_rank() == 0
    ):
        clear_test_dirs()

    overwrite_values = {
        "user_script": str(get_root_directory() / "pretrain_gpt2.py"),
        "save": TEST_CHECKPOINT_DIR,
        "load": TEST_CHECKPOINT_DIR,
        "log_dir": TEST_LOG_DIR,
        "tensorboard_dir": TEST_TENSORBOARD_DIR,
    }

    # yaml_list and param_dict should not both be None
    assert yaml_list is not None or param_dict is not None

    # initially load config from files, as would be the case in deepy.py
    if yaml_list is not None:
        args_loaded = NeoXArgs.from_ymls(yaml_list, overwrite_values=overwrite_values)
    else:
        p_dict = param_dict.copy()
        p_dict.update(overwrite_values)
        args_loaded = NeoXArgs.from_dict(p_dict)

    args_loaded.build_tokenizer()

    initialize_megatron(neox_args=args_loaded)
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        neox_args=args_loaded, inference=inference, get_key_value=True
    )
    return model, optimizer, lr_scheduler, args_loaded
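# Hedged usage sketch, not part of the original source: how a test might call
# model_setup with an in-memory parameter dict instead of YAML files. The config
# keys below are illustrative assumptions; a real test would pass a complete
# GPT-NeoX config (or a yaml_list of config files).
def test_model_setup_sketch():
    param_dict = {
        "num_layers": 2,
        "hidden_size": 64,
        "num_attention_heads": 2,
        "seq_length": 128,
        "max_position_embeddings": 128,
    }
    model, optimizer, lr_scheduler, args_loaded = model_setup(param_dict=param_dict)
    assert model is not None

    # tear down model-parallel globals so later tests can re-run initialize_megatron
    from megatron.mpu import destroy_model_parallel
    destroy_model_parallel()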
def __init__(self, model_name, config, vocab_file, model_parallel_size=None):
    super().__init__()

    self._model_parallel_size = model_parallel_size
    self._restore_path = None
    self._app_state = None

    if not os.path.exists(vocab_file):
        raise ValueError(f'Vocab file not found at {vocab_file}')

    config["vocab_file"] = vocab_file
    config['tokenizer_type'] = 'BertWordPieceLowerCase'
    config['lazy_mpu_init'] = True
    config['onnx_safe'] = True

    # if 'model_parallel_size' in config:
    if self._model_parallel_size is not None:
        app_state = AppState()
        self._app_state = app_state

        # must be set for model-parallel megatron-lm
        os.environ["WORLD_SIZE"] = str(app_state.world_size)

        # used to set model_parallel_size in the megatron-lm argparser
        def _update_model_parallel_arg(parser):
            parser.set_defaults(model_parallel_size=self._model_parallel_size)
            return parser

        extra_args_provider = _update_model_parallel_arg
    else:
        extra_args_provider = None

    # Initialize the part of Megatron global state that is needed for its constructor.
    # We set the 'lazy_mpu_init' flag so that Megatron performs only the initialization that does not
    # depend on DDP being initialized yet (and we don't want Megatron to initialize DDP itself either),
    # and returns a hook for us to call after PTL has initialized torch.distributed.
    # We call this hook during .forward
    # TODO: can we call this hook using the PTL hook .setup()
    self._lazy_init_fn = initialize_megatron(
        extra_args_provider=extra_args_provider, args_defaults=config, ignore_unknown_args=True
    )

    # read Megatron arguments back
    args = get_args()
    logging.info(f'Megatron-lm argparse args: {args}')

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func, num_tokentypes=2, add_pooler=False
    )

    self.config = OmegaConf.create(config)
    # key used for checkpoints
    self._hidden_size = self.language_model.hidden_size
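# Hedged usage sketch, not part of the original source: constructing the encoder that
# owns this __init__. The enclosing class name (MegatronBertEncoder), the config fields,
# and the vocab path are assumptions/placeholders for illustration only.
if __name__ == "__main__":
    config = {
        "num_layers": 24,
        "hidden_size": 1024,
        "num_attention_heads": 16,
    }
    encoder = MegatronBertEncoder(
        model_name="megatron-bert-uncased",
        config=config,
        vocab_file="/path/to/vocab.txt",
    )
    # _hidden_size is the private attribute set at the end of __init__ above
    print(encoder._hidden_size)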