def setup_class(cls):
    # nothing to set up on machines without a GPU
    if not torch.cuda.is_available():
        return

    GPUS = 1
    plugins = [NLPDDPPlugin()]
    TP_SIZE = GPUS
    PP_SIZE = 1
    MB_SIZE = 4
    GB_SIZE = 8
    SEED = 1234
    trainer = Trainer(
        plugins=plugins, devices=GPUS, accelerator='gpu', num_nodes=1, logger=None, log_gpu_memory=None,
    )

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=TP_SIZE,
        pipeline_model_parallel_size=PP_SIZE,
        micro_batch_size=MB_SIZE,
        global_batch_size=GB_SIZE,
        seed=SEED,
        apex_transformer_log_level=30,
    )

    def dummy():
        return

    # launch a no-op function so the strategy spawns its workers and
    # sets up the distributed environment before the barrier below
    if trainer.strategy.launcher is not None:
        trainer.strategy.launcher.launch(dummy, trainer=trainer)
    trainer.strategy.setup_environment()

    torch.distributed.barrier()
def __init__(self, cfg: DictConfig, trainer: Trainer): app_state = AppState() if not app_state._is_megatron_initialized: logging.info( f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders." ) app_state.global_rank = trainer.global_rank app_state.world_size = trainer.world_size app_state.model_parallel_size = 1 app_state.model_parallel_rank = trainer.global_rank initialize_model_parallel_for_nemo( world_size=trainer.world_size, global_rank=trainer.global_rank, local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get( 'tensor_model_parallel_size', 1), seed=self.cfg.get('seed', 1234), ) try: from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper compile_helper() logging.info('Megatron dataset helper compiled successfully.') from nemo.collections.nlp.data.language_modeling.megatron import helpers except ImportError: raise ImportError( f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.' )
def __init__(self, cfg: DictConfig, trainer: Trainer): # FIXME: switch to self._cfg if not HAVE_APEX: raise ImportError( "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) super().__init__(cfg, trainer=trainer) # used in NVIDIA NGC PyTorch containers self._enable_nvidia_optimizations() if self._cfg.get('use_cpu_initialization', False) is False: torch.cuda.set_device(trainer.local_rank) # buffer used during train_step for logging average loss over gradient accumulation steps self._reduced_loss_buffer = [] initialize_model_parallel_for_nemo( world_size=trainer.world_size, global_rank=trainer.global_rank, local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), seed=self._cfg.get('seed', 1234), )
def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
    # FIXME: switch to self._cfg
    if not HAVE_APEX:
        raise ImportError(
            "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
        )

    # this prevents base constructor from initializing tokenizer
    self.tokenizer = None
    super().__init__(cfg, trainer=trainer, no_lm_init=no_lm_init)

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self._cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    # buffer used during train_step for logging average loss over gradient accumulation steps
    self._reduced_loss_buffer = []

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1),
        pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0),
        micro_batch_size=cfg.get('micro_batch_size'),
        global_batch_size=cfg.get('global_batch_size'),
        seed=self.cfg.get('seed', 1234),
        apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30),
    )

    # use pytorch default for gradient clipping; defaults to False
    self.grad_clip_pl_default = False

    if hasattr(self._cfg, "tokenizer") or (
        hasattr(self._cfg, "encoder_tokenizer") and hasattr(self._cfg, "decoder_tokenizer")
    ):
        # build tokenizer (defaults to nemo supported tokenizers)
        self._build_tokenizer()

        # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
        self._build_vocab()
def __init__(self, cfg: DictConfig, trainer: Trainer): # FIXME: switch to self._cfg if not HAVE_APEX: raise ImportError( "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) # this prevents base constructor from initializing tokenizer self.tokenizer = None super().__init__(cfg, trainer=trainer, no_lm_init=True) # used in NVIDIA NGC PyTorch containers self._enable_nvidia_optimizations() if self._cfg.get('use_cpu_initialization', False) is False: torch.cuda.set_device(trainer.local_rank) # buffer used during train_step for logging average loss over gradient accumulation steps self._reduced_loss_buffer = [] if cfg.get('pipeline_model_parallel_size', 1) > 1: if cfg.get('pipeline_model_parallel_split_rank', 0) <= 0: raise ValueError( f"pipeline_model_parallel_split_rank must be > 0 when using pipeline_model_parallel_size > 1" ) initialize_model_parallel_for_nemo( world_size=trainer.world_size, global_rank=trainer.global_rank, local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), pipeline_model_parallel_size=cfg.get( 'pipeline_model_parallel_size', 1), pipeline_model_parallel_split_rank=cfg.get( 'pipeline_model_parallel_split_rank', 0), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), seed=self.cfg.get('seed', 1234), apex_transformer_log_level=self.cfg.get( 'apex_transformer_log_level', 30), )
def __init__(self, cfg: DictConfig, trainer: Trainer): app_state = AppState() if not app_state._is_megatron_initialized: logging.info( f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders." ) app_state.global_rank = trainer.global_rank app_state.world_size = trainer.world_size app_state.model_parallel_size = 1 app_state.model_parallel_rank = trainer.global_rank initialize_model_parallel_for_nemo( world_size=trainer.world_size, global_rank=trainer.global_rank, local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get( 'tensor_model_parallel_size', 1), seed=self.cfg.get('seed', 1234), )
def __init__(self, cfg: DictConfig, trainer: Trainer):
    super().__init__(cfg, trainer=trainer)
    self.cfg = cfg

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self.cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    # buffer used during train_step for logging average loss over gradient accumulation steps
    self._reduced_loss_buffer = []
    self._reduced_lm_loss_buffer = []
    self._reduced_sop_loss_buffer = []

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        seed=self.cfg.get('seed', 1234),
    )

    self.tokenizer = get_nmt_tokenizer(
        library=self.cfg.tokenizer.library,
        model_name=self.cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer_model", self.cfg.tokenizer.model),
        vocab_file=self.register_artifact("vocab_file", self.cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("merges_file", self.cfg.tokenizer.merge_file),
    )

    vocab_size = self.tokenizer.vocab_size
    padded_vocab_size = self._vocab_size_with_padding(
        orig_vocab_size=vocab_size,
        make_vocab_size_divisible_by=cfg.get('make_vocab_size_divisible_by', 128),
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
    )

    num_tokentypes = 2 if cfg.bert_binary_head else 0

    self.model = BertModel(
        vocab_size=padded_vocab_size,
        hidden_size=cfg.hidden_size,
        max_position_embeddings=cfg.max_position_embeddings,
        num_layers=cfg.num_layers,
        num_attention_heads=cfg.num_attention_heads,
        apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True),
        kv_channels=cfg.get('kv_channels', None),
        ffn_hidden_size=cfg.ffn_hidden_size,
        num_tokentypes=num_tokentypes,
        parallel_output=True,
        pre_process=cfg.get('pre_process', True),
        post_process=cfg.get('post_process', True),
        init_method_std=cfg.get('init_method_std', 0.02),
        fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
        use_cpu_initialization=cfg.get('use_cpu_initialization', False),
        hidden_dropout=cfg.get('hidden_dropout', 0.1),
        precision=cfg.get('precision', 16),
        fp32_residual_connection=cfg.get('fp32_residual_connection', False),
        activations_checkpoint_method=cfg.get('activations_checkpoint_method', None),
        activations_checkpoint_num_layers=cfg.get('activations_checkpoint_num_layers', 1),
        layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
        onnx_safe=cfg.get('onnx_safe', False),
        add_binary_head=cfg.bert_binary_head,
    )
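# The _vocab_size_with_padding helper called above is not defined in these
# snippets. As a hedged sketch (an assumption based on the usual
# Megatron-style helper, not necessarily this codebase's exact
# implementation), it pads the original vocabulary size up to the next
# multiple of make_vocab_size_divisible_by * tensor_model_parallel_size,
# so the embedding table shards evenly across tensor-parallel ranks:
def _vocab_size_with_padding_sketch(orig_vocab_size, make_vocab_size_divisible_by, tensor_model_parallel_size):
    multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
    padded = orig_vocab_size
    while padded % multiple != 0:
        padded += 1
    return padded

# e.g. with the defaults above (divisible_by=128, TP size 1), a
# 50,257-token GPT-2 vocab would pad to 50,304.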
def __init__(self, cfg: DictConfig, trainer: Trainer): if not HAVE_APEX: raise ImportError( "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) # this prevents base constructor from initializing tokenizer self.tokenizer = None super().__init__(cfg, trainer=trainer, no_lm_init=True) self._validate_trainer() # used in NVIDIA NGC PyTorch containers self._enable_nvidia_optimizations() if self.cfg.get('use_cpu_initialization', False) is False: torch.cuda.set_device(trainer.local_rank) initialize_model_parallel_for_nemo( world_size=trainer.world_size, global_rank=trainer.global_rank, local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), pipeline_model_parallel_size=cfg.get( 'pipeline_model_parallel_size', 1), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), seed=self.cfg.get('seed', 1234), apex_transformer_log_level=self.cfg.get( 'apex_transformer_log_level', 30), ) self.tokenizer = get_nmt_tokenizer( library=self.cfg.tokenizer.library, model_name=self.cfg.tokenizer.type, tokenizer_model=self.register_artifact("tokenizer.model", self.cfg.tokenizer.model), vocab_file=self.register_artifact("tokenizer.vocab_file", self.cfg.tokenizer.vocab_file), merges_file=self.register_artifact("tokenizer.merge_file", self.cfg.tokenizer.merge_file), delimiter=self.cfg.tokenizer.get('delimiter', None), ) vocab_size = self.tokenizer.vocab_size self.padded_vocab_size = self._vocab_size_with_padding( orig_vocab_size=vocab_size, make_vocab_size_divisible_by=cfg.get( 'make_vocab_size_divisible_by', 128), tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), ) # TODO: Not sure how to use lists of modules with PTL. # This means we can only use pipeline parallelism without the interleaved schedule. self.model = build_model(model_provider_func=self.model_provider_func, wrap_with_ddp=False)[0] self.setup_optimizer_param_groups() self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) if self.megatron_amp_o2: # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type self.model.cuda(torch.cuda.current_device()) # Model wrapper to convert both model and inputs to half precision self.model = Float16Module(module=self.model, precision=cfg.precision) if self.trainer.precision == 32: self.autocast_dtype = torch.float elif self.trainer.precision == 16: self.autocast_dtype = torch.half elif self.trainer.precision == 'bf16': self.autocast_dtype = torch.bfloat16 else: raise ValueError('precision must be in [32, 16, "bf16"]') # configuration used for inference self._inference_config = None
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Initializes the PTune TextClassifier model."""
    super().__init__(cfg=cfg, trainer=trainer)

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        seed=cfg.get('seed', 1234),
    )

    # shared params for dataset and data loaders
    self.dataset_cfg = cfg.dataset

    # tokenizer needs to get initialized before the super.__init__()
    # as dataloaders and datasets need it to process the data
    self.tokenizer = get_nmt_tokenizer(
        library=cfg.tokenizer.library,
        model_name=cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer.model", cfg.tokenizer.model),
        vocab_file=self.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("tokenizer.merges_file", cfg.tokenizer.merge_file),
    )

    self.class_weights = None

    self.model = MegatronGPTModel.restore_from(
        self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
        trainer=trainer,
    )

    if not cfg.use_lm_finetune:
        self.model.freeze()

    hidden_size = self.model.cfg.hidden_size

    # register the file containing the labels into the artifacts to get stored in the '.nemo' file later
    self.classes = cfg.dataset.classes

    self.embeddings = self.model.model.language_model.embedding.word_embeddings

    # set allowed vocab set
    self.vocab = self.tokenizer.tokenizer.get_vocab()

    # make sure classes are part of the vocab
    for k in cfg.dataset.classes:
        if token_wrapper(k) not in self.vocab:
            logging.error(f'class {k} is not part of the vocabulary. Please add it to your vocab')
    self.allowed_vocab_ids = set(self.vocab[token_wrapper(k)] for k in cfg.dataset.classes)

    # map from id to label
    self.allowed_vocab = {}
    self.label_ids = {}
    self.id_to_label = {}
    for i, k in enumerate(cfg.dataset.classes):
        self.allowed_vocab[self.vocab[token_wrapper(k)]] = i
        self.label_ids[k] = i
        self.id_to_label[i] = k

    self.template = cfg.prompt_encoder.template

    self.prompt_encoder = PromptEncoder(
        template=cfg.prompt_encoder.template,
        hidden_size=hidden_size,
        lstm_dropout=cfg.prompt_encoder.dropout,
        num_layers=cfg.prompt_encoder.num_layers,
    )

    # load prompt encoder
    self.hidden_size = hidden_size
    self.tokenizer.add_special_tokens({'additional_special_tokens': [cfg.pseudo_token]})

    self.pseudo_token_id = self.tokenizer.tokenizer.get_vocab()[cfg.pseudo_token]
    self.pad_token_id = (
        self.tokenizer.tokenizer.pad_token_id
        if self.tokenizer.tokenizer.pad_token_id is not None
        else self.tokenizer.tokenizer.unk_token_id
    )
    self.spell_length = sum(self.template)
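# token_wrapper, used above to look up class labels in the vocab, is not
# defined in these snippets. A minimal sketch, assuming the common
# GPT-BPE convention of prepending a space so the label maps to its
# word-initial token (an assumption, not the verified source):
def token_wrapper(token: str) -> str:
    return ' ' + token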
def __init__(self, cfg: DictConfig, trainer: Trainer): if not HAVE_APEX: raise ImportError( "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) super().__init__(cfg, trainer=trainer) self.cfg = cfg # used in NVIDIA NGC PyTorch containers self._enable_nvidia_optimizations() if self.cfg.get('use_cpu_initialization', False) is False: torch.cuda.set_device(trainer.local_rank) # buffer used during train_step for logging average loss over gradient accumulation steps self._reduced_loss_buffer = [] self._reduced_lm_loss_buffer = [] self._reduced_sop_loss_buffer = [] # not saved as part of nemo model graph but required during export to ONNX input_names = ['input_ids', 'attention_mask', 'token_type_ids'] initialize_model_parallel_for_nemo( world_size=trainer.world_size, global_rank=trainer.global_rank, local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), seed=self.cfg.get('seed', 1234), ) self.tokenizer = get_nmt_tokenizer( library=self.cfg.tokenizer.library, model_name=self.cfg.tokenizer.type, tokenizer_model=self.register_artifact("tokenizer.model", self.cfg.tokenizer.model), vocab_file=self.register_artifact("tokenizer.vocab_file", self.cfg.tokenizer.vocab_file), merges_file=self.register_artifact("tokenizer.merge_file", self.cfg.tokenizer.merge_file), ) vocab_size = self.tokenizer.vocab_size padded_vocab_size = self._vocab_size_with_padding( orig_vocab_size=vocab_size, make_vocab_size_divisible_by=cfg.get( 'make_vocab_size_divisible_by', 128), tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), ) num_tokentypes = 2 if cfg.bert_binary_head else 0 self.model = BertModel( vocab_size=padded_vocab_size, hidden_size=cfg.hidden_size, max_position_embeddings=cfg.max_position_embeddings, num_layers=cfg.num_layers, num_attention_heads=cfg.num_attention_heads, apply_query_key_layer_scaling=cfg.get( 'apply_query_key_layer_scaling', True), kv_channels=cfg.get('kv_channels', None), ffn_hidden_size=cfg.ffn_hidden_size, num_tokentypes=num_tokentypes, parallel_output=True, pre_process=cfg.get('pre_process', True), post_process=cfg.get('post_process', True), init_method_std=cfg.get('init_method_std', 0.02), fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False), use_cpu_initialization=cfg.get('use_cpu_initialization', False), hidden_dropout=cfg.get('hidden_dropout', 0.1), precision=cfg.get('precision', 16), fp32_residual_connection=cfg.get('fp32_residual_connection', False), activations_checkpoint_method=cfg.get( 'activations_checkpoint_method', None), activations_checkpoint_num_layers=cfg.get( 'activations_checkpoint_num_layers', 1), layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5), masked_softmax_fusion=cfg.get('masked_softmax_fusion', True), bias_gelu_fusion=cfg.get('bias_gelu_fusion', True), onnx_safe=cfg.get('onnx_safe', False), add_binary_head=cfg.bert_binary_head, megatron_legacy=cfg.get('megatron_legacy', False), )