def initialize_megatron(
    extra_args_provider=None,
    args_defaults={},
    ignore_unknown_args=False,
    allow_no_cuda=False,
):
    """Set global variables, initialize distributed, and set autoresume and random seeds.

    `allow_no_cuda` should not be set unless using megatron for cpu only data processing.
    In general this arg should not be set unless you know what you are doing.

    Returns a function to finalize distributed env initialization
    (optionally, only when args.lazy_mpu_init == True).
    """
    if not allow_no_cuda:
        # Make sure cuda is available.
        assert torch.cuda.is_available(), 'Megatron requires CUDA.'

    # Parse args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(
        extra_args_provider=extra_args_provider,
        args_defaults=args_defaults,
        ignore_unknown_args=ignore_unknown_args,
    )

    # torch.distributed initialization
    def finish_mpu_init():
        args = get_args()
        # Pytorch distributed.
        _initialize_distributed()
        # Random seeds for reproducibility.
        if args.rank == 0:
            print('> setting random seeds to {} ...'.format(args.seed))
        _set_random_seed(args.seed)

    args = get_args()
    if args.lazy_mpu_init:
        args.use_cpu_initialization = True
        # Delayed initialization of DDP-related stuff.
        # We only set basic DDP globals ...
        set_model_parallel_world_size(args.model_parallel_size)
        # ... and return a function for the external DDP manager to call
        # once it has DDP initialized.
        set_model_parallel_rank(args.rank)
        return finish_mpu_init
    else:
        # Megatron's MPU is the master. Complete initialization right away.
        finish_mpu_init()

        # Initialize memory buffers.
        _initialize_mem_buffs()

        # Autoresume.
        _init_autoresume()

        # Write arguments to tensorboard.
        _write_args_to_tensorboard()

        # No continuation function.
        return None
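# --- Usage sketch (illustrative, not from the original source) ---
# A pretraining entry point would typically call the argparse-driven variant
# above once at startup. The args_defaults override and the function name
# below are assumptions for illustration, not required settings.
def _example_entry_point():
    initialize_megatron(
        extra_args_provider=None,  # optional callback adding script-specific argparse args
        args_defaults={"tokenizer_type": "GPT2BPETokenizer"},  # assumed default override
    )
    # With lazy_mpu_init unset, the call returns None and initialization
    # (distributed setup, seeds, autoresume, tensorboard) is already complete.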
def initialize_megatron(neox_args, allow_no_cuda=False):
    """Initialize distributed, and set autoresume and random seeds.

    `allow_no_cuda` should not be set unless using megatron for cpu only data processing.
    In general this arg should not be set unless you know what you are doing.

    Returns a function to finalize distributed env initialization
    (optionally, only when neox_args.lazy_mpu_init == True).
    """
    if not allow_no_cuda:
        # Make sure cuda is available.
        assert torch.cuda.is_available(), "Megatron requires CUDA."

    # torch.distributed initialization
    def finish_mpu_init():
        # Pytorch distributed.
        _initialize_distributed(neox_args=neox_args)

        # Random seeds for reproducibility.
        if neox_args.rank == 0:
            print("> setting random seeds to {} ...".format(neox_args.seed))
        _set_random_seed(neox_args.seed)

    # Check that the fused kernels are installed.
    if (
        neox_args.scaled_upper_triang_masked_softmax_fusion
        or neox_args.scaled_masked_softmax_fusion
    ):
        fused_kernels.load_fused_kernels()

    if neox_args.lazy_mpu_init:
        neox_args.use_cpu_initialization = True
        # Delayed initialization of DDP-related stuff.
        # We only set basic DDP globals ...
        set_model_parallel_world_size(neox_args.model_parallel_size)
        # ... and return a function for the external DDP manager to call
        # once it has DDP initialized.
        set_model_parallel_rank(neox_args.rank)
        return finish_mpu_init
    else:
        # Megatron's MPU is the master. Complete initialization right away.
        finish_mpu_init()

        # Compile dataset C++ code.
        if neox_args.local_rank == 0:
            from megatron.data.data_utils import compile_helper

            compile_helper()

        # Write arguments to tensorboard.
        _write_args_to_tensorboard(neox_args=neox_args)

        # No continuation function.
        return None
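# --- Usage sketch (illustrative, not from the original source) ---
# In the NeoX-style variant, the caller passes a fully populated neox_args
# object instead of argparse parameters. The function name below is a
# hypothetical wrapper; it only shows how the return value would be handled.
def _example_neox_entry_point(neox_args):
    # Returns None unless neox_args.lazy_mpu_init is set.
    finish_mpu_init = initialize_megatron(neox_args=neox_args)
    if finish_mpu_init is not None:
        # Lazy path: per the comments above, this continuation is meant to be
        # called by the external DDP manager once it has DDP initialized.
        # Calling it directly here is only a stand-in for that hand-off.
        finish_mpu_init()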