def model_provider(): """Build the model.""" if eval_metric == 'loss': parallel_output = True elif eval_metric == 'accuracy': parallel_output = False else: raise NotImplementedError('output type for {} evaluation metric ' 'is not supported.'.format(eval_metric)) print_rank_0('building GPT2 model ...') if mpu.get_pipeline_model_parallel_world_size() > 1: # Determine model based on position of stage in pipeline. if mpu.is_pipeline_first_stage(): model = GPT2ModelFirstStage(num_tokentypes=0) elif mpu.is_pipeline_last_stage(): model = GPT2ModelLastStage(parallel_output=parallel_output, num_tokentypes=0) else: model = GPT2ModelIntermediateStage(num_tokentypes=0) else: model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output) return model
def get_gpt2_model(args_others, mp_size=1):
    import sys

    import torch
    from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP

    from megatron import mpu
    from megatron.initialize import initialize_megatron
    from megatron.model import GPT2Model

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }
    args_defaults.update(args_others)

    # Set --make-vocab-size-divisible-by to 1 to avoid a word-embedding size
    # change during the resizing tests.
    sys.argv.extend([
        '--model-parallel-size', str(mp_size),
        '--make-vocab-size-divisible-by', str(1),
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)
    model = GPT2Model(num_tokentypes=0, parallel_output=False)
    model.cuda()

    i = torch.cuda.current_device()
    model = torchDDP(model, device_ids=[i], output_device=i,
                     process_group=mpu.get_data_parallel_group())
    return model
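# A minimal usage sketch for the helper above. The model hyperparameters are
# hypothetical values chosen only to illustrate the expected call shape:
model = get_gpt2_model({'num_layers': 2,
                        'hidden_size': 128,
                        'num_attention_heads': 8,
                        'max_position_embeddings': 128,
                        'seq_length': 128},
                       mp_size=1)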
def model_provider(): """Build the model.""" print_rank_0('building GPT2 model ...') model = GPT2Model(num_tokentypes=0, parallel_output=False) return model
def model_provider(): """Build the model.""" print_rank_0('building GPT2 model ...') with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), remote_device=get_args().remote_device, enabled=get_args().zero_stage == 3): model = GPT2Model(num_tokentypes=0, parallel_output=True) return model
def model_provider(): """Build the model.""" if eval_metric == 'loss': parallel_output = True elif eval_metric == 'accuracy': parallel_output = False else: raise NotImplementedError('output type for {} evaluation metric ' 'is not supported.'.format(eval_metric)) print_rank_0('building GPT2 model ...') model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output) return model
def model_provider(): """Build the model.""" args = get_args() print_rank_0('building GPT2 model ...') if args.pipe_parallel_size == 0: model = GPT2Model(num_tokentypes=0, parallel_output=True) else: model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology()) # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize model._megatron_batch_fn = get_batch_pipe return model
def model_provider(): """Build the model.""" args = get_args() print_rank_0('building GPT2 model ...') if args.pipe_parallel_size == 0: model = GPT2Model(num_tokentypes=0, parallel_output=True) else: model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology()) # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize model._megatron_batch_fn = get_batch_pipe ## Wandb use_wandb = get_wandb_api_key() is not None set_use_wandb(use_wandb) args_dict = vars(args) if use_wandb: # only display system stats from one worker per machine wandb_settings = wandb.Settings() if is_local_main( ) else wandb.Settings(_disable_stats=True) group_name = args_dict.get('wandb_group') name = f'{socket.gethostname()}-{local_rank()}' if group_name else None try: wandb.init(project="neox", group=group_name, name=name, save_code=False, force=False, entity=args_dict.get('wandb_team'), settings=wandb_settings) except UsageError as e: set_use_wandb(False) print(e) print( 'Skipping wandb. Execute `wandb login` on local or main node machine to enable.' ) if use_wandb: wandb.config.update(args_dict) return model
def model_provider(): """Build the model.""" print_rank_0('building GPT2 model ...') see_memory_usage(f"Before Building Model", force=True) with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), remote_device=get_args().remote_device, deepspeed_config=get_args().deepspeed_config, enabled=get_args().zero_stage == 3): model = GPT2Model(num_tokentypes=0, parallel_output=True) see_memory_usage(f"After Building Model", force=True) if mpu.get_data_parallel_rank() == 0: billion_params = get_parameters_in_billions(model) print( f' > number of parameters on model parallel rank {mpu.get_model_parallel_rank()}\ {round(billion_params, 3)} Billion', flush=True) return model