def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=False)

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # Get the model without FP16 and/or TorchDDP wrappers.
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    if args.iteration == 0 and hasattr(unwrapped_model,
                                       'init_state_dict_from_bert'):
        print("Initializing ICT from pretrained BERT model", flush=True)
        unwrapped_model.init_state_dict_from_bert()

    return model, optimizer, lr_scheduler
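# A minimal usage sketch (not from the original source) of how a
# pretrain-style entry point might call setup_model_and_optimizer() above.
# `my_model_provider` and `pretrain_main` are hypothetical names, and the
# GPT2Model import/constructor is assumed from Megatron of this era; the
# provider contract (return a bare model) matches how
# get_model(model_provider_func) is used above.
def my_model_provider():
    # Build and return the bare (unwrapped) model; get_model() applies
    # the FP16/DDP wrappers afterwards.
    from megatron.model import GPT2Model  # assumed import path
    return GPT2Model(num_tokentypes=0, parallel_output=True)

def pretrain_main():
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        my_model_provider)
    # args.iteration was set inside setup_model_and_optimizer(), so
    # training resumes from the loaded checkpoint (or starts at 0).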
def main():
    """Main program."""
    args = get_args()

    if args.task == 'LAMBADA':
        eval_metric = 'accuracy'
    elif args.task == 'WIKITEXT103':
        eval_metric = 'loss'
    else:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))

    # Set up model and load checkpoint.
    model = get_model(get_model_provider(eval_metric))
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Data stuff.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset, args.batch_size,
                                   args.num_workers, drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')
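# A hedged sketch of what the build_data_loader() helper used above could
# look like: a plain torch DataLoader with a DistributedSampler so each
# data-parallel rank sees a distinct shard. This is an illustration under
# that assumption, not necessarily the original implementation; the
# mpu.get_data_parallel_* helpers are the Megatron-style accessors used
# elsewhere in these snippets.
import torch

def build_data_loader_sketch(dataset, batch_size, num_workers, drop_last):
    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=world_size, rank=rank)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       sampler=sampler,
                                       shuffle=False,
                                       num_workers=num_workers,
                                       drop_last=drop_last,
                                       pin_memory=True)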
def main():
    """Main program."""
    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
                                       'no_load_rng': True,
                                       'no_load_optim': True})

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print("Interleaved pipeline schedule is not yet supported "
              "for text generation.")
        exit()

    # Set up model and load checkpoint.
    model = get_model(model_provider)

    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # Generate samples.
    if args.num_samples == 0:
        args.micro_batch_size = 1
        if args.sample_input_file is not None:
            generate_samples_input_from_file(model)
        else:
            generate_samples_interactive(model)
    else:
        generate_and_write_samples_unconditional(model)
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer, param_groups = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu if args.pipe_parallel_size == 0 else None,
            dist_init_required=False,
            model_parameters=param_groups if optimizer is None else None)
        if args.pipe_parallel_size > 0:
            model.set_batch_fn(model.module._megatron_batch_fn)

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # Get the model without FP16 and/or TorchDDP wrappers.
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    return model, optimizer, lr_scheduler
def main():
    """Main program."""
    args = get_args()

    if args.num_layers_per_virtual_pipeline_stage is not None:
        print("Interleaved pipeline schedule is not yet supported "
              "for text generation.")
        exit()

    if args.task == 'LAMBADA':
        eval_metric = 'accuracy'
    elif args.task == 'WIKITEXT103':
        eval_metric = 'loss'
    else:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))

    # Set up model and load checkpoint.
    model = get_model(get_model_provider(eval_metric), wrap_with_ddp=False)
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # Data stuff.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset, args.micro_batch_size,
                                   args.num_workers, drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')
def load(self, context: DeepSpeedTrialContext, path: pathlib.Path) -> None:
    self.neox_args.load = str(path)
    self.neox_args.iteration = load_checkpoint(
        neox_args=self.neox_args,
        model=self.model,
        optimizer=self.optimizer,
        lr_scheduler=self.lr_scheduler,
        inference=False,
    )
    megatron_utils.print_rank_0(
        f"Loading checkpoint and starting from iteration "
        f"{self.neox_args.iteration}"
    )
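# For symmetry, a hedged sketch of a matching save() method for the same
# DeepSpeedTrial-style wrapper. It assumes save_checkpoint() takes the
# keyword signature seen in the checkpoint test further below (neox_args,
# iteration, model, optimizer, lr_scheduler); treat it as an illustration,
# not the trial class's actual method.
def save(self, context: DeepSpeedTrialContext, path: pathlib.Path) -> None:
    self.neox_args.save = str(path)
    save_checkpoint(
        neox_args=self.neox_args,
        iteration=self.neox_args.iteration,
        model=self.model,
        optimizer=self.optimizer,
        lr_scheduler=self.lr_scheduler,
    )
    megatron_utils.print_rank_0(
        f"Saved checkpoint at iteration {self.neox_args.iteration}"
    )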
def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None):
    """Setup model and optimizer."""
    model = get_model(neox_args=neox_args, use_cache=use_cache)
    optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args)
    lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer,
                                               neox_args=neox_args)

    if neox_args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        if neox_args.no_load_optim:
            assert optimizer is None
            _model_params = None
            _lr_scheduler = None
        else:
            _model_params = param_groups if optimizer is None else None
            _lr_scheduler = lr_scheduler

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=neox_args,
            lr_scheduler=_lr_scheduler,
            dist_init_required=False,
            model_parameters=_model_params,
            config_params=neox_args.deepspeed_config,
            mpu=mpu if not neox_args.is_pipe_parallel else None,
        )
        model.total_params = get_total_params(model.module)
        print_rank_0(f' > total params: {"{:,}".format(model.total_params)}')

        if neox_args.is_pipe_parallel:
            model.set_has_attention_mask(True)
            model.set_batch_fn(partial(get_batch_pipe, neox_args=neox_args))
    else:
        raise ValueError("Must be using deepspeed to run neox")

    if neox_args.load is not None:
        neox_args.iteration = load_checkpoint(
            neox_args=neox_args,
            model=model,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            iteration=iteration,
        )
        print_rank_0(
            f"Loading checkpoint and starting from iteration "
            f"{neox_args.iteration}"
        )
    else:
        neox_args.iteration = 0

    return model, optimizer, lr_scheduler
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    return model, optimizer, lr_scheduler
def run_checkpoint_test(yaml_list=None, param_dict=None):
    from megatron.checkpointing import load_checkpoint
    from megatron.checkpointing import save_checkpoint

    model, optimizer, lr_scheduler, args_loaded = model_setup(
        yaml_list, param_dict, clear_data=True)

    # Save model checkpoint.
    save_checkpoint(
        neox_args=args_loaded,
        iteration=42,
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
    )

    # Reload model from checkpoint.
    (
        reloaded_model,
        reloaded_optimizer,
        reloaded_lr_scheduler,
        args_reloaded,
    ) = model_setup(yaml_list, param_dict, clear_data=False)
    iteration = load_checkpoint(
        neox_args=args_reloaded,
        model=reloaded_model,
        optimizer=reloaded_optimizer,
        lr_scheduler=reloaded_lr_scheduler,
    )

    # Ensure the same checkpoint is loaded.
    assert iteration == 42, \
        "run_checkpoint_test() iteration loaded from checkpoint correct"

    # Check all weight groups are the same.
    for idx, ((n1, p1), (n2, p2)) in enumerate(
            zip(
                list(model.module.named_parameters()),
                list(reloaded_model.module.named_parameters()),
            )):
        assert n1 == n2
        params_equal = (p1 == p2).all().item()
        assert params_equal, "run_checkpoint_test() params equal: " + str(n1)
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)

    unwrapped_model = model
    while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16Module)):
        unwrapped_model = unwrapped_model.module
    optimizer = get_megatron_optimizer(unwrapped_model)

    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.load is not None:
        timers = get_timers()
        # Extra barrier is added to make sure all ranks report the
        # max time.
        torch.distributed.barrier()
        timers('load checkpoint').start()
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
        torch.distributed.barrier()
        timers('load checkpoint').stop()
        timers.log(['load checkpoint'])
    else:
        args.iteration = 0

    # We only support local DDP with multiple micro-batches.
    if get_num_microbatches() > 1:
        assert args.DDP_impl == 'local'

    # Get the model without FP16 and/or TorchDDP wrappers.
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    if args.iteration == 0 and hasattr(unwrapped_model,
                                       'init_state_dict_from_bert'):
        print("Initializing ICT from pretrained BERT model", flush=True)
        unwrapped_model.init_state_dict_from_bert()

    return model, optimizer, lr_scheduler
def main():
    """Main program."""
    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

    # Set up model and load checkpoint.
    model = get_model(model_provider)
    args = get_args()

    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Generate samples.
    if args.num_samples == 0:
        args.batch_size = 1
        if args.sample_input_file != "":
            generate_samples_input_from_file(model)
        else:
            generate_samples_interactive(model)
    else:
        generate_and_write_samples_unconditional(model)
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    unwrapped_model = unwrap_model(model, (torchDDP, LocalDDP, Float16Module))
    optimizer = get_megatron_optimizer(unwrapped_model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.load is not None:
        timers = get_timers()
        # Extra barrier is added to make sure all ranks report the
        # max time.
        torch.distributed.barrier()
        timers('load-checkpoint').start()
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
        torch.distributed.barrier()
        timers('load-checkpoint').stop()
        timers.log(['load-checkpoint'])
    else:
        args.iteration = 0

    # We only support local DDP with multiple micro-batches.
    if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1:
        assert args.DDP_impl == 'local'

    if args.iteration == 0 and len(unwrapped_model) == 1 \
            and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'):
        print_rank_0("Initializing ICT from pretrained BERT model")
        unwrapped_model[0].init_state_dict_from_bert()
        if args.fp16:
            optimizer.reload_model_params()

    return model, optimizer, lr_scheduler
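# unwrap_model() above replaces the hasattr(..., 'module') loop used in the
# earlier variants. A minimal sketch of such a helper, under the assumption
# that it accepts either a single module or a list of pipeline-stage
# modules (which is why len(unwrapped_model) is checked above):
def unwrap_model_sketch(model, module_instances):
    return_list = True
    if not isinstance(model, list):
        model = [model]
        return_list = False
    unwrapped = []
    for m in model:
        # Peel wrappers (e.g. DDP, Float16Module) until a bare module remains.
        while isinstance(m, module_instances):
            m = m.module
        unwrapped.append(m)
    if not return_list:
        return unwrapped[0]
    return unwrapped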
def main():
    args = get_args()

    if args.api_prompt:
        # Obtain the generations by calling the API.
        generate_samples_by_calling_api()
        return

    if args.num_layers_per_virtual_pipeline_stage is not None:
        print("Interleaved pipeline schedule is not yet supported "
              "for text generation.")
        exit()

    # Set up model and load checkpoint.
    model = get_model(model_provider, wrap_with_ddp=False)
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # Perform the prompting.
    generate_samples_by_prompting_input_from_file(model)
def main():
    # Arguments do sanity checks on the world size, but we don't care,
    # so trick it into thinking we have plenty of processes.
    os.environ["WORLD_SIZE"] = f'{2**31}'

    # Args
    set_global_variables(extra_args_provider=get_mp_merge_args,
                         args_defaults={'use_cpu_initialization': True,
                                        'micro_batch_size': 1,
                                        'no_load_optim': True,
                                        'no_load_rng': True,
                                        'no_save_optim': True,
                                        'no_save_rng': True,
                                        'save_interval': 1})
    args = get_args()

    if args.pipeline_model_parallel_size > 1:
        print("Checkpoints with pipeline model parallelism are not "
              "currently supported.")
        exit()

    model_type = args.model_type
    orig_tensor_model_parallel_size = args.tensor_model_parallel_size
    args.tensor_model_parallel_size = 1
    tokenizer = rebuild_tokenizer(args)

    print('\n merging model parallel partitions ...')
    print(' > number of partitions: {}'.format(
        orig_tensor_model_parallel_size))
    print(' > checkpoint path: {}'.format(args.load))
    print(' > model parameters:')
    print('    number of tokens ................ {} '.format(
        tokenizer.vocab_size))
    print('    number of layers ................ {}'.format(args.num_layers))
    print('    hidden size ..................... {}'.format(args.hidden_size))
    print('    number of attention heads ....... {}'.format(
        args.num_attention_heads))
    print('    maximum position embeddings ..... {}'.format(
        args.max_position_embeddings))

    # Full model.
    print('> building the full model ...')
    mpu.initialize.set_tensor_model_parallel_world_size(1)
    mpu.initialize.set_tensor_model_parallel_rank(0)
    mpu.initialize.set_pipeline_model_parallel_world_size(1)
    mpu.initialize.set_pipeline_model_parallel_rank(0)
    merged_model = get_model(model_type)

    # Build and load partitions.
    partitions = []
    iteration = 0
    args.tensor_model_parallel_size = orig_tensor_model_parallel_size
    tokenizer = rebuild_tokenizer(args)
    mpu.initialize.set_tensor_model_parallel_world_size(
        args.tensor_model_parallel_size)
    for rank in range(args.tensor_model_parallel_size):
        # Reset these since load_checkpoint asserts they are 0, but we are
        # loading multiple checkpoints in the same process and they get set
        # each time.
        args.consumed_train_samples = 0
        args.consumed_valid_samples = 0
        mpu.initialize.set_tensor_model_parallel_rank(rank)
        checkpoint_name, iteration = get_parallel_checkpoint_name(args.load)
        model_ = get_model(model_type)
        print(f'> loading {checkpoint_name} ...')
        load_checkpoint(model_, None, None)
        print(f'> checkpoint version {get_checkpoint_version()}')
        partitions.append(model_)

    # Parameter generators so we can loop through them simultaneously.
    merged_params_gen = merged_model.named_parameters()
    partitions_params_gen = [partition.named_parameters()
                             for partition in partitions]
    while True:
        try:
            # Get the params and check names.
            name, merged_param = next(merged_params_gen)
            print(' > working on {} ...'.format(name))
            print('     merged         type: {}, size: {}'.format(
                merged_param.dtype, list(merged_param.size())))
            partitions_param = []
            for rank, partition_params_gen in enumerate(
                    partitions_params_gen):
                partition_name, partition_param = next(partition_params_gen)
                assert partition_name == name
                partitions_param.append(partition_param)
                print('     partition {}    type: {}, size: {}'.format(
                    rank, partition_param.dtype,
                    list(partition_param.size())))

            # For the non-parallel parameters, simply copy the rank 0 values.
            if not hasattr(merged_param, 'tensor_model_parallel'):
                print('     non-parallel parameter, simple copy from rank 0')
                with torch.no_grad():
                    merged_param.data.copy_(partitions_param[0].data)
            # For parallel parameters, merge the values.
            else:
                dim = merged_param.partition_dim
                stride = merged_param.partition_stride
                print(f'     parallel parameter merge with stride {stride} '
                      f'along dimension {dim}')
                merge_partitions(merged_param, partitions_param, dim, stride)

        except StopIteration:
            break

    partitions = []
    args.tensor_model_parallel_size = 1
    args.pipeline_model_parallel_size = \
        args.target_pipeline_model_parallel_size

    assert args.num_layers % args.pipeline_model_parallel_size == 0, \
        'num_layers must be divisible by target pipeline model parallel size'
    layers_per_part = args.num_layers // args.pipeline_model_parallel_size

    tokenizer = rebuild_tokenizer(args)
    mpu.initialize.set_tensor_model_parallel_world_size(
        args.tensor_model_parallel_size)
    mpu.initialize.set_tensor_model_parallel_rank(0)
    mpu.initialize.set_pipeline_model_parallel_world_size(
        args.pipeline_model_parallel_size)

    # Regex to parse out the layer number from a parameter name.
    layer_re = re.compile(r'layers\.([0-9]+)')

    if args.pipeline_model_parallel_size > 1:
        merged_params = {}
        for name, merged_param in merged_model.named_parameters():
            merged_params[name] = merged_param

        for rank in range(args.pipeline_model_parallel_size):
            mpu.initialize.set_pipeline_model_parallel_rank(rank)
            model = get_model(model_type)

            def update_layer_num(m):
                # TODO! This assumes no interleaved pipeline execution.
                layer = int(m.group(1))
                layer += rank * layers_per_part
                return f'layers.{layer}'

            for dst_name, partition_param in model.named_parameters():
                if dst_name == "word_embeddings.weight":
                    # See comment in
                    # MegatronModule.initialize_word_embeddings().
                    src_name = \
                        "language_model.embedding.word_embeddings.weight"
                else:
                    # Translate the destination layer number (0-N for each
                    # partition) to the source layer number (single-model
                    # layer number).
                    src_name = re.sub(layer_re, update_layer_num, dst_name)
                print(f" > copying {src_name} to {dst_name} "
                      f"in rank {rank}'s model")
                partition_param.data.copy_(merged_params[src_name].data)

            partitions.append(model)
    else:
        partitions = [merged_model]

    for rank, model in enumerate(partitions):
        mpu.initialize.set_pipeline_model_parallel_rank(rank)
        print(f"> saving rank {rank}'s model")
        save_checkpoint(iteration, model, None, None)

    print('done :-)')
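# A hedged sketch of the merge step invoked above. For the common stride-1
# case, merging tensor-parallel partitions is just concatenation along the
# partition dimension; the real merge_partitions() also handles strided
# layouts, which this illustration deliberately omits.
def merge_partitions_sketch(merged_param, partitions_param, dim, stride):
    assert stride == 1, "illustration covers only the stride-1 case"
    with torch.no_grad():
        # Concatenate each rank's shard along the split dimension and copy
        # the result into the pre-allocated full-size parameter.
        merged = torch.cat(partitions_param, dim=dim)
        assert merged.size() == merged_param.size()
        merged_param.data.copy_(merged)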
def finetune(train_valid_datasets_provider, model_provider,
             forward_step=_cross_entropy_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    args = get_args()
    timers = get_timers()

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    if args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider()
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset)
    timers('train/valid/test dataset/dataloader').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback = None
    if end_of_epoch_callback_provider is not None:
        end_of_epoch_callback = end_of_epoch_callback_provider()
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # If a pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.iteration == 0 and args.pretrained_checkpoint is not None:
        original_load = args.load
        args.load = args.pretrained_checkpoint
        _ = load_checkpoint(model, None, None)
        args.load = original_load
        # This is critical when only the model is loaded. We should make
        # sure master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['train/valid/test dataset/dataloader', 'callback function',
                'model and optimizer', 'pretrained checkpoint'])
    print_rank_0('training ...')

    # Finetune the model.
    if args.epochs > 0:
        _train(model, optimizer, lr_scheduler, forward_step,
               train_dataloader, valid_dataloader, end_of_epoch_callback)
    # Or just evaluate.
    else:
        if end_of_epoch_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            end_of_epoch_callback(model, epoch=-1, output_predictions=True)

    print_rank_0('done :-)')
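# A hedged usage sketch: Megatron-style task scripts typically define the
# two providers and delegate everything else to finetune() above. The
# build_my_* helpers below are hypothetical placeholders, not functions
# from the original source.
def example_task_main():
    def train_valid_datasets_provider():
        # Return (train_dataset, valid_dataset) for the task.
        return build_my_train_dataset(), build_my_valid_dataset()

    def model_provider():
        # Return the bare model; wrapping happens inside get_model().
        return build_my_classification_model()

    finetune(train_valid_datasets_provider, model_provider,
             end_of_epoch_callback_provider=None)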
def main():
    """Main program."""
    import random
    import jsonlines

    drmode = 0
    mode = 0
    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

    # Set up model and load checkpoint.
    model = get_model(model_provider)
    args = get_args()
    tokenizer = get_tokenizer()
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Pick the input file and output directory based on the mode flags.
    if drmode == 1:
        f = open("questions.txt", 'r')
        out_dir = "qa_345M" if mode == 0 else "qa_345M_ip"
    else:
        f = open("para.txt", 'r')
        out_dir = "pa_345M" if mode == 0 else "pa_345M_ip"

    question_list = f.readlines()
    f.close()

    if out_dir not in os.listdir():
        os.mkdir(out_dir)

    # Generate samples: pick a random question that has not been answered
    # yet and write the generation to a .jsonl file named after it.
    while True:
        question = random.choice(question_list)
        lts = question[:20] + '.jsonl'
        if lts in os.listdir(out_dir):
            continue
        # str = generate_token_tensor(str, tokenizer)

        if mode == 0:
            output_string = generate_one_text(
                model, tokenizer, args, question)
            print(question, output_string)
            answer = output_string
        else:
            output_string, output_scores = generate_string(
                model, tokenizer, args, question)
            ranklist = np.argsort(output_scores)
            answer = output_string[ranklist[0]]

        with jsonlines.open(out_dir + "/" + lts, mode='w') as writer:
            writer.write({'question': question, 'answer': answer})
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()
    model = get_model(model_provider_func)
    optimizer, param_groups = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    # Determine whether the deepspeed config is a JSON string or a filepath.
    # If it is JSON, load it directly.
    deepspeed_conf = None
    if hasattr(args, 'deepspeed_config'):
        if not os.path.exists(args.deepspeed_config):
            # If it is not a path, try parsing it as a JSON string.
            deepspeed_json_conf = args.deepspeed_config
            if (len(deepspeed_json_conf) > 2
                    and deepspeed_json_conf[0] == "'"
                    and deepspeed_json_conf[-1] == "'"):
                # Remove shell quotes.
                deepspeed_json_conf = deepspeed_json_conf[1:-1]
            try:
                deepspeed_conf = json.loads(deepspeed_json_conf)
                # Pass directly as a dictionary to deepspeed.
                args.deepspeed_config = None
            except json.JSONDecodeError:
                # Neither a path nor a valid JSON string.
                raise ValueError(
                    f'The parameter `deepspeed_config` is neither a file '
                    f'path that exists nor a JSON string: '
                    f'{args.deepspeed_config}')

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu if args.pipe_parallel_size == 0 else None,
            dist_init_required=False,
            model_parameters=param_groups if optimizer is None else None,
            config_params=deepspeed_conf,
        )
        model.total_params = get_total_params(model.module)
        print_rank_0(f' > total params: {"{:,}".format(model.total_params)}')
        if args.pipe_parallel_size > 0:
            model.set_batch_fn(model.module._megatron_batch_fn)
    else:
        raise ValueError("Must be using deepspeed to run neox")

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # Get the model without FP16 and/or TorchDDP wrappers.
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    return model, optimizer, lr_scheduler
def main():
    """Main program."""
    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
                                       'no_load_rng': True,
                                       'no_load_optim': True})

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print("Interleaved pipeline schedule is not yet supported "
              "for text generation.")
        exit()

    # Set up model and load checkpoint.
    model = get_model(model_provider, wrap_with_ddp=False)

    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    if mpu.is_pipeline_first_stage() and \
            mpu.get_tensor_model_parallel_rank() == 0:
        server = MegatronServer(model)
        server.run("0.0.0.0")

    while True:
        choice = torch.cuda.LongTensor(1)
        torch.distributed.broadcast(choice, 0)
        if choice[0].item() == 0:
            generate_and_post_process(model)
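# A hedged client-side sketch for the server started above. The endpoint
# path '/api', the default port 5000, and the JSON fields 'prompts' and
# 'tokens_to_generate' follow the usual Megatron text-generation server
# convention, but treat all of them as assumptions and check the
# MegatronServer implementation you actually run.
import requests

def query_server_sketch(host="localhost", port=5000):
    response = requests.put(
        f"http://{host}:{port}/api",
        json={"prompts": ["Hello, my name is"], "tokens_to_generate": 32},
        headers={"Content-Type": "application/json"},
    )
    return response.json()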