def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    model = DDP(model)

    return model
def save_checkpoint(iteration, model, optimizer, lr_scheduler, args, deepspeed=False):
    """Save a model checkpoint."""
    if deepspeed:
        save_ds_checkpoint(iteration, model, args)
    else:
        # Only rank zero of each data parallel group writes to disk.
        if isinstance(model, torchDDP):
            model = model.module

        if mpu.get_data_parallel_rank() == 0:
            checkpoint_name = get_checkpoint_name(args.save, iteration)
            print('global rank {} is saving checkpoint at iteration {:7d} to {}'.format(
                torch.distributed.get_rank(), iteration, checkpoint_name))

            sd = {}
            sd['iteration'] = iteration
            sd['model'] = model.state_dict()

            # Optimizer stuff.
            if not args.no_save_optim:
                if optimizer is not None:
                    sd['optimizer'] = optimizer.state_dict()
                if lr_scheduler is not None:
                    sd['lr_scheduler'] = lr_scheduler.state_dict()

            # rng states.
            if not args.no_save_rng:
                sd['random_rng_state'] = random.getstate()
                sd['np_rng_state'] = np.random.get_state()
                sd['torch_rng_state'] = torch.get_rng_state()
                sd['cuda_rng_state'] = torch.cuda.get_rng_state()
                sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()

            ensure_directory_exists(checkpoint_name)
            torch.save(sd, checkpoint_name)
            print(' successfully saved {}'.format(checkpoint_name))

    # Wait so everyone is done (necessary).
    torch.distributed.barrier()
    # And update the latest iteration.
    if torch.distributed.get_rank() == 0:
        tracker_filename = get_checkpoint_tracker_filename(args.save)
        with open(tracker_filename, 'w') as f:
            f.write(str(iteration))
    # Wait so everyone is done (not necessary).
    torch.distributed.barrier()
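# save_checkpoint relies on two small helpers that are not defined in this listing.
# The sketch below is only an assumption of what they do, following the usual
# Megatron-LM convention (create the checkpoint directory, and keep a small text
# file recording the latest saved iteration); it is not this project's verified code.
import os


def ensure_directory_exists(filename):
    """Create the parent directory of `filename` if it does not exist yet (assumed behavior)."""
    dirname = os.path.dirname(filename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)


def get_checkpoint_tracker_filename(checkpoints_path):
    """Path of the tracker file storing the latest checkpointed iteration (assumed name)."""
    return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt')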
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')

    assert args.num_attention_heads % args.model_parallel_size == 0
    num_local_heads = args.num_attention_heads // args.model_parallel_size
    deepspeed_sparsity_config = None
    if DEEPSPEED_WRAP and args.deepspeed:
        deepspeed_sparsity_config = get_sparse_attention_config(args, num_local_heads)
    if deepspeed_sparsity_config is not None:
        print_rank_0(f"Using sparse attention with mode {args.sparse_mode}")

    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=args.sparse_mode)

    if args.load_huggingface is not None:
        model = load_huggingface_model(model, args.load_huggingface,
                                       args.huggingface_double_pos_embeddings)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision.
    if DEEPSPEED_WRAP and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model,
                    device_ids=[i],
                    output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
def get_checkpoint_name(checkpoints_path, iteration, release=False, zero=False):
    if release:
        d = 'release'
    else:
        d = 'iter_{:07d}'.format(iteration)
    if zero:
        dp_rank = mpu.get_data_parallel_rank()
        d += '_zero_dp_rank_{}'.format(dp_rank)
    return os.path.join(checkpoints_path, d,
                        'mp_rank_{:02d}'.format(mpu.get_model_parallel_rank()),
                        'model_optim_rng.pt')
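# Usage sketch (hypothetical values, for illustration only): with iteration=150000,
# model parallel rank 1 and zero=False, get_checkpoint_name resolves to
#   <checkpoints_path>/iter_0150000/mp_rank_01/model_optim_rng.pt
# and with zero=True on data parallel rank 3 it becomes
#   <checkpoints_path>/iter_0150000_zero_dp_rank_3/mp_rank_01/model_optim_rng.pt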
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    print("Calling GPT3Model constructor...")
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    print(f"placing the model on device {torch.cuda.current_device()}")
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print("we have NOT halved the model before, and now we're wrapping it into an FP16_Module. For...some reason...")
        model = FP16_Module(model)

    # Wrap model for distributed training.
    print("Setting up distributed training...")
    print("No classic pytorch DDP this time; \nusing sberbank magic DDP")
    model = DDP(model)

    input("ready to return model")
    return model
def make_gpt3_dataloaders(args):
    # Data parallel arguments.
    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    # global_batch_size = args.batch_size * world_size
    num_workers = args.num_workers

    # data_dir = args.train_data_path if args.train_data_path else os.path.dirname(args.test_data_path)
    tokenizer_path = args.load_huggingface if args.load_huggingface else \
        (args.tokenizer_path if args.tokenizer_path
         else os.path.join(os.path.dirname(args.train_data_path), '_tokenizer/'))
    print_rank_0('Load tokenizer from ' + tokenizer_path)
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    tokenizer.add_special_tokens({"bos_token": "<s>"})
    tokenizer.add_special_tokens({"eos_token": "</s>"})
    print("Add answer_sep:", args.answer_sep)
    tokenizer.add_tokens(args.answer_sep)
    print("Add start_sep:", args.start_sep)
    tokenizer.add_tokens(args.start_sep)
    print("Add end_sep:", args.end_sep)
    tokenizer.add_tokens(args.end_sep)

    eod_token = tokenizer.encoder['<pad>']
    num_tokens = len(tokenizer)

    train_dataset_args = RuGpt3DatasetArguments(
        block_size=args.seq_length,
        max_files_load=args.max_files_per_process,
        overwrite_cache=args.overwrite_cache,
        tqdm=False)
    eval_dataset_args = RuGpt3DatasetArguments(
        block_size=args.seq_length,
        max_files_load=args.max_files_per_process,
        overwrite_cache=args.overwrite_cache,
        tqdm=True)

    def make_data_loader_(data_path, dataset_args):
        print_rank_0(
            f'Load RuGPT3 Dataset from {data_path}, {dataset_args.max_files_load} files per process'
        )
        dataset = RuGpt3TextDataset(
            tokenizer=tokenizer,
            args=dataset_args,
            rank=rank,
            world_size=world_size,
            file_path=data_path,
            # cache_prefix=args.cache_prefix
            all_args=args)
        # Use a simple sampler with distributed batch sampler.
        sampler = torch.utils.data.SequentialSampler(dataset)
        batch_sampler = ResumableBatchSampler(sampler=sampler,
                                              batch_size=args.batch_size,
                                              drop_last=True)
        return InfiniteDataLoader(dataset,
                                  batch_sampler=batch_sampler,
                                  num_workers=num_workers,
                                  pin_memory=True)

    train = make_data_loader_(args.train_data_path, train_dataset_args) if args.train_data_path else None
    valid = make_data_loader_(args.val_data_path, eval_dataset_args) if args.val_data_path else None
    test = make_data_loader_(args.test_data_path, eval_dataset_args) if args.test_data_path else None

    args.do_train = train is not None
    args.do_valid = valid is not None
    args.do_test = test is not None

    return (train, valid, test), num_tokens, eod_token, tokenizer
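# Usage sketch (assumed calling convention; variable names are illustrative only):
#
#   (train_loader, valid_loader, test_loader), num_tokens, eod_token, tokenizer = \
#       make_gpt3_dataloaders(args)
#   if args.do_train:
#       batch = next(iter(train_loader))  # InfiniteDataLoader presumably cycles over the dataset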
def load_checkpoint(model, optimizer, lr_scheduler, args, deepspeed=False):
    """Load a model checkpoint."""
    print(f"load_checkpoint has deepspeed {deepspeed}")

    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if deepspeed:
        print("loading a deepspeed checkpoint...")
        thingy = torch.cuda.memory_summary(device=torch.cuda.current_device(), abbreviated=False)
        print(f"reporting memory usage:{thingy}")

        print("Attempting to free some memory...")
        try:
            torch.cuda.empty_cache()
        except Exception as e:
            print(f"Something went wrong: {e}")
        print("Trying to collect garbage:")
        try:
            import gc
            gc.collect()
        except Exception as e:
            print(f"Something went wrong: {e}")
        thingy = torch.cuda.memory_summary(device=torch.cuda.current_device(), abbreviated=False)
        print(f"We've tried to free some memory, \nand now are reporting memory usage:{thingy}")

        # load_optim = not args.no_load_optim
        print("Oh, screw it, let's not load an optimizer...")
        load_optim = False

        checkpoint_name, sd = model.load_checkpoint(
            args.load,
            iteration,
            load_optimizer_states=load_optim,
            load_lr_scheduler_states=load_optim)

        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print("Unable to load checkpoint.")
            return iteration

    else:
        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        sd = torch.load(checkpoint_name, map_location='cpu')

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            print_rank_0('A metadata file exists but unable to load model '
                         'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0('Unable to load optimizer from checkpoint {}, exiting. '
                             'Specify --no-load-optim or --finetune to prevent '
                             'attempting to load the optimizer '
                             'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints.
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0('A metadata file exists but unable to load iteration '
                             'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0('Unable to load random state from checkpoint {}, exiting. '
                         'Specify --no-load-rng or --finetune to prevent '
                         'attempting to load the random '
                         'state.'.format(checkpoint_name))
            exit()

    torch.distributed.barrier()
    if mpu.get_data_parallel_rank() == 0:
        print(' successfully loaded {}'.format(checkpoint_name))

    return iteration
def load_checkpoint(model, optimizer, lr_scheduler, args, deepspeed=False):
    """Load a model checkpoint."""

    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if deepspeed:
        load_optim = not args.no_load_optim

        checkpoint_name, sd = model.load_checkpoint(
            args.load,
            iteration,
            load_optimizer_states=load_optim,
            load_lr_scheduler_states=load_optim)

        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print("Unable to load checkpoint.")
            return iteration

    else:
        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        sd = torch.load(checkpoint_name, map_location='cpu')

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            print_rank_0('A metadata file exists but unable to load model '
                         'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0('Unable to load optimizer from checkpoint {}, exiting. '
                             'Specify --no-load-optim or --finetune to prevent '
                             'attempting to load the optimizer '
                             'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints.
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0('A metadata file exists but unable to load iteration '
                             'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0('Unable to load random state from checkpoint {}, exiting. '
                         'Specify --no-load-rng or --finetune to prevent '
                         'attempting to load the random '
                         'state.'.format(checkpoint_name))
            exit()

    torch.distributed.barrier()
    if mpu.get_data_parallel_rank() == 0:
        print(' successfully loaded {}'.format(checkpoint_name))

    return iteration
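# Usage sketch (assumed call order in the training entry point; names are illustrative):
#
#   model = get_model(args)
#   # optimizer and lr_scheduler are built elsewhere in the training script
#   args.iteration = load_checkpoint(model, optimizer, lr_scheduler, args,
#                                    deepspeed=DEEPSPEED_WRAP and args.deepspeed)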
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')

    print("asserting we have a correct number of attention heads...")
    assert args.num_attention_heads % args.model_parallel_size == 0
    num_local_heads = args.num_attention_heads // args.model_parallel_size
    deepspeed_sparsity_config = None
    if DEEPSPEED_WRAP and args.deepspeed:
        print("we're using deepspeed, and so we're getting a sparse attention config")
        deepspeed_sparsity_config = get_sparse_attention_config(args, num_local_heads)
    if deepspeed_sparsity_config is not None:
        print_rank_0(f"Using sparse attention with mode {args.sparse_mode}")

    print("Calling GPT3Model constructor...")
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=args.sparse_mode)

    if args.load_huggingface is not None:
        print("Loading huggingface model...")
        model = load_huggingface_model(model, args.load_huggingface,
                                       args.huggingface_double_pos_embeddings)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision.
    if DEEPSPEED_WRAP and args.deepspeed and args.fp16:
        print("We have deepspeed AND fp16, so we're halving the model...")
        model.half()

    # GPU allocation.
    print(f"placing the model on device {torch.cuda.current_device()}")
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print("we've halved the model before, but now we're wrapping it into an FP16_Module. For...some reason...")
        model = FP16_Module(model)

    # Wrap model for distributed training.
    print("Setting up distributed training...")
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        print(f"Using classic pytorch DDP with device {i}")
        model = DDP(model,
                    device_ids=[i],
                    output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        print("Using sberbank magic DDP")
        model = DDP(model)

    # input("ready to return model")
    print("ready to return model")
    return model