def generate_samples(model, tokenizer, args):
    print(f"generate_samples was called with model {model} \n and tokenizer {tokenizer}")
    model.eval()
    with torch.no_grad():
        while True:
            torch.distributed.barrier(group=mpu.get_model_parallel_group())
            terminate_runs = 0
            print(f"terminate_runs = {terminate_runs}")

            if mpu.get_model_parallel_rank() == 0:
                print("get_model_parallel_rank() was 0")
                # raw_text = input("\nContext prompt (stop to exit) >>> ")
                raw_text = "localStorage.getItem("
                while not raw_text:
                    print('Prompt should not be empty!')
                    raw_text = input("\nContext prompt (stop to exit) >>> ")

                if "stop" in raw_text:
                    terminate_runs = 1
                else:
                    context_tokens = tokenizer(raw_text)['input_ids']
                    context_length = len(context_tokens)

                    if context_length >= args.seq_length // 2:
                        print("\nContext length", context_length,
                              "\nPlease give smaller context (half of the sequence length)!")
                        continue
            else:
                print(f"get_model_parallel_rank() was NOT 0 but {mpu.get_model_parallel_rank()}")
                _ = tokenizer("EMPTY TEXT")['input_ids']

            terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
            torch.distributed.broadcast(terminate_runs_tensor,
                                        mpu.get_model_parallel_src_rank(),
                                        group=mpu.get_model_parallel_group())
            terminate_runs = terminate_runs_tensor[0].item()

            if terminate_runs == 1:
                return

            start_time = time.time()
            print("generating...")
            generated = generate(model, tokenizer, raw_text,
                                 out_seq_length=args.out_seq_length,
                                 seq_length=args.seq_length,
                                 temperature=args.temperature,
                                 top_k=args.top_k,
                                 top_p=args.top_p)

            if mpu.get_model_parallel_rank() == 0:
                print("We should clear the terminal and print results...")
                os.system('clear')
                print("\nTime taken: {:.2f}\n".format(time.time() - start_time), flush=True)
                print("\nContext:", raw_text, flush=True)
                print("\nGPT:", generated, flush=True)

            raw_text = None
            torch.distributed.barrier(group=mpu.get_model_parallel_group())
def generate_samples(model, tokenizer, args):
    model.eval()
    with torch.no_grad():
        while True:
            torch.distributed.barrier(group=mpu.get_model_parallel_group())
            terminate_runs = 0

            if mpu.get_model_parallel_rank() == 0:
                raw_text = input("\nContext prompt (stop to exit) >>> ")
                while not raw_text:
                    print('Prompt should not be empty!')
                    raw_text = input("\nContext prompt (stop to exit) >>> ")

                if "stop" in raw_text:
                    terminate_runs = 1
                else:
                    context_tokens = tokenizer(raw_text)['input_ids']
                    context_length = len(context_tokens)

                    if context_length >= args.seq_length // 2:
                        print("\nContext length", context_length,
                              "\nPlease give smaller context (half of the sequence length)!")
                        # NB: rank 0 restarts the loop here while the other
                        # ranks are already waiting at the broadcast below.
                        continue
            else:
                # Non-source ranks run a dummy tokenizer call so every rank
                # reaches the collective broadcast of terminate_runs.
                _ = tokenizer("EMPTY TEXT")['input_ids']

            terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
            torch.distributed.broadcast(terminate_runs_tensor,
                                        mpu.get_model_parallel_src_rank(),
                                        group=mpu.get_model_parallel_group())
            terminate_runs = terminate_runs_tensor[0].item()

            if terminate_runs == 1:
                return

            start_time = time.time()
            generated = generate(model, tokenizer, raw_text,
                                 out_seq_length=args.out_seq_length,
                                 seq_length=args.seq_length,
                                 temperature=args.temperature,
                                 top_k=args.top_k,
                                 top_p=args.top_p)

            if mpu.get_model_parallel_rank() == 0:
                os.system('clear')
                print("\nTime taken {:.2f}\n".format(time.time() - start_time), flush=True)
                print("\nContext:", raw_text, flush=True)
                print("\nGPT:", generated, flush=True)

            raw_text = None
            torch.distributed.barrier(group=mpu.get_model_parallel_group())
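Both variants above lean on the same synchronization idiom: every rank must reach the collective broadcast of terminate_runs, which is why non-zero ranks run a dummy tokenizer call while rank 0 reads the prompt. A minimal sketch of that flag-broadcast pattern as a hypothetical standalone helper (assumes torch.distributed is already initialized and a CUDA device is set):

import torch
import torch.distributed as dist

def broadcast_flag(flag, src_rank, group=None):
    # Every rank must call this collectively; only src_rank's value survives.
    flag_tensor = torch.cuda.LongTensor([flag])
    dist.broadcast(flag_tensor, src_rank, group=group)
    return flag_tensor[0].item()

# rank 0:      decided = broadcast_flag(terminate_runs, 0)
# other ranks: decided = broadcast_flag(0, 0)   # local value is overwritten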
def get_model(args):
    """Build the model."""
    print_rank_0('building GPT3 model ...')
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    model = DDP(model)

    return model
def get_train_val_test_data(args):
    """Load the data on rank zero and broadcast the number of tokens to all GPUs."""
    (train_data, val_data, test_data) = (None, None, None)

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
        (train_data, val_data, test_data), num_tokens, eod_token, tokenizer = make_gpt3_dataloaders(args)
        # Pad the vocabulary size up to the next multiple of
        # make_vocab_size_divisible_by * model_parallel_world_size.
        before = num_tokens
        after = before
        multiple = args.make_vocab_size_divisible_by * mpu.get_model_parallel_world_size()
        while (after % multiple) != 0:
            after += 1
        print_rank_0('> padded vocab (size: {}) with {} dummy tokens (new size: {})'.format(
            before, after - before, after))
        print_rank_0('> end-of-document token: {}'.format(eod_token))
        token_counts = torch.cuda.LongTensor(
            [after, eod_token, int(args.do_train), int(args.do_valid), int(args.do_test)])
    else:
        tokenizer = None
        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])

    # Broadcast num tokens.
    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    num_tokens = token_counts[0].item()
    eod_token = token_counts[1].item()
    args.do_train = token_counts[2].item()
    args.do_valid = token_counts[3].item()
    args.do_test = token_counts[4].item()

    return train_data, val_data, test_data, num_tokens, eod_token, tokenizer
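The padding loop increments `after` one token at a time; it is equivalent to rounding the vocabulary size up to the next multiple of make_vocab_size_divisible_by * model_parallel_world_size. A closed-form sketch of the same computation, with hypothetical numbers:

def pad_vocab_size(before, multiple):
    # Round up to the next multiple; same result as the while-loop above.
    return ((before + multiple - 1) // multiple) * multiple

# e.g. a 50257-token vocab, divisibility 128, model parallel world size 2:
assert pad_vocab_size(50257, 128 * 2) == 50432   # 175 dummy tokens added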
def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module
    self.data_parallel_group = mpu.get_data_parallel_group()
    src_rank = mpu.get_model_parallel_rank()
    # Synchronize initial parameters across the data parallel group.
    for p in self.module.parameters():
        if torch.is_tensor(p):
            dist.broadcast(p, src_rank, group=self.data_parallel_group)

    def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False):
        if self.needs_reduction:
            self.needs_reduction = False
            # Bucket gradients by tensor type so each bucket can be
            # flattened into a single contiguous buffer.
            buckets = {}
            for name, param in self.module.named_parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                if fp32_allreduce:
                    coalesced = coalesced.float()
                if not no_scale and not reduce_after:
                    coalesced /= dist.get_world_size(group=self.data_parallel_group)
                dist.all_reduce(coalesced, group=self.data_parallel_group)
                torch.cuda.synchronize()
                if not no_scale and reduce_after:
                    coalesced /= dist.get_world_size(group=self.data_parallel_group)
                # Copy the reduced values back into the original grad tensors.
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    self.hook_handles = []
    self.hooks = []
    for param in list(self.module.parameters()):

        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)

        # handle = param.register_hook(allreduce_hook)
        # self.hooks.append(allreduce_hook)
        # self.hook_handles.append(handle)

    self.allreduce_params = allreduce_params
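The core of allreduce_params is the flatten / all-reduce / unflatten pattern: all gradients of one dtype are packed into a single contiguous buffer so the group needs only one collective call per bucket. A minimal standalone sketch of that pattern (hypothetical helper; averages after the reduce, as in the reduce_after=True path above):

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def allreduce_and_average(grads, group=None):
    coalesced = _flatten_dense_tensors(grads)        # one contiguous buffer
    dist.all_reduce(coalesced, group=group)          # single collective per bucket
    coalesced /= dist.get_world_size(group=group)    # average across ranks
    # Scatter the averaged values back into the original grad tensors.
    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
        buf.copy_(synced)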
def get_model(args):
    """Build the model."""
    print_rank_0('building GPT3 model ...')

    assert args.num_attention_heads % args.model_parallel_size == 0
    num_local_heads = args.num_attention_heads // args.model_parallel_size
    deepspeed_sparsity_config = None
    if DEEPSPEED_WRAP and args.deepspeed:
        deepspeed_sparsity_config = get_sparse_attention_config(args, num_local_heads)
    if deepspeed_sparsity_config is not None:
        print_rank_0(f"Use sparse attention with mode {args.sparse_mode}")

    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=args.sparse_mode)

    if args.load_huggingface is not None:
        model = load_huggingface_model(model, args.load_huggingface,
                                       args.huggingface_double_pos_embeddings)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if DEEPSPEED_WRAP and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
def get_checkpoint_name(checkpoints_path, iteration, release=False, zero=False):
    if release:
        d = 'release'
    else:
        d = 'iter_{:07d}'.format(iteration)
    if zero:
        dp_rank = mpu.get_data_parallel_rank()
        d += '_zero_dp_rank_{}'.format(dp_rank)
    return os.path.join(checkpoints_path, d,
                        'mp_rank_{:02d}'.format(mpu.get_model_parallel_rank()),
                        'model_optim_rng.pt')
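For orientation, the directory layout this helper produces, with an illustrative (hypothetical) checkpoint root and model parallel rank 0:

# get_checkpoint_name('/checkpoints', 12345)
#   -> '/checkpoints/iter_0012345/mp_rank_00/model_optim_rng.pt'
# get_checkpoint_name('/checkpoints', 12345, release=True)
#   -> '/checkpoints/release/mp_rank_00/model_optim_rng.pt'
# get_checkpoint_name('/checkpoints', 12345, zero=True)   # data parallel rank 3
#   -> '/checkpoints/iter_0012345_zero_dp_rank_3/mp_rank_00/model_optim_rng.pt'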
def get_model(args):
    """Build the model."""
    print_rank_0('building GPT3 model ...')
    print("Calling GPT3Model constructor...")
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    print(f"placing the model on device {torch.cuda.current_device()}")
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print("we have NOT halved the model before, and now we're wrapping it into a fp16_module. For...some reason...")
        model = FP16_Module(model)

    # Wrap model for distributed training.
    print("Setting up distributed training...")
    print("No classic pytorch DDP this time; \nUsing sberbank magic DDP")
    model = DDP(model)

    input("ready to return model")  # debug pause: blocks here until Enter is pressed
    return model
def get_model(args):
    """Build the model."""
    print_rank_0('building GPT3 model ...')
    print("asserting we have a correct number of attention heads...")
    assert args.num_attention_heads % args.model_parallel_size == 0
    num_local_heads = args.num_attention_heads // args.model_parallel_size
    deepspeed_sparsity_config = None
    if DEEPSPEED_WRAP and args.deepspeed:
        print("we're using deepspeed, and so we're getting a sparse attention config")
        deepspeed_sparsity_config = get_sparse_attention_config(args, num_local_heads)
    if deepspeed_sparsity_config is not None:
        print_rank_0(f"Using sparse attention with mode {args.sparse_mode}")

    print("Calling GPT3Model constructor...")
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=args.sparse_mode)

    if args.load_huggingface is not None:
        print("Loading huggingface model...")
        model = load_huggingface_model(model, args.load_huggingface,
                                       args.huggingface_double_pos_embeddings)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if DEEPSPEED_WRAP and args.deepspeed and args.fp16:
        print("We've had deepspeed AND fp16, so we're halving the model...")
        model.half()

    # GPU allocation.
    print(f"placing the model on device {torch.cuda.current_device()}")
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print("we've halved the model before, but now we're wrapping it into a fp16_module. For...some reason...")
        model = FP16_Module(model)

    # Wrap model for distributed training.
    print("Setting up distributed training...")
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        print(f"Using classic pytorch DDP with device {i}")
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        print("Using sberbank magic DDP")
        model = DDP(model)

    # input("ready to return model")
    print("ready to return model")
    return model
def load_checkpoint(model, optimizer, lr_scheduler, args, deepspeed=False):
    """Load a model checkpoint."""
    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if deepspeed:
        load_optim = not args.no_load_optim
        checkpoint_name, sd = model.load_checkpoint(args.load, iteration,
                                                    load_optimizer_states=load_optim,
                                                    load_lr_scheduler_states=load_optim)

        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print("Unable to load checkpoint.")
            return iteration

    else:
        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        # Load the checkpoint.
        if os.path.isfile(checkpoint_name):
            sd = torch.load(checkpoint_name, map_location='cpu')
        else:
            # Try to load a deepspeed checkpoint with only the megatron states.
            checkpoint_name = os.path.join(
                args.load, str(iteration),
                'mp_rank_{:02d}_model_states.pt'.format(mpu.get_model_parallel_rank()))
            sd = torch.load(checkpoint_name, map_location='cpu')

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            try:
                model.load_state_dict(sd['module'])
            except KeyError:
                print_rank_0('A metadata file exists but unable to load model '
                             'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0('Unable to load optimizer from checkpoint {}, exiting. '
                             'Specify --no-load-optim or --finetune to prevent '
                             'attempting to load the optimizer state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:
                # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0('A metadata file exists but unable to load iteration '
                             'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0('Unable to load random state from checkpoint {}, exiting. '
                         'Specify --no-load-rng or --finetune to prevent '
                         'attempting to load the random state.'.format(checkpoint_name))
            exit()

    torch.distributed.barrier()
    if mpu.get_data_parallel_rank() == 0:
        print('  successfully loaded {}'.format(checkpoint_name))

    return iteration
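load_checkpoint probes several state_dict keys in turn ('model' then 'module', 'iteration' then 'total_iters') to stay compatible with older checkpoint formats. A compact sketch of that lookup pattern as a hypothetical helper (not part of the original code):

def load_first_available(sd, keys, loader):
    # Try each candidate key in order; apply loader to the first one present.
    for key in keys:
        if key in sd:
            loader(sd[key])
            return key
    raise KeyError('none of {} found in checkpoint'.format(keys))

# e.g. load_first_available(sd, ['model', 'module'], model.load_state_dict)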