def make_data_loader(dataset, batch_size, args):
    shuffle = args.shuffle
    if shuffle:
        sampler = data_utils.samplers.RandomSampler(
            dataset, replacement=True,
            num_samples=batch_size * args.train_iters)
    else:
        sampler = torch.utils.data.SequentialSampler(dataset)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    drop_last = distributed
    if distributed:
        batch_sampler = data_utils.samplers.DistributedBatchSampler(
            sampler, batch_size, drop_last, rank, world_size)
    else:
        batch_sampler = torch.utils.data.BatchSampler(sampler, batch_size,
                                                      drop_last)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=batch_sampler,
                                              num_workers=args.num_workers,
                                              pin_memory=True)
    return data_loader
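# --- Hedged usage sketch (not from the source tree): in the non-distributed
# branch above, make_data_loader reduces to torch's stock SequentialSampler +
# BatchSampler. This standalone snippet shows the batching those two produce
# with drop_last=False, which is what the function uses when world_size == 1.
import torch

_ds = torch.utils.data.TensorDataset(torch.arange(10).float())
_sampler = torch.utils.data.SequentialSampler(_ds)
_batch_sampler = torch.utils.data.BatchSampler(_sampler, batch_size=4,
                                               drop_last=False)
print(list(_batch_sampler))  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]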
def allreduce_gradients(self):
    """Reduce gradients across data parallel ranks."""
    # If we have buffers, simply reduce the data in the buffer.
    if self._grad_buffers is not None:
        for _, buffer_ in self._grad_buffers.items():
            buffer_.data /= mpu.get_data_parallel_world_size()
            torch.distributed.all_reduce(
                buffer_.data, group=mpu.get_data_parallel_group())
    else:
        # Otherwise, bucketize and all-reduce.
        buckets = {}
        # Pack the buckets.
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = param.data.type()
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)
                param.main_grad = param.grad

        # For each bucket, all-reduce and copy all-reduced grads.
        for tp in buckets:
            bucket = buckets[tp]
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            coalesced /= mpu.get_data_parallel_world_size()
            torch.distributed.all_reduce(
                coalesced, group=mpu.get_data_parallel_group())
            for buf, synced in zip(
                    grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
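# --- Hedged standalone sketch of the coalesced pattern used above: flatten a
# bucket of grads, pre-scale by world size (so the sum becomes a mean), issue
# one all-reduce per bucket, and copy the results back. Runs on CPU with a
# single-rank gloo group, where all_reduce is a no-op but the data flow is
# identical to the multi-rank case.
import os
import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def demo_bucket_allreduce():
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    if not dist.is_initialized():
        dist.init_process_group("gloo", rank=0, world_size=1)
    grads = [torch.ones(3), torch.full((2, 2), 4.0)]
    coalesced = _flatten_dense_tensors(grads)  # one contiguous buffer
    coalesced /= dist.get_world_size()         # pre-scale for the mean
    dist.all_reduce(coalesced)                 # one collective per bucket
    for buf, synced in zip(grads,
                           _unflatten_dense_tensors(coalesced, grads)):
        buf.copy_(synced)
    print(grads)

if __name__ == "__main__":
    demo_bucket_allreduce()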
def average_losses_across_data_parallel_group(losses):
    """Reduce a tensor of losses across all GPUs."""
    averaged_losses = torch.cat(
        [loss.clone().detach().view(1) for loss in losses])
    torch.distributed.all_reduce(averaged_losses,
                                 group=mpu.get_data_parallel_group())
    averaged_losses = averaged_losses / \
        torch.distributed.get_world_size(group=mpu.get_data_parallel_group())
    return averaged_losses
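# --- Hedged standalone sketch: the same sum-then-divide averaging as above,
# runnable on CPU with a single-rank gloo group (where all_reduce is the
# identity, but the arithmetic mirrors the multi-rank case). Not part of the
# source; mpu is replaced by the default process group.
import os
import torch
import torch.distributed as dist

def demo_average_losses():
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    if not dist.is_initialized():
        dist.init_process_group("gloo", rank=0, world_size=1)
    losses = [torch.tensor(2.0), torch.tensor(4.0)]
    averaged = torch.cat([l.clone().detach().view(1) for l in losses])
    dist.all_reduce(averaged)           # sum across ranks ...
    averaged /= dist.get_world_size()   # ... then divide -> mean
    print(averaged)  # tensor([2., 4.]) with a single rank

if __name__ == "__main__":
    demo_average_losses()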
def get_group_world_size_rank():
    group = mpu.get_data_parallel_group()
    rank = torch.distributed.get_rank(group=group)
    world_size = torch.distributed.get_world_size(group=group)
    return group, rank, world_size
def calculate_correct_answers(model, dataloader, epoch):
    """Calculate correct over total answers."""
    model.eval()
    with torch.no_grad():
        # For all the batches in the dataset.
        total = 0
        correct = 0
        for _, batch in enumerate(dataloader):
            # Run the model forward.
            images, labels = process_batch(batch)
            logits = model(images).contiguous().float()
            # Compute the correct answers.
            predicted = torch.argmax(logits, dim=-1)
            corrects = (predicted == labels).float()
            # Add to the counters.
            total += labels.size(0)
            correct += corrects.sum().item()
    model.train()

    # Reduce across the data parallel ranks.
    unreduced = torch.cuda.LongTensor([correct, total])
    torch.distributed.all_reduce(unreduced,
                                 group=mpu.get_data_parallel_group())
    correct_ans = unreduced[0].item()
    total_count = unreduced[1].item()
    return correct_ans, total_count
def get_model(model_provider_func):
    """Build the model."""
    args = get_args()

    # Build model on cpu.
    model = model_provider_func()

    if args.deepspeed:
        # DeepSpeed handles CUDA, FP16, and DDP components.
        return model

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        model = torchDDP(model, device_ids=[i], output_device=i,
                         process_group=mpu.get_data_parallel_group())
        return model
    if args.DDP_impl == 'local':
        model = LocalDDP(model)
        return model

    raise NotImplementedError('Unknown DDP implementation specified: {}. '
                              'Exiting.'.format(args.DDP_impl))
def init_model_parallel(self, global_rank: int, world_size: int) -> None:
    """Initializes Megatron-LM model parallel if using model parallelism.

    Args:
        global_rank (int): the global process index.
        world_size (int): the total number of GPUs, num_nodes * num_gpus
    """
    app_state = AppState()

    # We initialize megatron-lm model parallel and data parallel groups
    # after initializing DDP with PTL.
    if app_state.model_parallel_size is not None:
        if torch.distributed.is_initialized():
            mpu.initialize_model_parallel(app_state.model_parallel_size)
            app_state.model_parallel_group = mpu.get_model_parallel_group()
            app_state.data_parallel_group = mpu.get_data_parallel_group()
            app_state.model_parallel_rank = \
                mpu.get_tensor_model_parallel_rank()
            app_state.data_parallel_rank = mpu.get_data_parallel_rank()
            app_state.data_parallel_size = mpu.get_data_parallel_world_size()
            logging.info(f'mp_rank: {app_state.model_parallel_rank}')
            logging.info(f'dp_rank: {app_state.data_parallel_rank}')
            # TODO: get random seed from PTL
            # Cast to int: environment variables are strings.
            seed = int(os.environ.get("PL_GLOBAL_SEED", 1234))
            # A random seed must be set for megatron model parallel init.
            _set_random_seed(seed)
def get_model(model_provider_func):
    """Build the model."""
    args = get_args()

    # Build model on cpu.
    model = model_provider_func()

    # Print number of parameters.
    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        model = torchDDP(model, device_ids=[i], output_device=i,
                         process_group=mpu.get_data_parallel_group())
        return model
    if args.DDP_impl == 'local':
        model = LocalDDP(model)
        return model

    raise NotImplementedError('Unknown DDP implementation specified: {}. '
                              'Exiting.'.format(args.DDP_impl))
def init_ddp_connection(self, global_rank: int, world_size: int,
                        is_slurm_managing_tasks: bool = True) -> None:
    """Override for LightningModule DDP initialization.
    Initializes Megatron-LM model parallel if using model parallelism.

    Args:
        global_rank (int): the global process index.
        world_size (int): the total number of GPUs, num_nodes * num_gpus
        is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
    """
    LightningModule.init_ddp_connection(self, global_rank, world_size,
                                        is_slurm_managing_tasks)

    app_state = AppState()

    # We initialize megatron-lm model parallel and data parallel groups
    # after initializing DDP with PTL.
    if app_state.model_parallel_size is not None:
        if app_state.model_parallel_group is None:
            mpu.initialize_model_parallel(app_state.model_parallel_size)
            app_state.model_parallel_group = mpu.get_model_parallel_group()
            app_state.data_parallel_group = mpu.get_data_parallel_group()
            app_state.model_parallel_rank = torch.distributed.get_rank(
                group=app_state.model_parallel_group)
            app_state.data_parallel_rank = torch.distributed.get_rank(
                group=app_state.data_parallel_group)
            logging.info(f'mp_rank: {app_state.model_parallel_rank}')
            logging.info(f'dp_rank: {app_state.data_parallel_rank}')
def get_gpt2_model(args_others, mp_size=1):
    from megatron.model import GPT2Model
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }
    args_defaults.update(args_others)

    # Set "make-vocab-size-divisible-by" to 1 to avoid a word-embedding size
    # change in the resizing test.
    sys.argv.extend([
        '--model-parallel-size', str(mp_size),
        '--make-vocab-size-divisible-by', str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)
    model = GPT2Model(num_tokentypes=0, parallel_output=False)
    model.cuda()

    from torch.nn.parallel.distributed import \
        DistributedDataParallel as torchDDP
    from megatron import mpu
    i = torch.cuda.current_device()
    model = torchDDP(model, device_ids=[i], output_device=i,
                     process_group=mpu.get_data_parallel_group())
    return model
def __init__(self, module):
    from megatron import mpu
    super().__init__(
        module,
        mp_group=mpu.get_model_parallel_group(),
        dp_group=mpu.get_data_parallel_group(),
    )
def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = \
        True if dist._backend == dist.dist_backend.GLOO else False
    self.module = module
    self.data_parallel_group = mpu.get_data_parallel_group()

    def allreduce_params(reduce_after=True, no_scale=False,
                         fp32_allreduce=False):
        if self.needs_reduction:
            self.needs_reduction = False
            # Bucketize parameters by data type (type() returns a string,
            # e.g. 'torch.cuda.HalfTensor').
            buckets = {}
            for name, param in self.module.named_parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                # Keys are type strings, so compare against the string name.
                if 'torch.cuda.HalfTensor' in buckets:
                    print("WARNING: gloo dist backend for half parameters "
                          "may be extremely slow. It is recommended to use "
                          "the NCCL backend in this case.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                if fp32_allreduce:
                    coalesced = coalesced.float()
                if not no_scale and not reduce_after:
                    coalesced /= dist.get_world_size(
                        group=self.data_parallel_group)
                dist.all_reduce(coalesced, group=self.data_parallel_group)
                torch.cuda.synchronize()
                if not no_scale and reduce_after:
                    coalesced /= dist.get_world_size(
                        group=self.data_parallel_group)
                for buf, synced in zip(
                        grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    self.hook_handles = []
    self.hooks = []
    for param in list(self.module.parameters()):

        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)

        # handle = param.register_hook(allreduce_hook)
        # self.hooks.append(allreduce_hook)
        # self.hook_handles.append(handle)
    self.allreduce_params = allreduce_params
def model_provider():
    """Build the model."""
    print_rank_0('building GPT2 model ...')
    with deepspeed.zero.Init(
            data_parallel_group=mpu.get_data_parallel_group(),
            remote_device=get_args().remote_device,
            enabled=get_args().zero_stage == 3):
        model = GPT2Model(num_tokentypes=0, parallel_output=True)
    return model
def calculate_correct_answers(name, model, dataloader, epoch,
                              output_predictions):
    """Calculate correct over total answers and return predictions if
    `output_predictions` is true."""
    start_time = time.time()
    model.eval()
    with torch.no_grad():
        # For all the batches in the dataset.
        total = 0
        correct = 0
        if output_predictions:
            # This option is only possible when data parallel size is 1.
            assert mpu.get_data_parallel_world_size() == 1
            softmaxes = []
            labels = []
            ids = []
        for _, batch in enumerate(dataloader):
            # Run the model forward.
            tokens, types, labels_, attention_mask = process_batch(batch)
            logits = model(tokens, attention_mask, types)
            # Add output predictions.
            if output_predictions:
                softmaxes.extend(
                    torch.nn.Softmax(dim=-1)(
                        logits.float()).data.cpu().numpy().tolist())
                labels.extend(labels_.data.cpu().numpy().tolist())
                ids.extend(batch['uid'].cpu().numpy().tolist())
            # Compute the correct answers.
            predicted = torch.argmax(logits, dim=-1)
            corrects = (predicted == labels_)
            # Add to the counters.
            total += labels_.size(0)
            correct += corrects.sum().item()
    model.train()

    # Reduce.
    unreduced = torch.cuda.LongTensor([correct, total])
    torch.distributed.all_reduce(unreduced,
                                 group=mpu.get_data_parallel_group())

    # Print on screen.
    correct_ans = unreduced[0].item()
    total_count = unreduced[1].item()
    percent = float(correct_ans) * 100.0 / float(total_count)
    elapsed_time = time.time() - start_time
    print_rank_0(' > |epoch: {}| metrics for {}: correct / total '
                 '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format(
                     epoch, name, correct_ans, total_count, percent,
                     elapsed_time))

    if output_predictions:
        return correct_ans, total_count, (softmaxes, labels, ids)
    return correct_ans, total_count
def _dp_gather(self, logits):
    """Gather logits from all data parallel ranks."""
    if logits is not None:
        tensor_list = [torch.zeros_like(logits)
                       for _ in range(self.dp_world_size)]
        torch.distributed.all_gather(tensor_list, logits,
                                     group=mpu.get_data_parallel_group())
        logits = torch.cat(tensor_list, dim=0)
    return logits
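# --- Hedged usage sketch: all_gather in _dp_gather above requires the same
# tensor shape on every data-parallel rank, so a caller has to pad a short
# final micro-batch before gathering. The helper below is illustrative, not
# part of the source; it pads dim 0 of a (batch, ...) tensor with zeros.
def _pad_batch_dim(logits, target_batch_size):
    pad = target_batch_size - logits.size(0)
    if pad > 0:
        filler = logits.new_zeros((pad,) + tuple(logits.shape[1:]))
        logits = torch.cat([logits, filler], dim=0)
    return logits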
def __init__(self, model, forward_step_fn, neox_args, batch_size=None):
    self.cache_hook = base.CacheHook(None)
    self.model = model
    self.neox_args = neox_args
    self.tokenizer = neox_args.tokenizer
    self._device = torch.device(f"cuda:{neox_args.local_rank}")
    self._eot_token_id = neox_args.tokenizer.eod_id
    self._max_length = neox_args.max_position_embeddings // 2
    self._max_gen_toks = 128
    self._vocab_size = neox_args.padded_vocab_size

    # Parallelism args:
    self.is_main = neox_args.rank == 0
    self.is_local_main = neox_args.local_rank == 0
    self.is_model_parallel = neox_args.model_parallel_size > 1
    self.is_pipe_parallel = self.model.is_pipe_parallel
    self.is_data_parallel = self.model.is_data_parallel
    # Only the last stage of the pipeline model will receive the logits.
    self.is_last_stage = (
        True if not self.is_pipe_parallel else model.is_last_stage()
    )
    self.dp_world_size = mpu.get_data_parallel_world_size()
    self.dp_rank = mpu.get_data_parallel_rank()
    self.dp_group = mpu.get_data_parallel_group()
    self.is_mp_rank_0 = mpu.get_model_parallel_rank() == 0

    # Default batch size to bs per gpu * dp size.
    self._batch_size = batch_size or (
        neox_args.batch_size * self.dp_world_size
    )

    # Some utility functions:
    # We need to patch tokenizer methods, because lm_eval uses them internally.
    self.tokenizer.encode = self.tokenizer.tokenize
    self.tokenizer.decode = self.tokenizer.detokenize
    self._forward_step_fn = partial(
        forward_step_fn, neox_args=neox_args, timers=None, return_logits=True
    )
    self.generate = partial(
        generate_samples_from_prompt,
        neox_args=neox_args,
        model=model,
        maximum_tokens=self._max_gen_toks,
        temperature=0.0,
    )
def get_model(model_provider_func):
    """Build the model."""
    args = get_args()

    # Build model on cpu.
    model = model_provider_func()

    # Set tensor model parallel attributes if not set.
    # Only parameters that are already tensor model parallel have these
    # attributes set for them. We should make sure the default attributes
    # are set for all params so the optimizer can use them.
    for param in model.parameters():
        mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param)

    # Print number of parameters.
    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on (tensor, pipeline) '
              'model parallel rank ({}, {}): {}'.format(
                  mpu.get_tensor_model_parallel_rank(),
                  mpu.get_pipeline_model_parallel_rank(),
                  sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16Module(model)

    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        model = torchDDP(model, device_ids=[i], output_device=i,
                         process_group=mpu.get_data_parallel_group())
        return model
    if args.DDP_impl == 'local':
        model = LocalDDP(model)
        return model

    raise NotImplementedError('Unknown DDP implementation specified: {}. '
                              'Exiting.'.format(args.DDP_impl))
def model_provider():
    """Build the model."""
    print_rank_0('building GPT2 model ...')
    see_memory_usage("Before Building Model", force=True)
    with deepspeed.zero.Init(
            data_parallel_group=mpu.get_data_parallel_group(),
            remote_device=get_args().remote_device,
            deepspeed_config=get_args().deepspeed_config,
            enabled=get_args().zero_stage == 3):
        model = GPT2Model(num_tokentypes=0, parallel_output=True)
    see_memory_usage("After Building Model", force=True)

    if mpu.get_data_parallel_rank() == 0:
        billion_params = get_parameters_in_billions(model)
        print(f' > number of parameters on model parallel rank '
              f'{mpu.get_model_parallel_rank()}: '
              f'{round(billion_params, 3)} Billion', flush=True)

    return model
def evaluate(data_loader, model, eval_metric):
    """Evaluation."""
    args = get_args()

    # Turn on evaluation mode, which disables dropout.
    model.eval()

    total_output = 0.0
    with torch.no_grad():
        # For all the batches in the dataset.
        for iteration, batch in enumerate(data_loader):
            if iteration % args.log_interval == 0:
                print_rank_0('> working on iteration: {}'.format(iteration))
            # Forward evaluation.
            output = forward_step(batch, model, eval_metric)
            # Reduce across processes.
            torch.distributed.all_reduce(output,
                                         group=mpu.get_data_parallel_group())
            total_output += output

    return total_output
def get_rng_state():
    """Collect rng state across data parallel ranks."""
    args = get_args()
    rng_state = {
        'random_rng_state': random.getstate(),
        'np_rng_state': np.random.get_state(),
        'torch_rng_state': torch.get_rng_state(),
        'cuda_rng_state': torch.cuda.get_rng_state(),
        'rng_tracker_states': mpu.get_cuda_rng_tracker().get_states()
    }

    rng_state_list = None
    if torch.distributed.is_initialized() and \
            mpu.get_data_parallel_world_size() > 1 and \
            args.data_parallel_random_init:
        rng_state_list = \
            [None for i in range(mpu.get_data_parallel_world_size())]
        torch.distributed.all_gather_object(
            rng_state_list, rng_state,
            group=mpu.get_data_parallel_group())
    else:
        rng_state_list = [rng_state]

    return rng_state_list
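# --- Hedged counterpart sketch (an assumption, not shown in this source):
# restoring the state collected by get_rng_state() for one data-parallel
# rank, modeled on how Megatron-style checkpoint loading consumes such a
# list. Uses the same random/np/torch/mpu modules the surrounding file
# already imports.
def restore_rng_state(rng_state_list, data_parallel_rank=0):
    rng_state = rng_state_list[data_parallel_rank]
    random.setstate(rng_state['random_rng_state'])
    np.random.set_state(rng_state['np_rng_state'])
    torch.set_rng_state(rng_state['torch_rng_state'])
    torch.cuda.set_rng_state(rng_state['cuda_rng_state'])
    # Megatron-specific tracker for model-parallel cuda rng streams.
    mpu.get_cuda_rng_tracker().set_states(rng_state['rng_tracker_states'])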
def get_samples_mapping_(indexed_dataset, data_prefix, num_epochs,
                         max_num_samples, max_seq_length, short_seq_prob,
                         seed, name):
    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping.
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if it does not exist.
    if torch.distributed.get_rank() == 0 and \
            not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))
        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        assert indexed_dataset.sizes.dtype == np.int32
        # Build samples mapping.
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(' > building samples index mapping for {} ...'.format(
            name))
        # First compile and then import.
        from megatron.data import helpers
        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length - 3,  # account for added tokens
            short_seq_prob,
            seed,
            verbose)
        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        print_rank_0(' > saved the index mapping in {}'.format(
            indexmap_filename))
        # Make sure all the ranks have built the mapping.
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))
    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for the model
    # parallel case.
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    torch.distributed.all_reduce(
        counts, group=mpu.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size() //
        torch.distributed.get_world_size(
            group=mpu.get_tensor_model_parallel_group()))

    # Load indexed dataset.
    print_rank_0(' > loading indexed mapping from {}'.format(
        indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename, allow_pickle=True,
                              mmap_mode='r')
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        samples_mapping.shape[0]))

    return samples_mapping
def calculate_correct_answers(name, model, dataloader, epoch,
                              output_predictions):
    """Calculate correct over total answers and return predictions if
    `output_predictions` is true."""
    args = get_args()
    forward_backward_func = get_forward_backward_func()
    start_time = time.time()
    for m in model:
        m.eval()
    saved_micro_batch_size = args.micro_batch_size
    saved_global_batch_size = args.global_batch_size

    ds = dataloader.dataset
    if hasattr(ds, 'sample_multiplier'):
        # If our dataset has a sample_multiplier attribute, then each
        # "sample" from the dataset actually has multiple samples that will
        # collapse into the batch dimension (for example, the RACE dataset
        # has several options per sample), and we need to account for that
        # when setting the micro batch size.
        sample_multiplier = ds.sample_multiplier
    else:
        sample_multiplier = 1
    micro_batch_size_times_data_parallel = \
        args.orig_micro_batch_size * args.data_parallel_size
    num_micro_batches = \
        args.orig_global_batch_size // micro_batch_size_times_data_parallel

    def loss_func(output_predictions, labels, output_tensor):
        logits = output_tensor
        loss_dict = {}
        # Add output predictions.
        if output_predictions:
            assert False
            loss_dict['softmaxes'] = torch.nn.Softmax(dim=-1)(
                logits.float()).data.cpu().numpy().tolist()
            loss_dict['labels'] = labels.data.cpu().numpy().tolist()
            loss_dict['ids'] = batch['uid'].cpu().numpy().tolist()
        # Compute the correct answers.
        predicted = torch.argmax(logits, dim=-1)
        corrects = (predicted == labels)
        # Add to the counters.
        loss_dict['total'] = labels.size(0)
        loss_dict['correct'] = corrects.sum().item()
        return 0, loss_dict

    # Defined inside to capture output_predictions.
    def correct_answers_forward_step(batch, model):
        try:
            batch_ = next(batch)
        except BaseException:
            batch_ = batch
        tokens, types, labels, attention_mask = process_batch(batch_)

        # Forward model.
        args = get_args()
        output_tensor = model(tokens, attention_mask, tokentype_ids=types)

        return output_tensor, partial(loss_func, output_predictions, labels)

    with torch.no_grad():
        # For all the batches in the dataset.
        total = 0
        correct = 0
        if output_predictions:
            # This option is only possible when data parallel size is 1.
            assert mpu.get_data_parallel_world_size() == 1
            softmaxes = []
            labels = []
            ids = []
        for _, batch in enumerate(dataloader):
            # For evaluation-only mode we use drop_last = False to get all
            # the samples, which means we might not have a full batch, so we
            # adjust batch_size here to the actual batch size of the data ...
            actual_batch_size = len(batch['label'])
            # ... applying sample_multiplier if necessary.
            args.micro_batch_size = actual_batch_size * sample_multiplier
            args.global_batch_size = \
                actual_batch_size * sample_multiplier * num_micro_batches
            loss_dicts = forward_backward_func(correct_answers_forward_step,
                                               batch, model,
                                               optimizer=None, timers=None,
                                               forward_only=True)
            for loss_dict in loss_dicts:
                if output_predictions:
                    softmaxes.extend(loss_dict['softmaxes'])
                    labels.extend(loss_dict['labels'])
                    ids.extend(loss_dict['ids'])
                total += loss_dict['total']
                correct += loss_dict['correct']

    for m in model:
        m.train()
    args.micro_batch_size = saved_micro_batch_size
    args.global_batch_size = saved_global_batch_size

    # Reduce.
    if mpu.is_pipeline_last_stage():
        unreduced = torch.cuda.LongTensor([correct, total])
        torch.distributed.all_reduce(unreduced,
                                     group=mpu.get_data_parallel_group())

        # Print on screen.
        correct_ans = unreduced[0].item()
        total_count = unreduced[1].item()
        percent = float(correct_ans) * 100.0 / float(total_count)
        elapsed_time = time.time() - start_time
        print_rank_last(
            ' > |epoch: {}| metrics for {}: correct / total '
            '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format(
                epoch, name, correct_ans, total_count, percent,
                elapsed_time))

        if output_predictions:
            return correct_ans, total_count, (softmaxes, labels, ids)
        return correct_ans, total_count
    if output_predictions:
        return 0, 0, ()
    return 0, 0
def get_model(model_provider_func):
    """Build the model."""
    args = get_args()

    # Build model.
    if mpu.get_pipeline_model_parallel_world_size() > 1 and \
            args.virtual_pipeline_model_parallel_size is not None:
        model = []
        for i in range(args.virtual_pipeline_model_parallel_size):
            mpu.set_virtual_pipeline_model_parallel_rank(i)
            # Set pre_process and post_process only after virtual rank is set.
            pre_process = mpu.is_pipeline_first_stage()
            post_process = mpu.is_pipeline_last_stage()
            this_model = model_provider_func(pre_process=pre_process,
                                             post_process=post_process)
            model.append(this_model)
    else:
        pre_process = mpu.is_pipeline_first_stage()
        post_process = mpu.is_pipeline_last_stage()
        model = model_provider_func(pre_process=pre_process,
                                    post_process=post_process)

    if not isinstance(model, list):
        model = [model]

    # Set tensor model parallel attributes if not set.
    # Only parameters that are already tensor model parallel have these
    # attributes set for them. We should make sure the default attributes
    # are set for all params so the optimizer can use them.
    for model_module in model:
        for param in model_module.parameters():
            mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param)

    # Print number of parameters.
    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on (tensor, pipeline) '
              'model parallel rank ({}, {}): {}'.format(
                  mpu.get_tensor_model_parallel_rank(),
                  mpu.get_pipeline_model_parallel_rank(),
                  sum([sum([p.nelement() for p in model_module.parameters()])
                       for model_module in model])),
              flush=True)

    # GPU allocation.
    for model_module in model:
        model_module.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16 or args.bf16:
        model = [Float16Module(model_module, args) for model_module in model]

    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        model = [torchDDP(model_module, device_ids=[i], output_device=i,
                          process_group=mpu.get_data_parallel_group())
                 for model_module in model]
        return model
    if args.DDP_impl == 'local':
        model = [LocalDDP(model_module,
                          args.accumulate_allreduce_grads_in_fp32,
                          args.use_contiguous_buffers_in_ddp)
                 for model_module in model]
        return model

    raise NotImplementedError('Unknown DDP implementation specified: {}. '
                              'Exiting.'.format(args.DDP_impl))
def make_loaders(args):
    """makes training/val/test"""
    if args.data_loader == 'tfrecords':
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    if args.data_path is not None:
        args.train_data = args.data_path
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
        'lazy': args.data_loader == 'lazy',
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'non_binary_cols': None,
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'tokenizer_type': args.tokenizer_type,
        'tokenizer_model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'parallel_group': mpu.get_data_parallel_group()
    }

    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
    # If optional eval args were set, then replace their
    # equivalent values in the arg dict.
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key

    # Make dataset splits and tokenizer.
    train = None
    valid = None
    test = None

    if args.train_data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer

    # Make validation and test datasets if necessary.
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid, tokenizer = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test, tokenizer = data_utils.make_dataset(**eval_set_args)

    # Wrap datasets with data loaders.
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, batch_size, args)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, eval_batch_size, args)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, eval_batch_size, args)
        args.do_test = True
    else:
        args.do_test = False

    return (train, valid, test), tokenizer
def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder,
              wrap_with_ddp=True):
    """Build the model."""
    args = get_args()
    args.model_type = model_type

    # Build model.
    if mpu.get_pipeline_model_parallel_world_size() > 1 and \
            args.virtual_pipeline_model_parallel_size is not None:
        assert model_type != ModelType.encoder_and_decoder, \
            "Interleaved schedule not supported for model with both " \
            "encoder and decoder"
        model = []
        for i in range(args.virtual_pipeline_model_parallel_size):
            mpu.set_virtual_pipeline_model_parallel_rank(i)
            # Set pre_process and post_process only after virtual rank is set.
            pre_process = mpu.is_pipeline_first_stage()
            post_process = mpu.is_pipeline_last_stage()
            this_model = model_provider_func(pre_process=pre_process,
                                             post_process=post_process)
            this_model.model_type = model_type
            model.append(this_model)
    else:
        pre_process = mpu.is_pipeline_first_stage()
        post_process = mpu.is_pipeline_last_stage()
        add_encoder = True
        add_decoder = True
        if model_type == ModelType.encoder_and_decoder:
            if mpu.get_pipeline_model_parallel_world_size() > 1:
                assert args.pipeline_model_parallel_split_rank is not None, \
                    "Split rank needs to be specified for model with both " \
                    "encoder and decoder"
                rank = mpu.get_pipeline_model_parallel_rank()
                split_rank = args.pipeline_model_parallel_split_rank
                world_size = mpu.get_pipeline_model_parallel_world_size()
                pre_process = rank == 0 or rank == split_rank
                post_process = (rank == (split_rank - 1)) or (
                    rank == (world_size - 1))
                add_encoder = mpu.is_pipeline_stage_before_split()
                add_decoder = mpu.is_pipeline_stage_after_split()
            model = model_provider_func(pre_process=pre_process,
                                        post_process=post_process,
                                        add_encoder=add_encoder,
                                        add_decoder=add_decoder)
        else:
            model = model_provider_func(pre_process=pre_process,
                                        post_process=post_process)
        model.model_type = model_type

    if not isinstance(model, list):
        model = [model]

    # Set tensor model parallel attributes if not set.
    # Only parameters that are already tensor model parallel have these
    # attributes set for them. We should make sure the default attributes
    # are set for all params so the optimizer can use them.
    for model_module in model:
        for param in model_module.parameters():
            mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param)

    # Print number of parameters.
    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on (tensor, pipeline) '
              'model parallel rank ({}, {}): {}'.format(
                  mpu.get_tensor_model_parallel_rank(),
                  mpu.get_pipeline_model_parallel_rank(),
                  sum([sum([p.nelement() for p in model_module.parameters()])
                       for model_module in model])),
              flush=True)

    # GPU allocation.
    for model_module in model:
        model_module.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16 or args.bf16:
        model = [Float16Module(model_module, args) for model_module in model]

    if wrap_with_ddp:
        if args.DDP_impl == 'torch':
            i = torch.cuda.current_device()
            model = [torchDDP(model_module, device_ids=[i], output_device=i,
                              process_group=mpu.get_data_parallel_group())
                     for model_module in model]
        elif args.DDP_impl == 'local':
            model = [LocalDDP(model_module,
                              args.accumulate_allreduce_grads_in_fp32,
                              args.use_contiguous_buffers_in_local_ddp)
                     for model_module in model]
            # Broadcast params from the data parallel src rank to the other
            # data parallel ranks.
            if args.data_parallel_random_init:
                for model_module in model:
                    model_module.broadcast_params()
        else:
            raise NotImplementedError('Unknown DDP implementation specified: '
                                      '{}. Exiting.'.format(args.DDP_impl))

    return model
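# --- Hedged usage sketch: the wrap_with_ddp flag above lets callers build an
# unwrapped (but still fp16/bf16-converted) model, e.g. for inference or
# checkpoint manipulation where no gradient all-reduce is needed.
# model_provider_func is whatever provider the surrounding script defines.
model = get_model(model_provider_func, wrap_with_ddp=False)
unwrapped_model = model[0]  # get_model always returns a list of modules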
def get_block_samples_mapping(block_dataset, title_dataset, data_prefix,
                              num_epochs, max_num_samples, max_seq_length,
                              seed, name, use_one_sent_docs=False):
    """Get samples mapping for a dataset over fixed size blocks. This function
    also requires a dataset of the titles for the source documents since their
    lengths must be taken into account.

    :return: samples_mapping (BlockSamplesMapping)
    """
    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping.
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{}s'.format(seed)
    if use_one_sent_docs:
        indexmap_filename += '_1sentok'
    indexmap_filename += '.npy'

    # Build the indexed mapping if it does not exist.
    if mpu.get_data_parallel_rank() == 0 and \
            not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))
        # Make sure the types match the helpers input types.
        assert block_dataset.doc_idx.dtype == np.int64
        assert block_dataset.sizes.dtype == np.int32
        # Build samples mapping.
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(
            ' > building samples index mapping for {} ...'.format(name))
        from megatron.data import helpers
        mapping_array = helpers.build_blocks_mapping(
            block_dataset.doc_idx,
            block_dataset.sizes,
            title_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length - 3,  # account for added tokens
            seed,
            verbose,
            use_one_sent_docs)
        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, mapping_array, allow_pickle=True)
        print_rank_0(
            ' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping.
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for the model
    # parallel case.
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    assert counts[0].item() == torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())

    # Load indexed dataset.
    print_rank_0(
        ' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    mapping_array = np.load(indexmap_filename, allow_pickle=True,
                            mmap_mode='r')
    samples_mapping = BlockSamplesMapping(mapping_array)
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        mapping_array.shape[0]))

    return samples_mapping
def _build_index_mappings(name, data_prefix, documents, sizes,
                          num_samples, seq_length, seed):
    """Build doc-idx, sample-idx, and shuffle-idx.
    doc-idx: is an array (ordered) of documents to be used in training.
    sample-idx: is the start document index and document offset for each
       training sample.
    shuffle-idx: maps the sample index into a random index into sample-idx.
    """
    # Number of tokens in each epoch and number of required epochs.
    tokens_per_epoch = _num_tokens(documents, sizes)
    num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
    # rng state
    np_rng = np.random.RandomState(seed=seed)

    # Filename of the index mappings.
    _filename = data_prefix
    _filename += '_{}_indexmap'.format(name)
    _filename += '_{}ns'.format(num_samples)
    _filename += '_{}sl'.format(seq_length)
    _filename += '_{}s'.format(seed)
    doc_idx_filename = _filename + '_doc_idx.npy'
    sample_idx_filename = _filename + '_sample_idx.npy'
    shuffle_idx_filename = _filename + '_shuffle_idx.npy'

    # Build the indexed mapping if it does not exist.
    if torch.distributed.get_rank() == 0:
        if (not os.path.isfile(doc_idx_filename)) or \
           (not os.path.isfile(sample_idx_filename)) or \
           (not os.path.isfile(shuffle_idx_filename)):

            print_rank_0(' > WARNING: could not find index map files, '
                         'building the indices on rank 0 ...')

            # For the last epoch, decide whether to include the entire epoch
            # in the global shuffle or not.

            # If we need only one epoch, then separating out the last epoch
            # does not mean anything.
            if num_epochs == 1:
                separate_last_epoch = False
                print(' > only one epoch required, setting '
                      'separate_last_epoch to False', flush=True)
            else:
                # Get the number of samples for the last epoch.
                num_samples_from_epochs_minus_one = (
                    (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
                last_epoch_num_samples = num_samples - \
                    num_samples_from_epochs_minus_one
                assert last_epoch_num_samples >= 0, \
                    'last epoch number of samples should be non-negative.'
                num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
                assert last_epoch_num_samples < (num_samples_per_epoch + 1), \
                    'last epoch number of samples exceeded max value.'
                # If we have less than 80% of the samples for the last epoch,
                # separate out the epoch and treat it differently.
                # Note: the 80% number is just based on common sense and can
                # be adjusted if needed.
                separate_last_epoch = (last_epoch_num_samples <
                                       int(0.80 * num_samples_per_epoch))
                if separate_last_epoch:
                    string = ' > last epoch number of samples ({}) is smaller '\
                        'than 80% of number of samples per epoch ({}), '\
                        'setting separate_last_epoch to True'
                else:
                    string = ' > last epoch number of samples ({}) is larger '\
                        'than 80% of number of samples per epoch ({}), '\
                        'setting separate_last_epoch to False'
                print(string.format(last_epoch_num_samples,
                                    num_samples_per_epoch), flush=True)

            # doc-idx.
            start_time = time.time()
            doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
                                     separate_last_epoch)
            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save doc-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))
            # sample-idx.
            start_time = time.time()
            # Use C++ implementation for speed.
            # First compile and then import.
            from megatron.data import helpers
            assert doc_idx.dtype == np.int32
            assert sizes.dtype == np.int32
            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
                                                  num_epochs,
                                                  tokens_per_epoch)
            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
            #                                num_epochs, tokens_per_epoch)
            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save sample-idx '
                         'mapping (seconds): {:4f}'.format(
                             time.time() - start_time))
            # shuffle-idx.
            start_time = time.time()
            # -1 is due to the data structure used to retrieve the index:
            #    sample i --> [sample_idx[i], sample_idx[i+1])
            if separate_last_epoch:
                num_samples_ = num_samples_from_epochs_minus_one
            else:
                num_samples_ = sample_idx.shape[0] - 1
            shuffle_idx = _build_shuffle_idx(num_samples_,
                                             sample_idx.shape[0] - 1, np_rng)
            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save shuffle-idx '
                         'mapping (seconds): {:4f}'.format(
                             time.time() - start_time))

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for the model
    # parallel case.
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    torch.distributed.all_reduce(
        counts, group=mpu.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size() //
        torch.distributed.get_world_size(
            group=mpu.get_tensor_model_parallel_group()))

    # Load mappings.
    start_time = time.time()
    print_rank_0(' > loading doc-idx mapping from {}'.format(
        doc_idx_filename))
    doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0(' > loading sample-idx mapping from {}'.format(
        sample_idx_filename))
    sample_idx = np.load(sample_idx_filename, allow_pickle=True,
                         mmap_mode='r')
    print_rank_0(' > loading shuffle-idx mapping from {}'.format(
        shuffle_idx_filename))
    shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True,
                          mmap_mode='r')
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        sample_idx.shape[0]))
    print_rank_0('    total number of epochs: {}'.format(num_epochs))

    return doc_idx, sample_idx, shuffle_idx
def _build_index_mappings(name, data_prefix, documents, sizes,
                          num_samples, seq_length, seed):
    """Build doc-idx, sample-idx, and shuffle-idx.
    doc-idx: is an array (ordered) of documents to be used in training.
    sample-idx: is the start document index and document offset for each
       training sample.
    shuffle-idx: maps the sample index into a random index into sample-idx.
    """
    # Number of tokens in each epoch and number of required epochs.
    tokens_per_epoch = _num_tokens(documents, sizes)
    num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
    # rng state
    np_rng = np.random.RandomState(seed=seed)

    # Filename of the index mappings.
    _filename = data_prefix
    _filename += '_{}_indexmap'.format(name)
    _filename += '_{}ns'.format(num_samples)
    _filename += '_{}sl'.format(seq_length)
    _filename += '_{}s'.format(seed)
    doc_idx_filename = _filename + '_doc_idx.npy'
    sample_idx_filename = _filename + '_sample_idx.npy'
    shuffle_idx_filename = _filename + '_shuffle_idx.npy'

    # Build the indexed mapping if it does not exist.
    if torch.distributed.get_rank() == 0:
        if (not os.path.isfile(doc_idx_filename)) or \
           (not os.path.isfile(sample_idx_filename)) or \
           (not os.path.isfile(shuffle_idx_filename)):
            print_rank_0(' > WARNING: could not find index map files, '
                         'building the indices on rank 0 ...')
            # doc-idx.
            start_time = time.time()
            doc_idx = _build_doc_idx(documents, num_epochs, np_rng)
            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save doc-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))
            # sample-idx.
            start_time = time.time()
            # Use C++ implementation for speed.
            # First compile and then import.
            from megatron.data.dataset_utils import compile_helper
            compile_helper()
            from megatron.data import helpers
            assert doc_idx.dtype == np.int32
            assert sizes.dtype == np.int32
            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
                                                  num_epochs,
                                                  tokens_per_epoch)
            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
            #                                num_epochs, tokens_per_epoch)
            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save sample-idx '
                         'mapping (seconds): {:4f}'.format(
                             time.time() - start_time))
            # shuffle-idx.
            start_time = time.time()
            # -1 is due to the data structure used to retrieve the index:
            #    sample i --> [sample_idx[i], sample_idx[i+1])
            shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng)
            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save shuffle-idx '
                         'mapping (seconds): {:4f}'.format(
                             time.time() - start_time))

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for the model
    # parallel case.
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    assert counts[0].item() == torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())

    # Load mappings.
    start_time = time.time()
    print_rank_0(' > loading doc-idx mapping from {}'.format(
        doc_idx_filename))
    doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0(' > loading sample-idx mapping from {}'.format(
        sample_idx_filename))
    sample_idx = np.load(sample_idx_filename, allow_pickle=True,
                         mmap_mode='r')
    print_rank_0(' > loading shuffle-idx mapping from {}'.format(
        shuffle_idx_filename))
    shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True,
                          mmap_mode='r')
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        sample_idx.shape[0]))
    print_rank_0('    total number of epochs: {}'.format(num_epochs))

    return doc_idx, sample_idx, shuffle_idx
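# --- Hedged consumer sketch: how the three mappings built above are
# typically read back (modeled on the dataset __getitem__ that consumes
# them; treat it as an illustration, not part of this source). Sample i is
# first permuted through shuffle_idx; sample_idx[i] and sample_idx[i + 1]
# then give the (document, offset) bounds, and doc_idx resolves document
# positions in epoch order.
def locate_sample(i, doc_idx, sample_idx, shuffle_idx):
    i = shuffle_idx[i]
    doc_index_f, offset_f = sample_idx[i]
    doc_index_l, offset_l = sample_idx[i + 1]
    # Documents spanned by this sample, plus start/end token offsets.
    return doc_idx[doc_index_f:doc_index_l + 1], offset_f, offset_l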
def calculate_correct_answers(name, model, dataloader, epoch,
                              output_predictions):
    """Calculate correct over total answers and return predictions if
    `output_predictions` is true."""
    args = get_args()
    start_time = time.time()
    model.eval()
    saved_batch_size = args.micro_batch_size
    with torch.no_grad():
        # For all the batches in the dataset.
        total = 0
        correct = 0
        if output_predictions:
            # This option is only possible when data parallel size is 1.
            assert mpu.get_data_parallel_world_size() == 1
            softmaxes = []
            labels = []
            ids = []
        for _, batch in enumerate(dataloader):
            # Run the model forward.
            tokens, types, labels_, attention_mask = process_batch(batch)

            # For evaluation-only mode we use drop_last = False to get all
            # the samples, which means we might not have a full batch, so we
            # adjust batch_size here to the actual batch size of the data ...
            actual_batch_size = len(labels_)
            # ... applying sample_multiplier if necessary.
            ds = dataloader.dataset
            if hasattr(ds, 'sample_multiplier'):
                actual_batch_size *= ds.sample_multiplier
            args.micro_batch_size = actual_batch_size

            if not mpu.is_pipeline_first_stage():
                input_tensor, _ = communicate(
                    tensor_send_next=None,
                    tensor_send_prev=None,
                    recv_forward=True,
                    recv_backward=False)
            else:
                input_tensor = None

            # Forward model.
            if mpu.is_pipeline_first_stage():
                assert input_tensor is None
                output_tensor = model(tokens, attention_mask,
                                      tokentype_ids=types)
            else:
                assert input_tensor is not None
                output_tensor = model(input_tensor, attention_mask)

            if mpu.is_pipeline_last_stage():
                logits = output_tensor
                # Add output predictions.
                if output_predictions:
                    softmaxes.extend(torch.nn.Softmax(dim=-1)(
                        logits.float()).data.cpu().numpy().tolist())
                    labels.extend(labels_.data.cpu().numpy().tolist())
                    ids.extend(batch['uid'].cpu().numpy().tolist())
                # Compute the correct answers.
                predicted = torch.argmax(logits, dim=-1)
                corrects = (predicted == labels_)
                # Add to the counters.
                total += labels_.size(0)
                correct += corrects.sum().item()
            else:
                communicate(
                    tensor_send_next=output_tensor,
                    tensor_send_prev=None,
                    recv_forward=False,
                    recv_backward=False)

    model.train()
    args.micro_batch_size = saved_batch_size

    # Reduce.
    if mpu.is_pipeline_last_stage():
        unreduced = torch.cuda.LongTensor([correct, total])
        torch.distributed.all_reduce(unreduced,
                                     group=mpu.get_data_parallel_group())

        # Print on screen.
        correct_ans = unreduced[0].item()
        total_count = unreduced[1].item()
        percent = float(correct_ans) * 100.0 / float(total_count)
        elapsed_time = time.time() - start_time
        print_rank_last(
            ' > |epoch: {}| metrics for {}: correct / total '
            '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format(
                epoch, name, correct_ans, total_count, percent,
                elapsed_time))

        if output_predictions:
            return correct_ans, total_count, (softmaxes, labels, ids)
        return correct_ans, total_count
    if output_predictions:
        return 0, 0, ()
    return 0, 0
def calculate_correct_answers(model, dataloader, epoch):
    """Calculate correct over total answers."""
    args = get_args()
    forward_backward_func = get_forward_backward_func()
    for m in model:
        m.eval()

    def loss_func(labels, output_tensor):
        logits = output_tensor
        loss_dict = {}
        # Compute the correct answers.
        predicted = torch.argmax(logits, dim=-1)
        corrects = (predicted == labels).float()
        # Add to the counters.
        loss_dict['total'] = labels.size(0)
        loss_dict['correct'] = corrects.sum().item()
        return 0, loss_dict

    # Defined inside to capture the enclosing scope.
    def correct_answers_forward_step(batch, model):
        try:
            batch_ = next(batch)
        except BaseException:
            batch_ = batch
        images, labels = process_batch(batch_)

        # Forward model.
        args = get_args()
        output_tensor = model(images)

        return output_tensor, partial(loss_func, labels)

    with torch.no_grad():
        # For all the batches in the dataset.
        total = 0
        correct = 0
        for _, batch in enumerate(dataloader):
            loss_dicts = forward_backward_func(correct_answers_forward_step,
                                               batch, model,
                                               optimizer=None, timers=None,
                                               forward_only=True)
            for loss_dict in loss_dicts:
                total += loss_dict['total']
                correct += loss_dict['correct']

    for m in model:
        m.train()

    # Reduce across the data parallel ranks (only the last pipeline stage
    # holds the logits and therefore the counters).
    if mpu.is_pipeline_last_stage():
        unreduced = torch.cuda.LongTensor([correct, total])
        torch.distributed.all_reduce(unreduced,
                                     group=mpu.get_data_parallel_group())
        correct_ans = unreduced[0].item()
        total_count = unreduced[1].item()
        return correct_ans, total_count
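# --- Hedged usage sketch: the variant above only returns the counters on the
# last pipeline stage (other stages fall off the end and return None), so a
# caller typically guards on the result before computing an accuracy.
result = calculate_correct_answers(model, dataloader, epoch)
if result is not None:
    correct_ans, total_count = result
    print(' accuracy: {:.4f} %'.format(100.0 * correct_ans / total_count))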