    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)


if __name__ == '__main__':
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test set rng state')
        test_set_cuda_rng_state(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test cuda rng tracker')
        test_cuda_rng_tracker(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test model parallel cuda manual seed')
        test_model_parallel_cuda_manual_seed(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
    assert split_rank == pipeline_model_parallel_split_rank_

    fake_split_rank = 7
    parallel_state.set_pipeline_model_parallel_split_rank(fake_split_rank)
    split_rank = parallel_state.get_pipeline_model_parallel_split_rank()
    assert split_rank == fake_split_rank

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test initialize model parallel')
        test_initialize_model_parallel(tensor_model_parallel_size)
        print_separator('test model parallel source rank')
        test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
        print_separator('test pipeline model parallel split rank')
        test_pipeline_model_parallel_split_rank()
        tensor_model_parallel_size *= 2
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)


if __name__ == '__main__':
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    exceptions = []

    print_separator('test initialize affine weight cpu')
    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        try:
            test_initialize_affine_weight(tensor_model_parallel_size, 'cpu')
        except Exception as e:
            exceptions.append(
                f"test_initialize_affine_weight-cpu with tensor model parallel size of "
                f"{tensor_model_parallel_size} failed: {str(e)}"
            )
            # Reset groups
            parallel_state.destroy_model_parallel()
            break
        else:
            tensor_model_parallel_size *= 2
    # Reset groups
    parallel_state.destroy_model_parallel()
            pipeline_model_parallel_size = world_size
            try:
                forward_backward_func_template(
                    name,
                    forward_backward_func,
                    pipeline_model_parallel_size,
                    forward_only,
                )
            except Exception as e:
                failures.append(
                    f"\t# {name} failed with pipeline size: {pipeline_model_parallel_size} "
                    f"and forward_only: {forward_only}\n"
                    f"pipeline rank: {parallel_state.get_pipeline_model_parallel_rank()}, "
                    f"virtual pipeline rank: {parallel_state.get_virtual_pipeline_model_parallel_rank()}\n"
                    f"{str(e)}"
                )
            else:
                print_separator(f"{name} works")
            finally:
                parallel_state.destroy_model_parallel()

    print_separator("TEST RESULT")
    if failures:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print("\n".join(failures))
        msg = f"{len(failures)} / {n_tests} cases failed"
        raise RuntimeError(msg)
    else:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print("### PASS!")
def forward_backward_func_template(
    name: str,
    forward_backward_func,
    pipeline_model_parallel_size: int,
    forward_only: bool,
) -> None:
    print_separator(
        f"name: {name}, pipeline model parallel size: {pipeline_model_parallel_size}"
    )
    virtual_pipeline_model_parallel_size = 2 if name == "interleaving" else None
    if name == "no_pipelining":
        # note (mkozuki): `forward_backward_no_pipelining` is **NOT** compatible with
        # pipeline_model_parallel_size > 1, so initialize with tensor and pipeline
        # model parallel sizes of 1 here.
        parallel_state.initialize_model_parallel(1, 1, None)
    else:
        # NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is necessary to enable the
        # interleaving schedule. In megatron, `args.virtual_pipeline_model_parallel_size` is
        # computed in megatron/arguments.py and used ubiquitously, but this test uses a
        # custom model, so it is safe to set it directly here.
        parallel_state.initialize_model_parallel(
            1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)

    if virtual_pipeline_model_parallel_size is not None:
        # Check the experimental warning message
        get_forward_backward_func(virtual_pipeline_model_parallel_size,
                                  pipeline_model_parallel_size)

    pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()

    model = build_model(
        model_provider_func,
        wrap_with_ddp=True,
        virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
    )
    assert isinstance(model, list)
    assert len(model) == (
        1 if virtual_pipeline_model_parallel_size is None
        else virtual_pipeline_model_parallel_size
    )
    _param_groups = _get_params_for_weight_decay_optimization(model)
    torch.optim.Adam(_param_groups, lr=1e-4)

    tensor_shape = [
        batch_size // parallel_state.get_data_parallel_world_size(),
        hidden_size,
    ]
    batch = (torch.randn(tensor_shape).cuda(),)
    tensor_shape[0] = micro_batch_size

    update_num_microbatches(0)
    forward_backward_func(
        fwd_step_func,
        batch,
        model,
        forward_only=forward_only,
        tensor_shape=tensor_shape,
    )

    if not forward_only:
        for m in model:
            for p in m.parameters():
                if p.grad is None:
                    raise RuntimeError("grad not found")

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)
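# The template above assumes `fwd_step_func` (and `model_provider_func`) are defined elsewhere
# in this test file. As a rough, illustrative sketch only: the pipeline schedules call the
# forward step with (batch, model) and expect back the stage output plus a callable mapping
# that output to (loss, {name: value}). The body below is a hypothetical stand-in, not the
# function the real test uses; the actual model and loss may differ.
def fwd_step_func(batch, model):
    # Unwrap a single-tensor batch like the one built in the template above.
    x = batch[0] if isinstance(batch, (list, tuple)) else batch
    y = model(x)

    def loss_func(output):
        # Reduce to a scalar so the schedule can drive backward() with it.
        loss = output.float().sum()
        return loss, {'lm loss': loss}

    return y, loss_func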
    )
    assert not parallel_state.model_parallel_is_initialized()
    parallel_state.initialize_model_parallel(tensor_model_parallel_size)
    assert parallel_state.model_parallel_is_initialized()

    # Checks
    src_rank = torch.distributed.get_rank() - parallel_state.get_tensor_model_parallel_rank()
    assert parallel_state.get_tensor_model_parallel_src_rank() == src_rank

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test initialize model parallel')
        test_initialize_model_parallel(tensor_model_parallel_size)
        print_separator('test model parallel source rank')
        test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
                    name,
                    forward_backward_func,
                    pipeline_model_parallel_size,
                    forward_only,
                    dtype=dtype,
                    grad_scaler=grad_scaler,
                    deallocate_pipeline_outputs=deallocate_pipeline_outputs,
                    data_parallel_size=data_parallel_size,
                )
            except Exception as e:
                failures.append(
                    f"\t# {name} failed with pipeline size: {pipeline_model_parallel_size} "
                    f"and forward_only: {forward_only}\n"
                    f"pipeline rank: {parallel_state.get_pipeline_model_parallel_rank()}, "
                    f"virtual pipeline rank: {parallel_state.get_virtual_pipeline_model_parallel_rank()}\n"
                    f"{str(e)}"
                )
                print(failures[-1])
            finally:
                parallel_state.destroy_model_parallel()

    print_separator("TEST RESULT")
    if failures:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print("\n".join(failures))
        msg = f"{len(failures)} / {n_tests} cases failed"
        raise RuntimeError(msg)
    else:
        torch.distributed.barrier()
        if torch.distributed.get_rank() == 0:
            print("### PASS!")
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)


if __name__ == '__main__':
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test broadcast data')
        test_broadcast_data(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
    error = loss_torch.sub_(loss_mpu).abs().max()
    print(' max error in loss on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = grad_torch.sub_(grad_mpu).abs().max()
    print(' max error in grad on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)


if __name__ == '__main__':
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test cross entropy')
        test_cross_entropy(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
def run_interleaved_with_dynamic_batch_size(
    pipeline_model_parallel_size: int,
    forward_only: bool,
    BatchSamplerCls,
) -> None:
    args = global_vars.get_args()
    _reconfigure_microbatch_calculator(
        args.rank,
        args.rampup_batch_size,
        args.global_batch_size,
        args.micro_batch_size,
        1,  # args.data_parallel_size,
    )
    virtual_pipeline_model_parallel_size = 2
    # NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is required for the interleaving
    # schedule. In megatron, `args.virtual_pipeline_model_parallel_size` is computed in
    # megatron/arguments.py and used ubiquitously, but this test uses a custom model, so it is
    # safe to set it directly here.
    parallel_state.initialize_model_parallel(
        1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
    pipeline_model_parallel_size = (
        parallel_state.get_pipeline_model_parallel_world_size())

    print_separator(
        f"BatchSamplerCls: {BatchSamplerCls.__name__}, forward_only: {forward_only}"
    )

    model = build_model(
        model_provider_func,
        wrap_with_ddp=True,
        virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
        hidden_size=HIDDEN_SIZE,
    )
    assert isinstance(model, list)
    assert len(model) == virtual_pipeline_model_parallel_size
    optimizer = torch.optim.Adam(_get_params_for_weight_decay_optimization(model))

    initial_local_minibatch_size = get_num_microbatches() * micro_batch_size
    dataset = Dataset(NUM_SAMPLES)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=BatchSamplerCls(
            NUM_SAMPLES,
            0,
            initial_local_minibatch_size,
            parallel_state.get_data_parallel_rank(),
            parallel_state.get_data_parallel_world_size(),
        ),
    )
    data_iter = iter(data_loader)

    def get_num_samples(batch):
        if isinstance(batch, torch.Tensor):
            return len(batch)
        assert isinstance(batch, (list, tuple))
        return [get_num_samples(b) for b in batch]

    tensor_shape = [micro_batch_size, HIDDEN_SIZE, HIDDEN_SIZE]

    consumed_samples = 0
    for i in range(NUM_ITERATIONS):
        update_num_microbatches(consumed_samples, consistency_check=False)
        local_batch_size = get_num_microbatches() * micro_batch_size
        data_iter._index_sampler.local_minibatch_size = local_batch_size
        local_mini_batch = next(data_iter)

        _logger.info(
            f"iter: {i} / {NUM_ITERATIONS} "
            f"local batchsize: {get_num_samples(local_mini_batch)} "
            f"consumed_samples: {consumed_samples} / {NUM_SAMPLES}"
        )
        _forward_backward_pipelining_with_interleaving(
            fwd_step_func,
            local_mini_batch,
            model,
            forward_only=forward_only,
            tensor_shape=tensor_shape,
        )

        consumed_samples += (
            parallel_state.get_data_parallel_world_size()
            * get_num_microbatches()
            * micro_batch_size
        )

        if not forward_only:
            for m in model:
                for p in m.parameters():
                    if p.grad is None:
                        raise RuntimeError("grad not found")
        else:
            optimizer.zero_grad(set_to_none=True)

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)
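# The function above exercises `BatchSamplerCls` only through the interface visible in the
# code: a constructor taking (total_samples, consumed_samples, local_minibatch_size,
# data_parallel_rank, data_parallel_size) and a mutable `local_minibatch_size` attribute that
# is updated as the microbatch calculator ramps up. A minimal sketch satisfying that contract
# is shown below; `SimpleLocalBatchSampler` is a hypothetical name, and the samplers the test
# actually uses are defined elsewhere in the repository and may differ in detail.
class SimpleLocalBatchSampler:
    def __init__(self, total_samples, consumed_samples, local_minibatch_size,
                 data_parallel_rank, data_parallel_size):
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.local_minibatch_size = local_minibatch_size
        self.data_parallel_rank = data_parallel_rank
        self.data_parallel_size = data_parallel_size

    def __iter__(self):
        batch = []
        for idx in range(self.consumed_samples, self.total_samples):
            batch.append(idx)
            # `local_minibatch_size` is re-read on every step, so external updates (as done in
            # the loop above) change the yielded batch size on the fly.
            if len(batch) == self.local_minibatch_size * self.data_parallel_size:
                # Each data-parallel rank keeps its contiguous slice of the global minibatch.
                start = self.data_parallel_rank * self.local_minibatch_size
                yield batch[start:start + self.local_minibatch_size]
                batch = []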