def train(model, optim, pipeline_model_parallel_size, async_comm):
    args = global_vars.get_args()
    sequence_len = args.seq_length
    micro_batch_size = args.micro_batch_size
    hidden_size = args.hidden_size
    fwd_bwd_func = forward_backward_pipelining_without_interleaving
    tensor_shape = (sequence_len, micro_batch_size, hidden_size)
    runtime = 0
    # Training loop: three timed iterations, returning the mean runtime.
    for i in range(3):
        since = time.time()
        if torch.distributed.get_rank() == 0:
            print("begin iter", i)
        batch = [
            generate_fancy_data_labels(sequence_len, args.global_batch_size)
            for _ in range(pipeline_model_parallel_size)
        ]
        if torch.distributed.get_rank() == 0:
            print("finished making batch...")
        optim.zero_grad()
        fwd_bwd_func(
            fwd_step_func,
            batch,
            model,
            forward_only=False,
            tensor_shape=tensor_shape,
            async_comm=async_comm,
        )
        if torch.distributed.get_rank() == 0:
            print("finished forward/backward step")
        optim.step()
        if torch.distributed.get_rank() == 0:
            print("finished iter", i)
        runtime += time.time() - since
    return runtime / 3.0

def train(model, optim, virtual_pipeline_model_parallel_size, pipeline_model_parallel_size, async_comm):
    args = global_vars.get_args()
    sequence_len = args.seq_length
    micro_batch_size = args.micro_batch_size
    hidden_size = args.hidden_size
    forward_backward_func = get_forward_backward_func(
        virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
    tensor_shape = (sequence_len, micro_batch_size, hidden_size)
    for _ in range(16):
        batch = generate_fancy_data_labels(sequence_len, args.global_batch_size)
        optim.zero_grad()
        forward_backward_func(
            fwd_step_func,
            batch,
            model,
            forward_only=False,
            tensor_shape=tensor_shape,
            async_comm=async_comm,
        )
        optim.step()

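# NOTE: `fwd_step_func` is used by both `train` variants but is not defined in
# this excerpt. The sketch below is a hypothetical stand-in following the
# apex.transformer convention for pipeline schedules: the callable receives
# (batch, model) and returns the stage output together with a loss function
# mapping that output to (loss, metrics-dict). The sum-based loss is an
# illustrative assumption, not the original code.
def fwd_step_func(batch, model):
    output = model(batch)

    def loss_func(output):
        loss = output.sum()
        return loss, {"loss": loss.item()}

    return output, loss_func
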
def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed."""
    # Get local rank in case it is provided.
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--local_rank', type=int, default=None,
    #                     help='local rank passed from distributed launcher')
    # args = parser.parse_args()
    args = global_vars.get_args()
    local_rank = args.local_rank

    # Get rank and world size.
    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv('WORLD_SIZE', '1'))

    print('> initializing torch.distributed with local rank: {}, '
          'rank: {}, world size: {}'.format(local_rank, rank, world_size))

    # Set the device id.
    device = rank % torch.cuda.device_count()
    if local_rank is not None:
        device = local_rank
    torch.cuda.set_device(device)

    # Call the init process.
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=init_method)

def test_column_parallel_linear_with_async_allreduce_custom_amp(tensor_model_parallel_size):
    dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)

    parallel_state.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * tensor_model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * tensor_model_parallel_size
    batch_size = 7

    for dtype in dtypes:
        # Network
        identity_layer = IdentityLayer3D(batch_size, batch_size, input_size).to(
            device="cuda", dtype=dtype)
        linear_layer = layers.ColumnParallelLinear(
            input_size,
            output_size,
            keep_master_weight_for_test=True,
            params_dtype=global_vars.get_args().params_dtype,
            use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
        ).to(device="cuda", dtype=dtype)
        # Forward
        loss_weight = torch.randn([batch_size, output_size]).cuda()
        output, _ = linear_layer(identity_layer())
        loss = torch.mul(output, loss_weight).sum()
        loss.backward()
        torch.distributed.barrier()
        assert output.dtype == dtype

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')

def __init__(self, mpu_vocab_size, hidden_size, init_method,
             layernorm_epsilon, parallel_output):
    super(BertLMHead, self).__init__()
    args = get_args()

    self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
    # TODO: do we need this?
    # mpu.set_tensor_model_parallel_attributes(self.bias, True, 0, 1)
    self.parallel_output = parallel_output

    self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
    self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
    self.gelu = torch.nn.functional.gelu
    if args.openai_gelu:
        self.gelu = openai_gelu
    elif args.onnx_safe:
        self.gelu = erf_gelu

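# The matching `forward` is not part of this excerpt. Below is a minimal sketch
# of the usual Megatron-style BERT LM head flow (dense -> gelu -> layernorm ->
# logits against the word-embedding matrix); `parallel_lm_logits` is assumed to
# be available as in Megatron-LM, so treat this as an illustration rather than
# the original method.
def forward(self, hidden_states, word_embeddings_weight):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.gelu(hidden_states)
    hidden_states = self.layernorm(hidden_states)
    # Project onto the (tensor-parallel) vocabulary dimension.
    output = parallel_lm_logits(hidden_states,
                                word_embeddings_weight,
                                self.parallel_output,
                                bias=self.bias)
    return output
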
def initialize_distributed(backend="nccl"): """Initialize torch.distributed.""" # Get local rank in case it is provided. # parser = argparse.ArgumentParser() # parser.add_argument('--local_rank', type=int, default=None, # help='local rank passed from distributed launcher') # args = parser.parse_args() if backend not in ("nccl", "ucc"): raise RuntimeError( f"Currently only nccl & ucc are supported but {backend}") if backend == "ucc": import torch_ucc # NOQA args = global_vars.get_args() local_rank = args.local_rank # Get rank and world size. rank = int(os.getenv("RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "1")) print("> initializing torch.distributed with local rank: {}, " "rank: {}, world size: {}".format(local_rank, rank, world_size)) # Set the device id. device = rank % torch.cuda.device_count() if local_rank is not None: device = local_rank torch.cuda.set_device(device) # Call the init process. init_method = "tcp://" master_ip = os.getenv("MASTER_ADDR", "localhost") master_port = os.getenv("MASTER_PORT", "6000") init_method += master_ip + ":" + master_port torch.distributed.init_process_group( backend=backend, world_size=world_size, rank=rank, init_method=init_method, timeout=datetime.timedelta(seconds=60), )
def __init__(self, num_tokentypes=2, add_binary_head=True, parallel_output=True,
             pre_process=True, post_process=True, cpu_offload=False):
    super(BertModel, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output
    self.pre_process = pre_process
    self.post_process = post_process
    self.cpu_offload = cpu_offload

    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    self.language_model, self._language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=self.add_binary_head,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=self.pre_process,
        post_process=self.post_process)

    self.initialize_word_embeddings(init_method_normal)
    if self.post_process:
        self.lm_head = BertLMHead(self.word_embeddings_weight().size(0),
                                  args.hidden_size, init_method,
                                  args.layernorm_epsilon, parallel_output)
        self._lm_head_key = 'lm_head'
        self.binary_head = None
        if self.add_binary_head:
            self.binary_head = get_linear_layer(args.hidden_size, 2,
                                                init_method)
            self._binary_head_key = 'binary_head'

if __name__ == "__main__":
    init = True
    for async_comm in (False, True):
        if init:
            init = False
            global_vars.set_global_variables()
            fancy_data = download_fancy_data()
            args = global_vars.get_args()
            # Number of valid start offsets for a length-`seq_length` slice.
            effective_length = fancy_data.size(0) - args.seq_length
            initialize_distributed("nccl")
            world_size = torch.distributed.get_world_size()
            failure = None
            args.padded_vocab_size = 128
            batch_size = args.global_batch_size
            micro_batch_size = args.micro_batch_size
            setup_microbatch_calculator(
                args.rank,
                args.rampup_batch_size,
                args.global_batch_size,
                args.micro_batch_size,
                1,  # data_parallel_size
            )
if __name__ == "__main__":
    global_vars.set_global_variables()
    fancy_data = download_fancy_data()
    # Number of valid start offsets for a length-`seq_length` slice.
    effective_length = fancy_data.size(0) - global_vars.get_args().seq_length
    initialize_distributed("nccl")
    world_size = torch.distributed.get_world_size()
    failure = None
    init = True
    try:
        for virtual_pipeline_model_parallel_size in (2, None):
            # Async communication is only exercised by the non-interleaved schedule.
            async_comm = virtual_pipeline_model_parallel_size is None
            data_idx = 0
            ONCE = False
            if init:
                init = False
                args = global_vars.get_args()
                args.padded_vocab_size = 128  # needed in standalone gpt
def test_row_parallel_linear(tensor_model_parallel_size):

    parallel_state.initialize_model_parallel(tensor_model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing RowParallelLinear with model parallel '
              'size: {}'.format(tensor_model_parallel_size))
    tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * tensor_model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * tensor_model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = layers.RowParallelLinear(
        input_size,
        output_size,
        keep_master_weight_for_test=True,
        params_dtype=global_vars.get_args().params_dtype,
        use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
    ).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output, _ = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values: reference gradients for Y = X A^T + b with L = sum(Y * dLdY).
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    # RowParallelLinear splits the weight along the input dimension (dim 1).
    rank = parallel_state.get_tensor_model_parallel_rank()
    my_dLdA = torch.split(dLdA, input_size_coeff,
                          dim=1)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')

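# Standalone sanity check (a sketch, independent of the parallel machinery)
# for the reference gradients used in the test above: for Y = X @ A.T + b and
# L = sum(Y * dLdY), autograd should reproduce dLdA = dLdY.T @ X,
# dLdb = dLdY summed over the batch, and dLdX = dLdY @ A. The function name
# is hypothetical.
def check_linear_reference_grads():
    torch.manual_seed(0)
    X = torch.randn(7, 13, requires_grad=True)
    A = torch.randn(17, 13, requires_grad=True)
    b = torch.randn(17, requires_grad=True)
    dLdY = torch.randn(7, 17)
    loss = ((X @ A.t() + b) * dLdY).sum()
    loss.backward()
    assert torch.allclose(A.grad, dLdY.t() @ X, atol=1e-5)
    assert torch.allclose(b.grad, dLdY.sum(dim=0), atol=1e-5)
    assert torch.allclose(X.grad, dLdY @ A, atol=1e-5)
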
def test_column_parallel_linear(tensor_model_parallel_size):

    parallel_state.initialize_model_parallel(tensor_model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing ColumnParallelLinear with model parallel '
              'size: {}'.format(tensor_model_parallel_size))
    tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * tensor_model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * tensor_model_parallel_size
    batch_size = 7
    hidden_size = 9

    # Network
    gradient_accumulation_fusion = True
    identity_layer = IdentityLayer3D(batch_size, hidden_size, input_size).cuda()
    linear_layer = layers.ColumnParallelLinear(
        input_size,
        output_size,
        keep_master_weight_for_test=True,
        params_dtype=global_vars.get_args().params_dtype,
        use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
        gradient_accumulation_fusion=gradient_accumulation_fusion,
    ).cuda()
    with torch.no_grad():
        linear_layer.weight.main_grad = torch.randn_like(linear_layer.weight)
    loss_weight = torch.randn([batch_size, hidden_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output, _ = linear_layer(input_)
    assert list(output.shape) == [batch_size, hidden_size, output_size]
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # TODO (mkozuki): Fix the following commented-out lines,
    # as `gradient_accumulation_fusion` only takes 3D tensors.
    # Values.
    # dLdY = loss_weight  # (7, 9, 17)
    # X = identity_layer.weight  # (7, 9, 13)
    # A = linear_layer.master_weight.cuda()  # (17, 13)
    # print(f"dLdY.shape, X.shape, A.shape = {dLdY.shape, X.shape, A.shape}")
    # dLdA = torch.matmul(dLdY.view(-1, 17).t(), X.view(-1, 13))
    # print(f"dLdA.shape = {dLdA.shape}")
    # ones = torch.ones(batch_size, hidden_size, 1).cuda()
    # print(f"dLdY.shape, ones.shape = {dLdY.shape, ones.shape}")
    # dLdb = torch.matmul(ones, dLdY).view(-1)
    # dLdX = torch.matmul(dLdY, A)
    # rank = parallel_state.get_tensor_model_parallel_rank()
    # my_dLdA = torch.split(dLdA, output_size_coeff,
    #                       dim=0)[rank].contiguous().clone()
    # error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    # torch.distributed.barrier()
    # print('   error in dLdA on global rank {}: {}'.format(
    #     torch.distributed.get_rank(), error))
    # assert error < 1.0e-6
    # my_dLdb = torch.split(dLdb, output_size_coeff,
    #                       dim=0)[rank].contiguous().clone()
    # error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
    # torch.distributed.barrier()
    # print('   error in dLdb on global rank {}: {}'.format(
    #     torch.distributed.get_rank(), error))
    # assert error < 1.0e-6
    # error = dLdX.sub(identity_layer.weight.grad).abs().max()
    # torch.distributed.barrier()
    # print('   error in dLdX on global rank {}: {}'.format(
    #     torch.distributed.get_rank(), error))
    # assert error < 1.0e-6

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')

def test_initialize_affine_weight(tensor_model_parallel_size, device):

    parallel_state.initialize_model_parallel(tensor_model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing initialize_affine_weight with model parallel '
              'size: {}'.format(tensor_model_parallel_size))
    tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()

    seed = 12345
    input_size_coeff = 13
    input_size = input_size_coeff * tensor_model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * tensor_model_parallel_size

    # ---------------
    # Column parallel
    # ---------------
    weight = torch.empty(output_size_coeff, input_size)
    set_random_seed(seed)
    if device == 'cpu':
        layers._initialize_affine_weight_cpu(
            weight, output_size, input_size, output_size_coeff, 0,
            torch.nn.init.normal_,
            params_dtype=global_vars.get_args().params_dtype,
        )
    else:
        layers._initialize_affine_weight_gpu(weight, torch.nn.init.normal_, 0)

    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = parallel_state.get_tensor_model_parallel_rank()
    my_weight = torch.split(master_weight, output_size_coeff,
                            dim=0)[rank].contiguous().clone()

    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print('   column parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # ------------
    # Row parallel
    # ------------
    weight = torch.empty(output_size, input_size_coeff)
    set_random_seed(seed)
    if device == 'cpu':
        layers._initialize_affine_weight_cpu(
            weight, output_size, input_size, input_size_coeff, 1,
            torch.nn.init.normal_,
            params_dtype=global_vars.get_args().params_dtype)
    else:
        layers._initialize_affine_weight_gpu(weight, torch.nn.init.normal_, 1)

    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = parallel_state.get_tensor_model_parallel_rank()
    my_weight = torch.split(master_weight, input_size_coeff,
                            dim=1)[rank].contiguous().clone()

    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print('   row parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')

def run_interleaved_with_dynamic_batch_size(
    pipeline_model_parallel_size: int,
    forward_only: bool,
    BatchSamplerCls,
) -> None:
    args = global_vars.get_args()
    _reconfigure_microbatch_calculator(
        args.rank,
        args.rampup_batch_size,
        args.global_batch_size,
        args.micro_batch_size,
        1,  # args.data_parallel_size,
    )
    virtual_pipeline_model_parallel_size = 2
    # NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is a requisite for the interleaving scheduling.
    # In megatron, `args.virtual_pipeline_model_parallel_size` is computed in megatron/arguments.py and
    # used ubiquitously, but this test uses a custom model, so it's safe to abuse.
    parallel_state.initialize_model_parallel(
        1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
    pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()

    print_separator(
        f"BatchSamplerCls: {BatchSamplerCls.__name__}, forward_only: {forward_only}")

    model = build_model(
        model_provider_func,
        wrap_with_ddp=True,
        virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
        hidden_size=HIDDEN_SIZE,
    )
    assert isinstance(model, list)
    assert len(model) == virtual_pipeline_model_parallel_size
    optimizer = torch.optim.Adam(_get_params_for_weight_decay_optimization(model))

    initial_local_minibatch_size = get_num_microbatches() * micro_batch_size
    dataset = Dataset(NUM_SAMPLES)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=BatchSamplerCls(
            NUM_SAMPLES,
            0,
            initial_local_minibatch_size,
            parallel_state.get_data_parallel_rank(),
            parallel_state.get_data_parallel_world_size(),
        ),
    )
    data_iter = iter(data_loader)

    def get_num_samples(batch):
        if isinstance(batch, torch.Tensor):
            return len(batch)
        assert isinstance(batch, (list, tuple))
        return [get_num_samples(b) for b in batch]

    tensor_shape = [micro_batch_size, HIDDEN_SIZE, HIDDEN_SIZE]
    consumed_samples = 0
    for i in range(NUM_ITERATIONS):
        update_num_microbatches(consumed_samples, consistency_check=False)
        local_batch_size = get_num_microbatches() * micro_batch_size
        data_iter._index_sampler.local_minibatch_size = local_batch_size
        local_mini_batch = next(data_iter)

        _logger.info(f"iter: {i} / {NUM_ITERATIONS} "
                     f"local batchsize: {get_num_samples(local_mini_batch)} "
                     f"consumed_samples: {consumed_samples} / {NUM_SAMPLES}")
        _forward_backward_pipelining_with_interleaving(
            fwd_step_func,
            local_mini_batch,
            model,
            forward_only=forward_only,
            tensor_shape=tensor_shape,
        )

        consumed_samples += (parallel_state.get_data_parallel_world_size() *
                             get_num_microbatches() * micro_batch_size)

        if not forward_only:
            for m in model:
                for p in m.parameters():
                    if p.grad is None:
                        raise RuntimeError("grad not found")
            # All grads present; clear them before the next iteration.
            optimizer.zero_grad(set_to_none=True)

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)

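# A minimal sketch of a `BatchSamplerCls` compatible with the constructor call
# above (total_samples, consumed_samples, local_minibatch_size,
# data_parallel_rank, data_parallel_size). The class name and body are
# hypothetical; the one property the test depends on is that
# `local_minibatch_size` is re-read on every yield, so mutating
# `data_iter._index_sampler.local_minibatch_size` between iterations changes
# the size of subsequent minibatches.
class DynamicLocalBatchSampler:
    def __init__(self, total_samples, consumed_samples, local_minibatch_size,
                 data_parallel_rank, data_parallel_size):
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.local_minibatch_size = local_minibatch_size
        self.data_parallel_rank = data_parallel_rank
        self.data_parallel_size = data_parallel_size

    def __iter__(self):
        batch = []
        for idx in range(self.consumed_samples, self.total_samples):
            batch.append(idx)
            # `local_minibatch_size` may have been mutated since the last yield.
            global_minibatch_size = self.local_minibatch_size * self.data_parallel_size
            if len(batch) == global_minibatch_size:
                # Each data-parallel rank takes its own contiguous slice.
                start = self.local_minibatch_size * self.data_parallel_rank
                yield batch[start:start + self.local_minibatch_size]
                batch = []
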