# NOTE(review): this chunk arrived with all newlines/indentation collapsed onto a
# single physical line; the block structure below is a reconstruction — confirm
# against the original file before relying on the nesting shown here.
if __name__ == "__main__":
    # Run the pipeline test twice: once with synchronous and once with
    # asynchronous communication. One-time setup runs on the first pass only.
    init = True
    for async_comm in (False, True):
        # `global` at module top level is a syntactic no-op; presumably kept so
        # helpers defined elsewhere in this file can rebind these names — TODO confirm.
        global fancy_data
        global effective_length
        if init:
            init = False
            global_vars.set_global_variables()
            fancy_data = download_fancy_data()
            args = global_vars.get_args()
            # NOTE(review): this first value is immediately overwritten by the
            # next line (dead store) — looks like a leftover experiment; confirm
            # which formula for effective_length is actually intended.
            effective_length = fancy_data.size(0) // args.seq_length
            effective_length = fancy_data.size(0) - args.seq_length
            initialize_distributed("nccl")
            world_size = torch.distributed.get_world_size()
            failure = None
            args.padded_vocab_size = 128
            batch_size = args.global_batch_size
            micro_batch_size = args.micro_batch_size
            # Configure the global micro-batch calculator from the parsed args.
            setup_microbatch_calculator(
                args.rank,
                args.rampup_batch_size,
                args.global_batch_size,
                args.micro_batch_size,
                args.data_parallel_size,
            )
        # Re-read the world size on every iteration; the value cannot change
        # after initialization, so this is cheap but redundant after the first pass.
        world_size = torch.distributed.get_world_size()
# Reset the tracker tensor_parallel.random.get_cuda_rng_tracker().reset() # Reset groups parallel_state.destroy_model_parallel() torch.distributed.barrier() if torch.distributed.get_rank() == 0: print(TEST_SUCCESS_MESSAGE) if __name__ == '__main__': torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False initialize_distributed() world_size = torch.distributed.get_world_size() tensor_model_parallel_size = 1 while tensor_model_parallel_size <= world_size: print_separator('test set rng state') test_set_cuda_rng_state(tensor_model_parallel_size) tensor_model_parallel_size *= 2 tensor_model_parallel_size = 1 while tensor_model_parallel_size <= world_size: print_separator('test cuda rng tracker') test_cuda_rng_tracker(tensor_model_parallel_size) tensor_model_parallel_size *= 2 tensor_model_parallel_size = 1