def worker_fn(gpu_id: int, world_size: int, batch_size: int, sync_file: str):
    init_distributed_on_file(world_size=world_size, gpu_id=gpu_id, sync_file=sync_file)
    embeddings = torch.full(
        size=(batch_size, 3), fill_value=float(gpu_id), requires_grad=True
    ).cuda(gpu_id)
    gathered = SimclrInfoNCECriterion.gather_embeddings(embeddings)
    if world_size == 1:
        assert gathered.equal(
            torch.tensor(
                [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
                device=f"cuda:{gpu_id}",
            )
        )
    if world_size == 2:
        assert gathered.equal(
            torch.tensor(
                [
                    [0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0],
                    [1.0, 1.0, 1.0],
                    [1.0, 1.0, 1.0],
                ],
                device=f"cuda:{gpu_id}",
            )
        )
    assert gathered.requires_grad

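# Hedged sketch (not part of the original tests): worker functions like the one above
# are typically driven by torch.multiprocessing.spawn, with a temporary file acting as
# the rendezvous point for init_distributed_on_file. The launcher below is a
# hypothetical illustration; the real test harness may use different helpers.
import tempfile

import torch.multiprocessing as mp


def _launch_gather_test(world_size: int = 2, batch_size: int = 2) -> None:
    # spawn passes the process index as the first positional argument (gpu_id here)
    with tempfile.NamedTemporaryFile() as sync_file:
        mp.spawn(
            worker_fn,
            args=(world_size, batch_size, sync_file.name),
            nprocs=world_size,
        )
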
def worker_fn(gpu_id: int, world_size: int, batch_size: int, sync_file: str):
    init_distributed_on_file(world_size=world_size, gpu_id=gpu_id, sync_file=sync_file)
    EMBEDDING_DIM = 128
    criterion = BarlowTwinsCriterion(
        lambda_=0.0051, scale_loss=0.024, embedding_dim=EMBEDDING_DIM
    )
    embeddings = torch.randn(
        (batch_size, EMBEDDING_DIM), dtype=torch.float32, requires_grad=True
    ).cuda()
    criterion(embeddings).backward()

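# For reference, a standalone sketch of the Barlow Twins objective that the criterion
# above is expected to implement (illustrative only, not the BarlowTwinsCriterion code):
# build the cross-correlation matrix of two batch-normalized views, pull its diagonal
# towards 1, and push the off-diagonal entries towards 0, weighted by lambda_.
import torch


def barlow_twins_loss_sketch(
    z_a: torch.Tensor,
    z_b: torch.Tensor,
    lambda_: float = 0.0051,
    scale_loss: float = 0.024,
) -> torch.Tensor:
    n, _ = z_a.shape
    z_a = (z_a - z_a.mean(dim=0)) / (z_a.std(dim=0) + 1e-5)
    z_b = (z_b - z_b.mean(dim=0)) / (z_b.std(dim=0) + 1e-5)
    c = (z_a.T @ z_b) / n  # (D, D) cross-correlation matrix
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()
    return scale_loss * (on_diag + lambda_ * off_diag)
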
def _pretraining_worker(
    gpu_id: int,
    with_fsdp: bool,
    with_activation_checkpointing: bool,
    with_larc: bool,
    sync_file: str,
    result_file: str,
):
    init_distributed_on_file(world_size=2, gpu_id=gpu_id, sync_file=sync_file)
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True

    # Create the inputs
    batch = torch.randn(size=(8, 3, 224, 224)).cuda()
    target = torch.tensor(0.0).cuda()

    # Create a fake model based on SWAV blocks
    config = TestRegnetFSDP._create_pretraining_config(
        with_fsdp, with_activation_checkpointing, with_larc=with_larc
    )
    model = build_model(config["MODEL"], config["OPTIMIZER"])
    model = model.cuda()
    if with_fsdp:
        model = fsdp_wrapper(model, **config.MODEL.FSDP_CONFIG)
    else:
        model = DistributedDataParallel(model, device_ids=[gpu_id])
    criterion = SwAVLoss(loss_config=config["LOSS"]["swav_loss"])
    optimizer = build_optimizer(config["OPTIMIZER"])
    optimizer.set_param_groups(model.parameters())

    # Run a few iterations and collect the losses
    losses = []
    num_iterations = 5
    for iteration in range(num_iterations):
        out = model(batch)
        loss = criterion(out[0], target)
        if gpu_id == 0:
            losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        if iteration <= 2:
            for name, param in model.named_parameters():
                if "prototypes" in name:
                    param.grad = None
        optimizer.step(where=float(iteration / num_iterations))

    # Store the losses in a file to compare several methods
    if gpu_id == 0:
        with open(result_file, "wb") as f:
            pickle.dump(losses, f)

def _test_synch_bn_pytorch_worker(
    gpu_id: int, world_size: int, group_size: int, sync_file: str
):
    torch.cuda.set_device(gpu_id)
    init_distributed_on_file(world_size=world_size, gpu_id=gpu_id, sync_file=sync_file)

    config = AttrDict(
        {
            "MODEL": {
                "SYNC_BN_CONFIG": {
                    "SYNC_BN_TYPE": "pytorch",
                    "GROUP_SIZE": group_size,
                }
            },
            "DISTRIBUTED": {
                "NUM_PROC_PER_NODE": world_size,
                "NUM_NODES": 1,
                "NCCL_DEBUG": False,
                "NCCL_SOCKET_NTHREADS": 4,
            },
        }
    )
    set_env_vars(local_rank=gpu_id, node_id=0, cfg=config)

    channels = 8
    model = nn.Sequential(
        nn.BatchNorm2d(num_features=channels),
        nn.AdaptiveAvgPool2d(output_size=(1, 1)),
    )
    model = convert_sync_bn(config, model).cuda(gpu_id)
    model = DistributedDataParallel(model, device_ids=[gpu_id])
    x = torch.full(size=(5, channels, 4, 4), fill_value=float(gpu_id))
    model(x)

    # BatchNorm2d uses momentum 0.1 by default, so after one forward pass the running
    # mean is 0.1 * (mean of the inputs seen by the BN sync group):
    # - group_size == 1: each rank sees only its own all-`gpu_id` input -> 0.0 or 0.1
    # - group_size == 2: both ranks see a mean of 0.5 -> 0.05 on every rank
    running_mean = model.module[0].running_mean.cpu()
    print(gpu_id, running_mean)
    if group_size == 1:
        if gpu_id == 0:
            assert torch.allclose(running_mean, torch.full(size=(8,), fill_value=0.0))
        elif gpu_id == 1:
            assert torch.allclose(running_mean, torch.full(size=(8,), fill_value=0.1))
    else:
        if gpu_id in {0, 1}:
            assert torch.allclose(running_mean, torch.full(size=(8,), fill_value=0.05))

def _worker(gpu_id: int, sync_file: str, world_size: int):
    torch.manual_seed(0)
    os.environ["RANK"] = str(gpu_id)
    init_distributed_on_file(world_size=world_size, gpu_id=gpu_id, sync_file=sync_file)
    torch.backends.cudnn.deterministic = True

    config = TestCheckpointConversion._create_fsdp_model_config(with_fsdp=True)
    model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
    model = fsdp_wrapper(model, **config.MODEL.FSDP_CONFIG)
    optimizer = optim.SGD(model.parameters(), lr=1e-4)

    # Fake inputs
    num_iterations = 5
    batch_size = 3
    torch.manual_seed(gpu_id)
    fake_inputs = torch.randn(size=(num_iterations, batch_size, 3, 96, 96))
    fake_targets = torch.randn(size=(num_iterations, batch_size))

    # Fake training loop
    criterion = nn.MSELoss()
    for iteration in range(num_iterations):
        fake_input = fake_inputs[iteration].cuda(gpu_id)
        fake_target = fake_targets[iteration].cuda(gpu_id)
        output1, output2 = model(fake_input)[0]
        loss = criterion(output1.sum(axis=-1), fake_target) + criterion(
            output2.sum(axis=-1), fake_target
        )
        if gpu_id == 0:
            print(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Save a sharded checkpoint: one shard per rank
    checkpoint_writer = CheckpointWriter(
        checkpoint_folder=".",
        is_final_train_phase=True,
        mode="iteration",
        mode_num=0,
        backend="disk",
    )
    content = {
        "classy_state_dict": {
            "base_model": {
                "model": {"trunk": model.trunk.local_state_dict()},
                "meta": {"trunk": model.trunk.local_metadata_dict()},
            }
        }
    }
    checkpoint_writer.save_sharded_checkpoint(
        content, shard_rank=gpu_id, world_size=world_size
    )
    dist.barrier()
    print(os.listdir("."))

    # Convert the sharded checkpoint to consolidated and sliced checkpoints
    if gpu_id == 0:
        CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
            "checkpoint.torch", "checkpoint_conso.torch"
        )
        CheckpointFormatConverter.sharded_to_sliced_checkpoint(
            "checkpoint.torch", "checkpoint_sliced.torch"
        )
    dist.barrier()
    print(os.listdir("."))

    # Now create models initialized from the previous checkpoints and compare them
    fake_test_input = torch.randn(size=(1, 3, 96, 96)).cuda(gpu_id)

    shard_cp = CheckpointLoader.load_and_broadcast_init_weights(
        "checkpoint.torch", device=torch.device("cpu")
    )
    shard_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
    shard_model = fsdp_wrapper(shard_model, **config.MODEL.FSDP_CONFIG)
    shard_model.init_model_from_weights_params_file(config, shard_cp)

    conso_cp = CheckpointLoader.load_and_broadcast_init_weights(
        "checkpoint_conso.torch", device=torch.device("cpu")
    )
    conso_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
    conso_model = fsdp_wrapper(conso_model, **config.MODEL.FSDP_CONFIG)
    conso_model.init_model_from_weights_params_file(config, conso_cp)

    slice_cp = CheckpointLoader.load_and_broadcast_init_weights(
        "checkpoint_sliced.torch", device=torch.device("cpu")
    )
    slice_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
    slice_model = fsdp_wrapper(slice_model, **config.MODEL.FSDP_CONFIG)
    slice_model.init_model_from_weights_params_file(config, slice_cp)

    # Verify that the models are equivalent
    if gpu_id == 0:
        slice_state_dict = slice_model.local_state_dict()
        conso_state_dict = conso_model.local_state_dict()
        assert set(slice_state_dict.keys()) == set(conso_state_dict.keys())
        for k in slice_state_dict.keys():
            slice_val = slice_state_dict[k]
            conso_val = conso_state_dict[k]
            assert torch.allclose(
                slice_val, conso_val
            ), f"Difference for key {k}: {slice_val} VS {conso_val}"
    dist.barrier()

    with torch.no_grad():
        ref_out = model.trunk(fake_test_input)[0]
        shard_out = shard_model.trunk(fake_test_input)[0]
        conso_out = conso_model.trunk(fake_test_input)[0]
        slice_out = slice_model.trunk(fake_test_input)[0]
        assert torch.allclose(ref_out, shard_out), f"{ref_out.sum()} vs {shard_out.sum()}"
        assert torch.allclose(ref_out, conso_out), f"{ref_out.sum()} vs {conso_out.sum()}"
        assert torch.allclose(ref_out, slice_out), f"{ref_out.sum()} vs {slice_out.sum()}"

def _norm_computation_worker(gpu_id: int, sync_file: str, world_size: int):
    init_distributed_on_file(world_size=world_size, gpu_id=gpu_id, sync_file=sync_file)
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True

    num_iterations = 10
    batch_size = 128
    torch.manual_seed(gpu_id)
    fake_inputs = torch.randn(size=(num_iterations, batch_size, 129))
    fake_targets = torch.randn(size=(num_iterations, batch_size))

    losses = {}
    for with_fsdp in [False, True]:
        torch.manual_seed(0)
        torch.cuda.manual_seed(0)
        losses[with_fsdp] = []

        # Create a simple model
        model = nn.Sequential(nn.Linear(129, 128), nn.ReLU(), nn.Linear(128, 10))
        model = model.cuda(gpu_id)

        # Set up FSDP or DDP with LARC
        larc_config = {"clip": False, "trust_coefficient": 0.01, "eps": 0.00000001}
        optimizer = optim.SGD(
            model.parameters(), lr=1e-2, weight_decay=1e-4, momentum=0.9
        )
        if with_fsdp:
            model = FullyShardedDataParallel(model, flatten_parameters=False)
            optimizer = LARC_FSDP(optimizer, distributed_norm=True, **larc_config)
        else:
            model = DistributedDataParallel(model, device_ids=[gpu_id])
            optimizer = LARC_FSDP(optimizer, distributed_norm=False, **larc_config)

        # Training loop
        criterion = nn.MSELoss()
        for iteration in range(num_iterations):
            fake_input = fake_inputs[iteration].cuda(gpu_id)
            fake_target = fake_targets[iteration].cuda(gpu_id)
            output = model(fake_input)
            loss = criterion(output.sum(axis=-1), fake_target)
            if gpu_id == 0:
                losses[with_fsdp].append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # LARC with distributed norms (FSDP) should match LARC with local norms (DDP)
    if gpu_id == 0:
        for with_fsdp in [False, True]:
            print(losses[with_fsdp])
            if world_size > 1:
                losses[with_fsdp] = [round(loss, 5) for loss in losses[with_fsdp]]
        assert losses[False] == losses[True]

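# For context, a sketch of the LARC trust ratio that LARC_FSDP applies per parameter
# (assumed formulation, following the standard LARC/LARS rule; under FSDP each rank
# only holds a shard, so the parameter and gradient norms must be reduced across
# ranks, which is what distributed_norm=True exercises above):
import torch


def larc_trust_ratio_sketch(
    param: torch.Tensor,
    grad: torch.Tensor,
    trust_coefficient: float = 0.01,
    weight_decay: float = 1e-4,
    eps: float = 1e-8,
) -> torch.Tensor:
    param_norm = param.norm()
    grad_norm = grad.norm()
    # Scale the update so its magnitude is proportional to the weight norm
    return trust_coefficient * param_norm / (grad_norm + weight_decay * param_norm + eps)
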
def _layer_memory_tracking_worker(gpu_id: int, sync_file: str, world_size: int):
    init_distributed_on_file(world_size=world_size, gpu_id=gpu_id, sync_file=sync_file)
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True

    torch.manual_seed(gpu_id)
    batch_size = 16
    fake_inputs = torch.randn(size=(batch_size, 10)).cuda(gpu_id)
    fake_targets = torch.randn(size=(batch_size, 10)).cuda(gpu_id)
    fake_criterion = nn.MSELoss()

    torch.manual_seed(0)
    torch.cuda.manual_seed(0)

    # Create a global group and a tracker around it
    group = dist.new_group()
    group = ProcessGroupTracker(group)

    # Create a simple model
    model = nn.Sequential(
        nn.Linear(10, 10).cuda(gpu_id),
        nn.ReLU(),
        FullyShardedDataParallel(
            nn.Linear(10, 10).cuda(gpu_id),
            flatten_parameters=False,
            process_group=group,
        ),
        nn.ReLU(),
        FullyShardedDataParallel(
            nn.Linear(10, 10).cuda(gpu_id),
            flatten_parameters=True,
            process_group=group,
        ),
    )
    model = model.cuda(gpu_id)
    model = FullyShardedDataParallel(
        model, flatten_parameters=False, process_group=group
    )

    # Set up the tracking of the model
    tracker = LayerwiseMemoryTracker()
    tracker.monitor(model)

    # Fake forward / backward pass
    fake_criterion(model(fake_inputs), fake_targets).backward()

    # Collect the results of the all-gathers (the feature specific to FSDP)
    tracker.stop()
    all_gathered_traces = [
        (t.module_name, t.all_gathered, t.cumul_all_gathered)
        for t in tracker.memory_traces
        if t.all_gathered > 0
    ]
    assert all_gathered_traces == [
        ("_fsdp_wrapped_module.0", 440, 440),
        ("_fsdp_wrapped_module.2._fsdp_wrapped_module", 440, 880),
        ("_fsdp_wrapped_module.4._fsdp_wrapped_module._fpw_module", 440, 880),
        ("_fsdp_wrapped_module.4._fsdp_wrapped_module._fpw_module", 440, 0),
        ("_fsdp_wrapped_module.2._fsdp_wrapped_module", 440, 0),
    ]