import pytest
import torch
import torch.nn as nn
import torch.optim as optim
from distutils.version import LooseVersion

import ignite.distributed as idist
from ignite.distributed.auto import auto_dataloader, auto_model, auto_optim


def test_auto_methods_gloo(distributed_context_single_node_gloo):

    ws = distributed_context_single_node_gloo["world_size"]
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, sampler_name="WeightedRandomSampler")
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, sampler_name="DistributedSampler")

    device = idist.device()
    _test_auto_model_optimizer(ws, device)

    if ws > 1 and device.type == "cpu":
        # PyTorch <= 1.9.0 raises AssertionError, later versions raise ValueError
        error_type = AssertionError if LooseVersion(torch.__version__) <= LooseVersion("1.9.0") else ValueError
        with pytest.raises(error_type, match=r"SyncBatchNorm layers only work with GPU modules"):
            model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
            auto_model(model, sync_bn=True)
def test_auto_methods_gloo(distributed_context_single_node_gloo):

    ws = distributed_context_single_node_gloo["world_size"]
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, sampler_name="WeightedRandomSampler")
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, sampler_name="DistributedSampler")

    device = idist.device()
    _test_auto_model_optimizer(ws, device)

    if ws > 1 and device.type == "cpu":
        # PyTorch <= 1.9.0 raises AssertionError, PyTorch > 1.9 raises ValueError:
        # https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/distributed.py#L1498
        with pytest.raises((AssertionError, ValueError), match=r"SyncBatchNorm layers only work with GPU modules"):
            model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
            auto_model(model, sync_bn=True)
def test_auto_methods_nccl(distributed_context_single_node_nccl):

    ws = distributed_context_single_node_nccl["world_size"]
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=10)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1, sampler_name="WeightedRandomSampler")

    device = idist.device()
    _test_auto_model_optimizer(ws, device)

    if ws > 1:
        with pytest.raises(ValueError, match=r"Argument kwargs should not contain 'device_ids'"):
            auto_model(nn.Linear(1, 1), device_ids=[0])
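# The _test_auto_dataloader helper called by the tests above is not included in this excerpt.
# Below is a minimal sketch of what such a helper could look like, reconstructed only from the
# keyword arguments used above; the toy dataset, sampler construction and assertions are
# assumptions, not the original helper.
def _test_auto_dataloader_sketch(ws, nproc, batch_size, num_workers=1, sampler_name=None):
    from torch.utils.data import TensorDataset, WeightedRandomSampler
    from torch.utils.data.distributed import DistributedSampler

    data = TensorDataset(torch.arange(100).float())

    sampler = None
    if sampler_name == "WeightedRandomSampler":
        sampler = WeightedRandomSampler(weights=torch.ones(len(data)), num_samples=len(data))
    elif sampler_name == "DistributedSampler":
        sampler = DistributedSampler(data, num_replicas=ws, rank=idist.get_rank())

    # auto_dataloader scales batch_size/num_workers per process and takes care of the
    # distributed sampler when running with more than one process
    dataloader = auto_dataloader(data, batch_size=batch_size, num_workers=num_workers, sampler=sampler)

    if ws > 1 and batch_size >= ws:
        assert dataloader.batch_size == batch_size // ws
    for batch in dataloader:
        assert batch[0].shape[0] <= dataloader.batch_size
        break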
def _test_auto_model_optimizer(ws, device):
    # Test auto_model
    model = nn.Linear(10, 10)
    model = auto_model(model)
    bnd = idist.backend()
    if ws > 1 and torch.device(device).type in ("cuda", "cpu"):
        if idist.has_native_dist_support and bnd in ("nccl", "gloo"):
            assert isinstance(model, nn.parallel.DistributedDataParallel)
        elif idist.has_hvd_support and bnd in ("horovod",):
            assert isinstance(model, nn.Module)
    elif torch.device(device).type != "cpu" and torch.cuda.is_available() and torch.cuda.device_count() > 1:
        assert isinstance(model, nn.parallel.DataParallel)
    else:
        assert isinstance(model, nn.Module)

    assert all(
        [p.device.type == torch.device(device).type for p in model.parameters()]
    ), "{} vs {}".format([p.device.type for p in model.parameters()], torch.device(device).type)

    # Test auto_optim
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    optimizer = auto_optim(optimizer)
    if idist.has_xla_support and "xla" in torch.device(device).type:
        assert isinstance(optimizer, optim.SGD) and hasattr(optimizer, "wrapped_optimizer")
    elif idist.has_hvd_support and bnd in ("horovod",):
        assert isinstance(optimizer, optim.SGD) and hasattr(optimizer, "_allreduce_grad_async")
    else:
        assert isinstance(optimizer, optim.SGD) and not hasattr(optimizer, "wrapped_optimizer")
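# For context: the auto_model / auto_optim / auto_dataloader helpers exercised above are meant
# for backend-agnostic training scripts. The sketch below illustrates the usage pattern these
# tests verify; the toy dataset, loss and launcher call are illustrative assumptions, not part
# of the test suite.
def _training_sketch(local_rank):
    from torch.utils.data import TensorDataset

    device = idist.device()
    model = idist.auto_model(nn.Linear(10, 1))                 # DDP / DataParallel / plain module, as appropriate
    optimizer = idist.auto_optim(optim.SGD(model.parameters(), lr=0.01))
    dataset = TensorDataset(torch.rand(64, 10), torch.rand(64, 1))
    loader = idist.auto_dataloader(dataset, batch_size=8)      # per-process batch size and sampler handled here
    criterion = nn.MSELoss()
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()


# Assumed launch, e.g. a 2-process gloo setup:
# with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
#     parallel.run(_training_sketch)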
def _test_auto_model(model, ws, device, sync_bn=False, **kwargs):
    model = auto_model(model, sync_bn=sync_bn, **kwargs)
    bnd = idist.backend()
    if ws > 1 and torch.device(device).type in ("cuda", "cpu"):
        if idist.has_native_dist_support and bnd in ("nccl", "gloo"):
            assert isinstance(model, nn.parallel.DistributedDataParallel)
            if sync_bn:
                assert any([isinstance(m, nn.SyncBatchNorm) for m in model.modules()])
            if "find_unused_parameters" in kwargs:
                assert model.find_unused_parameters == kwargs["find_unused_parameters"]
        elif idist.has_hvd_support and bnd in ("horovod",):
            assert isinstance(model, nn.Module)
    elif device != "cpu" and torch.cuda.is_available() and torch.cuda.device_count() > 1:
        assert isinstance(model, nn.parallel.DataParallel)
    else:
        assert isinstance(model, nn.Module)

    assert all(
        [p.device.type == torch.device(device).type for p in model.parameters()]
    ), f"{[p.device.type for p in model.parameters()]} vs {torch.device(device).type}"
def _test_auto_model_optimizer(ws, device):
    # Test auto_model
    model = nn.Linear(10, 10)
    model = auto_model(model)
    if ws > 1:
        assert isinstance(model, nn.parallel.DistributedDataParallel)
    elif device != "cpu" and torch.cuda.is_available() and torch.cuda.device_count() > 1:
        assert isinstance(model, nn.parallel.DataParallel)
    else:
        assert isinstance(model, nn.Module)

    assert all(
        [p.device.type == device for p in model.parameters()]
    ), "{} vs {}".format([p.device.type for p in model.parameters()], device)

    # Test auto_optim
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    optimizer = auto_optim(optimizer)
    if "xla" in device:
        assert isinstance(optimizer, optim.SGD) and hasattr(optimizer, "wrapped_optimizer")
    else:
        assert isinstance(optimizer, optim.SGD) and not hasattr(optimizer, "wrapped_optimizer")
def test_auto_methods_gloo(distributed_context_single_node_gloo):

    ws = distributed_context_single_node_gloo["world_size"]
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, sampler_name="WeightedRandomSampler")

    _test_auto_model_optimizer(ws, "cpu")

    if ws > 1:
        with pytest.raises(AssertionError, match=r"SyncBatchNorm layers only work with GPU modules"):
            model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
            auto_model(model, sync_bn=True)
def _test_auto_model(model, ws, device, sync_bn=False):
    model = auto_model(model, sync_bn=sync_bn)
    bnd = idist.backend()
    if ws > 1 and device in ("cuda", "cpu"):
        if idist.has_native_dist_support and bnd in ("nccl", "gloo"):
            assert isinstance(model, nn.parallel.DistributedDataParallel)
            if sync_bn:
                assert any([isinstance(m, nn.SyncBatchNorm) for m in model.modules()])
        elif idist.has_hvd_support and bnd in ("horovod",):
            assert isinstance(model, nn.Module)
    elif device != "cpu" and torch.cuda.is_available() and torch.cuda.device_count() > 1:
        assert isinstance(model, nn.parallel.DataParallel)
    else:
        assert isinstance(model, nn.Module)

    assert all(
        [p.device.type == device for p in model.parameters()]
    ), "{} vs {}".format([p.device.type for p in model.parameters()], device)
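# Note on the sync_bn branch above: with sync_bn=True, auto_model converts BatchNorm layers to
# nn.SyncBatchNorm (via nn.SyncBatchNorm.convert_sync_batchnorm) before wrapping the model in
# DistributedDataParallel; DDP then rejects such models on CPU, which is the error the gloo
# tests above expect. A rough illustration of the assertion (assumes a multi-GPU distributed
# setup, not runnable on CPU):
#
#     model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
#     ddp_model = auto_model(model, sync_bn=True)
#     assert any(isinstance(m, nn.SyncBatchNorm) for m in ddp_model.modules())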