def _test_distrib_config(local_rank, backend, ws, true_device, rank=None): assert idist.backend() == backend, "{} vs {}".format( idist.backend(), backend) this_device = idist.device() assert isinstance(this_device, torch.device) if backend == "nccl": true_device = torch.device("{}:{}".format(true_device, local_rank)) assert this_device == true_device, "{} vs {}".format( this_device, true_device) elif backend == "gloo": assert this_device == torch.device(true_device) elif backend == "xla-tpu": assert true_device in this_device.type if rank is None: if idist.model_name() == "native-dist": rank = dist.get_rank() assert idist.get_rank() == rank assert idist.get_world_size() == ws assert idist.get_local_rank() == local_rank assert idist.model_name() in ("native-dist", "xla-dist") _sanity_check()
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None): assert idist.backend() == backend, f"{idist.backend()} vs {backend}" this_device = idist.device() assert isinstance(this_device, torch.device) if backend in ("nccl", "horovod") and "cuda" in this_device.type: true_device = torch.device(f"{true_device}:{local_rank}") assert this_device == true_device, f"{this_device} vs {true_device}" elif backend in ("gloo", "horovod"): assert this_device == torch.device(true_device) elif backend == "xla-tpu": assert true_device in this_device.type if rank is None: if idist.model_name() == "native-dist": rank = dist.get_rank() if rank is not None: assert idist.get_rank() == rank assert idist.get_world_size() == ws assert idist.get_local_rank() == local_rank assert idist.model_name() in ("native-dist", "xla-dist", "horovod-dist") _sanity_check()
def test_no_distrib(capsys):
    assert idist.backend() is None
    if torch.cuda.is_available():
        assert idist.device().type == "cuda"
    else:
        assert idist.device().type == "cpu"
    assert idist.get_rank() == 0
    assert idist.get_world_size() == 1
    assert idist.get_local_rank() == 0
    assert idist.model_name() == "serial"

    from ignite.distributed.utils import _model, _SerialModel

    _sanity_check()
    assert isinstance(_model, _SerialModel)

    idist.show_config()
    captured = capsys.readouterr()
    out = captured.err.split("\r")
    out = list(map(lambda x: x.strip(), out))
    out = list(filter(None, out))
    assert "ignite.distributed.utils INFO: distributed configuration: serial" in out[-1]
    assert "ignite.distributed.utils INFO: backend: None" in out[-1]
    if torch.cuda.is_available():
        assert "ignite.distributed.utils INFO: device: cuda" in out[-1]
    else:
        assert "ignite.distributed.utils INFO: device: cpu" in out[-1]
    assert "ignite.distributed.utils INFO: rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: local rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: world size: 1" in out[-1]
def _test_frequency_with_engine(workers=None, lower_bound_factor=0.8, every=1):
    if workers is None:
        workers = idist.get_world_size()

    artificial_time = 1.0 / workers  # seconds
    total_tokens = 400 // workers
    batch_size = 128 // workers

    estimated_wps = batch_size * workers / artificial_time

    def update_fn(engine, batch):
        time.sleep(artificial_time)
        return {"ntokens": len(batch)}

    engine = Engine(update_fn)
    wps_metric = Frequency(output_transform=lambda x: x["ntokens"])
    event = Events.ITERATION_COMPLETED(every=every)
    wps_metric.attach(engine, "wps", event_name=event)

    @engine.on(event)
    def assert_wps(e):
        wps = e.state.metrics["wps"]
        # Skip iterations 2, 3, 4 if backend is Horovod on CUDA:
        # wps is abnormally low for these iterations;
        # otherwise, the values of wps are OK
        if idist.model_name() == "horovod-dist" and e.state.iteration in (2, 3, 4):
            return
        assert estimated_wps * lower_bound_factor < wps <= estimated_wps, "{}: {} < {} < {}".format(
            e.state.iteration, estimated_wps * lower_bound_factor, wps, estimated_wps
        )

    data = [[i] * batch_size for i in range(0, total_tokens, batch_size)]
    max_epochs = 1 if idist.model_name() != "horovod-dist" else 2
    engine.run(data, max_epochs=max_epochs)
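# Illustrative sketch (an assumption, not from the original suite): with
# workers=1 the helper runs serially, so artificial_time=1.0s, batch_size=128
# and estimated_wps = 128 * 1 / 1.0 = 128 tokens/s; each measured wps should
# land between 0.8 * 128 and 128.
def _example_run_frequency_check():
    _test_frequency_with_engine(workers=1)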
def assert_wps(e): wps = e.state.metrics["wps"] # Skip iterations 2, 3, 4 if backend is Horovod on CUDA, # wps is abnormally low for these iterations # otherwise, other values of wps are OK if idist.model_name() == "horovod-dist" and e.state.iteration in (2, 3, 4): return assert estimated_wps * lower_bound_factor < wps <= estimated_wps, "{}: {} < {} < {}".format( e.state.iteration, estimated_wps * lower_bound_factor, wps, estimated_wps )
def _test_func(index, ws, device, backend, true_init_method):
    assert 0 <= index < ws
    assert index == idist.get_local_rank()
    assert ws == idist.get_world_size()
    assert torch.device(device).type == idist.device().type
    assert backend == idist.backend()

    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        assert _model._init_method == true_init_method
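# Hedged sketch of how `_test_func` might be driven: spawn "gloo" workers on
# CPU with an explicit file:// init method and check that it is propagated to
# the internal model. The temporary path, worker count and the `init_method`
# keyword passed to `idist.spawn` are assumptions for illustration.
def _example_spawn_with_init_method(tmp_path):
    nproc = 4
    init_method = f"file://{tmp_path}/shared"
    idist.spawn(
        "gloo",
        _test_func,
        args=(nproc, "cpu", "gloo", init_method),
        nproc_per_node=nproc,
        init_method=init_method,
    )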
def training(local_rank, config, **kwargs):
    # local import so the function can be pickled into spawned processes
    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws}"
    assert local_rank == idist.get_local_rank()

    # Test init method:
    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        true_init_method = config.get("true_init_method", None)
        assert true_init_method is not None, true_init_method
        assert _model._init_method == true_init_method
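# Illustrative launcher (an assumption, not from the original script): the
# `training` entry point above matches the signature expected by
# `idist.Parallel`, which calls `func(local_rank, *args, **kwargs)` in every
# process. The backend, worker count and the assumption that native "gloo"
# defaults to the "env://" init method are made up for the example.
def _example_launch_training():
    config = {"true_init_method": "env://"}
    with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
        parallel.run(training, config, a=1, b=2)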
def log_basic_info(logger: Logger, config: Any) -> None: """Logging about pytorch, ignite, configurations, gpu system distributed settings. Parameters ---------- logger Logger instance for logging config config object to log """ import ignite logger.info("PyTorch version: %s", torch.__version__) logger.info("Ignite version: %s", ignite.__version__) if torch.cuda.is_available(): # explicitly import cudnn as # torch.backends.cudnn can not be pickled with hvd spawning procs from torch.backends import cudnn logger.info("GPU device: %s", torch.cuda.get_device_name(idist.get_local_rank())) logger.info("CUDA version: %s", torch.version.cuda) logger.info("CUDNN version: %s", cudnn.version()) logger.info("Configuration: %s", pformat(vars(config))) if idist.get_world_size() > 1: logger.info("distributed configuration: %s", idist.model_name()) logger.info("backend: %s", idist.backend()) logger.info("device: %s", idist.device().type) logger.info("hostname: %s", idist.hostname()) logger.info("world size: %s", idist.get_world_size()) logger.info("rank: %s", idist.get_rank()) logger.info("local rank: %s", idist.get_local_rank()) logger.info("num processes per node: %s", idist.get_nproc_per_node()) logger.info("num nodes: %s", idist.get_nnodes()) logger.info("node rank: %s", idist.get_node_rank())