Example #1
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None):
    assert idist.backend() == backend, "{} vs {}".format(idist.backend(), backend)

    this_device = idist.device()
    assert isinstance(this_device, torch.device)
    if backend == "nccl":
        true_device = torch.device("{}:{}".format(true_device, local_rank))
        assert this_device == true_device, "{} vs {}".format(this_device, true_device)
    elif backend == "gloo":
        assert this_device == torch.device(true_device)
    elif backend == "xla-tpu":
        assert true_device in this_device.type

    if rank is None:
        if idist.model_name() == "native-dist":
            rank = dist.get_rank()
            assert idist.get_rank() == rank

    assert idist.get_world_size() == ws
    assert idist.get_local_rank() == local_rank

    assert idist.model_name() in ("native-dist", "xla-dist")

    _sanity_check()
Example #2
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None):
    assert idist.backend() == backend, f"{idist.backend()} vs {backend}"

    this_device = idist.device()
    assert isinstance(this_device, torch.device)
    if backend in ("nccl", "horovod") and "cuda" in this_device.type:
        true_device = torch.device(f"{true_device}:{local_rank}")
        assert this_device == true_device, f"{this_device} vs {true_device}"
    elif backend in ("gloo", "horovod"):
        assert this_device == torch.device(true_device)
    elif backend == "xla-tpu":
        assert true_device in this_device.type

    if rank is None:
        if idist.model_name() == "native-dist":
            rank = dist.get_rank()

    if rank is not None:
        assert idist.get_rank() == rank

    assert idist.get_world_size() == ws
    assert idist.get_local_rank() == local_rank

    assert idist.model_name() in ("native-dist", "xla-dist", "horovod-dist")

    _sanity_check()
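Both variants of _test_distrib_config above come from Ignite's test suite and assume module-level imports of torch, torch.distributed as dist and ignite.distributed as idist, plus a _sanity_check helper; they must run inside an already-initialized process group. A minimal sketch of driving such a helper with idist.spawn, assuming a gloo backend on CPU with two processes (backend, world size and device values are illustrative):

import ignite.distributed as idist

# spawn 2 CPU processes with the gloo backend and run the check in each;
# the spawned function receives the local process index as its first argument
idist.spawn("gloo", _test_distrib_config, args=("gloo", 2, "cpu"), nproc_per_node=2)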
Example #3
def test_no_distrib(capsys):

    assert idist.backend() is None
    if torch.cuda.is_available():
        assert idist.device().type == "cuda"
    else:
        assert idist.device().type == "cpu"
    assert idist.get_rank() == 0
    assert idist.get_world_size() == 1
    assert idist.get_local_rank() == 0
    assert idist.model_name() == "serial"

    from ignite.distributed.utils import _model, _SerialModel

    _sanity_check()
    assert isinstance(_model, _SerialModel)

    idist.show_config()
    captured = capsys.readouterr()
    out = captured.err.split("\r")
    out = list(map(lambda x: x.strip(), out))
    out = list(filter(None, out))
    assert "ignite.distributed.utils INFO: distributed configuration: serial" in out[
        -1]
    assert "ignite.distributed.utils INFO: backend: None" in out[-1]
    if torch.cuda.is_available():
        assert "ignite.distributed.utils INFO: device: cuda" in out[-1]
    else:
        assert "ignite.distributed.utils INFO: device: cpu" in out[-1]
    assert "ignite.distributed.utils INFO: rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: local rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: world size: 1" in out[-1]
Example #4
def _test_frequency_with_engine(workers=None, lower_bound_factor=0.8, every=1):

    if workers is None:
        workers = idist.get_world_size()

    artificial_time = 1.0 / workers  # seconds
    total_tokens = 400 // workers
    batch_size = 128 // workers

    estimated_wps = batch_size * workers / artificial_time

    def update_fn(engine, batch):
        time.sleep(artificial_time)
        return {"ntokens": len(batch)}

    engine = Engine(update_fn)
    wps_metric = Frequency(output_transform=lambda x: x["ntokens"])
    event = Events.ITERATION_COMPLETED(every=every)
    wps_metric.attach(engine, "wps", event_name=event)

    @engine.on(event)
    def assert_wps(e):
        wps = e.state.metrics["wps"]
        # Skip iterations 2, 3, 4 when the backend is Horovod on CUDA:
        # wps is abnormally low for these iterations;
        # other wps values are OK
        if idist.model_name() == "horovod-dist" and e.state.iteration in (2, 3, 4):
            return
        assert estimated_wps * lower_bound_factor < wps <= estimated_wps, "{}: {} < {} < {}".format(
            e.state.iteration, estimated_wps * lower_bound_factor, wps, estimated_wps
        )

    data = [[i] * batch_size for i in range(0, total_tokens, batch_size)]
    max_epochs = 1 if idist.model_name() != "horovod-dist" else 2
    engine.run(data, max_epochs=max_epochs)
Example #5
def assert_wps(e):
    wps = e.state.metrics["wps"]
    # Skip iterations 2, 3, 4 when the backend is Horovod on CUDA:
    # wps is abnormally low for these iterations;
    # other wps values are OK
    if idist.model_name() == "horovod-dist" and e.state.iteration in (2, 3, 4):
        return
    assert estimated_wps * lower_bound_factor < wps <= estimated_wps, "{}: {} < {} < {}".format(
        e.state.iteration, estimated_wps * lower_bound_factor, wps, estimated_wps
    )
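Outside of the test harness, attaching a Frequency metric to a training engine follows the same pattern; a minimal sketch that reports tokens per second every 10 iterations (the step function and data are illustrative):

import time
from ignite.engine import Engine, Events
from ignite.metrics import Frequency

def step(engine, batch):
    time.sleep(0.01)                      # stand-in for real work
    return {"ntokens": len(batch)}

trainer = Engine(step)
# expose the running tokens/sec value as trainer.state.metrics["wps"]
Frequency(output_transform=lambda out: out["ntokens"]).attach(
    trainer, "wps", event_name=Events.ITERATION_COMPLETED(every=10)
)

@trainer.on(Events.ITERATION_COMPLETED(every=10))
def log_wps(engine):
    print(engine.state.iteration, engine.state.metrics["wps"])

trainer.run([[0] * 32] * 100, max_epochs=1)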
Example #6
def _test_func(index, ws, device, backend, true_init_method):
    assert 0 <= index < ws
    assert index == idist.get_local_rank()
    assert ws == idist.get_world_size()
    assert torch.device(device).type == idist.device().type
    assert backend == idist.backend()

    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        assert _model._init_method == true_init_method
Example #7
def training(local_rank, config, **kwargs):

    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws}"
    assert local_rank == idist.get_local_rank()

    # Test init method:
    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        true_init_method = config.get("true_init_method", None)
        assert true_init_method is not None, true_init_method
        assert _model._init_method == true_init_method
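A function with this (local_rank, config, **kwargs) signature is meant to be handed to a launcher; a minimal sketch using idist.Parallel, assuming a gloo backend and a file-based init method (the path and process count are illustrative):

import ignite.distributed as idist

config = {"true_init_method": "file:///tmp/shared_init_file"}

# each spawned process calls training(local_rank, config)
with idist.Parallel(backend="gloo", nproc_per_node=2,
                    init_method=config["true_init_method"]) as parallel:
    parallel.run(training, config)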
Example #8
def log_basic_info(logger: Logger, config: Any) -> None:
    """Logging about pytorch, ignite, configurations, gpu system
    distributed settings.

    Parameters
    ----------
    logger
        Logger instance for logging
    config
        config object to log
    """
    import ignite

    logger.info("PyTorch version: %s", torch.__version__)
    logger.info("Ignite version: %s", ignite.__version__)
    if torch.cuda.is_available():
        # explicitly import cudnn because
        # torch.backends.cudnn cannot be pickled when Horovod spawns processes
        from torch.backends import cudnn

        logger.info("GPU device: %s", torch.cuda.get_device_name(idist.get_local_rank()))
        logger.info("CUDA version: %s", torch.version.cuda)
        logger.info("CUDNN version: %s", cudnn.version())

    logger.info("Configuration: %s", pformat(vars(config)))

    if idist.get_world_size() > 1:
        logger.info("distributed configuration: %s", idist.model_name())
        logger.info("backend: %s", idist.backend())
        logger.info("device: %s", idist.device().type)
        logger.info("hostname: %s", idist.hostname())
        logger.info("world size: %s", idist.get_world_size())
        logger.info("rank: %s", idist.get_rank())
        logger.info("local rank: %s", idist.get_local_rank())
        logger.info("num processes per node: %s", idist.get_nproc_per_node())
        logger.info("num nodes: %s", idist.get_nnodes())
        logger.info("node rank: %s", idist.get_node_rank())