Example #1
    def manual_running_avg_acc(engine):
        i = engine.state.iteration - 1

        true_acc_metric.reset()
        for j in range(idist.get_world_size()):
            output = (
                torch.from_numpy(all_y_pred_batch_values[j, i, :, :]),
                torch.from_numpy(all_y_true_batch_values[j, i, :]),
            )
            true_acc_metric.update(output)

        batch_acc = true_acc_metric._num_correct * 1.0 / true_acc_metric._num_examples

        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]
Example #2
def test_idist_methods_overhead_nccl(distributed_context_single_node_nccl):
    import time

    n = 100000
    start = time.time()
    for _ in range(n):
        _ = idist.get_world_size()
        _ = idist.get_rank()
    elapsed = time.time() - start
    t1 = elapsed / n

    # Same measurement against the native torch.distributed calls, for comparison
    start = time.time()
    for _ in range(n):
        _ = dist.get_world_size()
        _ = dist.get_rank()
    elapsed = time.time() - start
    t2 = elapsed / n

    assert t2 * 3 > t1, "{} * 3 vs {}".format(t2, t1)
Example #3
    def _test(barrier):
        engine = Engine(lambda e, b: b)

        batch_sum = torch.tensor(0).to(device)

        @engine.on(Events.ITERATION_COMPLETED)
        @idist.one_rank_only(with_barrier=barrier)  # i.e. rank == 0
        def _(_):
            batch_sum.data += torch.tensor(engine.state.batch).to(device)

        engine.run([1, 2, 3], max_epochs=2)

        value_list = idist.all_gather(tensor=batch_sum)

        for r in range(idist.get_world_size()):
            if r == 0:
                assert value_list[r].item() == 12
            else:
                assert value_list[r].item() == 0
Example #4
        def another_wrapper(self: Metric, *args: Any,
                            **kwargs: Any) -> Callable:
            if not isinstance(self, Metric):
                raise RuntimeError(
                    "Decorator sync_all_reduce should be used on ignite.metric.Metric class methods only"
                )
            ws = idist.get_world_size()
            if len(attrs) > 0 and not self._is_reduced:
                if ws > 1:
                    for attr in attrs:
                        t = getattr(self, attr, None)
                        if t is not None:
                            t = idist.all_reduce(t)
                            self._is_reduced = True
                            setattr(self, attr, t)
                else:
                    self._is_reduced = True

            return func(self, *args, **kwargs)
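
For context, here is a minimal sketch of how the decorator above is typically applied to a custom metric (the SumMetric class is illustrative only, not part of this listing): attributes named in sync_all_reduce are all-reduced across all processes before compute runs, and reinit__is_reduced resets the _is_reduced flag that the wrapper checks.

import torch

from ignite.metrics import Metric
from ignite.metrics.metric import reinit__is_reduced, sync_all_reduce


class SumMetric(Metric):
    # Hypothetical metric used only to illustrate the decorators.

    @reinit__is_reduced
    def reset(self) -> None:
        self._sum = torch.tensor(0.0, device=self._device)

    @reinit__is_reduced
    def update(self, output) -> None:
        self._sum += output

    @sync_all_reduce("_sum")
    def compute(self) -> float:
        # By this point, _sum has been summed over all participating processes.
        return self._sum.item()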
Example #5
def _test_distrib_all_gather(device):

    res = torch.tensor(idist.all_gather(10), device=device)
    true_res = torch.tensor([10,] * idist.get_world_size(), device=device)
    assert (res == true_res).all()

    t = torch.tensor(idist.get_rank(), device=device)
    res = idist.all_gather(t)
    true_res = torch.tensor([i for i in range(idist.get_world_size())], device=device)
    assert (res == true_res).all()

    x = "test-test"
    if idist.get_rank() == 0:
        x = "abc"
    res = idist.all_gather(x)
    true_res = ["abc",] + ["test-test"] * (idist.get_world_size() - 1)
    assert res == true_res

    base_x = "tests/ignite/distributed/utils/test_native.py" * 2000
    x = base_x
    if idist.get_rank() == 0:
        x = "abc"

    res = idist.all_gather(x)
    true_res = ["abc",] + [base_x] * (idist.get_world_size() - 1)
    assert res == true_res

    t = torch.arange(100, device=device).reshape(4, 25) * (idist.get_rank() + 1)
    in_dtype = t.dtype
    res = idist.all_gather(t)
    assert res.shape == (idist.get_world_size() * 4, 25)
    assert res.dtype == in_dtype
    true_res = torch.zeros(idist.get_world_size() * 4, 25, device=device)
    for i in range(idist.get_world_size()):
        true_res[i * 4 : (i + 1) * 4, ...] = torch.arange(100, device=device).reshape(4, 25) * (i + 1)
    assert (res == true_res).all()

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_gather([0, 1, 2])
Example #6
def _test_macro_distrib_integration(device):

    from ignite.engine import Engine

    rank = idist.get_rank()

    size = len(corpus.chunks)

    data = []
    for c in corpus.chunks:
        data += idist.get_world_size() * [c]

    def update(_, i):
        return data[i + size * rank]

    def _test(metric_device):
        engine = Engine(update)
        m = Bleu(ngram=4, smooth="smooth2")
        m.attach(engine, "bleu")

        engine.run(data=list(range(size)), max_epochs=1)

        assert "bleu" in engine.state.metrics

        ref_bleu = 0
        for candidates, references in data:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                ref_bleu += sentence_bleu(
                    references[0],
                    candidates[0],
                    weights=[0.25, 0.25, 0.25, 0.25],
                    smoothing_function=SmoothingFunction().method2,
                )

        assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data)

    _test("cpu")

    if device.type != "xla":
        _test(idist.device())
Example #7
    def _test(metric_device):
        engine = Engine(update)

        m = RootMeanSquaredError(device=metric_device)
        m.attach(engine, "rmse")

        data = list(range(n_iters))
        engine.run(data=data, max_epochs=1)

        assert "rmse" in engine.state.metrics
        res = engine.state.metrics["rmse"]

        y_preds_full = []
        for i in range(idist.get_world_size()):
            y_preds_full.append((i + 1) * torch.ones(offset))
        y_preds_full = torch.stack(y_preds_full).to(device).flatten()

        true_res = np.sqrt(np.mean(np.square((y_true - y_preds_full).cpu().numpy())))

        assert pytest.approx(res, rel=tol) == true_res
Example #8
def initialize(config):
    model = get_model(config.model, config.model_dir, config.dropout, config.n_fc, config.num_classes)

    config.learning_rate *= idist.get_world_size()
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = nn.BCEWithLogitsLoss()

    le = config.num_iters_per_epoch
    milestones_values = [
        (0, 0.0),
        (le * config.num_warmup_epochs, config.learning_rate),
        (le * config.max_epochs, 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    return model, optimizer, loss_fn, lr_scheduler
Example #9
def _test_distrib_all_reduce(device):

    res = idist.all_reduce(10)
    assert res == 10 * idist.get_world_size()

    t = torch.tensor(10, device=device)
    res = idist.all_reduce(t)
    assert res.item() == 10 * idist.get_world_size()

    rank = idist.get_rank()
    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t)
    assert res.item() == sum(
        [i * 2.0 + 1.0 for i in range(idist.get_world_size())])

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t, "MIN").item()
    true_val = min([i * 2 + 1 for i in range(idist.get_world_size())])
    assert res == true_val, f"{res} vs {true_val}"

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t, "MAX").item()
    true_val = max([i * 2.0 + 1.0 for i in range(idist.get_world_size())])
    assert res == true_val, f"{res} vs {true_val}"

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t, "PRODUCT").item()
    true_val = 1
    for v in [i * 2.0 + 1.0 for i in range(idist.get_world_size())]:
        true_val *= v
    assert res == true_val, f"{res} vs {true_val}"

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce("abc")

        with pytest.raises(ValueError,
                           match=r"Unsupported reduction operation"):
            idist.all_reduce(10, op="ABC")

        t = torch.tensor([0, 1, 2])
        res = idist.all_reduce(t)
        assert res.device == t.device, f"{res.device} vs {t.device}"
Example #10
    def __init__(
        self,
        output_transform: Callable = lambda x: x,
        average: bool = False,
        is_multilabel: bool = False,
        device: Union[str, torch.device] = torch.device("cpu"),
    ):
        if idist.get_world_size() > 1:
            if (not average) and is_multilabel:
                warnings.warn(
                    "Precision/Recall metrics do not work in distributed setting when average=False "
                    "and is_multilabel=True. Results are not reduced across computing devices. Computed result "
                    "corresponds to the local rank's (single process) result.",
                    RuntimeWarning,
                )

        self._average = average
        self.eps = 1e-20
        super(_BasePrecisionRecall, self).__init__(
            output_transform=output_transform, is_multilabel=is_multilabel, device=device
        )
Example #11
def _test_distrib_integration(device, tol=1e-6):
    import numpy as np

    from ignite.engine import Engine

    rank = idist.get_rank()
    n_iters = 100
    s = 10
    offset = n_iters * s

    y_true = torch.arange(0, offset * idist.get_world_size(), dtype=torch.float).to(device)
    y_preds = (rank + 1) * torch.ones(offset, dtype=torch.float).to(device)

    def update(engine, i):
        return y_preds[i * s : (i + 1) * s], y_true[i * s + offset * rank : (i + 1) * s + offset * rank]

    def _test(metric_device):
        engine = Engine(update)

        m = RootMeanSquaredError(device=metric_device)
        m.attach(engine, "rmse")

        data = list(range(n_iters))
        engine.run(data=data, max_epochs=1)

        assert "rmse" in engine.state.metrics
        res = engine.state.metrics["rmse"]

        y_preds_full = []
        for i in range(idist.get_world_size()):
            y_preds_full.append((i + 1) * torch.ones(offset))
        y_preds_full = torch.stack(y_preds_full).to(device).flatten()

        true_res = np.sqrt(np.mean(np.square((y_true - y_preds_full).cpu().numpy())))

        assert pytest.approx(res, rel=tol) == true_res

    _test("cpu")
    if device.type != "xla":
        _test(idist.device())
Example #12
def training(local_rank, config, **kwargs):

    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws * (ws - 1) / 2}"
    assert local_rank == idist.get_local_rank()

    # Test init method:
    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        true_init_method = config.get("true_init_method", None)
        assert true_init_method is not None, true_init_method
        assert _model._init_method == true_init_method
Example #13
    def _test(metric_device):
        n_iters = 60
        s = 16
        offset = n_iters * s

        n_probabilities = 10
        y = torch.rand(offset * idist.get_world_size(), n_probabilities)

        def update(_, i):
            return y[i * s + rank * offset:(i + 1) * s + rank * offset, :]

        engine = Engine(update)
        m = InceptionScore(num_features=n_probabilities,
                           feature_extractor=torch.nn.Identity(),
                           device=metric_device)
        m.attach(engine, "InceptionScore")

        engine.run(data=list(range(n_iters)), max_epochs=1)

        assert "InceptionScore" in engine.state.metrics

        assert pytest.approx(calculate_inception_score(y)) == m.compute()
Example #14
    def __init__(
        self,
        output_transform: Callable = lambda x: x,
        device: Optional[Union[str, torch.device]] = None,
    ):
        self._output_transform = output_transform

        # Check device if distributed is initialized:
        if idist.get_world_size() > 1:

            # check if reset and update methods are decorated. Compute may not be decorated
            if not (hasattr(self.reset, "_decorated")
                    and hasattr(self.update, "_decorated")):
                warnings.warn(
                    "{} class does not support distributed setting. Computed result is not collected "
                    "across all computing devices".format(
                        self.__class__.__name__),
                    RuntimeWarning,
                )
        self._device = device
        self._is_reduced = False
        self.reset()
Example #15
    def _test(data):
        if sampler_name is None:
            sampler = None
        elif sampler_name == "WeightedRandomSampler":
            sampler = WeightedRandomSampler(weights=torch.ones(100), num_samples=100)
        else:
            raise RuntimeError(f"Unknown sampler name: {sampler_name}")

        # Test auto_dataloader
        assert idist.get_world_size() == ws, f"{idist.get_world_size()} vs {ws}"

        shuffle = (sampler is None) if not isinstance(data, IterableDataset) else False
        dataloader = auto_dataloader(
            data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, shuffle=shuffle
        )

        assert isinstance(dataloader, dl_type)
        if hasattr(dataloader, "_loader"):
            dataloader = dataloader._loader
        if ws < batch_size:
            assert dataloader.batch_size == batch_size // ws
        else:
            assert dataloader.batch_size == batch_size
        if ws <= num_workers:
            assert dataloader.num_workers == (num_workers + nproc - 1) // nproc
        else:
            assert dataloader.num_workers == num_workers

        if isinstance(data, IterableDataset):
            sampler_type = _InfiniteConstantSampler
        elif ws > 1:
            sampler_type = DistributedSampler if sampler is None else DistributedProxySampler
        else:
            sampler_type = RandomSampler if sampler is None else type(sampler)

        assert isinstance(dataloader.sampler, sampler_type)

        if isinstance(dataloader, DataLoader):
            assert dataloader.pin_memory == ("cuda" in idist.device().type)
Example #16
    def _test(metric_device):
        engine = Engine(update)

        m = MeanPairwiseDistance(device=metric_device)
        m.attach(engine, "mpwd")

        data = list(range(n_iters))
        engine.run(data=data, max_epochs=1)

        assert "mpwd" in engine.state.metrics
        res = engine.state.metrics["mpwd"]

        true_res = []
        for i in range(n_iters * idist.get_world_size()):
            true_res.append(
                torch.pairwise_distance(y_true[i * s:(i + 1) * s, ...],
                                        y_preds[i * s:(i + 1) * s, ...],
                                        p=m._p,
                                        eps=m._eps).cpu().numpy())
        true_res = np.array(true_res).ravel()
        true_res = true_res.mean()

        assert pytest.approx(res) == true_res
Example #17
    def __init__(self, logger: TrainsLogger = None, output_uri: str = None, dirname: str = None, *args, **kwargs):

        self._setup_check_trains(logger, output_uri)

        if not dirname:
            dirname = ""
            if idist.get_rank() == 0:
                dirname = tempfile.mkdtemp(
                    prefix="ignite_checkpoints_{}".format(datetime.now().strftime("%Y_%m_%d_%H_%M_%S_"))
                )
            if idist.get_world_size() > 1:
                dirname = idist.all_gather(dirname)[0]

            warnings.warn("TrainsSaver created a temporary checkpoints directory: {}".format(dirname))
            idist.barrier()

        # Let's set non-atomic tmp dir saving behaviour
        if "atomic" not in kwargs:
            kwargs["atomic"] = False

        self._checkpoint_slots = defaultdict(list)

        super(TrainsSaver, self).__init__(dirname=dirname, *args, **kwargs)
Example #18
    def __init__(
        self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"),
    ):
        self._output_transform = output_transform

        # Check device if distributed is initialized:
        if idist.get_world_size() > 1:

            # check if reset and update methods are decorated. Compute may not be decorated
            if not (hasattr(self.reset, "_decorated") and hasattr(self.update, "_decorated")):
                warnings.warn(
                    f"{self.__class__.__name__} class does not support distributed setting. "
                    "Computed result is not collected across all computing devices",
                    RuntimeWarning,
                )

        # Some metrics have a large performance regression when run on XLA devices, so for now, we disallow it.
        if torch.device(device).type == "xla":
            raise ValueError("Cannot create metric on an XLA device. Use device='cpu' instead.")

        self._device = torch.device(device)
        self._is_reduced = False
        self.reset()
Example #19
def test_auto_dataloader_warning_distributed_sampler(
        distributed_context_single_node_gloo):
    dataset = DummyDS()
    rank = idist.get_rank()
    world_size = idist.get_world_size()
    auto_dataloader(dataset,
                    sampler=DistributedSampler(dataset,
                                               num_replicas=world_size,
                                               rank=rank))
    if world_size > 1:
        wrong_rank = (rank + 1) % world_size
        expected_warning = f"Found distributed sampler with rank={wrong_rank}, but process rank is {rank}"
        with pytest.warns(UserWarning, match=expected_warning):
            auto_dataloader(dataset,
                            sampler=DistributedSampler(dataset,
                                                       num_replicas=world_size,
                                                       rank=wrong_rank))
    expected_warning = f"Found distributed sampler with num_replicas={world_size + 1}, but world size is {world_size}"
    with pytest.warns(UserWarning, match=expected_warning):
        auto_dataloader(dataset,
                        sampler=DistributedSampler(dataset,
                                                   num_replicas=world_size + 1,
                                                   rank=rank))
Example #20
def test_no_distrib(capsys):

    from ignite.distributed.utils import _model

    print("test_no_distrib : dist: ", dist.is_available())
    print("test_no_distrib : _model", type(_model))

    assert idist.backend() is None
    if torch.cuda.is_available():
        assert idist.device().type == "cuda"
    else:
        assert idist.device().type == "cpu"
    assert idist.get_rank() == 0
    assert idist.get_world_size() == 1
    assert idist.get_local_rank() == 0
    assert idist.model_name() == "serial"

    from ignite.distributed.utils import _model, _SerialModel

    _sanity_check()
    assert isinstance(_model, _SerialModel)

    idist.show_config()
    captured = capsys.readouterr()
    out = captured.err.split("\r")
    out = list(map(lambda x: x.strip(), out))
    out = list(filter(None, out))
    assert "ignite.distributed.utils INFO: distributed configuration: serial" in out[
        -1]
    assert "ignite.distributed.utils INFO: backend: None" in out[-1]
    if torch.cuda.is_available():
        assert "ignite.distributed.utils INFO: device: cuda" in out[-1]
    else:
        assert "ignite.distributed.utils INFO: device: cpu" in out[-1]
    assert "ignite.distributed.utils INFO: rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: local rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: world size: 1" in out[-1]
Example #21
def _test_auto_dataloader(ws, nproc, batch_size, num_workers=1, sampler_name=None, dl_type=DataLoader):

    data = torch.rand(100, 3, 12, 12)

    if sampler_name is None:
        sampler = None
    elif sampler_name == "WeightedRandomSampler":
        sampler = WeightedRandomSampler(weights=torch.ones(100), num_samples=100)
    else:
        raise RuntimeError("Unknown sampler name: {}".format(sampler_name))

    # Test auto_dataloader
    assert idist.get_world_size() == ws
    dataloader = auto_dataloader(
        data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, shuffle=sampler is None
    )

    assert isinstance(dataloader, dl_type)
    if hasattr(dataloader, "_loader"):
        dataloader = dataloader._loader
    if ws < batch_size:
        assert dataloader.batch_size == batch_size // ws
    else:
        assert dataloader.batch_size == batch_size
    if ws <= num_workers:
        assert dataloader.num_workers == (num_workers + nproc - 1) // nproc
    else:
        assert dataloader.num_workers == num_workers

    if ws < 2:
        sampler_type = RandomSampler if sampler is None else type(sampler)
        assert isinstance(dataloader.sampler, sampler_type)
    else:
        sampler_type = DistributedSampler if sampler is None else DistributedProxySampler
        assert isinstance(dataloader.sampler, sampler_type)
    if isinstance(dataloader, DataLoader):
        assert dataloader.pin_memory == ("cuda" in idist.device().type)
Example #22
def _test_distrib_config(local_rank,
                         backend,
                         ws,
                         true_device,
                         rank=None,
                         true_init_method=None):
    assert idist.backend() == backend, f"{idist.backend()} vs {backend}"

    this_device = idist.device()
    assert isinstance(this_device, torch.device)
    if backend in ("nccl", "horovod") and "cuda" in this_device.type:
        true_device = torch.device(f"{true_device}:{local_rank}")
        assert this_device == true_device, f"{this_device} vs {true_device}"
    elif backend in ("gloo", "horovod"):
        assert this_device == torch.device(true_device)
    elif backend == "xla-tpu":
        assert true_device in this_device.type

    if rank is None:
        if idist.model_name() == "native-dist":
            rank = dist.get_rank()

    if rank is not None:
        assert idist.get_rank() == rank

    assert idist.get_world_size() == ws
    assert idist.get_local_rank() == local_rank

    assert idist.model_name() in ("native-dist", "xla-dist", "horovod-dist")

    _sanity_check()

    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        if true_init_method is not None:
            assert _model._init_method == true_init_method
Example #23
    def __init__(
        self,
        logger: Optional[ClearMLLogger] = None,
        output_uri: Optional[str] = None,
        dirname: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ):

        self._setup_check_clearml(logger, output_uri)

        if not dirname:
            dirname = ""
            if idist.get_rank() == 0:
                dirname = tempfile.mkdtemp(
                    prefix=f"ignite_checkpoints_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S_')}"
                )
            if idist.get_world_size() > 1:
                dirname = idist.all_gather(dirname)[0]  # type: ignore[index, assignment]

            warnings.warn(f"ClearMLSaver created a temporary checkpoints directory: {dirname}")
            idist.barrier()

        # Let's set non-atomic tmp dir saving behaviour
        if "atomic" not in kwargs:
            kwargs["atomic"] = False

        self._checkpoint_slots = defaultdict(list)  # type: DefaultDict[Union[str, Tuple[str, str]], List[Any]]

        super(ClearMLSaver, self).__init__(dirname=dirname, *args, **kwargs)  # type: ignore[misc]
Example #24
def log_basic_info(logger, config):
    logger.info(f"Train {config['model']} on CIFAR10")
    logger.info(f"- PyTorch version: {torch.__version__}")
    logger.info(f"- Ignite version: {ignite.__version__}")
    if torch.cuda.is_available():
        # explicitly import cudnn, as torch.backends.cudnn
        # cannot be pickled with hvd spawning procs
        from torch.backends import cudnn

        logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
        logger.info(f"- CUDA version: {torch.version.cuda}")
        logger.info(f"- CUDNN version: {cudnn.version()}")

    logger.info("\n")
    logger.info("Configuration:")
    for key, value in config.items():
        logger.info(f"\t{key}: {value}")
    logger.info("\n")

    if idist.get_world_size() > 1:
        logger.info("\nDistributed setting:")
        logger.info(f"\tbackend: {idist.backend()}")
        logger.info(f"\tworld size: {idist.get_world_size()}")
        logger.info("\n")
Example #25
def _test_frequency_with_engine(workers=None, lower_bound_factor=0.8, every=1):

    if workers is None:
        workers = idist.get_world_size()

    artificial_time = 1.0 / workers  # seconds
    total_tokens = 400 // workers
    batch_size = 128 // workers

    estimated_wps = batch_size * workers / artificial_time

    def update_fn(engine, batch):
        time.sleep(artificial_time)
        return {"ntokens": len(batch)}

    engine = Engine(update_fn)
    wps_metric = Frequency(output_transform=lambda x: x["ntokens"])
    event = Events.ITERATION_COMPLETED(every=every)
    wps_metric.attach(engine, "wps", event_name=event)

    @engine.on(event)
    def assert_wps(e):
        wps = e.state.metrics["wps"]
        # Skip iterations 2, 3, 4 if backend is Horovod on CUDA,
        # wps is abnormally low for these iterations
        # otherwise, other values of wps are OK
        if idist.model_name() == "horovod-dist" and e.state.iteration in (2, 3,
                                                                          4):
            return
        assert (
            estimated_wps * lower_bound_factor < wps <= estimated_wps
        ), f"{e.state.iteration}: {estimated_wps * lower_bound_factor} < {wps} < {estimated_wps}"

    data = [[i] * batch_size for i in range(0, total_tokens, batch_size)]
    max_epochs = 1 if idist.model_name() != "horovod-dist" else 2
    engine.run(data, max_epochs=max_epochs)
Example #26
def setup_common_training_handlers(
    trainer: Engine,
    train_sampler: Optional[DistributedSampler] = None,
    to_save: Optional[Mapping] = None,
    save_every_iters: int = 1000,
    output_path: Optional[str] = None,
    lr_scheduler: Optional[Union[ParamScheduler, _LRScheduler]] = None,
    with_gpu_stats: bool = False,
    output_names: Optional[Iterable[str]] = None,
    with_pbars: bool = True,
    with_pbar_on_iters: bool = True,
    log_every_iters: int = 100,
    stop_on_nan: bool = True,
    clear_cuda_cache: bool = True,
    save_handler: Optional[Union[Callable, BaseSaveHandler]] = None,
    **kwargs: Any,
) -> None:
    """Helper method to setup trainer with common handlers (it also supports distributed configuration):

        - :class:`~ignite.handlers.TerminateOnNan`
        - handler to setup learning rate scheduling
        - :class:`~ignite.handlers.ModelCheckpoint`
        - :class:`~ignite.metrics.RunningAverage` on `update_function` output
        - Two progress bars on epochs and optionally on iterations

    Args:
        trainer (Engine): trainer engine. Output of trainer's `update_function` should be a dictionary
            or sequence or a single tensor.
        train_sampler (torch.utils.data.DistributedSampler, optional): Optional distributed sampler used to call
            `set_epoch` method on epoch started event.
        to_save (dict, optional): dictionary with objects to save in the checkpoint. This argument is passed to
            :class:`~ignite.handlers.Checkpoint` instance.
        save_every_iters (int, optional): saving interval. By default, `to_save` objects are stored
            each 1000 iterations.
        output_path (str, optional): output path to indicate where `to_save` objects are stored. Argument is mutually
            exclusive with ``save_handler``.
        lr_scheduler (ParamScheduler or subclass of `torch.optim.lr_scheduler._LRScheduler`): learning rate scheduler
            as native torch LRScheduler or ignite's parameter scheduler.
        with_gpu_stats (bool, optional): if True, :class:`~ignite.contrib.metrics.handlers.GpuInfo` is attached to the
            trainer. This requires `pynvml` package to be installed.
        output_names (list/tuple, optional): list of names associated with `update_function` output dictionary.
        with_pbars (bool, optional): if True, two progress bars on epochs and optionally on iterations are attached.
            Default, True.
        with_pbar_on_iters (bool, optional): if True, a progress bar on iterations is attached to the trainer.
            Default, True.
        log_every_iters (int, optional): logging interval for :class:`~ignite.contrib.metrics.handlers.GpuInfo` and for
            epoch-wise progress bar. Default, 100.
        stop_on_nan (bool, optional): if True, :class:`~ignite.handlers.TerminateOnNan` handler is added to the trainer.
            Default, True.
        clear_cuda_cache (bool, optional): if True, `torch.cuda.empty_cache()` is called every end of epoch.
            Default, True.
        save_handler (callable or :class:`~ignite.handlers.checkpoint.BaseSaveHandler`, optional): Method or callable
            class to use to store ``to_save``. See :class:`~ignite.handlers.checkpoint.Checkpoint` for more details.
            Argument is mutually exclusive with ``output_path``.
        **kwargs: optional keyword args to be passed to construct :class:`~ignite.handlers.checkpoint.Checkpoint`.
    """

    if idist.get_world_size() > 1:
        _setup_common_distrib_training_handlers(
            trainer,
            train_sampler=train_sampler,
            to_save=to_save,
            save_every_iters=save_every_iters,
            output_path=output_path,
            lr_scheduler=lr_scheduler,
            with_gpu_stats=with_gpu_stats,
            output_names=output_names,
            with_pbars=with_pbars,
            with_pbar_on_iters=with_pbar_on_iters,
            log_every_iters=log_every_iters,
            stop_on_nan=stop_on_nan,
            clear_cuda_cache=clear_cuda_cache,
            save_handler=save_handler,
            **kwargs,
        )
    else:
        if train_sampler is not None and isinstance(train_sampler,
                                                    DistributedSampler):
            warnings.warn(
                "Argument train_sampler is a distributed sampler,"
                " but either there is no distributed setting or world size is < 2. "
                "Train sampler argument will be ignored",
                UserWarning,
            )
        _setup_common_training_handlers(
            trainer,
            to_save=to_save,
            save_every_iters=save_every_iters,
            output_path=output_path,
            lr_scheduler=lr_scheduler,
            with_gpu_stats=with_gpu_stats,
            output_names=output_names,
            with_pbars=with_pbars,
            with_pbar_on_iters=with_pbar_on_iters,
            log_every_iters=log_every_iters,
            stop_on_nan=stop_on_nan,
            clear_cuda_cache=clear_cuda_cache,
            save_handler=save_handler,
            **kwargs,
        )
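
To make the docstring above concrete, here is a minimal, self-contained usage sketch (the toy model, data, and output path are assumptions for illustration, not taken from this listing):

import torch
import torch.nn as nn

from ignite.contrib.engines import common
from ignite.engine import Engine

# Toy model and optimizer, assumed for illustration only
model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

def train_step(engine, batch):
    optimizer.zero_grad()
    loss = model(batch).mean()
    loss.backward()
    optimizer.step()
    return {"batch loss": loss.item()}

trainer = Engine(train_step)

common.setup_common_training_handlers(
    trainer,
    to_save={"model": model, "optimizer": optimizer},
    save_every_iters=1000,
    output_path="/tmp/checkpoints",  # hypothetical output directory
    output_names=["batch loss"],     # keys of the update_function output dict
    with_pbars=True,
)

trainer.run([torch.rand(4, 10) for _ in range(8)], max_epochs=2)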
Example #27
    def _get_output_value(self) -> Union[torch.Tensor, float]:
        # we need to compute average instead of sum produced by @sync_all_reduce("src")
        output = cast(Union[torch.Tensor, float], self.src) / idist.get_world_size()
        return output
Example #28
    def _test(metric_device):
        metric_device = torch.device(metric_device)
        acc = Accuracy(is_multilabel=True, device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        acc.update((y_pred, y))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        y = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        acc.update((y_pred, y))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
        # check that result is not changed
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        # Batched Updates
        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()

        batch_size = 16
        n_iters = y.shape[0] // batch_size + 1

        for i in range(n_iters):
            idx = i * batch_size
            acc.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, L, ...) -> (N * L * ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, L, ...) -> (N * L ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
Example #29
def _test_distrib_integration_multilabel(device):

    from ignite.engine import Engine

    rank = idist.get_rank()
    torch.manual_seed(12)

    def _test(average, n_epochs):
        n_iters = 60
        s = 16
        n_classes = 7

        offset = n_iters * s
        y_true = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device)
        y_preds = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device)

        def update(engine, i):
            return (
                y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, ...],
                y_true[i * s + rank * offset : (i + 1) * s + rank * offset, ...],
            )

        engine = Engine(update)

        pr = Precision(average=average, is_multilabel=True)
        pr.attach(engine, "pr")

        data = list(range(n_iters))
        engine.run(data=data, max_epochs=n_epochs)

        assert "pr" in engine.state.metrics
        res = engine.state.metrics["pr"]
        res2 = pr.compute()
        if isinstance(res, torch.Tensor):
            res = res.cpu().numpy()
            res2 = res2.cpu().numpy()
            assert (res == res2).all()
        else:
            assert res == res2

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UndefinedMetricWarning)
            true_res = precision_score(
                to_numpy_multilabel(y_true), to_numpy_multilabel(y_preds), average="samples" if average else None
            )

        assert pytest.approx(res) == true_res

    for _ in range(2):
        _test(average=True, n_epochs=1)
        _test(average=True, n_epochs=2)

    if idist.get_world_size() > 1:
        with pytest.warns(
            RuntimeWarning,
            match="Precision/Recall metrics do not work in distributed setting when "
            "average=False and is_multilabel=True",
        ):
            pr = Precision(average=False, is_multilabel=True)

        y_pred = torch.randint(0, 2, size=(4, 3, 6, 8))
        y = torch.randint(0, 2, size=(4, 3, 6, 8)).long()
        pr.update((y_pred, y))
        pr_compute1 = pr.compute()
        pr_compute2 = pr.compute()
        assert len(pr_compute1) == 4 * 6 * 8
        assert (pr_compute1 == pr_compute2).all()
Example #30
    def _test(metric_device):
        data = list(range(n_iters))
        np.random.seed(12)
        all_y_true_batch_values = np.random.randint(
            0,
            n_classes,
            size=(idist.get_world_size(), n_epochs * n_iters, batch_size))
        all_y_pred_batch_values = np.random.rand(idist.get_world_size(),
                                                 n_epochs * n_iters,
                                                 batch_size, n_classes)

        y_true_batch_values = iter(all_y_true_batch_values[rank, ...])
        y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...])

        def update_fn(engine, batch):
            y_true_batch = next(y_true_batch_values)
            y_pred_batch = next(y_pred_batch_values)
            return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)

        trainer = Engine(update_fn)
        alpha = 0.98

        acc_metric = RunningAverage(
            Accuracy(output_transform=lambda x: [x[0], x[1]], device=metric_device),
            alpha=alpha,
            epoch_bound=False,
        )
        acc_metric.attach(trainer, "running_avg_accuracy")

        running_avg_acc = [None]
        true_acc_metric = Accuracy(device=metric_device)

        @trainer.on(Events.ITERATION_COMPLETED)
        def manual_running_avg_acc(engine):
            i = engine.state.iteration - 1

            true_acc_metric.reset()
            for j in range(idist.get_world_size()):
                output = (
                    torch.from_numpy(all_y_pred_batch_values[j, i, :, :]),
                    torch.from_numpy(all_y_true_batch_values[j, i, :]),
                )
                true_acc_metric.update(output)

            batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples

            if running_avg_acc[0] is None:
                running_avg_acc[0] = batch_acc
            else:
                running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc
            engine.state.running_avg_acc = running_avg_acc[0]

        @trainer.on(Events.ITERATION_COMPLETED)
        def assert_equal_running_avg_acc_values(engine):
            assert (
                engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"]
            ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"

        trainer.run(data, max_epochs=3)