Example 1
def _test_distrib_average(device):
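    # compute() must fail before any update; afterwards Average all-reduces its
    # accumulator, so the expected result is the mean of the summed per-rank
    # tensors divided by the world size.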

    with pytest.raises(NotComputableError):
        v = Average(device=device)
        v.compute()

    mean_var = Average(device=device)
    y_true = torch.rand(100, dtype=torch.float64) + torch.randint(
        0, 10, size=(100, )).double()
    y_true = y_true.to(device)

    for y in y_true:
        mean_var.update(y)

    m = mean_var.compute()

    y_true = idist.all_reduce(y_true)
    assert m.item() == pytest.approx(y_true.mean().item() /
                                     idist.get_world_size())

    mean_var = Average(device=device)
    y_true = torch.rand(100, 10, dtype=torch.float64) + torch.randint(
        0, 10, size=(100, 10)).double()
    y_true = y_true.to(device)

    for y in y_true:
        mean_var.update(y)

    m = mean_var.compute()

    y_true = idist.all_reduce(y_true)
    np.testing.assert_almost_equal(m.cpu().numpy(),
                                   y_true.mean(dim=0).cpu().numpy() /
                                   idist.get_world_size(),
                                   decimal=5)
Example 2
    def _test(metric_device):
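        # GeometricAverage accumulates the sum of logs, so in a distributed run the
        # expected value is exp of the mean of the all-reduced logs divided by the world size.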
        with pytest.raises(NotComputableError):
            v = GeometricAverage(device=metric_device)
            v.compute()

        decimal = 5 if device.type != "xla" else 4

        mean_var = GeometricAverage(device=metric_device)
        y_true = torch.rand(100, dtype=torch.float64) + torch.randint(0, 10, size=(100,)).double()
        y_true = y_true.to(device)

        for y in y_true:
            mean_var.update(y)

        m = mean_var.compute()
        log_y_true = torch.log(y_true)
        log_y_true = idist.all_reduce(log_y_true)
        np.testing.assert_almost_equal(
            m, torch.exp(log_y_true.mean(dim=0) / idist.get_world_size()).item(), decimal=decimal
        )

        mean_var = GeometricAverage(device=metric_device)
        y_true = torch.rand(100, 10, dtype=torch.float64) + torch.randint(0, 10, size=(100, 10)).double()
        y_true = y_true.to(device)

        for y in y_true:
            mean_var.update(y)

        m = mean_var.compute()
        log_y_true = torch.log(y_true)
        log_y_true = idist.all_reduce(log_y_true)
        np.testing.assert_almost_equal(
            m.cpu().numpy(), torch.exp(log_y_true.mean(dim=0) / idist.get_world_size()).cpu().numpy(), decimal=decimal
        )
Example 3
def _test_distrib_variable_accumulation(device):
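    # VariableAccumulation sums updates locally; on compute() both the accumulator
    # and the update counter are all-reduced across ranks.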

    mean_var = VariableAccumulation(lambda a, x: a + x, device=device)
    y_true = torch.rand(100, device=device, dtype=torch.float64)

    for y in y_true:
        mean_var.update(y)

    y_true = idist.all_reduce(y_true)
    a, n = mean_var.compute()
    assert a.item() == pytest.approx(y_true.sum().item())
    assert n == len(y_true) * idist.get_world_size()
    # check if call compute twice
    a, n = mean_var.compute()
    assert a.item() == pytest.approx(y_true.sum().item())
    assert n == len(y_true) * idist.get_world_size()

    mean_var = VariableAccumulation(lambda a, x: a + x, device=device)
    y_true = torch.rand(50, 10, device=device, dtype=torch.float64)

    for y in y_true:
        mean_var.update(y)

    y_true = idist.all_reduce(y_true)
    a, n = mean_var.compute()
    assert n == len(y_true) * idist.get_world_size()
    np.testing.assert_almost_equal(a.cpu().numpy(),
                                   y_true.sum(dim=0).cpu().numpy(),
                                   decimal=4)
    a, n = mean_var.compute()
    assert n == len(y_true) * idist.get_world_size()
    np.testing.assert_almost_equal(a.cpu().numpy(),
                                   y_true.sum(dim=0).cpu().numpy(),
                                   decimal=4)
Example 4
    def compute(self) -> Union[torch.Tensor, float]:
        is_scalar = not isinstance(self._positives,
                                   torch.Tensor) or self._positives.ndim == 0
        if is_scalar and self._positives == 0:
            raise NotComputableError(
                f"{self.__class__.__name__} must have at least one example before it can be computed."
            )
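        # Reduce across processes only once: counters are summed with all_reduce,
        # except for un-averaged multilabel results, which are concatenated with
        # all_gather so per-sample values from every rank are kept.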
        if not self._is_reduced:
            if not (self._type == "multilabel" and not self._average):
                self._true_positives = idist.all_reduce(
                    self._true_positives)  # type: ignore[assignment]
                self._positives = idist.all_reduce(
                    self._positives)  # type: ignore[assignment]
            else:
                self._true_positives = cast(
                    torch.Tensor, idist.all_gather(self._true_positives))
                self._positives = cast(torch.Tensor,
                                       idist.all_gather(self._positives))
            self._is_reduced = True  # type: bool

        result = self._true_positives / (self._positives + self.eps)

        if self._average:
            return cast(torch.Tensor, result).mean().item()
        else:
            return result
Example 5
def _test_distrib_all_gather(device):

    res = torch.tensor(idist.all_gather(10), device=device)
    true_res = torch.tensor([
        10,
    ] * idist.get_world_size(), device=device)
    assert (res == true_res).all()

    t = torch.tensor(idist.get_rank(), device=device)
    res = idist.all_gather(t)
    true_res = torch.tensor([i for i in range(idist.get_world_size())],
                            device=device)
    assert (res == true_res).all()

    x = "test-test"
    if idist.get_rank() == 0:
        x = "abc"
    res = idist.all_gather(x)
    true_res = [
        "abc",
    ] + ["test-test"] * (idist.get_world_size() - 1)
    assert res == true_res

    base_x = "x" * 1026
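    # all_gather truncates strings longer than 1024 characters; ranks holding the
    # long string are expected to warn about the truncation.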
    x = base_x
    if idist.get_rank() == 0:
        x = "abc"

    if idist.get_rank() > 0:
        with pytest.warns(
                UserWarning,
                match=r"is larger than 1024 and thus will be truncated"):
            res = idist.all_gather(x)
    else:
        res = idist.all_gather(x)
    true_res = [
        "abc",
    ] + [base_x[:1024]] * (idist.get_world_size() - 1)
    assert res == true_res

    t = torch.arange(100, device=device).reshape(4,
                                                 25) * (idist.get_rank() + 1)
    in_dtype = t.dtype
    res = idist.all_gather(t)
    assert res.shape == (idist.get_world_size() * 4, 25)
    assert res.dtype == in_dtype
    true_res = torch.zeros(idist.get_world_size() * 4, 25, device=device)
    for i in range(idist.get_world_size()):
        true_res[i * 4:(i + 1) * 4, ...] = torch.arange(
            100, device=device).reshape(4, 25) * (i + 1)
    assert (res == true_res).all()

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce([0, 1, 2])
Example 6
def _test_distrib_all_gather(device):

    res = torch.tensor(idist.all_gather(10), device=device)
    true_res = torch.tensor([
        10,
    ] * idist.get_world_size(), device=device)
    assert (res == true_res).all()

    t = torch.tensor(idist.get_rank(), device=device)
    res = idist.all_gather(t)
    true_res = torch.tensor([i for i in range(idist.get_world_size())],
                            device=device)
    assert (res == true_res).all()

    x = "test-test"
    if idist.get_rank() == 0:
        x = "abc"
    res = idist.all_gather(x)
    true_res = [
        "abc",
    ] + ["test-test"] * (idist.get_world_size() - 1)
    assert res == true_res

    base_x = "tests/ignite/distributed/utils/test_native.py" * 2000
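    # Unlike the previous example, long strings are gathered here without truncation.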
    x = base_x
    if idist.get_rank() == 0:
        x = "abc"

    res = idist.all_gather(x)
    true_res = [
        "abc",
    ] + [base_x] * (idist.get_world_size() - 1)
    assert res == true_res

    t = torch.arange(100, device=device).reshape(4,
                                                 25) * (idist.get_rank() + 1)
    in_dtype = t.dtype
    res = idist.all_gather(t)
    assert res.shape == (idist.get_world_size() * 4, 25)
    assert res.dtype == in_dtype
    true_res = torch.zeros(idist.get_world_size() * 4, 25, device=device)
    for i in range(idist.get_world_size()):
        true_res[i * 4:(i + 1) * 4, ...] = torch.arange(
            100, device=device).reshape(4, 25) * (i + 1)
    assert (res == true_res).all()

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce([0, 1, 2])
Example 7
def _test_distrib_log_lr_and_loss(device):
    from ignite.handlers import ParamScheduler

    lr_finder = FastaiLRFinder()
    _lr_schedule = MagicMock(spec=ParamScheduler)

    # minimal setup for lr_finder to make _log_lr_and_loss work
    rank = idist.get_rank()
    loss = 0.01 * (rank + 1)

    engine = Engine(lambda e, b: None)

    engine.state.output = loss
    engine.state.iteration = 1
    lr_finder._lr_schedule = _lr_schedule
    lr_finder._history["loss"] = []
    lr_finder._history["lr"] = []

    lr_finder._log_lr_and_loss(engine,
                               output_transform=lambda x: x,
                               smooth_f=0.1,
                               diverge_th=10.0)

    expected_loss = idist.all_reduce(loss)
    assert pytest.approx(lr_finder._history["loss"][-1]) == expected_loss
Example 8
        def another_wrapper(self: Metric, *args: Any,
                            **kwargs: Any) -> Callable:
            if not isinstance(self, Metric):
                raise RuntimeError(
                    "Decorator sync_all_reduce should be used on ignite.metric.Metric class methods only"
                )
            ws = idist.get_world_size()
            if len(attrs) > 0 and not self._is_reduced:
                if ws > 1:
                    for attr in attrs:
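                        # An attribute may specify its reduction as "name:OP"
                        # (MIN, MAX, SUM or PRODUCT); otherwise all_reduce falls
                        # back to its default summation.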
                        op_kwargs = {}
                        if ":" in attr:
                            attr, op = attr.split(":")
                            valid_ops = ["MIN", "MAX", "SUM", "PRODUCT"]
                            if op not in valid_ops:
                                raise ValueError(
                                    f"Reduction operation is not valid (expected : {valid_ops}, got: {op})"
                                )
                            op_kwargs["op"] = op
                        t = getattr(self, attr, None)
                        if t is not None:
                            t = idist.all_reduce(t, **op_kwargs)
                            self._is_reduced = True
                            setattr(self, attr, t)
                else:
                    self._is_reduced = True

            return func(self, *args, **kwargs)
Example 9
def _dist_geom_mean(y_true):
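    # Geometric mean across processes, computed in log space: all-reduce the logs,
    # average over the sample axis, divide by the world size, then exponentiate.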
    log_y_true = torch.log(y_true)
    log_y_true = idist.all_reduce(log_y_true)
    if len(log_y_true.shape) > 2:
        log_y_true = log_y_true.reshape(-1, log_y_true.shape[-1])
    np_t = log_y_true.cpu().numpy()
    return np.exp(np.mean(np_t, axis=0) / idist.get_world_size())
Example 10
    def compute(self):
        is_scalar = not isinstance(self._positives, torch.Tensor) or self._positives.ndim == 0
        if is_scalar and self._positives == 0:
            return -1

        if not (self._type == "multilabel" and not self._average):
            if not self._is_reduced:
                self._true_positives = idist.all_reduce(self._true_positives)  # type: ignore[assignment]
                self._positives = idist.all_reduce(self._positives)  # type: ignore[assignment]
                self._is_reduced = True  # type: bool

        result = self._true_positives / (self._positives + self.eps)

        if self._average:
            return cast(torch.Tensor, result).mean().item()
        else:
            return result
Example 11
    def compute(self) -> Union[torch.Tensor, float]:
        if not (isinstance(self._positives, torch.Tensor)
                or self._positives > 0):
            raise NotComputableError("{} must have at least one example before"
                                     " it can be computed.".format(
                                         self.__class__.__name__))

        if not (self._type == "multilabel" and not self._average):
            if not self._is_reduced:
                self._true_positives = idist.all_reduce(self._true_positives)
                self._positives = idist.all_reduce(self._positives)
                self._is_reduced = True

        result = self._true_positives / (self._positives + self.eps)

        if self._average:
            return result.mean().item()
        else:
            return result
Example 12
def _test_distrib_barrier(device):
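    # Every rank contributes its rank id and rank 0 adds 10 before the barrier, so
    # the all-reduced value must equal sum(ranks) + 10 on every process.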

    t = torch.tensor([idist.get_rank()], device=device, dtype=torch.float)
    true_res = sum([i for i in range(idist.get_world_size())])

    if idist.get_rank() == 0:
        t += 10.0
    idist.barrier()

    tt = idist.all_reduce(t)
    assert tt.item() == true_res + 10.0
Example 13
def training(local_rank, config, **kwargs):

    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs,
          f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
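    # All-reduced rank ids sum to 0 + 1 + ... + (ws - 1) = ws * (ws - 1) / 2.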
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws}"
    assert local_rank == idist.get_local_rank()
Example 14
        def another_wrapper(self: Metric, *args, **kwargs) -> Callable:
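            # Simpler variant of sync_all_reduce: attributes are always combined with
            # the default SUM reduction, and only when more than one process is running.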
            if not isinstance(self, Metric):
                raise RuntimeError(
                    "Decorator sync_all_reduce should be used on ignite.metric.Metric class methods only"
                )

            if len(attrs) > 0 and not self._is_reduced:
                for attr in attrs:
                    t = getattr(self, attr, None)
                    if t is not None and idist.get_world_size() > 1:
                        t = idist.all_reduce(t)
                        self._is_reduced = True
                        setattr(self, attr, t)

            return func(self, *args, **kwargs)
Example 15
    def _log_lr_and_loss(self, trainer: Engine, output_transform: Callable, smooth_f: float, diverge_th: float) -> None:
        output = trainer.state.output
        loss = output_transform(output)
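        # All-reduce (a sum by default) the loss so every rank records the same value.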
        loss = idist.all_reduce(loss)
        lr = self._lr_schedule.get_param()  # type: ignore[union-attr]
        self._history["lr"].append(lr)
        if trainer.state.iteration == 1:
            self._best_loss = loss
        else:
            if smooth_f > 0:
                loss = smooth_f * loss + (1 - smooth_f) * self._history["loss"][-1]
            if loss < self._best_loss:
                self._best_loss = loss
        self._history["loss"].append(loss)

        # Check if the loss has diverged; if it has, stop the trainer
        if self._history["loss"][-1] > diverge_th * self._best_loss:  # type: ignore[operator]
            self._diverge_flag = True
            self.logger.info("Stopping early, the loss has diverged")
            trainer.terminate()
Example 16
def _test_distrib_all_reduce(device):

    res = idist.all_reduce(10)
    assert res == 10 * idist.get_world_size()

    t = torch.tensor(10, device=device)
    res = idist.all_reduce(t)
    assert res.item() == 10 * idist.get_world_size()

    t = torch.tensor(idist.get_rank(), device=device)
    res = idist.all_reduce(t)
    assert res.item() == sum([i for i in range(idist.get_world_size())])

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce("abc")

        with pytest.raises(ValueError,
                           match=r"Unsupported reduction operation"):
            idist.all_reduce(10, op="ABC")
Example 17
def training(local_rank, config, **kwargs):

    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws}"
    assert local_rank == idist.get_local_rank()

    # Test init method:
    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        true_init_method = config.get("true_init_method", None)
        assert true_init_method is not None, true_init_method
        assert _model._init_method == true_init_method
Example 18
    def _log_lr_and_loss(self, trainer: Engine, output_transform: Callable,
                         smooth_f: float, diverge_th: float) -> None:
        output = trainer.state.output
        loss = output_transform(output)
        if not isinstance(loss, float):
            if isinstance(loss, torch.Tensor):
                if (loss.ndimension() == 0) or (loss.ndimension() == 1
                                                and len(loss) == 1):
                    loss = loss.item()
                else:
                    raise ValueError(
                        "if output of the engine is torch.Tensor, then "
                        "it must be 0d torch.Tensor or 1d torch.Tensor with 1 element, "
                        f"but got torch.Tensor of shape {loss.shape}")
            else:
                raise TypeError(
                    "output of the engine should be of type float or 0d torch.Tensor "
                    "or 1d torch.Tensor with 1 element, "
                    f"but got output of type {type(loss).__name__}")
        loss = idist.all_reduce(loss)
        lr = self._lr_schedule.get_param()  # type: ignore[union-attr]
        self._history["lr"].append(lr)
        if trainer.state.iteration == 1:
            self._best_loss = loss
        else:
            if smooth_f > 0:
                loss = smooth_f * loss + (1 -
                                          smooth_f) * self._history["loss"][-1]
            if loss < self._best_loss:
                self._best_loss = loss
        self._history["loss"].append(loss)

        # Check if the loss has diverged; if it has, stop the trainer
        if self._history["loss"][
                -1] > diverge_th * self._best_loss:  # type: ignore[operator]
            self._diverge_flag = True
            self.logger.info("Stopping early, the loss has diverged")
            trainer.terminate()
Example 19
def _test_distrib_all_reduce(device):

    res = idist.all_reduce(10)
    assert res == 10 * idist.get_world_size()

    t = torch.tensor(10, device=device)
    res = idist.all_reduce(t)
    assert res.item() == 10 * idist.get_world_size()

    rank = idist.get_rank()
    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t)
    assert res.item() == sum(
        [i * 2.0 + 1.0 for i in range(idist.get_world_size())])

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
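    # Besides the default SUM, all_reduce also supports MIN, MAX and PRODUCT reductions.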
    res = idist.all_reduce(t, "MIN").item()
    true_val = min([i * 2 + 1 for i in range(idist.get_world_size())])
    assert res == true_val, f"{res} vs {true_val}"

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t, "MAX").item()
    true_val = max([i * 2.0 + 1.0 for i in range(idist.get_world_size())])
    assert res == true_val, f"{res} vs {true_val}"

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t, "PRODUCT").item()
    true_val = 1
    for v in [i * 2.0 + 1.0 for i in range(idist.get_world_size())]:
        true_val *= v
    assert res == true_val, f"{res} vs {true_val}"

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce("abc")

        with pytest.raises(ValueError,
                           match=r"Unsupported reduction operation"):
            idist.all_reduce(10, op="ABC")

        t = torch.tensor([0, 1, 2])
        res = idist.all_reduce(t)
        assert res.device == t.device, f"{res.device} vs {t.device}"
Example 20
def _dist_mean(y_true):
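    # Element-wise average over ranks, then mean over the sample axis.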
    y_true = idist.all_reduce(y_true) / idist.get_world_size()
    if len(y_true.shape) > 2:
        y_true = y_true.reshape(-1, y_true.shape[-1])
    return y_true.mean(dim=0).cpu().numpy()
Example 21
def score_function(engine):
    i = trainer.state.epoch - 1
    v = scores[i]
    idist.all_reduce(v)
    v /= idist.get_world_size()
    return v.item()
Example 22
def test_idist_all_reduce_no_dist():
    assert idist.all_reduce(10) == 10
Example 23
def _mean(y_true):
    y_true = idist.all_reduce(y_true)
    return y_true.mean(dim=0).cpu().numpy() / idist.get_world_size()
Example 24
def _geom_mean(y_true):
    log_y_true = torch.log(y_true)
    log_y_true = idist.all_reduce(log_y_true)
    np_t = log_y_true.cpu().numpy()
    return np.exp(np.mean(np_t, axis=0) / idist.get_world_size())