Beispiel #1
0
    def test_compare_cuda_cpu(self, reg_data):
        """CPU and GPU solvers must converge to identical coefficients."""
        Xtr, Ytr, Xts, Yts = reg_data
        gauss = kernels.GaussianKernel(20.0)

        def error_fn(t, p):
            # Root-mean-squared error as a plain float, plus its label.
            return torch.sqrt(torch.mean((t - p)**2)).item(), "RMSE"

        def make_estimator(on_cpu):
            # Same hyper-parameters for both devices; only `use_cpu` differs.
            opts = FalkonOptions(use_cpu=on_cpu, keops_active="no", debug=True)
            return Falkon(kernel=gauss, penalty=1e-6, M=500, seed=10,
                          options=opts, maxiter=10, error_fn=error_fn)

        model_cpu = make_estimator(True)
        model_cpu.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)
        model_gpu = make_estimator(False)
        model_gpu.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)

        np.testing.assert_allclose(model_cpu.alpha_.numpy(),
                                   model_gpu.alpha_.numpy())
Beispiel #2
0
 def __init__(self, name: str, kernel_type: str,
              opt: Optional[FalkonOptions] = None):
     """Store the display name, kernel type and runtime options.

     Parameters
     ----------
     name : str
         Human-readable name of this object.
     kernel_type : str
         Identifier of the kernel variant to use.
     opt : Optional[FalkonOptions]
         Runtime options. Defaults to ``None``; a fresh ``FalkonOptions()``
         is built in that case. The default value makes the argument
         optional for callers (previously ``None`` had to be passed
         explicitly even though it was already handled below).
     """
     self.name = name
     self.kernel_type = kernel_type
     # Build the default lazily inside the body so that each instance gets
     # its own fresh options object.
     if opt is None:
         opt = FalkonOptions()
     self.params: FalkonOptions = opt
Beispiel #3
0
 def __init__(self,
              name: str,
              kernel: falkon.kernels.kernel.Kernel,
              opt: Optional[FalkonOptions] = None):
     """Store the name, kernel and (possibly defaulted) runtime options."""
     self.name = name
     self.kernel = kernel
     # Fall back to a fresh default options object when none was supplied.
     self.params = opt if opt else FalkonOptions()
Beispiel #4
0
def gpu_lauum(A,
              upper,
              overwrite=True,
              write_opposite=False,
              opt: Optional[FalkonOptions] = None):
    """
    Run the out-of-core LAUUM operation on matrix ``A`` using the GPUs.

    Parameters
    -----------
    A : ndarray or torch.Tensor [N, N]
        2D positive-definite matrix that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T if `upper`
        is False.
    upper : bool
        Whether the relevant triangular data is in the upper part of ``A``.
        Note this is flipped internally if ``A`` must be transposed to
        become Fortran-contiguous.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new
        buffer.
    write_opposite : bool
        Forwarded to the parallel LAUUM runner; presumably controls whether
        the result is written into the opposite triangle — TODO confirm.
    opt : Optional[FalkonOptions]
        Runtime options; a default ``FalkonOptions()`` is used when ``None``.

    Returns
    -------
    ndarray or torch.Tensor [N, N]
        The result buffer: a numpy array if ``A`` was a numpy array,
        otherwise a torch tensor.

    Notes
    ------
    The factorization will always be the 'lower' version of the factorization
    which could however end up on the upper-triangular part of the matrix
    in case A is not Fortran contiguous to begin with.
    """
    if opt is None:
        opt = FalkonOptions()
    # Keep actual GPU devices only (ids >= 0) and estimate usable memory,
    # reserving 300 MiB plus a 5% slack against the configured cap.
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    # Start matrix preparations
    if isinstance(A, np.ndarray):
        Anp = A
    elif isinstance(A, torch.Tensor):
        Anp = A.numpy()
    else:
        # Bug fix: report the offending object's type. The previous message
        # interpolated `A.dtype`, which raises AttributeError (masking the
        # intended TypeError) for objects without a `dtype` attribute.
        raise TypeError("Unexpected type encountered for A: %s" % (type(A)))

    if not overwrite:
        Anp = np.copy(Anp, order='A')

    # Will give a fortran-contiguous numpy array. No copies are performed.
    Anp, transposed = prepare_matrix(Anp)
    if transposed:
        upper = not upper

    # Parallel can only do lower C or F-contiguous arrays
    # But by transposing as necessary, it is able to run with every combination of inputs.
    At = torch.from_numpy(Anp)
    if upper:
        At = At.T
    # The parallel runner chooses based on the contiguity pattern of the inputs.
    _parallel_lauum_runner(At, write_opposite, opt, gpu_info)

    if transposed:
        Anp = Anp.T

    # Return the same container type the caller passed in.
    if isinstance(A, np.ndarray):
        return Anp
    else:
        return torch.from_numpy(Anp)
Beispiel #5
0
 def __init__(self,
              name: str,
              kernel: falkon.kernels.Kernel,
              opt: Optional[FalkonOptions] = None):
     """Store the name, kernel and runtime options.

     Parameters
     ----------
     name : str
         Human-readable name of this object.
     kernel : falkon.kernels.Kernel
         Kernel function used by this object.
     opt : Optional[FalkonOptions]
         Runtime options. A fresh ``FalkonOptions()`` is created when
         ``None`` (the default) is passed.
     """
     self.name = name
     self.kernel = kernel
     # The old signature (`opt: FalkonOptions = FalkonOptions()`) evaluated
     # the default once at definition time, silently sharing one options
     # instance across every construction — the classic mutable-default
     # pitfall. Build the default per-instance instead.
     self.params = opt if opt is not None else FalkonOptions()
Beispiel #6
0
    def test_cuda_predict(self, reg_data):
        """Predictions from a CUDA-resident model must stay on the GPU."""
        Xtr, Ytr, Xts, Yts = reg_data
        gauss = kernels.GaussianKernel(20.0)

        def error_fn(t, p):
            # RMSE as a tensor, plus its label.
            return torch.sqrt(torch.mean((t - p)**2)), "RMSE"

        options = FalkonOptions(use_cpu=False,
                                keops_active="no",
                                debug=True,
                                min_cuda_pc_size_64=1,
                                min_cuda_iter_size_64=1)
        estimator = Falkon(kernel=gauss, penalty=1e-6, M=500, seed=10,
                           options=options, error_fn=error_fn)
        estimator.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)
        # Move the fitted model to the GPU before predicting on CUDA inputs.
        estimator.to("cuda:0")

        ts_preds = estimator.predict(Xts.to("cuda:0"))
        tr_preds = estimator.predict(Xtr.to("cuda:0"))
        assert ts_preds.device.type == "cuda"
        assert ts_preds.shape == (Yts.shape[0], 1)
        err_ts = error_fn(ts_preds.cpu(), Yts)[0]
        err_tr = error_fn(tr_preds.cpu(), Ytr)[0]
        assert err_tr < err_ts
        assert err_ts < 2.5
Beispiel #7
0
    def test_classif(self, cls_data):
        """In-core Falkon classifies GPU-resident data with low error."""
        X, Y = cls_data
        Xc, Yc = X.cuda(), Y.cuda()
        gauss = kernels.GaussianKernel(2.0)
        torch.manual_seed(13)
        np.random.seed(13)

        def error_fn(t, p):
            # Percentage of sign disagreements between target and prediction.
            return 100 * torch.sum(t * p <= 0).to(
                torch.float32) / t.shape[0], "c-err"

        options = FalkonOptions(use_cpu=False, keops_active="no", debug=True)
        model = InCoreFalkon(kernel=gauss, penalty=1e-6, M=500, seed=10,
                             options=options, maxiter=20, error_fn=error_fn)
        model.fit(Xc, Yc)

        predictions = model.predict(Xc)
        # Predictions must stay on the same (CUDA) device as the inputs.
        assert predictions.device == Xc.device
        assert error_fn(predictions, Yc)[0] < 5
Beispiel #8
0
    def test_classif(self, cls_data, cuda_usage):
        """Weighting class -1 should lower its error at the expense of class +1."""
        X, Y = cls_data
        if cuda_usage == "incore":
            X, Y = X.cuda(), Y.cuda()
            flk_cls = InCoreFalkon
        else:
            flk_cls = Falkon
        gauss = kernels.GaussianKernel(2.0)

        def error_fn(t, p):
            # Percentage of sign disagreements between target and prediction.
            return 100 * torch.sum(t * p <= 0).to(
                torch.float32) / t.shape[0], "c-err"

        def weight_fn(y):
            # Give class -1 double the weight of class +1.
            w = torch.empty_like(y)
            w[y == 1] = 1
            w[y == -1] = 2
            return w

        options = FalkonOptions(use_cpu=cuda_usage == "cpu_only",
                                keops_active="no",
                                debug=False)

        def fit_and_class_errors(wfn):
            # Fit one estimator and return its per-class errors (-1, +1).
            model = flk_cls(kernel=gauss, penalty=1e-6, M=500, seed=10,
                            options=options, error_fn=error_fn, weight_fn=wfn)
            model.fit(X, Y)
            p = model.predict(X)
            return (error_fn(p[Y == -1], Y[Y == -1])[0],
                    error_fn(p[Y == 1], Y[Y == 1])[0])

        err_weight_m1, err_weight_p1 = fit_and_class_errors(weight_fn)
        err_m1, err_p1 = fit_and_class_errors(None)

        print(
            "Weighted errors: -1 (%f) +1 (%f) -- Normal errors: -1 (%f) +1 (%f)"
            % (err_weight_m1, err_weight_p1, err_m1, err_p1))

        assert err_weight_m1 < err_m1, "Error of weighted class is higher than without weighting"
        assert err_weight_p1 >= err_p1, "Error of unweighted class is lower than in flk with no weights"
Beispiel #9
0
class AbstractKernelTester(abc.ABC):
    """Base class with fixtures and tests shared by kernel test suites.

    Subclasses are expected to provide the ``kernel``, ``A``, ``B``, ``v``,
    ``w``, ``exp_k``, ``cpu`` and ``rtol`` fixtures consumed below.
    """
    # Cap both CPU and GPU memory at 2 MiB so the tested code is forced to
    # split its computations into blocks even on small inputs.
    max_mem = 2 * 2**20
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False,
                                  max_cpu_mem=max_mem, max_gpu_mem=max_mem)

    @pytest.fixture(scope="class")
    def exp_v(self, exp_k: np.ndarray, v: torch.Tensor) -> np.ndarray:
        # Expected kernel-vector product: K @ v.
        return exp_k @ v.numpy()

    @pytest.fixture(scope="class")
    def exp_dv(self, exp_k: np.ndarray, v: torch.Tensor) -> np.ndarray:
        # Expected double kernel-vector product: K.T @ (K @ v).
        return exp_k.T @ (exp_k @ v.numpy())

    @pytest.fixture(scope="class")
    def exp_dw(self, exp_k: np.ndarray, w: torch.Tensor) -> np.ndarray:
        # Expected K.T @ w (the w-only branch of dmmv).
        return exp_k.T @ w.numpy()

    @pytest.fixture(scope="class")
    def exp_dvw(self, exp_k: np.ndarray, v: torch.Tensor, w: torch.Tensor) -> np.ndarray:
        # Expected full dmmv: K.T @ (K @ v + w).
        return exp_k.T @ (exp_k @ v.numpy() + w.numpy())

    def test_kernel(self, kernel, A, B, exp_k, cpu, rtol):
        # Plain kernel-matrix computation against the dense expectation.
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        _run_test(kernel, exp_k, (A, B), out=None, rtol=rtol[A.dtype], opt=opt)

    @pytest.mark.parametrize("keops", [
        pytest.param("force", marks=pytest.mark.skipif(not decide_keops(), reason="no KeOps found.")),
        "no"
    ], ids=["KeOps", "No KeOps"])
    def test_mmv(self, kernel, keops, A, B, v, exp_v, cpu, rtol):
        # Kernel-vector product (mmv), with and without the KeOps backend.
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu, keops_active=keops)
        _run_test(kernel.mmv, exp_v, (A, B, v), out=None, rtol=rtol[A.dtype], opt=opt)

    @pytest.mark.parametrize("keops", [
        pytest.param("force", marks=pytest.mark.skipif(not decide_keops(), reason="no KeOps found.")),
        "no"
    ], ids=["KeOps", "No KeOps"])
    def test_dv(self, kernel, keops, A, B, v, exp_dv, cpu, rtol):
        # dmmv with only the v argument supplied.
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu, keops_active=keops)
        _run_test(kernel.dmmv, exp_dv, (A, B, v, None), out=None, rtol=rtol[A.dtype], opt=opt)

    @pytest.mark.parametrize("keops", [
        pytest.param("force", marks=pytest.mark.skipif(not decide_keops(), reason="no KeOps found.")),
        "no"
    ], ids=["KeOps", "No KeOps"])
    def test_dw(self, kernel, keops, A, B, w, exp_dw, cpu, rtol):
        # dmmv with only the w argument supplied.
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu, keops_active=keops)
        _run_test(kernel.dmmv, exp_dw, (A, B, None, w), out=None, rtol=rtol[A.dtype], opt=opt)

    @pytest.mark.parametrize("keops", [
        pytest.param("force", marks=pytest.mark.skipif(not decide_keops(), reason="no KeOps found.")),
        "no"
    ], ids=["KeOps", "No KeOps"])
    def test_dvw(self, kernel, keops, A, B, v, w, exp_dvw, cpu, rtol):
        # dmmv with both v and w supplied.
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu, keops_active=keops)
        _run_test(kernel.dmmv, exp_dvw, (A, B, v, w), out=None, rtol=rtol[A.dtype], opt=opt)
Beispiel #10
0
class TestOutOfCorePyTest():
    """Tests for the out-of-core (OOC) POTRF / Cholesky code path."""
    # Force the OOC algorithm even when the matrix would fit in GPU memory.
    basic_options = FalkonOptions(debug=True, chol_force_ooc=True)

    def test_start_cuda_fail(self, pd_data, dtype, overwrite):
        # Cannot run OOC-POTRF on CUDA matrices (only IC-POTRF allowed)
        with pytest.raises(ValueError,
                           match="Cannot run out-of-core POTRF on CUDA"):
            run_potrf_test(pd_data,
                           dtype=dtype,
                           order="F",
                           upper=False,
                           clean=False,
                           overwrite=overwrite,
                           input_device="cuda:0",
                           opt=self.basic_options)

    @pytest.mark.parametrize("clean", [True, False])
    @pytest.mark.parametrize(
        "order,upper",
        [
            pytest.param("F", True, marks=[
                pytest.mark.xfail(strict=True),
            ]),  # Upper-F not possible
            pytest.param("C", True),
            pytest.param("F", False),
            pytest.param("C", False, marks=[
                pytest.mark.xfail(strict=True),
            ]),  # Lower-C not possible
        ])
    def test_ooc(self, pd_data, dtype, order, upper, clean, overwrite):
        # Supported memory-order / triangle combinations must run; the two
        # unsupported ones are marked strict-xfail above.
        run_potrf_test(pd_data,
                       dtype=dtype,
                       order=order,
                       upper=upper,
                       clean=clean,
                       overwrite=overwrite,
                       input_device="cpu",
                       opt=self.basic_options)

    @pytest.mark.parametrize("clean,order,upper", [
        pytest.param(False, "C", True),
        pytest.param(True, "F", False),
    ])
    def test_ooc_mem(self, pd_data, dtype, order, upper, clean, overwrite):
        # Run with a tight (but sufficient) GPU memory cap to force blocking.
        # 1600 is the minimum memory the fn seems to use (even for the 4x4 data)
        max_mem = max(pd_data.shape[0] * sizeof_dtype(dtype) * 1000, 1600)
        opt = dataclasses.replace(self.basic_options, max_gpu_mem=max_mem)
        run_potrf_test(pd_data,
                       dtype=dtype,
                       order=order,
                       upper=upper,
                       clean=clean,
                       overwrite=overwrite,
                       input_device="cpu",
                       opt=opt)
Beispiel #11
0
class TestFalkonConjugateGradient:
    """Verify the Falkon CG solver against a dense direct linear solve."""
    basic_opt = FalkonOptions(use_cpu=True)
    N = 500        # number of training points
    M = 10         # number of Nystroem centers
    D = 10         # input dimensionality
    penalty = 10   # regularization strength

    @pytest.fixture()
    def kernel(self):
        return GaussianKernel(100.0)

    @pytest.fixture()
    def data(self):
        # Deterministic random training data (N x D, float64).
        return torch.from_numpy(
            gen_random(self.N, self.D, 'float64', F=False, seed=10))

    @pytest.fixture(params=[1, 10], ids=["1-rhs", "10-rhs"])
    def vec_rhs(self, request):
        # Right-hand side with either 1 or 10 columns.
        return torch.from_numpy(
            gen_random(self.N, request.param, 'float64', F=False, seed=9))

    @pytest.fixture()
    def centers(self, data):
        # Uniformly sampled Nystroem centers (M rows of `data`).
        cs = UniformSel(np.random.default_rng(2))
        return cs.select(data, None, self.M)

    @pytest.fixture()
    def knm(self, kernel, data, centers):
        # Kernel matrix between data and centers (N x M).
        return kernel(data, centers, opt=self.basic_opt)

    @pytest.fixture()
    def kmm(self, kernel, centers):
        # Kernel matrix between centers (M x M).
        return kernel(centers, centers, opt=self.basic_opt)

    @pytest.fixture()
    def preconditioner(self, kernel, centers):
        prec = FalkonPreconditioner(self.penalty, kernel, self.basic_opt)
        prec.init(centers)
        return prec

    def test_flk_cg(self, data, centers, kernel, preconditioner, knm, kmm,
                    vec_rhs):
        opt = FalkonConjugateGradient(kernel,
                                      preconditioner,
                                      opt=self.basic_opt)

        # Solve (knm.T @ knm + lambda*n*kmm) x = knm.T @ b
        rhs = knm.T @ vec_rhs
        lhs = knm.T @ knm + self.penalty * self.N * kmm
        expected = np.linalg.solve(lhs.numpy(), rhs.numpy())

        # The CG solver returns coefficients in the preconditioned space;
        # map them back with the preconditioner before comparing.
        beta = opt.solve(data, centers, vec_rhs, self.penalty, None, 200)
        alpha = preconditioner.apply(beta)

        np.testing.assert_allclose(expected, alpha, rtol=1e-5)
Beispiel #12
0
def test_gaussian_pd():
    """A Gaussian kernel matrix plus a small ridge must be positive definite."""
    raw = gen_random(10000, 2, 'float32', F=True, seed=12)
    data = torch.from_numpy(raw)
    opt = FalkonOptions(compute_arch_speed=False, max_gpu_mem=1 * 2**30, use_cpu=False,
                        no_single_kernel=False)
    kern = GaussianKernel(10.0, opt=opt)
    gram = kern(data, data, opt=opt)
    # Add a small diagonal ridge scaled with the matrix size.
    gram += torch.eye(data.shape[0]) * (1e-7 * data.shape[0])
    # Cholesky factorization succeeds only on positive-definite matrices.
    np.linalg.cholesky(gram)
Beispiel #13
0
 def __init__(self,
              kernel: falkon.kernels.Kernel,
              preconditioner: falkon.preconditioner.Preconditioner,
              opt: FalkonOptions,
              weight_fn=None):
     """Set up the Falkon conjugate-gradient solver.

     Parameters
     ----------
     kernel : falkon.kernels.Kernel
         Kernel used by the solver.
     preconditioner : falkon.preconditioner.Preconditioner
         Preconditioner instance, stored for later use.
     opt : FalkonOptions
         Runtime options; only the conjugate-gradient subset is forwarded
         to the inner optimizer.
     weight_fn : callable or None
         Optional sample-weighting function, stored as-is (semantics are
         defined by the code that consumes ``self.weight_fn``).
     """
     super().__init__()
     self.kernel = kernel
     self.preconditioner = preconditioner
     self.params = opt
     # The inner CG optimizer receives only the CG-specific options.
     self.optimizer = ConjugateGradient(opt.get_conjgrad_options())
     self.weight_fn = weight_fn
Beispiel #14
0
class TestKeops:
    """Tests for kernel mmv computations through the KeOps backend."""
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False, keops_active="force",
                                  max_cpu_mem=max_mem_dense, max_gpu_mem=max_mem_dense)

    @pytest.mark.parametrize("Ao,Adt,Bo,Bdt,vo,vdt", [
        ("C", np.float32, "C", np.float32, "C", np.float32),
        ("C", np.float64, "C", np.float64, "C", np.float64),
        pytest.param("F", np.float32, "F", np.float32, "F", np.float32,
                     marks=[pytest.mark.xfail(reason="KeOps only C")]),
        pytest.param("F", np.float32, "C", np.float32, "C", np.float32,
                     marks=[pytest.mark.xfail(reason="KeOps only C")]),
    ], ids=["AC32-BC32-vC32", "AC64-BC64-vC64", "AF32-BF32-vF32", "AF32-BC32-vC32"])
    @pytest.mark.parametrize("cpu", cpu_params, ids=["cpu", "gpu"])
    def test_fmmv(self, A, B, v, Ao, Adt, Bo, Bdt, vo, vdt, kernel,
                  expected_fmmv, cpu):
        # Coerce the inputs to the requested memory order and dtype; the
        # Fortran-order combinations are expected to fail (xfail above).
        A = fix_mat(A, order=Ao, dtype=Adt)
        B = fix_mat(B, order=Bo, dtype=Bdt)
        v = fix_mat(v, order=vo, dtype=vdt)

        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype)
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_gpu_inputs(self, A, B, v, kernel, expected_fmmv):
        # All inputs placed on the same CUDA device.
        A = fix_mat(A, order="C", dtype=n32).cuda()
        B = fix_mat(B, order="C", dtype=n32, device=A.device)
        v = fix_mat(v, order="C", dtype=n32, device=A.device)
        opt = dataclasses.replace(self.basic_options, use_cpu=False, max_gpu_mem=np.inf)
        rtol = choose_on_dtype(A.dtype)
        # Test normal
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype, device=A.device)
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_gpu_inputs_fail(self, A, B, v, kernel, expected_fmmv):
        # Mixed devices (A, B on GPU; v on CPU) must raise a RuntimeError.
        A = fix_mat(A, order="C", dtype=n32, device="cuda:0")
        B = fix_mat(B, order="C", dtype=n32, device="cuda:0")
        v = fix_mat(v, order="C", dtype=n32, device="cpu")
        opt = dataclasses.replace(self.basic_options, use_cpu=False, max_gpu_mem=np.inf)
        rtol = choose_on_dtype(A.dtype)
        # Test normal
        with pytest.raises(RuntimeError):
            _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
Beispiel #15
0
    def test_fails_cpu_tensors(self, cls_data):
        """InCoreFalkon must reject CPU tensors at both fit and predict time."""
        X, Y = cls_data
        gauss = kernels.GaussianKernel(2.0)
        options = FalkonOptions(use_cpu=False, keops_active="no", debug=True)
        model = InCoreFalkon(kernel=gauss, penalty=1e-6, M=500, seed=10,
                             options=options)
        # Fitting with CPU tensors is an error.
        with pytest.raises(ValueError):
            model.fit(X, Y)
        # Fitting with CUDA tensors works ...
        model.fit(X.cuda(), Y.cuda())
        # ... but predicting on CPU tensors afterwards is still an error.
        with pytest.raises(ValueError):
            model.predict(X)
Beispiel #16
0
    def test_simple(self, data):
        """Logistic-Falkon on a simple binary problem reaches near-zero error."""
        X, Y = data
        gauss = kernels.GaussianKernel(3.0)
        logloss = LogisticLoss(kernel=gauss)

        def error_fn(t, p):
            # Percentage of sign disagreements, as a plain float.
            return float(100 * torch.sum(t * p <= 0)) / t.shape[0], "c-err"

        options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
        model = LogisticFalkon(
            kernel=gauss, loss=logloss,
            penalty_list=[1e-1, 1e-3, 1e-5, 1e-8, 1e-8, 1e-8, 1e-8, 1e-8],
            iter_list=[3, 3, 3, 3, 8, 8, 8, 8], M=500, seed=10,
            options=options,
            error_fn=error_fn)
        model.fit(X, Y)
        assert error_fn(model.predict(X), Y)[0] < 0.1
Beispiel #17
0
    def test_classif(self, cls_data):
        """Binary classification on CPU: error stays below 5%."""
        X, Y = cls_data
        gauss = kernels.GaussianKernel(2.0)

        def error_fn(t, p):
            # Percentage of sign disagreements between target and prediction.
            return 100 * torch.sum(t * p <= 0).to(
                torch.float32) / t.shape[0], "c-err"

        options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
        model = Falkon(kernel=gauss, penalty=1e-6, M=500, seed=10,
                       options=options, error_fn=error_fn)
        model.fit(X, Y)
        assert error_fn(model.predict(X), Y)[0] < 5
Beispiel #18
0
    def test_regression(self, reg_data):
        """CPU regression: prediction shape and train/test RMSE sanity checks."""
        Xtr, Ytr, Xts, Yts = reg_data
        gauss = kernels.GaussianKernel(20.0)

        def error_fn(t, p):
            # Root-mean-squared error as a plain float, plus its label.
            return torch.sqrt(torch.mean((t - p)**2)).item(), "RMSE"

        options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
        model = Falkon(kernel=gauss, penalty=1e-6, M=500, seed=10,
                       options=options, maxiter=10)
        model.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)

        assert model.predict(Xts).shape == (Yts.shape[0], 1)
        err_ts = error_fn(model.predict(Xts), Yts)[0]
        err_tr = error_fn(model.predict(Xtr), Ytr)[0]
        # Training error should be lower than test error, both reasonably small.
        assert err_tr < err_ts
        assert err_ts < 2.5
Beispiel #19
0
def test_cpu_gpu_equality(mat, kernel, gram):
    """CPU and GPU preconditioners must agree to very high accuracy."""
    penalty = 12.3
    mat = fix_mat(mat, dtype=np.float64, order="F", copy=True)

    gpu_opt = FalkonOptions(compute_arch_speed=False,
                            use_cpu=False,
                            cpu_preconditioner=False)
    prec_gpu = FalkonPreconditioner(penalty, kernel, gpu_opt)
    prec_gpu.init(mat)

    # Same options, but flipped to the CPU implementation.
    cpu_opt = dataclasses.replace(gpu_opt, use_cpu=True, cpu_preconditioner=True)
    prec_cpu = FalkonPreconditioner(penalty, kernel, cpu_opt)
    prec_cpu.init(mat)

    # Compare the internal factors of the two preconditioners.
    np.testing.assert_allclose(prec_cpu.fC, prec_gpu.fC, rtol=1e-10, atol=1e-10)
    np.testing.assert_allclose(prec_cpu.dA, prec_gpu.dA, rtol=1e-10)
    np.testing.assert_allclose(prec_cpu.dT, prec_gpu.dT, rtol=1e-10)
Beispiel #20
0
    def test_multiclass(self, multicls_data):
        """Multi-class (one-hot) targets: misclassification rate below 23%."""
        X, Y = multicls_data
        gauss = kernels.GaussianKernel(10.0)

        def error_fn(t, p):
            # Arg-max both one-hot matrices, then compute the mismatch rate.
            t = torch.argmax(t, dim=1)
            p = torch.argmax(p, dim=1)
            return torch.mean((t.reshape(-1, ) != p.reshape(-1, )).to(
                torch.float64)), "multic-err"

        options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
        model = Falkon(kernel=gauss, penalty=1e-6, M=500, seed=10,
                       options=options, error_fn=error_fn)
        model.fit(X, Y)
        assert error_fn(model.predict(X), Y)[0] < 0.23
Beispiel #21
0
class TestFalkonPreconditioner:
    """Invariant checks for the Falkon preconditioner on CPU and GPU."""
    # Per-dtype tolerances: much looser for single precision.
    rtol = {np.float64: 1e-10, np.float32: 1e-2}
    basic_opt = FalkonOptions(compute_arch_speed=False, no_single_kernel=True)

    @pytest.mark.parametrize("order", ["C", "F"])
    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    def test_simple(self, mat, kernel, gram, cpu, dtype, order):
        # Select CPU or GPU implementation based on the `cpu` fixture.
        opt = dataclasses.replace(self.basic_opt,
                                  use_cpu=cpu,
                                  cpu_preconditioner=cpu)
        rtol = self.rtol[dtype]

        mat = fix_mat(mat, dtype=dtype, order=order, copy=True)
        gram = fix_mat(gram, dtype=dtype, order=order, copy=True)

        la = 100  # regularization strength
        prec = FalkonPreconditioner(la, kernel, opt)
        prec.init(mat)
        assert_invariant_on_TT(prec, gram, tol=rtol)
        assert_invariant_on_AT(prec, gram, la, tol=rtol)
        assert_invariant_on_T(prec, gram, tol=rtol * 10)
        assert_invariant_on_prec(prec, N, gram, la, tol=rtol * 10)

    def test_zero_lambda(self, mat, kernel, gram, cpu):
        # Degenerate case: no regularization (lambda == 0) must still hold
        # all preconditioner invariants.
        opt = dataclasses.replace(self.basic_opt,
                                  use_cpu=cpu,
                                  cpu_preconditioner=cpu)
        mat = fix_mat(mat, dtype=np.float64, order="K", copy=True)
        gram = fix_mat(gram, dtype=np.float64, order="K", copy=True)

        la = 0
        prec = FalkonPreconditioner(la, kernel, opt)
        prec.init(mat)
        assert_invariant_on_TT(prec, gram, tol=1e-10)
        assert_invariant_on_AT(prec, gram, la, tol=1e-10)
        assert_invariant_on_T(prec, gram, tol=1e-9)
        assert_invariant_on_prec(prec, N, gram, la, tol=1e-9)
Beispiel #22
0
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    """Run a KeOps ``Genred`` reduction on the given inputs.

    Parameters
    ----------
    X1 : torch.Tensor
        Left operand of shape [N, D].
    X2 : torch.Tensor
        Second operand; its role is defined by `formula`/`aliases`.
    v : torch.Tensor
        Tensor with T columns; the output has T columns as well.
    other_vars : List[torch.Tensor]
        Extra tensors referenced by the KeOps formula.
    out : Optional[torch.Tensor]
        Optional pre-allocated [N, T] output buffer; allocated when ``None``.
    formula : str
        KeOps formula string passed to ``Genred``.
    aliases : List[str]
        Variable aliases for the formula.
    axis : int
        Reduction axis passed to ``Genred``.
    reduction : str
        Reduction operator name (default ``'Sum'``).
    opt : Optional[FalkonOptions]
        Runtime options; a default ``FalkonOptions()`` is used when ``None``.

    Returns
    -------
    torch.Tensor
        The [N, T] reduction result (the same tensor as `out` when given).

    Raises
    ------
    RuntimeError
        If the input tensors do not all live on the same device.
    """
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases,
                reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # CPU-resident data with a GPU backend: split rows of X1 across the
        # available GPUs and run one worker process per device.
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
        gpu_ram = [
            min((g.free_memory - 300 * 2 ** 20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                other_vars=other_vars,
                function=fn,
                backend=backend,
                gpu_ram=gpu_ram[i]
            ), gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out
Beispiel #23
0
class TestOOCLauum:
    """Tests for the out-of-core GPU LAUUM routine (``gpu_lauum``)."""
    # Per-dtype comparison tolerances.
    rtol = {np.float64: 1e-12, np.float32: 1e-5}
    # Cap GPU memory at 2 MiB to force out-of-core block processing.
    max_mem = 2 * 2**20
    basic_opt = FalkonOptions(compute_arch_speed=False,
                              use_cpu=False,
                              max_gpu_mem=max_mem,
                              lauum_par_blk_multiplier=6)

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("order", ["F", "C"])
    @pytest.mark.parametrize("device", ["cpu", "cuda:0"])
    def test_no_overwrite(self, dtype, order, get_mat, expected_lower,
                          expected_upper, device):
        # With overwrite=False the input matrix must be left untouched.
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        # For cuda inputs we must add to available GPU memory the amount used by the
        # input matrix, since overwrite=False and a full copy must be performed.
        mgpu_slack = 0
        if device.startswith("cuda"):
            mgpu_slack = self.basic_opt.max_gpu_mem + mat.shape[
                0]**2 * sizeof_dtype(mat.dtype)

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_up = gpu_lauum(mat, upper=True, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_upper,
                                   act_up.cpu().numpy(),
                                   rtol=self.rtol[dtype])
        # Input must be unchanged.
        np.testing.assert_allclose(omat, mat.cpu())

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_lo = gpu_lauum(mat, upper=False, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_lower,
                                   act_lo.cpu().numpy(),
                                   rtol=self.rtol[dtype])
        # Input must be unchanged.
        np.testing.assert_allclose(omat, mat.cpu())

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("order", ["F", "C"])
    @pytest.mark.parametrize("device", ["cpu", "cuda:0"])
    def test_overwrite(self, dtype, order, get_mat, expected_lower,
                       expected_upper, device):
        # In-place variant: the result may replace the input buffer.
        mat = get_mat(order=order, dtype=dtype, device=device)

        with memory_checker(self.basic_opt) as new_opt:
            act_up = gpu_lauum(mat, upper=True, overwrite=True, opt=new_opt)
        np.testing.assert_allclose(expected_upper,
                                   act_up.cpu().numpy(),
                                   rtol=self.rtol[dtype])

        mat = get_mat(order=order, dtype=dtype, device=device)
        with memory_checker(self.basic_opt) as new_opt:
            act_lo = gpu_lauum(mat, upper=False, overwrite=True, opt=new_opt)
        np.testing.assert_allclose(expected_lower,
                                   act_lo.cpu().numpy(),
                                   rtol=self.rtol[dtype])

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("order", ["F", "C"])
    @pytest.mark.parametrize("device", ["cpu", "cuda:0"])
    def test_write_opposite(self, dtype, order, get_mat, expected_lower,
                            expected_upper, device):
        # write_opposite=True: the untouched triangle keeps the original data
        # while the result lands (transposed) in the opposite triangle.
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        with memory_checker(self.basic_opt) as new_opt:
            act_up = gpu_lauum(mat,
                               upper=True,
                               overwrite=True,
                               write_opposite=True,
                               opt=new_opt)
            torch.cuda.synchronize()
        act_up = act_up.cpu()
        np.testing.assert_allclose(np.triu(omat, k=1),
                                   np.triu(act_up.numpy(), k=1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.tril(act_up.numpy()),
                                   np.triu(expected_upper).T,
                                   rtol=self.rtol[dtype])

        mat = get_mat(order=order, dtype=dtype)
        mat = move_tensor(mat, device)
        with memory_checker(self.basic_opt) as new_opt:
            act_lo = gpu_lauum(mat,
                               upper=False,
                               overwrite=True,
                               write_opposite=True,
                               opt=new_opt)
            torch.cuda.synchronize()
        act_lo = act_lo.cpu()
        np.testing.assert_allclose(np.tril(omat, k=-1),
                                   np.tril(act_lo.numpy(), k=-1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.triu(act_lo.numpy()),
                                   np.tril(expected_lower).T,
                                   rtol=self.rtol[dtype])

    def test_no_blk_mul(self, get_mat, expected_upper):
        # Run with the smallest block multiplier (1) to hit the non-blocked path.
        dtype = np.float32
        mat = get_mat(order="F", dtype=dtype).numpy().copy(order="K")
        opt = dataclasses.replace(self.basic_opt, lauum_par_blk_multiplier=1)

        act_lo = gpu_lauum(torch.from_numpy(mat),
                           upper=True,
                           overwrite=True,
                           opt=opt)
        torch.cuda.synchronize()
        np.testing.assert_allclose(expected_upper,
                                   act_lo.numpy(),
                                   rtol=self.rtol[dtype])
Beispiel #24
0
class TestSparseFmm:
    """Tests for the kernel-matrix (fmm) computation with sparse inputs."""

    basic_options = FalkonOptions(debug=True,
                                  compute_arch_speed=False,
                                  no_single_kernel=True)

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    def test_sparse(self, k_class, k_exp, s_A, s_B, dtype, cpu):
        # Cap both CPU and GPU memory (50 MiB) to exercise the blocked path.
        mem_cap = 50 * 2**20
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu,
                                  max_cpu_mem=mem_cap, max_gpu_mem=mem_cap)

        torch_dt = numpy_to_torch_type(dtype)
        A_sparse = s_A[0].to(dtype=torch_dt)
        B_sparse = s_B[0].to(dtype=torch_dt)
        rtol = choose_on_dtype(dtype)

        # Both operands sparse; output allocated internally.
        _run_fmm_test(k_class, k_exp, A_sparse, B_sparse, out=None,
                      dtype=dtype, opt=opt, rtol=rtol)

        # Caller-provided C-contiguous output: works on CPU, expected to
        # raise on the GPU path.
        out_c = torch.empty(A_sparse.shape[0], B_sparse.shape[0],
                            dtype=A_sparse.dtype)
        if cpu:
            _run_fmm_test(k_class, k_exp, A_sparse, B_sparse, out=out_c,
                          dtype=dtype, opt=opt, rtol=rtol)
        else:
            with pytest.raises(RuntimeError):
                _run_fmm_test(k_class, k_exp, A_sparse, B_sparse, out=out_c,
                              dtype=dtype, opt=opt, rtol=rtol)

        # Caller-provided Fortran-contiguous output (transposed view).
        out_f = torch.empty(B_sparse.shape[0], A_sparse.shape[0],
                            dtype=A_sparse.dtype).T
        _run_fmm_test(k_class, k_exp, A_sparse, B_sparse, out=out_f,
                      dtype=dtype, opt=opt, rtol=rtol)
Beispiel #25
0
class TestDenseFmm:
    # Shared options for the dense fmm tests; individual tests override
    # memory limits (and `no_single_kernel`) via `dataclasses.replace`.
    basic_options = FalkonOptions(debug=True,
                                  compute_arch_speed=False,
                                  no_single_kernel=False)

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("A,B", [
        pytest.param('Ac', 'Bc', marks=pytest.mark.usefixtures('Ac', 'Bc')),
        pytest.param('Af', 'Bf', marks=pytest.mark.usefixtures('Af', 'Bf')),
        pytest.param('Ac', 'Bf', marks=pytest.mark.usefixtures('Ac', 'Bf')),
    ],
                             indirect=True)
    def test(self, A, B, k_class, k_exp, dtype, cpu):
        """Dense fmm with internally allocated output across C/F layouts."""
        # A tight 2 MiB memory cap forces the blocked computation path.
        mem_cap = 2 * 2**20
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu,
                                  max_cpu_mem=mem_cap, max_gpu_mem=mem_cap)
        _run_fmm_test(k_class, k_exp, A, B, out=None, dtype=dtype,
                      opt=opt, rtol=choose_on_dtype(dtype))

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    def test_with_out(self, Ac: torch.Tensor, Bc: torch.Tensor, k_class, k_exp,
                      dtype, cpu):
        """Dense fmm writing into a caller-provided output buffer."""
        # NOTE(review): `np.empty(..., dtype=Ac.dtype)` assumes the fixture's
        # dtype is numpy-compatible — confirm against the `Ac` fixture.
        out = np.empty((Ac.shape[0], Bc.shape[0]), dtype=Ac.dtype)
        # 2 MiB memory cap forces the blocked computation path.
        mem_cap = 2 * 2**20
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu,
                                  max_cpu_mem=mem_cap, max_gpu_mem=mem_cap)
        _run_fmm_test(k_class, k_exp, Ac, Bc, out=out, dtype=dtype,
                      opt=opt, rtol=choose_on_dtype(dtype))

    @pytest.mark.parametrize("A,B", [
        pytest.param('Af', 'Bf', marks=pytest.mark.usefixtures('Af', 'Bf')),
        pytest.param('Ac', 'Bf', marks=pytest.mark.usefixtures('Ac', 'Bf')),
    ],
                             indirect=True)
    def test_precise_kernel(self, A, B, k_class, k_exp, cpu):
        """With `no_single_kernel=True`, float32 inputs should reach a much
        tighter tolerance (rtol 1e-6) than plain single precision."""
        # 2 MiB memory cap forces the blocked computation path.
        mem_cap = 2 * 2**20
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu,
                                  max_cpu_mem=mem_cap, max_gpu_mem=mem_cap,
                                  no_single_kernel=True)
        out = np.empty((A.shape[0], B.shape[0]), dtype=A.dtype)
        _run_fmm_test(k_class, k_exp, A, B, out=out, dtype=np.float32,
                      opt=opt, rtol=1e-6)