def test_compare_cuda_cpu(self, reg_data):
    """Fitting with identical seeds on CPU and on CUDA must yield identical coefficients."""
    Xtr, Ytr, Xts, Yts = reg_data
    kernel = kernels.GaussianKernel(20.0)

    def error_fn(t, p):
        return torch.sqrt(torch.mean((t - p) ** 2)).item(), "RMSE"

    cpu_options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
    cpu_model = Falkon(kernel=kernel, penalty=1e-6, M=500, seed=10,
                       options=cpu_options, maxiter=10, error_fn=error_fn)
    cpu_model.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)

    gpu_options = FalkonOptions(use_cpu=False, keops_active="no", debug=True)
    gpu_model = Falkon(kernel=kernel, penalty=1e-6, M=500, seed=10,
                       options=gpu_options, maxiter=10, error_fn=error_fn)
    gpu_model.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)

    # Same data, same seed: the learned alphas must agree between devices.
    np.testing.assert_allclose(cpu_model.alpha_.numpy(), gpu_model.alpha_.numpy())
def __init__(self, name: str, kernel_type: str, opt: Optional[FalkonOptions]):
    """Store the identifying attributes, defaulting to fresh options when none are given."""
    self.name = name
    self.kernel_type = kernel_type
    # A None `opt` means "use library defaults".
    self.params: FalkonOptions = FalkonOptions() if opt is None else opt
def __init__(self, name: str, kernel: falkon.kernels.kernel.Kernel, opt: Optional[FalkonOptions] = None):
    """Store the name, kernel and options.

    Parameters
    ----------
    name : str
        Identifier for this object.
    kernel : falkon.kernels.kernel.Kernel
        Kernel instance to be used.
    opt : FalkonOptions, optional
        Runtime options; when None a default `FalkonOptions()` is used.
    """
    self.name = name
    self.kernel = kernel
    # BUGFIX: previously `opt or FalkonOptions()`, which tests truthiness rather
    # than the None sentinel and would silently replace any falsy options object.
    # Explicit None check also matches the sibling __init__ implementations.
    if opt is None:
        opt = FalkonOptions()
    self.params = opt
def gpu_lauum(A, upper, overwrite=True, write_opposite=False, opt: Optional[FalkonOptions] = None):
    """
    Run the parallel (multi-GPU) LAUUM-style triangular operation on matrix `A`.

    Parameters
    -----------
    A : ndarray or torch.Tensor [N, N]
        2D positive-definite matrix on which the operation is performed.
    upper : bool
        Whether the triangle of interest is the upper one. Internally the
        computation always runs on the 'lower' form; transposition handles
        the other combinations.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new buffer.
    write_opposite : bool
        Forwarded to `_parallel_lauum_runner`; presumably writes the result
        into the opposite triangle — confirm against the runner's docs.
    opt : FalkonOptions, optional
        Runtime options; defaults to `FalkonOptions()`.

    Returns
    -------
    The result, with the same type as `A` (ndarray in → ndarray out,
    tensor in → tensor out). With ``overwrite=True`` the computation reuses
    `A`'s memory.

    Notes
    ------
    The computation is always performed on the 'lower' version of the operation,
    which could however end up on the upper-triangular part of the matrix in
    case A is not Fortran contiguous to begin with.
    """
    if opt is None:
        opt = FalkonOptions()
    # Collect only real GPUs (negative keys denote non-GPU devices).
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        # Keep a 300MiB + 5% safety margin below the reported free memory.
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    # Start matrix preparations
    if isinstance(A, np.ndarray):
        Anp = A
    elif isinstance(A, torch.Tensor):
        # Shares memory with the tensor. NOTE(review): `.numpy()` fails for CUDA
        # tensors — confirm callers pass CPU data to this implementation.
        Anp = A.numpy()
    else:
        # BUGFIX: previously formatted `A.dtype`, which raises AttributeError for
        # inputs without a `.dtype` attribute, masking the intended TypeError.
        raise TypeError("Unexpected type encountered for A: %s" % (type(A)))

    if not overwrite:
        # order='A' preserves the input's memory layout in the copy.
        Anp = np.copy(Anp, order='A')

    # Yields a Fortran-contiguous array, transposing (without copying) if needed.
    Anp, transposed = prepare_matrix(Anp)
    if transposed:
        upper = not upper

    # Parallel can only do lower C or F-contiguous arrays, but by transposing as
    # necessary it is able to run with every combination of inputs.
    At = torch.from_numpy(Anp)
    if upper:
        At = At.T
    # The parallel runner chooses based on the contiguity pattern of the inputs.
    _parallel_lauum_runner(At, write_opposite, opt, gpu_info)

    if transposed:
        Anp = Anp.T

    if isinstance(A, np.ndarray):
        return Anp
    return torch.from_numpy(Anp)
def __init__(self, name: str, kernel: falkon.kernels.Kernel, opt: Optional[FalkonOptions] = None):
    """Store the name, kernel and options.

    Parameters
    ----------
    name : str
        Identifier for this object.
    kernel : falkon.kernels.Kernel
        Kernel instance to be used.
    opt : FalkonOptions, optional
        Runtime options; when None a fresh `FalkonOptions()` is created.
    """
    self.name = name
    self.kernel = kernel
    # BUGFIX: the default was `opt: FalkonOptions = FalkonOptions()`, which is
    # evaluated once at function-definition time so every instance created
    # without explicit options shared the *same* options object. Use the None
    # sentinel instead (backward-compatible for all callers).
    if opt is None:
        opt = FalkonOptions()
    self.params = opt
def test_cuda_predict(self, reg_data):
    """A model moved to GPU must predict on CUDA inputs and keep outputs on CUDA."""
    Xtr, Ytr, Xts, Yts = reg_data
    kernel = kernels.GaussianKernel(20.0)

    def error_fn(t, p):
        return torch.sqrt(torch.mean((t - p) ** 2)), "RMSE"

    options = FalkonOptions(use_cpu=False, keops_active="no", debug=True,
                            min_cuda_pc_size_64=1, min_cuda_iter_size_64=1)
    model = Falkon(kernel=kernel, penalty=1e-6, M=500, seed=10,
                   options=options, error_fn=error_fn)
    model.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)
    model.to("cuda:0")

    preds_ts = model.predict(Xts.to("cuda:0"))
    preds_tr = model.predict(Xtr.to("cuda:0"))
    assert preds_ts.device.type == "cuda"
    assert preds_ts.shape == (Yts.shape[0], 1)
    test_err = error_fn(preds_ts.cpu(), Yts)[0]
    train_err = error_fn(preds_tr.cpu(), Ytr)[0]
    # Sanity: train error below test error, and test error within tolerance.
    assert train_err < test_err
    assert test_err < 2.5
def test_classif(self, cls_data):
    """In-core Falkon classification on CUDA data: predictions stay on-device and error is low."""
    X, Y = cls_data
    X_gpu = X.cuda()
    Y_gpu = Y.cuda()
    kernel = kernels.GaussianKernel(2.0)
    torch.manual_seed(13)
    np.random.seed(13)

    def error_fn(t, p):
        # Percentage of sign disagreements between targets and predictions.
        return 100 * torch.sum(t * p <= 0).to(torch.float32) / t.shape[0], "c-err"

    options = FalkonOptions(use_cpu=False, keops_active="no", debug=True)
    M = 500
    model = InCoreFalkon(kernel=kernel, penalty=1e-6, M=M, seed=10,
                         options=options, maxiter=20, error_fn=error_fn)
    model.fit(X_gpu, Y_gpu)

    predictions = model.predict(X_gpu)
    assert predictions.device == X_gpu.device
    classif_err = error_fn(predictions, Y_gpu)[0]
    assert classif_err < 5
def test_classif(self, cls_data, cuda_usage):
    """Per-sample weighting must lower the error of the up-weighted class (-1)
    relative to an unweighted model, at the expense of the other class."""
    X, Y = cls_data
    if cuda_usage == "incore":
        X, Y = X.cuda(), Y.cuda()
        flk_cls = InCoreFalkon
    else:
        flk_cls = Falkon
    kernel = kernels.GaussianKernel(2.0)

    def error_fn(t, p):
        return 100 * torch.sum(t * p <= 0).to(torch.float32) / t.shape[0], "c-err"

    def weight_fn(y):
        # Class -1 samples count double in the loss.
        weight = torch.empty_like(y)
        weight[y == 1] = 1
        weight[y == -1] = 2
        return weight

    options = FalkonOptions(use_cpu=cuda_usage == "cpu_only", keops_active="no", debug=False)

    weighted_model = flk_cls(kernel=kernel, penalty=1e-6, M=500, seed=10,
                             options=options, error_fn=error_fn, weight_fn=weight_fn)
    weighted_model.fit(X, Y)
    w_preds = weighted_model.predict(X)
    w_err_m1 = error_fn(w_preds[Y == -1], Y[Y == -1])[0]
    w_err_p1 = error_fn(w_preds[Y == 1], Y[Y == 1])[0]

    plain_model = flk_cls(kernel=kernel, penalty=1e-6, M=500, seed=10,
                          options=options, error_fn=error_fn, weight_fn=None)
    plain_model.fit(X, Y)
    p_preds = plain_model.predict(X)
    p_err_m1 = error_fn(p_preds[Y == -1], Y[Y == -1])[0]
    p_err_p1 = error_fn(p_preds[Y == 1], Y[Y == 1])[0]

    print("Weighted errors: -1 (%f) +1 (%f) -- Normal errors: -1 (%f) +1 (%f)"
          % (w_err_m1, w_err_p1, p_err_m1, p_err_p1))
    assert w_err_m1 < p_err_m1, "Error of weighted class is higher than without weighting"
    assert w_err_p1 >= p_err_p1, "Error of unweighted class is lower than in flk with no weights"
class AbstractKernelTester(abc.ABC):
    """Base class collecting fixtures and tests shared by all kernel test suites.

    The `exp_*` fixtures compute reference results with plain numpy from the
    dense expected kernel matrix `exp_k`; each test compares a falkon kernel
    operation against the corresponding reference.
    """
    # Cap memory so the blocked/out-of-core code paths are exercised.
    max_mem = 2 * 2**20
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False,
                                  max_cpu_mem=max_mem, max_gpu_mem=max_mem)
    # Reusable parametrization: run each mmv/dmmv test with and without KeOps.
    _keops_param = pytest.mark.parametrize("keops", [
        pytest.param("force", marks=pytest.mark.skipif(
            not decide_keops(), reason="no KeOps found.")),
        "no",
    ], ids=["KeOps", "No KeOps"])

    @pytest.fixture(scope="class")
    def exp_v(self, exp_k: np.ndarray, v: torch.Tensor) -> np.ndarray:
        # Reference for kernel-vector product: K @ v
        return exp_k @ v.numpy()

    @pytest.fixture(scope="class")
    def exp_dv(self, exp_k: np.ndarray, v: torch.Tensor) -> np.ndarray:
        # Reference for double mmv with vector only: K.T @ (K @ v)
        return exp_k.T @ (exp_k @ v.numpy())

    @pytest.fixture(scope="class")
    def exp_dw(self, exp_k: np.ndarray, w: torch.Tensor) -> np.ndarray:
        # Reference for double mmv with w only: K.T @ w
        return exp_k.T @ w.numpy()

    @pytest.fixture(scope="class")
    def exp_dvw(self, exp_k: np.ndarray, v: torch.Tensor, w: torch.Tensor) -> np.ndarray:
        # Reference for double mmv with both terms: K.T @ (K @ v + w)
        return exp_k.T @ (exp_k @ v.numpy() + w.numpy())

    def test_kernel(self, kernel, A, B, exp_k, cpu, rtol):
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        _run_test(kernel, exp_k, (A, B), out=None, rtol=rtol[A.dtype], opt=opt)

    @_keops_param
    def test_mmv(self, kernel, keops, A, B, v, exp_v, cpu, rtol):
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu, keops_active=keops)
        _run_test(kernel.mmv, exp_v, (A, B, v), out=None, rtol=rtol[A.dtype], opt=opt)

    @_keops_param
    def test_dv(self, kernel, keops, A, B, v, exp_dv, cpu, rtol):
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu, keops_active=keops)
        _run_test(kernel.dmmv, exp_dv, (A, B, v, None), out=None, rtol=rtol[A.dtype], opt=opt)

    @_keops_param
    def test_dw(self, kernel, keops, A, B, w, exp_dw, cpu, rtol):
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu, keops_active=keops)
        _run_test(kernel.dmmv, exp_dw, (A, B, None, w), out=None, rtol=rtol[A.dtype], opt=opt)

    @_keops_param
    def test_dvw(self, kernel, keops, A, B, v, w, exp_dvw, cpu, rtol):
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu, keops_active=keops)
        _run_test(kernel.dmmv, exp_dvw, (A, B, v, w), out=None, rtol=rtol[A.dtype], opt=opt)
class TestOutOfCorePyTest():
    """Out-of-core POTRF tests (Cholesky forced off-device via `chol_force_ooc`)."""
    basic_options = FalkonOptions(debug=True, chol_force_ooc=True)

    def test_start_cuda_fail(self, pd_data, dtype, overwrite):
        # Cannot run OOC-POTRF on CUDA matrices (only IC-POTRF allowed)
        with pytest.raises(ValueError, match="Cannot run out-of-core POTRF on CUDA"):
            run_potrf_test(pd_data, dtype=dtype, order="F", upper=False, clean=False,
                           overwrite=overwrite, input_device="cuda:0",
                           opt=self.basic_options)

    @pytest.mark.parametrize("clean", [True, False])
    @pytest.mark.parametrize("order,upper", [
        pytest.param("F", True, marks=[pytest.mark.xfail(strict=True), ]),  # Upper-F not possible
        pytest.param("C", True),
        pytest.param("F", False),
        pytest.param("C", False, marks=[pytest.mark.xfail(strict=True), ]),  # Lower-C not possible
    ])
    def test_ooc(self, pd_data, dtype, order, upper, clean, overwrite):
        run_potrf_test(pd_data, dtype=dtype, order=order, upper=upper, clean=clean,
                       overwrite=overwrite, input_device="cpu", opt=self.basic_options)

    @pytest.mark.parametrize("clean,order,upper", [
        pytest.param(False, "C", True),
        pytest.param(True, "F", False),
    ])
    def test_ooc_mem(self, pd_data, dtype, order, upper, clean, overwrite):
        # 1600 is the minimum memory the fn seems to use (even for the 4x4 data)
        mem_budget = max(pd_data.shape[0] * sizeof_dtype(dtype) * 1000, 1600)
        opt = dataclasses.replace(self.basic_options, max_gpu_mem=mem_budget)
        run_potrf_test(pd_data, dtype=dtype, order=order, upper=upper, clean=clean,
                       overwrite=overwrite, input_device="cpu", opt=opt)
class TestFalkonConjugateGradient:
    """Checks the Falkon CG solver against a dense direct solve of the normal equations."""
    basic_opt = FalkonOptions(use_cpu=True)
    N = 500
    M = 10
    D = 10
    penalty = 10

    @pytest.fixture()
    def kernel(self):
        return GaussianKernel(100.0)

    @pytest.fixture()
    def data(self):
        return torch.from_numpy(gen_random(self.N, self.D, 'float64', F=False, seed=10))

    @pytest.fixture(params=[1, 10], ids=["1-rhs", "10-rhs"])
    def vec_rhs(self, request):
        return torch.from_numpy(gen_random(self.N, request.param, 'float64', F=False, seed=9))

    @pytest.fixture()
    def centers(self, data):
        selector = UniformSel(np.random.default_rng(2))
        return selector.select(data, None, self.M)

    @pytest.fixture()
    def knm(self, kernel, data, centers):
        # Kernel between data points and Nystrom centers.
        return kernel(data, centers, opt=self.basic_opt)

    @pytest.fixture()
    def kmm(self, kernel, centers):
        # Kernel between the Nystrom centers themselves.
        return kernel(centers, centers, opt=self.basic_opt)

    @pytest.fixture()
    def preconditioner(self, kernel, centers):
        prec = FalkonPreconditioner(self.penalty, kernel, self.basic_opt)
        prec.init(centers)
        return prec

    def test_flk_cg(self, data, centers, kernel, preconditioner, knm, kmm, vec_rhs):
        solver = FalkonConjugateGradient(kernel, preconditioner, opt=self.basic_opt)

        # Direct solve of (knm.T @ knm + lambda*n*kmm) x = knm.T @ b for reference.
        rhs = knm.T @ vec_rhs
        lhs = knm.T @ knm + self.penalty * self.N * kmm
        expected = np.linalg.solve(lhs.numpy(), rhs.numpy())

        beta = solver.solve(data, centers, vec_rhs, self.penalty, None, 200)
        alpha = preconditioner.apply(beta)
        np.testing.assert_allclose(expected, alpha, rtol=1e-5)
def test_gaussian_pd():
    """The Gaussian kernel matrix (plus a small diagonal jitter) must be positive definite."""
    raw = gen_random(10000, 2, 'float32', F=True, seed=12)
    data = torch.from_numpy(raw)
    sigma = 10.0
    options = FalkonOptions(compute_arch_speed=False, max_gpu_mem=1 * 2**30,
                            use_cpu=False, no_single_kernel=False)
    kernel = GaussianKernel(sigma, opt=options)
    gram = kernel(data, data, opt=options)
    # Small diagonal jitter to compensate float32 round-off before the Cholesky check.
    gram += torch.eye(data.shape[0]) * (1e-7 * data.shape[0])
    # Cholesky succeeds iff the matrix is positive definite.
    np.linalg.cholesky(gram)
def __init__(self, kernel: falkon.kernels.Kernel, preconditioner: falkon.preconditioner.Preconditioner, opt: FalkonOptions, weight_fn=None):
    """Set up the Falkon CG solver with its kernel, preconditioner and options.

    `weight_fn`, when given, supplies per-sample weights for the loss.
    """
    super().__init__()
    self.kernel = kernel
    self.preconditioner = preconditioner
    self.params = opt
    # The inner optimizer is configured from the CG-specific subset of the options.
    self.optimizer = ConjugateGradient(opt.get_conjgrad_options())
    self.weight_fn = weight_fn
class TestKeops:
    """mmv tests run through the KeOps backend (keops_active='force')."""
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False, keops_active="force",
                                  max_cpu_mem=max_mem_dense, max_gpu_mem=max_mem_dense)

    @pytest.mark.parametrize("Ao,Adt,Bo,Bdt,vo,vdt", [
        ("C", np.float32, "C", np.float32, "C", np.float32),
        ("C", np.float64, "C", np.float64, "C", np.float64),
        # KeOps only supports C-contiguous inputs; F-ordered combinations must fail.
        pytest.param("F", np.float32, "F", np.float32, "F", np.float32,
                     marks=[pytest.mark.xfail(reason="KeOps only C")]),
        pytest.param("F", np.float32, "C", np.float32, "C", np.float32,
                     marks=[pytest.mark.xfail(reason="KeOps only C")]),
    ], ids=["AC32-BC32-vC32", "AC64-BC64-vC64", "AF32-BF32-vF32", "AF32-BC32-vC32"])
    @pytest.mark.parametrize("cpu", cpu_params, ids=["cpu", "gpu"])
    def test_fmmv(self, A, B, v, Ao, Adt, Bo, Bdt, vo, vdt, kernel, expected_fmmv, cpu):
        A = fix_mat(A, order=Ao, dtype=Adt)
        B = fix_mat(B, order=Bo, dtype=Bdt)
        v = fix_mat(v, order=vo, dtype=vdt)

        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        rtol = choose_on_dtype(A.dtype)

        # Without a pre-allocated output buffer.
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # With a pre-allocated output buffer.
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype)
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_gpu_inputs(self, A, B, v, kernel, expected_fmmv):
        A = fix_mat(A, order="C", dtype=n32).cuda()
        B = fix_mat(B, order="C", dtype=n32, device=A.device)
        v = fix_mat(v, order="C", dtype=n32, device=A.device)
        opt = dataclasses.replace(self.basic_options, use_cpu=False, max_gpu_mem=np.inf)
        rtol = choose_on_dtype(A.dtype)

        # Without a pre-allocated output buffer.
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # With a pre-allocated output buffer on the same device.
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype, device=A.device)
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_gpu_inputs_fail(self, A, B, v, kernel, expected_fmmv):
        # Mixing CUDA (A, B) and CPU (v) inputs must raise.
        A = fix_mat(A, order="C", dtype=n32, device="cuda:0")
        B = fix_mat(B, order="C", dtype=n32, device="cuda:0")
        v = fix_mat(v, order="C", dtype=n32, device="cpu")
        opt = dataclasses.replace(self.basic_options, use_cpu=False, max_gpu_mem=np.inf)
        rtol = choose_on_dtype(A.dtype)
        with pytest.raises(RuntimeError):
            _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
def test_fails_cpu_tensors(self, cls_data):
    """InCoreFalkon must reject CPU tensors for both fitting and prediction."""
    X, Y = cls_data
    kernel = kernels.GaussianKernel(2.0)
    options = FalkonOptions(use_cpu=False, keops_active="no", debug=True)
    model = InCoreFalkon(kernel=kernel, penalty=1e-6, M=500, seed=10, options=options)

    # CPU data: fit must fail.
    with pytest.raises(ValueError):
        model.fit(X, Y)
    # CUDA data: fit succeeds.
    model.fit(X.cuda(), Y.cuda())
    # Predicting on CPU data must also fail.
    with pytest.raises(ValueError):
        model.predict(X)
def test_simple(self, data):
    """LogisticFalkon with a decreasing penalty schedule reaches near-zero training error."""
    X, Y = data
    kernel = kernels.GaussianKernel(3.0)
    loss = LogisticLoss(kernel=kernel)

    def error_fn(t, p):
        return float(100 * torch.sum(t * p <= 0)) / t.shape[0], "c-err"

    options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
    model = LogisticFalkon(
        kernel=kernel, loss=loss,
        # Penalty annealing schedule paired element-wise with the iteration counts.
        penalty_list=[1e-1, 1e-3, 1e-5, 1e-8, 1e-8, 1e-8, 1e-8, 1e-8],
        iter_list=[3, 3, 3, 3, 8, 8, 8, 8],
        M=500, seed=10, options=options, error_fn=error_fn)
    model.fit(X, Y)

    train_err = error_fn(model.predict(X), Y)[0]
    assert train_err < 0.1
def test_classif(self, cls_data):
    """CPU Falkon classification reaches a low sign-error on the training set."""
    X, Y = cls_data
    kernel = kernels.GaussianKernel(2.0)

    def error_fn(t, p):
        # Percentage of sign disagreements between targets and predictions.
        return 100 * torch.sum(t * p <= 0).to(torch.float32) / t.shape[0], "c-err"

    options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
    model = Falkon(kernel=kernel, penalty=1e-6, M=500, seed=10,
                   options=options, error_fn=error_fn)
    model.fit(X, Y)

    classif_err = error_fn(model.predict(X), Y)[0]
    assert classif_err < 5
def test_regression(self, reg_data):
    """CPU Falkon regression: output shape is (n, 1) and RMSE is within tolerance."""
    Xtr, Ytr, Xts, Yts = reg_data
    kernel = kernels.GaussianKernel(20.0)

    def error_fn(t, p):
        return torch.sqrt(torch.mean((t - p) ** 2)).item(), "RMSE"

    options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
    model = Falkon(kernel=kernel, penalty=1e-6, M=500, seed=10,
                   options=options, maxiter=10)
    model.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)

    assert model.predict(Xts).shape == (Yts.shape[0], 1)
    test_err = error_fn(model.predict(Xts), Yts)[0]
    train_err = error_fn(model.predict(Xtr), Ytr)[0]
    # Sanity: train error below test error, and test error within tolerance.
    assert train_err < test_err
    assert test_err < 2.5
def test_cpu_gpu_equality(mat, kernel, gram):
    """CPU and GPU preconditioners built from the same matrix must match to high precision."""
    la = 12.3
    mat = fix_mat(mat, dtype=np.float64, order="F", copy=True)

    gpu_options = FalkonOptions(compute_arch_speed=False, use_cpu=False, cpu_preconditioner=False)
    prec_gpu = FalkonPreconditioner(la, kernel, gpu_options)
    prec_gpu.init(mat)

    cpu_options = dataclasses.replace(gpu_options, use_cpu=True, cpu_preconditioner=True)
    prec_cpu = FalkonPreconditioner(la, kernel, cpu_options)
    prec_cpu.init(mat)

    # Compare the factorization and both diagonal terms.
    np.testing.assert_allclose(prec_cpu.fC, prec_gpu.fC, rtol=1e-10, atol=1e-10)
    np.testing.assert_allclose(prec_cpu.dA, prec_gpu.dA, rtol=1e-10)
    np.testing.assert_allclose(prec_cpu.dT, prec_gpu.dT, rtol=1e-10)
def test_multiclass(self, multicls_data):
    """Multi-class (one-hot targets) Falkon: argmax misclassification rate stays low."""
    X, Y = multicls_data
    kernel = kernels.GaussianKernel(10.0)

    def error_fn(t, p):
        # Fraction of rows where the argmax class differs.
        t = torch.argmax(t, dim=1)
        p = torch.argmax(p, dim=1)
        return torch.mean((t.reshape(-1, ) != p.reshape(-1, )).to(torch.float64)), "multic-err"

    options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
    model = Falkon(kernel=kernel, penalty=1e-6, M=500, seed=10,
                   options=options, error_fn=error_fn)
    model.fit(X, Y)

    multic_err = error_fn(model.predict(X), Y)[0]
    assert multic_err < 0.23
class TestFalkonPreconditioner:
    """Invariance checks for the Falkon preconditioner at various dtypes/orders/penalties."""
    rtol = {np.float64: 1e-10, np.float32: 1e-2}
    basic_opt = FalkonOptions(compute_arch_speed=False, no_single_kernel=True)

    @pytest.mark.parametrize("order", ["C", "F"])
    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    def test_simple(self, mat, kernel, gram, cpu, dtype, order):
        opt = dataclasses.replace(self.basic_opt, use_cpu=cpu, cpu_preconditioner=cpu)
        tol = self.rtol[dtype]

        mat = fix_mat(mat, dtype=dtype, order=order, copy=True)
        gram = fix_mat(gram, dtype=dtype, order=order, copy=True)

        la = 100
        prec = FalkonPreconditioner(la, kernel, opt)
        prec.init(mat)
        assert_invariant_on_TT(prec, gram, tol=tol)
        assert_invariant_on_AT(prec, gram, la, tol=tol)
        # The composite invariants accumulate more error: relax by 10x.
        assert_invariant_on_T(prec, gram, tol=tol * 10)
        assert_invariant_on_prec(prec, N, gram, la, tol=tol * 10)

    def test_zero_lambda(self, mat, kernel, gram, cpu):
        opt = dataclasses.replace(self.basic_opt, use_cpu=cpu, cpu_preconditioner=cpu)

        mat = fix_mat(mat, dtype=np.float64, order="K", copy=True)
        gram = fix_mat(gram, dtype=np.float64, order="K", copy=True)

        la = 0
        prec = FalkonPreconditioner(la, kernel, opt)
        prec.init(mat)
        assert_invariant_on_TT(prec, gram, tol=1e-10)
        assert_invariant_on_AT(prec, gram, la, tol=1e-10)
        assert_invariant_on_T(prec, gram, tol=1e-9)
        assert_invariant_on_prec(prec, N, gram, la, tol=1e-9)
def run_keops_mmv(X1: torch.Tensor, X2: torch.Tensor, v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    """Evaluate a KeOps reduction formula on (X1, X2, v), handling backend choice
    and (for CPU inputs with GPU backends) splitting rows of X1 across the
    available GPUs in separate subprocesses.

    Parameters
    ----------
    X1, X2, v : torch.Tensor
        Main inputs to the formula; all inputs (including `out` and
        `other_vars`) must live on the same device.
    other_vars : List[torch.Tensor]
        Extra tensors referenced by the formula/aliases.
    out : torch.Tensor or None
        Optional pre-allocated output of shape (N, T); allocated here if None.
    formula, aliases, axis, reduction
        Forwarded to KeOps `Genred`.
    opt : FalkonOptions, optional
        Runtime options; defaults to `FalkonOptions()`.

    Returns
    -------
    torch.Tensor
        The (N, T) result of the reduction.
    """
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        # CUDA inputs cannot be run on a CPU backend: override and warn.
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases, reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset (warm-up call so the real run does not pay
    # KeOps compilation cost inside the per-GPU subprocesses).
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # Pinned memory speeds up host<->device transfers when a GPU backend
        # consumes CPU-resident data.
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # CPU data, GPU backend: split rows of X1/out across available GPUs.
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        # NOTE: the comprehension variable `v` is comprehension-local (Python 3
        # scoping) and does not clobber the `v` argument.
        gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
        gpu_ram = [
            min((g.free_memory - 300 * 2 ** 20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                # This GPU was assigned no rows; skip it.
                continue
            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                other_vars=other_vars,
                function=fn,
                backend=backend,
                gpu_ram=gpu_ram[i]
            ), gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:
        # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)
    return out
class TestOOCLauum:
    """Tests for the out-of-core GPU LAUUM under tight memory budgets."""
    rtol = {np.float64: 1e-12, np.float32: 1e-5}
    # Small budget so the blocked algorithm is actually exercised.
    max_mem = 2 * 2**20
    basic_opt = FalkonOptions(compute_arch_speed=False, use_cpu=False,
                              max_gpu_mem=max_mem, lauum_par_blk_multiplier=6)

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("order", ["F", "C"])
    @pytest.mark.parametrize("device", ["cpu", "cuda:0"])
    def test_no_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)
        # For cuda inputs we must add to available GPU memory the amount used by the
        # input matrix, since overwrite=False and a full copy must be performed.
        mgpu_slack = 0
        if device.startswith("cuda"):
            mgpu_slack = self.basic_opt.max_gpu_mem + mat.shape[0]**2 * sizeof_dtype(mat.dtype)

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            result_up = gpu_lauum(mat, upper=True, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_upper, result_up.cpu().numpy(), rtol=self.rtol[dtype])
        # Input must be untouched since overwrite=False.
        np.testing.assert_allclose(omat, mat.cpu())

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            result_lo = gpu_lauum(mat, upper=False, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_lower, result_lo.cpu().numpy(), rtol=self.rtol[dtype])
        np.testing.assert_allclose(omat, mat.cpu())

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("order", ["F", "C"])
    @pytest.mark.parametrize("device", ["cpu", "cuda:0"])
    def test_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
        mat = get_mat(order=order, dtype=dtype, device=device)
        with memory_checker(self.basic_opt) as new_opt:
            result_up = gpu_lauum(mat, upper=True, overwrite=True, opt=new_opt)
        np.testing.assert_allclose(expected_upper, result_up.cpu().numpy(), rtol=self.rtol[dtype])

        # Fresh matrix: the previous call clobbered `mat`.
        mat = get_mat(order=order, dtype=dtype, device=device)
        with memory_checker(self.basic_opt) as new_opt:
            result_lo = gpu_lauum(mat, upper=False, overwrite=True, opt=new_opt)
        np.testing.assert_allclose(expected_lower, result_lo.cpu().numpy(), rtol=self.rtol[dtype])

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("order", ["F", "C"])
    @pytest.mark.parametrize("device", ["cpu", "cuda:0"])
    def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)
        with memory_checker(self.basic_opt) as new_opt:
            result_up = gpu_lauum(mat, upper=True, overwrite=True, write_opposite=True, opt=new_opt)
            torch.cuda.synchronize()
        result_up = result_up.cpu()
        # Strict upper triangle is untouched; result lands in the lower triangle.
        np.testing.assert_allclose(np.triu(omat, k=1), np.triu(result_up.numpy(), k=1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.tril(result_up.numpy()), np.triu(expected_upper).T,
                                   rtol=self.rtol[dtype])

        mat = get_mat(order=order, dtype=dtype)
        mat = move_tensor(mat, device)
        with memory_checker(self.basic_opt) as new_opt:
            result_lo = gpu_lauum(mat, upper=False, overwrite=True, write_opposite=True, opt=new_opt)
            torch.cuda.synchronize()
        result_lo = result_lo.cpu()
        # Strict lower triangle is untouched; result lands in the upper triangle.
        np.testing.assert_allclose(np.tril(omat, k=-1), np.tril(result_lo.numpy(), k=-1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.triu(result_lo.numpy()), np.tril(expected_lower).T,
                                   rtol=self.rtol[dtype])

    def test_no_blk_mul(self, get_mat, expected_upper):
        # Run with the minimal block multiplier (single block per round).
        dtype = np.float32
        mat = get_mat(order="F", dtype=dtype).numpy().copy(order="K")
        opt = dataclasses.replace(self.basic_opt, lauum_par_blk_multiplier=1)
        result = gpu_lauum(torch.from_numpy(mat), upper=True, overwrite=True, opt=opt)
        torch.cuda.synchronize()
        np.testing.assert_allclose(expected_upper, result.numpy(), rtol=self.rtol[dtype])
class TestSparseFmm:
    """Kernel-matrix (fmm) tests where both inputs are sparse."""
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False, no_single_kernel=True)

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    def test_sparse(self, k_class, k_exp, s_A, s_B, dtype, cpu):
        mem_budget = 50 * 2**20
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu,
                                  max_cpu_mem=mem_budget, max_gpu_mem=mem_budget)
        A_sparse = s_A[0].to(dtype=numpy_to_torch_type(dtype))
        B_sparse = s_B[0].to(dtype=numpy_to_torch_type(dtype))
        rtol = choose_on_dtype(dtype)

        # Both A and B sparse, no output buffer.
        _run_fmm_test(k_class, k_exp, A_sparse, B_sparse, out=None, dtype=dtype,
                      opt=opt, rtol=rtol)

        # C-contiguous output buffer: supported on CPU, raises on GPU.
        out_c = torch.empty(A_sparse.shape[0], B_sparse.shape[0], dtype=A_sparse.dtype)
        if not cpu:
            with pytest.raises(RuntimeError):
                _run_fmm_test(k_class, k_exp, A_sparse, B_sparse, out=out_c, dtype=dtype,
                              opt=opt, rtol=rtol)
        else:
            _run_fmm_test(k_class, k_exp, A_sparse, B_sparse, out=out_c, dtype=dtype,
                          opt=opt, rtol=rtol)

        # F-contiguous output buffer (transpose of a C-contiguous allocation).
        out_f = torch.empty(B_sparse.shape[0], A_sparse.shape[0], dtype=A_sparse.dtype).T
        _run_fmm_test(k_class, k_exp, A_sparse, B_sparse, out=out_f, dtype=dtype,
                      opt=opt, rtol=rtol)
class TestDenseFmm:
    """Kernel-matrix (fmm) tests with dense inputs in various memory orders."""
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False, no_single_kernel=False)

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("A,B", [
        pytest.param('Ac', 'Bc', marks=pytest.mark.usefixtures('Ac', 'Bc')),
        pytest.param('Af', 'Bf', marks=pytest.mark.usefixtures('Af', 'Bf')),
        pytest.param('Ac', 'Bf', marks=pytest.mark.usefixtures('Ac', 'Bf')),
    ], indirect=True)
    def test(self, A, B, k_class, k_exp, dtype, cpu):
        mem_budget = 2 * 2**20
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu,
                                  max_cpu_mem=mem_budget, max_gpu_mem=mem_budget)
        rtol = choose_on_dtype(dtype)
        _run_fmm_test(k_class, k_exp, A, B, out=None, dtype=dtype, opt=opt, rtol=rtol)

    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    def test_with_out(self, Ac: torch.Tensor, Bc: torch.Tensor, k_class, k_exp, dtype, cpu):
        # NOTE(review): `np.empty(..., dtype=Ac.dtype)` passes a torch dtype to
        # numpy; verify the `Ac` fixture type actually makes this valid.
        out = np.empty((Ac.shape[0], Bc.shape[0]), dtype=Ac.dtype)
        mem_budget = 2 * 2**20
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu,
                                  max_cpu_mem=mem_budget, max_gpu_mem=mem_budget)
        rtol = choose_on_dtype(dtype)
        _run_fmm_test(k_class, k_exp, Ac, Bc, out=out, dtype=dtype, opt=opt, rtol=rtol)

    @pytest.mark.parametrize("A,B", [
        pytest.param('Af', 'Bf', marks=pytest.mark.usefixtures('Af', 'Bf')),
        pytest.param('Ac', 'Bf', marks=pytest.mark.usefixtures('Ac', 'Bf')),
    ], indirect=True)
    def test_precise_kernel(self, A, B, k_class, k_exp, cpu):
        mem_budget = 2 * 2**20
        # no_single_kernel=True forces float64 accumulation, hence the tight rtol.
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu,
                                  max_cpu_mem=mem_budget, max_gpu_mem=mem_budget,
                                  no_single_kernel=True)
        expected_rtol = 1e-6
        out = np.empty((A.shape[0], B.shape[0]), dtype=A.dtype)
        _run_fmm_test(k_class, k_exp, A, B, out=out, dtype=np.float32,
                      opt=opt, rtol=expected_rtol)