def test_with_x0(self, mat, vec_rhs, conjgrad, order, device):
    """Solve with a caller-provided initial guess and check it is reused.

    A zero-filled ``X0`` is created via ``create_same_stride`` from ``vec_rhs``
    and handed to ``conjgrad.solve``. The returned tensor must share its data
    pointer with that buffer, live on ``device``, have the expected
    shape/stride, and agree with ``np.linalg.solve``.
    """
    if order == "F":
        # Rebuild both operands from Fortran-ordered numpy copies.
        mat = torch.from_numpy(np.asfortranarray(mat.numpy()))
        vec_rhs = torch.from_numpy(np.asfortranarray(vec_rhs.numpy()))

    mat = move_tensor(mat, device)
    vec_rhs = move_tensor(vec_rhs, device)

    x0 = create_same_stride(vec_rhs.size(), vec_rhs, vec_rhs.dtype, device)
    x0.fill_(0.0)

    def matvec(x_):
        return mat @ x_

    solution = conjgrad.solve(
        X0=x0,
        B=vec_rhs,
        mmv=matvec,
        max_iter=10,
        callback=None,
    )

    assert solution.data_ptr() == x0.data_ptr(), "Initial solution vector was copied"
    assert str(solution.device) == device, "Device has changed unexpectedly"
    assert solution.shape == (self.t, vec_rhs.shape[1]), "Output shape is incorrect"
    assert solution.stride() == vec_rhs.stride(), "Stride has changed unexpectedly"

    reference = np.linalg.solve(mat.cpu().numpy(), vec_rhs.cpu().numpy())
    np.testing.assert_allclose(reference, solution.cpu().numpy(), rtol=1e-6)
def test_lauum(self, dtype, get_mat, expected_lower, expected_upper, lower):
    """Run ``cuda_lauum`` on ``cuda:0`` and compare one triangle of the output.

    The input comes from ``get_mat(order="F", ...)``; the output buffer is a
    second device copy that is zeroed before the call. Depending on ``lower``
    the result is compared with ``np.tril(expected_lower)`` or
    ``np.triu(expected_upper)``.
    """
    device = torch.device("cuda:0")
    host_mat = get_mat(order="F", dtype=dtype)

    src_dev = move_tensor(host_mat, device)
    dst_dev = move_tensor(host_mat, device)
    dst_dev.fill_(0.0)

    # Run on the GPU
    cuda_lauum(
        n=host_mat.shape[0],
        A=src_dev,
        lda=src_dev.stride(1),
        B=dst_dev,
        ldb=dst_dev.stride(1),
        lower=lower,
    )
    torch.cuda.synchronize(device)

    # Pick the reference triangle that matches the requested mode.
    if lower:
        reference = np.tril(expected_lower)
    else:
        reference = np.triu(expected_upper)
    np.testing.assert_allclose(reference, dst_dev.cpu().numpy(), rtol=self.rtol[dtype])
def test_flk_cg(self, data, centers, kernel, preconditioner, knm, kmm, vec_rhs, device):
    """Check ``FalkonConjugateGradient.solve`` against a dense numpy solve.

    The reference solves ``(knm.T @ knm + penalty * N * kmm) x = knm.T @ vec_rhs``
    with ``np.linalg.solve``. The solver output is passed through
    ``preconditioner.apply`` before being compared with that reference.
    """
    preconditioner = preconditioner.to(device)
    use_cpu = device == "cpu"
    options = dataclasses.replace(self.basic_opt, use_cpu=use_cpu)
    solver = FalkonConjugateGradient(kernel, preconditioner, opt=options)

    # Solve (knm.T @ knm + lambda*n*kmm) x = knm.T @ b
    rhs = knm.T @ vec_rhs
    lhs = knm.T @ knm + self.penalty * self.N * kmm
    expected = np.linalg.solve(lhs.numpy(), rhs.numpy())

    data = move_tensor(data, device)
    centers = move_tensor(centers, device)
    vec_rhs = move_tensor(vec_rhs, device)

    beta = solver.solve(
        X=data,
        M=centers,
        Y=vec_rhs,
        _lambda=self.penalty,
        initial_solution=None,
        max_iter=200,
    )
    alpha = preconditioner.apply(beta)

    assert str(beta.device) == device, "Device has changed unexpectedly"
    np.testing.assert_allclose(expected, alpha.cpu().numpy(), rtol=1e-5)
def test_cuda_start(self, mat, kernel, gram, dtype, order):
    """Compare a preconditioner initialised from host data with one from CUDA data.

    Two ``FalkonPreconditioner`` objects are built with the same options
    (``use_cpu=False``, ``cpu_preconditioner=False``): one is initialised with
    the host matrix, the other with its ``cuda:0`` copy. Their ``dT``, ``dA``
    and ``fC`` attributes must agree, and the invariant helpers must pass.
    """
    opt = dataclasses.replace(self.basic_opt, use_cpu=False, cpu_preconditioner=False)
    rtol = self.rtol[dtype]

    mat = fix_mat(mat, dtype=dtype, order=order, copy=True)
    gpu_mat = move_tensor(mat, "cuda:0")
    gram = fix_mat(gram, dtype=dtype, order=order, copy=True)
    gpu_gram = move_tensor(gram, "cuda:0")

    la = 1
    prec = FalkonPreconditioner(la, kernel, opt)
    prec.init(mat)
    gpu_prec = FalkonPreconditioner(la, kernel, opt)
    gpu_prec.init(gpu_mat)

    # fC is compared with a 10x looser tolerance than dT and dA.
    for attr_name, attr_rtol in (("dT", rtol), ("dA", rtol), ("fC", rtol * 10)):
        np.testing.assert_allclose(
            getattr(prec, attr_name).numpy(),
            getattr(gpu_prec, attr_name).cpu().numpy(),
            rtol=attr_rtol,
        )

    assert gpu_prec.fC.device == gpu_mat.device, "Device changed unexpectedly"

    assert_invariant_on_TT(gpu_prec, gpu_gram, tol=rtol)
    assert_invariant_on_AT(prec, gram, la, tol=rtol)
    assert_invariant_on_T(prec, gram, tol=rtol * 10)
    assert_invariant_on_prec(prec, N, gram, la, tol=rtol * 10)
def test_rect(self, rect, order, dtype):
    """Check ``cuda_transpose`` on a rectangular matrix against numpy's transpose.

    The expected output is ``mat.T`` copied into the requested memory order;
    the device output buffer has the same layout and is zeroed before the call.
    Both the strides and the values of the result are verified.
    """
    from falkon.la_helpers.cuda_la_helpers import cuda_transpose

    host_in = fix_mat(rect, order=order, dtype=dtype, copy=True, numpy=True)
    expected = np.copy(host_in.T, order=order)

    dev_in = move_tensor(torch.from_numpy(host_in), "cuda:0")
    dev_out = move_tensor(torch.from_numpy(expected), "cuda:0")
    dev_out.fill_(0.0)

    cuda_transpose(input=dev_in, output=dev_out)

    actual = move_tensor(dev_out, "cpu").numpy()
    assert actual.strides == expected.strides
    np.testing.assert_allclose(expected, actual)
def test_trsm_wrapper(mat, arr, dtype, order, device, lower, transpose):
    """Compare the ``trsm`` wrapper with ``sclb.dtrsm`` for ``alpha=1e-2``.

    For CUDA devices combined with ``order == "C"`` the wrapper is expected to
    raise ``ValueError``; in every other case its output must match the
    reference within a dtype-dependent tolerance.
    """
    if dtype == np.float32:
        rtol = 1e-2
    else:
        rtol = 1e-11

    n_mat = move_tensor(fix_mat(mat, dtype=dtype, order=order, copy=True), device=device)
    n_arr = move_tensor(fix_mat(arr, dtype=dtype, order=order, copy=True), device=device)

    expected = sclb.dtrsm(
        1e-2, mat, arr, side=0, lower=lower, trans_a=transpose, overwrite_b=0
    )

    rejects_input = device.startswith("cuda") and order == "C"
    if rejects_input:
        with pytest.raises(ValueError):
            trsm(n_arr, n_mat, alpha=1e-2, lower=lower, transpose=transpose)
        return

    actual = trsm(n_arr, n_mat, alpha=1e-2, lower=lower, transpose=transpose)
    np.testing.assert_allclose(expected, actual.cpu().numpy(), rtol=rtol)
def run_potrf_test(np_data, dtype, order, opt, input_device, upper, clean, overwrite):
    """Run ``gpu_cholesky`` and compare it with the LAPACK routine for ``dtype``.

    ``choose_on_dtype`` supplies both the reference function and the tolerance.
    The result must keep the input's stride, and when ``overwrite`` is true it
    must also keep the input's data pointer.
    """
    # Convert np_data to the requested dtype and memory order.
    host_data = np.array(np_data, order=order, dtype=dtype, copy=True)
    lapack_fn, rtol = choose_on_dtype(dtype)

    # The solver gets its own copy so `host_data` stays intact for the reference.
    A = move_tensor(torch.from_numpy(host_data.copy(order="K")), input_device)
    stride_before = A.stride()
    ptr_before = A.data_ptr()

    with memory_checker(opt) as new_opt:
        C_gpu = gpu_cholesky(A, upper=upper, clean=clean, overwrite=overwrite, opt=new_opt)

    assert stride_before == C_gpu.stride(), "gpu_potrf modified matrix stride."
    if overwrite:
        assert ptr_before == C_gpu.data_ptr(), "Data-pointer changed although overwrite is True."

    lapack_result = lapack_fn(
        host_data,
        lower=int(not upper),
        clean=int(clean),
        overwrite_a=int(overwrite),
    )
    C_cpu = lapack_result[0]
    np.testing.assert_allclose(C_cpu, C_gpu.cpu().numpy(), rtol=rtol, verbose=True)
def test_low(self, mat, order, dtype, device):
    """Check ``copy_triang`` starting from a matrix whose strict upper triangle is zero.

    With ``upper=False`` the result must contain no zeros, keep the lower
    triangle and diagonal of ``mat``, and be symmetric. With ``upper=True`` on
    the same zeroed input only the diagonal is checked.
    """
    reference = fix_mat(mat, order=order, dtype=dtype, numpy=True)

    # Upper triangle of the working copy is 0
    host_lower = reference.copy(order="K")
    host_lower[np.triu_indices(self.t, 1)] = 0

    # Create device matrix
    dev_lower = move_tensor(torch.from_numpy(host_lower), device)

    # Run copy
    copy_triang(dev_lower, upper=False)

    # Make checks on CPU
    result = dev_lower.cpu().numpy()
    assert np.count_nonzero(result == 0) == 0
    np.testing.assert_array_equal(np.tril(reference), np.tril(result))
    np.testing.assert_array_equal(np.triu(result), np.tril(result).T)
    np.testing.assert_array_equal(np.diag(reference), np.diag(result))

    # Reset and try with `upper=True`
    result[np.triu_indices(self.t, 1)] = 0
    dev_lower.copy_(torch.from_numpy(result))
    copy_triang(dev_lower, upper=True)

    # Only the diagonal will be set
    result = dev_lower.cpu().numpy()
    np.testing.assert_array_equal(np.diag(reference), np.diag(result))
def test_up(self, mat, order, dtype, device):
    """Check ``copy_triang`` starting from a matrix whose strict lower triangle is zero.

    With ``upper=True`` the result must contain no zeros, keep the upper
    triangle and diagonal of ``mat``, and be symmetric. With ``upper=False`` on
    the same zeroed input only the diagonal is checked.
    """
    reference = fix_mat(mat, order=order, dtype=dtype, numpy=True)

    # Lower triangle of the working copy is 0
    host_upper = reference.copy(order="K")
    host_upper[np.tril_indices(self.t, -1)] = 0

    # Create device matrix
    dev_upper = move_tensor(torch.from_numpy(host_upper), device)

    copy_triang(dev_upper, upper=True)

    result = dev_upper.cpu().numpy()
    assert np.count_nonzero(result == 0) == 0
    np.testing.assert_array_equal(np.triu(reference), np.triu(result))
    np.testing.assert_array_equal(np.tril(result), np.triu(result).T)
    np.testing.assert_array_equal(np.diag(reference), np.diag(result))

    # Reset and try with `upper=False`
    result[np.tril_indices(self.t, -1)] = 0
    dev_upper.copy_(torch.from_numpy(result))
    copy_triang(dev_upper, upper=False)

    # Only the diagonal will be set.
    result = dev_upper.cpu().numpy()
    np.testing.assert_array_equal(np.diag(reference), np.diag(result))
def test(self, A, B, k_class, k_exp, dtype, cpu, input_device):
    """Run the fMM check with no preallocated output.

    ``A`` and ``B`` arrive as numpy arrays and are moved to ``input_device``.
    For CUDA inputs the GPU memory limit in the options is lifted to infinity.
    """
    opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
    on_cuda = input_device.startswith("cuda")
    if on_cuda:
        # For fMM there is nothing we can do about CUDA memory usage!
        opt = dataclasses.replace(opt, max_gpu_mem=np.inf)

    A = move_tensor(torch.from_numpy(A), input_device)
    B = move_tensor(torch.from_numpy(B), input_device)

    _run_fmm_test(
        k_class,
        k_exp,
        A,
        B,
        out=None,
        dtype=dtype,
        opt=opt,
        rtol=self._RTOL[A.dtype],
    )
def test_trsm(self, mat, vec, solution, alpha, dtype, order_v, order_A, device):
    """Check ``trsm`` against a precomputed solution without overwriting ``vec``.

    ``solution`` unpacks to ``(expected_vector, lower, trans)``. The output must
    be a different buffer from ``vec`` while keeping its device, stride and
    dtype.
    """
    mat = move_tensor(
        fix_mat(mat, dtype, order_A, copy=True, numpy=False), device=device
    )
    vec = move_tensor(
        fix_mat(vec, dtype, order_v, copy=True, numpy=False), device=device
    )

    expected_vec, lower, trans = solution
    result = trsm(vec, mat, alpha, lower=int(lower), transpose=int(trans))

    assert result.data_ptr() != vec.data_ptr(), "Vec was overwritten."
    assert result.device == vec.device, "Output device is incorrect."
    assert result.stride() == vec.stride(), "Stride was modified."
    assert result.dtype == vec.dtype, "Dtype was modified."

    np.testing.assert_allclose(expected_vec, result.cpu().numpy(), rtol=self.rtol[dtype])
def test_strided(self, dtype, get_mat, expected_lower):
    """Run ``cuda_lauum_lower`` on square views sliced out of larger buffers.

    Input and output are each built by appending 10 zero columns, transposing,
    and slicing back to ``n x n``, so the views passed to the kernel live
    inside a larger allocation. The output is compared with
    ``np.tril(expected_lower)``.
    """
    device = torch.device("cuda:0")
    host_mat = get_mat(order="F", dtype=dtype)

    gpu_in = move_tensor(host_mat, device)
    n = gpu_in.shape[0]

    def padded_square_view(base):
        # Append 10 zero columns, transpose, then keep the leading n x n part.
        padding = torch.zeros(base.shape[0], 10, device=device, dtype=gpu_in.dtype)
        widened = torch.cat([base, padding], 1).T
        return widened[:base.shape[0], :base.shape[0]]

    gpu_in_strided = padded_square_view(gpu_in)
    gpu_in_strided.copy_(gpu_in)

    gpu_out = move_tensor(host_mat, device)
    gpu_out_strided = padded_square_view(gpu_out)
    gpu_out_strided.fill_(0.0)

    # Run on the GPU
    cuda_lauum_lower(
        n=n,
        A=gpu_in_strided,
        lda=gpu_in_strided.stride(1),
        B=gpu_out_strided,
        ldb=gpu_out_strided.stride(1),
    )
    torch.cuda.synchronize(device)

    # Compare the output with the expected lower triangle.
    np.testing.assert_allclose(
        np.tril(expected_lower),
        gpu_out_strided.cpu().numpy(),
        rtol=self.rtol[dtype],
    )
def test_with_out(self, Ac: np.ndarray, Bc: np.ndarray, k_class, k_exp, dtype, cpu, input_device):
    """Run the fMM check with a preallocated output tensor.

    Both inputs are cast to ``dtype`` and moved to ``input_device``. The output
    buffer has shape ``(Ac.shape[0], Bc.shape[0])`` and lives on the same
    device.
    """
    opt = dataclasses.replace(self.basic_options, use_cpu=cpu)

    Ac = move_tensor(torch.from_numpy(Ac.astype(dtype)), input_device)
    Bc = move_tensor(torch.from_numpy(Bc.astype(dtype)), input_device)

    n_rows, n_cols = Ac.shape[0], Bc.shape[0]
    out = torch.empty(n_rows, n_cols, dtype=Ac.dtype, device=input_device)

    _run_fmm_test(
        k_class,
        k_exp,
        Ac,
        Bc,
        out=out,
        dtype=dtype,
        opt=opt,
        rtol=self._RTOL[Ac.dtype],
    )
def fix_mat(t, dtype, order, device="cpu", copy=False, numpy=False):
    """Convert ``t`` to the requested dtype and memory order.

    Args:
        t: A ``torch.Tensor`` (converted with ``.numpy()``), a ``np.ndarray``,
            or any other object (returned unchanged).
        dtype: Target numpy dtype. If ``None`` the function returns ``None``.
        order: Target memory order passed to numpy. If ``None`` the function
            returns ``None``.
        device: Device handed to ``move_tensor`` when a tensor is returned.
        copy: If true, always copy the data. If false, copy only when the
            dtype or order conversion requires it.
        numpy: If true, return the numpy array instead of a tensor.

    Returns:
        ``None`` when ``dtype`` or ``order`` is ``None``; the converted
        ``np.ndarray`` when ``numpy`` is true; otherwise the result of
        ``move_tensor(torch.from_numpy(array), device)``. Inputs that are
        neither tensors nor arrays are returned as-is.
    """
    if dtype is None or order is None:
        return None
    if isinstance(t, torch.Tensor):
        t = t.numpy()
    if not isinstance(t, np.ndarray):
        return t

    if copy:
        t = np.array(t, dtype=dtype, order=order, copy=True)
    else:
        # NumPy >= 2.0 makes `np.array(..., copy=False)` raise ValueError when
        # a copy is unavoidable. `np.asarray` keeps the "copy only if needed"
        # behaviour on both NumPy 1.x and 2.x.
        t = np.asarray(t, dtype=dtype, order=order)

    if numpy:
        return t
    return move_tensor(torch.from_numpy(t), device)
def test_one_rhs(self, mat, vec_rhs, conjgrad, order, device):
    """Solve without an initial guess and compare with ``np.linalg.solve``.

    The result must stay on ``device``, keep the stride of ``vec_rhs`` and have
    shape ``(self.t, vec_rhs.shape[1])``.
    """
    if order == "F":
        # Rebuild both operands from Fortran-ordered numpy copies.
        mat = torch.from_numpy(np.asfortranarray(mat.numpy()))
        vec_rhs = torch.from_numpy(np.asfortranarray(vec_rhs.numpy()))

    mat = move_tensor(mat, device)
    vec_rhs = move_tensor(vec_rhs, device)

    def matvec(x_):
        return mat @ x_

    solution = conjgrad.solve(
        X0=None,
        B=vec_rhs,
        mmv=matvec,
        max_iter=10,
        callback=None,
    )

    assert str(solution.device) == device, "Device has changed unexpectedly"
    assert solution.stride() == vec_rhs.stride(), "Stride has changed unexpectedly"
    assert solution.shape == (self.t, vec_rhs.shape[1]), "Output shape is incorrect"

    reference = np.linalg.solve(mat.cpu().numpy(), vec_rhs.cpu().numpy())
    np.testing.assert_allclose(reference, solution.cpu().numpy(), rtol=1e-6)
def test_precise_kernel(self, A, B, k_class, k_exp, cpu, input_device):
    """Run the fMM check with ``no_single_kernel=True`` and a fixed ``rtol=1e-6``.

    The inputs arrive as numpy arrays, are moved to ``input_device``, and the
    result is written into a preallocated output tensor.
    """
    opt = dataclasses.replace(self.basic_options, use_cpu=cpu, no_single_kernel=True)

    A = move_tensor(torch.from_numpy(A), input_device)
    B = move_tensor(torch.from_numpy(B), input_device)

    out_shape = (A.shape[0], B.shape[0])
    out = torch.empty(*out_shape, dtype=A.dtype, device=input_device)

    # Per the original note, this rtol is 10x lower than in the other tests
    # (their tolerance table, self._RTOL, is not visible from this function).
    _run_fmm_test(
        k_class,
        k_exp,
        A,
        B,
        out=out,
        dtype=np.float32,
        opt=opt,
        rtol=1e-6,
    )
def test_no_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
    """Check ``gpu_lauum`` with ``overwrite=False`` for both triangles.

    The upper case runs first, then the lower case; each runs inside its own
    ``memory_checker`` context and is compared with the matching expected
    matrix.
    """
    mat = move_tensor(get_mat(order=order, dtype=dtype), device)

    # For cuda inputs we must add to available GPU memory the amount used by the
    # input matrix, since overwrite=False and a full copy must be performed.
    if device.startswith("cuda"):
        mgpu_slack = self.basic_opt.max_gpu_mem + mat.shape[0]**2 * sizeof_dtype(mat.dtype)
    else:
        mgpu_slack = 0

    cases = ((True, expected_upper), (False, expected_lower))
    for upper, expected in cases:
        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            actual = gpu_lauum(mat, upper=upper, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected, actual.cpu().numpy(), rtol=self.rtol[dtype])
def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
    """Check ``gpu_lauum`` with ``overwrite=True`` and ``write_opposite=True``.

    For ``upper=True`` the strict upper triangle of the output must still equal
    the original matrix, while its lower triangle must equal the transposed
    upper triangle of ``expected_upper``. The ``upper=False`` case mirrors
    this with the roles of the triangles swapped.
    """
    original = get_mat(order=order, dtype=dtype)

    # --- upper=True: result is written into the lower triangle ---
    mat = get_mat(order=order, dtype=dtype, device=device)
    with memory_checker(self.basic_opt) as new_opt:
        act_up = gpu_lauum(mat, upper=True, overwrite=True, write_opposite=True, opt=new_opt)
        torch.cuda.synchronize()
    up_host = act_up.cpu().numpy()
    np.testing.assert_allclose(
        np.triu(original, k=1), np.triu(up_host, k=1), rtol=self.rtol[dtype]
    )
    np.testing.assert_allclose(
        np.tril(up_host), np.triu(expected_upper).T, rtol=self.rtol[dtype]
    )

    # --- upper=False: result is written into the upper triangle ---
    mat = get_mat(order=order, dtype=dtype)
    mat = move_tensor(mat, device)
    with memory_checker(self.basic_opt) as new_opt:
        act_lo = gpu_lauum(mat, upper=False, overwrite=True, write_opposite=True, opt=new_opt)
        torch.cuda.synchronize()
    lo_host = act_lo.cpu().numpy()
    np.testing.assert_allclose(
        np.tril(original, k=-1), np.tril(lo_host, k=-1), rtol=self.rtol[dtype]
    )
    np.testing.assert_allclose(
        np.triu(lo_host), np.tril(expected_lower).T, rtol=self.rtol[dtype]
    )