def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper):
    """In-place LAUUM with write_opposite=True: the result lands in the
    opposite triangle while the strict original triangle is preserved."""
    original = get_mat(order=order, dtype=dtype).numpy()

    # Upper input -> result written into the lower triangle of the output.
    work = torch.from_numpy(original.copy(order="K"))
    with memory_checker(self.basic_opt) as new_opt:
        out = gpu_lauum(work, upper=True, overwrite=True, write_opposite=True, opt=new_opt)
    out = out.numpy()
    # Strict upper triangle of the input must be untouched.
    np.testing.assert_allclose(np.triu(original, k=1), np.triu(out, k=1), rtol=self.rtol[dtype])
    # Lower triangle (incl. diagonal) holds the transposed expected result.
    np.testing.assert_allclose(np.tril(out), np.triu(expected_upper).T, rtol=self.rtol[dtype])

    # Lower input -> result written into the upper triangle of the output.
    work = torch.from_numpy(original.copy(order="K"))
    with memory_checker(self.basic_opt) as new_opt:
        out = gpu_lauum(work, upper=False, overwrite=True, write_opposite=True, opt=new_opt)
    out = out.numpy()
    np.testing.assert_allclose(np.tril(original, k=-1), np.tril(out, k=-1), rtol=self.rtol[dtype])
    np.testing.assert_allclose(np.triu(out), np.tril(expected_lower).T, rtol=self.rtol[dtype])
def test_no_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
    # LAUUM with overwrite=False: the output must match the expected factor
    # and the input matrix must be left unmodified.
    omat = get_mat(order=order, dtype=dtype)
    mat = get_mat(order=order, dtype=dtype, device=device)
    # For cuda inputs we must add to available GPU memory the amount used by the
    # input matrix, since overwrite=False and a full copy must be performed.
    mgpu_slack = 0
    if device.startswith("cuda"):
        # NOTE(review): this adds `max_gpu_mem` on top of the matrix size, while
        # the sibling tests (test_diff_blk_sizes, test_write_opposite) add only
        # the matrix size -- confirm the extra allowance is intentional.
        mgpu_slack = self.basic_opt.max_gpu_mem + mat.shape[
            0]**2 * sizeof_dtype(mat.dtype)
    with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
        act_up = gpu_lauum(mat, upper=True, overwrite=False, opt=new_opt)
        # Synchronize inside the context so memory usage is measured after
        # all GPU work has completed.
        torch.cuda.synchronize()
    np.testing.assert_allclose(expected_upper, act_up.cpu().numpy(), rtol=self.rtol[dtype])
    # Input must be unchanged since overwrite=False.
    np.testing.assert_allclose(omat, mat.cpu())
    with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
        act_lo = gpu_lauum(mat, upper=False, overwrite=False, opt=new_opt)
        torch.cuda.synchronize()
    np.testing.assert_allclose(expected_lower, act_lo.cpu().numpy(), rtol=self.rtol[dtype])
    np.testing.assert_allclose(omat, mat.cpu())
def test_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper):
    """In-place LAUUM (overwrite=True) reproduces the expected factor for
    both the upper and the lower triangular inputs."""
    # Run upper first, then lower, exactly as the two original halves did.
    for upper_flag, expected in ((True, expected_upper), (False, expected_lower)):
        data = get_mat(order=order, dtype=dtype).numpy().copy(order="K")
        with memory_checker(self.basic_opt) as new_opt:
            result = gpu_lauum(torch.from_numpy(data), upper=upper_flag,
                               overwrite=True, opt=new_opt)
        np.testing.assert_allclose(expected, result.numpy(), rtol=self.rtol[dtype])
def test_no_blk_mul(self, get_mat, expected_upper):
    """LAUUM still produces the correct upper factor when the parallel block
    multiplier option is forced down to 1."""
    dtype = np.float32
    data = get_mat(order="F", dtype=dtype).numpy().copy(order="K")
    single_blk_opt = dataclasses.replace(self.basic_opt, lauum_par_blk_multiplier=1)
    # Renamed from `act_lo`: this exercises the upper=True path.
    act_up = gpu_lauum(torch.from_numpy(data), upper=True, overwrite=True, opt=single_blk_opt)
    torch.cuda.synchronize()
    np.testing.assert_allclose(expected_upper, act_up.numpy(), rtol=self.rtol[dtype])
def test_diff_blk_sizes(self, dtype, order, get_mat, device):
    """LAUUM results must be identical under different GPU memory budgets
    (the budget drives the out-of-core block size).

    Fixes: removed the unused local `omat` which was fetched but never used.
    """
    mat = get_mat(order=order, dtype=dtype, device=device)
    # For cuda inputs we must add to available GPU memory the amount used by the
    # input matrix, since overwrite=False and a full copy must be performed.
    mgpu_slack = 0
    if device.startswith("cuda"):
        mgpu_slack = mat.shape[0] ** 2 * sizeof_dtype(mat.dtype)
    # Three budgets (in MiB) -> three different block sizes.
    results = []
    for mem_mib in (2, 4, 6):
        opt = dataclasses.replace(self.basic_opt, max_gpu_mem=mem_mib * 2 ** 20 + mgpu_slack)
        results.append(gpu_lauum(mat, upper=True, overwrite=False, opt=opt))
    # All runs must agree pairwise (compare each against the largest-budget run).
    np.testing.assert_allclose(results[2].cpu().numpy(), results[0].cpu().numpy(),
                               rtol=self.rtol[dtype])
    np.testing.assert_allclose(results[2].cpu().numpy(), results[1].cpu().numpy(),
                               rtol=self.rtol[dtype])
def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
    """write_opposite=True stores the LAUUM result in the opposite triangle,
    preserving the strict original triangle — checked for both upper
    (overwrite=False) and lower (overwrite=True) paths on `device`.

    Fixes: the extra-memory slack for the copy was added unconditionally;
    it is now gated on cuda devices, consistent with the sibling tests —
    CPU inputs need no GPU-side copy allowance.
    """
    omat = get_mat(order=order, dtype=dtype)
    mat = get_mat(order=order, dtype=dtype, device=device)
    # For cuda inputs we must add to available GPU memory the amount used by the
    # input matrix, since overwrite=False and a full copy must be performed.
    mgpu_slack = 0
    if device.startswith("cuda"):
        mgpu_slack = mat.shape[0] ** 2 * sizeof_dtype(mat.dtype)
    with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
        act_up = gpu_lauum(mat, upper=True, overwrite=False, write_opposite=True, opt=new_opt)
    act_up = act_up.cpu()
    # Strict upper triangle untouched; lower triangle holds the transposed result.
    np.testing.assert_allclose(np.triu(omat, k=1), np.triu(act_up.numpy(), k=1),
                               rtol=self.rtol[dtype])
    np.testing.assert_allclose(np.tril(act_up.numpy()), np.triu(expected_upper).T,
                               rtol=self.rtol[dtype])

    mat = get_mat(order=order, dtype=dtype, device=device)
    # overwrite=True: no copy is made, so no extra memory allowance is needed.
    with memory_checker(self.basic_opt) as new_opt:
        act_lo = gpu_lauum(mat, upper=False, overwrite=True, write_opposite=True, opt=new_opt)
        torch.cuda.synchronize()
    act_lo = act_lo.cpu()
    np.testing.assert_allclose(np.tril(omat, k=-1), np.tril(act_lo.numpy(), k=-1),
                               rtol=self.rtol[dtype])
    np.testing.assert_allclose(np.triu(act_lo.numpy()), np.tril(expected_lower).T,
                               rtol=self.rtol[dtype])
def lauum_wrapper(A: np.ndarray, upper: bool, use_cuda: bool, opt: FalkonOptions) -> np.ndarray:
    """Run a LAUUM operation on ``A``, dispatching to the GPU or LAPACK backend.

    Parameters
    ----------
    A : triangular input matrix (modified in place by both backends).
    upper : True to treat ``A`` as upper triangular, False for lower.
    use_cuda : if True use the Falkon out-of-core GPU implementation,
        otherwise the scipy LAPACK ``lauum`` routine.
    opt : Falkon options forwarded to the GPU implementation.

    Raises
    ------
    RuntimeError : if the LAPACK routine reports a non-zero info code.
    """
    if not use_cuda:
        # Pick the double/single precision LAPACK routine matching A's dtype.
        lapack_lauum = choose_fn(A.dtype, scll.dlauum, scll.slauum, "LAUUM")
        result, info = lapack_lauum(A, lower=int(not upper), overwrite_c=1)
        if info != 0:
            raise RuntimeError(f"Lapack LAUUM failed with error code {info}.")
        return result
    # Import lazily so CPU-only runs never touch the GPU module.
    from falkon.ooc_ops.ooc_lauum import gpu_lauum
    return gpu_lauum(A, upper=upper, write_opposite=True, overwrite=True, opt=opt)