Code example #1
File: test_ooc_lauum.py  Project: mathurinm/falkon
    def test_write_opposite(self, dtype, order, get_mat, expected_lower,
                            expected_upper):
        omat = get_mat(order=order, dtype=dtype).numpy()
        mat = torch.from_numpy(omat.copy(order="K"))
        with memory_checker(self.basic_opt) as new_opt:
            act_up = gpu_lauum(mat,
                               upper=True,
                               overwrite=True,
                               write_opposite=True,
                               opt=new_opt)
        np.testing.assert_allclose(np.triu(omat, k=1),
                                   np.triu(act_up.numpy(), k=1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.tril(act_up.numpy()),
                                   np.triu(expected_upper).T,
                                   rtol=self.rtol[dtype])

        mat = torch.from_numpy(omat.copy(order="K"))
        with memory_checker(self.basic_opt) as new_opt:
            act_lo = gpu_lauum(mat,
                               upper=False,
                               overwrite=True,
                               write_opposite=True,
                               opt=new_opt)
        np.testing.assert_allclose(np.tril(omat, k=-1),
                                   np.tril(act_lo.numpy(), k=-1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.triu(act_lo.numpy()),
                                   np.tril(expected_lower).T,
                                   rtol=self.rtol[dtype])
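The assertions in example #1 pin down the write_opposite=True contract: the strict triangle of the input is left untouched, and the LAUUM product is written, transposed, into the opposite triangle. A minimal NumPy sketch of that reference behavior for the upper case (the size and seed are illustrative, and it assumes the expected result of LAUUM with upper=True is U @ U.T):

import numpy as np

n = 4
rng = np.random.default_rng(0)
a = rng.standard_normal((n, n))

u = np.triu(a)            # upper-triangular factor U
prod = u @ u.T            # LAUUM with upper=True yields the symmetric product U @ U^T

# write_opposite=True: keep the strict upper triangle of the input and
# write the result (transposed) into the lower triangle.
ref = np.triu(a, k=1) + np.tril(prod.T)

np.testing.assert_allclose(np.triu(ref, k=1), np.triu(a, k=1))
np.testing.assert_allclose(np.tril(ref), np.triu(prod).T)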
Code example #2
    def test_no_overwrite(self, dtype, order, get_mat, expected_lower,
                          expected_upper, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        # For CUDA inputs we must add the memory occupied by the input matrix
        # to the available GPU memory, since with overwrite=False a full copy
        # of the input must be made on the device.
        mgpu_slack = 0
        if device.startswith("cuda"):
            mgpu_slack = (self.basic_opt.max_gpu_mem +
                          mat.shape[0]**2 * sizeof_dtype(mat.dtype))

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_up = gpu_lauum(mat, upper=True, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_upper,
                                   act_up.cpu().numpy(),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(omat, mat.cpu())

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_lo = gpu_lauum(mat, upper=False, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_lower,
                                   act_lo.cpu().numpy(),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(omat, mat.cpu())
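The comment in example #2 carries the key arithmetic: with overwrite=False, the out-of-place copy of an n-by-n input costs an extra n**2 * sizeof(dtype) bytes on the device, which is exactly the slack handed to the memory checker. For instance (sizes made up for illustration):

n, itemsize = 1000, 8               # a 1000x1000 float64 matrix, 8 bytes per element
extra_bytes = n * n * itemsize      # memory for the full copy made by overwrite=False
print(extra_bytes / 2**20)          # ~7.63 MiB of extra GPU memory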
Code example #3
File: test_ooc_lauum.py  Project: fwilliams/falkon
    def test_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper):
        mat = get_mat(order=order, dtype=dtype).numpy().copy(order="K")
        with memory_checker(self.basic_opt) as new_opt:
            act_up = gpu_lauum(torch.from_numpy(mat), upper=True, overwrite=True, opt=new_opt)
        np.testing.assert_allclose(expected_upper, act_up.numpy(), rtol=self.rtol[dtype])

        mat = get_mat(order=order, dtype=dtype).numpy().copy(order="K")
        with memory_checker(self.basic_opt) as new_opt:
            act_lo = gpu_lauum(torch.from_numpy(mat), upper=False, overwrite=True, opt=new_opt)
        np.testing.assert_allclose(expected_lower, act_lo.numpy(), rtol=self.rtol[dtype])
Code example #4
File: test_ooc_lauum.py  Project: fwilliams/falkon
    def test_no_blk_mul(self, get_mat, expected_upper):
        dtype = np.float32
        mat = get_mat(order="F", dtype=dtype).numpy().copy(order="K")
        opt = dataclasses.replace(self.basic_opt, lauum_par_blk_multiplier=1)

        act_up = gpu_lauum(torch.from_numpy(mat), upper=True, overwrite=True, opt=opt)
        torch.cuda.synchronize()
        np.testing.assert_allclose(expected_upper, act_up.numpy(), rtol=self.rtol[dtype])
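dataclasses.replace is how these tests derive one-off option sets: it returns a modified copy of the (frozen) options object without mutating the shared basic_opt. A generic sketch of the pattern; the Options class and its defaults below are stand-ins, not falkon's FalkonOptions:

import dataclasses

@dataclasses.dataclass(frozen=True)
class Options:
    max_gpu_mem: float = float("inf")
    lauum_par_blk_multiplier: int = 8

base = Options()
small = dataclasses.replace(base, lauum_par_blk_multiplier=1)
assert base.lauum_par_blk_multiplier == 8     # original is untouched
assert small.lauum_par_blk_multiplier == 1    # copy carries the override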
Code example #5
    def test_diff_blk_sizes(self, dtype, order, get_mat, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        # For CUDA inputs we must add the memory occupied by the input matrix
        # to the available GPU memory, since with overwrite=False a full copy
        # of the input must be made on the device.
        mgpu_slack = 0
        if device.startswith("cuda"):
            mgpu_slack = mat.shape[0]**2 * sizeof_dtype(mat.dtype)

        opt_v1 = dataclasses.replace(self.basic_opt, max_gpu_mem=2*2**20 + mgpu_slack)
        act_up_v1 = gpu_lauum(mat, upper=True, overwrite=False, opt=opt_v1)
        opt_v2 = dataclasses.replace(self.basic_opt, max_gpu_mem=4*2**20 + mgpu_slack)
        act_up_v2 = gpu_lauum(mat, upper=True, overwrite=False, opt=opt_v2)
        opt_v3 = dataclasses.replace(self.basic_opt, max_gpu_mem=6*2**20 + mgpu_slack)
        act_up_v3 = gpu_lauum(mat, upper=True, overwrite=False, opt=opt_v3)

        np.testing.assert_allclose(act_up_v3.cpu().numpy(), act_up_v1.cpu().numpy(), rtol=self.rtol[dtype])
        np.testing.assert_allclose(act_up_v3.cpu().numpy(), act_up_v2.cpu().numpy(), rtol=self.rtol[dtype])
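The three max_gpu_mem caps in example #5 are plain byte counts, and each presumably forces a different block (tile) size in the out-of-core algorithm; the point of the test is that the result is invariant to that choice. The conversion, for reference:

for cap in (2 * 2**20, 4 * 2**20, 6 * 2**20):
    print(cap, "bytes =", cap / 2**20, "MiB")   # 2.0, 4.0 and 6.0 MiB budgets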
Code example #6
    def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        mgpu_slack = mat.shape[0]**2 * sizeof_dtype(mat.dtype)
        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_up = gpu_lauum(mat, upper=True, overwrite=False, write_opposite=True, opt=new_opt)
        act_up = act_up.cpu()
        np.testing.assert_allclose(np.triu(omat, k=1), np.triu(act_up.numpy(), k=1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.tril(act_up.numpy()), np.triu(expected_upper).T,
                                   rtol=self.rtol[dtype])

        mat = get_mat(order=order, dtype=dtype, device=device)
        with memory_checker(self.basic_opt) as new_opt:
            act_lo = gpu_lauum(mat, upper=False, overwrite=True, write_opposite=True, opt=new_opt)
            torch.cuda.synchronize()
        act_lo = act_lo.cpu()
        np.testing.assert_allclose(np.tril(omat, k=-1), np.tril(act_lo.numpy(), k=-1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.triu(act_lo.numpy()), np.tril(expected_lower).T,
                                   rtol=self.rtol[dtype])
Code example #7
File: pc_utils.py  Project: mathurinm/falkon
def lauum_wrapper(A: np.ndarray, upper: bool, use_cuda: bool,
                  opt: FalkonOptions) -> np.ndarray:
    if use_cuda:
        from falkon.ooc_ops.ooc_lauum import gpu_lauum
        return gpu_lauum(A,
                         upper=upper,
                         write_opposite=True,
                         overwrite=True,
                         opt=opt)
    else:
        lauum = choose_fn(A.dtype, scll.dlauum, scll.slauum, "LAUUM")
        sol, info = lauum(A, lower=int(not upper), overwrite_c=1)
        if info != 0:
            raise RuntimeError(f"Lapack LAUUM failed with error code {info}.")
        return sol
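The CPU branch of lauum_wrapper maps directly onto SciPy's LAPACK bindings (scll above is presumably scipy.linalg.lapack, and choose_fn picks the d/s variant by dtype). A self-contained sketch of that path, calling dlauum directly:

import numpy as np
from scipy.linalg import lapack

n = 4
rng = np.random.default_rng(0)
u = np.triu(rng.standard_normal((n, n)))    # upper-triangular factor U

# lower=0 selects the 'upper' case: the product U @ U^T is written into
# the upper triangle of the output.
sol, info = lapack.dlauum(u, lower=0)
if info != 0:
    raise RuntimeError(f"LAPACK LAUUM failed with error code {info}.")

np.testing.assert_allclose(np.triu(sol), np.triu(u @ u.T))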