Example 1
def to_torch(Xtr, Ytr, Xts, Yts, **kwargs):
    from falkon.sparse.sparse_tensor import SparseTensor
    import torch
    return (SparseTensor.from_scipy(Xtr),
            torch.from_numpy(Ytr),
            SparseTensor.from_scipy(Xts),
            torch.from_numpy(Yts), {})
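
A small usage sketch for the helper above; the scipy inputs and shapes below are illustrative assumptions, not taken from the original code.

import numpy as np
import scipy.sparse

# Illustrative inputs: scipy CSR feature matrices and numpy target arrays.
Xtr = scipy.sparse.random(100, 50, density=0.05, format='csr', dtype=np.float32)
Xts = scipy.sparse.random(20, 50, density=0.05, format='csr', dtype=np.float32)
Ytr = np.random.randn(100, 1).astype(np.float32)
Yts = np.random.randn(20, 1).astype(np.float32)

# Returns falkon SparseTensors for the features, torch tensors for the targets,
# plus an empty dict of extra metadata.
Xtr_t, Ytr_t, Xts_t, Yts_t, extra = to_torch(Xtr, Ytr, Xts, Yts)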
Example 2
    def test_start_zero(self):
        device = 'cpu'
        indexptr = torch.tensor([0, 1, 3, 4], dtype=torch.long, device=device)
        index = torch.tensor([1, 0, 1, 0], dtype=torch.long, device=device)
        value = torch.tensor([2, 1, 3, 4], dtype=torch.float32, device=device)
        arr = SparseTensor(indexptr=indexptr,
                           index=index,
                           data=value,
                           size=(3, 2),
                           sparse_type="csr")

        arr_small = arr.narrow_rows(0, 2)
        sm_coo = arr_small.to_scipy().tocoo()
        self.assertEqual(sm_coo.row.tolist(), [0, 1, 1])
        self.assertEqual(sm_coo.col.tolist(), [1, 0, 1])
        self.assertEqual(sm_coo.data.tolist(), [2, 1, 3])
        self.assertEqual(arr.indexptr.data_ptr(),
                         arr_small.indexptr.data_ptr())

        arr_small = arr.narrow_rows(0, 1)
        sm_coo = arr_small.to_scipy().tocoo()
        self.assertEqual(sm_coo.row.tolist(), [0])
        self.assertEqual(sm_coo.col.tolist(), [1])
        self.assertEqual(sm_coo.data.tolist(), [2])
        self.assertEqual(arr.indexptr.data_ptr(),
                         arr_small.indexptr.data_ptr())
Example 3
def sparse_matmul(A: SparseTensor, B: SparseTensor,
                  out: torch.Tensor) -> torch.Tensor:
    """Sparse*Sparse matrix multiplication. Output will be copied into dense `out` matrix.

    This function can be applied to CPU or CUDA tensors (but all tensors must
    be on the same device).

    Parameters
    ----------
    A : SparseTensor
        N x D, sparse matrix.
    B : SparseTensor
        D x M, sparse matrix.
    out : torch.Tensor
        Dense N x M tensor, it will hold the output of the multiplication.

    Returns
    -------
    out : torch.Tensor
        The same tensor as the input `out` parameter.

    """
    if A.nnz() == 0 or B.nnz() == 0:
        out.fill_(0.0)
        return out

    if A.is_cuda:
        return _sparse_matmul_cuda(A, B, out)
    else:
        return _sparse_matmul_cpu(A, B, out)
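
A minimal CPU usage sketch of `sparse_matmul` as defined above, mirroring the CSR/CSC setup used in the tests further below (Example 9); the sizes and density are illustrative assumptions.

import numpy as np
import scipy.sparse
import torch
from falkon.sparse.sparse_tensor import SparseTensor
# `sparse_matmul` refers to the function defined above (its import path is not shown here).

# A must be CSR and B must be CSC for the CPU code path (see Example 13).
A = SparseTensor.from_scipy(
    scipy.sparse.random(50, 20, density=0.1, format='csr', dtype=np.float32))
B = SparseTensor.from_scipy(
    scipy.sparse.random(20, 30, density=0.1, format='csc', dtype=np.float32))
out = torch.empty(50, 30, dtype=torch.float32)  # dense N x M output buffer

sparse_matmul(A, B, out)  # `out` now holds the product A @ B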
Example 4
def fmm_cpu_sparse(X1: SparseTensor, X2: SparseTensor,
                   kernel: 'falkon.kernels.Kernel',
                   out: Optional[torch.Tensor],
                   opt: BaseOptions) -> torch.Tensor:
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)

    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")

        blockwise_fmm_cpu_sparse(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare_sparse(X1, X2)
        kernel._apply_sparse(X1, X2.transpose_csc(), out)
        kernel._finalize(out, ddd)

    return out
Example 5
    def _prepare_sparse(self, X1: SparseTensor,
                        X2: SparseTensor) -> DistKerContainer:
        sq1 = torch.empty(X1.size(0), dtype=X1.dtype, device=X1.device)
        sparse_ops.sparse_square_norm(X1, sq1)
        sq2 = torch.empty(X2.size(0), dtype=X1.dtype, device=X1.device)
        sparse_ops.sparse_square_norm(X2, sq2)
        return DistKerContainer(sq1=sq1.reshape(-1, 1), sq2=sq2.reshape(-1, 1))
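
`_prepare_sparse` caches the row-wise squared norms of the two inputs. Presumably these feed the usual squared-distance expansion used by distance-based kernels; a dense sketch of that identity follows (the kernel class consuming the container is not shown in this snippet).

import torch

# ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <x, y>, with sq1/sq2 playing the role of
# the cached squared norms stored in DistKerContainer.
X1 = torch.randn(5, 3)
X2 = torch.randn(4, 3)
sq1 = X1.square().sum(dim=1, keepdim=True)   # (5, 1)
sq2 = X2.square().sum(dim=1, keepdim=True)   # (4, 1)

dist2 = sq1 + sq2.T - 2.0 * (X1 @ X2.T)      # (5, 4) matrix of squared distances
assert torch.allclose(dist2, torch.cdist(X1, X2).square(), atol=1e-5)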
Example 6
def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_fortran((N, v.size(1)), v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Create queues
    args = []  # Arguments passed to each subprocess
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0: continue
        args.append((ArgsFmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram), g.Id))

    _start_wait_processes(sparse_fmmv, args)
    return out
Example 7
def fmm_cuda_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor] = None,
                    opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((out, 'out'))
    N = X1.size(0)
    M = X2.size(0)
    if out is None:
        out = create_fortran((N, M), X1.dtype, 'cpu', pin_memory=True)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # If float32 we need to upcast to float64 to avoid numerical precision errors
    # in the kernel
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    # Create the arguments passed to each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmm(X1=X1.narrow_rows(block_sizes[i], bwidth),
                             X2=X2,
                             out=out.narrow(0, block_sizes[i], bwidth),
                             kernel=kernel,
                             gpu_dtype=gpu_dtype,
                             max_mem=g.usable_ram), g.Id))
    _start_wait_processes(_sparse_fmm, args)
    torch.cuda.empty_cache()
    return out
Example 8
    def test_simple_transpose(self):
        for device in ('cpu', 'cuda:0'):
            with self.subTest(device=device):
                if device == 'cuda:0' and not torch.cuda.is_available():
                    self.skipTest("Cuda not available")

                indexptr = torch.tensor([0, 1, 3, 4],
                                        dtype=torch.long,
                                        device=device)
                index = torch.tensor([1, 0, 1, 0],
                                     dtype=torch.long,
                                     device=device)
                value = torch.tensor([2, 1, 3, 4],
                                     dtype=torch.float32,
                                     device=device)
                arr = SparseTensor(indexptr=indexptr,
                                   index=index,
                                   data=value,
                                   size=(3, 2),
                                   sparse_type="csr")
                tr_arr = arr.transpose_csc()
                self.assertEqual((2, 3), tr_arr.shape)
                tr_mat = tr_arr.to_scipy().tocoo()
                self.assertEqual(tr_mat.row.tolist(), [1, 0, 1, 0])
                self.assertEqual(tr_mat.col.tolist(), [0, 1, 1, 2])
                self.assertEqual(tr_mat.data.tolist(), [2, 1, 3, 4])
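
The test relies on `transpose_csc` reinterpreting the CSR arrays of a matrix as the CSC arrays of its transpose, without copying any data. A small scipy check of that reinterpretation, using the same arrays as the test (independent of falkon):

import numpy as np
import scipy.sparse

indptr = np.array([0, 1, 3, 4])
indices = np.array([1, 0, 1, 0])
data = np.array([2., 1., 3., 4.], dtype=np.float32)

# CSR view: the 3x2 matrix [[0, 2], [1, 3], [4, 0]].
csr = scipy.sparse.csr_matrix((data, indices, indptr), shape=(3, 2))
# The same three arrays read as CSC, with the dimensions swapped, describe the transpose.
csc = scipy.sparse.csc_matrix((data, indices, indptr), shape=(2, 3))
assert (csc.toarray() == csr.toarray().T).all()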
Example 9
    def test_cpu_matmul(self, mat1, mat2, expected):
        out = torch.empty_like(expected)
        mat1_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat1))
        mat2_csc = SparseTensor.from_scipy(scipy.sparse.csc_matrix(mat2))
        sparse_matmul(mat1_csr, mat2_csc, out)

        torch.testing.assert_allclose(out, expected)
Example 10
def fdmmv_cpu_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: Optional[torch.Tensor],
                     w: Optional[torch.Tensor],
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None):
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")
    T = v.size(1) if v is not None else w.size(1)
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrow X1 : n
    # ker_chunk : n*M
    # w_blk     : n*T
    n = avail_mem / (M + T + 1)
    n = int(math.floor(n))
    if n < 1:
        raise MemoryError(("Available memory %.2fGB is insufficient "
                           "for blockwise fdMMv.") %
                          (avail_mem * sizeof_dtype(dtype) / 2**30))

    # Allocate fixed arrays
    ker_chunk = create_same_stride((n, M), out, dtype, device='cpu')
    w_blk = create_same_stride((n, T), out, dtype, device='cpu')
    # Run blocked fdmmv
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        X1_chunk = X1.narrow_rows(i, ic)
        cur_ker_chunk = ker_chunk[:ic]
        cur_ker_chunk.fill_(0.0)
        ddd = kernel._prepare_sparse(X1_chunk, X2)
        kernel._apply_sparse(X1_chunk, X2.transpose_csc(), cur_ker_chunk)
        kernel._finalize(cur_ker_chunk, ddd)

        # Multiply by the vector v
        cur_w_blk = w_blk[:ic]  # n x T
        cur_w_blk.fill_(0.0)
        if w is not None:
            cur_w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk + c_out * v => (n x T) + (n x M)*(M x T)
            cur_w_blk.addmm_(cur_ker_chunk, v)
        out.addmm_(cur_ker_chunk.T, cur_w_blk)
    del ker_chunk, w_blk
    return out
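
In dense terms the loop above accumulates `out = K(X1, X2).T @ (w + K(X1, X2) @ v)` over row blocks of `X1` (with `w` treated as zero when absent). A self-contained sketch of that equivalence, using a plain linear kernel as a stand-in for the real kernel object (an assumption made only for illustration):

import torch

N, D, M, T = 100, 5, 20, 3
X1, X2 = torch.randn(N, D), torch.randn(M, D)
v, w = torch.randn(M, T), torch.randn(N, T)

K = X1 @ X2.T                    # stand-in kernel matrix, N x M
reference = K.T @ (w + K @ v)    # M x T, the fdMMV quantity

# Blocked accumulation over rows of X1, mirroring the structure of the loop above.
out = torch.zeros(M, T)
n = 32
for i in range(0, N, n):
    Kb = X1[i:i + n] @ X2.T      # kernel block, (<=n) x M
    wb = w[i:i + n] + Kb @ v     # (<=n) x T
    out += Kb.T @ wb
assert torch.allclose(out, reference, rtol=1e-4, atol=1e-4)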
Example 11
    def test_cuda_matmul(self, mat1, mat2, expected):
        dev = torch.device("cuda:0")
        out = create_fortran(expected.shape, expected.dtype, dev)
        mat1_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(mat1)).to(device=dev)
        mat2_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(mat2)).to(device=dev)
        sparse_matmul(mat1_csr, mat2_csr, out)

        torch.testing.assert_allclose(out.cpu(), expected)
Example 12
def fdmmv_cuda_sparse(X1: SparseTensor,
                      X2: SparseTensor,
                      v: Optional[torch.Tensor],
                      w: Optional[torch.Tensor],
                      kernel,
                      out: Optional[torch.Tensor] = None,
                      opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.95)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_C((M, T), X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        cur_out_gpu = create_C((M, T), X1.dtype,
                               f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None

        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))

    _start_wait_processes(sparse_fdmmv, args)

    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(
            torch.cuda.comm.reduce_add(
                wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
Example 13
    def test_cpu_matmul_wrong_format(self, mat1, mat2, expected):
        out = torch.empty_like(expected)
        mat1_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat1))
        mat2_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat2))
        with pytest.raises(ValueError) as exc_info:
            sparse_matmul(mat1_csr, mat2_csr, out)
        assert str(exc_info.value).startswith("B must be CSC matrix")
        mat1_csc = SparseTensor.from_scipy(scipy.sparse.csc_matrix(mat1))
        with pytest.raises(ValueError) as exc_info:
            sparse_matmul(mat1_csc, mat2_csr, out)
        assert str(exc_info.value).startswith("A must be CSR matrix")
Example 14
    def test_empty(self):
        device = 'cpu'
        indexptr = torch.tensor([0, 1, 1, 1, 3, 4], dtype=torch.long, device=device)
        index = torch.tensor([1, 0, 1, 0], dtype=torch.long, device=device)
        value = torch.tensor([2, 1, 3, 4], dtype=torch.float32, device=device)
        arr = SparseTensor(indexptr=indexptr, index=index, data=value, size=(5, 2), sparse_type="csr")

        arr_small = arr.narrow_rows(1, 2)
        sm_coo = arr_small.to_scipy().tocoo()
        self.assertEqual(sm_coo.row.tolist(), [])
        self.assertEqual(sm_coo.col.tolist(), [])
        self.assertEqual(sm_coo.data.tolist(), [])
Example 15
def test_check_sparse():
    smat = scipy.sparse.csr_matrix(
        np.array([[0, 1], [0, 1]]).astype(np.float32))
    st = SparseTensor.from_scipy(smat)

    assert [False, True] == check_sparse(torch.tensor(0), st)
    assert [] == check_sparse()
Example 16
def test_check_same_dtype_equal():
    smat = scipy.sparse.csr_matrix(
        np.array([[0, 1], [0, 1]]).astype(np.float32))
    ts = [
        torch.tensor(0, dtype=torch.float32),
        SparseTensor.from_scipy(smat), None
    ]
    assert check_same_dtype(*ts) is True
Example 17
def fmmv_cpu_sparse(X1: SparseTensor, X2: SparseTensor, v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor], opt: BaseOptions):
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    ntot, dtot = X1.size()
    mtot, T = v.size()

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrowing X1, X2: n + m
    # Prepare - not computable, depends on kernel
    # ker_chunk : n*m
    # finalize : 0 (if can be implemented in place, kernel-dependent)
    n, m = select_dim_over_m(maxM=mtot,
                             maxN=ntot,
                             coef_nm=1,
                             coef_n=1,
                             coef_m=1,
                             tot=avail_mem)

    ker_chunk = create_same_stride((n, m), out, dtype, device='cpu')
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        cur_out = out[i:i + ic, :]
        X1_chunk = X1.narrow_rows(i, ic)
        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow_rows(j, jc)
            cur_ker_chunk = ker_chunk[:ic, :jc]
            cur_ker_chunk.fill_(0.0)

            ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
            kernel._apply_sparse(X1_chunk, X2_chunk.transpose_csc(),
                                 cur_ker_chunk)
            kernel._finalize(cur_ker_chunk, ddd)

            # Multiply by the vector v
            cur_out.addmm_(cur_ker_chunk, v.narrow(0, j, jc))
    return out
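
The double loop tiles the kernel matrix over both inputs and accumulates `out[i] += K(X1_i, X2_j) @ v[j]` one tile at a time, so only an `n x m` kernel block is ever materialised. A dense sketch of the same tiling (again with a linear stand-in kernel, an assumption for illustration only):

import torch

N, D, M, T = 64, 4, 48, 2
X1, X2, v = torch.randn(N, D), torch.randn(M, D), torch.randn(M, T)
reference = (X1 @ X2.T) @ v       # full kernel-vector product, N x T

out = torch.zeros(N, T)
n, m = 16, 20                     # block sizes along N and M
for i in range(0, N, n):
    ic = min(n, N - i)
    for j in range(0, M, m):
        jc = min(m, M - j)
        ker = X1[i:i + ic] @ X2[j:j + jc].T      # ic x jc kernel tile
        out[i:i + ic].addmm_(ker, v[j:j + jc])   # accumulate the tile product
assert torch.allclose(out, reference, rtol=1e-4, atol=1e-4)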
Example 18
def csc_mat() -> SparseTensor:
    indexptr = torch.tensor([0, 2, 3, 6], dtype=torch.long)
    index = torch.tensor([0, 2, 2, 0, 1, 2], dtype=torch.long)
    value = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.float32)
    return SparseTensor(indexptr=indexptr,
                        index=index,
                        data=value,
                        size=(3, 3),
                        sparse_type="csc")
Example 19
def test_check_same_dtype_notequal():
    smat32 = scipy.sparse.csr_matrix(
        np.array([[0, 1], [0, 1]]).astype(np.float32))
    smat64 = scipy.sparse.csr_matrix(
        np.array([[0, 1], [0, 1]]).astype(np.float64))
    ts = [
        torch.tensor(0, dtype=torch.float32),
        torch.tensor(0, dtype=torch.float64),
        SparseTensor.from_scipy(smat32),
    ]
    assert check_same_dtype(*ts) is False

    ts = [
        torch.tensor(0, dtype=torch.float32),
        SparseTensor.from_scipy(smat32),
        SparseTensor.from_scipy(smat64),
    ]
    assert check_same_dtype(*ts) is False
Example 20
    def test_matmul_zeros(self, mat1, mat2, expected, device):
        mat1_zero_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(
                torch.zeros_like(mat1).numpy())).to(device=device)
        mat2_csc = SparseTensor.from_scipy(
            scipy.sparse.csc_matrix(mat2.numpy())).to(device=device)
        out = torch.empty_like(expected).to(device)
        sparse_matmul(mat1_zero_csr, mat2_csc, out)
        assert torch.all(out == 0.0)

        mat1_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(mat1.numpy())).to(device=device)
        mat2_zero_csc = SparseTensor.from_scipy(
            scipy.sparse.csc_matrix(
                torch.zeros_like(mat2).numpy())).to(device=device)
        out = torch.empty_like(expected).to(device=device)
        sparse_matmul(mat1_csr, mat2_zero_csc, out)
        assert torch.all(out == 0.0)
Example 21
def gen_sparse_matrix(a, b, dtype, density=0.1, seed=0) -> SparseTensor:
    out = random_sparse(a,
                        b,
                        density=density,
                        format='csr',
                        dtype=dtype,
                        seed=seed)

    return SparseTensor.from_scipy(out)
Example 22
    def select(self,
               X: _tensor_type,
               Y: Union[torch.Tensor, None],
               M: int) -> Union[_tensor_type, Tuple[_tensor_type, torch.Tensor]]:
        """Select M observations from 2D tensor `X`, preserving device and memory order.

        The selection strategy is uniformly at random. To control the randomness,
        pass an appropriate numpy random generator to this class's constructor.

        Parameters
        ----------
        X
            N x D tensor containing the whole input dataset. We have that M <= N.
        Y
            Optional N x T tensor containing the input targets. If `Y` is provided,
            the same observations selected for `X` will also be selected from `Y`.
            Certain models (such as :class:`falkon.models.LogisticFalkon`) require centers to be
            extracted from both predictors and targets, while others (such as
            :class:`falkon.models.Falkon`) only require the centers from the predictors.
        M
            The number of observations to choose. M <= N, otherwise M is forcibly set to N
            with a warning.

        Returns
        -------
        X_M
            The randomly selected centers. They will be in a new, memory-contiguous tensor.
            All characteristics of the input tensor will be preserved.
        (X_M, Y_M)
            If `Y` is not `None`, the entries of `Y` corresponding to the
            selected centers of `X` will also be returned.
        """
        N = X.shape[0]
        if M > N:
            warnings.warn("Number of centers M greater than the "
                          "number of data-points. Setting M to %d" % (N))
            M = N
        idx = self.random_gen.choice(N, size=M, replace=False)

        if isinstance(X, SparseTensor):
            X = X.to_scipy()
            centers = X[idx, :].copy()
            Xc = SparseTensor.from_scipy(centers)
        else:
            Xc = create_same_stride((M, X.shape[1]), other=X, dtype=X.dtype, device=X.device,
                                    pin_memory=False)
            th_idx = torch.from_numpy(idx.astype(np.long)).to(X.device)
            torch.index_select(X, dim=0, index=th_idx, out=Xc)

        if Y is not None:
            Yc = create_same_stride((M, Y.shape[1]), other=Y, dtype=Y.dtype, device=Y.device,
                                    pin_memory=False)
            th_idx = torch.from_numpy(idx.astype(np.long)).to(Y.device)
            torch.index_select(Y, dim=0, index=th_idx, out=Yc)
            return Xc, Yc
        return Xc
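
A self-contained sketch of the dense branch above: draw `M` distinct row indices and gather the corresponding rows from `X` and `Y`. The data below is synthetic, and `np.random.default_rng` stands in for the generator passed to the class constructor (an assumption for illustration):

import numpy as np
import torch

X = torch.randn(1000, 10)   # N x D predictors
Y = torch.randn(1000, 1)    # N x T targets
M = 100

rng = np.random.default_rng(0)            # plays the role of self.random_gen
idx = rng.choice(X.shape[0], size=M, replace=False)
th_idx = torch.from_numpy(idx.astype(np.int64))

Xc = torch.index_select(X, dim=0, index=th_idx)   # M x D selected centers
Yc = torch.index_select(Y, dim=0, index=th_idx)   # matching targets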
Example 23
def csr_mat() -> SparseTensor:
    """
     -  2
     1  3
     4  -
    """
    indexptr = torch.tensor([0, 1, 3, 4], dtype=torch.long)
    index = torch.tensor([1, 0, 1, 0], dtype=torch.long)
    value = torch.tensor([2, 1, 3, 4], dtype=torch.float32)
    return SparseTensor(indexptr=indexptr,
                        index=index,
                        data=value,
                        size=(3, 2),
                        sparse_type="csr")
Example 24
def sparse_matmul(A: SparseTensor, B: SparseTensor,
                  out: torch.Tensor) -> torch.Tensor:
    """Sparse*Sparse matrix multiplication. Output will be copied into dense `out` matrix.

    This function can be applied to CPU or CUDA tensors (but all tensors must
    be consistently on the same device). Note that the CUDA matrix multiplication
    expects both operands in CSR format, while the CPU version requires `A` in
    CSR and `B` in CSC format.

    Parameters
    ----------
    A : SparseTensor
        N x D, sparse matrix.
    B : SparseTensor
        D x M, sparse matrix.
    out : torch.Tensor
        Dense N x M tensor, it will hold the output of the multiplication.

    Returns
    -------
    out : torch.Tensor
        The same tensor as the input `out` parameter.

    """
    if A.nnz() == 0 or B.nnz() == 0:
        out.fill_(0.0)
        return out

    if A.is_cuda:
        return _sparse_matmul_cuda(A, B, out)
    else:
        return _sparse_matmul_cpu(A, B, out)
Example 25
    def select(
            self, X: _tensor_type, Y: Union[torch.Tensor, None],
            M: int) -> Union[_tensor_type, Tuple[_tensor_type, torch.Tensor]]:
        """Select M rows from 2D array `X`, preserving the memory order of `X`.
        """
        N = X.size(0)
        if M > N:
            warnings.warn("Number of centers M greater than the "
                          "number of data-points. Setting M to %d" % (N))
            M = N
        idx = self.random_gen.choice(N, size=M, replace=False)

        if isinstance(X, SparseTensor):
            X = X.to_scipy()
            centers = X[idx, :].copy()
            Xc = SparseTensor.from_scipy(centers)
        else:
            Xnp = X.numpy()  # work on np array
            if is_f_contig(X):
                order = 'F'
            else:
                order = 'C'
            Xc_np = np.empty((M, Xnp.shape[1]), dtype=Xnp.dtype, order=order)
            Xc = torch.from_numpy(
                np.take(Xnp, idx, axis=0, out=Xc_np, mode='wrap'))

        if Y is not None:
            Ynp = Y.numpy()  # work on np array
            if is_f_contig(Y):
                order = 'F'
            else:
                order = 'C'
            Yc_np = np.empty((M, Ynp.shape[1]), dtype=Ynp.dtype, order=order)
            Yc = torch.from_numpy(
                np.take(Ynp, idx, axis=0, out=Yc_np, mode='wrap'))
            return Xc, Yc
        return Xc
Example 26
def _sparse_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.size(0)

    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2_chunk : dtot + 2 * D * mtot * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (assume density=1 here)
    # ker_gpu  : mtot * ntot
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density,
                                 coef_m=2 * dtot * X2.density,
                                 rest=dtot,
                                 max_mem=avail_mem)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m),
                                         out,
                                         gpu_dtype,
                                         'cpu',
                                         pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)

            X2_chunk = X2.narrow_rows(j, jc).to(dtype=gpu_dtype)
            X2_chunk_d = SparseTensor.from_scipy(
                X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                .index_to_int() \
                .to(device=tc_device)
            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                X1_chunk = X1.narrow_rows(i, ic).to(dtype=gpu_dtype)
                X1_chunk_d = X1_chunk.index_to_int().to(device=tc_device)
                cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                cur_g_out.fill_(0.0)

                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                cur_g_out = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                 cur_g_out)
                cur_g_out = kernel._finalize(cur_g_out, ddd)
                copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                     cpu_buf)
                del ddd, X1_chunk_d, X1_chunk
            del X2_chunk, X2_chunk_d
        del g_out
    return out
Example 27
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because it is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density + T,
                                 coef_m=2 * dtot * X2.density + T,
                                 rest=dtot,
                                 max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = mtot * T + n * T + n * m
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        v_gpu = extract_same_stride(flat_gpu_tn,
                                    size=(mtot, T),
                                    other=v,
                                    offset=flat_offset)
        flat_offset += np.prod(v_gpu.shape)
        copy_to_device_noorder(mtot, T, v, 0, 0, v_gpu, 0, 0)
        mmv_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(n, T),
                                      other=out,
                                      offset=flat_offset)
        flat_offset += np.prod(mmv_gpu.shape)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = extract_fortran(flat_gpu_tn, size=(n, m), offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
Example 28
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because it is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot,
        maxN=ntot,
        tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
Example 29
def sparse_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2       : dtot + 2 * D * M * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu  : M * ntot
    # w_gpu    : ntot * T
    # v_gpu    : M * T
    # out_gpu  : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
Example 30
    def mkl_export_sparse(self,
                          mkl_mat: sparse_matrix_t,
                          dtype: torch.dtype,
                          output_type: str = "csr") -> SparseTensor:
        """Create a :class:`SparseTensor` from a MKL sparse matrix holder.

        Note that not all possible MKL sparse matrices are supported (for example if 1-based
        indexing is used, or for non floating-point types), but those created with
        :meth:`mkl_create_sparse_from_scipy` and :meth:`mkl_create_sparse` are.

        Parameters
        -----------
        mkl_mat
            The MKL sparse matrix holder
        dtype
            The data-type of the matrix. This must match the data-type of the data stored in
            the MKL matrix (no type conversion is performed), otherwise garbage data or memory
            corruption could occur.
        output_type
            Whether the matrix should be interpreted as CSR (pass ``"csr"``) or CSC
            (pass ``"csc"``). This should match the MKL matrix, otherwise a transposed output
            may be produced.

        Returns
        --------
        The :class:`SparseTensor` object, sharing the same data arrays as the MKL matrix.

        Notes
        ------
        Depending on the integer type of the linked MKL version, the indices of the matrix may
        be copied. In any case the output tensor will use :class:`torch.int64` indices.
        """
        indptrb = ctypes.POINTER(self.MKL_INT)()
        indptren = ctypes.POINTER(self.MKL_INT)()
        indices = ctypes.POINTER(self.MKL_INT)()

        ordering = ctypes.c_int()
        nrows = self.MKL_INT()
        ncols = self.MKL_INT()

        if output_type.lower() == "csr":
            if dtype == torch.float64:
                fn = self.libmkl.mkl_sparse_d_export_csr
                ctype = ctypes.c_double
            elif dtype == torch.float32:
                fn = self.libmkl.mkl_sparse_s_export_csr
                ctype = ctypes.c_float
            else:
                raise TypeError("Data type %s not valid to export" % (dtype))
        elif output_type.lower() == "csc":
            if dtype == torch.float64:
                fn = self.libmkl.mkl_sparse_d_export_csc
                ctype = ctypes.c_double
            elif dtype == torch.float32:
                fn = self.libmkl.mkl_sparse_s_export_csc
                ctype = ctypes.c_float
            else:
                raise TypeError("Data type %s not valid to export" % (dtype))
        else:
            raise ValueError("Output type %s not valid" % (output_type))

        data_ptr = ctypes.POINTER(ctype)()

        ret_val = fn(mkl_mat, ctypes.byref(ordering), ctypes.byref(nrows),
                     ctypes.byref(ncols), ctypes.byref(indptrb),
                     ctypes.byref(indptren), ctypes.byref(indices),
                     ctypes.byref(data_ptr))
        Mkl.mkl_check_return_val(ret_val, fn)

        if ordering.value != 0:
            raise ValueError("1-based indexing (F-style) is not supported")
        ncols = ncols.value
        nrows = nrows.value

        # Get the index dimension
        index_dim = nrows if output_type == "csr" else ncols
        # Construct a numpy array and add 0 to first position for scipy.sparse's 3-array indexing
        indptrb = as_array(indptrb, shape=(index_dim, ))
        indptren = as_array(indptren, shape=(index_dim, ))

        indptren = np.insert(indptren, 0, indptrb[0])
        nnz = indptren[-1] - indptrb[0]

        # Construct numpy arrays from the data pointer and from the indices pointer
        data = np.array(as_array(data_ptr, shape=(nnz, )), copy=True)
        indices = np.array(as_array(indices, shape=(nnz, )), copy=True)

        return SparseTensor(indexptr=torch.from_numpy(indptren).to(torch.long),
                            index=torch.from_numpy(indices).to(torch.long),
                            data=torch.from_numpy(data),
                            size=(nrows, ncols),
                            sparse_type=output_type.lower())