Example #1
0
def test_cusparseScsrmm2_notranspose():
    A = np.random.laplace(size=(5, 3)).astype(np.float32)
    A[A<0.1] = 0
    A = sparse.csr_matrix(A, dtype=np.float32)
    B = np.random.normal(size=(3, 6)).astype(np.float32, order="f")
    C = np.ones((A.shape[0], B.shape[1]), dtype=np.float32)

    X_exp = (A*B) + 0.5*C
    a_data = gpu.to_gpu(A.data)
    a_indptr = gpu.to_gpu(A.indptr)
    a_indices = gpu.to_gpu(A.indices)
    b = gpu.to_gpu(B)

    h = cusparse.cusparseCreate()
    descrA = cusparse.cusparseCreateMatDescr()

    c = gpu.empty((C.shape[1], C.shape[0]), dtype=A.dtype)
    c.fill(1.0)

    cusparse.cusparseScsrmm2(h, cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE,
        cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE,
        c.shape[1], c.shape[0], b.shape[0], A.nnz, 1.0,
        descrA, a_data.gpudata, a_indptr.gpudata, a_indices.gpudata,
        b.gpudata, b.shape[0], 0.5, c.gpudata, c.shape[1])
    assert_allclose(c.get().T, X_exp, rtol=1e-4)
Example #2
0
def csrmm2(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    ''' Calculates C += alpha * A*B + beta*C.
        Where A is sparse and both A and B can be transposed.
    '''

    if transA:
        ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n, l = A_gpu.shape[1], A_gpu.shape[0]
    else:
        ta = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n, l = A_gpu.shape

    if (B_gpu.flags.c_contiguous and transB) or (B_gpu.flags.f_contiguous and not transB):
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE

    k, m = (B_gpu.shape[1], B_gpu.shape[0]) if transB else B_gpu.shape

    assert (l == k) and (n, m) == C_gpu.shape

    ldb = B_gpu.shape[1] if B_gpu.flags.c_contiguous else B_gpu.shape[0]
    ldc = C_gpu.shape[0]

    # if C-major, save result into a temp array and transpose afterwards
    if C_gpu.flags.c_contiguous:
        out = __cuda_get_temp_matrix(C_gpu.shape, C_gpu.dtype)
        if beta != 0.0:
           memcpy_dtod(out.gpudata, C_gpu.gpudata, C_gpu.nbytes)
    else:
        out = C_gpu

    cusparse.cusparseScsrmm2(cusparse_handle, ta, tb,
        n, m, k, A_gpu.nnz, alpha,
        A_gpu.descr, A_gpu.data.gpudata, A_gpu.indptr.gpudata, A_gpu.indices.gpudata,
        B_gpu.gpudata, ldb, beta, out.gpudata, ldc)

    if C_gpu.flags.c_contiguous:
        cublas.cublasSgeam(cublas_handle, 1, 1, m, n,
                           1.0, out.gpudata, C_gpu.shape[0],
                           0.0, 0, C_gpu.shape[0], C_gpu.gpudata, C_gpu.shape[1])
    return C_gpu
Example #3
0
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    ''' Calculates C += alpha * A*B + beta*C.
        Where B is sparse and both A and B can be transposed.

        Note: cuSPARSE only allows for sparse A, so we need some tricks:
            Essentially, we will compute C^T = B^T * A^T
            By enforcing C to be row-major, can drop its transpose
            since cuSPARSE assumes column-major. Thus, we only need to
            compute
            C = op(B)^T * op(A)^T
    '''
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape
    ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE if not transB else cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE

    if transA:
        if ta:  # we can't have ta and tb true at the same time according to cuSPARSE docs
            out = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
            cublas.cublasSgeam(cublas_handle, 1, 1, A_gpu.shape[0], A_gpu.shape[1], 1.0, A_gpu.gpudata, A_gpu.shape[1],
                               0.0, A_gpu.gpudata, A_gpu.shape[1], out.gpudata, A_gpu.shape[0])
            out.shape = A_gpu.shape[1], A_gpu.shape[0]
            out.strides = gpuarray._c_contiguous_strides(out.dtype.itemsize, out.shape)
            A_gpu = out
            tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
            n = A_gpu.shape[0]
        else:
            tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE
            n = A_gpu.shape[1]
    else:
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]

    ldb = A_gpu.shape[1]
    ldc = C_gpu.shape[1]

    cusparse.cusparseScsrmm2(cusparse_handle, ta, tb,
        m, n, k, B_gpu.nnz, alpha,
        B_gpu.descr, B_gpu.data.gpudata, B_gpu.indptr.gpudata, B_gpu.indices.gpudata,
        A_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)
    return C_gpu
Example #4
0
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """ Calculates C += alpha * A*B + beta*C.
        Where B is sparse and both A and B can be transposed.

        Note: cuSPARSE only allows for sparse A, so we need some tricks:
            Essentially, we will compute C^T = B^T * A^T
            By enforcing C to be row-major, can drop its transpose
            since cuSPARSE assumes column-major. Thus, we only need to
            compute
            C = op(B)^T * op(A)^T
    """
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape
    ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE if not transB else cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE

    if transA:
        if ta:  # we can't have ta and tb true at the same time according to cuSPARSE docs
            out = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
            cublas.cublasSgeam(
                cublas_handle,
                1,
                1,
                A_gpu.shape[0],
                A_gpu.shape[1],
                1.0,
                A_gpu.gpudata,
                A_gpu.shape[1],
                0.0,
                A_gpu.gpudata,
                A_gpu.shape[1],
                out.gpudata,
                A_gpu.shape[0],
            )
            out.shape = A_gpu.shape[1], A_gpu.shape[0]
            out.strides = gpuarray._c_contiguous_strides(out.dtype.itemsize, out.shape)
            A_gpu = out
            tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
            n = A_gpu.shape[0]
        else:
            tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE
            n = A_gpu.shape[1]
    else:
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]

    ldb = A_gpu.shape[1]
    ldc = C_gpu.shape[1]

    cusparse.cusparseScsrmm2(
        cusparse_handle,
        ta,
        tb,
        m,
        n,
        k,
        B_gpu.nnz,
        alpha,
        B_gpu.descr,
        B_gpu.data.gpudata,
        B_gpu.indptr.gpudata,
        B_gpu.indices.gpudata,
        A_gpu.gpudata,
        ldb,
        beta,
        C_gpu.gpudata,
        ldc,
    )
    return C_gpu
Example #5
0
def csrmm2(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """ Calculates C += alpha * A*B + beta*C.
        Where A is sparse and both A and B can be transposed.
    """

    if transA:
        ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n, l = A_gpu.shape[1], A_gpu.shape[0]
    else:
        ta = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n, l = A_gpu.shape

    if (B_gpu.flags.c_contiguous and transB) or (B_gpu.flags.f_contiguous and not transB):
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE

    k, m = (B_gpu.shape[1], B_gpu.shape[0]) if transB else B_gpu.shape

    assert (l == k) and (n, m) == C_gpu.shape

    ldb = B_gpu.shape[1] if B_gpu.flags.c_contiguous else B_gpu.shape[0]
    ldc = C_gpu.shape[0]

    # if C-major, save result into a temp array and transpose afterwards
    if C_gpu.flags.c_contiguous:
        out = __cuda_get_temp_matrix(C_gpu.shape, C_gpu.dtype)
        if beta != 0.0:
            memcpy_dtod(out.gpudata, C_gpu.gpudata, C_gpu.nbytes)
    else:
        out = C_gpu

    cusparse.cusparseScsrmm2(
        cusparse_handle,
        ta,
        tb,
        n,
        m,
        k,
        A_gpu.nnz,
        alpha,
        A_gpu.descr,
        A_gpu.data.gpudata,
        A_gpu.indptr.gpudata,
        A_gpu.indices.gpudata,
        B_gpu.gpudata,
        ldb,
        beta,
        out.gpudata,
        ldc,
    )

    if C_gpu.flags.c_contiguous:
        cublas.cublasSgeam(
            cublas_handle,
            1,
            1,
            m,
            n,
            1.0,
            out.gpudata,
            C_gpu.shape[0],
            0.0,
            0,
            C_gpu.shape[0],
            C_gpu.gpudata,
            C_gpu.shape[1],
        )
    return C_gpu