Example #1
def cublasXtDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
                  beta, C, ldc):
    status = _libcublas.cublasXtDgemm(handle, transa, transb, m, n, k,
                                      ctypes.byref(ctypes.c_double(alpha)),
                                      int(A), lda, int(B), ldb,
                                      ctypes.byref(ctypes.c_double(beta)),
                                      int(C), ldc)
    cublasCheckStatus(status)
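These wrappers follow scikit-cuda's `skcuda.cublasxt` pattern: raw pointers go straight to the shared library and only the returned status is checked. As a hypothetical sketch of a call (it assumes `handle` was obtained via `cublasXtCreate` and devices were selected as in the later examples, and that `transa`/`transb` take the integer `cublasOperation_t` values, with 0 meaning no transpose), two host-resident NumPy arrays can be multiplied directly, since cuBLAS-XT accepts host pointers:

import numpy as np

m, n, k = 256, 128, 64
a = np.random.rand(m, k)     # row-major m x k
b = np.random.rand(k, n)     # row-major k x n
c = np.zeros((m, n))         # row-major m x n result

# cuBLAS is column-major, so compute C^T = B^T * A^T by swapping the operands;
# a row-major buffer read column-major is already the transpose.
cublasXtDgemm(handle, 0, 0, n, m, k,
              1.0, b.ctypes.data, n,
              a.ctypes.data, k,
              0.0, c.ctypes.data, n)

assert np.allclose(c, a.dot(b))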
Example #2
def cublasXtZgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta,
                  C, ldc):
    status = _libcublas.cublasXtZgemm(
        handle, transa, transb, m, n, k,
        ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A),
        lda, int(B), ldb,
        ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(C), ldc)
    cublasCheckStatus(status)
Example #3
def cublasXtZgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
                  beta, C, ldc):
    status = _libcublas.cublasXtZgemm(handle, transa, transb, m, n, k,
                                      ctypes.byref(cuda.cuDoubleComplex(alpha.real,
                                                                        alpha.imag)),
                                      int(A), lda, int(B), ldb,
                                      ctypes.byref(cuda.cuDoubleComplex(beta.real,
                                                                        beta.imag)),
                                      int(C), ldc)
    cublasCheckStatus(status)
Example #4
def cublasXtSetCpuRoutine(handle, blasOp, type, blasFunctor):
    status = _libcublas.cublasXtSetCpuRoutine(handle, blasOp, type, blasFunctor)
    cublasCheckStatus(status)
Example #5
def cublasXtGetBlockDim(handle):
    blockDim = ctypes.c_int()
    status = _libcublas.cublasXtGetBlockDim(handle, ctypes.byref(blockDim))
    cublasCheckStatus(status)
    return blockDim.value
Example #6
def cublasXtSetBlockDim(handle, blockDim):
    status = _libcublas.cublasXtSetBlockDim(handle, blockDim)
    cublasCheckStatus(status)
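A minimal round trip through the two block-dimension wrappers, assuming a valid `handle` from `cublasXtCreate` (the value 2048 is arbitrary):

# blockDim is the tile size cuBLAS-XT uses when splitting matrices across GPUs.
cublasXtSetBlockDim(handle, 2048)
assert cublasXtGetBlockDim(handle) == 2048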
Example #7
def cublasXtDeviceSelect(handle, nbDevices, deviceId):
    status = _libcublas.cublasXtDeviceSelect(handle, nbDevices, deviceId)
    cublasCheckStatus(status)
Example #8
def cublasXtMaxBoards():
    nbGpuBoards = ctypes.c_int()
    status = _libcublas.cublasXtMaxBoards(ctypes.byref(nbGpuBoards))
    cublasCheckStatus(status)
    return nbGpuBoards.value
Example #9
def cublasXtGetNumBoards(handle, deviceId):
    nbBoards = ctypes.c_int()
    status = _libcublas.cublasXtGetNumBoards(handle, deviceId, ctypes.byref(nbBoards))
    cublasCheckStatus(status)
    return nbBoards.value
Example #10
def cublasXtDestroy(handle):
    status = _libcublas.cublasXtDestroy(handle)
    cublasCheckStatus(status)
Example #11
def cublasXtCreate():
    handle = ctypes.c_void_p()
    status = _libcublas.cublasXtCreate(ctypes.byref(handle))
    cublasCheckStatus(status)
    return handle.value
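A minimal lifecycle sketch combining the handle and device-selection wrappers above (device 0 is an assumption; `cublasXtDeviceSelect` expects a C array of ints, hence the ctypes cast):

import ctypes

handle = cublasXtCreate()
try:
    # Route subsequent cublasXt* calls to GPU 0 only.
    cublasXtDeviceSelect(handle, 1, (ctypes.c_int * 1)(0))
    # ... cublasXtDgemm / cublasXtZgemm calls go here ...
finally:
    cublasXtDestroy(handle)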
Example #12
def cublasXtSetCpuRatio(handle, blasOp, type, ratio):
    status = _libcublas.cublasXtSetCpuRatio(handle, blasOp, type, ratio)
    cublasCheckStatus(status)
Example #13
def dot(x_gpu, y_gpu, transa='N', transb='N'):
    """
    Dot product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.
    y_gpu : pycuda.gpuarray.GPUArray
        Input array.
    transa : char
        If 'T', use the transpose of `x_gpu`; if 'C', its conjugate
        (Hermitian) transpose; if 'N' (default), use `x_gpu` as is.
    transb : char
        If 'T', use the transpose of `y_gpu`; if 'C', its conjugate
        (Hermitian) transpose; if 'N' (default), use `y_gpu` as is.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray, float{32,64}, or complex{64,128}
        Dot product of `x_gpu` and `y_gpu`. When the inputs are 1D
        arrays, the result is returned as a scalar rather than a GPUArray.
    
    Notes
    -----
    The input matrices must all contain elements of the same data type.
    
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> import misc
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True
    
    """

    if len(x_gpu.shape) == 1 and len(y_gpu.shape) == 1:

        if x_gpu.size != y_gpu.size:
            raise ValueError('arrays must be of same length')

        # Compute inner product for 1D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCdotu
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSdot
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZdotu
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        result = cublas_func(x_gpu.size, int(x_gpu.gpudata), 1,
                             int(y_gpu.gpudata), 1)

        if x_gpu.dtype == np.complex64:
            return np.float32(result.x) + 1j * np.float32(result.y)
        elif x_gpu.dtype == np.complex128:
            return np.float64(result.x) + 1j * np.float64(result.y)
        elif x_gpu.dtype == np.float32:
            return np.float32(result)
        else:
            return np.float64(result)
    else:

        # Perform matrix multiplication for 2D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCgemm
            alpha = cuda.cuFloatComplex(1, 0)
            beta = cuda.cuFloatComplex(0, 0)
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZgemm
            alpha = cuda.cuDoubleComplex(1, 0)
            beta = cuda.cuDoubleComplex(0, 0)
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = transa.lower()
        transb = transb.lower()

        if transb in ['t', 'c']:
            m, k = y_gpu.shape
        elif transb in ['n']:
            k, m = y_gpu.shape
        else:
            raise ValueError('invalid value for transb')

        if transa in ['t', 'c']:
            l, n = x_gpu.shape
        elif transa in ['n']:
            n, l = x_gpu.shape
        else:
            raise ValueError('invalid value for transa')

        if l != k:
            raise ValueError('objects are not aligned')

        if transb == 'n':
            lda = max(1, m)
        else:
            lda = max(1, k)

        if transa == 'n':
            ldb = max(1, k)
        else:
            ldb = max(1, n)

        ldc = max(1, m)

        # Note that the desired shape of the output matrix is the transpose
        # of what CUBLAS assumes:
        c_gpu = gpuarray.empty((n, ldc), x_gpu.dtype)
        cublas_func(transb, transa, m, n, k, alpha, int(y_gpu.gpudata), lda,
                    int(x_gpu.gpudata), ldb, beta, int(c_gpu.gpudata), ldc)

        status = cublas.cublasGetError()
        cublas.cublasCheckStatus(status)

        return c_gpu
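The operand swap and the "transposed" output shape in the code above rest on the identity (A.B)^T = B^T.A^T: a row-major array handed to column-major cuBLAS is read as its transpose, so asking cuBLAS for B^T.A^T leaves A.B laid out row-major in `c_gpu`. A small host-only NumPy check of that identity:

import numpy as np

a = np.random.rand(4, 3)
b = np.random.rand(3, 5)

# Reading a row-major buffer as column-major transposes it, so cuBLAS sees
# a.T and b.T; requesting b.T @ a.T therefore yields (a @ b).T, which read
# back row-major is exactly a @ b.
assert np.allclose((b.T @ a.T).T, a @ b)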
Example #14
def dot(x_gpu, y_gpu, transa='N', transb='N'):
    """
    Dot product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.
    y_gpu : pycuda.gpuarray.GPUArray
        Input array.
    transa : char
        If 'T', use the transpose of `x_gpu`; if 'C', its conjugate
        (Hermitian) transpose; if 'N' (default), use `x_gpu` as is.
    transb : char
        If 'T', use the transpose of `y_gpu`; if 'C', its conjugate
        (Hermitian) transpose; if 'N' (default), use `y_gpu` as is.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray, float{32,64}, or complex{64,128}
        Dot product of `x_gpu` and `y_gpu`. When the inputs are 1D
        arrays, the result is returned as a scalar rather than a GPUArray.
    
    Notes
    -----
    The input matrices must all contain elements of the same data type.
    
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> import misc
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True
    
    """

    if len(x_gpu.shape) == 1 and len(y_gpu.shape) == 1:

        if x_gpu.size != y_gpu.size:
            raise ValueError('arrays must be of same length')
        
        # Compute inner product for 1D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCdotu
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSdot
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZdotu
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        result = cublas_func(x_gpu.size, int(x_gpu.gpudata), 1,
                             int(y_gpu.gpudata), 1)

        if x_gpu.dtype == np.complex64:
            return np.float32(result.x)+1j*np.float32(result.y)
        elif x_gpu.dtype == np.complex128:
            return np.float64(result.x)+1j*np.float64(result.y)
        elif x_gpu.dtype == np.float32:
            return np.float32(result)
        else:
            return np.float64(result)
    else:

        # Perform matrix multiplication for 2D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCgemm        
            alpha = cuda.cuFloatComplex(1, 0)
            beta = cuda.cuFloatComplex(0, 0)
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZgemm        
            alpha = cuda.cuDoubleComplex(1, 0)
            beta = cuda.cuDoubleComplex(0, 0)
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = transa.lower()
        transb = transb.lower()

        if transb in ['t', 'c']:
            m, k = y_gpu.shape
        elif transb in ['n']:
            k, m = y_gpu.shape
        else:
            raise ValueError('invalid value for transb')

        if transa in ['t', 'c']:
            l, n = x_gpu.shape
        elif transa in ['n']:
            n, l = x_gpu.shape
        else:
            raise ValueError('invalid value for transa')

        if l != k:
            raise ValueError('objects are not aligned')
        
        if transb == 'n':
            lda = max(1, m)
        else:
            lda = max(1, k)
            
        if transa == 'n':
            ldb = max(1, k)
        else:
            ldb = max(1, n)

        ldc = max(1, m)

        # Note that the desired shape of the output matrix is the transpose
        # of what CUBLAS assumes:
        c_gpu = gpuarray.empty((n, ldc), x_gpu.dtype)
        cublas_func(transb, transa, m, n, k, alpha, int(y_gpu.gpudata),
                    lda, int(x_gpu.gpudata), ldb, beta, int(c_gpu.gpudata), ldc)

        status = cublas.cublasGetError()
        cublas.cublasCheckStatus(status)

        return c_gpu
Example #15
def dot(a_gpu, b_gpu):
    """
    Matrix product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input array.
    b_gpu : pycuda.gpuarray.GPUArray
        Input array.
        
    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray
        Dot product of `a_gpu` and `b_gpu`.
    
    Notes
    -----
    The input matrices must all contain elements of the same data type.
    
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True
    >>> p = np.asarray(np.random.rand(4, 2), np.complex64)
    >>> q = np.asarray(np.random.rand(2, 2), np.complex64)
    >>> p_gpu = gpuarray.to_gpu(p)
    >>> q_gpu = gpuarray.to_gpu(q)
    >>> r_gpu = linalg.dot(p_gpu, q_gpu)
    >>> np.allclose(np.dot(p, q), r_gpu.get())
    True
    >>> s = np.asarray(np.random.rand(5), np.complex128)
    >>> t = np.asarray(np.random.rand(5), np.complex128)
    >>> s_gpu = gpuarray.to_gpu(s)
    >>> t_gpu = gpuarray.to_gpu(t)
    >>> u = linalg.dot(s_gpu, t_gpu)
    >>> np.allclose(np.dot(s, t), u)
    True
    
    """

    if len(a_gpu.shape) == 1 and len(b_gpu.shape) == 1:

        # Compute inner product for 1D arrays:
        if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCdotu
        elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSdot
        elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZdotu
        elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        result = cublas_func(a_gpu.size, int(a_gpu.gpudata), 1,
                             int(b_gpu.gpudata), 1)

        if a_gpu.dtype == np.complex64:
            return np.float32(result.x)+1j*np.float32(result.y)
        elif a_gpu.dtype == np.complex128:
            return np.float64(result.x)+1j*np.float64(result.y)
        elif a_gpu.dtype == np.float32:
            return np.float32(result)
        else:
            return np.float64(result)
    else:

        # Perform matrix multiplication for 2D arrays:
        if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCgemm        
            alpha = cuda.cuFloatComplex(1, 0)
            beta = cuda.cuFloatComplex(0, 0)
        elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZgemm        
            alpha = cuda.cuDoubleComplex(1, 0)
            beta = cuda.cuDoubleComplex(0, 0)
        elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = 'N'
        transb = 'N'
        m = b_gpu.shape[1]
        n = a_gpu.shape[0]
        k = b_gpu.shape[0]
        lda = m
        ldb = k
        ldc = max(1, m)

        c_gpu = gpuarray.empty((a_gpu.shape[0], b_gpu.shape[1]), a_gpu.dtype)
        cublas_func(transb, transa, m, n, k, alpha, int(b_gpu.gpudata),
                    lda, int(a_gpu.gpudata), ldb, beta, int(c_gpu.gpudata), ldc)

        status = cublas.cublasGetError()
        cublas.cublasCheckStatus(status)

        return c_gpu