Code Example #1
File: sdot.py Project: npinto/python-cuda
def gpu_sdot(a, b):
    """Single-precision dot product on the GPU via CUBLAS.

    Expects two numpy arrays of equal size with a.shape[0] == b.shape[1];
    the dot product runs over a.shape[1] elements of each array.
    """
    assert a.size == b.size
    assert a.shape[0] == b.shape[1]
    # initialize CUBLAS
    cublas.cublasInit()
    cublas.cublasFree(0)
    # copy the host arrays into linear device memory
    d_X = Linear(a.shape).from_numpy(a)
    d_Y = Linear(b.shape).from_numpy(b)
    # single-precision dot product over a.shape[1] elements, stride 1
    gpu_result = cublas.cublasSdot(a.shape[1], d_X.ref, 1, d_Y.ref, 1)
    cuda.cudaThreadSynchronize()
    # release CUBLAS resources
    cublas.cublasShutdown()
    return gpu_result
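For reference, a minimal usage sketch (not part of the original file), assuming the python-cuda package provides the cublas, cuda, and Linear names used above and that the inputs are float32 arrays; the GPU result is checked against numpy.dot on the host:

import numpy as np

# hypothetical driver: a 1xN row vector and an Nx1 column vector,
# so that a.shape[0] == b.shape[1] holds, as gpu_sdot asserts
n = 1024
a = np.random.rand(1, n).astype(np.float32)
b = np.random.rand(n, 1).astype(np.float32)

gpu = gpu_sdot(a, b)
cpu = np.dot(a, b)[0, 0]
print(gpu, cpu, np.allclose(gpu, cpu, atol=1e-3))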
Code Example #2
File: sgemm.py Project: npinto/python-cuda
def gpu_sgemm(a, b, alpha=1):
    """Single-precision matrix multiplication (SGEMM) on the GPU.

    Expects two two-dimensional numpy arrays such that
    a.shape[1] == b.shape[0]. Optionally specify alpha to scale
    the product.
    """
    # initialize CUBLAS
    cublasInit()

    assert a.shape[1] == b.shape[0]

    c_shape = (a.shape[0], b.shape[1])
    # allocate device matrices from the host arrays
    # (column-major / Fortran order, as CUBLAS expects)
    dA = Linear(a.shape, order='F').from_numpy(a)
    dB = Linear(b.shape, order='F').from_numpy(b)
    dC = Linear(c_shape, order='F')

    # transpose a/b? 't' = yes, 'n' = no
    transa = 'n'
    transb = 'n'

    # compute C = alpha * A * B + beta * C with beta = 0; the leading
    # dimensions are a.shape[0], b.shape[0], and a.shape[0] respectively
    cublasSgemm(transa, transb, a.shape[0], b.shape[1], a.shape[1], alpha,
                dA.ref, a.shape[0], dB.ref, b.shape[0], 0, dC.ref, a.shape[0])
    cudaThreadSynchronize()
    # shut down CUBLAS and copy the result back to the host
    cublasShutdown()
    return dC.to_numpy()
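Again a minimal usage sketch, not from the original project, assuming float32 inputs and that the python-cuda names used in gpu_sgemm (cublasInit, cublasSgemm, Linear, and so on) have been imported; the GPU product is compared against numpy.dot on the host:

import numpy as np

# hypothetical driver: a is MxK and b is KxN, so a.shape[1] == b.shape[0]
M, K, N = 128, 256, 64
a = np.random.rand(M, K).astype(np.float32)
b = np.random.rand(K, N).astype(np.float32)

c_gpu = gpu_sgemm(a, b, alpha=1)
c_cpu = np.dot(a, b)
print(c_gpu.shape, np.allclose(c_gpu, c_cpu, atol=1e-3))

The order='F' allocations and the leading dimensions passed to cublasSgemm follow the Fortran BLAS convention that CUBLAS uses: matrices are stored column-major, so the leading dimension of each operand is its number of rows.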