def test_cublasSgemmBatched(self):        
        l, m, k, n = 11, 7, 5, 3
        A = np.random.rand(l, m, k).astype(np.float32)
        B = np.random.rand(l, k, n).astype(np.float32)
        
        C_res = np.einsum('nij,njk->nik', A, B)
        
        a_gpu = gpuarray.to_gpu(A)
        b_gpu = gpuarray.to_gpu(B)
        c_gpu = gpuarray.empty((l, m, n), np.float32)
        
        alpha = np.float32(1.0)
        beta = np.float32(0.0)

        a_arr = bptrs(a_gpu)
        b_arr = bptrs(b_gpu)
        c_arr = bptrs(c_gpu)

        # NumPy arrays are row-major while cuBLAS is column-major, so the operands
        # and the m/n dimensions are swapped: the column-major product B^T * A^T
        # equals (A * B)^T, which reads back as A * B in row-major order.
        cublas.cublasSgemmBatched(self.cublas_handle, 'n', 'n',
                                  n, m, k, alpha,
                                  b_arr.gpudata, n,
                                  a_arr.gpudata, k,
                                  beta, c_arr.gpudata, n, l)

        assert np.allclose(C_res, c_gpu.get())
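
Every batched call in these snippets goes through a bptrs helper that is never shown: it builds the array of per-matrix device pointers that the cublas*Batched routines expect. A minimal sketch consistent with how it is used here is given below; it assumes C-contiguous 3-D gpuarrays, and everything except the helper's name is an assumption rather than the original implementation.

import ctypes
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

def bptrs(a):
    """Return a GPU array holding the device pointer of each batch element of `a`.

    For a C-contiguous 3-D gpuarray, element i starts at a.ptr + i * a.strides[0],
    so the pointers form an arithmetic progression that can be generated on the GPU.
    """
    return gpuarray.arange(a.ptr,
                           a.ptr + a.shape[0] * a.strides[0],
                           a.strides[0],
                           dtype=ctypes.c_void_p)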
Example #3
def solve_gpu(As, Bs):
    # As: (batch_size, num_factors) row vectors; Bs: (batch_size, num_factors,
    # num_factors) square matrices. Computes As[i] . inv(Bs[i]) for every i on the GPU.
    # Relies on the module-level caches `allocated_shape` and `allocations` to reuse
    # GPU buffers across calls with the same input shape.
    batch_size, num_factors = As.shape

    if allocated_shape[0] == As.shape: # reuse previous allocations
        As_gpu, Bs_gpu, P_gpu, info_gpu, Cs_gpu, Rs_gpu, A_arr, B_arr, C_arr, R_arr = allocations[0]
        As_gpu.set(As)
        Bs_gpu.set(Bs)
    else: # allocate
        # transfer As and Bs to GPU
        As_gpu = pycuda.gpuarray.to_gpu(As.astype('float32'))
        Bs_gpu = pycuda.gpuarray.to_gpu(Bs.astype('float32'))

        # allocate arrays
        P_gpu = pycuda.gpuarray.empty((batch_size, num_factors), np.int32)
        info_gpu = pycuda.gpuarray.zeros(batch_size, np.int32)
        Cs_gpu = pycuda.gpuarray.empty_like(Bs_gpu) # inverted Bs.
        Rs_gpu = pycuda.gpuarray.empty_like(As_gpu) # final output, As * inverted Bs.
        
        # get pointer arrays
        A_arr = bptrs(As_gpu)
        B_arr = bptrs(Bs_gpu)
        C_arr = bptrs(Cs_gpu)
        R_arr = bptrs(Rs_gpu)

        allocated_shape[0] = As.shape
        allocations[0] = As_gpu, Bs_gpu, P_gpu, info_gpu, Cs_gpu, Rs_gpu, A_arr, B_arr, C_arr, R_arr


    handle = scikits.cuda.misc._global_cublas_handle

    # perform LU factorization
    cublas.cublasSgetrfBatched(handle, num_factors, B_arr.gpudata, num_factors, P_gpu.gpudata, info_gpu.gpudata, batch_size)
    # the LU factorization is now in Bs_gpu!

    # use factorization to perform inversion
    cublas.cublasSgetriBatched(handle, num_factors, B_arr.gpudata, num_factors, P_gpu.gpudata, C_arr.gpudata, num_factors, info_gpu.gpudata, batch_size)
    # the inverted matrices are now in Cs_gpu!

    # compute dot products dot(A, C) = dot(A, Binv). Note that each A is a row
    # vector, so it is treated as a 1 x k matrix; the same goes for the
    # corresponding output row in Rs_gpu.
    transb = 'n'
    transa = 'n'
    N, k, m = Cs_gpu.shape
    N2, l = As_gpu.shape  # N2 == N and l == k must hold for the products to be defined
    n = 1  # one row per batch element

    lda = max(1, m)
    ldb = max(1, k)
    ldc = max(1, m)
    alpha = np.float32(1.0)
    beta = np.float32(0.0)

    # operands are passed as (C, A) so the column-major BLAS call produces the
    # row-major products A @ C in Rs_gpu
    cublas.cublasSgemmBatched(handle, transb, transa, m, n, k, alpha, C_arr.gpudata,
                lda, A_arr.gpudata, ldb, beta, R_arr.gpudata, ldc, N)

    # the resulting batch of vectors is now in Rs_gpu.
    return Rs_gpu.get()
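
A rough usage sketch for solve_gpu, assuming numpy, pycuda.gpuarray (with pycuda.autoinit), scikits.cuda and the bptrs helper sketched earlier are importable, scikits.cuda.misc.init() has been called so the global cuBLAS handle exists, and the module-level caches are initialized as below (none of this is shown in the snippet; sizes are illustrative):

allocated_shape = [None]   # assumed cache layout, matching how solve_gpu indexes it
allocations = [None]

batch_size, num_factors = 128, 10
As = np.random.rand(batch_size, num_factors).astype(np.float32)
Bs = np.random.rand(batch_size, num_factors, num_factors).astype(np.float32)
Bs += num_factors * np.eye(num_factors, dtype=np.float32)  # keep the matrices well conditioned

Rs = solve_gpu(As, Bs)

# CPU reference: Rs[i] should equal As[i] @ inv(Bs[i])
Rs_ref = np.einsum('bi,bij->bj', As, np.linalg.inv(Bs))
assert np.allclose(Rs, Rs_ref, atol=1e-3)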
    
Example #5
def compute_sgemm_batched(cols, kernels, biases, handle, m, k, n):
    # cols, kernels and biases are GPU arrays of device pointers, one pointer per
    # batch element; .ptr gives the device address of each pointer array.
    batchsize = len(cols)
    alpha = np.float32(1.0)
    beta = np.float32(1.0)  # beta = 1: the product is accumulated into the bias buffers
    flop = 2 * m * n * k * batchsize  # nominal flop count (not used further here)
    cublas.cublasSgemmBatched(handle, 'n', 'n', n, m, k, alpha, cols.ptr, n,
                              kernels.ptr, k, beta, biases.ptr, n, batchsize)
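
A possible call site for compute_sgemm_batched, inferred from the argument order of the cuBLAS call above (the usual row-major swap, so each product is kernels[i] (m x k) times cols[i] (k x n), accumulated into biases[i] (m x n)); the array names, sizes, and the explicit handle creation are illustrative assumptions:

batchsize, m, k, n = 4, 7, 5, 3
kernels_gpu = gpuarray.to_gpu(np.random.rand(batchsize, m, k).astype(np.float32))
cols_gpu = gpuarray.to_gpu(np.random.rand(batchsize, k, n).astype(np.float32))
biases_gpu = gpuarray.zeros((batchsize, m, n), np.float32)

handle = cublas.cublasCreate()
compute_sgemm_batched(bptrs(cols_gpu), bptrs(kernels_gpu), bptrs(biases_gpu),
                      handle, m, k, n)
# biases_gpu now holds kernels_gpu[i] @ cols_gpu[i] for each i; with beta = 1 the
# product is added to whatever the bias buffers already contained (zeros here).
cublas.cublasDestroy(handle)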
Example #6
def gpu_dot_batched(bx_gpu, by_gpu, bc_gpu, transa='N', transb='N', handle=None):
    """
    Uses cublasSgemmBatched to compute a batch of matrix products in parallel:
    bc_gpu[i] = op(bx_gpu[i]) . op(by_gpu[i]) for every i along the leading axis.
    """
    if handle is None:
        handle = scikits.cuda.misc._global_cublas_handle

    assert len(bx_gpu.shape) == 3
    assert len(by_gpu.shape) == 3
    assert len(bc_gpu.shape) == 3
    assert bx_gpu.dtype == np.float32
    assert by_gpu.dtype == np.float32 
    assert bc_gpu.dtype == np.float32

    # Get the shapes of the arguments
    bx_shape = bx_gpu.shape
    by_shape = by_gpu.shape
    
    # Batched matrix multiplication of the 3D arrays:
    alpha = np.float32(1.0)
    beta = np.float32(0.0)

    # str.lower() replaces the removed Python 2 string.lower() helper
    transa = transa.lower()
    transb = transb.lower()

    if transb in ['t', 'c']:
        N, m, k = by_shape
    elif transb in ['n']:
        N, k, m = by_shape
    else:
        raise ValueError('invalid value for transb')

    if transa in ['t', 'c']:
        N2, l, n = bx_shape
    elif transa in ['n']:
        N2, n, l = bx_shape
    else:
        raise ValueError('invalid value for transa')

    if l != k:
        raise ValueError('objects are not aligned')

    if N != N2:
        raise ValueError('batch sizes are not the same')

    if transb == 'n':
        lda = max(1, m)
    else:
        lda = max(1, k)

    if transa == 'n':
        ldb = max(1, k)
    else:
        ldb = max(1, n)

    ldc = max(1, m)

    # construct the pointer arrays needed for cublasSgemmBatched
    bx_arr = bptrs(bx_gpu)
    by_arr = bptrs(by_gpu)
    bc_arr = bptrs(bc_gpu)

    # operands are passed as (by, bx) so the column-major BLAS call produces the
    # row-major products in bc_gpu
    cublas.cublasSgemmBatched(handle, transb, transa, m, n, k, alpha, by_arr.gpudata,
                lda, bx_arr.gpudata, ldb, beta, bc_arr.gpudata, ldc, N)
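
A small end-to-end check of gpu_dot_batched against NumPy, assuming the same imports and bptrs helper as the earlier snippets and that scikits.cuda.misc.init() has been called so the default handle exists (shapes are arbitrary):

bx = np.random.rand(16, 4, 5).astype(np.float32)
by = np.random.rand(16, 5, 3).astype(np.float32)
bx_gpu = gpuarray.to_gpu(bx)
by_gpu = gpuarray.to_gpu(by)
bc_gpu = gpuarray.empty((16, 4, 3), np.float32)

gpu_dot_batched(bx_gpu, by_gpu, bc_gpu)
assert np.allclose(bc_gpu.get(), np.einsum('nij,njk->nik', bx, by))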
Example #7
def compute_sgemm_batched(cols, kernels, biases, m, k, n, stream, handle):
    # variant of the function in Example #5 with an additional (unused here) stream argument
    batchsize = len(cols)
    alpha = np.float32(1.0)
    beta = np.float32(1.0)

    flop = 2 * m * n * k  # per-matrix flop count (not used further here)
    cublas.cublasSgemmBatched(handle, 'n', 'n', n, m, k, alpha, cols.ptr, n,
                              kernels.ptr, k, beta, biases.ptr, n, batchsize)