    def test_cublasSgetriBatched(self):
        l, m = 11, 7
        np.random.seed(1)
        A = np.random.rand(l, m, m).astype(np.float32)

        a_gpu = gpuarray.to_gpu(A)
        a_arr = bptrs(a_gpu)
        c_gpu = gpuarray.empty((l, m, m), np.float32)
        c_arr = bptrs(c_gpu)

        p_gpu = gpuarray.empty((l, m), np.int32)
        i_gpu = gpuarray.zeros(l, np.int32)

        # LU-factorize the batch in place: pivots go to p_gpu, one status
        # code per matrix goes to i_gpu
        cublas.cublasSgetrfBatched(self.cublas_handle,
                    m, a_arr.gpudata, m, p_gpu.gpudata,
                    i_gpu.gpudata, l)

        # invert each factorized matrix into c_gpu using the pivots above
        cublas.cublasSgetriBatched(self.cublas_handle,
                    m, a_arr.gpudata, m, p_gpu.gpudata, c_arr.gpudata, m,
                    i_gpu.gpudata, l)
        X = np.array([np.linalg.inv(a) for a in A])
        X_ = c_gpu.get()

        # loose tolerance for single-precision batched inverses
        assert np.allclose(X, X_, rtol=1e-3)
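All of these snippets rely on a bptrs helper to build the per-matrix device pointer arrays that the batched cuBLAS calls expect. A minimal sketch of such a helper, assuming a 3D pycuda gpuarray whose leading dimension is the batch:

import ctypes
import pycuda.gpuarray as gpuarray

def bptrs(a):
    # one device pointer per matrix: start at a.ptr and step by the
    # stride of the leading (batch) dimension
    return gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0],
                           a.strides[0], dtype=ctypes.c_void_p)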
Example #2
        def thunk():
            input_shape = inputs[0][0].shape

            size = input_shape[1] # matrices to invert are (size x size)
            batch_size = input_shape[0]

            z = outputs[0]

            # only allocate if there is no previous allocation of the right size.
            if z[0] is None or z[0].shape != input_shape:
                z[0] = cuda.CudaNdarray.zeros(input_shape)
                pivot_alloc[0] = pycuda.gpuarray.empty((batch_size, size), np.int32)
                info_alloc[0] = pycuda.gpuarray.zeros(batch_size, np.int32)

            input_pycuda = to_gpuarray(inputs[0][0])
            output_pycuda = to_gpuarray(z[0])
            pivot = pivot_alloc[0]
            info = info_alloc[0]

            if not self.destructive:
                # copy before building the pointer arrays below; otherwise the
                # in-place LU factorization would still overwrite the original
                # input through pointers into the old buffer
                input_pycuda = input_pycuda.copy()

            # construct pointer arrays for batched operations
            input_arr = bptrs(input_pycuda)
            output_arr = bptrs(output_pycuda)

            handle = scikits.cuda.misc._global_cublas_handle

            # perform LU factorization
            cublas.cublasSgetrfBatched(handle, size, input_arr.gpudata, size, pivot.gpudata, info.gpudata, batch_size)
            # the LU factorization is now in input_pycuda (destructive operation!)

            # use factorization to perform inversion
            cublas.cublasSgetriBatched(handle, size, input_arr.gpudata, size, pivot.gpudata, output_arr.gpudata, size, info.gpudata, batch_size)
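Neither the thunk above nor the other snippets inspect the status array that the batched routines fill in. A minimal, hypothetical check using the info gpuarray from the thunk (both cublasSgetrfBatched and cublasSgetriBatched write one status code per matrix; a non-zero entry flags a failure for that matrix, with positive values indicating a singular matrix):

import numpy as np

info_host = info.get()  # copy per-matrix status codes back to the host
if np.any(info_host != 0):
    bad = np.nonzero(info_host)[0]
    raise np.linalg.LinAlgError("singular matrices in batch: %s" % bad)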
Example #3
def solve_gpu(As, Bs):
    batch_size, num_factors = As.shape

    if allocated_shape[0] == As.shape: # reuse previous allocations
        As_gpu, Bs_gpu, P_gpu, info_gpu, Cs_gpu, Rs_gpu, A_arr, B_arr, C_arr, R_arr = allocations[0]
        As_gpu.set(As)
        Bs_gpu.set(Bs)
    else: # allocate
        # transfer As and Bs to GPU
        As_gpu = pycuda.gpuarray.to_gpu(As.astype('float32'))
        Bs_gpu = pycuda.gpuarray.to_gpu(Bs.astype('float32'))

        # allocate arrays
        P_gpu = pycuda.gpuarray.empty((batch_size, num_factors), np.int32)
        info_gpu = pycuda.gpuarray.zeros(batch_size, np.int32)
        Cs_gpu = pycuda.gpuarray.empty_like(Bs_gpu) # inverted Bs.
        Rs_gpu = pycuda.gpuarray.empty_like(As_gpu) # final output, As * inverted Bs.
        
        # get pointer arrays
        A_arr = bptrs(As_gpu)
        B_arr = bptrs(Bs_gpu)
        C_arr = bptrs(Cs_gpu)
        R_arr = bptrs(Rs_gpu)

        allocated_shape[0] = As.shape
        allocations[0] = As_gpu, Bs_gpu, P_gpu, info_gpu, Cs_gpu, Rs_gpu, A_arr, B_arr, C_arr, R_arr


    handle = scikits.cuda.misc._global_cublas_handle

    # perform LU factorization
    cublas.cublasSgetrfBatched(handle, num_factors, B_arr.gpudata, num_factors, P_gpu.gpudata, info_gpu.gpudata, batch_size)
    # the LU factorization is now in Bs_gpu!

    # use factorization to perform inversion
    cublas.cublasSgetriBatched(handle, num_factors, B_arr.gpudata, num_factors, P_gpu.gpudata, C_arr.gpudata, num_factors, info_gpu.gpudata, batch_size)
    # the inverted matrices are now in Cs_gpu!

    # compute the per-batch products dot(A, C) = dot(A, Binv). The As are
    # vectors, not matrices: each one is treated as a single-column matrix,
    # hence n = 1 below. The output rows of Rs_gpu are handled the same way.
    transb = 'n'
    transa = 'n'
    N, k, m = Cs_gpu.shape
    N2, l = As_gpu.shape
    n = 1

    lda = max(1, m)
    ldb = max(1, k)
    ldc = max(1, m)
    alpha = np.float32(1.0)
    beta = np.float32(0.0)

    cublas.cublasSgemmBatched(handle, transb, transa, m, n, k, alpha, C_arr.gpudata,
                lda, A_arr.gpudata, ldb, beta, R_arr.gpudata, ldc, N)

    # the resulting batch of vectors is now in Rs_gpu.
    return Rs_gpu.get()
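For reference, a minimal NumPy equivalent of what solve_gpu returns; this is a sketch under the assumption that Bs is a (batch_size, num_factors, num_factors) batch of square systems and that allocated_shape and allocations are module-level one-element caches:

import numpy as np

def solve_cpu(As, Bs):
    # CPU sketch of solve_gpu's result: each output row is As[i] times the
    # inverse of Bs[i], i.e. the "As * inverted Bs" product read back above.
    return np.array([a.dot(np.linalg.inv(b)) for a, b in zip(As, Bs)],
                    dtype=np.float32)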
    
    def test_cublasSgetrfBatched(self):
        from scipy.linalg import lu_factor
        l, m = 11, 7
        A = np.random.rand(l, m, m).astype(np.float32)
        # symmetrize each matrix so that the column-major view cuBLAS
        # factorizes equals the row-major matrix scipy factorizes
        A = np.array([a.dot(a.T) for a in A])

        a_gpu = gpuarray.to_gpu(A)
        a_arr = bptrs(a_gpu)
        p_gpu = gpuarray.empty((l, m), np.int32)
        # one status entry per matrix in the batch
        i_gpu = gpuarray.zeros(l, np.int32)
        X = np.array([lu_factor(a)[0] for a in A])

        cublas.cublasSgetrfBatched(self.cublas_handle, m, a_arr.gpudata, m,
                                   p_gpu.gpudata, i_gpu.gpudata, l)

        # cuBLAS writes the LU factors column-major; transpose to get the
        # row-major layout scipy returns
        X_ = np.array([a.T for a in a_gpu.get()])

        assert np.allclose(X, X_)
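The test methods above assume a self.cublas_handle created elsewhere. A minimal fixture sketch (class name illustrative; assumes the scikits.cuda/skcuda cublas wrappers used throughout):

import pycuda.autoinit                  # initialize a CUDA context
import scikits.cuda.cublas as cublas    # newer releases: skcuda.cublas
from unittest import TestCase

class TestCublasBatched(TestCase):
    def setUp(self):
        # handle used by the batched calls in the tests above
        self.cublas_handle = cublas.cublasCreate()

    def tearDown(self):
        cublas.cublasDestroy(self.cublas_handle)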