def gpu_skcuda_cusolver_cusolverDnDgesvd_S(input):
    coloring_print("\nGPU: skcuda.cusolver.cusolverDnDgesvd() 'S' option")

    # #縦横を逆(≒転地)してcolumn-majorにし、GPUのcusolverDnDgesvd()に対応させる。結果配列のU,Vが逆になる
    n, m = input.shape

    # change function by data type
    if input.dtype == np.dtype('float64'):
        get_buffer = solver.cusolverDnDgesvd_bufferSize
        cusolver_svd = solver.cusolverDnDgesvd
    elif input.dtype == np.dtype('float32'):
        get_buffer = solver.cusolverDnSgesvd_bufferSize
        cusolver_svd = solver.cusolverDnSgesvd
    else:
        print "Error: data type must be float64 or float32"

    h2d_start = time.time()
    input_gpu = gpuarray.to_gpu(input)
    h2d_end = time.time()
    print "H2D: ", h2d_end - h2d_start, "[sec]"

    # Set up work buffers:
    h = solver.cusolverDnCreate()
    Lwork = get_buffer(h, m, n)
    workspace_gpu = gpuarray.zeros(Lwork, input.dtype)
    devInfo_gpu = gpuarray.zeros(1, np.int32)

    # Set up output buffers:
    s_gpu = gpuarray.zeros(min(m, n), input.dtype)
    u_gpu = gpuarray.zeros((n, n), input.dtype)
    vh_gpu = gpuarray.zeros((m, m), input.dtype)

    # 'S': the first min(m,n) columns of U (the left singular vectors) are returned in the array U
    cusolver_S_svd_start = time.time()
    status = cusolver_svd(h, 'S', 'S', m, n, input_gpu.gpudata, m,
                          s_gpu.gpudata, u_gpu.gpudata, m, vh_gpu.gpudata, n,
                          workspace_gpu.gpudata, Lwork, 0, devInfo_gpu.gpudata)
    cusolver_S_svd_end = time.time()

    print "solver.cusolverDnSgesvd() 'S' option", cusolver_S_svd_end - cusolver_S_svd_start, "[sec]"
    print "Total: ", cusolver_S_svd_end - h2d_start, "[sec]"

    # u and s is swapped (数学的に正しいかはわからない)
    check_result(input, vh_gpu.get(), s_gpu.get(), u_gpu.get())

    solver.cusolverDnDestroy(h)
def gpu_skcuda_cusolver_cusolverDnDgesvd_N(input):
    coloring_print("\nGPU: skcuda.cusolver.cusolverDnDgesvd() 'N' option")

    # #縦横を逆(≒転地)してcolumn-majorにし、GPUのcusolverDnDgesvd()に対応させる。結果配列のU,Vが逆になる
    n, m = input.shape

    # change function by data type
    if input.dtype == np.dtype('float64'):
        get_buffer = solver.cusolverDnDgesvd_bufferSize
        cusolver_svd = solver.cusolverDnDgesvd
    elif input.dtype == np.dtype('float32'):
        get_buffer = solver.cusolverDnSgesvd_bufferSize
        cusolver_svd = solver.cusolverDnSgesvd
    else:
        print "Error: data type must be float64 or float32"

    h2d_start = time.time()
    input_gpu = gpuarray.to_gpu(input)
    h2d_end = time.time()
    print "H2D: ", h2d_end - h2d_start, "[sec]"

    # Set up work buffers:
    h = solver.cusolverDnCreate()
    Lwork = get_buffer(h, m, n)
    workspace_gpu = gpuarray.zeros(Lwork, input.dtype)
    devInfo_gpu = gpuarray.zeros(1, np.int32)

    # Set up output buffers:
    s_gpu = gpuarray.zeros(min(m, n), input.dtype)

    # 'N': no columns of U (no left singular vectors) are computed.
    cusolver_N_svd_start = time.time()
    status = cusolver_svd(h, 'N', 'N', m, n, input_gpu.gpudata, m,
                          s_gpu.gpudata, 0, m, 0, n, workspace_gpu.gpudata, 0,
                          0, devInfo_gpu.gpudata)
    cusolver_N_svd_end = time.time()
    print "solver.cusolverDnSgesvd() 'N' option: ", cusolver_N_svd_end - cusolver_N_svd_start, "[sec]"
    print "Total: ", cusolver_N_svd_end - h2d_start, "[sec]"

    print "only s is computed"
    # print s_gpu.get()

    solver.cusolverDnDestroy(h)
    def make_thunk(self, node, storage_map, _, no_recycling=[], impl=None):
        """Build a thunk that solves A x = b on the GPU via cuSOLVER LU.

        The thunk LU-factorizes A with cusolverDnSgetrf and then
        back-substitutes into b with cusolverDnSgetrs, writing the
        solution into the op's output storage.  Scratch buffers
        (workspace, pivots, dev_info) are cached on the thunk object and
        reused across calls.

        :raises RuntimeError: if the cuSOLVER bindings are unavailable.
        """
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op can not be constructed.')

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # Lazily create one process-wide cuSOLVER handle.
        global cusolver_handle
        if cusolver_handle is None:
            cusolver_handle = cusolver.cusolverDnCreate()

        def thunk():
            context = inputs[0][0].context

            # Output storage cell for the solution.
            z = outputs[0]

            # Matrix.
            A = inputs[0][0]

            # Solution vectors.
            b = inputs[1][0]

            assert (len(A.shape) == 2)
            assert (len(b.shape) == 2)

            if self.trans in ['T', 'C']:
                trans = 1
                l, n = A.shape
                k, m = b.shape
            elif self.trans == 'N':
                trans = 0
                n, l = A.shape
                k, m = b.shape
            else:
                raise ValueError('Invalid value for trans')
            if l != n:
                raise ValueError('A must be a square matrix')
            if n != k:
                raise ValueError('A and b must be aligned.')

            lda = max(1, n)
            ldb = max(1, k, m)

            # We copy A and b as cusolver operates inplace
            b = gpuarray.array(b, copy=True, order='F')
            if not self.inplace:
                A = gpuarray.array(A, copy=True)
            A_ptr = A.gpudata
            b_ptr = b.gpudata

            # cusolver expects a F ordered matrix, but A is not explicitly
            # converted between C and F order, instead we switch the
            # "transpose" flag.
            if A.flags['C_CONTIGUOUS']:
                trans = 1 - trans

            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                cusolver_handle, n, n, A_ptr, lda)

            # (Re)allocate the getrf scratch buffer when its size changes.
            if (thunk.workspace is None
                    or thunk.workspace.size != workspace_size):
                thunk.workspace = gpuarray.zeros((workspace_size, ),
                                                 dtype='float32',
                                                 context=context)

            # cuSOLVER writes pivot indices as 32-bit integers (devIpiv
            # is int*), so the buffer must be int32, not float32.
            if thunk.pivots is None or thunk.pivots.size != min(n, n):
                thunk.pivots = gpuarray.zeros((min(n, n), ),
                                              dtype='int32',
                                              context=context)

            # Single int32 status flag written by getrf/getrs (devInfo).
            if thunk.dev_info is None:
                thunk.dev_info = gpuarray.zeros((1, ),
                                                dtype='int32',
                                                context=context)

            workspace_ptr = thunk.workspace.gpudata
            pivots_ptr = thunk.pivots.gpudata
            dev_info_ptr = thunk.dev_info.gpudata

            cusolver.cusolverDnSgetrf(cusolver_handle, n, n, A_ptr, lda,
                                      workspace_ptr, pivots_ptr, dev_info_ptr)

            cusolver.cusolverDnSgetrs(cusolver_handle, trans, n, m, A_ptr, lda,
                                      pivots_ptr, b_ptr, ldb, dev_info_ptr)

            z[0] = b

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        # Cached scratch buffers, allocated on first call.
        thunk.workspace = None
        thunk.pivots = None
        thunk.dev_info = None

        return thunk
#!/usr/bin/env python
"""
Demo of how to call low-level CUSOLVER wrappers to perform LU decomposition.
"""

import numpy as np
import scipy.linalg
import scipy as sp
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

# Create a cuSOLVER dense-API context handle for the calls below.
h = solver.cusolverDnCreate()
# 4x4 single-precision test matrix.
x = np.asarray([[1.80, 2.88, 2.05, -0.89], [5.25, -2.95, -0.95, -3.80],
                [1.58, -2.69, -2.90, -1.04], [-1.11, -0.66, -0.59,
                                              0.80]]).astype(np.float32)

# Need to copy transposed matrix because T only returns a view:
# (cuSOLVER expects column-major storage, hence the transpose)
m, n = x.shape
x_gpu = gpuarray.to_gpu(x.T.copy())

# Set up work buffers:
Lwork = solver.cusolverDnSgetrf_bufferSize(h, m, n, x_gpu.gpudata, m)
workspace_gpu = gpuarray.zeros(Lwork, np.float32)
devipiv_gpu = gpuarray.zeros(min(m, n), np.int32)  # pivot indices (int32)
devinfo_gpu = gpuarray.zeros(1, np.int32)  # status flag written by getrf
# Compute:
# In-place LU factorization of x_gpu; L and U overwrite the input matrix.
solver.cusolverDnSgetrf(h, m, n, x_gpu.gpudata, m, workspace_gpu.gpudata,
                        devipiv_gpu.gpudata, devinfo_gpu.gpudata)
# Exemple #5
# 0
def attach_cusolver_handle_to_context(ctx):
    """Ensure `ctx` carries a cuSOLVER handle, creating one on first use."""
    # Guard clause: nothing to do when the context already owns a handle.
    if getattr(ctx, 'cusolver_handle', None) is not None:
        return
    # Activate the context while creating the handle so it is bound to it.
    with ctx:
        ctx.cusolver_handle = cusolver.cusolverDnCreate()
# Exemple #6
# 0
#!/usr/bin/env python
"""
Demo of how to call low-level CUSOLVER wrappers to perform eigen decomposition
for a batch of small Hermitian matrices.
"""

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

handle = solver.cusolverDnCreate()  # cuSOLVER dense-API handle
batchSize = 100  # number of matrices in the batch
n = 9  # order of each (small) Hermitian matrix

# The batch is stacked vertically: A holds the row-major originals, B the
# transposed copies that will be uploaded for the column-major GPU API.
A = np.empty((n * batchSize, n), dtype=np.complex64)
B = np.empty((n * batchSize, n), dtype=A.dtype)

for i in range(batchSize):
    x = np.random.randn(n, n) + 1j * np.random.randn(n, n)
    # Symmetrize: x + x^H is Hermitian by construction.
    x = x + x.conj().T
    x = x.astype(np.complex64)
    A[i * n:(i + 1) * n, :] = x
    # Need to reverse dimensions because CUSOLVER expects column-major matrices:
    B[i * n:(i + 1) * n, :] = x.T.copy()

x_gpu = gpuarray.to_gpu(B)

# Set up output buffers:
# one row of n (real) eigenvalues per batch matrix
w_gpu = gpuarray.empty((batchSize, n), dtype=np.float32)
# Exemple #7
# 0
def attach_cusolver_handle_to_context(ctx):
    """Lazily attach a cuSOLVER dense handle to the given GPU context."""
    existing = getattr(ctx, "cusolver_handle", None)
    if existing is not None:
        # A handle was already attached; keep it.
        return
    with ctx:
        ctx.cusolver_handle = cusolver.cusolverDnCreate()
#!/usr/bin/env python

"""
Demo of how to call low-level CUSOLVER wrappers to perform SVD decomposition.
"""

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

# Create a cuSOLVER dense-API context handle.
h = solver.cusolverDnCreate()
# 4x4 single-precision test matrix.
x = np.asarray([[1.80, 2.88, 2.05, -0.89],
                [5.25, -2.95, -0.95, -3.80], 
                [1.58, -2.69, -2.90, -1.04],
                [-1.11, -0.66, -0.59, 0.80]]).astype(np.float32)

# Need to reverse dimensions because CUSOLVER expects column-major matrices:
# NOTE(review): unlike the comment says, x is uploaded without a transpose
# below; cuSOLVER will effectively see x^T, which works here only because
# the demo matrix is square — verify U/V roles in any downstream use.
n, m = x.shape
x_gpu = gpuarray.to_gpu(x)

# Set up work buffers:
Lwork = solver.cusolverDnSgesvd_bufferSize(h, m, n)
workspace_gpu = gpuarray.zeros(Lwork, np.float32)
devInfo_gpu = gpuarray.zeros(1, np.int32)  # status flag written by gesvd

# Set up output buffers:
s_gpu = gpuarray.zeros(min(m, n), np.float32)  # singular values
u_gpu = gpuarray.zeros((m, m), np.float32)  # left singular vectors
vh_gpu = gpuarray.zeros((n, n), np.float32)  # right singular vectors (V^H)
# Exemple #9
# 0
 def prepare_node(self, node, storage_map, compute_map, impl):
     """Create the node's GPU context cuSOLVER handle on first use (idempotent)."""
     ctx = node.inputs[0].type.context
     # Skip creation when the context already carries a handle.
     if getattr(ctx, 'cusolver_handle', None) is None:
         with ctx:
             ctx.cusolver_handle = cusolver.cusolverDnCreate()
# Exemple #10
# 0
    def make_thunk(self,
                   node,
                   storage_map, _,
                   no_recycling=[],
                   impl=None):
        """Return a thunk solving A x = b on the GPU with cuSOLVER.

        The thunk performs an LU factorization of A (cusolverDnSgetrf)
        followed by a triangular solve into b (cusolverDnSgetrs) and
        stores the result in the output storage.  Workspace, pivot and
        status buffers are cached on the thunk and reused between calls.

        :raises RuntimeError: if the cuSOLVER bindings are unavailable.
        """
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op can not be constructed.')

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # One lazily-created, process-wide cuSOLVER handle.
        global cusolver_handle
        if cusolver_handle is None:
            cusolver_handle = cusolver.cusolverDnCreate()

        def thunk():
            context = inputs[0][0].context

            # Output storage cell for the solution.
            z = outputs[0]

            # Matrix.
            A = inputs[0][0]

            # Solution vectors.
            b = inputs[1][0]

            assert(len(A.shape) == 2)
            assert(len(b.shape) == 2)

            if self.trans in ['T', 'C']:
                trans = 1
                l, n = A.shape
                k, m = b.shape
            elif self.trans == 'N':
                trans = 0
                n, l = A.shape
                k, m = b.shape
            else:
                raise ValueError('Invalid value for trans')
            if l != n:
                raise ValueError('A must be a square matrix')
            if n != k:
                raise ValueError('A and b must be aligned.')

            lda = max(1, n)
            ldb = max(1, k, m)

            # We copy A and b as cusolver operates inplace
            b = gpuarray.array(b, copy=True, order='F')
            if not self.inplace:
                A = gpuarray.array(A, copy=True)
            A_ptr = A.gpudata
            b_ptr = b.gpudata

            # cusolver expects a F ordered matrix, but A is not explicitly
            # converted between C and F order, instead we switch the
            # "transpose" flag.
            if A.flags['C_CONTIGUOUS']:
                trans = 1 - trans

            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                cusolver_handle, n, n, A_ptr, lda)

            # (Re)allocate the getrf scratch buffer when its size changes.
            if (thunk.workspace is None or
                    thunk.workspace.size != workspace_size):
                thunk.workspace = gpuarray.zeros((workspace_size,),
                                                 dtype='float32',
                                                 context=context)

            # cuSOLVER writes pivot indices as 32-bit integers (devIpiv
            # is int*), so the buffer must be int32, not float32.
            if thunk.pivots is None or thunk.pivots.size != min(n, n):
                thunk.pivots = gpuarray.zeros((min(n, n),),
                                              dtype='int32',
                                              context=context)

            # Single int32 status flag written by getrf/getrs (devInfo).
            if thunk.dev_info is None:
                thunk.dev_info = gpuarray.zeros((1,),
                                                dtype='int32',
                                                context=context)

            workspace_ptr = thunk.workspace.gpudata
            pivots_ptr = thunk.pivots.gpudata
            dev_info_ptr = thunk.dev_info.gpudata

            cusolver.cusolverDnSgetrf(
                cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                pivots_ptr, dev_info_ptr)

            cusolver.cusolverDnSgetrs(
                cusolver_handle, trans, n, m, A_ptr, lda,
                pivots_ptr, b_ptr, ldb, dev_info_ptr)

            z[0] = b

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        # Cached scratch buffers, allocated on first call.
        thunk.workspace = None
        thunk.pivots = None
        thunk.dev_info = None

        return thunk
# Exemple #11
# 0
 def prepare_node(self, node, storage_map, compute_map, impl):
     """Attach a cuSOLVER handle to the node's GPU context if missing."""
     ctx = node.inputs[0].type.context
     handle = getattr(ctx, 'cusolver_handle', None)
     if handle is not None:
         # Already prepared; nothing to do.
         return
     with ctx:
         ctx.cusolver_handle = cusolver.cusolverDnCreate()
#!/usr/bin/env python

"""
Demo of how to call low-level CUSOLVER wrappers to perform eigen decomposition
for a batch of small symmetric matrices.
"""

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

handle = solver.cusolverDnCreate()  # cuSOLVER dense-API handle
batchSize = 100  # number of matrices in the batch
n = 9  # order of each (small) symmetric matrix

# All batch matrices stacked vertically into one (n*batchSize, n) array.
A = np.empty((n*batchSize, n), dtype = np.double)

for i in range(batchSize):
    x = np.random.randn(n, n)
    # Symmetrize: x + x^T is symmetric by construction.
    x = x+x.T
    A[i*n:(i+1)*n, :] = x

# Symmetric matrices are unaffected by the row/column-major mismatch,
# so no transpose is needed before the upload.
x_gpu = gpuarray.to_gpu(A)

# Set up output buffers:
# one row of n eigenvalues per batch matrix
w_gpu = gpuarray.empty((batchSize, n), dtype = A.dtype)

# Set up parameters
# Jacobi eigensolver configuration with a custom convergence tolerance.
params = solver.cusolverDnCreateSyevjInfo()
solver.cusolverDnXsyevjSetTolerance(params, 1e-7)