Code Example #1
    def _perform_sgemv(self, mat, v, vec_out, nvecs, dim):
        '''
        NOTES: cuBLAS assumes Fortran (column-major) layout.
        cublasSgemv multiplies a matrix by a vector (Level 2 BLAS).

        cublas_handle   -> handle to the cuBLAS library context
        t               -> transpose
        dim             -> number of columns of the matrix
        nvecs           -> number of rows of the matrix
        alpha           -> scalar used for multiplication of mat
        mat.gpudata     -> matrix mat
        dim             -> leading dimension of the matrix
        v.gpudata       -> vector v
        incX            -> stride within X. For example, if incX is 7, every 7th element is used.
        beta            -> scalar used for multiplication of v
        vec_out.gpudata -> result
        incY            -> stride within Y. For example, if incY is 7, every 7th element is used.

        Read more       -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemv
        '''
        alpha = np.float32(1.0)
        beta = np.float32(0.0)

        incx = 1
        incy = 1

        cublas_handle = cublas.cublasCreate()

        cublas.cublasSgemv(cublas_handle, 't', dim, nvecs, alpha, mat.gpudata,
                           dim, v.gpudata, incx, beta, vec_out.gpudata, incy)

        cublas.cublasDestroy(cublas_handle)

        return vec_out
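
Because cuBLAS assumes column-major storage, passing the row-major `mat` with trans 't' and lda=dim makes the call above compute `vec_out = mat @ v`. A minimal standalone sketch of that check (the setup here is assumed, not part of the original class):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

nvecs, dim = 4, 3
mat = np.random.rand(nvecs, dim).astype(np.float32)  # row-major on the host
v = np.random.rand(dim).astype(np.float32)

mat_gpu = gpuarray.to_gpu(mat)
v_gpu = gpuarray.to_gpu(v)
out_gpu = gpuarray.zeros(nvecs, np.float32)

h = cublas.cublasCreate()
# 't' on the column-major view of a row-major buffer yields mat @ v
cublas.cublasSgemv(h, 't', dim, nvecs, np.float32(1.0), mat_gpu.gpudata, dim,
                   v_gpu.gpudata, 1, np.float32(0.0), out_gpu.gpudata, 1)
cublas.cublasDestroy(h)

print(np.allclose(out_gpu.get(), mat.dot(v), atol=1e-5))  # expected: True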
Code Example #2
    def _perform_sgemm(self, mat_a, mat_b, mat_out):
        nvecs_a, dim = mat_a.shape
        nvecs_b, dim = mat_b.shape

        alpha = np.float32(1.0)
        beta = np.float32(0.0)
        '''
        cublasSgemm multiplies a matrix by a matrix (Level 3 BLAS).

        cublas_handle   -> handle to the cuBLAS library context
        t               -> transpose mat_b
        n               -> no transpose for mat_a
        nvecs_b         -> rows of mat_b
        nvecs_a         -> rows of mat_a
        dim             -> common dimension of mat_a and mat_b
        alpha           -> scaling factor for the product of mat_a and mat_b
        mat_b.gpudata   -> matrix mat_b
        dim             -> columns of mat_b
        mat_a.gpudata   -> matrix mat_a
        dim             -> columns of mat_a
        beta            -> scaling factor for mat_out
        mat_out.gpudata -> matrix mat_out
        nvecs_b         -> rows of mat_b

        Read more       -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
        '''

        cublas_handle = cublas.cublasCreate()

        cublas.cublasSgemm(cublas_handle, 't', 'n', nvecs_b, nvecs_a, dim,
                           alpha, mat_b.gpudata, dim, mat_a.gpudata, dim, beta,
                           mat_out.gpudata, nvecs_b)
        cublas.cublasDestroy(cublas_handle)

        return mat_out
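
With row-major inputs, the ('t', 'n') call above ends up computing `mat_out = mat_a @ mat_b.T` (both operands have `dim` columns). A minimal standalone sketch of that identity, with the setup assumed:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

nvecs_a, nvecs_b, dim = 4, 5, 3
mat_a = np.random.rand(nvecs_a, dim).astype(np.float32)
mat_b = np.random.rand(nvecs_b, dim).astype(np.float32)

a_gpu = gpuarray.to_gpu(mat_a)
b_gpu = gpuarray.to_gpu(mat_b)
out_gpu = gpuarray.zeros((nvecs_a, nvecs_b), np.float32)

h = cublas.cublasCreate()
cublas.cublasSgemm(h, 't', 'n', nvecs_b, nvecs_a, dim, np.float32(1.0),
                   b_gpu.gpudata, dim, a_gpu.gpudata, dim, np.float32(0.0),
                   out_gpu.gpudata, nvecs_b)
cublas.cublasDestroy(h)

print(np.allclose(out_gpu.get(), mat_a.dot(mat_b.T), atol=1e-5))  # expected: True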
Code Example #3
def jki_lu_decomposer_opt_gpu(M):

    m = M.shape[0]
    n = M.shape[1]

    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    from skcuda.cublas import cublasCreate, cublasDaxpy, cublasDscal, cublasDestroy
    import skcuda.misc as misc

    N_gpu = gpuarray.to_gpu(M)

    h = cublasCreate()

    for j in range(0,n):
        for k in range(0,j):

            #N[k+1:,j] = N[k+1:,j] - N[k+1:,k] * N[k,j]
            cublasDaxpy(h, N_gpu[k+1:,k].size, -np.float64(N_gpu[k,j].get()), N_gpu[k+1:,k].gpudata, n, N_gpu[k+1:,j].gpudata, n)

        #N[j+1:,j] /= N[j,j]
        cublasDscal(h, N_gpu[j+1:,j].size, 1.0/np.float64(N_gpu[j,j].get()), N_gpu[j+1:,j].gpudata, n)

    #Move from GPU to CPU
    N = N_gpu.get()

    cublasDestroy(h)

    return N
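
A quick way to sanity-check the decomposer (and the ijk/kij variants below) is to unpack the returned array into the unit-lower L and upper U factors of the Doolittle factorization and reconstruct M. A sketch, assuming a well-conditioned input that needs no pivoting:

import numpy as np

n = 6
M = np.random.rand(n, n) + n * np.eye(n)  # diagonally dominant: safe without pivoting

N = jki_lu_decomposer_opt_gpu(M.copy())
L_factor = np.tril(N, -1) + np.eye(n)  # unit lower-triangular factor
U_factor = np.triu(N)                  # upper-triangular factor

print(np.allclose(L_factor @ U_factor, M))  # expected: True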
Code Example #4
def ijk_lu_decomposer_opt_gpu(M):

    m = M.shape[0]
    n = M.shape[1]

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    from skcuda.cublas import cublasCreate, cublasDestroy, cublasDdot#, cublasDscal
    import skcuda.misc as misc
    #import skcuda.linalg as linalg
    #linalg.init()

    N_gpu = gpuarray.to_gpu(M)

    h = cublasCreate()

    for i in range(0,n):
        for j in range(0,n):
            #N[i,j] -= N[i,:min(i,j)].dot(N[:min(i,j),j])
            N_gpu[i,j] -= cublasDdot(h, N_gpu[i,:min(i,j)].size, N_gpu[i,:min(i,j)].gpudata, 1, N_gpu[:min(i,j),j].gpudata, n)
            if j<i:
                N_gpu[i,j] /= N_gpu[j,j]
                #cublasDscal(h, N_gpu[i,j].size, 1.0/np.float64(N_gpu[j,j].get()), N_gpu[i,j].gpudata, 1)

    #Move from GPU to CPU
    N = N_gpu.get()

    cublasDestroy(h)

    return N
Code Example #5
File: cuda.py  Project: skallumadi/chainer
def shutdown():
    """Finalizes CUDA global state.

    This function is automatically called by :mod:`atexit`. Multiple calls are
    allowed, so users can manually call this function if necessary.

    """
    global _contexts, _cublas_handles, _pid, _pools

    _check_cuda_available()

    pid = os.getpid()
    if _pid != pid:  # not initialized
        return

    for cublas_handle in six.itervalues(_cublas_handles):
        cublas.cublasDestroy(cublas_handle)
    _cublas_handles = {}

    cumisc.shutdown()

    _pools = {}

    for ctx in six.itervalues(_contexts):
        ctx.detach()
    _contexts = {}
    _pid = None  # mark as uninitialized
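
The docstring says this finalizer is normally invoked through :mod:`atexit`; a minimal sketch of how such a hook is typically registered at module scope:

import atexit

# run shutdown() automatically when the interpreter exits
atexit.register(shutdown)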
Code Example #6
def kij_lu_decomposer_opt_gpu(M):

    m = M.shape[0]
    n = M.shape[1]

    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    from skcuda.cublas import cublasCreate, cublasDaxpy, cublasDestroy
    import skcuda.misc as misc

    N_gpu = gpuarray.to_gpu(M)

    h = cublasCreate()

    for k in range(0,n):
        for i in range(k+1,n):
            N_gpu[i,k] = N_gpu[i,k] / N_gpu[k,k]
            #N[i,k+1:] -= N[i,k] * N[k,k+1:]
            cublasDaxpy(h, N_gpu[k,k+1:].size, -np.float64(N_gpu[i,k].get()), N_gpu[k,k+1:].gpudata, 1, N_gpu[i,k+1:].gpudata, 1)

    #Move from GPU to CPU
    N = N_gpu.get()

    cublasDestroy(h)

    return N
Code Example #7
File: cuda.py  Project: jheymann85/chainer
def shutdown():
    """Finalizes CUDA global state.

    This function is automatically called by :mod:`atexit`. Multiple calls are
    allowed, so users can manually call this function if necessary.

    """
    global _contexts, _cublas_handles, _pid, _pools

    pid = os.getpid()
    if _pid != pid:  # not initialized
        return

    for cublas_handle in six.itervalues(_cublas_handles):
        cublas.cublasDestroy(cublas_handle)
    _cublas_handles = {}

    cumisc.shutdown()

    _pools = {}

    for ctx in six.itervalues(_contexts):
        ctx.detach()
    _contexts = {}
    _pid = None  # mark as uninitialized
Code Example #8
def compute_gflops(precision='S'):

	if precision=='S':
		float_type = 'float32'
	elif precision=='D':
		float_type = 'float64'
	else:
		return -1

	A = np.random.randn(m, k).astype(float_type)
	B = np.random.randn(k, n).astype(float_type)
	C = np.random.randn(m, n).astype(float_type)

	A_cm = A.T.copy()
	B_cm = B.T.copy()
	C_cm = C.T.copy()

	A_gpu = gpuarray.to_gpu(A_cm)
	B_gpu = gpuarray.to_gpu(B_cm)
	C_gpu = gpuarray.to_gpu(C_cm)

	alpha = np.random.randn()
	beta = np.random.randn()

	transa = cublas._CUBLAS_OP['N']
	transb = cublas._CUBLAS_OP['N']

	lda = m
	ldb = k
	ldc = m

	t = time()
	handle = cublas.cublasCreate()

	
	# dispatch to cublasSgemm or cublasDgemm depending on precision
	gemm = getattr(cublas, 'cublas%sgemm' % precision)
	gemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda,
	     B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)
	
	cublas.cublasDestroy(handle)
	t = time() - t

	gflops = 2*m*n*(k+1)*(10**-9) / t 
	
	return gflops
Code Example #9
def count_triangles_cublas(adjacency_list):
    driver.init()
    context = tools.make_default_context()
    h = cublas.cublasCreate()
    n = len(adjacency_list)
    A = np.zeros([n,n], dtype=np.float64)
    for row_idx, neighbor_list in adjacency_list:
        A[row_idx, neighbor_list] = 1.0
    a_gpu = gpuarray.to_gpu(A)
    b_gpu = gpuarray.empty(A.shape, A.dtype)
    c_gpu = gpuarray.empty(A.shape, A.dtype)
    one = np.float64(1.0)
    zero = np.float64(0.0)
    # B = A @ A, then C = A @ B = A^3 (A is symmetric, so Dsymm applies)
    cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, a_gpu.gpudata, n, zero, b_gpu.gpudata, n)
    cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, b_gpu.gpudata, n, zero, c_gpu.gpudata, n)
    # trace(A^3) counts each triangle six times
    trace = linalg.trace(c_gpu, h)
    cublas.cublasDestroy(h)
    context.detach()
    return int(trace/6)
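
A hedged usage sketch (imports assumed to match the snippet's globals; the function expects `(row_idx, neighbor_list)` pairs): a single triangle on vertices 0-1-2 should yield a count of 1, since trace(A^3) counts each triangle six times.

import numpy as np
import pycuda.driver as driver
import pycuda.tools as tools
import pycuda.gpuarray as gpuarray
from skcuda import cublas, linalg

adjacency_list = [(0, [1, 2]), (1, [0, 2]), (2, [0, 1])]
print(count_triangles_cublas(adjacency_list))  # expected: 1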
Code Example #10
def compute_gflops(precision='S'):
    if precision == 'S':
        float_type = 'float32'
    elif precision == 'D':
        float_type = 'float64'
    else:
        return -1

    # some random matrices of the appropriate precision, used for timing
    A = np.random.randn(m, k).astype(float_type)
    B = np.random.randn(k, n).astype(float_type)
    C = np.random.randn(m, n).astype(float_type)

    # as before: make column-major copies for cuBLAS
    A_cm = A.T.copy()
    B_cm = B.T.copy()
    C_cm = C.T.copy()
    A_gpu = gpuarray.to_gpu(A_cm)
    B_gpu = gpuarray.to_gpu(B_cm)
    C_gpu = gpuarray.to_gpu(C_cm)
    alpha = np.random.randn()
    beta = np.random.randn()
    transa = cublas._CUBLAS_OP['N']
    transb = cublas._CUBLAS_OP['N']
    lda = m
    ldb = k
    ldc = m

    t = time()
    handle = cublas.cublasCreate()

    # two different (relevant) precision modes, D and S
    # dispatch to cublasSgemm or cublasDgemm depending on precision
    gemm = getattr(cublas, 'cublas%sgemm' % precision)
    gemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda,
         B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)

    cublas.cublasDestroy(handle)
    t = time() - t

    # a total of 2kmn - mn + 3mn = 2kmn + 2mn = 2mn(k+1) floating point operations in a given GEMM operation
    gflops = 2 * m * n * (k + 1) * (10**-9) / t

    return gflops
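
Both `compute_gflops` variants (Examples #8 and #10) read `m`, `n`, `k` and their imports from module scope; a minimal driver sketch under that assumption (the dimensions here are illustrative):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas
from time import time

m, n, k = 2048, 2048, 2048  # module-level globals assumed by compute_gflops

if __name__ == '__main__':
    print('SGEMM: %0.2f GFLOPS' % compute_gflops('S'))
    print('DGEMM: %0.2f GFLOPS' % compute_gflops('D'))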
Code Example #11
    def matmul(self, mat, return_time=False):
        '''Matrix multiplication between two matrices'''

        # check dimensions first:
        if self.ncols != mat.nrows:
            raise ValueError("Dimensions {0} and {1} do not match.".format(
                self.ncols, mat.nrows))

        # move matrices to the GPU
        # cuBLAS assumes column-major (Fortran) layout, so we transpose the
        # row-major NumPy arrays to get the layout cuBLAS expects
        a_gpu = gpuarray.to_gpu(self.arr.T.copy())
        b_gpu = gpuarray.to_gpu(mat.arr.T.copy())
        c_gpu = gpuarray.to_gpu(
            np.zeros((self.nrows, mat.ncols)).astype(np.float32).T.copy())

        # initialize cuBLAS context
        h = cublasCreate()

        # evaluate the matrix multiplication
        # cuBLAS syntax is as follows:
        # h = handle, "n": op(A) = A (for op(A) = A^T, use "t"),
        # then m, n, k for an (m x k) times (k x n) product,
        # then a scalar multiplying the product (set to 1),
        # then each array's data followed by its leading dimension,
        # and a final scalar for adding another matrix (set to 0).
        # also record the time it takes for the evaluation
        t0 = time.perf_counter_ns()
        cublasSgemm(h, "n", "n", self.nrows, mat.ncols, self.ncols,
                    np.float32(1.0), a_gpu.gpudata, self.nrows, b_gpu.gpudata,
                    mat.nrows, np.float32(0.0), c_gpu.gpudata, self.nrows)
        t1 = time.perf_counter_ns()

        eval_time = (t1 - t0) * (1e-9)  # time for each matmul evaluation

        # move from device to host
        prod_arr = c_gpu.get().T

        # free allocated memory for handle
        cublasDestroy(h)

        return (cublasMatrix(prod_arr),
                eval_time) if return_time else cublasMatrix(prod_arr)
Code Example #12
File: gpu_test9.py  Project: eretana/hera_sandbox
def vis_gpu(antpos,
            freq,
            eq2tops,
            crd_eq,
            I_sky,
            bm_cube,
            nthreads=NTHREADS,
            max_memory=MAX_MEMORY,
            real_dtype=np.float32,
            complex_dtype=np.complex64,
            verbose=False):
    # ensure shapes
    nant = antpos.shape[0]
    assert (antpos.shape == (nant, 3))
    npix = crd_eq.shape[1]
    assert (crd_eq.shape == (3, npix))
    assert (I_sky.shape == (npix, ))
    beam_px = bm_cube.shape[1]
    assert (bm_cube.shape == (nant, beam_px, beam_px))
    ntimes = eq2tops.shape[0]
    assert (eq2tops.shape == (ntimes, 3, 3))
    # ensure data types
    antpos = antpos.astype(real_dtype)
    eq2tops = eq2tops.astype(real_dtype)
    crd_eq = crd_eq.astype(real_dtype)
    Isqrt = np.sqrt(I_sky).astype(real_dtype)
    bm_cube = bm_cube.astype(real_dtype)  # XXX complex?
    chunk = max(min(npix, MIN_CHUNK),
                2**int(ceil(np.log2(float(nant * npix) / max_memory / 2))))
    npixc = npix // chunk
    # blocks of threads are mapped to (pixels,ants,freqs)
    block = (max(1, nthreads // nant), min(nthreads, nant), 1)
    grid = (int(ceil(npixc / float(block[0]))),
            int(ceil(nant / float(block[1]))))
    gpu_code = GPU_TEMPLATE % {
        'NANT': nant,
        'NPIX': npixc,
        'BEAM_PX': beam_px,
        'BLOCK_PX': block[0],
    }
    gpu_module = compiler.SourceModule(gpu_code)
    bm_interp = gpu_module.get_function("InterpolateBeam")
    meas_eq = gpu_module.get_function("MeasEq")
    bm_texref = gpu_module.get_texref("bm_tex")
    import pycuda.autoinit
    h = cublasCreate()  # handle for managing cublas
    # define GPU buffers and transfer initial values
    bm_texref.set_array(
        numpy3d_to_array(bm_cube)
    )  # never changes, transpose happens in copy so cuda bm_tex is (BEAM_PX,BEAM_PX,NANT)
    antpos_gpu = gpuarray.to_gpu(
        antpos)  # never changes, set to -2*pi*antpos/c
    Isqrt_gpu = gpuarray.empty(shape=(npixc, ), dtype=real_dtype)
    A_gpu = gpuarray.empty(shape=(nant, npixc),
                           dtype=real_dtype)  # will be set on GPU by bm_interp
    crd_eq_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype)
    eq2top_gpu = gpuarray.empty(shape=(3, 3),
                                dtype=real_dtype)  # sent from CPU each time
    crdtop_gpu = gpuarray.empty(shape=(3, npixc),
                                dtype=real_dtype)  # will be set on GPU
    tau_gpu = gpuarray.empty(shape=(nant, npixc),
                             dtype=real_dtype)  # will be set on GPU
    v_gpu = gpuarray.empty(shape=(nant, npixc),
                           dtype=complex_dtype)  # will be set on GPU
    vis_gpus = [
        gpuarray.empty(shape=(nant, nant), dtype=complex_dtype)
        for i in range(chunk)
    ]
    # output CPU buffers for downloading answers
    vis_cpus = [
        np.empty(shape=(nant, nant), dtype=complex_dtype)
        for i in range(chunk)
    ]
    streams = [driver.Stream() for i in range(chunk)]
    event_order = ('start', 'upload', 'eq2top', 'tau', 'interpolate',
                   'meas_eq', 'vis', 'end')
    vis = np.empty((ntimes, nant, nant), dtype=complex_dtype)
    for t in range(ntimes):
        if verbose: print('%d/%d' % (t + 1, ntimes))
        eq2top_gpu.set(
            eq2tops[t])  # defines sky orientation for this time step
        events = [{e: driver.Event()
                   for e in event_order} for i in range(chunk)]
        for c in range(chunk + 2):
            cc = c - 1
            ccc = c - 2
            if 0 <= ccc < chunk:
                stream = streams[ccc]
                vis_gpus[ccc].get_async(ary=vis_cpus[ccc], stream=stream)
                events[ccc]['end'].record(stream)
            if 0 <= cc < chunk:
                stream = streams[cc]
                cublasSetStream(h, stream.handle)
                ## compute crdtop = dot(eq2top,crd_eq)
                # cublas arrays are in Fortran order, so P = M*N is actually
                # performed as P.T = N.T * M.T
                cublasSgemm(h, 'n', 'n', npixc, 3, 3, 1., crd_eq_gpu.gpudata,
                            npixc, eq2top_gpu.gpudata, 3, 0.,
                            crdtop_gpu.gpudata, npixc)
                events[cc]['eq2top'].record(stream)
                ## compute tau = dot(antpos,crdtop)
                cublasSgemm(h, 'n', 'n', npixc, nant, 3, 1.,
                            crdtop_gpu.gpudata, npixc, antpos_gpu.gpudata, 3,
                            0., tau_gpu.gpudata, npixc)
                events[cc]['tau'].record(stream)
                ## interpolate bm_tex at specified topocentric coords, store interpolation in A
                ## threads are parallelized across pixel axis
                bm_interp(crdtop_gpu,
                          A_gpu,
                          grid=grid,
                          block=block,
                          stream=stream)
                events[cc]['interpolate'].record(stream)
                # compute v = A * I * exp(1j*tau*freq)
                meas_eq(A_gpu,
                        Isqrt_gpu,
                        tau_gpu,
                        real_dtype(freq),
                        v_gpu,
                        grid=grid,
                        block=block,
                        stream=stream)
                events[cc]['meas_eq'].record(stream)
                # compute vis = dot(v, v.T)
                # transpose below incurs about 20% overhead
                cublasCgemm(h, 'c', 'n', nant, nant, npixc, 1., v_gpu.gpudata,
                            npixc, v_gpu.gpudata, npixc, 0.,
                            vis_gpus[cc].gpudata, nant)
                events[cc]['vis'].record(stream)
            if c < chunk:
                stream = streams[c]
                events[c]['start'].record(stream)
                crd_eq_gpu.set_async(crd_eq[:, c * npixc:(c + 1) * npixc],
                                     stream=stream)
                Isqrt_gpu.set_async(Isqrt[c * npixc:(c + 1) * npixc],
                                    stream=stream)
                events[c]['upload'].record(stream)
        events[chunk - 1]['end'].synchronize()
        vis[t] = sum(vis_cpus)
        if verbose:
            for c in range(chunk):
                print('%d:%d START->END:' % (c, chunk),
                      events[c]['start'].time_till(events[c]['end']) * 1e-3)
                #for i,e in enumerate(event_order[:-1]):
                #    print(c, e, '->', event_order[i+1], ':', events[c][e].time_till(events[c][event_order[i+1]]) * 1e-3)
            print('TOTAL:', events[0]['start'].time_till(
                events[chunk - 1]['end']) * 1e-3)
    # teardown GPU configuration
    cublasDestroy(h)
    return vis
Code Example #13
 def _shutdown_gpucsrarray():
     cublas.cublasDestroy(cublas_handle)
     cusparse.cusparseDestroy(cusparse_handle)
Code Example #14
import pycuda.autoinit
from pycuda import gpuarray
import numpy as np

from skcuda import cublas

a = np.float32(10)
x = np.float32([1, 2, 3])
y = np.float32([-.345, 8.15, -15.867])

x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

cublas_context_h = cublas.cublasCreate()

cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1, y_gpu.gpudata, 1)

print('This is close to the NumPy approximation: %s' % np.allclose(
    a * x + y, y_gpu.get()))

w_gpu = gpuarray.to_gpu(x)
v_gpu = gpuarray.to_gpu(y)

#perform a dot product
dot_output = cublas.cublasSdot(cublas_context_h, v_gpu.size, v_gpu.gpudata, 1,
                               w_gpu.gpudata, 1)

print(dot_output)

l2_output = cublas.cublasSnrm2(cublas_context_h, v_gpu.size, v_gpu.gpudata, 1)

print(l2_output)

cublas.cublasDestroy(cublas_context_h)

# If we want to operate on arrays of 64-bit real floating-point values (float64 in NumPy and PyCUDA), then we would use cublasDaxpy.
"""Level-2 GEMV (general matrix-vector)"""

# m and n are the number of rows and columns
m = 10
n = 100
# alpha is the floating-point value for α
alpha = 1
# beta is the floating-point value for β
beta = 0
# set alpha to 1 and beta to 0 to get a direct matrix multiplication with no scaling
A = np.random.rand(m, n).astype('float32')
x = np.random.rand(n).astype('float32')
y = np.zeros(m).astype('float32')
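
The excerpt stops before the actual GEMV call; a sketch of the missing step under the same conventions as the earlier examples (copy A in column-major order, then compute y = alpha*A@x + beta*y):

A_cm = A.T.copy()  # column-major (Fortran-order) copy for cuBLAS
A_gpu = gpuarray.to_gpu(A_cm)
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

handle = cublas.cublasCreate()
cublas.cublasSgemv(handle, 'n', m, n, np.float32(alpha), A_gpu.gpudata, m,
                   x_gpu.gpudata, 1, np.float32(beta), y_gpu.gpudata, 1)
cublas.cublasDestroy(handle)

print(np.allclose(y_gpu.get(), np.dot(A, x)))  # expected: True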
Code Example #15
 def tearDownClass(cls):
     cublas.cublasDestroy(cls.cublas_handle)
Code Example #16
File: gpucsrarray.py  Project: thejonan/binet
 def _shutdown_gpucsrarray():
     cublas.cublasDestroy(cublas_handle)
     cusparse.cusparseDestroy(cusparse_handle)
Code Example #17
def register_multiple_images_subpix_cuda(stack, template):

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as drv
    import pycuda.cumath as cumath
    import skcuda.fft as cu_fft
    import skcuda.linalg as lin
    import skcuda.cublas as cub
    from numpy import pi, newaxis, floor
    import cmath
    from pycuda.elementwise import ElementwiseKernel
    from pycuda.compiler import SourceModule

    from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix
    from numpy.fft import fftshift, ifftshift
    fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1,
                                                   use_numpy_fft=False)

    mod = SourceModule("""
   #include <pycuda-complex.hpp>"
   
    __global__ void load_convert(unsigned short *a, float *b,int f, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        int offset = f * imlen;
        if (idx <imlen)
        {
            b[idx] = (float)a[offset+idx];
        }
    }
        
    __global__ void convert_export(float *a, unsigned short *b,int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ;
        }
    }
        
    __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            z[idx] = x[idx] * y[idx];
        }
    }
        
    __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = x[idx]._M_re;
            y[idx]._M_im = -x[idx]._M_im;
        }
    }
        
        
    __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = 0;
            y[idx]._M_im = x[idx] * sx;
        }
    }
        
    __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen,  int nlargeh, int nh)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        int offset = imlenl*3/4;
        if (idx<imlen)
        {
            int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl;
            x[target_ind] = y[idx];
        }      
    
    }    
        
    __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            z[idx]._M_re = 0;
            z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg;
        }
    }
        
    __global__ void sub_float(float *x, float *y, float sv,  int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            x[idx] = y[idx]-sv;
        }
    }
        

    """)

    load_convert_kernel = mod.get_function('load_convert')
    convert_export_kernel = mod.get_function('convert_export')
    convert_multiply_kernel = mod.get_function('convert_multiply')
    multiply_float_kernel = mod.get_function('multiply_comp_float')
    transfer_array_kernel = mod.get_function('transfer_array')
    calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix')
    conj_kernel = mod.get_function('calc_conj')
    sub_float_kernel = mod.get_function('sub_float')

    Z = stack.shape[0]
    M = stack.shape[1]
    N = stack.shape[2]
    max_memsize = 4200000000

    imlen = M * N
    half_imlen = M * (N // 2 + 1)
    grid_dim = (64, int(imlen / (512 * 64)) + 1, 1)
    block_dim = (512, 1, 1)  #512 threads per block

    stack_bin = int(max_memsize / (M * N * stack.itemsize))
    stack_ite = int(Z / stack_bin) + 1

    usfac = 100  ## needs to be bigger than 10

    if not template.shape == stack.shape[1:]:
        raise ValueError("Images must have same shape.")

    if np.any(np.isnan(template)):
        template = template.copy()
        template[template != template] = 0
    if np.any(np.isnan(stack)):
        stack = stack.copy()
        stack[stack != stack] = 0

    mlarge = M * 2
    nlarge = N * 2

    t = time.time()

    plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64)
    plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32)
    plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32)
    cub_h = cub.cublasCreate()

    template_gpu = gpuarray.to_gpu(template.astype('float32'))
    source_gpu = gpuarray.empty((M, N), np.float32)
    ifft_gpu = gpuarray.empty((M, N), np.float32)
    result_gpu = gpuarray.empty((M, N), np.uint16)

    templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64)

    cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True)
    templatef_gpu = templatef_gpu.conj()

    move_list = np.zeros((Z, 2))

    largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64)
    largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32)
    imlenl = mlarge * (nlarge // 2 + 1)

    zoom_factor = 1.5
    dftshiftG = trunc(ceil(usfac * zoom_factor) / 2)
    # Center of output array at dftshift+1
    upsample_dim = int(ceil(usfac * zoom_factor))

    term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)).
              T[:, newaxis]) / N  # fftfreq  # output points
    term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :]
    term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis]
    term2r = (ifftshift(np.arange(M, dtype='float')) -
              floor(M / 2))[newaxis, :]  # fftfreq
    term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) +
                                             1), :].astype('float32'))
    term2c_gpu = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu = gpuarray.to_gpu(term1r.astype('float32'))
    term2r_gpu = gpuarray.to_gpu(term2r.astype('float32'))
    term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32'))

    kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32)
    kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32)
    kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64)
    kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64)

    Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M))
    Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N))
    [Nc, Nr] = np.meshgrid(Nc, Nr)
    Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32'))
    Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32'))

    upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64)
    upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64)

    source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype)
    copy = drv.Memcpy3D()
    copy.set_src_host(stack.data)
    copy.set_dst_device(source_stack.gpudata)
    copy.width_in_bytes = copy.src_pitch = stack.strides[1]
    copy.src_height = copy.height = M

    for zb in range(stack_ite):

        zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z))
        copy.depth = len(zrange)
        copy.src_z = int(zrange[0])
        copy()

        for i in range(len(zrange)):

            t = zb * stack_bin + i
            load_convert_kernel(source_stack,
                                source_gpu.gpudata,
                                np.int32(i),
                                np.int32(imlen),
                                block=block_dim,
                                grid=grid_dim)
            cu_fft.fft(source_gpu, sourcef_gpu, plan_forward, scale=True)

            multiply_float_kernel(sourcef_gpu,
                                  templatef_gpu,
                                  prod_gpu1,
                                  np.int32(half_imlen),
                                  block=block_dim,
                                  grid=grid_dim)
            transfer_array_kernel(largearray1_gpu,
                                  prod_gpu1,
                                  np.int32(imlenl),
                                  np.int32(half_imlen),
                                  np.int32(nlarge // 2 + 1),
                                  np.int32(N // 2 + 1),
                                  block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(largearray1_gpu,
                        largearray2_gpu,
                        plan_inverse_big,
                        scale=True)
            peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size,
                                       largearray2_gpu.gpudata, 1)
            rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape)

            md2 = trunc(mlarge / 2)
            nd2 = trunc(nlarge / 2)
            if rloc > md2:
                row_shift2 = rloc - mlarge
            else:
                row_shift2 = rloc
            if cloc > nd2:
                col_shift2 = cloc - nlarge
            else:
                col_shift2 = cloc
            row_shiftG = row_shift2 / 2.
            col_shiftG = col_shift2 / 2.

            # Initial shift estimate in upsampled grid

            row_shiftG0 = round(row_shiftG * usfac) / usfac
            col_shiftG0 = round(col_shiftG * usfac) / usfac
            # Matrix multiply DFT around the current shift estimate
            roffG = dftshiftG - row_shiftG0 * usfac
            coffG = dftshiftG - col_shiftG0 * usfac

            sub_float_kernel(term2c_gpu,
                             term2c_gpu_ori,
                             np.float32(coffG / usfac),
                             np.int32(term2c_gpu.size),
                             block=block_dim,
                             grid=grid_dim)
            sub_float_kernel(term1r_gpu,
                             term1r_gpu_ori,
                             np.float32(roffG),
                             np.int32(term1r_gpu.size),
                             block=block_dim,
                             grid=grid_dim)

            lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu)
            lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu)
            convert_multiply_kernel(kernc_gpu,
                                    kernc_gpuc,
                                    np.float32(-2 * pi),
                                    np.int32(kernc_gpu.size),
                                    block=block_dim,
                                    grid=grid_dim)
            convert_multiply_kernel(kernr_gpu,
                                    kernr_gpuc,
                                    np.float32(-2 * pi / (M * usfac)),
                                    np.int32(kernr_gpu.size),
                                    block=block_dim,
                                    grid=grid_dim)
            cumath.exp(kernc_gpuc, out=kernc_gpuc)
            cumath.exp(kernr_gpuc, out=kernr_gpuc)

            conj_kernel(prod_gpu1,
                        prod_gpu2,
                        np.int32(half_imlen),
                        block=block_dim,
                        grid=grid_dim)

            lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1)
            lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2)

            CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2)
            rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape)
            CCGmax = CCG[rlocG, clocG]

            rlocG = rlocG - dftshiftG  #+ 1 # +1 # questionable/failed hack + 1;
            clocG = clocG - dftshiftG  #+ 1 # -1 # questionable/failed hack - 1;
            row_shiftG = row_shiftG0 + rlocG / usfac
            col_shiftG = col_shiftG0 + clocG / usfac

            diffphaseG = arctan2(imag(CCGmax), real(CCGmax))

            # Compute registered version of source stack
            calc_shiftmatrix_kernel(Nr_gpu,
                                    Nc_gpu,
                                    shiftmatrix,
                                    np.float32(row_shiftG * 2 * np.pi),
                                    np.float32(col_shiftG * 2 * np.pi),
                                    np.float32(diffphaseG),
                                    np.int32(half_imlen),
                                    block=block_dim,
                                    grid=grid_dim)
            cumath.exp(shiftmatrix, out=shiftmatrix)
            multiply_float_kernel(sourcef_gpu,
                                  shiftmatrix,
                                  prod_gpu1,
                                  np.int32(half_imlen),
                                  block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse)
            convert_export_kernel(ifft_gpu,
                                  result_gpu,
                                  np.int32(imlen),
                                  block=block_dim,
                                  grid=grid_dim)

            move_list[t, :] = (row_shiftG, col_shiftG)
            stack[t, :, :] = result_gpu.get()

    cub.cublasDestroy(cub_h)
    return (stack, move_list)
Code Example #18
    def destroy_contexts(self):
        """ Destroy context """

        cublas.cublasDestroy(self.cublasContext)
        cudnn.cudnnDestroy(self.cudnnContext)
        self.cudaContext.pop()
Code Example #19
File: linalg.py  Project: neurokernel/retina
 def destroy(self):
     if self.handle is not None:
         cublas.cublasDestroy(self.handle)
Code Example #20
File: test_cublas.py  Project: lebedov/scikit-cuda
 def tearDownClass(cls):
     cublas.cublasDestroy(cls.cublas_handle)
     cls.ctx.pop()
     clear_context_caches()
Code Example #21
def cor_mat_3(BOLD, upper_tri, N, L, OOO):
	# compute available memory
	meminfo = cuda.mem_get_info()
	print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
	available_mem = float(meminfo[0])
	available_mem /= np.dtype(np.float32).itemsize
	available_mem -= N * L
	# print("Available memory: ", available_mem)

	# preprocessing fMRI data in CPU
	start_time = time.time()
	BOLD = preprocessing(BOLD, N, L)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time for preprocessing: ", delta, "\n")


	# transfer BOLD to the device
	start_time = time.time()
	BOLD_device = gpuarray.to_gpu(BOLD)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time matrices to device: ", delta, "\n")

	# compute available memory
	# meminfo = cuda.mem_get_info()
	# print("After BOLD_device free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))

	# initialize variables
	flag = 1
	ii=0
	upper_size = (N - 1) * N // 2
	block = OOO
	N_prime = N
	temp = 0
	temp2 = 0
	temp3 = 0
	pak = 0
	so_far = 0
	count = 1
	temp4 = 0

	alpha = np.float32(1.0)
	beta = np.float32(0.0)

	while flag == 1:
		print("##### ITERATION", count, "#####")
		# compute available memory
		# meminfo = cuda.mem_get_info()
		# print("After BOLD_device free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))

		# print("block: ", block)
		# print("N_prime: ", N_prime)
		# checking for the last chunk
		if block == N_prime:
			flag = 0

		if pak != 0:
			del dev_upper
			del result_device

		temp = block
		temp *= (block + 1)
		temp /= 2
		# M1 is the size of upper triangle part of chunk
		M1 = N_prime
		M1 *= block
		M1 -= temp

		M1 = int(M1)
		
		# print("M1: ", M1)
		
		pak += 1

		# print("so_far*L: ", so_far*L)
		start_time = time.time()
		result = np.zeros((block, N_prime), np.float32)
		# print("result shape: ", result.shape)

		BOLD_device = BOLD_device.reshape(-1)
		
		# allocate memory on the device for the result
		result_device = gpuarray.to_gpu(result)
		# print("result_device shape: ", result_device.shape)

		# # compute available memory
		# meminfo = cuda.mem_get_info()
		# print("Before cublasSgemm free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
		stop_time = time.time()
		delta = stop_time - start_time	
		print("Running time matrices to device: ", delta, "\n")

		start_time = time.time()
		h = cublas.cublasCreate()
		cublas.cublasSgemm(h,
				   'T',
				   'n',
				   block,
				   N_prime,
				   L,
				   alpha,
				   BOLD_device[so_far*L:].gpudata,
				   L,
				   BOLD_device[so_far*L:].gpudata,
				   L,
				   beta,
				   result_device.gpudata,
				   block)
		stop_time = time.time()
		delta = stop_time - start_time	
		print("Running time core function: ", delta, "\n")

		temp2 = block
		temp2 *= N_prime

		# compute available memory
		# meminfo = cuda.mem_get_info()
		# print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
		
		# result_device = gpuarray.to_gpu(result1)

		start_time = time.time()
		threads_per_block = 1024
		blocks_per_grid = 1 + math.ceil(((temp2-1) / threads_per_block))
		grid = (blocks_per_grid, 1)

		# print("temp2:", temp2)
		# print("threads_per_block: ", threads_per_block)
		# print("blocks_per_grid: ", blocks_per_grid)

		upper = np.zeros(M1, np.float32)
		# print("upper shape:", upper.shape)
		dev_upper = gpuarray.to_gpu(upper)

		# print("dev_upper shape: ", dev_upper.shape)
		# print("result_device shape: ", result_device.shape)

		mod = pycuda.compiler.SourceModule("""
			__global__ void ker2(float * cormat, float * upper,int n1,int n,long long upper_size,int N,int i_so_far,long long M1)
			{
				long long idx = blockDim.x;
				idx*=blockIdx.x;
				idx+=threadIdx.x;
				long i = idx/n;
				long j = idx%n;

				if(i<j && i<n1 && j<n)// &&i<N &&j<N && idx<(n1*n))
				{
				        long long tmp=i;
				        tmp*=(i+1);
				        tmp/=2;
				        long long tmp_2=i;
				        tmp_2*=n;
				        tmp_2=tmp_2-tmp;
				        tmp_2+=j;
				        tmp_2-=i;
				        long long indexi=n1;
				        indexi*=j;
				        indexi=indexi+i;
				        upper[tmp_2-1]=cormat[indexi];
				}
			}
  			""")
		# compute available memory
		# meminfo = cuda.mem_get_info()
		# print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
		
		funct = mod.get_function("ker2")
		funct(result_device, 
			  dev_upper, 
			  np.int32(block), 
			  np.int32(N_prime),
			  np.int64(upper_size),
			  np.int32(N),
			  np.int32(ii), 
			  np.int64(M1),       
			  block=(threads_per_block, 1, 1),
              grid=grid
              )

		temp3+=M1
		# print("upper_tri shape:", upper_tri.shape)
		upper_tri[temp4:temp3] = dev_upper.get()
		stop_time = time.time()
		delta = stop_time - start_time	
		print("Running time to get upper tri: ", delta, "\n")

		temp4 += M1
		ii += block

		cublas.cublasDestroy(h)

		so_far += block

		if N_prime > block:
			N_prime = N_prime - block
			block = remaining_N2(N_prime, L, available_mem)
			if N_prime < block:
				block = N_prime

		count += 1

	# free the memory
	del BOLD_device
	del result_device
	del dev_upper

	# compute available memory
	# meminfo = cuda.mem_get_info()
	# print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))

	return upper_tri
Code Example #22
 def tearDownClass(cls):
     cublas.cublasDestroy(cls.cublas_handle)
     cls.ctx.pop()
     clear_context_caches()
Code Example #23
import pycuda.autoinit
from pycuda import gpuarray
import numpy as np

from skcuda import cublas

a = np.float32(10)
x = np.float32([1, 2, 3])

y = np.float32([-.345, 8.15, -15.867])

x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

cublas_context_h = cublas.cublasCreate()

cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1, y_gpu.gpudata, 1)

cublas.cublasDestroy(cublas_context_h)

print("This is close to the NumPy approximation: {}".format(np.allclose(a * x + y, y_gpu.get())))
Code Example #24
def cor_mat_2(BOLD, upper_tri, N, L):
	# preprocessing fMRI data in CPU
	start_time = time.time()
	BOLD = preprocessing(BOLD, N, L)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time for preprocessing: ", delta, "\n")
	
	alpha = np.float32(1.0)
	beta = np.float32(0.0)

	# transfer to the device
	start_time = time.time()
	BOLD_device = gpuarray.to_gpu(BOLD)
	result = np.zeros((BOLD.shape[0], BOLD.shape[0]), np.float32)
	result_device = gpuarray.to_gpu(result)
	# print("BOLD_device shape:", BOLD_device.shape)
	# print("result_device shape:", result_device.shape)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time matrices to device: ", delta, "\n")

	start_time = time.time()
	h = cublas.cublasCreate()
	cublas.cublasSgemm(h,
					   'T',
					   'n',
					   N,
					   N,
					   L,
					   alpha,
					   BOLD_device.gpudata,
					   L,
					   BOLD_device.gpudata,
					   L,
					   beta,
					   result_device.gpudata,
					   N)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time core function: ", delta, "\n")

	start_time = time.time()
	threads_per_block = 1024
	blocks_per_grid = int(math.ceil(1 + ((N*N - 1) / threads_per_block)))
	mod = pycuda.compiler.SourceModule("""
		__global__ void ker(float * cormat, float * upper,int n1,int n)
		{
			long idx = blockDim.x*blockIdx.x+threadIdx.x;
			long i = idx%n1;
			long j = idx/n1;
			if(i<j && i<n1 && j<n)
			{
		        long tmp=i;
		        tmp*=(i+1);
		        tmp/=2;
		        long tmp_2=i;
		        tmp_2*=n;
		        tmp_2=tmp_2-tmp;
		        tmp_2+=j;
		        tmp_2-=i;
		        upper[tmp_2-1]=cormat[j*n+i];
			}
		}
		""")
	result_device = result_device.reshape(-1)
	# print("result device shape:", result_device.shape)
	upper_tri_device = gpuarray.to_gpu(upper_tri)
	funct = mod.get_function("ker")
	funct(result_device, 
		  upper_tri_device, 
		  np.int32(N),
		  np.int32(N),    
		  block=(threads_per_block, 1, 1),
          grid=(blocks_per_grid, 1)
          )
	upper_tri = upper_tri_device.get()
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time to get upper tri: ", delta, "\n")
	
	cublas.cublasDestroy(h)
	
	return upper_tri
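
`preprocessing` is not shown in either correlation example. For the `cublasSgemm` of the preprocessed BOLD with itself to produce Pearson correlations, each row must be mean-centered and scaled; a hypothetical stand-in under that assumption:

import numpy as np

def preprocessing(BOLD, N, L):
    # Hypothetical stand-in (not the original): z-score each of the N
    # time series and divide by sqrt(L), so that BOLD @ BOLD.T equals
    # the Pearson correlation matrix computed by cublasSgemm above.
    BOLD = BOLD.reshape(N, L).astype(np.float32)
    BOLD -= BOLD.mean(axis=1, keepdims=True)
    BOLD /= BOLD.std(axis=1, keepdims=True) * np.sqrt(L)
    return BOLD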
Code Example #25
File: redcal.py  Project: HERA-Team/hera_gpu
def omnical(ggu_indices,
            gains,
            ubls,
            data,
            wgts,
            conv_crit=1e-10,
            maxiter=50,
            check_every=4,
            check_after=1,
            gain=0.3,
            nthreads=NTHREADS,
            precision=1,
            verbose=False):
    '''CPU-side function for organizing GPU-acceleration primitives into
    a cohesive omnical algorithm.
        
    Args:
        ggu_indices: (nbls,3) array of (i,j,k) indices denoting data order
                     as gains[i] * gains[j].conj() * ubl[k]
        gains: (ndata, nants) array of estimated complex gains
        ubls: (ndata, nubls) array of estimated complex unique baselines
        data: (ndata, nbls) array of data to be calibrated
        wgts: (ndata, nbls) array of weights for each data
        conv_crit: maximum allowed relative change in solutions to be 
            considered converged
        maxiter: maximum number of omnical iterations allowed before it
            gives up.
        check_every: Compute convergence every Nth iteration (saves 
            computation).  Default 4.
        check_after: Start computing convergence only after N iterations.  
            Default 1.
        gain: The fractional step made toward the new solution each 
            iteration. Values in the range 0.1 to 0.5 are generally safe.  
            Increasing values trade speed for stability.  Default is 0.3.
        nthreads: Number of GPU threads to use. Default NTHREADS=1024
        precision: 1=float32, 2=double (float64). Default 1.
        verbose: If True, print things. Default False.

    Returns:
        info dictionary of {'gains':gains, 'ubls':ubls, 'chisq':chisq, 
            'iters':iters, 'conv':conv}
    '''
    # Sanity check input array dimensions
    nbls = ggu_indices.shape[0]
    assert ggu_indices.shape == (nbls, 3)
    ndata = data.shape[0]
    assert data.shape == (ndata, nbls)
    assert wgts.shape == (ndata, nbls)
    nants = gains.shape[1]
    assert gains.shape == (ndata, nants)
    nubls = ubls.shape[1]
    assert ubls.shape == (ndata, nubls)
    assert precision in (1, 2)
    assert check_every > 1
    if verbose:
        print('PRECISION:', precision)
        print('NDATA:', ndata)
        print('NANTS:', nants)
        print('NUBLS:', nubls)
        print('NBLS:', nbls)

    # Choose between float/double primitives
    if precision == 1:
        real_dtype, complex_dtype = np.float32, np.complex64
        DTYPE, CDTYPE = 'float', 'cuFloatComplex'
        CMULT, CONJ, CSUB = 'cuCmulf', 'cuConjf', 'cuCsubf'
        COPY = cublasCcopy
    else:
        real_dtype, complex_dtype = np.float64, np.complex128
        DTYPE, CDTYPE = 'double', 'cuDoubleComplex'
        CMULT, CONJ, CSUB = 'cuCmul', 'cuConj', 'cuCsub'
        COPY = cublasZcopy

    # ensure data types
    ggu_indices = ggu_indices.astype(np.uint32)
    data = data.astype(complex_dtype)
    wgts = wgts.astype(real_dtype)
    gains = gains.astype(complex_dtype)
    ubls = ubls.astype(complex_dtype)

    # Profiling info used to determine optimal chunk size:
    # Model: 15.3 s + ndata/140 => px=8192 in 75 s on Quadro T2000
    # px=2000, nants=126, precision=2
    #chunk_size = min(nthreads // 2, ndata) # 8.5 s
    #chunk_size = min(nthreads // 4, ndata) # 6.7 s
    #chunk_size = min(nthreads // 8, ndata) # 5.6 s
    #chunk_size = min(nthreads // 16, ndata) # 5.2 s
    #chunk_size = min(nthreads // 32, ndata) # 5.4 s
    #chunk_size = min(nthreads // 64, ndata) # 5.9 s
    # px=2000, nants=54, precision=2
    #chunk_size = min(nthreads // 2, ndata) # 1.5 s
    #chunk_size = min(nthreads // 4, ndata) # 1.3 s
    #chunk_size = min(nthreads // 8, ndata) # 1.3 s
    #chunk_size = min(nthreads // 16, ndata) # 1.5 s
    #chunk_size = min(nthreads // 32, ndata) # 1.9 s
    #chunk_size = min(nthreads // 64, ndata) # 3.0 s
    # px=2000, nants=180, precision=2
    #chunk_size = min(nthreads // 2, ndata) # 16.8 s
    #chunk_size = min(nthreads // 4, ndata) # 13.8 s
    #chunk_size = min(nthreads // 8, ndata) # 11.1 s
    #chunk_size = min(nthreads // 16, ndata) # 9.9 s
    #chunk_size = min(nthreads // 32, ndata) # 10.0 s
    #chunk_size = min(nthreads // 64, ndata) # 10.14 s
    # px=2000, nants=180, precision=1
    #chunk_size = min(nthreads // 2, ndata) # 7.6 s
    #chunk_size = min(nthreads // 4, ndata) # 8.3 s
    #chunk_size = min(nthreads // 8, ndata) # 7.4 s
    #chunk_size = min(nthreads // 16, ndata) # 7.3 s
    #chunk_size = min(nthreads // 32, ndata) # 7.4 s
    #chunk_size = min(nthreads // 64, ndata) # 7.5 s
    # Upshot: nthreads // 16 seems to cover most cases, no need to tune.

    # Build the CUDA code
    gpu_code = GPU_TEMPLATE.format(
        **{
            'NBLS': nbls,
            'NUBLS': nubls,
            'NANTS': nants,
            'GAIN': gain,
            'CMULT': CMULT,
            'CONJ': CONJ,
            'CSUB': CSUB,
            'DTYPE': DTYPE,
            'CDTYPE': CDTYPE,
        })
    # Extract functions from CUDA, suffix _cuda indicates GPU operation
    gpu_module = compiler.SourceModule(gpu_code)
    gen_dmdl_cuda = gpu_module.get_function("gen_dmdl")
    calc_chisq_cuda = gpu_module.get_function("calc_chisq")
    calc_dwgts_cuda = gpu_module.get_function("calc_dwgts")
    calc_gu_wgt_cuda = gpu_module.get_function("calc_gu_wgt")
    calc_gu_buf_cuda = gpu_module.get_function("calc_gu_buf")
    clear_complex_cuda = gpu_module.get_function("clear_complex")
    set_val_real_cuda = gpu_module.get_function("set_val_real")
    set_val_uint_cuda = gpu_module.get_function("set_val_uint")
    update_gains_cuda = gpu_module.get_function("update_gains")
    update_ubls_cuda = gpu_module.get_function("update_ubls")
    calc_conv_cuda = gpu_module.get_function("calc_conv")
    update_active_cuda = gpu_module.get_function("update_active")

    h = cublasCreate()  # handle for managing cublas, used for buffer copies

    # define GPU buffers, suffix _g indicates GPU buffer
    chunk_size = min(nthreads // 16, ndata)
    data_chunks = 1
    ANT_SHAPE = (chunk_size, nants)
    UBL_SHAPE = (chunk_size, nubls)
    BLS_SHAPE = (chunk_size, nbls)
    block = (chunk_size, int(np.floor(nthreads / chunk_size)), 1)
    ant_grid = (data_chunks, int(np.ceil(nants / block[1])))
    ubl_grid = (data_chunks, int(np.ceil(nubls / block[1])))
    bls_grid = (data_chunks, int(np.ceil(nbls / block[1])))
    conv_grid = (data_chunks, int(np.ceil(max(nants, nubls) / block[1])))
    if verbose:
        print('GPU block:', block)
        print('ANT grid:', ant_grid)
        print('UBL grid:', ubl_grid)
        print('BLS grid:', bls_grid)

    ggu_indices_g = gpuarray.empty(shape=ggu_indices.shape, dtype=np.uint32)
    active_g = gpuarray.empty(shape=(chunk_size, ), dtype=np.uint32)
    iters_g = gpuarray.empty(shape=(chunk_size, ), dtype=np.uint32)

    gains_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    new_gains_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    gbuf_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    gwgt_g = gpuarray.empty(shape=ANT_SHAPE, dtype=real_dtype)

    ubls_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    new_ubls_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    ubuf_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    uwgt_g = gpuarray.empty(shape=UBL_SHAPE, dtype=real_dtype)

    data_g = gpuarray.empty(shape=BLS_SHAPE, dtype=complex_dtype)
    dmdl_g = gpuarray.empty(shape=BLS_SHAPE, dtype=complex_dtype)
    wgts_g = gpuarray.empty(shape=BLS_SHAPE, dtype=real_dtype)
    dwgts_g = gpuarray.empty(shape=BLS_SHAPE, dtype=real_dtype)

    chisq_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    new_chisq_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_sum_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_wgt_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)

    # Define return buffers
    chisq = np.empty((ndata, ), dtype=real_dtype)
    conv = np.empty((ndata, ), dtype=real_dtype)
    iters = np.empty((ndata, ), dtype=np.uint32)
    active = np.empty((chunk_size, ), dtype=np.uint32)

    # upload data indices
    ggu_indices_g.set_async(ggu_indices)

    # initialize structures used to time code
    event_order = ('start', 'upload', 'dmdl', 'calc_chisq', 'loop_top',
                   'calc_gu_wgt', 'calc_gu_buf', 'copy_gains', 'copy_ubls',
                   'update_gains', 'update_ubls', 'dmdl2', 'chisq2',
                   'calc_conv', 'update_active', 'get_active', 'end')
    event_pairs = list((event_order[i], event_order[i + 1])
                       for i in range(len(event_order[:-1])))
    cum_time = {}
    t0 = time.time()

    # Loop over chunks of parallel omnical problems
    for px in range(0, ndata, chunk_size):
        events = {e: driver.Event() for e in event_order}
        events['start'].record()

        end = min(ndata, px + chunk_size)
        beg = end - chunk_size
        offset = px - beg
        gains_g.set_async(gains[beg:end])
        ubls_g.set_async(ubls[beg:end])
        data_g.set_async(data[beg:end])
        wgts_g.set_async(wgts[beg:end])
        active = np.ones((chunk_size, ), dtype=np.uint32)
        if offset > 0:
            active[:offset] = 0
        active_g.set_async(active)
        set_val_real_cuda(conv_sum_g,
                          np.uint32(chunk_size),
                          real_dtype(0),
                          block=(chunk_size, 1, 1),
                          grid=(1, 1))
        set_val_real_cuda(conv_wgt_g,
                          np.uint32(chunk_size),
                          real_dtype(0),
                          block=(chunk_size, 1, 1),
                          grid=(1, 1))
        events['upload'].record()

        gen_dmdl_cuda(ggu_indices_g,
                      gains_g,
                      ubls_g,
                      dmdl_g,
                      active_g,
                      block=block,
                      grid=bls_grid)
        events['dmdl'].record()

        set_val_real_cuda(chisq_g,
                          np.uint32(chunk_size),
                          real_dtype(0),
                          block=(chunk_size, 1, 1),
                          grid=(1, 1))
        calc_chisq_cuda(data_g,
                        dmdl_g,
                        wgts_g,
                        chisq_g,
                        active_g,
                        block=block,
                        grid=bls_grid)
        events['calc_chisq'].record()

        if TIME_IT:
            events['calc_chisq'].synchronize()
            for (e1, e2) in event_pairs[:3]:
                cum_time[(e1,e2)] = cum_time.get((e1,e2), 0) + \
                                    events[e2].time_since(events[e1])

        # Loop over iterations within an omnical problem
        for i in range(1, maxiter + 1):
            events['loop_top'].record()

            if (i % check_every) == 1:
                # Per standard omnical algorithm, only update gwgt/uwgt
                # every few iterations to save compute.
                calc_dwgts_cuda(dmdl_g,
                                wgts_g,
                                dwgts_g,
                                active_g,
                                block=block,
                                grid=bls_grid)
                set_val_real_cuda(
                    gwgt_g,
                    np.uint32(nants * chunk_size),
                    real_dtype(0),
                    block=(nthreads, 1, 1),
                    grid=(int(np.ceil(nants * chunk_size / nthreads)), 1))
                set_val_real_cuda(
                    uwgt_g,
                    np.uint32(nubls * chunk_size),
                    real_dtype(0),
                    block=(nthreads, 1, 1),
                    grid=(int(np.ceil(nubls * chunk_size / nthreads)), 1))
                calc_gu_wgt_cuda(ggu_indices_g,
                                 dmdl_g,
                                 dwgts_g,
                                 gwgt_g,
                                 uwgt_g,
                                 active_g,
                                 block=block,
                                 grid=bls_grid)
            events['calc_gu_wgt'].record()

            clear_complex_cuda(gbuf_g,
                               np.uint32(nants * chunk_size),
                               block=(nthreads, 1, 1),
                               grid=(int(np.ceil(nants * chunk_size /
                                                 nthreads)), 1))
            clear_complex_cuda(ubuf_g,
                               np.uint32(nubls * chunk_size),
                               block=(nthreads, 1, 1),
                               grid=(int(np.ceil(nubls * chunk_size /
                                                 nthreads)), 1))
            # This is 75% of the compute load
            calc_gu_buf_cuda(ggu_indices_g,
                             data_g,
                             dwgts_g,
                             dmdl_g,
                             gbuf_g,
                             ubuf_g,
                             active_g,
                             block=block,
                             grid=bls_grid)
            events['calc_gu_buf'].record()

            if (i < maxiter) and (i < check_after or (i % check_every != 0)):
                # Fast branch: don't check convergence/divergence
                events['copy_gains'].record()
                events['copy_ubls'].record()

                update_gains_cuda(gbuf_g,
                                  gwgt_g,
                                  gains_g,
                                  np.float32(gain),
                                  active_g,
                                  block=block,
                                  grid=ant_grid)
                events['update_gains'].record()

                update_ubls_cuda(ubuf_g,
                                 uwgt_g,
                                 ubls_g,
                                 np.float32(gain),
                                 active_g,
                                 block=block,
                                 grid=ubl_grid)
                events['update_ubls'].record()

                gen_dmdl_cuda(ggu_indices_g,
                              gains_g,
                              ubls_g,
                              dmdl_g,
                              active_g,
                              block=block,
                              grid=bls_grid)
                events['dmdl2'].record()

                events['chisq2'].record()
                events['calc_conv'].record()
                events['update_active'].record()
                events['get_active'].record()

            else:
                # Slow branch: check convergence/divergence
                COPY(h, nants * chunk_size, gains_g.gpudata, 1,
                     new_gains_g.gpudata, 1)
                events['copy_gains'].record()

                COPY(h, nubls * chunk_size, ubls_g.gpudata, 1,
                     new_ubls_g.gpudata, 1)
                events['copy_ubls'].record()

                update_gains_cuda(gbuf_g,
                                  gwgt_g,
                                  new_gains_g,
                                  np.float32(gain),
                                  active_g,
                                  block=block,
                                  grid=ant_grid)
                events['update_gains'].record()

                update_ubls_cuda(ubuf_g,
                                 uwgt_g,
                                 new_ubls_g,
                                 np.float32(gain),
                                 active_g,
                                 block=block,
                                 grid=ubl_grid)
                events['update_ubls'].record()

                gen_dmdl_cuda(ggu_indices_g,
                              new_gains_g,
                              new_ubls_g,
                              dmdl_g,
                              active_g,
                              block=block,
                              grid=bls_grid)
                events['dmdl2'].record()

                set_val_real_cuda(new_chisq_g,
                                  np.uint32(chunk_size),
                                  real_dtype(0),
                                  block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                calc_chisq_cuda(data_g,
                                dmdl_g,
                                wgts_g,
                                new_chisq_g,
                                active_g,
                                block=block,
                                grid=bls_grid)
                events['chisq2'].record()

                set_val_real_cuda(conv_sum_g,
                                  np.uint32(chunk_size),
                                  real_dtype(0),
                                  block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                set_val_real_cuda(conv_wgt_g,
                                  np.uint32(chunk_size),
                                  real_dtype(0),
                                  block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                calc_conv_cuda(new_gains_g,
                               gains_g,
                               new_ubls_g,
                               ubls_g,
                               conv_sum_g,
                               conv_wgt_g,
                               active_g,
                               block=block,
                               grid=conv_grid)
                events['calc_conv'].record()

                update_active_cuda(new_gains_g,
                                   gains_g,
                                   new_ubls_g,
                                   ubls_g,
                                   conv_sum_g,
                                   conv_wgt_g,
                                   conv_g,
                                   real_dtype(conv_crit),
                                   new_chisq_g,
                                   chisq_g,
                                   iters_g,
                                   np.uint32(i),
                                   active_g,
                                   block=block,
                                   grid=conv_grid)
                events['update_active'].record()

                active_g.get_async(ary=active)
                events['get_active'].record()

                if not np.any(active):
                    break
            events['end'].record()

            if TIME_IT:
                events['end'].synchronize()
                for (e1, e2) in event_pairs[4:]:
                    cum_time[(e1, e2)] = (cum_time.get((e1, e2), 0) +
                                          events[e2].time_since(events[e1]))

        # Download final answers into the buffers returned to the user;
        # use offset to trim off the parts of the chunk that were never active.
        _chisq = np.empty((chunk_size, ), dtype=real_dtype)
        chisq_g.get_async(ary=_chisq)
        chisq[px:end] = _chisq[offset:]
        _iters = np.empty((chunk_size, ), dtype=np.uint32)
        iters_g.get_async(ary=_iters)
        iters[px:end] = _iters[offset:]
        _conv = np.empty((chunk_size, ), dtype=real_dtype)
        conv_g.get_async(ary=_conv)
        conv[px:end] = _conv[offset:]
        _gains = np.empty(ANT_SHAPE, dtype=complex_dtype)
        gains_g.get_async(ary=_gains)
        gains[px:end, :] = _gains[offset:, :]
        _ubls = np.empty(UBL_SHAPE, dtype=complex_dtype)
        ubls_g.get_async(ary=_ubls)
        ubls[px:end, :] = _ubls[offset:, :]

    t1 = time.time()
    if TIME_IT:
        print('Final, nthreads=%d' % nthreads)
        for (e1, e2) in event_pairs:
            try:
                print('%6.3f' % cum_time[(e1, e2)], e1, e2)
            except KeyError:
                pass
        print(t1 - t0)
        print()

    cublasDestroy(h)  # tear down the cuBLAS handle

    return {
        'gains': gains,
        'ubls': ubls,
        'chisq': chisq,
        'iters': iters,
        'conv': conv
    }
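Two details of the excerpt above live outside it: skipping the convergence check on most iterations is what makes the fast branch cheap (it avoids the gain/ubl copies and the chisq/convergence reductions entirely), and the COPY used in the slow branch is an alias bound, along with the handle h, earlier in the function. A minimal sketch of how such a binding is typically done with skcuda; the alias name and the dtype test are assumptions, not taken from the source:

import numpy as np
import skcuda.cublas as cublas

# Hypothetical binding: pick the complex copy routine matching the
# dtype of the gains/ubls buffers (complex64 vs. complex128).
complex_dtype = np.complex64   # in the full function this follows the input data
if complex_dtype == np.complex64:
    COPY = cublas.cublasCcopy  # cublasCcopy(handle, n, x, incx, y, incy)
else:
    COPY = cublas.cublasZcopy  # complex128 variant
h = cublas.cublasCreate()      # released by cublasDestroy(h) at function end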
Code example #26
0
File: linalg.py Project: neurokernel/retina
 def destroy(self):
     if self.handle is not None:
         cublas.cublasDestroy(self.handle)
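Create/destroy pairs like this leak the handle if an exception fires in between, and destroying the same handle twice is itself an error. A small sketch, not from the neurokernel project, of a context-manager wrapper that guarantees teardown:

import contextlib
import skcuda.cublas as cublas

@contextlib.contextmanager
def cublas_context():
    # Create a cuBLAS handle and guarantee teardown, even on error.
    handle = cublas.cublasCreate()
    try:
        yield handle
    finally:
        cublas.cublasDestroy(handle)

# Usage:
# with cublas_context() as h:
#     ...  # e.g. cublas.cublasSaxpy(h, ...)

Setting self.handle = None right after the cublasDestroy call in the method above would make its None guard actually prevent a double destroy.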
Code example #27
0
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

# NOTE: the original snippet is truncated here; the setup below (sizes,
# host data, streams, timing events and x_pin) is a reconstruction.
N = 4 * 1024 * 1024
nStreams = 4
a = np.float32(2.0)
x = np.random.rand(N).astype(np.float32)
y = np.random.rand(N).astype(np.float32)
streams = [cuda.Stream() for _ in range(nStreams)]
s, e = cuda.Event(), cuda.Event()
s.record()

x_pin = [
    cuda.register_host_memory(x[i * N // nStreams:(i + 1) * N // nStreams])
    for i in range(nStreams)
]
y_pin = [
    cuda.register_host_memory(y[i * N // nStreams:(i + 1) * N // nStreams])
    for i in range(nStreams)
]

h = cublas.cublasCreate()

x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)

for i in range(nStreams):
    cublas.cublasSetStream(h, streams[i].handle)

    x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i])
    y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i])

    cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1,
                       y_gpu[i].gpudata, 1)
    ans[i] = y_gpu[i].get_async(stream=streams[i])

cublas.cublasDestroy(h)

# Uncomment to check for errors in the calculation
# y_res = np.concatenate([yg.get() for yg in y_gpu])
# print(np.allclose(y_res, a * x + y))

e.record()
e.synchronize()
print(s.time_till(e), "ms")
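The pinned buffers are the point of this example: register_host_memory page-locks the host slices, which is what allows to_gpu_async and get_async to run asynchronously, and cublasSetStream queues each Saxpy on the same per-chunk stream as its copies, so transfers for one chunk can overlap compute on another. With ordinary pageable memory the copies would serialize and the streams would buy nothing.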
Code example #28
0
File: test_cublas.py Project: lvaleriu/scikit-cuda
 def tearDown(self):
     cublas.cublasDestroy(self.cublas_handle)
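For context, the matching fixture would create the handle this tearDown releases; a minimal sketch, with the setUp body assumed rather than taken from test_cublas.py:

 def setUp(self):
     # Hypothetical counterpart: create the handle destroyed in tearDown.
     self.cublas_handle = cublas.cublasCreate()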
Code example #29
0
File: matMul_v1.py Project: CIuliusC/PyCUDA
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas

A = np.array(([1, 2, 3], [4, 5, 6]), order='F').astype(np.float64)
B = np.array(([7, 8, 1, 5], [9, 10, 0, 9], [11, 12, 5, 5]),
             order='F').astype(np.float64)

A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)

m, k = A_gpu.shape
k, n = B_gpu.shape

C_gpu = gpuarray.empty((m, n), np.float64)

alpha = np.float64(1.0)
beta = np.float64(0.0)

cublas_handle = cublas.cublasCreate()
# C = alpha * A * B + beta * C; 'n', 'n' = no transposes, all column-major
cublas.cublasDgemm(cublas_handle, 'n', 'n', m, n, k, alpha, A_gpu.gpudata, m,
                   B_gpu.gpudata, k, beta, C_gpu.gpudata, m)
cublas.cublasDestroy(cublas_handle)

# Re-view the C-ordered buffer as column-major so indexing matches what
# cuBLAS wrote
C_gpu = C_gpu.reshape(C_gpu.shape, order='F')

print(np.dot(A, B))
print(C_gpu)
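A common alternative, sketched here rather than taken from matMul_v1.py, keeps the host arrays in NumPy's default C (row-major) order and exploits (A·B)^T = B^T·A^T: handing cuBLAS the operands in swapped order makes it fill a buffer that, read row-major, is already A·B.

A_c = np.ascontiguousarray(A)              # row-major copies of the same data
B_c = np.ascontiguousarray(B)
A_g = gpuarray.to_gpu(A_c)
B_g = gpuarray.to_gpu(B_c)
C_g = gpuarray.empty((m, n), np.float64)   # row-major result buffer

h = cublas.cublasCreate()
# The column-major view of row-major B is B^T (n x k, ld = n); likewise A -> A^T.
# Computing B^T * A^T column-major fills C_g so its row-major view equals A * B.
cublas.cublasDgemm(h, 'n', 'n', n, m, k, alpha,
                   B_g.gpudata, n, A_g.gpudata, k, beta, C_g.gpudata, n)
cublas.cublasDestroy(h)

print(np.allclose(C_g.get(), np.dot(A, B)))   # True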