'PyObject* newShape = Py_BuildValue("(i)", intLength)', 'PyObject* kwargs = Py_BuildValue("{sOsOs%s}", "shape", newShape, "dtype", type, "gpudata", diffResult)' % ptr_sz_uint_conv, 'PyObject* GPUArrayClass = PyObject_GetAttrString(gpuArray, "__class__")', 'PyObject* remoteResult = PyObject_Call(GPUArrayClass, args, kwargs)', 'return remoteResult'] host_mod.add_function( FunctionBody( FunctionDeclaration(Pointer(Value("PyObject", "adjacentDifference")), [Pointer(Value("PyObject", "gpuArray"))]), Block([Statement(x) for x in statements]))) host_mod.add_to_preamble([Include('boost/python/extract.hpp')]) cuda_mod = CudaModule(host_mod) cuda_mod.add_to_preamble([Include('cuda.h')]) globalIndex = 'int index = blockIdx.x * blockDim.x + threadIdx.x' compute_diff = 'outputPtr[index] = inputPtr[index] - inputPtr[index-1]' launch = ['CUdeviceptr output', 'cuMemAlloc(&output, sizeof(T) * length)', 'int bSize = 256', 'int gSize = (length-1)/bSize + 1', 'diffKernel<<<gSize, bSize>>>((T*)inputPtr, length, (T*)output)', 'return output'] diff =[ Template('typename T', CudaGlobal(FunctionDeclaration(Value('void', 'diffKernel'), [Value('T*', 'inputPtr'), Value('int', 'length'),
def _cusp_solver(M, parameters): cache_key = lambda t, p: (t, p['ksp_type'], p['pc_type'], p['ksp_rtol'], p['ksp_atol'], p['ksp_max_it'], p['ksp_gmres_restart'], p['ksp_monitor']) module = _cusp_cache.get(cache_key(M.ctype, parameters)) if module: return module import codepy.toolchain from cgen import FunctionBody, FunctionDeclaration from cgen import Block, Statement, Include, Value from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule gcc_toolchain = codepy.toolchain.guess_toolchain() nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain() if 'CUSP_HOME' in os.environ: nvcc_toolchain.add_library('cusp', [os.environ['CUSP_HOME']], [], []) host_mod = BoostPythonModule() nvcc_mod = CudaModule(host_mod) nvcc_includes = ['thrust/device_vector.h', 'thrust/fill.h', 'cusp/csr_matrix.h', 'cusp/krylov/cg.h', 'cusp/krylov/bicgstab.h', 'cusp/krylov/gmres.h', 'cusp/precond/diagonal.h', 'cusp/precond/smoothed_aggregation.h', 'cusp/precond/ainv.h', 'string'] nvcc_mod.add_to_preamble([Include(s) for s in nvcc_includes]) nvcc_mod.add_to_preamble([Statement('using namespace std')]) # We're translating PETSc preconditioner types to CUSP diag = Statement('cusp::precond::diagonal< ValueType, cusp::device_memory >M(A)') ainv = Statement( 'cusp::precond::scaled_bridson_ainv< ValueType, cusp::device_memory >M(A)') amg = Statement( 'cusp::precond::smoothed_aggregation< IndexType, ValueType, cusp::device_memory >M(A)') none = Statement( 'cusp::identity_operator< ValueType, cusp::device_memory >M(nrows, ncols)') preconditioners = { 'diagonal': diag, 'jacobi': diag, 'ainv': ainv, 'ainvcusp': ainv, 'amg': amg, 'hypre': amg, 'none': none, None: none } try: precond_call = preconditioners[parameters['pc_type']] except KeyError: raise RuntimeError("Cusp does not support preconditioner type %s" % parameters['pc_type']) solvers = { 'cg': Statement('cusp::krylov::cg(A, x, b, monitor, M)'), 'bicgstab': Statement('cusp::krylov::bicgstab(A, x, b, monitor, M)'), 'gmres': Statement('cusp::krylov::gmres(A, x, b, %(ksp_gmres_restart)d, monitor, M)' % parameters) } try: solve_call = solvers[parameters['ksp_type']] except KeyError: raise RuntimeError("Cusp does not support solver type %s" % parameters['ksp_type']) monitor = 'monitor(b, %(ksp_max_it)d, %(ksp_rtol)g, %(ksp_atol)g)' % parameters nvcc_function = FunctionBody( FunctionDeclaration(Value('void', '__cusp_solve'), [Value('CUdeviceptr', '_rowptr'), Value('CUdeviceptr', '_colidx'), Value('CUdeviceptr', '_csrdata'), Value('CUdeviceptr', '_b'), Value('CUdeviceptr', '_x'), Value('int', 'nrows'), Value('int', 'ncols'), Value('int', 'nnz')]), Block([ Statement('typedef int IndexType'), Statement('typedef %s ValueType' % M.ctype), Statement( 'typedef typename cusp::array1d_view< thrust::device_ptr<IndexType> > indices'), Statement( 'typedef typename cusp::array1d_view< thrust::device_ptr<ValueType> > values'), Statement( 'typedef cusp::csr_matrix_view< indices, indices, values, IndexType, ValueType, cusp::device_memory > matrix'), Statement('thrust::device_ptr< IndexType > rowptr((IndexType *)_rowptr)'), Statement('thrust::device_ptr< IndexType > colidx((IndexType *)_colidx)'), Statement('thrust::device_ptr< ValueType > csrdata((ValueType *)_csrdata)'), Statement('thrust::device_ptr< ValueType > d_b((ValueType *)_b)'), Statement('thrust::device_ptr< ValueType > d_x((ValueType *)_x)'), Statement('indices row_offsets(rowptr, rowptr + nrows + 1)'), Statement('indices column_indices(colidx, colidx + nnz)'), Statement('values matrix_values(csrdata, csrdata + nnz)'), Statement('values b(d_b, d_b + nrows)'), Statement('values x(d_x, d_x + ncols)'), Statement('thrust::fill(x.begin(), x.end(), (ValueType)0)'), Statement( 'matrix A(nrows, ncols, nnz, row_offsets, column_indices, matrix_values)'), Statement('cusp::%s_monitor< ValueType > %s' % ('verbose' if parameters['ksp_monitor'] else 'default', monitor)), precond_call, solve_call ])) host_mod.add_to_preamble([Include('boost/python/extract.hpp'), Include('string')]) host_mod.add_to_preamble([Statement('using namespace boost::python')]) host_mod.add_to_preamble([Statement('using namespace std')]) nvcc_mod.add_function(nvcc_function) host_mod.add_function( FunctionBody( FunctionDeclaration(Value('void', 'solve'), [Value('object', '_rowptr'), Value('object', '_colidx'), Value('object', '_csrdata'), Value('object', '_b'), Value('object', '_x'), Value('object', '_nrows'), Value('object', '_ncols'), Value('object', '_nnz')]), Block([ Statement( 'CUdeviceptr rowptr = extract<CUdeviceptr>(_rowptr.attr("gpudata"))'), Statement( 'CUdeviceptr colidx = extract<CUdeviceptr>(_colidx.attr("gpudata"))'), Statement( 'CUdeviceptr csrdata = extract<CUdeviceptr>(_csrdata.attr("gpudata"))'), Statement('CUdeviceptr b = extract<CUdeviceptr>(_b.attr("gpudata"))'), Statement('CUdeviceptr x = extract<CUdeviceptr>(_x.attr("gpudata"))'), Statement('int nrows = extract<int>(_nrows)'), Statement('int ncols = extract<int>(_ncols)'), Statement('int nnz = extract<int>(_nnz)'), Statement('__cusp_solve(rowptr, colidx, csrdata, b, x, nrows, ncols, nnz)') ]))) nvcc_toolchain.cflags.append('-arch') nvcc_toolchain.cflags.append('sm_20') nvcc_toolchain.cflags.append('-O3') module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain, debug=configuration["debug"]) _cusp_cache[cache_key(M.ctype, parameters)] = module return module
import pycuda import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np from codepy.cgen import * from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule #Make a host_module, compiled for CPU host_mod = BoostPythonModule() #Make a device module, compiled with NVCC nvcc_mod = CudaModule(host_mod) #Describe device module code #NVCC includes nvcc_includes = [ 'thrust/sort.h', 'thrust/device_vector.h', 'cuda.h', ] #Add includes to module nvcc_mod.add_to_preamble([Include(x) for x in nvcc_includes]) #NVCC function nvcc_function = FunctionBody( FunctionDeclaration(Value('void', 'my_sort'), [Value('CUdeviceptr', 'input_ptr'),
Pointer, Module, Block, Initializer, Assign, \ Include, Statement, If from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule import codepy.jit, codepy.toolchain # INFO: the following code goes much along the tutorial to be found at # http://wiki.tiker.net/PyCuda/Examples/ThrustInterop # many thanks to Bryan Catanzaro! #Make a host_module, compiled for CPU host_mod = BoostPythonModule() #Make a device module, compiled with NVCC nvcc_mod = CudaModule(host_mod) #Describe device module code #NVCC includes nvcc_includes = [ 'thrust/sort.h', 'thrust/binary_search.h', 'thrust/device_vector.h', 'cuda.h', ] #Add includes to module nvcc_mod.add_to_preamble([Include(x) for x in nvcc_includes]) #NVCC function nvcc_functions = [ FunctionBody(
def _cusp_solver(M, parameters): cache_key = lambda t, p: (t, p['ksp_type'], p['pc_type'], p['ksp_rtol'], p[ 'ksp_atol'], p['ksp_max_it'], p['ksp_gmres_restart'], p['ksp_monitor']) module = _cusp_cache.get(cache_key(M.ctype, parameters)) if module: return module import codepy.toolchain from cgen import FunctionBody, FunctionDeclaration from cgen import Block, Statement, Include, Value from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule gcc_toolchain = codepy.toolchain.guess_toolchain() nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain() if 'CUSP_HOME' in os.environ: nvcc_toolchain.add_library('cusp', [os.environ['CUSP_HOME']], [], []) host_mod = BoostPythonModule() nvcc_mod = CudaModule(host_mod) nvcc_includes = [ 'thrust/device_vector.h', 'thrust/fill.h', 'cusp/csr_matrix.h', 'cusp/krylov/cg.h', 'cusp/krylov/bicgstab.h', 'cusp/krylov/gmres.h', 'cusp/precond/diagonal.h', 'cusp/precond/smoothed_aggregation.h', 'cusp/precond/ainv.h', 'string' ] nvcc_mod.add_to_preamble([Include(s) for s in nvcc_includes]) nvcc_mod.add_to_preamble([Statement('using namespace std')]) # We're translating PETSc preconditioner types to CUSP diag = Statement( 'cusp::precond::diagonal< ValueType, cusp::device_memory >M(A)') ainv = Statement( 'cusp::precond::scaled_bridson_ainv< ValueType, cusp::device_memory >M(A)' ) amg = Statement( 'cusp::precond::smoothed_aggregation< IndexType, ValueType, cusp::device_memory >M(A)' ) none = Statement( 'cusp::identity_operator< ValueType, cusp::device_memory >M(nrows, ncols)' ) preconditioners = { 'diagonal': diag, 'jacobi': diag, 'ainv': ainv, 'ainvcusp': ainv, 'amg': amg, 'hypre': amg, 'none': none, None: none } try: precond_call = preconditioners[parameters['pc_type']] except KeyError: raise RuntimeError("Cusp does not support preconditioner type %s" % parameters['pc_type']) solvers = { 'cg': Statement('cusp::krylov::cg(A, x, b, monitor, M)'), 'bicgstab': Statement('cusp::krylov::bicgstab(A, x, b, monitor, M)'), 'gmres': Statement( 'cusp::krylov::gmres(A, x, b, %(ksp_gmres_restart)d, monitor, M)' % parameters) } try: solve_call = solvers[parameters['ksp_type']] except KeyError: raise RuntimeError("Cusp does not support solver type %s" % parameters['ksp_type']) monitor = 'monitor(b, %(ksp_max_it)d, %(ksp_rtol)g, %(ksp_atol)g)' % parameters nvcc_function = FunctionBody( FunctionDeclaration(Value('void', '__cusp_solve'), [ Value('CUdeviceptr', '_rowptr'), Value('CUdeviceptr', '_colidx'), Value('CUdeviceptr', '_csrdata'), Value('CUdeviceptr', '_b'), Value('CUdeviceptr', '_x'), Value('int', 'nrows'), Value('int', 'ncols'), Value('int', 'nnz') ]), Block([ Statement('typedef int IndexType'), Statement('typedef %s ValueType' % M.ctype), Statement( 'typedef typename cusp::array1d_view< thrust::device_ptr<IndexType> > indices' ), Statement( 'typedef typename cusp::array1d_view< thrust::device_ptr<ValueType> > values' ), Statement( 'typedef cusp::csr_matrix_view< indices, indices, values, IndexType, ValueType, cusp::device_memory > matrix' ), Statement( 'thrust::device_ptr< IndexType > rowptr((IndexType *)_rowptr)' ), Statement( 'thrust::device_ptr< IndexType > colidx((IndexType *)_colidx)' ), Statement( 'thrust::device_ptr< ValueType > csrdata((ValueType *)_csrdata)' ), Statement('thrust::device_ptr< ValueType > d_b((ValueType *)_b)'), Statement('thrust::device_ptr< ValueType > d_x((ValueType *)_x)'), Statement('indices row_offsets(rowptr, rowptr + nrows + 1)'), Statement('indices column_indices(colidx, colidx + nnz)'), Statement('values matrix_values(csrdata, csrdata + nnz)'), Statement('values b(d_b, d_b + nrows)'), Statement('values x(d_x, d_x + ncols)'), Statement('thrust::fill(x.begin(), x.end(), (ValueType)0)'), Statement( 'matrix A(nrows, ncols, nnz, row_offsets, column_indices, matrix_values)' ), Statement('cusp::%s_monitor< ValueType > %s' % ('verbose' if parameters['ksp_monitor'] else 'default', monitor)), precond_call, solve_call ])) host_mod.add_to_preamble( [Include('boost/python/extract.hpp'), Include('string')]) host_mod.add_to_preamble([Statement('using namespace boost::python')]) host_mod.add_to_preamble([Statement('using namespace std')]) nvcc_mod.add_function(nvcc_function) host_mod.add_function( FunctionBody( FunctionDeclaration(Value('void', 'solve'), [ Value('object', '_rowptr'), Value('object', '_colidx'), Value('object', '_csrdata'), Value('object', '_b'), Value('object', '_x'), Value('object', '_nrows'), Value('object', '_ncols'), Value('object', '_nnz') ]), Block([ Statement( 'CUdeviceptr rowptr = extract<CUdeviceptr>(_rowptr.attr("gpudata"))' ), Statement( 'CUdeviceptr colidx = extract<CUdeviceptr>(_colidx.attr("gpudata"))' ), Statement( 'CUdeviceptr csrdata = extract<CUdeviceptr>(_csrdata.attr("gpudata"))' ), Statement( 'CUdeviceptr b = extract<CUdeviceptr>(_b.attr("gpudata"))' ), Statement( 'CUdeviceptr x = extract<CUdeviceptr>(_x.attr("gpudata"))' ), Statement('int nrows = extract<int>(_nrows)'), Statement('int ncols = extract<int>(_ncols)'), Statement('int nnz = extract<int>(_nnz)'), Statement( '__cusp_solve(rowptr, colidx, csrdata, b, x, nrows, ncols, nnz)' ) ]))) nvcc_toolchain.cflags.append('-arch') nvcc_toolchain.cflags.append('sm_20') nvcc_toolchain.cflags.append('-O3') module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain, debug=configuration["debug"]) _cusp_cache[cache_key(M.ctype, parameters)] = module return module
#!python import pycuda import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np from cgen import * from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule #Make a host_module, compiled for CPU host_mod = BoostPythonModule() #Make a device module, compiled with NVCC nvcc_mod = CudaModule(host_mod) #Describe device module code #NVCC includes nvcc_includes = [ 'thrust/sort.h', 'thrust/device_vector.h', 'cuda.h', ] #Add includes to module nvcc_mod.add_to_preamble([Include(x) for x in nvcc_includes]) #NVCC function nvcc_function = FunctionBody( FunctionDeclaration( Value('void', 'my_sort'),
cleanup_s = """void Cleanup() { if (d_A) cudaFree(d_A); if (d_B) cudaFree(d_B); if (d_C) cudaFree(d_C); if (h_A) free(h_A); if (h_B) free(h_B); if (h_C) free(h_C); cutilSafeCall( cudaThreadExit() ); return;}""" host.add_function(c_main_rend, fname="main") host.module.mod_body.append(Line(cleanup_s)) host.module.mod_body.insert(0, FunctionDeclaration(Value('void', "Cleanup"),[])) host.module.mod_body.append(Line(randominit_s)) host.module.mod_body.insert(0, FunctionDeclaration(Value('void', "RandomInit"),[Pointer(POD(numpy.float32,'data')),POD(numpy.int32,'n')])) cuda_mod = CudaModule(host.module) cuda_mod.add_to_preamble([Include('cuda.h')]) cuda_mod.add_to_module([Line(cu_kern_rend)]) launch_statements = [ 'int threadsPerBlock = 256;' 'int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;' 'VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);'] launch_func = FunctionBody( FunctionDeclaration(Value('void', 'launch_VecAdd'), [ Pointer(Value('float', 'd_A')), Pointer(Value('float', 'd_B')), Pointer(Value('float', 'd_C')), Value('int', 'N') ]), Block([Statement(s) for s in launch_statements]) )