def test_multiple_2d_textures(self): mod = SourceModule(""" texture<float, 2, cudaReadModeElementType> mtx_tex; texture<float, 2, cudaReadModeElementType> mtx2_tex; __global__ void copy_texture(float *dest) { int row = threadIdx.x; int col = threadIdx.y; int w = blockDim.y; dest[row*w+col] = tex2D(mtx_tex, row, col) + tex2D(mtx2_tex, row, col); } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx2_tex = mod.get_texref("mtx2_tex") shape = (3, 4) a = np.random.randn(*shape).astype(np.float32) b = np.random.randn(*shape).astype(np.float32) drv.matrix_to_texref(a, mtx_tex, order="F") drv.matrix_to_texref(b, mtx2_tex, order="F") dest = np.zeros(shape, dtype=np.float32) copy_texture(drv.Out(dest), block=shape + (1, ), texrefs=[mtx_tex, mtx2_tex]) assert la.norm(dest - a - b) < 1e-6
def __init__(self): # load kernel from file path = os.path.join(os.path.dirname(__file__), 'deskew.cu') with open(path, 'r') as fd: try: module = SourceModule(fd.read()) except cuda.CompileError as err: logger.error("compile error: " + str(err)) raise self._shear_kernel = module.get_function('shear_kernel') self._rotate_kernel = module.get_function('rotate_kernel') # preset texture shear_texture = module.get_texref('shear_tex') shear_texture.set_address_mode(0, cuda.address_mode.BORDER) shear_texture.set_address_mode(1, cuda.address_mode.BORDER) shear_texture.set_address_mode(2, cuda.address_mode.BORDER) shear_texture.set_filter_mode(cuda.filter_mode.LINEAR) self._shear_texture = shear_texture rotate_texture = module.get_texref('rotate_tex') rotate_texture.set_address_mode(0, cuda.address_mode.BORDER) rotate_texture.set_address_mode(1, cuda.address_mode.BORDER) rotate_texture.set_filter_mode(cuda.filter_mode.LINEAR) self._rotate_texture = rotate_texture # preset kernel launch parameters self._shear_kernel.prepare('PffIIffIII', texrefs=[shear_texture]) self._rotate_kernel.prepare('PfIIffII', texrefs=[rotate_texture]) # output staging buffer self._out_buf = None
def initCUDA(): global plotData_dArray global tex, transferTex global transferFuncArray_d global c_invViewMatrix global renderKernel #print "Compiling CUDA code for volumeRender" cudaCodeFile = open(volRenderDirectory + "/CUDAvolumeRender.cu","r") cudaCodeString = cudaCodeFile.read() cudaCodeStringComplete = cudaCodeString cudaCode = SourceModule(cudaCodeStringComplete, no_extern_c=True, include_dirs=[volRenderDirectory] ) tex = cudaCode.get_texref("tex") transferTex = cudaCode.get_texref("transferTex") c_invViewMatrix = cudaCode.get_global('c_invViewMatrix')[0] renderKernel = cudaCode.get_function("d_render") if not plotData_dArray: plotData_dArray = np3DtoCudaArray( plotData_h ) tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) tex.set_filter_mode(cuda.filter_mode.LINEAR) tex.set_address_mode(0, cuda.address_mode.CLAMP) tex.set_address_mode(1, cuda.address_mode.CLAMP) tex.set_array(plotData_dArray) set_transfer_function( cmap_indx_0, trans_ramp_0, trans_center_0 ) print "CUDA volumeRender initialized\n"
def test_multiple_2d_textures(self): mod = SourceModule(""" texture<float, 2, cudaReadModeElementType> mtx_tex; texture<float, 2, cudaReadModeElementType> mtx2_tex; __global__ void copy_texture(float *dest) { int row = threadIdx.x; int col = threadIdx.y; int w = blockDim.y; dest[row*w+col] = tex2D(mtx_tex, row, col) + tex2D(mtx2_tex, row, col); } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx2_tex = mod.get_texref("mtx2_tex") shape = (3,4) a = np.random.randn(*shape).astype(np.float32) b = np.random.randn(*shape).astype(np.float32) drv.matrix_to_texref(a, mtx_tex, order="F") drv.matrix_to_texref(b, mtx2_tex, order="F") dest = np.zeros(shape, dtype=np.float32) copy_texture(drv.Out(dest), block=shape+(1,), texrefs=[mtx_tex, mtx2_tex] ) assert la.norm(dest-a-b) < 1e-6
def prepare_functions(s): from pycuda.compiler import SourceModule kernels = ''.join(open("dielectric.cu", 'r').readlines()) mod = SourceModule( kernels.replace('Dx', str(s.Dx)).replace('Dy', str(s.Dy)).replace( 'nyz', str(s.ny * s.nz)).replace('nx', str(s.nx)).replace( 'ny', str(s.ny)).replace('nz', str(s.nz))) s.updateH = mod.get_function("update_h") s.updateE = mod.get_function("update_e") s.updateE_src = mod.get_function("update_src") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tcex.set_array(s.tcex_gpu) tcey.set_array(s.tcey_gpu) tcez.set_array(s.tcez_gpu) Bx, By = s.nz / s.Dx, s.nx * s.ny / s.Dy # number of block s.MaxBy = s.MAX_BLOCK / Bx s.bpg_list = [(Bx, s.MaxBy) for i in range(By / s.MaxBy)] if By % s.MaxBy != 0: s.bpg_list.append((Bx, By % s.MaxBy)) s.updateH.prepare("iPPPPPP", block=(s.Dx, s.Dy, 1)) s.updateE.prepare("iPPPPPP", block=(s.Dx, s.Dy, 1), texrefs=[tcex, tcey, tcez]) s.updateE_src.prepare("fP", block=(s.nz, 1, 1))
def conv3d_tex(data, kernel=None): assert data.ndim == 3 and kernel.ndim == 3 with open(__cudafile__, "r") as f: _mod_conv = SourceModule(f.read()) gpu_conv3d_t = _mod_conv.get_function("conv3d_tex") gpu_conv3d_tex1 = _mod_conv.get_texref("texSrc") gpu_conv3d_tex2 = _mod_conv.get_texref("texK") im_shape = np.asarray(data.shape[::-1], dtype=int3) k_radius = np.asarray(tuple(k // 2 for k in kernel.shape[::-1]), dtype=int3) data_tex = to_tex3d(data) gpu_conv3d_tex1.set_array(data_tex) gpu_conv3d_tex1.set_address_mode(0, cuda.address_mode.WRAP) gpu_conv3d_tex1.set_address_mode(1, cuda.address_mode.WRAP) gpu_conv3d_tex1.set_address_mode(2, cuda.address_mode.WRAP) kernel_tex = to_tex3d(kernel) gpu_conv3d_tex2.set_array(kernel_tex) r_gpu = gpuarray.zeros(data.shape, np.float32) block, grid = grid_kernel_config(gpu_conv3d_t, data.shape, isotropic=kernel.shape) gpu_conv3d_t( r_gpu, im_shape, k_radius, block=block, grid=grid, texrefs=[gpu_conv3d_tex1, gpu_conv3d_tex2], ) return r_gpu
def cuda_interpolate3D(self, img, m, size_result): cols = size_result[0] rows = size_result[1] kernel_code = """ texture<float, 2> texR; texture<float, 2> texG; texture<float, 2> texB; __global__ void interpolation(float *dest, float *m0, float *m1) { int idx = threadIdx.x + blockDim.x * blockIdx.x; int idy = threadIdx.y + blockDim.y * blockIdx.y; if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) { dest[3*(%(NDIM)s * idx + idy)] = tex2D(texR, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]); dest[3*(%(NDIM)s * idx + idy) + 1] = tex2D(texG, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]); dest[3*(%(NDIM)s * idx + idy) + 2] = tex2D(texB, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]); } } """ kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows} mod = SourceModule(kernel_code) interpolation = mod.get_function("interpolation") texrefR = mod.get_texref("texR") texrefG = mod.get_texref("texG") texrefB = mod.get_texref("texB") img = img.astype("float32") drv.matrix_to_texref(img[:, :, 0], texrefR, order="F") texrefR.set_filter_mode(drv.filter_mode.LINEAR) drv.matrix_to_texref(img[:, :, 1], texrefG, order="F") texrefG.set_filter_mode(drv.filter_mode.LINEAR) drv.matrix_to_texref(img[:, :, 2], texrefB, order="F") texrefB.set_filter_mode(drv.filter_mode.LINEAR) bdim = (16, 16, 1) dx, mx = divmod(cols, bdim[0]) dy, my = divmod(rows, bdim[1]) gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1]) dest = np.zeros((rows, cols, 3)).astype("float32") m0 = (m[0, :] - 1).astype("float32") m1 = (m[1, :] - 1).astype("float32") interpolation(drv.Out(dest), drv.In(m0), drv.In(m1), block=bdim, grid=gdim, texrefs=[texrefR, texrefG, texrefB]) return dest.astype("uint8")
def test_multichannel_2d_texture(self): mod = SourceModule(""" #define CHANNELS 4 texture<float4, 2, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int row = threadIdx.x; int col = threadIdx.y; int w = blockDim.y; float4 texval = tex2D(mtx_tex, row, col); dest[(row*w+col)*CHANNELS + 0] = texval.x; dest[(row*w+col)*CHANNELS + 1] = texval.y; dest[(row*w+col)*CHANNELS + 2] = texval.z; dest[(row*w+col)*CHANNELS + 3] = texval.w; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (5, 6) channels = 4 a = np.asarray(np.random.randn(*((channels, ) + shape)), dtype=np.float32, order="F") drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex) dest = np.zeros(shape + (channels, ), dtype=np.float32) copy_texture(drv.Out(dest), block=shape + (1, ), texrefs=[mtx_tex]) reshaped_a = a.transpose(1, 2, 0) #print reshaped_a #print dest assert la.norm(dest - reshaped_a) == 0
def get_cuda_csr(block_size=128, warp_size=32): ''' Method read ERTILP CUDA code from file, build it with arguments, compile it, and returns ready kernel and texture. The parameters of method must be equal parameters of converted matrix, which is multiplied. Parameters ========== block_size : int (Recommended 128 or 256) Size of block warp_size : int > 0 (Recommended 32) Size of warp. This value depends on the specifications of the graphics card. Returns ======= Tuple of compiled kernel and texture ''' kernel_info = {'file_' : 'csr_kernel.c', 'kernel' : 'SpMV_Csr', 'texref' : 'mainVecTexRef'} with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_: tpl = file_.read() tpl = convert_string(tpl, BLOCK_SIZE=block_size, WARP_SIZE=warp_size) mod = SourceModule(tpl) kernel = mod.get_function(kernel_info['kernel']) texref = mod.get_texref(kernel_info['texref']) return (kernel, texref)
def test_multichannel_linear_texture(self): mod = SourceModule(""" #define CHANNELS 4 texture<float4, 1, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int i = threadIdx.x+blockDim.x*threadIdx.y; float4 texval = tex1Dfetch(mtx_tex, i); dest[i*CHANNELS + 0] = texval.x; dest[i*CHANNELS + 1] = texval.y; dest[i*CHANNELS + 2] = texval.z; dest[i*CHANNELS + 3] = texval.w; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (16, 16) channels = 4 a = np.random.randn(*(shape + (channels, ))).astype(np.float32) a_gpu = drv.to_device(a) mtx_tex.set_address(a_gpu, a.nbytes) mtx_tex.set_format(drv.array_format.FLOAT, 4) dest = np.zeros(shape + (channels, ), dtype=np.float32) copy_texture(drv.Out(dest), block=shape + (1, ), texrefs=[mtx_tex]) #print a #print dest assert la.norm(dest - a) == 0
def get_cuda_sliced(sh_cache_size, threads_per_row=2): ''' Method read SLICED CUDA code from file, build it with arguments, compile it, and returns ready kernel and texture. The parameters of method must be equal parameters of converted matrix, which is multiplied. Parameters ========== sh_cache_size : int Size of cache array. For Sliced format must be equal threads per row * slice size. If get another value execute badly. threads_per_row : int > 0 (Recommended 2, 4 or 8) Threads per row Returns ======= Tuple of compiled kernel and texture ''' kernel_info = {'file_' : 'sliced_kernel.c', 'kernel' : 'SpMV_Sliced', 'texref' : 'mainVecTexRef'} with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_: tpl = file_.read() tpl = convert_string(tpl, sh_cache_size=sh_cache_size, threadPerRow=threads_per_row) mod = SourceModule(tpl) kernel = mod.get_function(kernel_info['kernel']) texref = mod.get_texref(kernel_info['texref']) return (kernel, texref)
def test_multichannel_2d_texture(self): mod = SourceModule( """ #define CHANNELS 4 texture<float4, 2, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int row = threadIdx.x; int col = threadIdx.y; int w = blockDim.y; float4 texval = tex2D(mtx_tex, row, col); dest[(row*w+col)*CHANNELS + 0] = texval.x; dest[(row*w+col)*CHANNELS + 1] = texval.y; dest[(row*w+col)*CHANNELS + 2] = texval.z; dest[(row*w+col)*CHANNELS + 3] = texval.w; } """ ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (5, 6) channels = 4 a = np.asarray(np.random.randn(*((channels,) + shape)), dtype=np.float32, order="F") drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex) dest = np.zeros(shape + (channels,), dtype=np.float32) copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex]) reshaped_a = a.transpose(1, 2, 0) # print reshaped_a # print dest assert la.norm(dest - reshaped_a) == 0
def test_multichannel_linear_texture(self): mod = SourceModule(""" #define CHANNELS 4 texture<float4, 1, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int i = threadIdx.x+blockDim.x*threadIdx.y; float4 texval = tex1Dfetch(mtx_tex, i); dest[i*CHANNELS + 0] = texval.x; dest[i*CHANNELS + 1] = texval.y; dest[i*CHANNELS + 2] = texval.z; dest[i*CHANNELS + 3] = texval.w; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (16, 16) channels = 4 a = np.random.randn(*(shape+(channels,))).astype(np.float32) a_gpu = drv.to_device(a) mtx_tex.set_address(a_gpu, a.nbytes) mtx_tex.set_format(drv.array_format.FLOAT, 4) dest = np.zeros(shape+(channels,), dtype=np.float32) copy_texture(drv.Out(dest), block=shape+(1,), texrefs=[mtx_tex] ) #print a #print dest assert la.norm(dest-a) == 0
def test_3d_fp_textures(self): orden = "C" npoints = 32 for prec in [ np.int16, np.float32, np.float64, np.complex64, np.complex128 ]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = "fp_tex_cfloat" elif prec == np.complex128: fpName_str = "fp_tex_cdouble" elif prec == np.float64: fpName_str = "fp_tex_double" else: fpName_str = prec_str A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) myKern = """ #include <pycuda-helpers.hpp> texture<fpName, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; int slice = blockIdx.z*blockDim.z + threadIdx.z; dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row); } """ myKern = myKern.replace("fpName", fpName_str) myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (8, 8, 8) if cuBlock[0] > npoints: cuBlock = (npoints, npoints, npoints) cuGrid = ( npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), ) copy_texture.prepare("P", texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec) A_gpu.gpudata.free()
class BornThread(threading.Thread): def __init__(self, gpu, work_queue, result, density, x, y, z, Qx, Qy, Qz): threading.Thread.__init__(self) self.born_args = density, x, y, z, Qx, Qy, Qz, result self.work_queue = work_queue self.gpu = gpu self.precision = Qx.dtype def run(self): self.dev = cuda.Device(self.gpu) self.ctx = self.dev.make_context() if self.precision == numpy.float32: self.cudamod = SourceModule(BORN_KERNEL32) else: self.cudamod = SourceModule(BORN_KERNEL64) self.cudaBorn = self.cudamod.get_function("cudaBorn") self.kernel() self.ctx.pop() del self.ctx del self.dev def kernel(self): density, x, y, z, Qx, Qy, Qz, result = self.born_args nx, ny, nz, nqx, nqy, nqz = [numpy.int32(len(v)) for v in x, y, z, Qx, Qy, Qz] texture_data = [] for v,t in (x,"tx"),(y,"ty"),(z,"tz"),(Qx,"tQx"),(Qy,"tQy"),(Qz,"tQz"),(density,"tdensity"): cv = gpuarray.to_gpu(v) tv = self.cudamod.get_texref(t) cv.bind_to_texref_ext(tv) texture_data.append(cv) cframe = cuda.mem_alloc(result[0].nbytes) n = int(1*nqy*nqz) while True: try: qxi = numpy.int32(self.work_queue.get(block=False)) except Queue.Empty: break #print "%d of %d on %d\n"%(qxi,nqx,self.gpu), self.cudaBorn(nx,ny,nz,nqx,nqy,nqz,qxi,cframe, **cuda_partition(n)) ## Delay fetching result until the kernel is complete cuda_sync() ## Fetch result back to the CPU cuda.memcpy_dtoh(result[qxi], cframe) #print "%d %s\n"%(qxi,ctemp.get()) del cframe for cv in texture_data: del cv
def test_3d_texture(self): # adapted from code by Nicolas Pinto w = 2 h = 4 d = 8 shape = (w, h, d) a = np.asarray( np.random.randn(*shape), dtype=np.float32, order="F") descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() mod = SourceModule(""" texture<float, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int x = threadIdx.x; int y = threadIdx.y; int z = threadIdx.z; int dx = blockDim.x; int dy = blockDim.y; int i = (z*dy + y)*dx + x; dest[i] = tex3D(mtx_tex, x, y, z); //dest[i] = x; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx_tex.set_array(ary) dest = np.zeros(shape, dtype=np.float32, order="F") copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex]) assert la.norm(dest-a) == 0
def test_3d_texture(self): # adapted from code by Nicolas Pinto w = 2 h = 4 d = 8 shape = (w, h, d) a = numpy.asarray(numpy.random.randn(*shape), dtype=numpy.float32, order="F") descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() mod = SourceModule(""" texture<float, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int x = threadIdx.x; int y = threadIdx.y; int z = threadIdx.z; int dx = blockDim.x; int dy = blockDim.y; int i = (z*dy + y)*dx + x; dest[i] = tex3D(mtx_tex, x, y, z); //dest[i] = x; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx_tex.set_array(ary) dest = numpy.zeros(shape, dtype=numpy.float32, order="F") copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex]) assert la.norm(dest - a) == 0
def get_dckernel(slen): # Right now, hardcoding the number of threads per block nt = 1024 nb = int(numpy.ceil(slen / 1024.0)) if nb > 1024: raise ValueError("More than 1024 blocks not supported yet") try: return dckernel_cache[nb] except KeyError: mod = SourceModule(kernel_sources.render(ntpb=nt, nblocks=nb)) freq_tex = mod.get_texref("freq_tex") amp_tex = mod.get_texref("amp_tex") phase_tex = mod.get_texref("phase_tex") fn1 = mod.get_function("find_block_indices") fn1.prepare("PPifff", texrefs=[freq_tex]) fn2 = mod.get_function("linear_interp") fn2.prepare("PfiffiPP", texrefs=[freq_tex, amp_tex, phase_tex]) dckernel_cache[nb] = (fn1, fn2, freq_tex, amp_tex, phase_tex, nt, nb) return dckernel_cache[nb]
def prepare_functions(s): from pycuda.compiler import SourceModule kernels = ''.join( open("dielectric.cu",'r').readlines() ) mod = SourceModule( kernels.replace('Dx',str(s.Dx)).replace('Dy',str(s.Dy)).replace('nyz',str(s.ny*s.nz)).replace('nx',str(s.nx)).replace('ny',str(s.ny)).replace('nz',str(s.nz)) ) s.updateH = mod.get_function("update_h") s.updateE = mod.get_function("update_e") s.updateE_src = mod.get_function("update_src") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tcex.set_array(s.tcex_gpu) tcey.set_array(s.tcey_gpu) tcez.set_array(s.tcez_gpu) Bx, By = s.nz/s.Dx, s.nx*s.ny/s.Dy # number of block s.MaxBy = s.MAX_BLOCK/Bx s.bpg_list = [(Bx,s.MaxBy) for i in range(By/s.MaxBy)] if By%s.MaxBy != 0: s.bpg_list.append( (Bx,By%s.MaxBy) ) s.updateH.prepare("iPPPPPP", block=(s.Dx,s.Dy,1)) s.updateE.prepare("iPPPPPP", block=(s.Dx,s.Dy,1), texrefs=[tcex,tcey,tcez]) s.updateE_src.prepare("fP", block=(s.nz,1,1))
def test_3d_fp_textures(self): orden = "C" npoints = 32 for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = "fp_tex_cfloat" elif prec == np.complex128: fpName_str = "fp_tex_cdouble" elif prec == np.float64: fpName_str = "fp_tex_double" else: fpName_str = prec_str A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) myKern = """ #include <pycuda-helpers.hpp> texture<fpName, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; int slice = blockIdx.z*blockDim.z + threadIdx.z; dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row); } """ myKern = myKern.replace("fpName", fpName_str) myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (8, 8, 8) if cuBlock[0] > npoints: cuBlock = (npoints, npoints, npoints) cuGrid = ( npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), ) copy_texture.prepare("P", texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec) A_gpu.gpudata.free()
def mk_tex_kernel(params, func_name, tex_name, kernel_file, prepare_args=None): kernel_file = kernels_dir + kernel_file key = (params, kernel_file, prepare_args) if key in _kernel_cache: return _kernel_cache[key] with open(kernel_file) as code_file: code = code_file.read() src = code % params mod = SourceModule(src, include_dirs=[kernels_dir]) fn = mod.get_function(func_name) tex = mod.get_texref(tex_name) if prepare_args is not None: fn.prepare(prepare_args) _kernel_cache[key] = (fn, tex) return fn, tex
def test_2d_fp_texturesLayered(self): orden = "F" npoints = 32 for prec in [ np.int16, np.float32, np.float64, np.complex64, np.complex128 ]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = 'fp_tex_cfloat' elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' elif prec == np.float64: fpName_str = 'fp_tex_double' else: fpName_str = prec_str A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec) A_cpu[:] = np.random.rand(npoints, npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) myKern = ''' #include <pycuda-helpers.hpp> texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1); } ''' myKern = myKern.replace('fpName', fpName_str) myKern = myKern.replace('cuPres', prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (16, 16, 1) if cuBlock[0] > npoints: cuBlock = (npoints, npoints, 1) cuGrid = (npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), 1) copy_texture.prepare('P', texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=True) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec) A_gpu.gpudata.free()
def __init__(self, pyr_scale=0.9, levels=15, winsize=9, num_iterations=5, poly_n=5, poly_sigma=1.2, use_gaussian_kernel: bool = True, use_initial_flow=None, quit_at_level=None, use_gpu=True, upscale_on_termination=True, fast_gpu_scaling=True, vesselmask_gpu=None): self.pyr_scale = pyr_scale self.levels = levels self.winsize = winsize self.num_iterations = num_iterations self.poly_n = poly_n self.poly_sigma = poly_sigma self.use_gaussian_kernel = use_gaussian_kernel self.use_initial_flow = use_initial_flow self.use_gpu = use_gpu self.upscale_on_termination = upscale_on_termination self._fast_gpu_scaling = fast_gpu_scaling self.quit_at_level = quit_at_level self._dump_everything = False self._show_everything = False self._vesselmask_gpu = vesselmask_gpu self._resize_kernel_size_factor = 4 self._max_resize_kernel_size = 9 with open( os.path.join(os.path.dirname(__file__), 'farneback_kernels.cu')) as f: read_data = f.read() f.closed mod = SourceModule(read_data) self._update_matrices_kernel = mod.get_function( 'FarnebackUpdateMatrices') self._invG_gpu = mod.get_global('invG')[0] self._weights_gpu = mod.get_global('weights')[0] self._poly_expansion_kernel = mod.get_function('calcPolyCoeficients') self._warp_kernel = mod.get_function('warpByFlowField') self._r1_texture = mod.get_texref('sourceTex') self._solve_equations_kernel = mod.get_function('solveEquationsCramer')
def get_flat_kernel(self): from pycuda.tools import dtype_to_ctype mod = SourceModule( COO_FLAT_KERNEL_TEMPLATE % { "value_type": dtype_to_ctype(self.dtype), "tex_value_type": dtype_to_ctype( self.dtype, with_fp_tex_hack=True), "index_type": dtype_to_ctype(self.index_dtype), "block_size": self.block_size, "warp_size": drv.Context.get_device().warp_size, }) func = mod.get_function("spmv_coo_flat_kernel") x_texref = mod.get_texref("tex_x") func.prepare(self.index_dtype.char*2 + "PPPP", (self.block_size, 1, 1), texrefs=[x_texref]) return func, x_texref
def cuda_interpolate(self, channel, m, size_result): cols = size_result[0] rows = size_result[1] kernel_code = """ texture<float, 2> tex; __global__ void interpolation(float *dest, float *m0, float *m1) { int idx = threadIdx.x + blockDim.x * blockIdx.x; int idy = threadIdx.y + blockDim.y * blockIdx.y; if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) { dest[%(NDIM)s * idx + idy] = tex2D(tex, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]); } } """ kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows} mod = SourceModule(kernel_code) interpolation = mod.get_function("interpolation") texref = mod.get_texref("tex") channel = channel.astype("float32") drv.matrix_to_texref(channel, texref, order="F") texref.set_filter_mode(drv.filter_mode.LINEAR) bdim = (16, 16, 1) dx, mx = divmod(cols, bdim[0]) dy, my = divmod(rows, bdim[1]) gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1]) dest = np.zeros((rows, cols)).astype("float32") m0 = (m[0, :] - 1).astype("float32") m1 = (m[1, :] - 1).astype("float32") interpolation(drv.Out(dest), drv.In(m0), drv.In(m1), block=bdim, grid=gdim, texrefs=[texref]) return dest.astype("uint8")
def resize_gpu(y_gpu, out_shape): in_shape = np.array(y_gpu.shape).astype(np.uint32) dtype = y_gpu.dtype if dtype != np.float32: raise NotImplementedException('Only float at the moment') block_size = (16,16,1) grid_size = (int(np.ceil(float(out_shape[1])/block_size[0])), int(np.ceil(float(out_shape[0])/block_size[1]))) preproc = _generate_preproc(dtype) mod = SourceModule(preproc + resize_code, keep=True) resize_fun_gpu = mod.get_function("resize") resized_gpu = cua.empty(tuple((np.int(out_shape[0]), np.int(out_shape[1]))),y_gpu.dtype) temp_gpu, pitch = cu.mem_alloc_pitch(4 * y_gpu.shape[1], y_gpu.shape[0], 4) copy_object = cu.Memcpy2D() copy_object.set_src_device(y_gpu.gpudata) copy_object.set_dst_device(temp_gpu) copy_object.src_pitch = 4 * y_gpu.shape[1] copy_object.dst_pitch = pitch copy_object.width_in_bytes = 4 * y_gpu.shape[1] copy_object.height = y_gpu.shape[0] copy_object(aligned=False) in_tex = mod.get_texref('in_tex') descr = cu.ArrayDescriptor() descr.width = y_gpu.shape[1] descr.height = y_gpu.shape[0] descr.format = cu.array_format.FLOAT descr.num_channels = 1 #pitch = y_gpu.nbytes / y_gpu.shape[0] in_tex.set_address_2d(temp_gpu, descr, pitch) in_tex.set_filter_mode(cu.filter_mode.LINEAR) in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES) resize_fun_gpu(resized_gpu.gpudata, np.uint32(out_shape[0]), np.uint32(out_shape[1]), block=block_size, grid=grid_size) temp_gpu.free() return resized_gpu
def get_cuda_ellpack(): ''' Method read ELLPACK CUDA code from file, compile it, and returns ready kernel and texture. Returns ======= Tuple of compiled kernel and texture ''' kernel_info = {'file_' : 'ellpack_kernel.c', 'kernel' : 'SpMV_Ellpack', 'texref' : 'mainVecTexRef'} with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_: tpl = file_.read() mod = SourceModule(tpl) kernel = mod.get_function(kernel_info['kernel']) texref = mod.get_texref(kernel_info['texref']) return (kernel, texref)
def get_flat_kernel(self): from pycuda.tools import dtype_to_ctype mod = SourceModule( COO_FLAT_KERNEL_TEMPLATE % { "value_type": dtype_to_ctype(self.dtype), "tex_value_type": dtype_to_ctype(self.dtype, with_fp_tex_hack=True), "index_type": dtype_to_ctype(self.index_dtype), "block_size": self.block_size, "warp_size": drv.Context.get_device().warp_size, }) func = mod.get_function("spmv_coo_flat_kernel") x_texref = mod.get_texref("tex_x") func.prepare(self.index_dtype.char * 2 + "PPPP", (self.block_size, 1, 1), texrefs=[x_texref]) return func, x_texref
def mk_tex_kernel(params, func_name, tex_name, kernel_file, prepare_args = None): kernel_file = kernels_dir + kernel_file key = (params, kernel_file, prepare_args) if key in _kernel_cache: return _kernel_cache[key] with open(kernel_file) as code_file: code = code_file.read() src = code % params mod = SourceModule(src, include_dirs = [kernels_dir]) fn = mod.get_function(func_name) tex = mod.get_texref(tex_name) if prepare_args is not None: fn.prepare(prepare_args) _kernel_cache[key] = (fn, tex) return fn, tex
def test_2d_fp_texturesLayered(self): orden = "F" npoints = 32 for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = 'fp_tex_cfloat' elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' elif prec == np.float64: fpName_str = 'fp_tex_double' else: fpName_str = prec_str A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) A_cpu[:] = np.random.rand(npoints,npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden) myKern = ''' #include <pycuda-helpers.hpp> texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1); } ''' myKern = myKern.replace('fpName',fpName_str) myKern = myKern.replace('cuPres',prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (16,16,1) if cuBlock[0]>npoints: cuBlock = (npoints,npoints,1) cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) copy_texture.prepare('P',texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=True) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) A_gpu.gpudata.free()
def _load_kernel(self): path = os.path.join(os.path.dirname(__file__), "deskew.cu") with open(path, 'r') as fd: tpl = Template(fd.read()) source = tpl.render(dst_type=dtype_to_ctype(self._dtype)) module = SourceModule(source) self._kernel = module.get_function("deskew_kernel") self._d_px_shift, _ = module.get_global('px_shift') self._d_vsin, _ = module.get_global('vsin') self._d_vcos, _ = module.get_global('vcos') self._texture = module.get_texref("ref_vol") self._texture.set_address_mode(0, cuda.address_mode.BORDER) self._texture.set_address_mode(1, cuda.address_mode.BORDER) self._texture.set_address_mode(2, cuda.address_mode.BORDER) self._texture.set_filter_mode(cuda.filter_mode.LINEAR) self._kernel.prepare('Piiiii',texrefs=[self._texture])
def get_cuda_sertilp(sh_dot_size=None, threads_per_row=2, slice_size=32, prefetch=2): ''' Method read SERTILP CUDA code from file, build it with arguments, compile it, and returns ready kernel and texture. The parameters of method must be equal parameters of converted matrix, which is multiplied. Parameters ========== sh_dot_size : int Size of cache array. For Sertilp format must be equal threads per row * slice size. If get another value execute badly. If get None calculates this automatically. threads_per_row : int > 0 (Recommended 2, 4 or 8) Threads per row slice_size : int (Recommended multiple 2) Slice simple size. prefetch : int (recommended 2, 4 or 8) Number of requests for access to data notified in advance. Returns ======= Tuple of compiled kernel and texture ''' kernel_info = {'file_' : 'sertilp_kernel.c', 'kernel' : 'SpMV_Sertilp', 'texref' : 'mainVecTexRef'} with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_: tpl = file_.read() if sh_dot_size is None: sh_dot_size = threads_per_row * slice_size tpl = convert_string(tpl, shDot_size=sh_dot_size, threadPerRow=threads_per_row, sliceSize=slice_size, prefetch=prefetch) mod = SourceModule(tpl) kernel = mod.get_function(kernel_info['kernel']) texref = mod.get_texref(kernel_info['texref']) return (kernel, texref)
def test_fp_textures(self): if drv.Context.get_device().compute_capability() < (1, 3): return for tp in [np.float32, np.float64]: tp_cstr = dtype_to_ctype(tp) mod = SourceModule( """ #include <pycuda-helpers.hpp> texture<fp_tex_%(tp)s, 1, cudaReadModeElementType> my_tex; __global__ void copy_texture(%(tp)s *dest) { int i = threadIdx.x; dest[i] = fp_tex1Dfetch(my_tex, i); } """ % {"tp": tp_cstr} ) copy_texture = mod.get_function("copy_texture") my_tex = mod.get_texref("my_tex") shape = (384,) a = np.random.randn(*shape).astype(tp) a_gpu = gpuarray.to_gpu(a) a_gpu.bind_to_texref_ext(my_tex, allow_double_hack=True) dest = np.zeros(shape, dtype=tp) copy_texture( drv.Out(dest), block=shape + ( 1, 1, ), texrefs=[my_tex], ) assert la.norm(dest - a) == 0
def get_cuda_ertilp(block_size, threads_per_row, prefetch): ''' Method read ERTILP CUDA code from file, build it with arguments, compile it, and returns ready kernel and texture. The parameters of method must be equal parameters of converted matrix, which is multiplied. Parameters ========== block_size : int (Recommended 128 or 256) Size of block threads_per_row : int > 0 (Recommended 2, 4 or 8) Threads per row prefetch : int (recommended 2, 4 or 8) Number of requests for access to data notified in advance. Returns ======= Tuple of compiled kernel and texture ''' kernel_info = {'file_' : 'ertilp_kernel.c', 'kernel' : 'SpMV_Ertilp', 'texref' : 'mainVecTexRef'} with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_: tpl = file_.read() prefetch_init_tab = '{' + \ ', '.join('0' for i in range(prefetch)) + \ '}' tpl = convert_string(tpl, BLOCK_SIZE=block_size, THREADS_ROW=threads_per_row, PREFETCH_SIZE=prefetch, PREFETCH_INIT_TAB=prefetch_init_tab) mod = SourceModule(tpl) kernel = mod.get_function(kernel_info['kernel']) texref = mod.get_texref(kernel_info['texref']) return (kernel, texref)
def test_fp_textures(self): if drv.Context.get_device().compute_capability() < (1, 3): return for tp in [np.float32, np.float64]: from pycuda.tools import dtype_to_ctype tp_cstr = dtype_to_ctype(tp) mod = SourceModule( """ #include <pycuda-helpers.hpp> texture<fp_tex_%(tp)s, 1, cudaReadModeElementType> my_tex; __global__ void copy_texture(%(tp)s *dest) { int i = threadIdx.x; dest[i] = fp_tex1Dfetch(my_tex, i); } """ % {"tp": tp_cstr} ) copy_texture = mod.get_function("copy_texture") my_tex = mod.get_texref("my_tex") import pycuda.gpuarray as gpuarray shape = (384,) a = np.random.randn(*shape).astype(tp) a_gpu = gpuarray.to_gpu(a) a_gpu.bind_to_texref_ext(my_tex, allow_double_hack=True) dest = np.zeros(shape, dtype=tp) copy_texture(drv.Out(dest), block=shape + (1, 1), texrefs=[my_tex]) assert la.norm(dest - a) == 0
def warp_by_flow(vol, flow3d): """ Only used for testing at the moment (warps currently backward) """ with open(os.path.join(os.path.dirname(__file__), 'farneback_kernels.cu')) as f: read_data = f.read() f.closed mod = SourceModule(read_data) interpolation_kernel = mod.get_function('warpByFlowField') r1_texture = mod.get_texref('sourceTex') farneback3d._utils.ndarray_to_float_tex(r1_texture, vol) rtn_gpu = gpuarray.GPUArray(vol.shape, vol.dtype) flow_gpu = gpuarray.to_gpu(flow3d) block = (32, 32, 1) grid = (int(divup(flow3d.shape[3], block[0])), int(divup(flow3d.shape[2], block[1])), 1) interpolation_kernel(flow_gpu, rtn_gpu, np.int32(flow_gpu.shape[3]), np.int32(flow_gpu.shape[2]), np.int32(flow_gpu.shape[1]), np.float32(1), np.float32(1), np.float32(1), block=block, grid=grid) return rtn_gpu.get()
#set thread grid for CUDA kernels block_size_x, block_size_y = 16, 8 #hardcoded, tune to your needs gridx = nWidth // block_size_x + 1 * (nWidth % block_size_x != 0) gridy = nHeight // block_size_y + 1 * (nHeight % block_size_y != 0) grid2D = (gridx, gridy, 1) block2D = (block_size_x, block_size_y, 1) #initialize pyCUDA context cudaDevice = setCudaDevice(devN=useDevice, usingAnimation=True) #Read and compile CUDA code print "Compiling CUDA code" cudaCodeString_raw = open("CUDAlatticeBoltzmann2D.cu", "r").read() cudaCodeString = cudaCodeString_raw # % { "BLOCK_WIDTH":block2D[0], "BLOCK_HEIGHT":block2D[1], "BLOCK_DEPTH":block2D[2], } cudaCode = SourceModule(cudaCodeString) tex_f1 = cudaCode.get_texref('tex_f1') tex_f2 = cudaCode.get_texref('tex_f2') tex_f3 = cudaCode.get_texref('tex_f3') tex_f4 = cudaCode.get_texref('tex_f4') tex_f5 = cudaCode.get_texref('tex_f5') tex_f6 = cudaCode.get_texref('tex_f6') tex_f7 = cudaCode.get_texref('tex_f7') tex_f8 = cudaCode.get_texref('tex_f8') tex_g1 = cudaCode.get_texref('tex_g1') tex_g2 = cudaCode.get_texref('tex_g2') tex_g3 = cudaCode.get_texref('tex_g3') tex_g4 = cudaCode.get_texref('tex_g4') tex_g5 = cudaCode.get_texref('tex_g5') tex_g6 = cudaCode.get_texref('tex_g6') tex_g7 = cudaCode.get_texref('tex_g7') tex_g8 = cudaCode.get_texref('tex_g8')
mcopy.depth = nx arrcopy(mcopy, cf, tcf_gpu) # prepare kernels from pycuda.compiler import SourceModule kernels = kernels.replace('Dx', '16').replace('Dy', '16').replace( 'nyz', str(ny * nz)).replace('nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz)) mod = SourceModule(kernels) naive = mod.get_function("naive") tex3d = mod.get_function("tex3d") tcf = mod.get_texref("tcf") tcf.set_array(tcf_gpu) # measure kernel execution time start = cuda.Event() stop = cuda.Event() start.record() naive(f_gpu, cf_gpu, block=(16, 16, 1), grid=(nz / 16, nx * ny / 16)) tex3d(g_gpu, block=(16, 16, 1), grid=(nz / 16, nx * ny / 16), texrefs=[tcf]) cuda.memcpy_dtoh(f, f_gpu) cuda.memcpy_dtoh(g, g_gpu) assert (np.linalg.norm(f - g) == 0)
def weighted_basis_gpu(psf_size, grid_size, im_size, params, lens_file=None, lens_psf_size=None, lens_grid_size=None): # generate grid psf_size = psf_size + (1 - np.mod(psf_size, 2)) if lens_file: grid = scipy.misc.imread(lens_file, flatten=True) if np.max(grid) > 255: grid /= 2**(16-1) else: grid /= 255 else: grid = np.zeros(psf_size, dtype=np.float32) grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1. grid = np.tile(grid, grid_size) lens_psf_size = psf_size #lens_grid_size = (1,1) lens_grid_size = grid_size grid_gpu = cu.matrix_to_array(grid, 'C') params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C') block_size = (16,16,1) output_size = np.array(grid_size)*np.array(psf_size) gpu_grid_size = (int(np.ceil(float(output_size[1])/block_size[0])), int(np.ceil(float(output_size[0])/block_size[1]))) weighted_basis_gpu = cua.empty((int(output_size[0]), int(output_size[1])), np.float32) preproc = '' #_generate_preproc(basis_gpu.dtype) mod = SourceModule(preproc + basis_code, keep=True) basis_fun_gpu = mod.get_function("weighted_basis") in_tex = mod.get_texref('in_tex') in_tex.set_array(grid_gpu) in_tex.set_filter_mode(cu.filter_mode.LINEAR) #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES) params_tex = mod.get_texref('params_tex') params_tex.set_array(params_gpu) offset = ((np.array(im_size) - np.array(grid.shape)) / np.array(grid_size).astype(np.float32)) offset = np.float32(offset) grid_scale = ((np.array(lens_grid_size) - 1) / (np.array(grid_size) - 1).astype(np.float32)) grid_scale = np.float32(grid_scale) #psf_scale = ((np.array(lens_psf_size) - 1) / # (np.array(psf_size) - 1).astype(np.float32)) #psf_scale = np.float32(psf_scale) basis_fun_gpu(weighted_basis_gpu.gpudata, np.uint32(output_size[0]), np.uint32(output_size[1]), np.uint32(psf_size[0]), np.uint32(psf_size[1]), np.uint32(im_size[0]), np.uint32(im_size[1]), offset[0], offset[1], grid_scale[0], grid_scale[1], np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]), np.uint32(params.size/3), block=block_size, grid=gpu_grid_size) return weighted_basis_gpu
cex_gpu = cuda.to_device(set_c(cf, (None, -1, -1))) cey_gpu = cuda.to_device(set_c(cf, (-1, None, -1))) cez_gpu = cuda.to_device(set_c(cf, (-1, -1, None))) chx_gpu = cuda.to_device(set_c(cf, (None, 0, 0))) chy_gpu = cuda.to_device(set_c(cf, (0, None, 0))) chz_gpu = cuda.to_device(set_c(cf, (0, 0, None))) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule(kernels) update_e = mod.get_function("update_e") update_h = mod.get_function("update_h") update_src = mod.get_function("update_src") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tchx = mod.get_texref("tchx") tchy = mod.get_texref("tchy") tchz = mod.get_texref("tchz") tcex.set_address(cex_gpu, cf.nbytes) tcey.set_address(cey_gpu, cf.nbytes) tcez.set_address(cez_gpu, cf.nbytes) tchx.set_address(chx_gpu, cf.nbytes) tchy.set_address(chy_gpu, cf.nbytes) tchz.set_address(chz_gpu, cf.nbytes) tpb = 512 bpg = (nx * ny * nz) / tpb
if (x < Nx && y < Ny && z < Nz) { float value = tex3D(tex_in, (float) x, (float) y, float (z)); surf3Dwrite((float) value, surf_out, sizeof(float) * x, y, z, cudaBoundaryModeZero); } } ''' mod = SourceModule(src_module, cache_dir=False, keep=False) kernel = mod.get_function("test_3d_surf") arg_types = (np.int32, np.int32, np.int32) tex_in = mod.get_texref('tex_in') surf_out = mod.get_surfref('surf_out') # random shape shape_x = np.random.randint(1, 255) shape_y = np.random.randint(1, 255) shape_z = np.random.randint(1, 255) dtype = np.float32 # should match src_module's datatype numpy_array_in = np.random.randn(shape_z, shape_y, shape_x).astype(dtype).copy() cuda_array_in = numpy3d_to_array(numpy_array_in) tex_in.set_array(cuda_array_in) zeros = np.zeros_like(numpy_array_in)
def main(): #Set up global timer tot_time = time.time() #Define constants BankSize = 8 # Do not go beyond 8! WarpSize = 32 #Do not change... DimGridX = 19 DimGridY = 19 BlockDimX = 256 BlockDimY = 256 SearchSpaceSize = 2**24 #BlockDimX * BlockDimY * 32 FitnessValDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize GenomeDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize AlignedByteLengthGenome = 4 #Create dictionary argument for rendering RenderArgs= {"safe_memory_mapping":1, "aligned_byte_length_genome":AlignedByteLengthGenome, "bit_length_edge_type":3, "curand_nr_threads_per_block":32, "nr_tile_types":2, "nr_edge_types":8, "warpsize":WarpSize, "fit_dim_thread_x":32*BankSize, "fit_dim_thread_y":1, "fit_dim_block_x":BlockDimX, "fit_dim_grid_x":19, "fit_dim_grid_y":19, "fit_nr_four_permutations":24, "fit_length_movelist":244, "fit_nr_redundancy_grid_depth":2, "fit_nr_redundancy_assemblies":10, "fit_tile_index_starting_tile":0, "glob_nr_tile_orientations":4, "banksize":BankSize, "curand_dim_block_x":BlockDimX } # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', './')) # Load source code from file Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() ) # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20") Kernel = KernelSourceModule.get_function("SearchSpaceKernel") CurandKernel = KernelSourceModule.get_function("CurandInitKernel") #Initialise InteractionMatrix InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32) def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Set up our InteractionMatrix InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") print InteractionMatrix #Set-up genomes #dest = numpy.arange(GenomeDim*4).astype(numpy.uint8) #for i in range(0, GenomeDim/4): #dest[i*8 + 0] = int('0b00100101',2) #CRASHES #dest[i*8 + 1] = int('0b00010000',2) #CRASHES #dest[i*8 + 0] = int('0b00101000',2) #dest[i*8 + 1] = int('0b00000000',2) #dest[i*8 + 2] = int('0b00000000',2) #dest[i*8 + 3] = int('0b00000000',2) #dest[i*8 + 4] = int('0b00000000',2) #dest[i*8 + 5] = int('0b00000000',2) #dest[i*8 + 6] = int('0b00000000',2) #dest[i*8 + 7] = int('0b00000000',2) # dest[i*4 + 0] = 40 # dest[i*4 + 1] = 0 # dest[i*4 + 2] = 0 # dest[i*4 + 3] = 0 dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes) #drv.memcpy_htod(dest_h, dest) #print "Genomes before: " #print dest #Set-up grids #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up fitness values #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32) #fitness_h = drv.mem_alloc(fitness.nbytes) #fitness_size = numpy.zeros(FitnessValDim).astype(numpy.uint32) fitness_size = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0) fitness_size_h = drv.mem_alloc(fitness_size.nbytes) #fitness_hash = numpy.zeros(FitnessValDim).astype(numpy.uint32) fitness_hash = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0) fitness_hash_h = drv.mem_alloc(fitness_hash.nbytes) #drv.memcpy_htod(fitness_h, fitness) #print "Fitness values:" #print fitness #Set-up grids #grids = numpy.zeros((GenomeDim, DimGridX, DimGridY)).astype(numpy.uint8) #TEST grids = drv.pagelocked_zeros((GenomeDim, DimGridX, DimGridY), numpy.uint8, "C", 0) grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up curand #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8); #curand_h = drv.mem_alloc(curand.nbytes) curand_h = drv.mem_alloc(40*GenomeDim) #SearchSpace control #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" #Schedule kernel calls #MaxBlockDim = 100 OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*WarpSize) MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*WarpSize) BlockCounter = 0 print "Will do that many kernels a ", BlockDimX,"x", BlockDimY,"x ", WarpSize, ":", MaxBlockCycles #quit() #SET UP PROCESSING histo = {} #INITIALISATION CurandKernel(curand_h, block=(WarpSize,1,1), grid=(BlockDimX, BlockDimY)) print "Finished Curand kernel, starting main kernel..." #FIRST GENERATION proc_time = time.time() print "Starting first generation..." start = drv.Event() stop = drv.Event() start.record() Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64(0), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY)) stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "Copying..." drv.memcpy_dtoh(fitness_size, fitness_size_h) drv.memcpy_dtoh(fitness_hash, fitness_hash_h) drv.memcpy_dtoh(grids, grids_h) #INTERMEDIATE GENERATION for i in range(MaxBlockCycles-1): print "Starting generation: ", i+1 start = drv.Event() stop = drv.Event() start.record() Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64((i+1)*BlockDimX*BlockDimY*WarpSize), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY)) "Processing..." for j in range(grids.shape[0]): # if (fitness_hash[j]!=33) and (fitness_hash[j]!=44) and (fitness_hash[j]!=22): if fitness_hash[j] in histo: histo[fitness_hash[j]] = (histo[fitness_hash[j]][0], histo[fitness_hash[j]][1]+1) else: histo[fitness_hash[j]] = (grids[j], 1) stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "This corresponds to %f polyomino classification a second."%((BlockDimX*BlockDimY*WarpSize)/(start.time_till(stop)*1e-3)) print "Copying..." drv.memcpy_dtoh(fitness_size, fitness_size_h) drv.memcpy_dtoh(fitness_hash, fitness_hash_h) drv.memcpy_dtoh(grids, grids_h) #FINAL PROCESSING "Processing..." for i in range(grids.shape[0]): if fitness_hash[i] in histo: histo[fitness_hash[i]] = (histo[fitness_hash[i]][0], histo[fitness_hash[i]][1]+1) else: histo[fitness_hash[i]] = (grids[i], 1) print "Done!" #TIMING RESULTS print "Total time including set-up: ", (time.time() - tot_time) print "Total Processing time: ", (time.time() - proc_time) #OUTPUT print histo
def main(): #Create dictionary argument for rendering RenderArgs= {"safe_memory_mapping":1, "aligned_byte_length_genome":8, "bit_length_edge_type":3, "curand_nr_threads_per_block":256, "nr_tile_types":4, "nr_edge_types":8, "warpsize":32, "fit_dim_thread_x":1, "fit_dim_thread_y":1, "fit_dim_block_x":1 } # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', './')) # Load source code from file Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() ) # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20") Kernel = KernelSourceModule.get_function("TestEdgeSortKernel") CurandKernel = KernelSourceModule.get_function("CurandInitKernel") #Initialise InteractionMatrix InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32) def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Set up our InteractionMatrix InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") print InteractionMatrix #a = numpy.random.randn(400).astype(numpy.uint8) #b = numpy.random.randn(400).astype(numpy.uint8) dest = numpy.arange(256).astype(numpy.uint8) for i in range(0, 256/8): dest[i*8 + 0] = 36 dest[i*8 + 1] = 151 dest[i*8 + 2] = 90 dest[i*8 + 3] = 109 dest[i*8 + 4] = 224 dest[i*8 + 5] = 4 dest[i*8 + 6] = 0 dest[i*8 + 7] = 0 dest_h = drv.mem_alloc(dest.nbytes) drv.memcpy_htod(dest_h, dest) print "before: " print dest curand = numpy.zeros(40*256).astype(numpy.uint8); curand_h = drv.mem_alloc(curand.nbytes) CurandKernel(curand_h, block=(32,1,1), grid=(1,1)) Kernel(dest_h, curand_h, block=(32,1,1), grid=(1,1)) drv.memcpy_dtoh(dest, dest_h) print "after: " print dest
gridx = nWidth // block_size_x + 1 * (nWidth % block_size_x != 0) gridy = nHeight // block_size_y + 1 * (nHeight % block_size_y != 0) block2D = (block_size_x, block_size_y, 1) grid2D = (gridx, gridy, 1) grid2D_ising = (gridx // 2, gridy, 1 ) #special grid to avoid neighbor conflicts #initialize pyCUDA context cudaDevice = setCudaDevice(devN=useDevice, usingAnimation=True) #Read and compile CUDA code print "\nCompiling CUDA code" cudaCodeString_raw = open("CUDAising2D.cu", "r").read() cudaCodeString = cudaCodeString_raw # % { "BLOCK_WIDTH":block2D[0], "BLOCK_HEIGHT":block2D[1], "BLOCK_DEPTH":block2D[2], } cudaCode = SourceModule(cudaCodeString) tex_spins = cudaCode.get_texref('tex_spinsIn') isingKernel = cudaCode.get_function('ising_kernel') ######################################################################## def sendToScreen(plotData): floatToUchar(plotData, plotData_d) copyToScreenArray() ######################################################################## def swipe(): randomNumbers_d = curandom.rand((nData)) stepNumber = np.int32(0) #saveEnergy = np.int32(0) tex_spins.set_array(spinsInArray_d)
def set_params(self, psf_size, grid_size, im_size, params=None): # generate grid psf_size = np.array(psf_size) grid_size = np.array(grid_size) im_size = np.array(im_size) self.psf_size = psf_size + (1 - np.mod(psf_size, 2)) self.grid_size = grid_size self.im_size = im_size if params != None: self.params = params self.shape = (params.size / 3,) else: self._psf2params() if not self.lens: grid = np.zeros(self.psf_size, dtype=np.float32) grid[(self.psf_size[0]-1)/2, (self.psf_size[1]-1)/2] = 1. grid = np.tile(grid, self.grid_size) self.lens_psf_size = self.psf_size #lens_grid_size = (1,1) self.lens_grid_size = self.grid_size self.grid_gpu = cu.matrix_to_array(grid, 'C') params_count = np.uint32(self.params.size / 3) params_gpu = cu.matrix_to_array(self.params.astype(np.float32), 'C') #self.output_size = np.array(self.grid_size)*np.array(self.psf_size) output_size = np.array((np.prod(self.grid_size), self.psf_size[0], self.psf_size[1])) preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype) mod = SourceModule(preproc + basis_code, keep=True) in_tex = mod.get_texref('in_tex') in_tex.set_array(self.grid_gpu) in_tex.set_filter_mode(cu.filter_mode.LINEAR) #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES) params_tex = mod.get_texref('params_tex') params_tex.set_array(params_gpu) offset = ((np.array(self.im_size) - np.array(grid.shape)) / np.array(self.grid_size).astype(np.float32)) offset = np.float32(offset) grid_scale = ((np.array(self.lens_grid_size) - 1) / (np.array(self.grid_size) - 1).astype(np.float32)) grid_scale = np.float32(grid_scale) block_size = (16,16,1) gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])), int(np.ceil(float(params_count)/block_size[1]))) basis_gpu = cua.empty((int(params_count), int(output_size[0]), int(output_size[1]), int(output_size[2])), np.float32) #self.basis_host = cu.pagelocked_empty((int(params_count), # int(output_size[0]), int(output_size[1]), int(output_size[2])), # np.float32, mem_flags=cu.host_alloc_flags.DEVICEMAP) basis_fun_gpu = mod.get_function("basis") basis_fun_gpu(basis_gpu.gpudata, np.uint32(np.prod(output_size)), np.uint32(self.grid_size[1]), np.uint32(self.psf_size[0]), np.uint32(self.psf_size[1]), np.uint32(self.im_size[0]), np.uint32(self.im_size[1]), offset[0], offset[1], grid_scale[0], grid_scale[1], np.uint32(self.lens_psf_size[0]), np.uint32(self.lens_psf_size[1]), params_count, block=block_size, grid=gpu_grid_size) self.basis_host = basis_gpu.get() self._intern_shape = self.basis_host.shape self.basis_host = self.basis_host.reshape((self._intern_shape[0], self._intern_shape[1]*self._intern_shape[2]*self._intern_shape[3])) self.basis_host = scipy.sparse.csr_matrix(self.basis_host)
def generate_basis_gpu(psf_size, grid_size, im_size, params, lens_file=None, lens_psf_size=None, lens_grid_size=None): # generate grid psf_size = psf_size + (1 - np.mod(psf_size, 2)) if lens_file: grid = scipy.misc.imread(lens_file, flatten=True) if np.max(grid) > 255: grid /= 2**(16-1) else: grid /= 255 else: grid = np.zeros(psf_size, dtype=np.float32) grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1. grid = np.tile(grid, grid_size) lens_psf_size = psf_size #lens_grid_size = (1,1) lens_grid_size = grid_size grid_gpu = cu.matrix_to_array(grid, 'C') # generate parameters of basis functions #p = max(1, np.floor(psf_size[0] / 2)) #p = min(8, p) #dp = min(45. / np.floor(psf_size * grid_size / 2)) #dp = min(0.8, dp) #dp = np.radians(dp) #p = np.radians(p) #l = max(1, np.floor(psf_size[0] / 2)) #l = np.ceil(l / 2) #params = np.mgrid[-l:l+1, -l:l+1, -p:p+dp/10:dp].astype(np.float32).T #params = params.reshape(params.size / 3, 3) params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C') block_size = (16,16,1) output_size = np.array((np.prod(np.array(grid_size)),psf_size[0],psf_size[1])) gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])), int(np.ceil(float(params.size/3)/block_size[1]))) basis_gpu = cua.empty((params.size/3, int(output_size[0]), int(output_size[1]), int(output_size[2])), np.float32) preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype) mod = SourceModule(preproc + basis_code, keep=True) basis_fun_gpu = mod.get_function("basis") in_tex = mod.get_texref('in_tex') in_tex.set_array(grid_gpu) in_tex.set_filter_mode(cu.filter_mode.LINEAR) #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES) params_tex = mod.get_texref('params_tex') params_tex.set_array(params_gpu) offset = ((np.array(im_size) - np.array(grid.shape)) / np.array(grid_size).astype(np.float32)) offset = np.float32(offset) grid_scale = ((np.array(lens_grid_size) - 1) / (np.array(grid_size) - 1).astype(np.float32)) grid_scale = np.float32(grid_scale) #psf_scale = ((np.array(lens_psf_size) - 1) / # (np.array(psf_size) - 1).astype(np.float32)) #psf_scale = np.float32(psf_scale) basis_fun_gpu(basis_gpu.gpudata, np.uint32(np.prod(output_size)), np.uint32(grid_size[1]), np.uint32(psf_size[0]), np.uint32(psf_size[1]), np.uint32(im_size[0]), np.uint32(im_size[1]), offset[0], offset[1], grid_scale[0], grid_scale[1], np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]), np.uint32(params.size/3), block=block_size, grid=gpu_grid_size) return basis_gpu
class GPURBFEll(object): """RBF Kernel with ellpack format""" cache_size =100 Gamma=1.0 #template func_name='rbfEllpackILPcol2multi' #template module_file = os.path.dirname(__file__)+'/cu/KernelsEllpackCol2.cu' #template texref_nameI='VecI_TexRef' texref_nameJ='VecJ_TexRef' max_concurrent_kernels=1 def __init__(self,gamma=1.0,cache_size=100): """ Initialize object Parameters ------------- max_kernel_nr: int determines maximal concurrent kernel column gpu computation """ self.cache_size=cache_size self.threadsPerRow=1 self.prefetch=2 self.tpb=128 self.Gamma = gamma def init_cuda(self,X,Y, cls_start, max_kernels=1 ): #assert X.shape[0]==Y.shape[0] self.max_concurrent_kernels = max_kernels self.X =X self.Y = Y self.cls_start=cls_start.astype(np.int32) #handle to gpu memory for y for each concurrent classifier self.g_y=[] #handle to gpu memory for results for each concurrent classifier self.g_out=[] #gpu kernel out self.kernel_out=[] #cpu kernel out #blocks per grid for each concurrent classifier self.bpg=[] #function reference self.func=[] #texture references for each concurrent kernel self.tex_ref=[] #main vectors #gpu self.g_vecI=[] self.g_vecJ=[] #cpu self.main_vecI=[] self.main_vecJ=[] #cpu class self.cls_count=[] self.cls=[] #gpu class self.g_cls_count=[] self.g_cls=[] self.sum_cls=[] for i in range(max_kernels): self.bpg.append(0) self.g_y.append(0) self.g_out.append(0) self.kernel_out.append(0) self.cls_count.append(0) self.cls.append(0) self.g_cls_count.append(0) self.g_cls.append(0) # self.func.append(0) # self.tex_ref.append(0) self.g_vecI.append(0) self.g_vecJ.append(0) # self.main_vecI.append(0) # self.main_vecJ.append(0) self.sum_cls.append(0) self.N,self.Dim = X.shape column_size = self.N*4 cacheMB = self.cache_size*1024*1024 #100MB for cache size #how many kernel colums will be stored in cache cache_items = np.floor(cacheMB/column_size).astype(int) cache_items = min(self.N,cache_items) self.kernel_cache = pylru.lrucache(cache_items) self.compute_diag() #cuda initialization cuda.init() self.dev = cuda.Device(0) self.ctx = self.dev.make_context() #reade cuda .cu file with module code with open (self.module_file,"r") as CudaFile: module_code = CudaFile.read(); #compile module self.module = SourceModule(module_code,keep=True,no_extern_c=True) (g_gamma,gsize)=self.module.get_global('GAMMA') cuda.memcpy_htod(g_gamma, np.float32(self.Gamma) ) #get functions reference Dim =self.Dim vecBytes = Dim*4 for f in range(self.max_concurrent_kernels): gfun = self.module.get_function(self.func_name) self.func.append(gfun) #init texture for vector I vecI_tex=self.module.get_texref('VecI_TexRef') self.g_vecI[f]=cuda.mem_alloc( vecBytes) vecI_tex.set_address(self.g_vecI[f],vecBytes) #init texture for vector J vecJ_tex=self.module.get_texref('VecJ_TexRef') self.g_vecJ[f]=cuda.mem_alloc( vecBytes) vecJ_tex.set_address(self.g_vecJ[f],vecBytes) self.tex_ref.append((vecI_tex,vecJ_tex) ) self.main_vecI.append(np.zeros((1,Dim),dtype=np.float32)) self.main_vecJ.append(np.zeros((1,Dim),dtype=np.float32)) texReflist = list(self.tex_ref[f]) #function definition P-pointer i-int gfun.prepare("PPPPPPiiiiiiPPP",texrefs=texReflist) #transform X to particular format v,c,r=spf.csr2ellpack(self.X,align=self.prefetch) #copy format data structure to gpu memory self.g_val = cuda.to_device(v) self.g_col = cuda.to_device(c) self.g_len = cuda.to_device(r) self.g_sdot = cuda.to_device(self.Xsquare) self.g_cls_start = cuda.to_device(self.cls_start) def cls_init(self,kernel_nr,y_cls,cls1,cls2,cls1_n,cls2_n): """ Prepare cuda kernel call for kernel_nr, copy data for particular binary classifier, between class 1 vs 2. Parameters ------------ kernel_nr : int concurrent kernel number y_cls : array-like binary class labels (1,-1) cls1: int first class number cls2: int second class number cls1_n : int number of elements of class 1 cls2_n : int number of elements of class 2 kernel_out : array-like array for gpu kernel result, size=2*len(y_cls) """ warp=32 align_cls1_n = cls1_n+(warp-cls1_n%warp)%warp align_cls2_n = cls2_n+(warp-cls2_n%warp)%warp self.cls1_N_aligned=align_cls1_n sum_cls= align_cls1_n+align_cls2_n self.sum_cls[kernel_nr] = sum_cls self.cls_count[kernel_nr] = np.array([cls1_n,cls2_n],dtype=np.int32) self.cls[kernel_nr] = np.array([cls1,cls2],dtype=np.int32) self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr]) self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr]) self.bpg[kernel_nr] =int( np.ceil( (self.threadsPerRow*sum_cls+0.0)/self.tpb )) self.g_y[kernel_nr] = cuda.to_device(y_cls) self.kernel_out[kernel_nr] = np.zeros(2*y_cls.shape[0],dtype=np.float32) ker_out = self.kernel_out[kernel_nr] self.g_out[kernel_nr] = cuda.to_device(ker_out) # cuda.mem_alloc_like(ker_out) #add prepare for device functions def K2Col(self,i,j,i_ds,j_ds,kernel_nr): """ computes i-th and j-th kernel column Parameters --------------- i: int i-th kernel column number in subproblem j: int j-th kernel column number in subproblem i_ds: int i-th kernel column number in whole dataset j_ds: int j-th kernel column number in whole dataset kernel_nr : int number of concurrent kernel ker2ColOut: array like array for output Returns ------- ker2Col """ #make i-th and j-the main vectors vecI= self.main_vecI[kernel_nr] vecJ= self.main_vecJ[kernel_nr] # self.X[i_ds,:].todense(out=vecI) # self.X[j_ds,:].todense(out=vecJ) #vecI.fill(0) #vecJ.fill(0) #self.X[i_ds,:].toarray(out=vecI) #self.X[j_ds,:].toarray(out=vecJ) vecI=self.X.getrow(i_ds).todense() vecJ=self.X.getrow(j_ds).todense() #copy them to texture cuda.memcpy_htod(self.g_vecI[kernel_nr],vecI) cuda.memcpy_htod(self.g_vecJ[kernel_nr],vecJ) # temp = np.empty_like(vecI) # cuda.memcpy_dtoh(temp,self.g_vecI[kernel_nr]) # print 'temp',temp #lauch kernel gfunc=self.func[kernel_nr] gy = self.g_y[kernel_nr] gout = self.g_out[kernel_nr] gN = np.int32(self.N) g_i = np.int32(i) g_j = np.int32(j) g_ids = np.int32(i_ds) g_jds = np.int32(j_ds) gNalign = np.int32(self.cls1_N_aligned) gcs = self.g_cls_start gcc = self.g_cls_count[kernel_nr] gc = self.g_cls[kernel_nr] bpg=self.bpg[kernel_nr] #print 'start gpu i,j,kernel_nr ',i,j,kernel_nr #texReflist = list(self.tex_ref[kernel_nr]) #gfunc(self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc,block=(self.tpb,1,1),grid=(bpg,1),texrefs=texReflist) #print 'end gpu',i,j #copy the results #grid=(bpg,1),block=(self.tpb,1,1) gfunc.prepared_call((bpg,1),(self.tpb,1,1),self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc) cuda.memcpy_dtoh(self.kernel_out[kernel_nr],gout) return self.kernel_out[kernel_nr] def K_vec(self,vec): ''' vec - array-like, row ordered data, should be not to big ''' dot=self.X.dot(vec.T) x2=self.Xsquare.reshape((self.Xsquare.shape[0],1)) if(sp.issparse(vec)): v2 = vec.multiply(vec).sum(1).reshape((1,vec.shape[0])) else: v2 = np.einsum('...i,...i',vec,vec) return np.exp(-self.Gamma*(x2+v2-2*dot)) def compute_diag(self): """ Computes kernel matrix diagonal """ #for rbf diagonal consists of ones exp(0)==1 self.Diag = np.ones(self.X.shape[0],dtype=np.float32) if(sp.issparse(self.X)): # result as matrix self.Xsquare = self.X.multiply(self.X).sum(1) #result as array self.Xsquare = np.asarray(self.Xsquare).flatten() else: self.Xsquare =np.einsum('...i,...i',self.X,self.X) def clean(self,kernel_nr): """ clean the kernel cache """ #self.kernel_cache.clear() self.bpg[kernel_nr]=0 def clean_cuda(self): ''' clean all cuda resources ''' for f in range(self.max_concurrent_kernels): #vecI_tex=?? #self.g_vecI[f].free() del self.g_vecI[f] #init texture for vector J #vecJ_tex=?? #self.g_vecJ[f].free() del self.g_vecJ[f] self.g_cls_count[f].free() self.g_cls[f].free() self.g_y[f].free() self.g_out[f].free() #test it #del self.g_out[f] ?? #copy format data structure to gpu memory self.g_val.free() self.g_col.free() self.g_len.free() self.g_sdot.free() self.g_cls_start.free() print self.ctx self.ctx.pop() print self.ctx del self.ctx def predict_init(self, SV): """ Init the classifier for prediction """ self.X =SV self.compute_diag()
def main(): #Initialise InteractionMatrix def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Initialise GPU (equivalent of autoinit) drv.init() assert drv.Device.count() >= 1 dev = drv.Device(0) ctx = dev.make_context(0) #Convert GlobalParams to List GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32) count = 0 for x in GlobalParamsDict.keys(): GlobalParams[count] = GlobalParamsDict[x] count += 1 #Convert FitnessParams to List FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32) count = 0 for x in FitnessParamsDict.keys(): FitnessParams[count] = FitnessParamsDict[x] count += 1 #Convert GAParams to List GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32) count = 0 for x in GAParamsDict.keys(): GAParams[count] = GAParamsDict[x] count += 1 # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', 'cuda')) # Load source code from file Source = env.get_template('kernel.cu') #Template( file(KernelFile).read() ) #Create dictionary argument for rendering RenderArgs= {"params_size":GlobalParams.nbytes,\ "fitnessparams_size":FitnessParams.nbytes,\ "gaparams_size":GAParams.nbytes,\ "genome_bytelength":int(ByteLengthGenome),\ "genome_bitlength":int(BitLengthGenome),\ "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\ "textures":range( 0, NrFitnessFunctionGrids ),\ "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\ "with_mixed_crossover":WithMixedCrossover, "with_bank_conflict":WithBankConflict, "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection, "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues, "with_uniform_crossover":WithUniformCrossover, "with_single_point_crossover":WithSinglePointCrossover, "with_surefire_mutation":WithSurefireMutation, "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory, "ga_threaddimx":int(ThreadDim), "glob_nr_tiletypes":int(NrTileTypes), "glob_nr_edgetypes":int(NrEdgeTypes), "glob_nr_tileorientations":int(NrTileOrientations), "fit_dimgridx":int(DimGridX), "fit_dimgridy":int(DimGridY), "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids), "fit_nr_fourpermutations":int(NrFourPermutations), "fit_assembly_redundancy":int(NrAssemblyRedundancy), "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock), "sort_threaddimx":int(Sort_ThreadDimX), "glob_nr_genomes":int(NrGenomes), "fit_dimthreadx":int(ThreadDimX), "fit_dimthready":int(ThreadDimY), "fit_dimsubgridx":int(SubgridDimX), "fit_dimsubgridy":int(SubgridDimY), "fit_nr_subgridsperbank":int(NrSubgridsPerBank), "glob_bitlength_edgetype":int(EdgeTypeBitLength) } # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_11", code="sm_11", cache_dir=None) #Allocate values on GPU Genomes_h = drv.mem_alloc(Genomes.nbytes) FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes) FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes) AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes) Mutexe_h = drv.mem_alloc(Mutexe.nbytes) ReductionList_h = drv.mem_alloc(ReductionList.nbytes) #Copy values to global memory drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #Copy values to constant / texture memory for id in range(0, NrFitnessFunctionGrids): FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) ) drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C") InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address drv.memcpy_htod(GlobalParams_h[0], GlobalParams) FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address drv.memcpy_htod(FitnessParams_h[0], FitnessParams) GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address drv.memcpy_htod(GAParams_h[0], GAParams) FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst") FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst") #Set up curandStates curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper) CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes) #Compile kernels curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel") fitness_fnc = KernelSourceModule.get_function("FitnessKernel") sorting_fnc = KernelSourceModule.get_function("SortingKernel") ga_fnc = KernelSourceModule.get_function("GAKernel") #Initialise Curand curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1)) #Build parameter lists for FitnessKernel and GAKernel FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h); SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h) GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h); #TEST ONLY return #TEST ONLY #Initialise CUDA timers start = drv.Event() stop = drv.Event() #execute kernels for specified number of generations start.record() for gen in range(0, GlobalParamsDict["NrGenerations"]): #print "Processing Generation: %d"%(gen) #fitness_fnc(*(FitnessKernelParams), block=fit_blocks, grid=fit_grid) #Launch CPU processing (should be asynchroneous calls) sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel drv.memcpy_dtoh(ReductionList, ReductionList_h) #Copy from Device to Host and finish sorting FitnessSumConst = ReductionList.sum() drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitneValues from Device to Device Const ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids) drv.memcpy_dtoh(Genomes, Genomes_h) #Copy data from GPU drv.memcpy_dtoh(FitnessValues, FitnessValues_h) drv.memcpy_dtoh(AssembledGrids, AssembledGrids_h) stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations) pass
def main(): #FourPermutations set-up FourPermutations = numpy.array([ [1,2,3,4], [1,2,4,3], [1,3,2,4], [1,3,4,2], [1,4,2,3], [1,4,3,2], [2,1,3,4], [2,1,4,3], [2,3,1,4], [2,3,4,1], [2,4,1,3], [2,4,3,1], [3,2,1,4], [3,2,4,1], [3,1,2,4], [3,1,4,2], [3,4,2,1], [3,4,1,2], [4,2,3,1], [4,2,1,3], [4,3,2,1], [4,3,1,2], [4,1,2,3], [4,1,3,2],]).astype(numpy.uint8) BankSize = 8 # Do not go beyond 8! #Define constants DimGridX = 19 DimGridY = 19 #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" BlockDimX = 100 BlockDimY = 100 SearchSpaceSize = BlockDimX * BlockDimY * 32 #BlockDimX = 600 #BlockDimY = 600 FitnessValDim = SearchSpaceSize GenomeDim = SearchSpaceSize #Create dictionary argument for rendering RenderArgs= {"safe_memory_mapping":1, "aligned_byte_length_genome":4, "bit_length_edge_type":3, "curand_nr_threads_per_block":32, "nr_tile_types":2, "nr_edge_types":8, "warpsize":32, "fit_dim_thread_x":32*BankSize, "fit_dim_thread_y":1, "fit_dim_block_x":BlockDimX, "fit_dim_grid_x":19, "fit_dim_grid_y":19, "fit_nr_four_permutations":24, "fit_length_movelist":244, "fit_nr_redundancy_grid_depth":2, "fit_nr_redundancy_assemblies":10, "fit_tile_index_starting_tile":0, "glob_nr_tile_orientations":4, "banksize":BankSize, "curand_dim_block_x":BlockDimX } # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', './')) # Load source code from file Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() ) # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20") Kernel = KernelSourceModule.get_function("SearchSpaceKernel") CurandKernel = KernelSourceModule.get_function("CurandInitKernel") #Initialise InteractionMatrix InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32) def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Set up our InteractionMatrix InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") print InteractionMatrix #Set-up genomes dest = numpy.arange(GenomeDim*4).astype(numpy.uint8) #for i in range(0, GenomeDim/4): #dest[i*8 + 0] = int('0b00100101',2) #CRASHES #dest[i*8 + 1] = int('0b00010000',2) #CRASHES #dest[i*8 + 0] = int('0b00101000',2) #dest[i*8 + 1] = int('0b00000000',2) #dest[i*8 + 2] = int('0b00000000',2) #dest[i*8 + 3] = int('0b00000000',2) #dest[i*8 + 4] = int('0b00000000',2) #dest[i*8 + 5] = int('0b00000000',2) #dest[i*8 + 6] = int('0b00000000',2) #dest[i*8 + 7] = int('0b00000000',2) # dest[i*4 + 0] = 40 # dest[i*4 + 1] = 0 # dest[i*4 + 2] = 0 # dest[i*4 + 3] = 0 dest_h = drv.mem_alloc(GenomeDim*4) #dest.nbytes) #drv.memcpy_htod(dest_h, dest) #print "Genomes before: " #print dest #Set-up grids #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up fitness values #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32) #fitness_h = drv.mem_alloc(fitness.nbytes) fitness_left = numpy.zeros(FitnessValDim).astype(numpy.float32) fitness_left_h = drv.mem_alloc(fitness_left.nbytes) fitness_bottom = numpy.zeros(FitnessValDim).astype(numpy.float32) fitness_bottom_h = drv.mem_alloc(fitness_bottom.nbytes) #drv.memcpy_htod(fitness_h, fitness) #print "Fitness values:" #print fitness #Set-up grids grids = numpy.zeros((10000*32, DimGridX, DimGridY)).astype(numpy.uint8) #TEST grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up curand #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8); #curand_h = drv.mem_alloc(curand.nbytes) curand_h = drv.mem_alloc(40*GenomeDim) #Set-up four permutations FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) #SearchSpace control #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" #Schedule kernel calls #MaxBlockDim = 100 OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*32) MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*32) BlockCounter=0 print "Will do that many kernels a ",BlockDimX,"x",BlockDimY,":", MaxBlockCycles for i in range(MaxBlockCycles): #Set-up timer start = drv.Event() stop = drv.Event() start.record() print "Start kernel:" #Call kernels CurandKernel(curand_h, block=(32,1,1), grid=(BlockDimX, BlockDimY)) print "Finished Curand kernel, starting main kernel..." Kernel(dest_h, grids_h, fitness_left_h, fitness_bottom_h, curand_h, block=(32*BankSize,1,1), grid=(BlockDimX,BlockDimY)) #End timer stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations) pass #Output #drv.memcpy_dtoh(dest, dest_h) #print "Genomes after: " #print dest[0:4] drv.memcpy_dtoh(fitness_left, fitness_left_h) print "FitnessLeft after: " print fitness_left#[1000000:1000500] drv.memcpy_dtoh(fitness_bottom, fitness_bottom_h) print "FitnessBottom after: " print fitness_bottom#[1000000:1000500] drv.memcpy_dtoh(grids, grids_h) print "Grids[0] after: " for i in range(0,5): print "Grid ",i,": " print grids[i]
def get_kernel(self, with_scaling, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign, \ ArrayInitializer from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaConstant, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [ Pointer(POD(float_type, "out_vector")), Pointer(POD(numpy.uint8, "gmem_matrix")), Pointer(POD(float_type, "debugbuf")), POD(numpy.uint32, "microblock_count"), ] )) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value("texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "in_vector_tex"), ]) if with_scaling: cmod.append( Value("texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "scaling_tex"), ) par = self.plan.parallelism cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", self.plan.aligned_preimage_dofs_per_microblock), Define("MB_EL_COUNT", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define("GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()), Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()), Define("MATRIX_SEGMENT_BYTES", "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()), Line(), CudaShared(ArrayOf(POD(float_type, "smem_matrix"), "MATRIX_SEGMENT_FLOATS")), CudaShared( ArrayOf( ArrayOf( ArrayOf( POD(float_type, "dof_buffer"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT"), ), CudaShared(POD(numpy.uint16, "segment_start_el")), CudaShared(POD(numpy.uint16, "segment_stop_el")), CudaShared(POD(numpy.uint16, "segment_el_count")), Line(), ArrayInitializer( CudaConstant( ArrayOf( POD(numpy.uint32, "segment_start_el_lookup"), "SEGMENTS_PER_MB")), [(chk*self.plan.segment_size)//given.dofs_per_el() for chk in range(self.plan.segments_per_microblock())] ), ArrayInitializer( CudaConstant( ArrayOf( POD(numpy.uint32, "segment_stop_el_lookup"), "SEGMENTS_PER_MB")), [min(given.microblock.elements, (chk*self.plan.segment_size+self.plan.segment_size-1) //given.dofs_per_el()+1) for chk in range(self.plan.segments_per_microblock())] ), ]) S = Statement f_body = Block() f_body.extend_log_block("calculate this dof's element", [ Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL") ]) if self.plan.use_prefetch_branch: f_body.extend_log_block("calculate segment responsibility data", [ If("THREAD_NUM==0", Block([ Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"), Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"), Assign("segment_el_count", "segment_stop_el-segment_start_el"), ]) ), S("__syncthreads()") ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code( dest="smem_matrix", base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"), bytes="MATRIX_SEGMENT_BYTES", descr="load matrix segment") +[S("__syncthreads()")] ) # --------------------------------------------------------------------- def get_batched_fetch_mat_mul_code(el_fetch_count): result = [] dofs = range(self.plan.preimage_dofs_per_el) for load_segment_start in range(0, self.plan.preimage_dofs_per_el, self.plan.segment_size): result.extend( [S("__syncthreads()")] +[Assign( "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)" % (inl, load_segment_start) ) for inl in range(par.inline)] +[S("__syncthreads()"), Line(), ]) for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]: for inl in range(par.inline): result.append( S("result%d += " "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]" "*" "dof_buffer[PAR_MB_NR][%d][%d]" % (inl, dof, inl, dof-load_segment_start)) ) result.append(Line()) return result from hedge.backends.cuda.tools import unroll def get_direct_tex_mat_mul_code(): return ( [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] + [POD(float_type, "lm"), Line()] + unroll( lambda j: [ Assign("fof%d" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB" " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},) for inl in range(par.inline) ]+[ Assign("lm", "smem_matrix[" "%(row)s*MATRIX_COLUMNS + %(j)s]" % {"j":j, "row": "SEGMENT_DOF"}, ) ]+[ S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl}) for inl in range(par.inline) ], total_number=self.plan.preimage_dofs_per_el, max_unroll=self.plan.max_unroll) + [Line()]) def get_mat_mul_code(el_fetch_count): if el_fetch_count == 1: return get_batched_fetch_mat_mul_code(el_fetch_count) else: return get_direct_tex_mat_mul_code() def mat_mul_outer_loop(fetch_count): if with_scaling: inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex," "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)") else: inv_jac_multiplier = "1" write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT" if self.with_index_check: write_condition += " && GLOBAL_MB_NR < microblock_count" return For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block([ Initializer(POD(float_type, "result%d" % inl), 0) for inl in range(par.inline) ]+[Line()] +get_mat_mul_code(fetch_count) +[ If(write_condition, Block([ Assign( "out_vector[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB" " + MB_DOF]" % inl, "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl})) ) for inl in range(par.inline) ]) ) ]) ) if self.plan.use_prefetch_branch: from cgen import make_multiple_ifs f_body.append(make_multiple_ifs([ ("segment_el_count == %d" % fetch_count, mat_mul_outer_loop(fetch_count)) for fetch_count in range(1, self.plan.max_elements_touched_by_segment()+1)] )) else: f_body.append(mat_mul_outer_loop(0)) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod)) mod = SourceModule(cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=12"] ) func = mod.get_function("apply_el_local_mat_smem_mat") if self.plan.debug_name in discr.debug: print "%s: lmem=%d smem=%d regs=%d" % ( self.plan.debug_name, func.local_size_bytes, func.shared_size_bytes, func.num_regs) in_vector_texref = mod.get_texref("in_vector_tex") texrefs = [in_vector_texref] if with_scaling: scaling_texref = mod.get_texref("scaling_tex") texrefs.append(scaling_texref) else: scaling_texref = None func.prepare( "PPPI", block=(self.plan.segment_size, self.plan.parallelism.parallel, 1), texrefs=texrefs) return func, in_vector_texref, scaling_texref
mod2 = SourceModule( kernels2.replace('TPB',str(tpb)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) ) mod3 = SourceModule( kernels3.replace('TPB',str(tpb)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) ) mod4 = SourceModule( kernels4.replace('Dx',str(Dx)).replace('Dy',str(Dy)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) ) mod5 = SourceModule( kernels5.replace('Dx',str(Dx)).replace('Dy',str(Dy)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) ) h1 = mod1.get_function("update_h") e1 = mod1.get_function("update_e") h2 = mod2.get_function("update_h") # avoid mis-aligned e2 = mod2.get_function("update_e") h3 = mod3.get_function("update_h") # avoid mis-aligned, duplicated e3 = mod3.get_function("update_e") h4 = mod4.get_function("update_h") # avoid mis-aligned, duplicated and 2d block e4 = mod4.get_function("update_e") e5 = mod5.get_function("update_e") # avoid mis-aligned, duplicated and 2d block, tex3D tcex = mod5.get_texref("tcex") tcey = mod5.get_texref("tcey") tcez = mod5.get_texref("tcez") tcex.set_array(tcex_gpu) tcey.set_array(tcey_gpu) tcez.set_array(tcez_gpu) h1.prepare("PPPPPP", block=(tpb,1,1)) e1.prepare("PPPPPPPPP", block=(tpb,1,1)) h2.prepare("PPPPPP", block=(tpb,1,1)) e2.prepare("PPPPPPPPP", block=(tpb,1,1)) h3.prepare("PPPPPP", block=(tpb,1,1)) e3.prepare("PPPPPPPPP", block=(tpb,1,1)) h4.prepare("PPPPPP", block=(Dx,Dy,1)) e4.prepare("PPPPPPPPP", block=(Dx,Dy,1)) e5.prepare("PPPPPP", block=(Dx,Dy,1), texrefs=[tcex,tcey,tcez])
arrcopy(mcopy, set_c(f, (None, -1, -1)), cex_gpu) arrcopy(mcopy, set_c(f, (-1, None, -1)), cey_gpu) arrcopy(mcopy, set_c(f, (-1, -1, None)), cez_gpu) arrcopy(mcopy, set_c(f, (None, 0, 0)), chx_gpu) arrcopy(mcopy, set_c(f, (0, None, 0)), chy_gpu) arrcopy(mcopy, set_c(f, (0, 0, None)), chz_gpu) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule(kernels) update_e = mod.get_function("update_e") update_h = mod.get_function("update_h") update_src = mod.get_function("update_src") # bind a texture reference to linear memory tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tchx = mod.get_texref("tchx") tchy = mod.get_texref("tchy") tchz = mod.get_texref("tchz") tcex.set_array(cex_gpu) tcey.set_array(cey_gpu) tcez.set_array(cez_gpu) tchx.set_array(chx_gpu) tchy.set_array(chy_gpu) tchz.set_array(chz_gpu) Db = (16, 4, 4) Dg = (nx / 16, ny * nz / (4 * 4))
def main(): #Initialise InteractionMatrix def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Initialise GPU (equivalent of autoinit) drv.init() assert drv.Device.count() >= 1 dev = drv.Device(0) ctx = dev.make_context(0) #Convert GlobalParams to List GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32) count = 0 for x in GlobalParamsDict.keys(): GlobalParams[count] = GlobalParamsDict[x] count += 1 #Convert FitnessParams to List FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32) count = 0 for x in FitnessParamsDict.keys(): FitnessParams[count] = FitnessParamsDict[x] count += 1 #Convert GAParams to List GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32) count = 0 for x in GAParamsDict.keys(): GAParams[count] = GAParamsDict[x] count += 1 # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main_discoverytime', './templates')) # Load source code from file Source = env.get_template('./kernel.cu') #Template( file(KernelFile).read() ) #Create dictionary argument for rendering RenderArgs= {"params_size":GlobalParams.nbytes,\ "fitnessparams_size":FitnessParams.nbytes,\ "gaparams_size":GAParams.nbytes,\ "genome_bytelength":int(ByteLengthGenome),\ "genome_bitlength":int(BitLengthGenome),\ "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\ "textures":range( 0, NrFitnessFunctionGrids ),\ "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\ "with_mixed_crossover":WithMixedCrossover, "with_bank_conflict":WithBankConflict, "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection, "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues, "with_uniform_crossover":WithUniformCrossover, "with_single_point_crossover":WithSinglePointCrossover, "with_surefire_mutation":WithSurefireMutation, "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory, "ga_threaddimx":int(GA_ThreadDim), "glob_nr_tiletypes":int(NrTileTypes), "glob_nr_edgetypes":int(NrEdgeTypes), "glob_nr_tileorientations":int(NrTileOrientations), "fit_dimgridx":int(DimGridX), "fit_dimgridy":int(DimGridY), "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids), "fit_nr_fourpermutations":int(NrFourPermutations), "fit_assembly_redundancy":int(NrAssemblyRedundancy), "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock), "sort_threaddimx":int(Sort_ThreadDimX), "glob_nr_genomes":int(NrGenomes), "fit_dimthreadx":int(ThreadDimX), "fit_dimthready":int(ThreadDimY), "fit_dimsubgridx":int(SubgridDimX), "fit_dimsubgridy":int(SubgridDimY), "fit_nr_subgridsperbank":int(NrSubgridsPerBank), "glob_bitlength_edgetype":int(EdgeTypeBitLength), "fitness_break_value":int(BitLengthGenome), # ADAPTED FOR DISCOVERY KERNEL "fitness_flag_index":int(NrGenomes) } # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_20", code="sm_20", cache_dir=None) #Allocate values on GPU Genomes_h = drv.mem_alloc(Genomes.nbytes) FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes) FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes) AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes) Mutexe_h = drv.mem_alloc(Mutexe.nbytes) #ReductionList_h = drv.mem_alloc(ReductionList.nbytes) #Copy values to global memory drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #Copy values to constant / texture memory for id in range(0, NrFitnessFunctionGrids): FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) ) drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C") InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address drv.memcpy_htod(GlobalParams_h[0], GlobalParams) FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address drv.memcpy_htod(FitnessParams_h[0], FitnessParams) GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address drv.memcpy_htod(GAParams_h[0], GAParams) FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst") FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst") #Set up curandStates curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper) CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes) #Compile kernels curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel") #fitness_fnc = KernelSourceModule.get_function("FitnessKernel") sorting_fnc = KernelSourceModule.get_function("SortingKernel") ga_fnc = KernelSourceModule.get_function("GAKernel") #Initialise Curand curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1)) #Build parameter lists for FitnessKernel and GAKernel FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h); SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h) GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h); #TEST ONLY #return #ADAPTED #TEST ONLY #START ADAPTED print "GENOMES NOW:\n" print Genomes print ":::STARTING KERNEL EXECUTION:::" #STOP ADAPTED #Discovery time parameters min_fitness_value = BitLengthGenome # Want all bits set mutation_rate = -2.0 #normally: -2 #Define Numpy construct to sideways join arrays (glue columns together) #Taken from: http://stackoverflow.com/questions/5355744/numpy-joining-structured-arrays #def join_struct_arrays(arrays): # sizes = np.array([a.itemsize for a in arrays]) # offsets = np.r_[0, sizes.cumsum()] # n = len(arrays[0]) # joint = np.empty((n, offsets[-1]), dtype=np.int32) # for a, size, offset in zip(arrays, sizes, offsets): # joint[:,offset:offset+size] = a.view(np.int32).reshape(n,size) # dtype = sum((a.dtype.descr for a in arrays), []) # return joint.ravel().view(dtype) #Test join_struct_arrays: #a = np.array([[1, 2], [11, 22], [111, 222]]).astype(np.int32); #b = np.array([[3, 4], [33, 44], [333, 444]]).astype(np.int32); #c = np.array([[5, 6], [55, 66], [555, 666]]).astype(np.int32); #print "Test join_struct_arrays:" #print join_struct_arrays([a, b, c]) #FAILED #Set up PYTABLES #class GAGenome(IsDescription): #gen_id = Int32Col() #fitness_val = Float32Col() #genome = StringCol(mByteLengthGenome) #last_nr_mutations = Int32Col() # Contains the Nr of mutations genome underwent during this generation #mother_id = Int32Col() # Contains the crossover "mother" #father_id = Int32Col() # Contains the crossover "father" (empty if no crossing over) #assembledgrid = StringCol(DimGridX*DimGridY) # 16-character String #class GAGenerations(IsDescription): # nr_generations = Int32Col() # nr_genomes = Int32Col() # mutation_rate = Float32Col() # Contains the Nr of mutations genome underwent during this generation #from datetime import datetime #filename = "fujiama_"+str(NrGenomes)+"_"+str(RateMutation)+"_"+".h5" #print filename #h5file = openFile(filename, mode = "w", title = "GA FILE") #group = h5file.createGroup("/", 'fujiama_ga', 'Fujiama Genetic Algorithm output') #table = h5file.createTable(group, 'GaGenerations', GAGenerations, "Raw data") #atom = Atom.from_dtype(np.float32) #Initialise File I/O FILE = open("fujiamakernel_nrgen-" + str(NrGenomes) + "_adaptation.plot", "w") #ADAPTED FOR TESTING HISTOGRAM #TestValues = [13,24,26,31,32,14] #print np.histogram(TestValues, bins=[0, 24, 32])[0][1] #quit() #Initialise CUDA timers start = drv.Event() stop = drv.Event() while mutation_rate < 1: # normally: 1 #ds = h5file.createArray(f.root, 'ga_raw_'+str(mutation_rate), atom, x.shape) mutation_rate += 0.1 GAParams[0] = 10.0 ** mutation_rate drv.memcpy_htod(GAParams_h[0], GAParams) print "Mutation rate: ", GAParams[0] #ADAPTED: Initialise global memory (absolutely necessary!!) drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #execute kernels for specified number of generations start.record() biggest_fit = 0 reprange = 100 average_breakup = np.zeros((reprange)).astype(np.float32) for rep in range(0, reprange): breakup_generation = GlobalParamsDict["NrGenerations"] dontcount = 0 #ADAPTED: Initialise global memory (absolutely necessary!!) drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #execute kernels for specified number of generations start.record() for gen in range(0, GlobalParamsDict["NrGenerations"]): #print "Processing Generation: %d"%(gen) #Launch CPU processing (should be asynchroneous calls) sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel drv.memcpy_dtoh(FitnessPartialSums, FitnessPartialSums_h) #Copy from Device to Host and finish sorting FitnessSumConst = FitnessPartialSums.sum() drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory #drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitnessValues from Device to Device Const #TEST ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids) #TEST #Note: Fitness Function is here integrated into GA kernel! drv.memcpy_dtoh(Genomes_res, Genomes_h) #Copy data from GPU drv.memcpy_dtoh(FitnessValues_res, FitnessValues_h) #drv.memcpy_dtoh(AssembledGrids_res, AssembledGrids_h) #Takes about as much time as the whole simulation! #print FitnessValues_res #maxxie = FitnessValues_res.max() #if maxxie > biggest_fit: # biggest_fit = maxxie #print "max fitness:", maxxie #if maxxie >= 25.0 and breakup_generation == -1: if np.histogram(FitnessValues_res, (0, 24, 32))[0][1] >= NrGenomes/2: breakup_generation = gen break # else: # breakup_generation = -1 #if FitnessValues[NrGenomes] == float(1): # breakup_generation = i # break #else: # breakup_generation = -1 #maxxie = FitnessValues_res.max() #if maxxie >= 30: # print "Max fitness value: ", FitnessValues_res.max() #ds[:] = FitnessValues #join_struct_arrays(Genomes, FitnessValues, AssembledGrids); #trow = table.row #trow['nr_generations'] = NrGenerations #trow['nr_genomes'] = NrGenomes #trow['mutation_rate'] = mutation_rate #trow.append() #trow.flush() stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / breakup_generation) print "Discovery time (generations) for mutation rate %f: %d"%(GAParams[0], breakup_generation) #print "Max:", biggest_fit #TEST MODE #print "Printing all FitnessValues now:" #print FitnessValues_res #raw_input("Next redundancy with keystroke!..."); #if breakup_generation==0: # print FitnessValues_res # print "Genomes: " # print Genomes_res average_breakup[rep] = breakup_generation / reprange #if breakup_generation == -1: # dontcount = 1 # break #if dontcount == 1: # average_breakup.fill(20000) FILE.write( str(GAParams[0]) + " " + str(np.median(average_breakup)) + " " + str(np.std(average_breakup)) + "\n"); FILE.flush() #Clean-up pytables #h5file.close() #Clean up File I/O FILE.close()