Example #1
    def test_multiple_2d_textures(self):
        mod = SourceModule("""
        texture<float, 2, cudaReadModeElementType> mtx_tex;
        texture<float, 2, cudaReadModeElementType> mtx2_tex;

        __global__ void copy_texture(float *dest)
        {
          int row = threadIdx.x;
          int col = threadIdx.y;
          int w = blockDim.y;
          dest[row*w+col] =
              tex2D(mtx_tex, row, col)
              +
              tex2D(mtx2_tex, row, col);
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")
        mtx2_tex = mod.get_texref("mtx2_tex")

        shape = (3, 4)
        a = np.random.randn(*shape).astype(np.float32)
        b = np.random.randn(*shape).astype(np.float32)
        drv.matrix_to_texref(a, mtx_tex, order="F")
        drv.matrix_to_texref(b, mtx2_tex, order="F")

        dest = np.zeros(shape, dtype=np.float32)
        copy_texture(drv.Out(dest),
                     block=shape + (1, ),
                     texrefs=[mtx_tex, mtx2_tex])
        assert la.norm(dest - a - b) < 1e-6
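These test snippets assume the usual PyCUDA harness (numpy as np, numpy.linalg as la, pycuda.driver as drv, pycuda.autoinit, SourceModule from pycuda.compiler) and a toolkit that still supports the legacy texture-reference API. A minimal, self-contained single-texture version of the same idea, as a sketch:

import numpy as np
import numpy.linalg as la
import pycuda.autoinit  # noqa: F401 (creates a context on the default device)
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2, cudaReadModeElementType> mtx_tex;

__global__ void copy_texture(float *dest)
{
  int row = threadIdx.x;
  int col = threadIdx.y;
  int w = blockDim.y;
  dest[row*w + col] = tex2D(mtx_tex, row, col);
}
""")

copy_texture = mod.get_function("copy_texture")
mtx_tex = mod.get_texref("mtx_tex")

shape = (3, 4)
a = np.random.randn(*shape).astype(np.float32)
drv.matrix_to_texref(a, mtx_tex, order="F")  # upload the matrix into the texture

dest = np.zeros(shape, dtype=np.float32)
copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
assert la.norm(dest - a) < 1e-6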
Example #2
    def __init__(self):
        # load kernel from file
        path = os.path.join(os.path.dirname(__file__), 'deskew.cu')
        with open(path, 'r') as fd:
            try:
                module = SourceModule(fd.read())
            except cuda.CompileError as err:
                logger.error("compile error: " + str(err))
                raise
        self._shear_kernel = module.get_function('shear_kernel')
        self._rotate_kernel = module.get_function('rotate_kernel')

        # preset texture
        shear_texture = module.get_texref('shear_tex')
        shear_texture.set_address_mode(0, cuda.address_mode.BORDER)
        shear_texture.set_address_mode(1, cuda.address_mode.BORDER)
        shear_texture.set_address_mode(2, cuda.address_mode.BORDER)
        shear_texture.set_filter_mode(cuda.filter_mode.LINEAR)
        self._shear_texture = shear_texture
        rotate_texture = module.get_texref('rotate_tex')
        rotate_texture.set_address_mode(0, cuda.address_mode.BORDER)
        rotate_texture.set_address_mode(1, cuda.address_mode.BORDER)
        rotate_texture.set_filter_mode(cuda.filter_mode.LINEAR)
        self._rotate_texture = rotate_texture

        # preset kernel launch parameters
        self._shear_kernel.prepare('PffIIffIII', texrefs=[shear_texture])
        self._rotate_kernel.prepare('PfIIffII', texrefs=[rotate_texture])

        # output staging buffer
        self._out_buf = None
Example #3
def initCUDA():
  global plotData_dArray
  global tex, transferTex
  global transferFuncArray_d
  global c_invViewMatrix
  global renderKernel
  #print "Compiling CUDA code for volumeRender"
  cudaCodeFile = open(volRenderDirectory + "/CUDAvolumeRender.cu","r")
  cudaCodeString = cudaCodeFile.read()
  cudaCodeStringComplete = cudaCodeString
  cudaCode = SourceModule(cudaCodeStringComplete, no_extern_c=True, include_dirs=[volRenderDirectory] )
  tex = cudaCode.get_texref("tex")
  transferTex = cudaCode.get_texref("transferTex")
  c_invViewMatrix = cudaCode.get_global('c_invViewMatrix')[0]
  renderKernel = cudaCode.get_function("d_render")

  if not plotData_dArray: plotData_dArray = np3DtoCudaArray( plotData_h )
  tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
  tex.set_filter_mode(cuda.filter_mode.LINEAR)
  tex.set_address_mode(0, cuda.address_mode.CLAMP)
  tex.set_address_mode(1, cuda.address_mode.CLAMP)
  tex.set_array(plotData_dArray)

  set_transfer_function( cmap_indx_0, trans_ramp_0, trans_center_0 )
  print "CUDA volumeRender initialized\n"
Example #4
    def test_multiple_2d_textures(self):
        mod = SourceModule("""
        texture<float, 2, cudaReadModeElementType> mtx_tex;
        texture<float, 2, cudaReadModeElementType> mtx2_tex;

        __global__ void copy_texture(float *dest)
        {
          int row = threadIdx.x;
          int col = threadIdx.y;
          int w = blockDim.y;
          dest[row*w+col] =
              tex2D(mtx_tex, row, col)
              +
              tex2D(mtx2_tex, row, col);
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")
        mtx2_tex = mod.get_texref("mtx2_tex")

        shape = (3,4)
        a = np.random.randn(*shape).astype(np.float32)
        b = np.random.randn(*shape).astype(np.float32)
        drv.matrix_to_texref(a, mtx_tex, order="F")
        drv.matrix_to_texref(b, mtx2_tex, order="F")

        dest = np.zeros(shape, dtype=np.float32)
        copy_texture(drv.Out(dest),
                block=shape+(1,),
                texrefs=[mtx_tex, mtx2_tex]
                )
        assert la.norm(dest-a-b) < 1e-6
Example #5
    def prepare_functions(s):
        from pycuda.compiler import SourceModule
        kernels = ''.join(open("dielectric.cu", 'r').readlines())
        mod = SourceModule(
            kernels.replace('Dx', str(s.Dx)).replace('Dy', str(s.Dy)).replace(
                'nyz', str(s.ny * s.nz)).replace('nx', str(s.nx)).replace(
                    'ny', str(s.ny)).replace('nz', str(s.nz)))
        s.updateH = mod.get_function("update_h")
        s.updateE = mod.get_function("update_e")
        s.updateE_src = mod.get_function("update_src")

        tcex = mod.get_texref("tcex")
        tcey = mod.get_texref("tcey")
        tcez = mod.get_texref("tcez")
        tcex.set_array(s.tcex_gpu)
        tcey.set_array(s.tcey_gpu)
        tcez.set_array(s.tcez_gpu)

        Bx, By = s.nz / s.Dx, s.nx * s.ny / s.Dy  # number of block
        s.MaxBy = s.MAX_BLOCK / Bx
        s.bpg_list = [(Bx, s.MaxBy) for i in range(By / s.MaxBy)]
        if By % s.MaxBy != 0: s.bpg_list.append((Bx, By % s.MaxBy))

        s.updateH.prepare("iPPPPPP", block=(s.Dx, s.Dy, 1))
        s.updateE.prepare("iPPPPPP",
                          block=(s.Dx, s.Dy, 1),
                          texrefs=[tcex, tcey, tcez])
        s.updateE_src.prepare("fP", block=(s.nz, 1, 1))
Example #6
def conv3d_tex(data, kernel=None):
    assert data.ndim == 3 and kernel.ndim == 3

    with open(__cudafile__, "r") as f:
        _mod_conv = SourceModule(f.read())
        gpu_conv3d_t = _mod_conv.get_function("conv3d_tex")
        gpu_conv3d_tex1 = _mod_conv.get_texref("texSrc")
        gpu_conv3d_tex2 = _mod_conv.get_texref("texK")

    im_shape = np.asarray(data.shape[::-1], dtype=int3)
    k_radius = np.asarray(tuple(k // 2 for k in kernel.shape[::-1]), dtype=int3)

    data_tex = to_tex3d(data)
    gpu_conv3d_tex1.set_array(data_tex)
    gpu_conv3d_tex1.set_address_mode(0, cuda.address_mode.WRAP)
    gpu_conv3d_tex1.set_address_mode(1, cuda.address_mode.WRAP)
    gpu_conv3d_tex1.set_address_mode(2, cuda.address_mode.WRAP)

    kernel_tex = to_tex3d(kernel)
    gpu_conv3d_tex2.set_array(kernel_tex)

    r_gpu = gpuarray.zeros(data.shape, np.float32)

    block, grid = grid_kernel_config(gpu_conv3d_t, data.shape, isotropic=kernel.shape)

    gpu_conv3d_t(
        r_gpu,
        im_shape,
        k_radius,
        block=block,
        grid=grid,
        texrefs=[gpu_conv3d_tex1, gpu_conv3d_tex2],
    )

    return r_gpu
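to_tex3d, int3 and grid_kernel_config above are project-specific helpers that are not shown. A plausible to_tex3d-style helper, sketched under the assumption that it only packs a float32 volume into a 3D CUDA array suitable for texref.set_array:

import numpy as np
import pycuda.driver as cuda

def to_tex3d(volume):
    # hypothetical helper: wrap a 3D numpy volume in a CUDA array that a
    # 3D texture reference can be bound to via texref.set_array(...)
    volume = np.ascontiguousarray(volume, dtype=np.float32)
    return cuda.np_to_array(volume, order="C")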
Example #7
    def cuda_interpolate3D(self, img, m, size_result):
        cols = size_result[0]
        rows = size_result[1]

        kernel_code = """
		texture<float, 2> texR;
		texture<float, 2> texG;
		texture<float, 2> texB;
	
		__global__ void interpolation(float *dest, float *m0, float *m1)
	    {
			int idx = threadIdx.x + blockDim.x * blockIdx.x;
			int idy = threadIdx.y + blockDim.y * blockIdx.y;
	
			if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) {
				dest[3*(%(NDIM)s * idx + idy)] = tex2D(texR, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
				dest[3*(%(NDIM)s * idx + idy) + 1] = tex2D(texG, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
				dest[3*(%(NDIM)s * idx + idy) + 2] = tex2D(texB, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
			}
		}
		"""

        kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows}
        mod = SourceModule(kernel_code)

        interpolation = mod.get_function("interpolation")
        texrefR = mod.get_texref("texR")
        texrefG = mod.get_texref("texG")
        texrefB = mod.get_texref("texB")

        img = img.astype("float32")
        drv.matrix_to_texref(img[:, :, 0], texrefR, order="F")
        texrefR.set_filter_mode(drv.filter_mode.LINEAR)
        drv.matrix_to_texref(img[:, :, 1], texrefG, order="F")
        texrefG.set_filter_mode(drv.filter_mode.LINEAR)
        drv.matrix_to_texref(img[:, :, 2], texrefB, order="F")
        texrefB.set_filter_mode(drv.filter_mode.LINEAR)

        bdim = (16, 16, 1)
        dx, mx = divmod(cols, bdim[0])
        dy, my = divmod(rows, bdim[1])

        gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1])

        dest = np.zeros((rows, cols, 3)).astype("float32")
        m0 = (m[0, :] - 1).astype("float32")
        m1 = (m[1, :] - 1).astype("float32")

        interpolation(drv.Out(dest),
                      drv.In(m0),
                      drv.In(m1),
                      block=bdim,
                      grid=gdim,
                      texrefs=[texrefR, texrefG, texrefB])

        return dest.astype("uint8")
Example #8
    def test_multichannel_2d_texture(self):
        mod = SourceModule("""
        #define CHANNELS 4
        texture<float4, 2, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int row = threadIdx.x;
          int col = threadIdx.y;
          int w = blockDim.y;
          float4 texval = tex2D(mtx_tex, row, col);
          dest[(row*w+col)*CHANNELS + 0] = texval.x;
          dest[(row*w+col)*CHANNELS + 1] = texval.y;
          dest[(row*w+col)*CHANNELS + 2] = texval.z;
          dest[(row*w+col)*CHANNELS + 3] = texval.w;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        shape = (5, 6)
        channels = 4
        a = np.asarray(np.random.randn(*((channels, ) + shape)),
                       dtype=np.float32,
                       order="F")
        drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"),
                                 mtx_tex)

        dest = np.zeros(shape + (channels, ), dtype=np.float32)
        copy_texture(drv.Out(dest), block=shape + (1, ), texrefs=[mtx_tex])
        reshaped_a = a.transpose(1, 2, 0)
        #print reshaped_a
        #print dest
        assert la.norm(dest - reshaped_a) == 0
Example #9
def get_cuda_csr(block_size=128, warp_size=32):
    '''
    Reads the CSR CUDA kernel source from file, substitutes the given
    arguments, compiles it, and returns the ready kernel and texture
    reference. The parameters must match those of the converted matrix
    that is being multiplied.

    Parameters
    ==========
    block_size : int (recommended 128 or 256)
        Size of a thread block.
    warp_size : int > 0 (recommended 32)
        Warp size; depends on the specifications of the graphics card.

    Returns
    =======
    Tuple of the compiled kernel and texture reference
    '''
    kernel_info = {'file_' : 'csr_kernel.c',
                   'kernel' : 'SpMV_Csr',
                   'texref' : 'mainVecTexRef'}
    with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_:
        tpl = file_.read()

    tpl = convert_string(tpl, BLOCK_SIZE=block_size, WARP_SIZE=warp_size)

    mod = SourceModule(tpl)
    kernel = mod.get_function(kernel_info['kernel'])
    texref = mod.get_texref(kernel_info['texref'])
    return (kernel, texref)
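A hedged usage sketch: the returned texture reference is presumably bound to the dense input vector before launching the SpMV kernel; the remaining launch arguments depend on the converted CSR matrix and are not shown here.

import numpy as np
import pycuda.autoinit  # noqa: F401 (creates the CUDA context)
import pycuda.gpuarray as gpuarray

kernel, texref = get_cuda_csr(block_size=128, warp_size=32)

# bind the dense vector to the kernel's texture reference
vec_gpu = gpuarray.to_gpu(np.random.rand(1000).astype(np.float32))
vec_gpu.bind_to_texref_ext(texref)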
Example #10
    def test_multichannel_linear_texture(self):
        mod = SourceModule("""
        #define CHANNELS 4
        texture<float4, 1, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int i = threadIdx.x+blockDim.x*threadIdx.y;
          float4 texval = tex1Dfetch(mtx_tex, i);
          dest[i*CHANNELS + 0] = texval.x;
          dest[i*CHANNELS + 1] = texval.y;
          dest[i*CHANNELS + 2] = texval.z;
          dest[i*CHANNELS + 3] = texval.w;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        shape = (16, 16)
        channels = 4
        a = np.random.randn(*(shape + (channels, ))).astype(np.float32)
        a_gpu = drv.to_device(a)
        mtx_tex.set_address(a_gpu, a.nbytes)
        mtx_tex.set_format(drv.array_format.FLOAT, 4)

        dest = np.zeros(shape + (channels, ), dtype=np.float32)
        copy_texture(drv.Out(dest), block=shape + (1, ), texrefs=[mtx_tex])
        #print a
        #print dest
        assert la.norm(dest - a) == 0
Example #11
def get_cuda_sliced(sh_cache_size, threads_per_row=2):
    '''
    Reads the SLICED CUDA kernel source from file, substitutes the given
    arguments, compiles it, and returns the ready kernel and texture
    reference. The parameters must match those of the converted matrix
    that is being multiplied.

    Parameters
    ==========
    sh_cache_size : int
        Size of the shared-memory cache array. For the Sliced format this
        must equal threads_per_row * slice_size; other values make the
        kernel execute incorrectly.
    threads_per_row : int > 0 (recommended 2, 4 or 8)
        Threads per row.

    Returns
    =======
    Tuple of the compiled kernel and texture reference
    '''
    kernel_info = {'file_' : 'sliced_kernel.c',
                   'kernel' : 'SpMV_Sliced',
                   'texref' : 'mainVecTexRef'}
    with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_:
        tpl = file_.read()

    tpl = convert_string(tpl, sh_cache_size=sh_cache_size,
                         threadPerRow=threads_per_row)

    mod = SourceModule(tpl)
    kernel = mod.get_function(kernel_info['kernel'])
    texref = mod.get_texref(kernel_info['texref'])
    return (kernel, texref)
Example #12
    def test_multichannel_2d_texture(self):
        mod = SourceModule(
            """
        #define CHANNELS 4
        texture<float4, 2, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int row = threadIdx.x;
          int col = threadIdx.y;
          int w = blockDim.y;
          float4 texval = tex2D(mtx_tex, row, col);
          dest[(row*w+col)*CHANNELS + 0] = texval.x;
          dest[(row*w+col)*CHANNELS + 1] = texval.y;
          dest[(row*w+col)*CHANNELS + 2] = texval.z;
          dest[(row*w+col)*CHANNELS + 3] = texval.w;
        }
        """
        )

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        shape = (5, 6)
        channels = 4
        a = np.asarray(np.random.randn(*((channels,) + shape)), dtype=np.float32, order="F")
        drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex)

        dest = np.zeros(shape + (channels,), dtype=np.float32)
        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
        reshaped_a = a.transpose(1, 2, 0)
        # print reshaped_a
        # print dest
        assert la.norm(dest - reshaped_a) == 0
Example #13
    def test_multichannel_linear_texture(self):
        mod = SourceModule("""
        #define CHANNELS 4
        texture<float4, 1, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int i = threadIdx.x+blockDim.x*threadIdx.y;
          float4 texval = tex1Dfetch(mtx_tex, i);
          dest[i*CHANNELS + 0] = texval.x;
          dest[i*CHANNELS + 1] = texval.y;
          dest[i*CHANNELS + 2] = texval.z;
          dest[i*CHANNELS + 3] = texval.w;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        shape = (16, 16)
        channels = 4
        a = np.random.randn(*(shape+(channels,))).astype(np.float32)
        a_gpu = drv.to_device(a)
        mtx_tex.set_address(a_gpu, a.nbytes)
        mtx_tex.set_format(drv.array_format.FLOAT, 4)

        dest = np.zeros(shape+(channels,), dtype=np.float32)
        copy_texture(drv.Out(dest),
                block=shape+(1,),
                texrefs=[mtx_tex]
                )
        #print a
        #print dest
        assert la.norm(dest-a) == 0
Example #14
    def test_3d_fp_textures(self):
        orden = "C"
        npoints = 32

        for prec in [
                np.int16, np.float32, np.float64, np.complex64, np.complex128
        ]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64:
                fpName_str = "fp_tex_cfloat"
            elif prec == np.complex128:
                fpName_str = "fp_tex_cdouble"
            elif prec == np.float64:
                fpName_str = "fp_tex_double"
            else:
                fpName_str = prec_str
            A_cpu = np.zeros([npoints, npoints, npoints],
                             order=orden,
                             dtype=prec)
            A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

            myKern = """
            #include <pycuda-helpers.hpp>
            texture<fpName, 3, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int slice = blockIdx.z*blockDim.z + threadIdx.z;
              dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
            }
            """
            myKern = myKern.replace("fpName", fpName_str)
            myKern = myKern.replace("cuPres", prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (8, 8, 8)
            if cuBlock[0] > npoints:
                cuBlock = (npoints, npoints, npoints)
            cuGrid = (
                npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
                npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
                npoints // cuBlock[2] + 1 * (npoints % cuBlock[2] != 0),
            )
            copy_texture.prepare("P", texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get() -
                                 np.transpose(A_cpu))) == np.array(0,
                                                                   dtype=prec)
            A_gpu.gpudata.free()
Example #15
class BornThread(threading.Thread):
    def __init__(self, gpu, work_queue, result, 
                 density, x, y, z, Qx, Qy, Qz):
        threading.Thread.__init__(self)
        self.born_args = density, x, y, z, Qx, Qy, Qz, result
        self.work_queue = work_queue
        self.gpu = gpu
        self.precision = Qx.dtype
        
    def run(self):
        self.dev = cuda.Device(self.gpu)
        self.ctx = self.dev.make_context()
        if self.precision == numpy.float32:
            self.cudamod = SourceModule(BORN_KERNEL32)
        else:
            self.cudamod = SourceModule(BORN_KERNEL64)
        self.cudaBorn = self.cudamod.get_function("cudaBorn")
        self.kernel()
        self.ctx.pop()
        del self.ctx
        del self.dev


    def kernel(self):
        density, x, y, z, Qx, Qy, Qz, result = self.born_args
        nx, ny, nz, nqx, nqy, nqz = [numpy.int32(len(v))
                                     for v in (x, y, z, Qx, Qy, Qz)]
        texture_data = []
        for v,t in (x,"tx"),(y,"ty"),(z,"tz"),(Qx,"tQx"),(Qy,"tQy"),(Qz,"tQz"),(density,"tdensity"):
            cv = gpuarray.to_gpu(v)
            tv = self.cudamod.get_texref(t)
            cv.bind_to_texref_ext(tv)
            texture_data.append(cv)

        cframe = cuda.mem_alloc(result[0].nbytes)
        n = int(1*nqy*nqz)
        
        while True:
            try:
                qxi = numpy.int32(self.work_queue.get(block=False))
            except Queue.Empty:
                break
            
            #print "%d of %d on %d\n"%(qxi,nqx,self.gpu),
            
            self.cudaBorn(nx,ny,nz,nqx,nqy,nqz,qxi,cframe,
                     **cuda_partition(n))
            ## Delay fetching result until the kernel is complete
            cuda_sync()

            ## Fetch result back to the CPU
            cuda.memcpy_dtoh(result[qxi], cframe)

            #print "%d %s\n"%(qxi,ctemp.get())
        del cframe
        for cv in texture_data: del cv
Example #16
    def test_3d_texture(self):
        # adapted from code by Nicolas Pinto
        w = 2
        h = 4
        d = 8
        shape = (w, h, d)

        a = np.asarray(
                np.random.randn(*shape),
                dtype=np.float32, order="F")

        descr = drv.ArrayDescriptor3D()
        descr.width = w
        descr.height = h
        descr.depth = d
        descr.format = drv.dtype_to_array_format(a.dtype)
        descr.num_channels = 1
        descr.flags = 0

        ary = drv.Array(descr)

        copy = drv.Memcpy3D()
        copy.set_src_host(a)
        copy.set_dst_array(ary)
        copy.width_in_bytes = copy.src_pitch = a.strides[1]
        copy.src_height = copy.height = h
        copy.depth = d

        copy()

        mod = SourceModule("""
        texture<float, 3, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int x = threadIdx.x;
          int y = threadIdx.y;
          int z = threadIdx.z;
          int dx = blockDim.x;
          int dy = blockDim.y;
          int i = (z*dy + y)*dx + x;
          dest[i] = tex3D(mtx_tex, x, y, z);
          //dest[i] = x;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        mtx_tex.set_array(ary)

        dest = np.zeros(shape, dtype=np.float32, order="F")
        copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex])
        assert la.norm(dest-a) == 0
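The explicit ArrayDescriptor3D/Memcpy3D block above is the long way to fill a 3D CUDA array; drv.np_to_array wraps the same steps, so an equivalent shortcut (assuming the same Fortran-ordered input) would be:

        ary = drv.np_to_array(a, order="F")
        mtx_tex.set_array(ary)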
Example #17
    def test_3d_texture(self):
        # adapted from code by Nicolas Pinto
        w = 2
        h = 4
        d = 8
        shape = (w, h, d)

        a = numpy.asarray(numpy.random.randn(*shape),
                          dtype=numpy.float32,
                          order="F")

        descr = drv.ArrayDescriptor3D()
        descr.width = w
        descr.height = h
        descr.depth = d
        descr.format = drv.dtype_to_array_format(a.dtype)
        descr.num_channels = 1
        descr.flags = 0

        ary = drv.Array(descr)

        copy = drv.Memcpy3D()
        copy.set_src_host(a)
        copy.set_dst_array(ary)
        copy.width_in_bytes = copy.src_pitch = a.strides[1]
        copy.src_height = copy.height = h
        copy.depth = d

        copy()

        mod = SourceModule("""
        texture<float, 3, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int x = threadIdx.x;
          int y = threadIdx.y;
          int z = threadIdx.z;
          int dx = blockDim.x;
          int dy = blockDim.y;
          int i = (z*dy + y)*dx + x;
          dest[i] = tex3D(mtx_tex, x, y, z);
          //dest[i] = x;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        mtx_tex.set_array(ary)

        dest = numpy.zeros(shape, dtype=numpy.float32, order="F")
        copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex])
        assert la.norm(dest - a) == 0
Example #18
def get_dckernel(slen):
    # Right now, hardcoding the number of threads per block
    nt = 1024
    nb = int(numpy.ceil(slen / 1024.0))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return dckernel_cache[nb]
    except KeyError:
        mod = SourceModule(kernel_sources.render(ntpb=nt, nblocks=nb))
        freq_tex = mod.get_texref("freq_tex")
        amp_tex = mod.get_texref("amp_tex")
        phase_tex = mod.get_texref("phase_tex")
        fn1 = mod.get_function("find_block_indices")
        fn1.prepare("PPifff", texrefs=[freq_tex])
        fn2 = mod.get_function("linear_interp")
        fn2.prepare("PfiffiPP", texrefs=[freq_tex, amp_tex, phase_tex])
        dckernel_cache[nb] = (fn1, fn2, freq_tex, amp_tex, phase_tex, nt, nb)
        return dckernel_cache[nb]
Example #19
def get_dckernel(slen):
    # Right now, hardcoding the number of threads per block
    nt = 1024
    nb = int(numpy.ceil(slen / 1024.0))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return dckernel_cache[nb]
    except KeyError:
        mod = SourceModule(kernel_sources.render(ntpb=nt, nblocks=nb))
        freq_tex = mod.get_texref("freq_tex")
        amp_tex = mod.get_texref("amp_tex")
        phase_tex = mod.get_texref("phase_tex")
        fn1 = mod.get_function("find_block_indices")
        fn1.prepare("PPifff", texrefs=[freq_tex])
        fn2 = mod.get_function("linear_interp")
        fn2.prepare("PfiffiPP", texrefs=[freq_tex, amp_tex, phase_tex])
        dckernel_cache[nb] = (fn1, fn2, freq_tex, amp_tex, phase_tex, nt, nb)
        return dckernel_cache[nb]
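A hedged sketch of how the returned texture references would typically be fed before the prepared functions are launched; slen, freqs, amps and phases are placeholder names, not taken from the original module.

import numpy
import pycuda.gpuarray as gpuarray

fn1, fn2, freq_tex, amp_tex, phase_tex, nt, nb = get_dckernel(slen)

# bind the sample-point arrays to the kernel's texture references
freq_gpu = gpuarray.to_gpu(freqs.astype(numpy.float32))
amp_gpu = gpuarray.to_gpu(amps.astype(numpy.float32))
phase_gpu = gpuarray.to_gpu(phases.astype(numpy.float32))
freq_gpu.bind_to_texref_ext(freq_tex)
amp_gpu.bind_to_texref_ext(amp_tex)
phase_gpu.bind_to_texref_ext(phase_tex)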
Example #20
	def prepare_functions(s):
		from pycuda.compiler import SourceModule
		kernels = ''.join( open("dielectric.cu",'r').readlines() )
		mod = SourceModule( kernels.replace('Dx',str(s.Dx)).replace('Dy',str(s.Dy)).replace('nyz',str(s.ny*s.nz)).replace('nx',str(s.nx)).replace('ny',str(s.ny)).replace('nz',str(s.nz)) )
		s.updateH = mod.get_function("update_h")
		s.updateE = mod.get_function("update_e")
		s.updateE_src = mod.get_function("update_src")

		tcex = mod.get_texref("tcex")
		tcey = mod.get_texref("tcey")
		tcez = mod.get_texref("tcez")
		tcex.set_array(s.tcex_gpu)
		tcey.set_array(s.tcey_gpu)
		tcez.set_array(s.tcez_gpu)

		Bx, By = s.nz/s.Dx, s.nx*s.ny/s.Dy	# number of block
		s.MaxBy = s.MAX_BLOCK/Bx
		s.bpg_list = [(Bx,s.MaxBy) for i in range(By/s.MaxBy)]
		if By%s.MaxBy != 0: s.bpg_list.append( (Bx,By%s.MaxBy) )

		s.updateH.prepare("iPPPPPP", block=(s.Dx,s.Dy,1))
		s.updateE.prepare("iPPPPPP", block=(s.Dx,s.Dy,1), texrefs=[tcex,tcey,tcez])
		s.updateE_src.prepare("fP", block=(s.nz,1,1))
Example #21
    def test_3d_fp_textures(self):
        orden = "C"
        npoints = 32

        for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64:
                fpName_str = "fp_tex_cfloat"
            elif prec == np.complex128:
                fpName_str = "fp_tex_cdouble"
            elif prec == np.float64:
                fpName_str = "fp_tex_double"
            else:
                fpName_str = prec_str
            A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec)
            A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

            myKern = """
            #include <pycuda-helpers.hpp>
            texture<fpName, 3, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int slice = blockIdx.z*blockDim.z + threadIdx.z;
              dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
            }
            """
            myKern = myKern.replace("fpName", fpName_str)
            myKern = myKern.replace("cuPres", prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (8, 8, 8)
            if cuBlock[0] > npoints:
                cuBlock = (npoints, npoints, npoints)
            cuGrid = (
                npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
                npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
                npoints // cuBlock[2] + 1 * (npoints % cuBlock[2] != 0),
            )
            copy_texture.prepare("P", texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec)
            A_gpu.gpudata.free()
Example #22
def mk_tex_kernel(params, func_name, tex_name, kernel_file, prepare_args=None):
    kernel_file = kernels_dir + kernel_file
    key = (params, kernel_file, prepare_args)
    if key in _kernel_cache:
        return _kernel_cache[key]

    with open(kernel_file) as code_file:
        code = code_file.read()
        src = code % params
        mod = SourceModule(src, include_dirs=[kernels_dir])
        fn = mod.get_function(func_name)
        tex = mod.get_texref(tex_name)
        if prepare_args is not None: fn.prepare(prepare_args)
        _kernel_cache[key] = (fn, tex)
        return fn, tex
Example #23
    def test_2d_fp_texturesLayered(self):
        orden = "F"
        npoints = 32

        for prec in [
                np.int16, np.float32, np.float64, np.complex64, np.complex128
        ]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec)
            A_cpu[:] = np.random.rand(npoints, npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

            myKern = '''
            #include <pycuda-helpers.hpp>
            texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row = blockIdx.x*blockDim.x + threadIdx.x;
              int col = blockIdx.y*blockDim.y + threadIdx.y;

              dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1);
            }
            '''
            myKern = myKern.replace('fpName', fpName_str)
            myKern = myKern.replace('cuPres', prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (16, 16, 1)
            if cuBlock[0] > npoints:
                cuBlock = (npoints, npoints, 1)
            cuGrid = (npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
                      npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
                      1)
            copy_texture.prepare('P', texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=True)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get() -
                                 np.transpose(A_cpu))) == np.array(0,
                                                                   dtype=prec)
            A_gpu.gpudata.free()
Example #24
    def __init__(self,
                 pyr_scale=0.9,
                 levels=15,
                 winsize=9,
                 num_iterations=5,
                 poly_n=5,
                 poly_sigma=1.2,
                 use_gaussian_kernel: bool = True,
                 use_initial_flow=None,
                 quit_at_level=None,
                 use_gpu=True,
                 upscale_on_termination=True,
                 fast_gpu_scaling=True,
                 vesselmask_gpu=None):
        self.pyr_scale = pyr_scale
        self.levels = levels
        self.winsize = winsize
        self.num_iterations = num_iterations
        self.poly_n = poly_n
        self.poly_sigma = poly_sigma
        self.use_gaussian_kernel = use_gaussian_kernel
        self.use_initial_flow = use_initial_flow
        self.use_gpu = use_gpu
        self.upscale_on_termination = upscale_on_termination
        self._fast_gpu_scaling = fast_gpu_scaling
        self.quit_at_level = quit_at_level
        self._dump_everything = False
        self._show_everything = False
        self._vesselmask_gpu = vesselmask_gpu
        self._resize_kernel_size_factor = 4
        self._max_resize_kernel_size = 9

        with open(
                os.path.join(os.path.dirname(__file__),
                             'farneback_kernels.cu')) as f:
            read_data = f.read()
        f.closed

        mod = SourceModule(read_data)
        self._update_matrices_kernel = mod.get_function(
            'FarnebackUpdateMatrices')
        self._invG_gpu = mod.get_global('invG')[0]
        self._weights_gpu = mod.get_global('weights')[0]
        self._poly_expansion_kernel = mod.get_function('calcPolyCoeficients')
        self._warp_kernel = mod.get_function('warpByFlowField')
        self._r1_texture = mod.get_texref('sourceTex')
        self._solve_equations_kernel = mod.get_function('solveEquationsCramer')
Example #25
    def get_flat_kernel(self):
        from pycuda.tools import dtype_to_ctype

        mod = SourceModule(
                COO_FLAT_KERNEL_TEMPLATE % {
                    "value_type": dtype_to_ctype(self.dtype),
                    "tex_value_type": dtype_to_ctype(
                        self.dtype, with_fp_tex_hack=True),
                    "index_type": dtype_to_ctype(self.index_dtype),
                    "block_size": self.block_size,
                    "warp_size": drv.Context.get_device().warp_size,
                    })
        func = mod.get_function("spmv_coo_flat_kernel")
        x_texref = mod.get_texref("tex_x")
        func.prepare(self.index_dtype.char*2 + "PPPP",
            (self.block_size, 1, 1), texrefs=[x_texref])
        return func, x_texref
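tex_x is declared through the with_fp_tex_hack=True type, so a float64 vector has to be bound with the matching gpuarray-side hack; a short sketch, where coo_mat and x_gpu are placeholder names:

func, x_texref = coo_mat.get_flat_kernel()
# allow_double_hack pairs with with_fp_tex_hack=True in the kernel template
x_gpu.bind_to_texref_ext(x_texref, allow_double_hack=True)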
Example #26
    def cuda_interpolate(self, channel, m, size_result):
        cols = size_result[0]
        rows = size_result[1]

        kernel_code = """
		texture<float, 2> tex;
	
		__global__ void interpolation(float *dest, float *m0, float *m1)
		{
			int idx = threadIdx.x + blockDim.x * blockIdx.x;
			int idy = threadIdx.y + blockDim.y * blockIdx.y;
	
			if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) {
				dest[%(NDIM)s * idx + idy] = tex2D(tex, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
			}
		}
		"""

        kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows}
        mod = SourceModule(kernel_code)

        interpolation = mod.get_function("interpolation")
        texref = mod.get_texref("tex")

        channel = channel.astype("float32")
        drv.matrix_to_texref(channel, texref, order="F")
        texref.set_filter_mode(drv.filter_mode.LINEAR)

        bdim = (16, 16, 1)
        dx, mx = divmod(cols, bdim[0])
        dy, my = divmod(rows, bdim[1])

        gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1])

        dest = np.zeros((rows, cols)).astype("float32")
        m0 = (m[0, :] - 1).astype("float32")
        m1 = (m[1, :] - 1).astype("float32")

        interpolation(drv.Out(dest),
                      drv.In(m0),
                      drv.In(m1),
                      block=bdim,
                      grid=gdim,
                      texrefs=[texref])

        return dest.astype("uint8")
Example #27
def resize_gpu(y_gpu, out_shape):

  in_shape = np.array(y_gpu.shape).astype(np.uint32)
  dtype = y_gpu.dtype
  if dtype != np.float32:
    raise NotImplementedError('Only float32 is supported at the moment')
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(out_shape[1])/block_size[0])),
               int(np.ceil(float(out_shape[0])/block_size[1])))

  preproc = _generate_preproc(dtype)
  mod = SourceModule(preproc + resize_code, keep=True)

  resize_fun_gpu = mod.get_function("resize")
  resized_gpu = cua.empty((int(out_shape[0]),
                           int(out_shape[1])), y_gpu.dtype)

  temp_gpu, pitch = cu.mem_alloc_pitch(4 * y_gpu.shape[1],
                                       y_gpu.shape[0],
                                       4)
  copy_object = cu.Memcpy2D()
  copy_object.set_src_device(y_gpu.gpudata)
  copy_object.set_dst_device(temp_gpu)
  copy_object.src_pitch = 4 * y_gpu.shape[1]
  copy_object.dst_pitch = pitch
  copy_object.width_in_bytes = 4 * y_gpu.shape[1]
  copy_object.height = y_gpu.shape[0]
  copy_object(aligned=False)
  in_tex = mod.get_texref('in_tex')
  descr = cu.ArrayDescriptor()
  descr.width = y_gpu.shape[1]
  descr.height = y_gpu.shape[0]
  descr.format = cu.array_format.FLOAT
  descr.num_channels = 1
  #pitch = y_gpu.nbytes / y_gpu.shape[0]
  in_tex.set_address_2d(temp_gpu, descr, pitch)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)
    
  resize_fun_gpu(resized_gpu.gpudata,
                 np.uint32(out_shape[0]), np.uint32(out_shape[1]),
                 block=block_size, grid=grid_size)
  temp_gpu.free()

  return resized_gpu
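A usage sketch for resize_gpu, assuming resize_code and _generate_preproc are available in the same module and that the input is a 2D float32 gpuarray:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as cua

img = np.random.rand(480, 640).astype(np.float32)
y_gpu = cua.to_gpu(img)
resized = resize_gpu(y_gpu, (240, 320))   # returns a gpuarray of shape (240, 320)
print(resized.get().shape)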
Example #28
def resize_gpu(y_gpu, out_shape):

  in_shape = np.array(y_gpu.shape).astype(np.uint32)
  dtype = y_gpu.dtype
  if dtype != np.float32:
    raise NotImplementedError('Only float32 is supported at the moment')
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(out_shape[1])/block_size[0])),
               int(np.ceil(float(out_shape[0])/block_size[1])))

  preproc = _generate_preproc(dtype)
  mod = SourceModule(preproc + resize_code, keep=True)

  resize_fun_gpu = mod.get_function("resize")
  resized_gpu = cua.empty((int(out_shape[0]),
                           int(out_shape[1])), y_gpu.dtype)

  temp_gpu, pitch = cu.mem_alloc_pitch(4 * y_gpu.shape[1],
                                       y_gpu.shape[0],
                                       4)
  copy_object = cu.Memcpy2D()
  copy_object.set_src_device(y_gpu.gpudata)
  copy_object.set_dst_device(temp_gpu)
  copy_object.src_pitch = 4 * y_gpu.shape[1]
  copy_object.dst_pitch = pitch
  copy_object.width_in_bytes = 4 * y_gpu.shape[1]
  copy_object.height = y_gpu.shape[0]
  copy_object(aligned=False)
  in_tex = mod.get_texref('in_tex')
  descr = cu.ArrayDescriptor()
  descr.width = y_gpu.shape[1]
  descr.height = y_gpu.shape[0]
  descr.format = cu.array_format.FLOAT
  descr.num_channels = 1
  #pitch = y_gpu.nbytes / y_gpu.shape[0]
  in_tex.set_address_2d(temp_gpu, descr, pitch)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)
    
  resize_fun_gpu(resized_gpu.gpudata,
                 np.uint32(out_shape[0]), np.uint32(out_shape[1]),
                 block=block_size, grid=grid_size)
  temp_gpu.free()

  return resized_gpu
Example #29
def get_cuda_ellpack():
    '''
    Reads the ELLPACK CUDA kernel source from file, compiles it, and
    returns the ready kernel and texture reference.

    Returns
    =======
    Tuple of the compiled kernel and texture reference
    '''
    kernel_info = {'file_' : 'ellpack_kernel.c',
                   'kernel' : 'SpMV_Ellpack',
                   'texref' : 'mainVecTexRef'}
    with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_:
        tpl = file_.read()
    mod = SourceModule(tpl)
    kernel = mod.get_function(kernel_info['kernel'])
    texref = mod.get_texref(kernel_info['texref'])
    return (kernel, texref)
Example #30
    def get_flat_kernel(self):
        from pycuda.tools import dtype_to_ctype

        mod = SourceModule(
            COO_FLAT_KERNEL_TEMPLATE % {
                "value_type": dtype_to_ctype(self.dtype),
                "tex_value_type": dtype_to_ctype(self.dtype,
                                                 with_fp_tex_hack=True),
                "index_type": dtype_to_ctype(self.index_dtype),
                "block_size": self.block_size,
                "warp_size": drv.Context.get_device().warp_size,
            })
        func = mod.get_function("spmv_coo_flat_kernel")
        x_texref = mod.get_texref("tex_x")
        func.prepare(self.index_dtype.char * 2 + "PPPP",
                     (self.block_size, 1, 1),
                     texrefs=[x_texref])
        return func, x_texref
Example #31
def mk_tex_kernel(params, 
                  func_name, 
                  tex_name, 
                  kernel_file, 
                  prepare_args = None):
  kernel_file = kernels_dir + kernel_file
  key = (params, kernel_file, prepare_args)
  if key in _kernel_cache:
    return _kernel_cache[key]

  with open(kernel_file) as code_file:
    code = code_file.read()
    src = code % params
    mod = SourceModule(src, include_dirs = [kernels_dir])
    fn = mod.get_function(func_name)
    tex = mod.get_texref(tex_name)
    if prepare_args is not None: fn.prepare(prepare_args)
    _kernel_cache[key] = (fn, tex)
    return fn, tex
Example #32
    def test_2d_fp_texturesLayered(self):
        orden = "F"
        npoints = 32

        for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec)
            A_cpu[:] = np.random.rand(npoints,npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden)

            myKern = '''
            #include <pycuda-helpers.hpp>
            texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row = blockIdx.x*blockDim.x + threadIdx.x;
              int col = blockIdx.y*blockDim.y + threadIdx.y;

              dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1);
            }
            '''
            myKern = myKern.replace('fpName',fpName_str)
            myKern = myKern.replace('cuPres',prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (16,16,1)
            if cuBlock[0]>npoints:
                cuBlock = (npoints,npoints,1)
            cuGrid   = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1)
            copy_texture.prepare('P',texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=True)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec)
            A_gpu.gpudata.free()
Example #33
    def _load_kernel(self):
        path = os.path.join(os.path.dirname(__file__), "deskew.cu")
        with open(path, 'r') as fd:
            tpl = Template(fd.read())
            source = tpl.render(dst_type=dtype_to_ctype(self._dtype))
            module = SourceModule(source)

        self._kernel = module.get_function("deskew_kernel")

        self._d_px_shift, _ = module.get_global('px_shift')
        self._d_vsin, _ = module.get_global('vsin')
        self._d_vcos, _ = module.get_global('vcos')

        self._texture = module.get_texref("ref_vol")
        self._texture.set_address_mode(0, cuda.address_mode.BORDER)
        self._texture.set_address_mode(1, cuda.address_mode.BORDER)
        self._texture.set_address_mode(2, cuda.address_mode.BORDER)
        self._texture.set_filter_mode(cuda.filter_mode.LINEAR)

        self._kernel.prepare('Piiiii',texrefs=[self._texture])
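A rough usage sketch for the prepared kernel above; the constant names come from the snippet, but their scalar types, the kernel's integer arguments, and the grid/block/volume variables are assumptions.

import numpy as np
import pycuda.driver as cuda

# upload the per-volume constants obtained via module.get_global(...)
cuda.memcpy_htod(self._d_px_shift, np.array([px_shift], dtype=np.float32))
cuda.memcpy_htod(self._d_vsin, np.array([np.sin(angle)], dtype=np.float32))
cuda.memcpy_htod(self._d_vcos, np.array([np.cos(angle)], dtype=np.float32))

# bind the source volume and launch the kernel prepared with 'Piiiii'
self._texture.set_array(cuda.np_to_array(volume, "C"))
self._kernel.prepared_call(grid, block, out_gpu.gpudata,
                           nw, nv, nu, iw, iv)   # integer argument meaning assumed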
Example #34
def get_cuda_sertilp(sh_dot_size=None, threads_per_row=2,
                     slice_size=32, prefetch=2):
    '''
    Reads the SERTILP CUDA kernel source from file, substitutes the given
    arguments, compiles it, and returns the ready kernel and texture
    reference. The parameters must match those of the converted matrix
    that is being multiplied.

    Parameters
    ==========
    sh_dot_size : int
        Size of the shared-memory cache array. For the Sertilp format this
        must equal threads_per_row * slice_size; other values make the
        kernel execute incorrectly. If None, it is calculated automatically.
    threads_per_row : int > 0 (recommended 2, 4 or 8)
        Threads per row.
    slice_size : int (recommended a multiple of 2)
        Size of a single slice.
    prefetch : int (recommended 2, 4 or 8)
        Number of data accesses requested in advance.

    Returns
    =======
    Tuple of the compiled kernel and texture reference
    '''
    kernel_info = {'file_' : 'sertilp_kernel.c',
                   'kernel' : 'SpMV_Sertilp',
                   'texref' : 'mainVecTexRef'}
    with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_:
        tpl = file_.read()

    if sh_dot_size is None:
        sh_dot_size = threads_per_row * slice_size
    tpl = convert_string(tpl, shDot_size=sh_dot_size,
                         threadPerRow=threads_per_row,
                         sliceSize=slice_size, prefetch=prefetch)

    mod = SourceModule(tpl)
    kernel = mod.get_function(kernel_info['kernel'])
    texref = mod.get_texref(kernel_info['texref'])
    return (kernel, texref)
Example #35
    def test_fp_textures(self):
        if drv.Context.get_device().compute_capability() < (1, 3):
            return

        for tp in [np.float32, np.float64]:
            tp_cstr = dtype_to_ctype(tp)
            mod = SourceModule(
                """
            #include <pycuda-helpers.hpp>

            texture<fp_tex_%(tp)s, 1, cudaReadModeElementType> my_tex;

            __global__ void copy_texture(%(tp)s *dest)
            {
              int i = threadIdx.x;
              dest[i] = fp_tex1Dfetch(my_tex, i);
            }
            """
                % {"tp": tp_cstr}
            )

            copy_texture = mod.get_function("copy_texture")
            my_tex = mod.get_texref("my_tex")

            shape = (384,)
            a = np.random.randn(*shape).astype(tp)
            a_gpu = gpuarray.to_gpu(a)
            a_gpu.bind_to_texref_ext(my_tex, allow_double_hack=True)

            dest = np.zeros(shape, dtype=tp)
            copy_texture(
                drv.Out(dest),
                block=shape
                + (
                    1,
                    1,
                ),
                texrefs=[my_tex],
            )

            assert la.norm(dest - a) == 0
Example #36
def get_cuda_ertilp(block_size, threads_per_row, prefetch):
    '''
    Reads the ERTILP CUDA kernel source from file, substitutes the given
    arguments, compiles it, and returns the ready kernel and texture
    reference. The parameters must match those of the converted matrix
    that is being multiplied.

    Parameters
    ==========
    block_size : int (recommended 128 or 256)
        Size of a thread block.
    threads_per_row : int > 0 (recommended 2, 4 or 8)
        Threads per row.
    prefetch : int (recommended 2, 4 or 8)
        Number of data accesses requested in advance.

    Returns
    =======
    Tuple of the compiled kernel and texture reference
    '''
    kernel_info = {'file_' : 'ertilp_kernel.c',
                   'kernel' : 'SpMV_Ertilp',
                   'texref' : 'mainVecTexRef'}
    with open(path_join(KERNELS_PATH, kernel_info['file_'])) as file_:
        tpl = file_.read()

    prefetch_init_tab = '{' + \
                        ', '.join('0' for i in range(prefetch)) + \
                        '}'
    tpl = convert_string(tpl, BLOCK_SIZE=block_size,
                         THREADS_ROW=threads_per_row,
                         PREFETCH_SIZE=prefetch,
                         PREFETCH_INIT_TAB=prefetch_init_tab)

    mod = SourceModule(tpl)
    kernel = mod.get_function(kernel_info['kernel'])
    texref = mod.get_texref(kernel_info['texref'])
    return (kernel, texref)
Example #37
    def test_fp_textures(self):
        if drv.Context.get_device().compute_capability() < (1, 3):
            return

        for tp in [np.float32, np.float64]:
            from pycuda.tools import dtype_to_ctype

            tp_cstr = dtype_to_ctype(tp)
            mod = SourceModule(
                """
            #include <pycuda-helpers.hpp>

            texture<fp_tex_%(tp)s, 1, cudaReadModeElementType> my_tex;

            __global__ void copy_texture(%(tp)s *dest)
            {
              int i = threadIdx.x;
              dest[i] = fp_tex1Dfetch(my_tex, i);
            }
            """
                % {"tp": tp_cstr}
            )

            copy_texture = mod.get_function("copy_texture")
            my_tex = mod.get_texref("my_tex")

            import pycuda.gpuarray as gpuarray

            shape = (384,)
            a = np.random.randn(*shape).astype(tp)
            a_gpu = gpuarray.to_gpu(a)
            a_gpu.bind_to_texref_ext(my_tex, allow_double_hack=True)

            dest = np.zeros(shape, dtype=tp)
            copy_texture(drv.Out(dest), block=shape + (1, 1), texrefs=[my_tex])

            assert la.norm(dest - a) == 0
Example #38
def warp_by_flow(vol, flow3d):
    """
    Only used for testing at the moment
    (warps currently backward)

    """
    with open(os.path.join(os.path.dirname(__file__),
                           'farneback_kernels.cu')) as f:
        read_data = f.read()
    f.closed

    mod = SourceModule(read_data)
    interpolation_kernel = mod.get_function('warpByFlowField')

    r1_texture = mod.get_texref('sourceTex')

    farneback3d._utils.ndarray_to_float_tex(r1_texture, vol)
    rtn_gpu = gpuarray.GPUArray(vol.shape, vol.dtype)
    flow_gpu = gpuarray.to_gpu(flow3d)

    block = (32, 32, 1)
    grid = (int(divup(flow3d.shape[3],
                      block[0])), int(divup(flow3d.shape[2], block[1])), 1)

    interpolation_kernel(flow_gpu,
                         rtn_gpu,
                         np.int32(flow_gpu.shape[3]),
                         np.int32(flow_gpu.shape[2]),
                         np.int32(flow_gpu.shape[1]),
                         np.float32(1),
                         np.float32(1),
                         np.float32(1),
                         block=block,
                         grid=grid)

    return rtn_gpu.get()
Example #39
#set thread grid for CUDA kernels
block_size_x, block_size_y = 16, 8  #hardcoded, tune to your needs
gridx = nWidth // block_size_x + 1 * (nWidth % block_size_x != 0)
gridy = nHeight // block_size_y + 1 * (nHeight % block_size_y != 0)
grid2D = (gridx, gridy, 1)
block2D = (block_size_x, block_size_y, 1)

#initialize pyCUDA context
cudaDevice = setCudaDevice(devN=useDevice, usingAnimation=True)

#Read and compile CUDA code
print "Compiling CUDA code"
cudaCodeString_raw = open("CUDAlatticeBoltzmann2D.cu", "r").read()
cudaCodeString = cudaCodeString_raw  # % { "BLOCK_WIDTH":block2D[0], "BLOCK_HEIGHT":block2D[1], "BLOCK_DEPTH":block2D[2], }
cudaCode = SourceModule(cudaCodeString)
tex_f1 = cudaCode.get_texref('tex_f1')
tex_f2 = cudaCode.get_texref('tex_f2')
tex_f3 = cudaCode.get_texref('tex_f3')
tex_f4 = cudaCode.get_texref('tex_f4')
tex_f5 = cudaCode.get_texref('tex_f5')
tex_f6 = cudaCode.get_texref('tex_f6')
tex_f7 = cudaCode.get_texref('tex_f7')
tex_f8 = cudaCode.get_texref('tex_f8')
tex_g1 = cudaCode.get_texref('tex_g1')
tex_g2 = cudaCode.get_texref('tex_g2')
tex_g3 = cudaCode.get_texref('tex_g3')
tex_g4 = cudaCode.get_texref('tex_g4')
tex_g5 = cudaCode.get_texref('tex_g5')
tex_g6 = cudaCode.get_texref('tex_g6')
tex_g7 = cudaCode.get_texref('tex_g7')
tex_g8 = cudaCode.get_texref('tex_g8')
Example #40
    mcopy.depth = nx
    arrcopy(mcopy, cf, tcf_gpu)

    # prepare kernels
    from pycuda.compiler import SourceModule
    kernels = kernels.replace('Dx', '16').replace('Dy', '16').replace(
        'nyz',
        str(ny * nz)).replace('nx',
                              str(nx)).replace('ny',
                                               str(ny)).replace('nz', str(nz))
    mod = SourceModule(kernels)

    naive = mod.get_function("naive")
    tex3d = mod.get_function("tex3d")

    tcf = mod.get_texref("tcf")
    tcf.set_array(tcf_gpu)

    # measure kernel execution time
    start = cuda.Event()
    stop = cuda.Event()
    start.record()

    naive(f_gpu, cf_gpu, block=(16, 16, 1), grid=(nz / 16, nx * ny / 16))
    tex3d(g_gpu,
          block=(16, 16, 1),
          grid=(nz / 16, nx * ny / 16),
          texrefs=[tcf])
    cuda.memcpy_dtoh(f, f_gpu)
    cuda.memcpy_dtoh(g, g_gpu)
    assert (np.linalg.norm(f - g) == 0)
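The fragment records the start event but the readout is cut off; a typical completion of the timing block, using the standard PyCUDA Event API, would be:

    stop.record()
    stop.synchronize()
    elapsed_ms = start.time_till(stop)   # elapsed time in milliseconds
    print("naive + tex3d kernels: %.3f ms" % elapsed_ms)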
Example #41
def weighted_basis_gpu(psf_size, grid_size, im_size, params,
                       lens_file=None, lens_psf_size=None, lens_grid_size=None):

  # generate grid
  psf_size = psf_size + (1 - np.mod(psf_size, 2))
  
  if lens_file:
    grid = scipy.misc.imread(lens_file, flatten=True)
    if np.max(grid) > 255:
      grid /= 2**(16-1)
    else:
      grid /= 255
  else:
    grid = np.zeros(psf_size, dtype=np.float32)
    grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1.
    grid = np.tile(grid, grid_size)
    lens_psf_size = psf_size
    #lens_grid_size = (1,1)
    lens_grid_size = grid_size

  grid_gpu = cu.matrix_to_array(grid, 'C')

  params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C')

  block_size = (16,16,1)
  output_size = np.array(grid_size)*np.array(psf_size)
  gpu_grid_size = (int(np.ceil(float(output_size[1])/block_size[0])),
                   int(np.ceil(float(output_size[0])/block_size[1])))

  weighted_basis_gpu = cua.empty((int(output_size[0]),
                                  int(output_size[1])), np.float32)

  preproc = '' #_generate_preproc(basis_gpu.dtype)
  mod = SourceModule(preproc + basis_code, keep=True)
  basis_fun_gpu = mod.get_function("weighted_basis")

  in_tex = mod.get_texref('in_tex')
  in_tex.set_array(grid_gpu)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

  params_tex = mod.get_texref('params_tex')
  params_tex.set_array(params_gpu)
  offset = ((np.array(im_size) - np.array(grid.shape)) /
            np.array(grid_size).astype(np.float32))
  offset = np.float32(offset)
  grid_scale = ((np.array(lens_grid_size) - 1) /
                (np.array(grid_size) - 1).astype(np.float32))
  grid_scale = np.float32(grid_scale)
  #psf_scale = ((np.array(lens_psf_size) - 1) /
  #             (np.array(psf_size) - 1).astype(np.float32))
  #psf_scale = np.float32(psf_scale)

  basis_fun_gpu(weighted_basis_gpu.gpudata,
                np.uint32(output_size[0]), np.uint32(output_size[1]),
                np.uint32(psf_size[0]), np.uint32(psf_size[1]),
                np.uint32(im_size[0]), np.uint32(im_size[1]),
                offset[0], offset[1],
                grid_scale[0], grid_scale[1],
                np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]),
                np.uint32(params.size/3),
                block=block_size, grid=gpu_grid_size)

  return weighted_basis_gpu
Example #42
    cex_gpu = cuda.to_device(set_c(cf, (None, -1, -1)))
    cey_gpu = cuda.to_device(set_c(cf, (-1, None, -1)))
    cez_gpu = cuda.to_device(set_c(cf, (-1, -1, None)))
    chx_gpu = cuda.to_device(set_c(cf, (None, 0, 0)))
    chy_gpu = cuda.to_device(set_c(cf, (0, None, 0)))
    chz_gpu = cuda.to_device(set_c(cf, (0, 0, None)))

    # prepare kernels
    from pycuda.compiler import SourceModule

    mod = SourceModule(kernels)
    update_e = mod.get_function("update_e")
    update_h = mod.get_function("update_h")
    update_src = mod.get_function("update_src")
    tcex = mod.get_texref("tcex")
    tcey = mod.get_texref("tcey")
    tcez = mod.get_texref("tcez")
    tchx = mod.get_texref("tchx")
    tchy = mod.get_texref("tchy")
    tchz = mod.get_texref("tchz")

    tcex.set_address(cex_gpu, cf.nbytes)
    tcey.set_address(cey_gpu, cf.nbytes)
    tcez.set_address(cez_gpu, cf.nbytes)
    tchx.set_address(chx_gpu, cf.nbytes)
    tchy.set_address(chy_gpu, cf.nbytes)
    tchz.set_address(chz_gpu, cf.nbytes)

    tpb = 512
    bpg = (nx * ny * nz) / tpb
Example #43
  if (x < Nx && y < Ny && z < Nz) {
    float value = tex3D(tex_in, (float) x, (float) y, float (z));

    surf3Dwrite((float) value, surf_out, sizeof(float) * x, y, z, cudaBoundaryModeZero);
  }

}
'''

mod = SourceModule(src_module, cache_dir=False, keep=False)

kernel = mod.get_function("test_3d_surf")
arg_types = (np.int32, np.int32, np.int32)

tex_in = mod.get_texref('tex_in')
surf_out = mod.get_surfref('surf_out')

# random shape
shape_x = np.random.randint(1, 255)
shape_y = np.random.randint(1, 255)
shape_z = np.random.randint(1, 255)

dtype = np.float32  # should match src_module's datatype

numpy_array_in = np.random.randn(shape_z, shape_y,
                                 shape_x).astype(dtype).copy()
cuda_array_in = numpy3d_to_array(numpy_array_in)
tex_in.set_array(cuda_array_in)

zeros = np.zeros_like(numpy_array_in)
def main():
    #Set up global timer
    tot_time = time.time()

    #Define constants
    BankSize = 8 # Do not go beyond 8!
    WarpSize = 32 #Do not change...
    DimGridX = 19
    DimGridY = 19
    BlockDimX = 256
    BlockDimY = 256
    SearchSpaceSize = 2**24 #BlockDimX * BlockDimY  * 32
    FitnessValDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize
    GenomeDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize
    AlignedByteLengthGenome = 4

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":AlignedByteLengthGenome,
                 "bit_length_edge_type":3,
                 "curand_nr_threads_per_block":32,
                 "nr_tile_types":2,
                 "nr_edge_types":8,
                 "warpsize":WarpSize,
                 "fit_dim_thread_x":32*BankSize,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":BlockDimX,
                 "fit_dim_grid_x":19,
                 "fit_dim_grid_y":19,
                 "fit_nr_four_permutations":24,
                 "fit_length_movelist":244,
                 "fit_nr_redundancy_grid_depth":2,
                 "fit_nr_redundancy_assemblies":10,
                 "fit_tile_index_starting_tile":0,
                 "glob_nr_tile_orientations":4,
                 "banksize":BankSize,
                 "curand_dim_block_x":BlockDimX
                }
    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))
    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )
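    # Jinja2 fills the {{ ... }} placeholders in alpha.cu with the values from
    # RenderArgs (e.g. warpsize, banksize), so the kernel is specialised at
    # render time; the exact placeholder names inside alpha.cu are assumed here.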

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("SearchSpaceKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")


    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )
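    # The resulting matrix pairs consecutive edge types (1 binds 2, 3 binds 4,
    # 5 binds 6) while types 0 and 7 bind nothing, i.e. an edge only sticks to
    # its complementary type.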

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix

    #Set-up genomes
    #dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
    #for i in range(0, GenomeDim/4):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
    #    dest[i*4 + 0] = 40
    #    dest[i*4 + 1] = 0
    #    dest[i*4 + 2] = 0
    #    dest[i*4 + 3] = 0

    dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes)
    #drv.memcpy_htod(dest_h, dest)
    #print "Genomes before: "
    #print dest

    #Set-up grids
    #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids    

    #Set-up fitness values
    #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32)
    #fitness_h = drv.mem_alloc(fitness.nbytes)
    #fitness_size = numpy.zeros(FitnessValDim).astype(numpy.uint32)
    fitness_size = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
    fitness_size_h = drv.mem_alloc(fitness_size.nbytes)
    #fitness_hash = numpy.zeros(FitnessValDim).astype(numpy.uint32)
    fitness_hash = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
    fitness_hash_h = drv.mem_alloc(fitness_hash.nbytes)
    #drv.memcpy_htod(fitness_h, fitness)
    #print "Fitness values:"
    #print fitness

    #Set-up grids
    #grids = numpy.zeros((GenomeDim, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    grids = drv.pagelocked_zeros((GenomeDim, DimGridX, DimGridY), numpy.uint8, "C", 0)
    grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids 

    #Set-up curand
    #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8);
    #curand_h = drv.mem_alloc(curand.nbytes)
    curand_h = drv.mem_alloc(40*GenomeDim)

    #SearchSpace control
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
   
    #Schedule kernel calls
    #MaxBlockDim = 100
    OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*WarpSize)
    MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*WarpSize)
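    # Split the 2**24 search space into MaxBlockCycles launches of
    # BlockDimX*BlockDimY*WarpSize genomes each; OffsetBlocks is the remainder
    # that does not fill a whole launch.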
    BlockCounter = 0
    print "Will do that many kernels a ", BlockDimX,"x", BlockDimY,"x ", WarpSize, ":", MaxBlockCycles
    #quit()

    #SET UP PROCESSING
    histo = {}
     
    #INITIALISATION
    CurandKernel(curand_h, block=(WarpSize,1,1), grid=(BlockDimX, BlockDimY))
    print "Finished Curand kernel, starting main kernel..."

    #FIRST GENERATION
    proc_time = time.time()
    print "Starting first generation..."
    start = drv.Event()
    stop = drv.Event()
    start.record()
    Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64(0), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY))
    stop.record()
    stop.synchronize()
    print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)    
    print "Copying..."
    drv.memcpy_dtoh(fitness_size, fitness_size_h)
    drv.memcpy_dtoh(fitness_hash, fitness_hash_h)
    drv.memcpy_dtoh(grids, grids_h)

    #INTERMEDIATE GENERATION
    for i in range(MaxBlockCycles-1):
        print "Starting generation: ", i+1
        start = drv.Event()
        stop = drv.Event()
        start.record()
        Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64((i+1)*BlockDimX*BlockDimY*WarpSize), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY))
        "Processing..."
        for j in range(grids.shape[0]):
#            if (fitness_hash[j]!=33) and (fitness_hash[j]!=44) and (fitness_hash[j]!=22):
            if fitness_hash[j] in histo: 
                histo[fitness_hash[j]] = (histo[fitness_hash[j]][0], histo[fitness_hash[j]][1]+1)
            else:
                histo[fitness_hash[j]] = (grids[j], 1)

        stop.record()
        stop.synchronize()
        print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
        print "This corresponds to %f polyomino classification a second."%((BlockDimX*BlockDimY*WarpSize)/(start.time_till(stop)*1e-3))
        print "Copying..."
        drv.memcpy_dtoh(fitness_size, fitness_size_h)
        drv.memcpy_dtoh(fitness_hash, fitness_hash_h)
        drv.memcpy_dtoh(grids, grids_h)

    #FINAL PROCESSING
    "Processing..."
    for i in range(grids.shape[0]):
        if fitness_hash[i] in histo:
            histo[fitness_hash[i]] = (histo[fitness_hash[i]][0], histo[fitness_hash[i]][1]+1)
        else:
            histo[fitness_hash[i]] = (grids[i], 1)

    print "Done!"

    #TIMING RESULTS
    print "Total time including set-up: ", (time.time() - tot_time)
    print "Total Processing time: ", (time.time() - proc_time)

    #OUTPUT    
    print histo
def main():

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":8,
                 "bit_length_edge_type":3, 
                 "curand_nr_threads_per_block":256,
                 "nr_tile_types":4,
                 "nr_edge_types":8,
                 "warpsize":32,
                 "fit_dim_thread_x":1,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":1 }

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))

    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("TestEdgeSortKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")

    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix

    #a =  numpy.random.randn(400).astype(numpy.uint8)
    #b = numpy.random.randn(400).astype(numpy.uint8)
    dest = numpy.arange(256).astype(numpy.uint8)
    for i in range(0, 256/8):
        dest[i*8 + 0] = 36
        dest[i*8 + 1] = 151
        dest[i*8 + 2] = 90
        dest[i*8 + 3] = 109
        dest[i*8 + 4] = 224
        dest[i*8 + 5] = 4
        dest[i*8 + 6] = 0
        dest[i*8 + 7] = 0
    
    dest_h = drv.mem_alloc(dest.nbytes)

    drv.memcpy_htod(dest_h, dest)
    print "before: "
    print dest
    curand = numpy.zeros(40*256).astype(numpy.uint8);
    curand_h = drv.mem_alloc(curand.nbytes)
    CurandKernel(curand_h, block=(32,1,1), grid=(1,1))
    Kernel(dest_h, curand_h, block=(32,1,1), grid=(1,1))
    drv.memcpy_dtoh(dest, dest_h)
    print "after: "
    print dest
Exemple #46
0
gridx = nWidth // block_size_x + 1 * (nWidth % block_size_x != 0)
gridy = nHeight // block_size_y + 1 * (nHeight % block_size_y != 0)
block2D = (block_size_x, block_size_y, 1)
grid2D = (gridx, gridy, 1)
grid2D_ising = (gridx // 2, gridy, 1)  #special grid to avoid neighbor conflicts
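# The halved x-grid suggests the usual checkerboard update for Ising models:
# each launch presumably touches only one sublattice, so neighbouring spins are
# never updated concurrently (the CUDA source itself is not shown here).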

#initialize pyCUDA context
cudaDevice = setCudaDevice(devN=useDevice, usingAnimation=True)

#Read and compile CUDA code
print "\nCompiling CUDA code"
cudaCodeString_raw = open("CUDAising2D.cu", "r").read()
cudaCodeString = cudaCodeString_raw  # % { "BLOCK_WIDTH":block2D[0], "BLOCK_HEIGHT":block2D[1], "BLOCK_DEPTH":block2D[2], }
cudaCode = SourceModule(cudaCodeString)
tex_spins = cudaCode.get_texref('tex_spinsIn')
isingKernel = cudaCode.get_function('ising_kernel')


########################################################################
def sendToScreen(plotData):
    floatToUchar(plotData, plotData_d)
    copyToScreenArray()


########################################################################
def swipe():
    randomNumbers_d = curandom.rand((nData))
    stepNumber = np.int32(0)
    #saveEnergy = np.int32(0)
    tex_spins.set_array(spinsInArray_d)
Exemple #47
0
  def set_params(self, psf_size, grid_size, im_size, params=None):

    # generate grid
    psf_size = np.array(psf_size)
    grid_size = np.array(grid_size)
    im_size = np.array(im_size)
    self.psf_size = psf_size + (1 - np.mod(psf_size, 2))
    self.grid_size = grid_size
    self.im_size = im_size

    if params is not None:
      self.params = params
      self.shape = (params.size / 3,)
    else:
      self._psf2params()
    
    if not self.lens:
      grid = np.zeros(self.psf_size, dtype=np.float32)
      grid[(self.psf_size[0]-1)/2, (self.psf_size[1]-1)/2] = 1.
      grid = np.tile(grid, self.grid_size)
      self.lens_psf_size = self.psf_size
      #lens_grid_size = (1,1)
      self.lens_grid_size = self.grid_size
      self.grid_gpu = cu.matrix_to_array(grid, 'C')

    params_count = np.uint32(self.params.size / 3)
    params_gpu = cu.matrix_to_array(self.params.astype(np.float32), 'C')

    #self.output_size = np.array(self.grid_size)*np.array(self.psf_size)
    output_size = np.array((np.prod(self.grid_size),
                            self.psf_size[0], self.psf_size[1]))

    preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype)
    mod = SourceModule(preproc + basis_code, keep=True)

    in_tex = mod.get_texref('in_tex')
    in_tex.set_array(self.grid_gpu)
    in_tex.set_filter_mode(cu.filter_mode.LINEAR)
    #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

    params_tex = mod.get_texref('params_tex')
    params_tex.set_array(params_gpu)
    offset = ((np.array(self.im_size) - np.array(grid.shape)) /
                   np.array(self.grid_size).astype(np.float32))
    offset = np.float32(offset)
    grid_scale = ((np.array(self.lens_grid_size) - 1) /
                       (np.array(self.grid_size) - 1).astype(np.float32))
    grid_scale = np.float32(grid_scale)

    block_size = (16,16,1)
    gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])),
                     int(np.ceil(float(params_count)/block_size[1])))

    basis_gpu = cua.empty((int(params_count),
                           int(output_size[0]), int(output_size[1]),
                           int(output_size[2])), np.float32)
    #self.basis_host = cu.pagelocked_empty((int(params_count),
    #    int(output_size[0]), int(output_size[1]), int(output_size[2])),
    #    np.float32, mem_flags=cu.host_alloc_flags.DEVICEMAP)

    basis_fun_gpu = mod.get_function("basis")

    basis_fun_gpu(basis_gpu.gpudata,
                  np.uint32(np.prod(output_size)),
                  np.uint32(self.grid_size[1]),
                  np.uint32(self.psf_size[0]), np.uint32(self.psf_size[1]),
                  np.uint32(self.im_size[0]), np.uint32(self.im_size[1]),
                  offset[0], offset[1],
                  grid_scale[0], grid_scale[1],
                  np.uint32(self.lens_psf_size[0]),
                  np.uint32(self.lens_psf_size[1]),
                  params_count,
                  block=block_size, grid=gpu_grid_size)

    self.basis_host = basis_gpu.get()
    self._intern_shape = self.basis_host.shape
    self.basis_host = self.basis_host.reshape((self._intern_shape[0],
        self._intern_shape[1]*self._intern_shape[2]*self._intern_shape[3]))
    self.basis_host = scipy.sparse.csr_matrix(self.basis_host)
Exemple #48
0
def generate_basis_gpu(psf_size, grid_size, im_size, params,
                       lens_file=None, lens_psf_size=None, lens_grid_size=None):

  # generate grid
  psf_size = psf_size + (1 - np.mod(psf_size, 2))
  
  if lens_file:
    grid = scipy.misc.imread(lens_file, flatten=True)
    if np.max(grid) > 255:
      grid /= 2**(16-1)
    else:
      grid /= 255
  else:
    grid = np.zeros(psf_size, dtype=np.float32)
    grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1.
    grid = np.tile(grid, grid_size)
    lens_psf_size = psf_size
    #lens_grid_size = (1,1)
    lens_grid_size = grid_size

  grid_gpu = cu.matrix_to_array(grid, 'C')

  # generate parameters of basis functions
  #p = max(1, np.floor(psf_size[0] / 2))
  #p = min(8, p)
  #dp = min(45. / np.floor(psf_size * grid_size / 2))
  #dp = min(0.8, dp)
  #dp = np.radians(dp)
  #p = np.radians(p)
  #l = max(1, np.floor(psf_size[0] / 2))
  #l = np.ceil(l / 2)
  #params = np.mgrid[-l:l+1, -l:l+1, -p:p+dp/10:dp].astype(np.float32).T
  #params = params.reshape(params.size / 3, 3)
  params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C')

  block_size = (16,16,1)
  output_size = np.array((np.prod(np.array(grid_size)),psf_size[0],psf_size[1]))
  gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])),
                   int(np.ceil(float(params.size/3)/block_size[1])))

  basis_gpu = cua.empty((params.size/3,
                         int(output_size[0]), int(output_size[1]),
                         int(output_size[2])), np.float32)

  preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype)
  mod = SourceModule(preproc + basis_code, keep=True)
  basis_fun_gpu = mod.get_function("basis")

  in_tex = mod.get_texref('in_tex')
  in_tex.set_array(grid_gpu)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

  params_tex = mod.get_texref('params_tex')
  params_tex.set_array(params_gpu)
  offset = ((np.array(im_size) - np.array(grid.shape)) /
            np.array(grid_size).astype(np.float32))
  offset = np.float32(offset)
  grid_scale = ((np.array(lens_grid_size) - 1) /
                (np.array(grid_size) - 1).astype(np.float32))
  grid_scale = np.float32(grid_scale)
  #psf_scale = ((np.array(lens_psf_size) - 1) /
  #             (np.array(psf_size) - 1).astype(np.float32))
  #psf_scale = np.float32(psf_scale)

  basis_fun_gpu(basis_gpu.gpudata,
                np.uint32(np.prod(output_size)), np.uint32(grid_size[1]),
                np.uint32(psf_size[0]), np.uint32(psf_size[1]),
                np.uint32(im_size[0]), np.uint32(im_size[1]),
                offset[0], offset[1],
                grid_scale[0], grid_scale[1],
                np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]),
                np.uint32(params.size/3),
                block=block_size, grid=gpu_grid_size)

  return basis_gpu
Exemple #49
0
class GPURBFEll(object):
    """RBF Kernel with ellpack format"""
    
    cache_size = 100
    
    Gamma = 1.0
    
    #template
    func_name = 'rbfEllpackILPcol2multi'
    
    #template
    module_file = os.path.dirname(__file__) + '/cu/KernelsEllpackCol2.cu'
    
    #template
    texref_nameI = 'VecI_TexRef'
    texref_nameJ = 'VecJ_TexRef'
    
    max_concurrent_kernels = 1
   
    def __init__(self,gamma=1.0,cache_size=100):
        """
        Initialize object
        
        Parameters
        -------------
        
        max_kernel_nr: int
            determines maximal concurrent kernel column gpu computation
        """
        self.cache_size=cache_size
  
        self.threadsPerRow=1
        self.prefetch=2        
        
        self.tpb=128
        self.Gamma = gamma
       
        
        
        
        
    def init_cuda(self,X,Y, cls_start, max_kernels=1 ):
        
        #assert X.shape[0]==Y.shape[0]
        self.max_concurrent_kernels = max_kernels 
        
        self.X =X
        self.Y = Y
        
        self.cls_start=cls_start.astype(np.int32)
        
        #handle to gpu memory for y for each concurrent classifier
        self.g_y=[]
        #handle to gpu memory for results for each concurrent classifier
        self.g_out=[] #gpu kernel out
        self.kernel_out=[] #cpu kernel out
        #blocks per grid for each concurrent classifier    
        self.bpg=[]
        
        #function reference
        self.func=[]
        
        #texture references for each concurrent kernel
        self.tex_ref=[]

        #main vectors 
        #gpu        
        self.g_vecI=[]
        self.g_vecJ=[]
        #cpu
        self.main_vecI=[]
        self.main_vecJ=[]    
        
        #cpu class 
        self.cls_count=[]
        self.cls=[]
        #gpu class
        self.g_cls_count=[]
        self.g_cls=[]
        
        self.sum_cls=[]
        
        for i in range(max_kernels):
            self.bpg.append(0)
            self.g_y.append(0)
            self.g_out.append(0)
            self.kernel_out.append(0)
            self.cls_count.append(0)
            self.cls.append(0)
            self.g_cls_count.append(0)
            self.g_cls.append(0)            
#            self.func.append(0)
#            self.tex_ref.append(0)
            self.g_vecI.append(0)
            self.g_vecJ.append(0)
#            self.main_vecI.append(0)
#            self.main_vecJ.append(0)
            self.sum_cls.append(0)
            
            
        self.N,self.Dim = X.shape
        column_size = self.N*4
        cacheMB = self.cache_size*1024*1024 # cache size in bytes (cache_size is given in MB)
        
        #how many kernel columns will be stored in cache
        cache_items = np.floor(cacheMB/column_size).astype(int)
        
        cache_items = min(self.N,cache_items)
        self.kernel_cache = pylru.lrucache(cache_items)        
        
        self.compute_diag()
        
        #cuda initialization
        cuda.init()        
        
        self.dev = cuda.Device(0)
        self.ctx = self.dev.make_context()

        #read cuda .cu file with module code
        with open(self.module_file, "r") as CudaFile:
            module_code = CudaFile.read()
        
        #compile module
        self.module = SourceModule(module_code,keep=True,no_extern_c=True)
        
        (g_gamma,gsize)=self.module.get_global('GAMMA')       
        cuda.memcpy_htod(g_gamma, np.float32(self.Gamma) )
        
        #get functions reference

        Dim =self.Dim        
        vecBytes = Dim*4
        for f in range(self.max_concurrent_kernels):
            gfun = self.module.get_function(self.func_name)
            self.func.append(gfun)

            #init texture for vector I
            vecI_tex=self.module.get_texref('VecI_TexRef')
            self.g_vecI[f]=cuda.mem_alloc( vecBytes)           
            vecI_tex.set_address(self.g_vecI[f],vecBytes)

            #init texture for vector J
            vecJ_tex=self.module.get_texref('VecJ_TexRef')
            self.g_vecJ[f]=cuda.mem_alloc( vecBytes)     
            vecJ_tex.set_address(self.g_vecJ[f],vecBytes)
            
            self.tex_ref.append((vecI_tex,vecJ_tex) )
            
            self.main_vecI.append(np.zeros((1,Dim),dtype=np.float32))
            self.main_vecJ.append(np.zeros((1,Dim),dtype=np.float32))
            
            texReflist = list(self.tex_ref[f])
            
            #function definition P-pointer i-int
            gfun.prepare("PPPPPPiiiiiiPPP",texrefs=texReflist)
            
        
        #transform X to particular format
        v,c,r=spf.csr2ellpack(self.X,align=self.prefetch)
        #copy format data structure to gpu memory
        
        self.g_val = cuda.to_device(v)
        self.g_col = cuda.to_device(c)
        self.g_len = cuda.to_device(r)
        self.g_sdot = cuda.to_device(self.Xsquare)
        
        self.g_cls_start = cuda.to_device(self.cls_start)
        
        
        
        
    def cls_init(self,kernel_nr,y_cls,cls1,cls2,cls1_n,cls2_n):
        """
        Prepare the cuda kernel call for kernel_nr and copy the data for one binary classifier (class 1 vs class 2).
         
        Parameters
        ------------
        kernel_nr : int
            concurrent kernel number
        y_cls : array-like
            binary class labels (1,-1)
        cls1: int
            first class number
        cls2: int
            second class number
        cls1_n : int
            number of elements of class 1
        cls2_n : int
            number of elements of class 2
        Notes
        -----
        The gpu result buffer (size 2*len(y_cls)) is allocated internally
        and stored in self.kernel_out[kernel_nr].
        
        """
        warp=32
        align_cls1_n =  cls1_n+(warp-cls1_n%warp)%warp
        align_cls2_n =  cls2_n+(warp-cls2_n%warp)%warp
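        # round both class sizes up to a multiple of the warp size (32) so each
        # class occupies whole warps in the kernel launch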
        
        self.cls1_N_aligned=align_cls1_n

        sum_cls= align_cls1_n+align_cls2_n   
        self.sum_cls[kernel_nr] = sum_cls
              
        
        self.cls_count[kernel_nr] = np.array([cls1_n,cls2_n],dtype=np.int32)
        self.cls[kernel_nr] = np.array([cls1,cls2],dtype=np.int32)  
        
        self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr])
        
        self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr])
        
        self.bpg[kernel_nr] =int( np.ceil( (self.threadsPerRow*sum_cls+0.0)/self.tpb ))
        
        self.g_y[kernel_nr] =  cuda.to_device(y_cls)
        
        self.kernel_out[kernel_nr] = np.zeros(2*y_cls.shape[0],dtype=np.float32)
        
        ker_out = self.kernel_out[kernel_nr]      
        self.g_out[kernel_nr] = cuda.to_device(ker_out) # cuda.mem_alloc_like(ker_out)
        
    
        #add prepare for device functions
        
    
    
    def K2Col(self,i,j,i_ds,j_ds,kernel_nr):
        """ 
        Computes the i-th and j-th kernel columns.

        Parameters
        ---------------
        i: int
            i-th kernel column number in subproblem
        j: int
            j-th kernel column number in subproblem

        i_ds: int
            i-th kernel column number in whole dataset
        j_ds: int
            j-th kernel column number in  whole dataset

        kernel_nr : int
            number of concurrent kernel
            
        Returns
        -------
        array-like
            both kernel columns, stored back to back in
            self.kernel_out[kernel_nr] (size 2*len(y_cls))
        
        """ 
        
        #make i-th and j-the main vectors
        vecI= self.main_vecI[kernel_nr]
        vecJ= self.main_vecJ[kernel_nr]
        
#        self.X[i_ds,:].todense(out=vecI)        
#        self.X[j_ds,:].todense(out=vecJ)  
        
        #vecI.fill(0)
        #vecJ.fill(0)
        
        
        
        #self.X[i_ds,:].toarray(out=vecI)        
        #self.X[j_ds,:].toarray(out=vecJ)        
        
        vecI=self.X.getrow(i_ds).todense()
        vecJ=self.X.getrow(j_ds).todense()
        
        
        #copy them to texture
        cuda.memcpy_htod(self.g_vecI[kernel_nr],vecI)
        cuda.memcpy_htod(self.g_vecJ[kernel_nr],vecJ)
        
#        temp = np.empty_like(vecI)
#        cuda.memcpy_dtoh(temp,self.g_vecI[kernel_nr])        
#        print 'temp',temp
        #lauch kernel
        
        gfunc=self.func[kernel_nr]
        gy = self.g_y[kernel_nr]
        gout = self.g_out[kernel_nr]
        gN = np.int32(self.N)
        g_i = np.int32(i)
        g_j = np.int32(j)
        g_ids = np.int32(i_ds)
        g_jds = np.int32(j_ds)
        gNalign = np.int32(self.cls1_N_aligned)
        gcs = self.g_cls_start
        gcc = self.g_cls_count[kernel_nr]
        gc  = self.g_cls[kernel_nr]
        bpg=self.bpg[kernel_nr]
        
        
        #print 'start gpu i,j,kernel_nr ',i,j,kernel_nr
        #texReflist = list(self.tex_ref[kernel_nr])                
        #gfunc(self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc,block=(self.tpb,1,1),grid=(bpg,1),texrefs=texReflist)
        #print 'end gpu',i,j
        #copy the results
       
        #grid=(bpg,1),block=(self.tpb,1,1)
        gfunc.prepared_call((bpg,1),(self.tpb,1,1),self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc)
        
        cuda.memcpy_dtoh(self.kernel_out[kernel_nr],gout)

                
        
        return self.kernel_out[kernel_nr]
        
    def K_vec(self,vec):
        '''
        vec - array-like, row ordered data, should not be too big
        '''
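        # Vectorised RBF: ||x - v||^2 = x.x + v.v - 2 x.v, so the kernel values
        # are exp(-Gamma * (x2 + v2 - 2*dot)) computed for all rows of X at once.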
        
        dot=self.X.dot(vec.T)  
        x2=self.Xsquare.reshape((self.Xsquare.shape[0],1))
        if(sp.issparse(vec)):        
            v2 = vec.multiply(vec).sum(1).reshape((1,vec.shape[0]))        
        else:
            v2 =  np.einsum('...i,...i',vec,vec)
        
        return np.exp(-self.Gamma*(x2+v2-2*dot))
        
    def compute_diag(self):
        """
        Computes kernel matrix diagonal
        """
        
        #for rbf the diagonal consists of ones, since exp(0) == 1
        self.Diag = np.ones(self.X.shape[0],dtype=np.float32)

        if(sp.issparse(self.X)):
            # result as matrix
            self.Xsquare = self.X.multiply(self.X).sum(1)
            #result as array
            self.Xsquare = np.asarray(self.Xsquare).flatten()
        else:
            self.Xsquare =np.einsum('...i,...i',self.X,self.X)
        
        
    def clean(self,kernel_nr):
        """ clean the kernel cache """
        #self.kernel_cache.clear()

        self.bpg[kernel_nr]=0

          
        
        
        


    def clean_cuda(self):
        '''
        clean all cuda resources
        '''
        
        
        for f in range(self.max_concurrent_kernels):
            
            #vecI_tex=??
            #self.g_vecI[f].free()     
            del self.g_vecI[f]

            #init texture for vector J
            #vecJ_tex=??
            #self.g_vecJ[f].free()
            del self.g_vecJ[f]
            self.g_cls_count[f].free()
            self.g_cls[f].free()
            self.g_y[f].free()
            self.g_out[f].free()

        #test it
        #del self.g_out[f] ??
        
        #copy format data structure to gpu memory
        
        self.g_val.free()
        self.g_col.free()
        self.g_len.free()
        self.g_sdot.free()
        self.g_cls_start.free()
         
        print self.ctx 
        self.ctx.pop()
        
        print self.ctx
        del self.ctx
        
        
        

    def predict_init(self, SV):
        """
        Init the classifier for prediction
        """        
        
        self.X =SV
        self.compute_diag()
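# A minimal usage sketch for GPURBFEll (not part of the original example); the
# data X/Y, class boundaries cls_start and per-classifier labels y_cls are
# assumed to be prepared elsewhere:
#
#   rbf = GPURBFEll(gamma=0.5, cache_size=100)
#   rbf.init_cuda(X, Y, cls_start, max_kernels=1)
#   rbf.cls_init(0, y_cls, cls1=0, cls2=1, cls1_n=n1, cls2_n=n2)
#   cols = rbf.K2Col(i, j, i_ds, j_ds, kernel_nr=0)
#   rbf.clean_cuda()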
def main():

    #Initialise InteractionMatrix
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Initialise GPU (equivalent of autoinit)
    drv.init()
    assert drv.Device.count() >= 1
    dev = drv.Device(0)
    ctx = dev.make_context(0)

    #Convert GlobalParams to List
    GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32)
    count = 0
    for x in GlobalParamsDict.keys():
        GlobalParams[count] = GlobalParamsDict[x]
        count += 1

    #Convert FitnessParams to List
    FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32)
    count = 0
    for x in FitnessParamsDict.keys():
        FitnessParams[count] = FitnessParamsDict[x]
        count += 1

    #Convert GAParams to List
    GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32)
    count = 0
    for x in GAParamsDict.keys():
        GAParams[count] = GAParamsDict[x]
        count += 1

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', 'cuda'))

    # Load source code from file
    Source = env.get_template('kernel.cu') #Template( file(KernelFile).read() )

    #Create dictionary argument for rendering
    RenderArgs= {"params_size":GlobalParams.nbytes,\
                "fitnessparams_size":FitnessParams.nbytes,\
                "gaparams_size":GAParams.nbytes,\
                "genome_bytelength":int(ByteLengthGenome),\
                "genome_bitlength":int(BitLengthGenome),\
                "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\
                "textures":range( 0, NrFitnessFunctionGrids ),\
                "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\
                "with_mixed_crossover":WithMixedCrossover,
                "with_bank_conflict":WithBankConflict,
                "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection,
                "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues,
                "with_uniform_crossover":WithUniformCrossover,
                "with_single_point_crossover":WithSinglePointCrossover,
                "with_surefire_mutation":WithSurefireMutation,
                "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory,
                "ga_threaddimx":int(ThreadDim),
                "glob_nr_tiletypes":int(NrTileTypes),
                "glob_nr_edgetypes":int(NrEdgeTypes),
                "glob_nr_tileorientations":int(NrTileOrientations),
                "fit_dimgridx":int(DimGridX),
                "fit_dimgridy":int(DimGridY),
                "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids),
                "fit_nr_fourpermutations":int(NrFourPermutations),
                "fit_assembly_redundancy":int(NrAssemblyRedundancy),
                "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock),
                "sort_threaddimx":int(Sort_ThreadDimX),
                "glob_nr_genomes":int(NrGenomes),
                "fit_dimthreadx":int(ThreadDimX),
                "fit_dimthready":int(ThreadDimY),
                "fit_dimsubgridx":int(SubgridDimX),
                "fit_dimsubgridy":int(SubgridDimY),
                "fit_nr_subgridsperbank":int(NrSubgridsPerBank),
                "glob_bitlength_edgetype":int(EdgeTypeBitLength)
                }

    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_11", code="sm_11", cache_dir=None)

    #Allocate values on GPU
    Genomes_h = drv.mem_alloc(Genomes.nbytes)
    FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes)
    FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes)
    AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes)
    Mutexe_h = drv.mem_alloc(Mutexe.nbytes)
    ReductionList_h = drv.mem_alloc(ReductionList.nbytes)

    #Copy values to global memory
    drv.memcpy_htod(Genomes_h, Genomes)
    drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums)
    drv.memcpy_htod(FitnessValues_h, FitnessValues)
    drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
    drv.memcpy_htod(Mutexe_h, Mutexe)

    #Copy values to constant / texture memory
    for id in range(0, NrFitnessFunctionGrids):
        FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) )
        drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C")
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")

    GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address
    drv.memcpy_htod(GlobalParams_h[0], GlobalParams)
    FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address
    drv.memcpy_htod(FitnessParams_h[0], FitnessParams)
    GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address
    drv.memcpy_htod(GAParams_h[0], GAParams)
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)
    FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst")
    FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst")

    #Set up curandStates
    curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper)
    CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes)

    #Compile kernels
    curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel")
    fitness_fnc = KernelSourceModule.get_function("FitnessKernel")
    sorting_fnc = KernelSourceModule.get_function("SortingKernel")
    ga_fnc = KernelSourceModule.get_function("GAKernel")

    #Initialise Curand
    curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1))

    #Build parameter lists for FitnessKernel and GAKernel
    FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h);
    SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h)
    GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h);

    #TEST ONLY
    return
    #TEST ONLY

    #Initialise CUDA timers
    start = drv.Event()
    stop = drv.Event()

    #execute kernels for specified number of generations
    start.record()
    for gen in range(0, GlobalParamsDict["NrGenerations"]):
        #print "Processing Generation: %d"%(gen)

        #fitness_fnc(*(FitnessKernelParams), block=fit_blocks, grid=fit_grid)

        #Launch CPU processing (should be asynchroneous calls)

        sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel

        drv.memcpy_dtoh(ReductionList, ReductionList_h) #Copy from Device to Host and finish sorting
        FitnessSumConst = ReductionList.sum()
        drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory
        drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitneValues from Device to Device Const

        ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids)

        drv.memcpy_dtoh(Genomes, Genomes_h) #Copy data from GPU
        drv.memcpy_dtoh(FitnessValues, FitnessValues_h)
        drv.memcpy_dtoh(AssembledGrids, AssembledGrids_h)

    stop.record()
    stop.synchronize()
    print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
    print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
    pass
def main():

    #FourPermutations set-up
    FourPermutations = numpy.array([ [1,2,3,4],
                                  [1,2,4,3],
                                  [1,3,2,4],
                                  [1,3,4,2],
                                  [1,4,2,3],
                                  [1,4,3,2],
                                  [2,1,3,4],
                                  [2,1,4,3],
                                  [2,3,1,4],
                                  [2,3,4,1],
                                  [2,4,1,3],
                                  [2,4,3,1],
                                  [3,2,1,4],
                                  [3,2,4,1],
                                  [3,1,2,4],
                                  [3,1,4,2],
                                  [3,4,2,1],
                                  [3,4,1,2],
                                  [4,2,3,1],
                                  [4,2,1,3],
                                  [4,3,2,1],
                                  [4,3,1,2],
                                  [4,1,2,3],
                                  [4,1,3,2],]).astype(numpy.uint8)

    BankSize = 8 # Do not go beyond 8!

    #Define constants
    DimGridX = 19
    DimGridY = 19
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
    BlockDimX = 100
    BlockDimY = 100
    SearchSpaceSize = BlockDimX * BlockDimY  * 32
    #BlockDimX = 600
    #BlockDimY = 600
    FitnessValDim = SearchSpaceSize
    GenomeDim = SearchSpaceSize

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":4,
                 "bit_length_edge_type":3,
                 "curand_nr_threads_per_block":32,
                 "nr_tile_types":2,
                 "nr_edge_types":8,
                 "warpsize":32,
                 "fit_dim_thread_x":32*BankSize,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":BlockDimX,
                 "fit_dim_grid_x":19,
                 "fit_dim_grid_y":19,
                 "fit_nr_four_permutations":24,
                 "fit_length_movelist":244,
                 "fit_nr_redundancy_grid_depth":2,
                 "fit_nr_redundancy_assemblies":10,
                 "fit_tile_index_starting_tile":0,
                 "glob_nr_tile_orientations":4,
                 "banksize":BankSize,
                 "curand_dim_block_x":BlockDimX
                }
    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))

    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("SearchSpaceKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")


    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix



    #Set-up genomes
    dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
    #for i in range(0, GenomeDim/4):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
    #    dest[i*4 + 0] = 40
    #    dest[i*4 + 1] = 0
    #    dest[i*4 + 2] = 0
    #    dest[i*4 + 3] = 0

    dest_h = drv.mem_alloc(GenomeDim*4) #dest.nbytes)
    #drv.memcpy_htod(dest_h, dest)
    #print "Genomes before: "
    #print dest

    #Set-up grids
    #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids    

    #Set-up fitness values
    #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32)
    #fitness_h = drv.mem_alloc(fitness.nbytes)
    fitness_left = numpy.zeros(FitnessValDim).astype(numpy.float32)
    fitness_left_h = drv.mem_alloc(fitness_left.nbytes)
    fitness_bottom = numpy.zeros(FitnessValDim).astype(numpy.float32)
    fitness_bottom_h = drv.mem_alloc(fitness_bottom.nbytes)
    #drv.memcpy_htod(fitness_h, fitness)
    #print "Fitness values:"
    #print fitness

    #Set-up grids
    grids = numpy.zeros((10000*32, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids 

    #Set-up curand
    #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8);
    #curand_h = drv.mem_alloc(curand.nbytes)
    curand_h = drv.mem_alloc(40*GenomeDim)

    #Set-up four permutations
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)

    #SearchSpace control
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
   
    #Schedule kernel calls
    #MaxBlockDim = 100
    OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*32)
    MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*32)
    BlockCounter=0
    print "Will do that many kernels a ",BlockDimX,"x",BlockDimY,":", MaxBlockCycles

    for i in range(MaxBlockCycles):
        #Set-up timer
        start = drv.Event()
        stop = drv.Event()
        start.record()
        print "Start kernel:"

        #Call kernels
        CurandKernel(curand_h, block=(32,1,1), grid=(BlockDimX, BlockDimY))
        print "Finished Curand kernel, starting main kernel..."
        Kernel(dest_h, grids_h, fitness_left_h, fitness_bottom_h, curand_h, block=(32*BankSize,1,1), grid=(BlockDimX,BlockDimY))

        #End timer
        stop.record()
        stop.synchronize()
        print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
        #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
        pass

    #Output
    #drv.memcpy_dtoh(dest, dest_h)
    #print "Genomes after: "
    #print dest[0:4]
    
    drv.memcpy_dtoh(fitness_left, fitness_left_h)
    print "FitnessLeft after: "
    print fitness_left#[1000000:1000500]

    drv.memcpy_dtoh(fitness_bottom, fitness_bottom_h)
    print "FitnessBottom after: "
    print fitness_bottom#[1000000:1000500]

    drv.memcpy_dtoh(grids, grids_h)
    print "Grids[0] after: "
    for i in range(0,5):
        print "Grid ",i,": "
        print grids[i]
    def get_kernel(self, with_scaling, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
                ]
            ))
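        # The declaration above corresponds to a CUDA signature of roughly
        #   __global__ void apply_el_local_mat_smem_mat(
        #       float *out_vector, unsigned char *gmem_matrix,
        #       float *debugbuf, unsigned int microblock_count)
        # (float or double depending on given.float_type).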

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "in_vector_tex"),
                ])
        if with_scaling:
            cmod.append(
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "scaling_tex"),
                )

        par = self.plan.parallelism

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                    self.plan.aligned_preimage_dofs_per_microblock),
                Define("MB_EL_COUNT", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
                Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
                Define("MATRIX_SEGMENT_BYTES",
                    "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),

                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_matrix"),
                    "MATRIX_SEGMENT_FLOATS")),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "dof_buffer"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "DOFS_PER_SEGMENT"),
                    ),
                CudaShared(POD(numpy.uint16, "segment_start_el")),
                CudaShared(POD(numpy.uint16, "segment_stop_el")),
                CudaShared(POD(numpy.uint16, "segment_el_count")),
                Line(),
                ArrayInitializer(
                        CudaConstant(
                            ArrayOf(
                                POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                        [(chk*self.plan.segment_size)//given.dofs_per_el()
                            for chk in range(self.plan.segments_per_microblock())]
                        ),
                ArrayInitializer(
                        CudaConstant(
                            ArrayOf(
                                POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                        [min(given.microblock.elements,
                            (chk*self.plan.segment_size+self.plan.segment_size-1)
                                //given.dofs_per_el()+1)
                            for chk in range(self.plan.segments_per_microblock())]
                        ),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate this dof's element", [
            Initializer(POD(numpy.uint8, "mb_el"),
                "MB_DOF/DOFS_PER_EL") ])

        if self.plan.use_prefetch_branch:
            f_body.extend_log_block("calculate segment responsibility data", [
                If("THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count", "segment_stop_el-segment_start_el"),
                        ])
                    ),
                S("__syncthreads()")
                ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_matrix",
                base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                bytes="MATRIX_SEGMENT_BYTES",
                descr="load matrix segment")
            +[S("__syncthreads()")]
            )

        # ---------------------------------------------------------------------
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                    self.plan.segment_size):
                result.extend(
                        [S("__syncthreads()")]
                        +[Assign(
                            "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                            % (inl, load_segment_start)
                            )
                        for inl in range(par.inline)]
                        +[S("__syncthreads()"),
                        Line(),
                        ])

                for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                                S("result%d += "
                                    "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                                    "*"
                                    "dof_buffer[PAR_MB_NR][%d][%d]"
                                    % (inl, dof, inl, dof-load_segment_start))
                                )
                result.append(Line())
            return result

        from hedge.backends.cuda.tools import unroll
        def get_direct_tex_mat_mul_code():
            return (
                    [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
                    + [POD(float_type, "lm"), Line()]
                    + unroll(
                        lambda j: [
                        Assign("fof%d" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)"
                            % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},)
                        for inl in range(par.inline)
                        ]+[
                        Assign("lm",
                            "smem_matrix["
                            "%(row)s*MATRIX_COLUMNS + %(j)s]"
                            % {"j":j, "row": "SEGMENT_DOF"},
                            )
                        ]+[
                        S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl})
                        for inl in range(par.inline)
                        ],
                        total_number=self.plan.preimage_dofs_per_el,
                        max_unroll=self.plan.max_unroll)
                    + [Line()])

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex,"
                        "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                    ]+[Line()]
                    +get_mat_mul_code(fetch_count)
                    +[
                    If(write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl,
                                "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl}))
                                )
                            for inl in range(par.inline)
                            ])
                        )
                    ])
                )

        if self.plan.use_prefetch_branch:
            from cgen import make_multiple_ifs
            f_body.append(make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                        mat_mul_outer_loop(fetch_count))
                    for fetch_count in
                    range(1, self.plan.max_elements_touched_by_segment()+1)]
                    ))
        else:
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=12"]
                )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        func.prepare(
                "PPPI",
                block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
                texrefs=texrefs)

        return func, in_vector_texref, scaling_texref
mod2 = SourceModule( kernels2.replace('TPB',str(tpb)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) )
mod3 = SourceModule( kernels3.replace('TPB',str(tpb)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) )
mod4 = SourceModule( kernels4.replace('Dx',str(Dx)).replace('Dy',str(Dy)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) )
mod5 = SourceModule( kernels5.replace('Dx',str(Dx)).replace('Dy',str(Dy)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) )

h1 = mod1.get_function("update_h")
e1 = mod1.get_function("update_e")
h2 = mod2.get_function("update_h")	# avoids misaligned access
e2 = mod2.get_function("update_e")
h3 = mod3.get_function("update_h")	# avoids misaligned and duplicated access
e3 = mod3.get_function("update_e")
h4 = mod4.get_function("update_h")	# avoids misaligned/duplicated access, 2-D thread block
e4 = mod4.get_function("update_e")
e5 = mod5.get_function("update_e")	# avoids misaligned/duplicated access, 2-D thread block, coefficients via tex3D

tcex = mod5.get_texref("tcex")
tcey = mod5.get_texref("tcey")
tcez = mod5.get_texref("tcez")
tcex.set_array(tcex_gpu)
tcey.set_array(tcey_gpu)
tcez.set_array(tcez_gpu)

h1.prepare("PPPPPP", block=(tpb,1,1))
e1.prepare("PPPPPPPPP", block=(tpb,1,1))
h2.prepare("PPPPPP", block=(tpb,1,1))
e2.prepare("PPPPPPPPP", block=(tpb,1,1))
h3.prepare("PPPPPP", block=(tpb,1,1))
e3.prepare("PPPPPPPPP", block=(tpb,1,1))
h4.prepare("PPPPPP", block=(Dx,Dy,1))
e4.prepare("PPPPPPPPP", block=(Dx,Dy,1))
e5.prepare("PPPPPP", block=(Dx,Dy,1), texrefs=[tcex,tcey,tcez])
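
# Launch sketch (not part of the original snippet): the buffer names and grid
# shape below are assumptions for illustration only; the prepared argument
# order must match the kernel signatures in the .cu source.
grid = (nx*nz/Dx, ny/Dy)    # hypothetical grid covering the domain
for tstep in xrange(1, 1001):
    h4.prepared_call(grid, hx_gpu, hy_gpu, hz_gpu, ex_gpu, ey_gpu, ez_gpu)
    e5.prepared_call(grid, ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu)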
    arrcopy(mcopy, set_c(f, (None, -1, -1)), cex_gpu)
    arrcopy(mcopy, set_c(f, (-1, None, -1)), cey_gpu)
    arrcopy(mcopy, set_c(f, (-1, -1, None)), cez_gpu)
    arrcopy(mcopy, set_c(f, (None, 0, 0)), chx_gpu)
    arrcopy(mcopy, set_c(f, (0, None, 0)), chy_gpu)
    arrcopy(mcopy, set_c(f, (0, 0, None)), chz_gpu)

    # prepare kernels
    from pycuda.compiler import SourceModule
    mod = SourceModule(kernels)
    update_e = mod.get_function("update_e")
    update_h = mod.get_function("update_h")
    update_src = mod.get_function("update_src")

    # bind texture references to the CUDA arrays holding the update coefficients
    tcex = mod.get_texref("tcex")
    tcey = mod.get_texref("tcey")
    tcez = mod.get_texref("tcez")
    tchx = mod.get_texref("tchx")
    tchy = mod.get_texref("tchy")
    tchz = mod.get_texref("tchz")

    tcex.set_array(cex_gpu)
    tcey.set_array(cey_gpu)
    tcez.set_array(cez_gpu)
    tchx.set_array(chx_gpu)
    tchy.set_array(chy_gpu)
    tchz.set_array(chz_gpu)

    Db = (16, 4, 4)
    Dg = (nx / 16, ny * nz / (4 * 4))
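    # 16*4*4 = 256 threads per block, one thread per cell: the grid therefore
    # spans nx/16 blocks along x and (ny*nz)/16 blocks along y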
def main():

    #Initialise InteractionMatrix
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Initialise GPU (equivalent of autoinit)
    drv.init()
    assert drv.Device.count() >= 1
    dev = drv.Device(0)
    ctx = dev.make_context(0)

    #Convert GlobalParams to List
    GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32)
    count = 0
    for x in GlobalParamsDict.keys():
        GlobalParams[count] = GlobalParamsDict[x]
        count += 1

    #Convert FitnessParams to List
    FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32)
    count = 0
    for x in FitnessParamsDict.keys():
        FitnessParams[count] = FitnessParamsDict[x]
        count += 1

    #Convert GAParams to List
    GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32)
    count = 0
    for x in GAParamsDict.keys():
        GAParams[count] = GAParamsDict[x]
        count += 1

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main_discoverytime', './templates'))

    # Load source code from file
    Source = env.get_template('./kernel.cu') #Template( file(KernelFile).read() )

    #Create dictionary argument for rendering
    RenderArgs= {"params_size":GlobalParams.nbytes,\
                "fitnessparams_size":FitnessParams.nbytes,\
                "gaparams_size":GAParams.nbytes,\
                "genome_bytelength":int(ByteLengthGenome),\
                "genome_bitlength":int(BitLengthGenome),\
                "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\
                "textures":range( 0, NrFitnessFunctionGrids ),\
                "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\
                "with_mixed_crossover":WithMixedCrossover,
                "with_bank_conflict":WithBankConflict,
                "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection,
                "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues,
                "with_uniform_crossover":WithUniformCrossover,
                "with_single_point_crossover":WithSinglePointCrossover,
                "with_surefire_mutation":WithSurefireMutation,
                "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory,
                "ga_threaddimx":int(GA_ThreadDim),
                "glob_nr_tiletypes":int(NrTileTypes),
                "glob_nr_edgetypes":int(NrEdgeTypes),
                "glob_nr_tileorientations":int(NrTileOrientations),
                "fit_dimgridx":int(DimGridX),
                "fit_dimgridy":int(DimGridY),
                "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids),
                "fit_nr_fourpermutations":int(NrFourPermutations),
                "fit_assembly_redundancy":int(NrAssemblyRedundancy),
                "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock),
                "sort_threaddimx":int(Sort_ThreadDimX),
                "glob_nr_genomes":int(NrGenomes),
                "fit_dimthreadx":int(ThreadDimX),
                "fit_dimthready":int(ThreadDimY),
                "fit_dimsubgridx":int(SubgridDimX),
                "fit_dimsubgridy":int(SubgridDimY),
                "fit_nr_subgridsperbank":int(NrSubgridsPerBank),
                "glob_bitlength_edgetype":int(EdgeTypeBitLength),
                "fitness_break_value":int(BitLengthGenome),   # ADAPTED FOR DISCOVERY KERNEL
                "fitness_flag_index":int(NrGenomes)
                }

    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_20", code="sm_20", cache_dir=None)
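    # no_extern_c=True stops SourceModule from wrapping the whole source in
    # extern "C" (presumably because the rendered kernel pulls in C++ headers
    # such as curand_kernel.h and declares its own C linkage where needed);
    # arch/code target compute capability 2.0 (Fermi)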

    #Allocate values on GPU
    Genomes_h = drv.mem_alloc(Genomes.nbytes)
    FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes)
    FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes)
    AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes)
    Mutexe_h = drv.mem_alloc(Mutexe.nbytes)
    #ReductionList_h = drv.mem_alloc(ReductionList.nbytes)

    #Copy values to global memory
    drv.memcpy_htod(Genomes_h, Genomes)
    drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums)
    drv.memcpy_htod(FitnessValues_h, FitnessValues)
    drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
    drv.memcpy_htod(Mutexe_h, Mutexe)

    #Copy values to constant / texture memory
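    # matrix_to_texref() copies each 2-D numpy array into a CUDA array and binds
    # it to the named texture reference; order="C" keeps the row-major layout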
    for id in range(0, NrFitnessFunctionGrids):
        FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) )
        drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C")
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")

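    # get_global() returns a (device_pointer, size_in_bytes) tuple, hence the
    # [0] index when copying into constant memory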
    GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address
    drv.memcpy_htod(GlobalParams_h[0], GlobalParams)
    FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address
    drv.memcpy_htod(FitnessParams_h[0], FitnessParams)
    GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address
    drv.memcpy_htod(GAParams_h[0], GAParams)
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)
    FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst")
    FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst")

    #Set up curandStates
    curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper)
    CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes)

    #Compile kernels
    curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel")
    #fitness_fnc = KernelSourceModule.get_function("FitnessKernel")
    sorting_fnc = KernelSourceModule.get_function("SortingKernel")
    ga_fnc = KernelSourceModule.get_function("GAKernel")

    #Initialise Curand
    curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1))

    #Build parameter lists for FitnessKernel and GAKernel
    FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h);
    SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h)
    GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h);

    #TEST ONLY
    #return #ADAPTED
    #TEST ONLY

    #START ADAPTED
    print "GENOMES NOW:\n"
    print Genomes
    print ":::STARTING KERNEL EXECUTION:::"
    #STOP ADAPTED

    #Discovery time parameters
    min_fitness_value = BitLengthGenome # Want all bits set
    mutation_rate = -2.0 #normally: -2


    #Define Numpy construct to sideways join arrays (glue columns together)
    #Taken from: http://stackoverflow.com/questions/5355744/numpy-joining-structured-arrays
    #def join_struct_arrays(arrays):
    #    sizes = np.array([a.itemsize for a in arrays])
    #    offsets = np.r_[0, sizes.cumsum()]
    #    n = len(arrays[0])
    #    joint = np.empty((n, offsets[-1]), dtype=np.int32)
    #    for a, size, offset in zip(arrays, sizes, offsets):
    #        joint[:,offset:offset+size] = a.view(np.int32).reshape(n,size)
    #    dtype = sum((a.dtype.descr for a in arrays), [])
    #    return joint.ravel().view(dtype)
    #Test join_struct_arrays:
    #a = np.array([[1, 2], [11, 22],  [111, 222]]).astype(np.int32);
    #b = np.array([[3, 4], [33, 44],  [333, 444]]).astype(np.int32);
    #c = np.array([[5, 6], [55, 66],  [555, 666]]).astype(np.int32);
    #print "Test join_struct_arrays:"
    #print join_struct_arrays([a, b, c]) #FAILED
    #Set up PYTABLES
    #class GAGenome(IsDescription):
        #gen_id = Int32Col()
        #fitness_val = Float32Col()
        #genome = StringCol(mByteLengthGenome)
        #last_nr_mutations = Int32Col() # Contains the Nr of mutations genome underwent during this generation
        #mother_id = Int32Col() # Contains the crossover "mother"
        #father_id = Int32Col()  # Contains the crossover "father" (empty if no crossing over)
        #assembledgrid      = StringCol(DimGridX*DimGridY)   # 16-character String
    #class GAGenerations(IsDescription):
    #    nr_generations = Int32Col()
    #    nr_genomes = Int32Col()
    #    mutation_rate = Float32Col() # Contains the Nr of mutations genome underwent during this generation

    #from datetime import datetime

    #filename = "fujiama_"+str(NrGenomes)+"_"+str(RateMutation)+"_"+".h5"
    #print filename
    #h5file = openFile(filename, mode = "w", title = "GA FILE")
    #group = h5file.createGroup("/", 'fujiama_ga', 'Fujiama Genetic Algorithm output')
    #table = h5file.createTable(group, 'GaGenerations', GAGenerations, "Raw data")
    #atom = Atom.from_dtype(np.float32)

    #Initialise File I/O
    FILE = open("fujiamakernel_nrgen-" + str(NrGenomes) + "_adaptation.plot", "w")

    #ADAPTED FOR TESTING HISTOGRAM
    #TestValues = [13,24,26,31,32,14]
    #print np.histogram(TestValues,  bins=[0, 24, 32])[0][1]
    #quit()

    #Initialise CUDA timers
    start = drv.Event()
    stop = drv.Event()


    while mutation_rate < 1: # normally: 1

        #ds = h5file.createArray(f.root, 'ga_raw_'+str(mutation_rate), atom, x.shape)
        mutation_rate += 0.1
        GAParams[0]  = 10.0 ** mutation_rate
        drv.memcpy_htod(GAParams_h[0], GAParams)
        print "Mutation rate: ", GAParams[0]

        #ADAPTED: Initialise global memory (absolutely necessary!!)
        drv.memcpy_htod(Genomes_h, Genomes)
        drv.memcpy_htod(FitnessValues_h, FitnessValues)
        drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
        drv.memcpy_htod(Mutexe_h, Mutexe)

        #execute kernels for specified number of generations
        start.record()

        biggest_fit = 0
        reprange = 100
        average_breakup = np.zeros((reprange)).astype(np.float32)
        
        for rep in range(0, reprange):
            breakup_generation = GlobalParamsDict["NrGenerations"]
            dontcount = 0
            #ADAPTED: Initialise global memory (absolutely necessary!!)
            drv.memcpy_htod(Genomes_h, Genomes)
            drv.memcpy_htod(FitnessValues_h, FitnessValues)
            drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
            drv.memcpy_htod(Mutexe_h, Mutexe)

            #execute kernels for specified number of generations
            start.record()
            for gen in range(0, GlobalParamsDict["NrGenerations"]):
                #print "Processing Generation: %d"%(gen)

                #Launch CPU processing (should be asynchronous calls)

                sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel
                drv.memcpy_dtoh(FitnessPartialSums, FitnessPartialSums_h) #Copy from Device to Host and finish sorting
                FitnessSumConst = FitnessPartialSums.sum()
                drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory
                #drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitnessValues from Device to Device Const #TEST

                ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids) #TEST
                #Note: Fitness Function is here integrated into GA kernel!

                drv.memcpy_dtoh(Genomes_res, Genomes_h) #Copy data from GPU
                drv.memcpy_dtoh(FitnessValues_res, FitnessValues_h)
                #drv.memcpy_dtoh(AssembledGrids_res, AssembledGrids_h) #Takes about as much time as the whole simulation!

                #print FitnessValues_res

                #maxxie = FitnessValues_res.max()
                #if maxxie > biggest_fit:
            	#    biggest_fit = maxxie

                #print "max fitness:", maxxie
                #if maxxie >= 25.0 and breakup_generation == -1:
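                #np.histogram returns (counts, bin_edges); with edges (0, 24, 32),
                #[0][1] is the count of genomes whose fitness lies in [24, 32]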
                if np.histogram(FitnessValues_res, (0, 24, 32))[0][1] >= NrGenomes/2:
                    breakup_generation = gen
                    break
                # else:
                #    breakup_generation = -1
                #if FitnessValues[NrGenomes]  == float(1):
                #    breakup_generation = i
                #    break
                #else:
                #    breakup_generation = -1

                #maxxie = FitnessValues_res.max()
                #if maxxie >= 30:
                #    print "Max fitness value: ", FitnessValues_res.max()
                #ds[:]  = FitnessValues #join_struct_arrays(Genomes,  FitnessValues,  AssembledGrids);
                #trow = table.row
                #trow['nr_generations'] = NrGenerations
                #trow['nr_genomes'] = NrGenomes
                #trow['mutation_rate'] = mutation_rate
                #trow.append()
                #trow.flush()

            stop.record()
            stop.synchronize()
            print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
            print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / breakup_generation)
            print "Discovery time (generations) for mutation rate %f: %d"%(GAParams[0],  breakup_generation)
            #print "Max:", biggest_fit

            #TEST MODE
            #print "Printing all FitnessValues now:"
            #print FitnessValues_res
            #raw_input("Next redundancy with keystroke!...");

            #if breakup_generation==0:
            #    print FitnessValues_res
            #    print "Genomes: "
            #    print Genomes_res
            average_breakup[rep] = breakup_generation / reprange
            #if breakup_generation == -1:
            #    dontcount = 1
            #    break

        #if dontcount == 1:
        #    average_breakup.fill(20000)
        FILE.write( str(GAParams[0]) + " " + str(np.median(average_breakup)) + " " + str(np.std(average_breakup)) + "\n");
        FILE.flush()

    #Clean-up pytables
    #h5file.close()
    #Clean up File I/O
    FILE.close()