Python SourceModule.get_global Examples

Programming Language: Python

Namespace/Package Name: pycuda.compiler

Class/Type: SourceModule

Method/Function: get_global

Examples at hotexamples.com: 55

Python SourceModule.get_global - 55 examples found. These are the top rated real world Python examples of pycuda.compiler.SourceModule.get_global extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

SourceModule(30)

get_function(30)

get_global(30)

get_texref(30)

prepare(16)

get_surfref(4)

prepared_call(2)

find(1)

prepared_call_async(1)

read(1)

Example #1

Show file

def make_GPU_gradient(mesh, context):
    '''Prepare to compute gradient on the GPU w.r.t. the given mesh.
    Return gradient function.
    '''
    mx = int(getattr(mesh, 'nx', 1))
    my = int(getattr(mesh, 'ny', 1))
    mz = int(getattr(mesh, 'nz', 1))

    dxInv = np.array(1./getattr(mesh, 'dx', 1), dtype=np.float64)
    dyInv = np.array(1./getattr(mesh, 'dy', 1), dtype=np.float64)
    dzInv = np.array(1./getattr(mesh, 'dz', 1), dtype=np.float64)

    sizeof_double = 8
    with open(where + 'gradient2.cu') as fdlib:
        source = fdlib.read()
    module = SourceModule(source)

    mx_ptr = module.get_global("mx")[0]
    my_ptr = module.get_global("my")[0]
    mz_ptr = module.get_global("mz")[0]
    cuda.memcpy_htod(mx_ptr, np.array(mx, dtype=np.int32))
    cuda.memcpy_htod(my_ptr, np.array(my, dtype=np.int32))
    cuda.memcpy_htod(mz_ptr, np.array(mz, dtype=np.int32))

    dxInv_ptr = module.get_global("dxInv")[0]
    dyInv_ptr = module.get_global("dyInv")[0]
    dzInv_ptr = module.get_global("dzInv")[0]
    cuda.memcpy_htod(dxInv_ptr, dxInv)
    cuda.memcpy_htod(dyInv_ptr, dyInv)
    cuda.memcpy_htod(dzInv_ptr, dzInv)

    deriv_x = module.get_function("gradient_x")
    deriv_y = module.get_function("gradient_y")
    deriv_z = module.get_function("gradient_z")

    block, grid = mesh.get_domain_decomposition(DeviceData().max_threads)

    d_deriv_x = gpuarray.empty(shape=(1, mesh.n_nodes), dtype=np.float64)
    d_deriv_y = gpuarray.empty_like(d_deriv_x)
    d_deriv_z = gpuarray.empty_like(d_deriv_x)

    def _gradient(scalar_values):
        '''Calculate three-dimensional gradient for GPUArray
        scalar_values.
        '''
        deriv_x(scalar_values, d_deriv_x, block=block, grid=grid)
        deriv_y(scalar_values, d_deriv_y, block=block, grid=grid)
        deriv_z(scalar_values, d_deriv_z, block=block, grid=grid)
        context.synchronize()

        return (d_deriv_x, d_deriv_y, d_deriv_z)[:mesh.dimension]
    return _gradient

Example #2

Show file

File: test_driver.py Project: davidweichiang/pycuda

    def test_constant_memory(self):
        # contributed by Andrew Wagner

        module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

        copy_constant_into_global = module.get_function("copy_constant_into_global")
        const_array, _ = module.get_global('const_array')

        host_array = np.random.randint(0,255,(32,)).astype(np.float32)

        global_result_array = drv.mem_alloc_like(host_array)
        drv.memcpy_htod(const_array, host_array)

        copy_constant_into_global(
                global_result_array,
                grid=(1, 1), block=(32, 1, 1))

        host_result_array = np.zeros_like(host_array)
        drv.memcpy_dtoh(host_result_array, global_result_array)

        assert (host_result_array == host_array).all

Example #3

Show file

File: volumeRender.py Project: bvillasen/volumeRender

def initCUDA():
  global plotData_dArray
  global tex, transferTex
  global transferFuncArray_d
  global c_invViewMatrix
  global renderKernel
  #print "Compiling CUDA code for volumeRender"
  cudaCodeFile = open(volRenderDirectory + "/CUDAvolumeRender.cu","r")
  cudaCodeString = cudaCodeFile.read()
  cudaCodeStringComplete = cudaCodeString
  cudaCode = SourceModule(cudaCodeStringComplete, no_extern_c=True, include_dirs=[volRenderDirectory] )
  tex = cudaCode.get_texref("tex")
  transferTex = cudaCode.get_texref("transferTex")
  c_invViewMatrix = cudaCode.get_global('c_invViewMatrix')[0]
  renderKernel = cudaCode.get_function("d_render")

  if not plotData_dArray: plotData_dArray = np3DtoCudaArray( plotData_h )
  tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
  tex.set_filter_mode(cuda.filter_mode.LINEAR)
  tex.set_address_mode(0, cuda.address_mode.CLAMP)
  tex.set_address_mode(1, cuda.address_mode.CLAMP)
  tex.set_array(plotData_dArray)

  set_transfer_function( cmap_indx_0, trans_ramp_0, trans_center_0 )
  print "CUDA volumeRender initialized\n"

Example #4

Show file

File: gputools.py Project: matthiaslee/VMBD

def edgetaper_gpu(y_gpu, sf, win='barthann'):

  shape = np.array(y_gpu.shape).astype(np.uint32)
  dtype = y_gpu.dtype
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(shape[1])/block_size[0])),
               int(np.ceil(float(shape[0])/block_size[1])))

  # Ensure that sf is odd
  sf = sf+(1-np.mod(sf,2))
  wx = scipy.signal.get_window(win, sf[1])
  wy = scipy.signal.get_window(win, sf[0])
  maxw = wx.max() * wy.max()
  
  hsf = np.floor(sf/2)
  wx = (wx[0:hsf[1]] / maxw).astype(dtype)
  wy = (wy[0:hsf[0]] / maxw).astype(dtype)

  preproc = _generate_preproc(dtype, shape)
  preproc += '#define wx_size %d\n' % wx.size
  preproc += '#define wy_size %d\n' % wy.size
  mod = SourceModule(preproc + edgetaper_code, keep=True)
  edgetaper_gpu = mod.get_function("edgetaper")
  wx_gpu, wx_size = mod.get_global('wx')
  wy_gpu, wy_size = mod.get_global('wy')

  cu.memcpy_htod(wx_gpu, wx)
  cu.memcpy_htod(wy_gpu, wy)

  edgetaper_gpu(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]),
                block=block_size, grid=grid_size)

Example #5

Show file

def edgetaper_gpu(y_gpu, sf, win='barthann'):

  shape = np.array(y_gpu.shape).astype(np.uint32)
  dtype = y_gpu.dtype
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(shape[1])/block_size[0])),
               int(np.ceil(float(shape[0])/block_size[1])))

  # Ensure that sf is odd
  sf = sf+(1-np.mod(sf,2))
  wx = scipy.signal.get_window(win, sf[1])
  wy = scipy.signal.get_window(win, sf[0])
  maxw = wx.max() * wy.max()
  
  hsf = np.floor(sf/2)
  wx = (wx[0:hsf[1]] / maxw).astype(dtype)
  wy = (wy[0:hsf[0]] / maxw).astype(dtype)

  preproc = _generate_preproc(dtype, shape)
  preproc += '#define wx_size %d\n' % wx.size
  preproc += '#define wy_size %d\n' % wy.size
  mod = SourceModule(preproc + edgetaper_code, keep=True)
  edgetaper_gpu = mod.get_function("edgetaper")
  wx_gpu, wx_size = mod.get_global('wx')
  wy_gpu, wy_size = mod.get_global('wy')

  cu.memcpy_htod(wx_gpu, wx)
  cu.memcpy_htod(wy_gpu, wy)

  edgetaper_gpu(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]),
                block=block_size, grid=grid_size)

Example #6

Show file

    def test_constant_memory(self):
        # contributed by Andrew Wagner

        module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

        copy_constant_into_global = module.get_function(
            "copy_constant_into_global")
        const_array, _ = module.get_global('const_array')

        host_array = np.random.randint(0, 255, (32, )).astype(np.float32)

        global_result_array = drv.mem_alloc_like(host_array)
        drv.memcpy_htod(const_array, host_array)

        copy_constant_into_global(global_result_array,
                                  grid=(1, 1),
                                  block=(32, 1, 1))

        host_result_array = np.zeros_like(host_array)
        drv.memcpy_dtoh(host_result_array, global_result_array)

        assert (host_result_array == host_array).all

Example #7

Show file

File: _farneback3d.py Project: theHamsta/farneback3d

    def __init__(self,
                 pyr_scale=0.9,
                 levels=15,
                 winsize=9,
                 num_iterations=5,
                 poly_n=5,
                 poly_sigma=1.2,
                 use_gaussian_kernel: bool = True,
                 use_initial_flow=None,
                 quit_at_level=None,
                 use_gpu=True,
                 upscale_on_termination=True,
                 fast_gpu_scaling=True,
                 vesselmask_gpu=None):
        self.pyr_scale = pyr_scale
        self.levels = levels
        self.winsize = winsize
        self.num_iterations = num_iterations
        self.poly_n = poly_n
        self.poly_sigma = poly_sigma
        self.use_gaussian_kernel = use_gaussian_kernel
        self.use_initial_flow = use_initial_flow
        self.use_gpu = use_gpu
        self.upscale_on_termination = upscale_on_termination
        self._fast_gpu_scaling = fast_gpu_scaling
        self.quit_at_level = quit_at_level
        self._dump_everything = False
        self._show_everything = False
        self._vesselmask_gpu = vesselmask_gpu
        self._resize_kernel_size_factor = 4
        self._max_resize_kernel_size = 9

        with open(
                os.path.join(os.path.dirname(__file__),
                             'farneback_kernels.cu')) as f:
            read_data = f.read()
        f.closed

        mod = SourceModule(read_data)
        self._update_matrices_kernel = mod.get_function(
            'FarnebackUpdateMatrices')
        self._invG_gpu = mod.get_global('invG')[0]
        self._weights_gpu = mod.get_global('weights')[0]
        self._poly_expansion_kernel = mod.get_function('calcPolyCoeficients')
        self._warp_kernel = mod.get_function('warpByFlowField')
        self._r1_texture = mod.get_texref('sourceTex')
        self._solve_equations_kernel = mod.get_function('solveEquationsCramer')

Example #8

Show file

class BornThread(threading.Thread):
    def __init__(self, gpu, work_queue, result, density, x, y, z, Qx, Qy, Qz):
        threading.Thread.__init__(self)
        self.born_args = density, x, y, z, Qx, Qy, Qz, result
        self.work_queue = work_queue
        self.gpu = gpu
        self.precision = Qx.dtype
        self.defines = dict(XSIZE=len(x),
                            YSIZE=len(y),
                            ZSIZE=len(z),
                            QXSIZE=len(Qx),
                            QYSIZE=len(Qy),
                            QZSIZE=len(Qz))

    def run(self):
        self.dev = cuda.Device(self.gpu)
        self.ctx = self.dev.make_context()
        src = loadkernelsrc("kernelconstg.c",
                            precision=self.precision,
                            defines=self.defines)
        #print src
        self.cudamod = SourceModule(src)
        self.cudaBorn = self.cudamod.get_function("cudaBorn")
        self.kernel()
        self.ctx.pop()
        del self.ctx
        del self.dev

    def kernel(self):
        density, x, y, z, Qx, Qy, Qz, result = self.born_args
        nx, ny, nz, nqx, nqy, nqz = [
            numpy.int32(len(v)) for v in x, y, z, Qx, Qy, Qz
        ]
        cdensity = gpuarray.to_gpu(density)
        for v, t in (x, "x"), (y, "y"), (z, "z"), (Qx, "Qx"), (Qy,
                                                               "Qy"), (Qz,
                                                                       "Qz"):
            cv = self.cudamod.get_global(t)[0]
            cuda.memcpy_htod(cv, v)

        cframe = cuda.mem_alloc(result[0].nbytes)
        n = int(1 * nqy * nqz)

        while True:
            try:
                qxi = numpy.int32(self.work_queue.get(block=False))
            except Queue.Empty:
                break

            #print "%d of %d on %d\n"%(qxi,nqx,self.gpu),

            self.cudaBorn(cdensity, qxi, cframe, **cuda_partition(n))
            ## Delay fetching result until the kernel is complete
            cuda_sync()

            ## Fetch result back to the CPU
            cuda.memcpy_dtoh(result[qxi], cframe)

Example #9

Show file

File: tomography_cuda.py Project: martina88esposito/tomohowk

class CudaCalculator(object):
    def __init__(self, eta, beta, L, no_angles, no_pulses, order=5):
        self.mod_K = SourceModule(RADON_KERNEL.format(order, no_angles, no_pulses))
        self.K_gpu = self.mod_K.get_function("K_l")
        self.mod_reduction = SourceModule(REDUCTION_KERNEL)
        self.reduction_gpu = self.mod_reduction.get_function("reduction")
        self.eta = eta
        self.gamma = gamma(eta)
        self.beta = beta
        self.L = L
        self.h = calc_h(L, beta, eta)
        drv.memcpy_htod(self.mod_K.get_global("rsq4pi")[0], scipy.array([1./sqrt(4.*pi)], dtype=scipy.float32))
        drv.memcpy_htod(self.mod_K.get_global("sqeta")[0], scipy.array([sqrt(self.eta)], dtype=scipy.float32))
        drv.memcpy_htod(self.mod_K.get_global("h")[0], scipy.array([self.h], dtype=scipy.float32))
        drv.memcpy_htod(self.mod_K.get_global("four_pi_gamma")[0],
                        scipy.array([4.*pi*self.gamma], dtype=scipy.float32))
        y = sqrt(self.gamma)/self.h
        drv.memcpy_htod(self.mod_K.get_global("y")[0], scipy.array([y], dtype=scipy.float32))
        n = scipy.arange(1, order+1, dtype=scipy.float32)
        n2 = n**2
        ex = exp(-n2/4.)
        pre_s2 = ex*cosh(n*y)
        pre_s3 = ex*n*sinh(n*y)
        drv.memcpy_htod(self.mod_K.get_global("n2")[0], n2)
        drv.memcpy_htod(self.mod_K.get_global("pre_s1")[0], ex)
        drv.memcpy_htod(self.mod_K.get_global("pre_s2")[0], pre_s2)
        drv.memcpy_htod(self.mod_K.get_global("pre_s3")[0], pre_s3)

    def K(self, Q, P, angles, quadratures):
        drv.memcpy_htod(self.mod_K.get_global("cos_phi")[0], cos(angles).astype(scipy.float32))
        drv.memcpy_htod(self.mod_K.get_global("sin_phi")[0], sin(angles).astype(scipy.float32))
        Nx = Q.shape[0]
        Ny = int(floor(quadratures.size / 1024.))
        K = scipy.empty((Nx,), dtype=scipy.float32)
        Kb = drv.mem_alloc(4*Ny*Nx)
        Q_gpu = drv.to_device(Q)
        P_gpu = drv.to_device(P)
        self.K_gpu(drv.In(quadratures), Q_gpu, P_gpu, Kb,
                   block=(1, 1024, 1), grid=(Nx, Ny), shared=1024*4)
        self.reduction_gpu(Kb, drv.Out(K), block=(1, Ny, 1), grid=(Nx, 1), shared=Ny*4)
        return K/self.L

    def reconstruct_wigner(self, angles_quadratures, Nq, Np):
        angles, quadratures = angles_quadratures
        q_mean, p_mean, s_max = estimate_position_from_quadratures(self.eta, angles, quadratures)
        q, p, Q, P = build_mesh(q_mean, p_mean, s_max, Nq, Np)
        W = self.K(Q.ravel(), P.ravel(), angles, quadratures)
        return q_mean, p_mean, Q, P, W.reshape(Q.shape)

Example #10

Show file

File: deskew.py Project: liuyenting/utoolbox-legacy

    def _load_kernel(self):
        path = os.path.join(os.path.dirname(__file__), "deskew.cu")
        with open(path, 'r') as fd:
            tpl = Template(fd.read())
            source = tpl.render(dst_type=dtype_to_ctype(self._dtype))
            module = SourceModule(source)

        self._kernel = module.get_function("deskew_kernel")

        self._d_px_shift, _ = module.get_global('px_shift')
        self._d_vsin, _ = module.get_global('vsin')
        self._d_vcos, _ = module.get_global('vcos')

        self._texture = module.get_texref("ref_vol")
        self._texture.set_address_mode(0, cuda.address_mode.BORDER)
        self._texture.set_address_mode(1, cuda.address_mode.BORDER)
        self._texture.set_address_mode(2, cuda.address_mode.BORDER)
        self._texture.set_filter_mode(cuda.filter_mode.LINEAR)

        self._kernel.prepare('Piiiii',texrefs=[self._texture])

Example #11

Show file

    def initCuda(self):
        """
        Initialize CUDA environment
        - Create CUDA C program, and compile it
        - Copy destination PointCloud to CUDA global and constant value
        - Create handler of find_closest function in CUDA

        :return: None
        """
        mod = SourceModule("""
        #define ROW (""" + str(self.src.num) + """)
        #define OFFSET (""" + str(self.numCore) + """)
        __constant__ float dst[ROW][3];

        __global__ void get_dst(float* ret){
            int idx = threadIdx.x;
            ret[idx*3] = dst[idx][0];
            ret[idx*3+1] = dst[idx][1];
            ret[idx*3+2] = dst[idx][2];
        }

        __global__ void find_closest(float* result, float *ret, float* distances)
        {
            for (int idx = threadIdx.x; idx < ROW; idx += OFFSET) {
                float x_src = result[idx*3];
                float y_src = result[idx*3+1];
                float z_src = result[idx*3+2];
                float x_dst, y_dst, z_dst, minDist, dist;
                int minIdx = -1;
                for ( int i = 0; i < ROW; i++){
                    x_dst = dst[i][0];
                    y_dst = dst[i][1];
                    z_dst = dst[i][2];
                    dist = (x_src-x_dst) * (x_src-x_dst) + (y_src-y_dst) * (y_src-y_dst) + (z_src-z_dst) * (z_src-z_dst);
                    if ( dist < minDist || minIdx < 0 ) {
                        minDist = dist;
                        minIdx = i;
                    }
                }
                ret[idx*3] = dst[minIdx][0];
                ret[idx*3+1] = dst[minIdx][1];
                ret[idx*3+2] = dst[minIdx][2];
                distances[idx] = sqrt(minDist);
            }
        }
        """)
        dstCuda, _ = mod.get_global('dst')
        assert self.dst.points.dtype == np.float32
        cuda.memcpy_htod(dstCuda, self.dst.points)
        distances = np.zeros(self.src.num, dtype=np.float32)
        self.distances_gpu = gpuarray.to_gpu(distances)
        self.computeCorrespondenceCuda = mod.get_function('find_closest')
        getDstCuda = mod.get_function('get_dst')

Example #12

Show file

File: riemanntheta_cuda.py Project: tolsonMDA/abelfunctions

    def exponential_growth_d(self, g, TILEHEIGHT):
        template = """

    #include <stdlib.h>
    #include <stdio.h>
    #include <math.h>

    #define GENUS %d
    #define TILEHEIGHT %d

    __device__ __constant__ double Yinvd[GENUS*GENUS]; 

    __global__ void kernel(double* yd, double* u, int g, int y_len)
    {
      int tdy = threadIdx.y;
      int bdy = blockIdx.y;
      int tdx = threadIdx.x;

      __shared__ double yd_s[GENUS*TILEHEIGHT];
      yd_s[tdy*g + tdx] = 0;
      if (bdy*TILEHEIGHT + tdy < y_len) {
        yd_s[tdy*g + tdx] = yd[(bdy*TILEHEIGHT + tdy) * g + tdx];
      }
      __syncthreads();

      if (bdy*TILEHEIGHT + tdy < y_len) {
        int i,j;
        double dot = 0;
        double Yinvy_i;
        for (i = 0; i < g; i++) {
          Yinvy_i = 0;
          for (j = 0; j < g; j++) {
            Yinvy_i += Yinvd[g*i + j] * yd_s[tdy*g+j];
          }
          dot += yd_s[tdy*g+i] * Yinvy_i;
        }
        u[bdy*TILEHEIGHT + tdy] = M_PI * dot;
      }
    }
    """ %(g, TILEHEIGHT)

        mod = SourceModule(template)
        func = mod.get_function("kernel")
        Yinvd = mod.get_global("Yinvd")[0]
        return (func, Yinvd)

Example #13

Show file

File: SSFM_gpu_1d.py Project: wbkifun/my_research

    def set_barrier(s, vmax, vwidth):
        s.vwidth = vwidth
        s.vmax = vmax
        s.vx0 = s.nx / 2 - s.vwidth / 2
        s.vx1 = s.nx / 2 + s.vwidth / 2
        s.vc = np.complex64(np.exp(-1j * s.vmax * s.dt))

        kern = (
            kernels.replace("HNX", str(s.nx / 2))
            .replace("NX", str(s.nx))
            .replace("TID0", str(s.vx0))
            .replace("TID_MAX", str(s.vx1))
        )
        print kern
        mod = SourceModule(kern)
        s.mul_l = mod.get_function("mul_l")
        s.mul_v = mod.get_function("mul_v")
        s.lcx_const, _ = mod.get_global("lcx")

Example #14

Show file

def compile(steps, jinja_env, fields):
    """ Combine all the operation source codes into one large string, compile
    it, load the constant values, and store runtime functions into each 
    operation instance. """

    print 'Compiling all operations...'

    # Get the jinja2 template containing the source code.
    template = jinja_env.get_template('update.cu')

    # Assemble all the source into one long string.
    source = jinja_env.get_template('field_access_macros.cu').render( \
        field_names=sorted(fields), fields=fields)
    for step in steps:
        for op in step.operations:
            for step_dir in range(3):
                op.render(jinja_env, step_dir)
                source += str(op.cuda_source)

    # Write out source code, for debugging purposes.
    f = open('source_code.debug', 'w')
    f.write(source)
    f.close()

    # Compile the cuda source code.
    mod = SourceModule(source)

    # Load the location (on the GPU) of all the fields.
    dest, size = mod.get_global("M_FIELD")
    field_locations = \
        np.array([int(fields[fname].d_data) for fname in sorted(fields)])
    drv.memcpy_htod(dest, field_locations)

    # Store ready-to-run functions back into each My_Operation class instance.
    for step in steps:
        for op in step.operations:
            # Create the list to store runtimes for all 3 step directions.
            op.runtime = []
            for step_dir in range(3):
                op.runtime.append(mod.get_function(op.name + 'XYZ'[step_dir]))
                op.runtime[step_dir].set_cache_config(drv.func_cache.PREFER_L1)

    # Compiling finished.
    print '... compiling complete.', '\n'

Example #15

Show file

File: my_steps.py Project: JesseLu/gce-server

def compile(steps, jinja_env, fields):
    """ Combine all the operation source codes into one large string, compile
    it, load the constant values, and store runtime functions into each 
    operation instance. """

    print 'Compiling all operations...'

    # Get the jinja2 template containing the source code. 
    template = jinja_env.get_template('update.cu')

    # Assemble all the source into one long string.
    source = jinja_env.get_template('field_access_macros.cu').render( \
        field_names=sorted(fields), fields=fields)
    for step in steps:
        for op in step.operations:
            for step_dir in range(3):
                op.render(jinja_env, step_dir)
                source += str(op.cuda_source)
    
    # Write out source code, for debugging purposes.
    f = open('source_code.debug', 'w')
    f.write(source)
    f.close()

    # Compile the cuda source code.
    mod = SourceModule(source)

    # Load the location (on the GPU) of all the fields.
    dest, size = mod.get_global("M_FIELD")
    field_locations = \
        np.array([int(fields[fname].d_data) for fname in sorted(fields)])
    drv.memcpy_htod(dest, field_locations)

    # Store ready-to-run functions back into each My_Operation class instance.
    for step in steps:
        for op in step.operations:
            # Create the list to store runtimes for all 3 step directions.
            op.runtime = [] 
            for step_dir in range(3):
                op.runtime.append(mod.get_function(op.name + 'XYZ'[step_dir]))
                op.runtime[step_dir].set_cache_config(drv.func_cache.PREFER_L1)

    # Compiling finished.
    print '... compiling complete.', '\n'

Example #16

Show file

File: cuda_l1_wvd.py Project: scarab187/pymack04

def l1_wvd(N, block_size=16, use_double=False):
    Nb = block_size
    Ng = int(np.ceil(float(N) / Nb))
    Nb2 = Nb
    Nbd = 2 * N + 1
    Nwrite = int(np.ceil(float(Nbd) / Nb2))
    cfg = dict(N=N,
               Ng=Ng,
               Nb=Nb,
               Nb2=Nb2,
               Nbd=Nbd,
               Nwrite=Nwrite,
               use_double=use_double)
    mod = SourceModule(src % cfg)
    kernel = mod.get_function('kernel')
    dtype = np.float64 if use_double else np.float32
    w0 = np.exp(-2j * np.pi / N)
    ws = np.array([w0**k for k in range(N)])
    wsr_cmem = mod.get_global('wsr')[0]
    wsi_cmem = mod.get_global('wsi')[0]
    drv.memcpy_htod(wsr_cmem, np.ascontiguousarray(ws.real.astype(dtype)))
    drv.memcpy_htod(wsi_cmem, np.ascontiguousarray(ws.imag.astype(dtype)))

    def run(z, zi=None):
        if zi is None:
            indata = np.concatenate([z.real, z.imag])
        else:
            indata = np.concatenate([z, zi])
        indata = drv.In(np.ascontiguousarray(indata.astype(dtype)))
        outdata = drv.InOut(np.zeros(Nbd, dtype=dtype))
        kernel(outdata, indata, block=(Nb, 1, 1), grid=(Ng, N))
        cost = outdata.array[0]
        grad_r = outdata.array[1:N + 1]
        grad_i = outdata.array[N + 1:]
        return cost, grad_r, grad_i

    return run

Example #17

Show file

File: main.py Project: giuseppecalderaro/python-hacks

 def go(self):
     import qimage2ndarray
     image = QtGui.QImage(self.rows, self.columns, QtGui.QImage.Format_RGB32)
     self.data = qimage2ndarray.rgb_view(image)
     # Needs a contiguos buffer
     self.data = numpy.copy(self.data)
     self.spheres = numpy.array(self.CreateSpheres())
     # Init CUDA
     cuda.init()
     # Create CUDA Context
     ctx = pycuda.tools.make_default_context()
     # Declare event(s)
     startEvent = cuda.Event()
     stopEvent = cuda.Event()
     # Memory on Device
     gpu_alloc = cuda.mem_alloc(self.data.nbytes)
     gpu_rows = cuda.mem_alloc(self.rows.nbytes)
     gpu_columns = cuda.mem_alloc(self.columns.nbytes)
     # Copy data from Host to Device
     cuda.memcpy_htod(gpu_rows, self.rows)
     cuda.memcpy_htod(gpu_columns, self.columns)
     # Execute on host
     mod = SourceModule(code)
     gpu_spheres = mod.get_global("spheres")    
     cuda.memcpy_htod(gpu_spheres[0], self.spheres)
     kernel = mod.get_function("RayTracer")
     startEvent.record()
     kernel(gpu_alloc, gpu_rows, gpu_columns,
            block=(self.threads, self.threads, 1), 
            grid=(int(self.rows / self.threads), int(self.columns / self.threads)))
     stopEvent.record()
     stopEvent.synchronize()
     print("Time elapsed: %fms" % startEvent.time_till(stopEvent))
     # Copy data from Device to Host
     cuda.memcpy_dtoh(self.data, gpu_alloc)
     ctx.pop()
     self.SetImage(self.data)

Example #18

Show file

File: photoreceptor.py Project: neurokernel/retina

def get_transduction_func(dtype, block_size, Xaddress,
                          change_ind1, change_ind2, change1, change2, compile_options):
    template = """
/* This is kept for documentation purposes the actual code used is after the end
 * of this template */
#include "curand_kernel.h"

extern "C" {
#include "stdio.h"

#define BLOCK_SIZE %(block_size)d
#define LA 0.5

/* Simulation Constants */
#define C_T     0.5     /* Total concentration of calmodulin */
#define G_T     50      /* Total number of G-protein */
#define PLC_T   100     /* Total number of PLC */
#define T_T     25      /* Total number of TRP/TRPL channels */
#define I_TSTAR 0.68    /* Average current through one opened TRP/TRPL channel (pA)*/

#define GAMMA_DSTAR     4.0 /* s^(-1) rate constant*/
#define GAMMA_GAP       3.0 /* s^(-1) rate constant*/
#define GAMMA_GSTAR     3.5 /* s^(-1) rate constant*/
#define GAMMA_MSTAR     3.7 /* s^(-1) rate constant*/
#define GAMMA_PLCSTAR   144 /* s^(-1) rate constant */
#define GAMMA_TSTAR     25  /* s^(-1) rate constant */

#define H_DSTAR         37.8    /* strength constant */
#define H_MSTAR         40      /* strength constant */
#define H_PLCSTAR       11.1    /* strength constant */
#define H_TSTARP        11.5    /* strength constant */
#define H_TSTARN        10      /* strength constant */

#define K_P     0.3     /* Dissociation coefficient for calcium positive feedback */
#define K_P_INV 3.3333  /* K_P inverse ( too many decimals are not important) */
#define K_N     0.18    /* Dissociation coefficient for calmodulin negative feedback */
#define K_N_INV 5.5555  /* K_N inverse ( too many decimals are not important) */
#define K_U     30      /* (mM^(-1)s^(-1)) Rate of Ca2+ uptake by calmodulin */
#define K_R     5.5     /* (mM^(-1)s^(-1)) Rate of Ca2+ release by calmodulin */
#define K_CA    1000    /* s^(-1) diffusion from microvillus to somata (tuned) */

#define K_NACA  3e-8    /* Scaling factor for Na+/Ca2+ exchanger model */

#define KAPPA_DSTAR         1300.0  /* s^(-1) rate constant - there is also a capital K_DSTAR */
#define KAPPA_GSTAR         7.05    /* s^(-1) rate constant */
#define KAPPA_PLCSTAR       15.6    /* s^(-1) rate constant */
#define KAPPA_TSTAR         150.0   /* s^(-1) rate constant */
#define K_DSTAR             100.0   /* rate constant */

#define F                   96485   /* (mC/mol) Faraday constant (changed from paper)*/
#define N                   4       /* Binding sites for calcium on calmodulin */
#define R                   8.314   /* (J*K^-1*mol^-1)Gas constant */
#define T                   293     /* (K) Absolute temperature */
#define VOL                 3e-9    /* changed from 3e-12microlitres to nlitres
                                     * microvillus volume so that units agree */

#define N_S0_DIM        1   /* initial condition */
#define N_S0_BRIGHT     2

#define A_N_S0_DIM      4   /* upper bound for dynamic increase (of negetive feedback) */
#define A_N_S0_BRIGHT   200

#define TAU_N_S0_DIM    3000    /* time constant for negative feedback */
#define TAU_N_S0_BRIGHT 1000

#define NA_CO           120     /* (mM) Extracellular sodium concentration */
#define NA_CI           8       /* (mM) Intracellular sodium concentration */
#define CA_CO           1.5     /* (mM) Extracellular calcium concentration */

#define G_TRP           8       /* conductance of a TRP channel */
#define TRP_REV         0       /* TRP channel reversal potential (mV) */

__device__ __constant__ long long int d_X[5];
__device__ __constant__ int change_ind1[13];
__device__ __constant__ int change1[13];
__device__ __constant__ int change_ind2[13];
__device__ __constant__ int change2[13];

/* cc = n/(NA*VOL) [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */
__device__ float num_to_mM(int n)
{
    return n * 5.5353e-4; // n/1806.6;
}

/* n = cc*VOL*NA [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */
__device__ float mM_to_num(float cc)
{
    return rintf(cc * 1806.6);
}

/* Assumes Hill constant (=2) for positive calcium feedback */
__device__ float compute_fp(float Ca_cc)
{
    float tmp = Ca_cc*K_P_INV;
    tmp *= tmp;
    return tmp/(1 + tmp);
}

/* Assumes Hill constant(=3) for negative calmodulin feedback */
__device__ float compute_fn(float Cstar_cc, float ns)
{
    float tmp = Cstar_cc*K_N_INV;
    tmp *= tmp*tmp;
    return ns*tmp/(1 + tmp);
}

/* Vm [V] */
__device__ float compute_ca(int Tstar, float Cstar_cc, float Vm)
{
    float I_in = Tstar*G_TRP*fmaxf(-Vm + 0.001*TRP_REV, 0);
    /* CaM = C_T - Cstar_cc */
    float denom = (K_CA + (N*K_U*C_T) - (N*K_U)*Cstar_cc + 179.0952 * expf(-(F/(R*T))*Vm));  // (K_NACA*NA_CO^3/VOL*F)
    /* I_Ca ~= 0.4*I_in */
    float numer = (0.4*I_in)/(2*VOL*F) +
                  ((K_NACA*CA_CO*NA_CI*NA_CI*NA_CI)/(VOL*F)) +  // in paper it's -K_NACA... due to different conventions
                  N*K_R*Cstar_cc;

    return fmaxf(1.6e-4, numer/denom);
}

__global__ void
transduction(curandStateXORWOW_t *state, float dt, %(type)s* d_Vm,
             %(type)s* g_ns, %(type)s* input,
             int* num_microvilli, int total_microvilli, int* count)
{
    int tid = threadIdx.x;
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    int wid = tid %% 32;

    __shared__ int X[BLOCK_SIZE][7];  // number of molecules
    __shared__ float Ca[BLOCK_SIZE];
    __shared__ float fn[BLOCK_SIZE];
    
    float Vm, ns, lambda;

    float sumrate, dt_advanced;
    int reaction_ind;
    ushort2 tmp;

    // copy random generator state locally to avoid accessing global memory
    curandStateXORWOW_t localstate = state[gid];


    int mid; // microvilli ID
    __shared__ volatile int mi; // starting point of mid per ward
    
    // use atomicAdd to obtain the starting mid for the warp
    if(wid == 0)
    {
        mi = atomicAdd(count, 32);
    }
    mid = mi + wid;
    int mid;
    
    while(mid < total_microvilli)
    {
        // load photoreceptor index of the microvilli
        ind = ((ushort*)d_X[4])[mid];
        
        // load variables that are needed for computing calcium concentration
        tmp = ((ushort2*)d_X[2])[mid];
        X[tid][5] = tmp.x;
        X[tid][6] = tmp.y;
        
        Vm = d_Vm[ind]*1e-3;
        ns = g_ns[ind];

        // update calcium concentration
        Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
        fn[tid] = compute_fn(num_to_mM(X[tid][5]), ns);
        
        lambda = input[ind]/num_microvilli[ind];

        // load the rest of variables
        tmp = ((ushort2*)d_X[1])[mid];
        X[tid][4] = tmp.y;
        X[tid][3] = tmp.x;
        tmp = ((ushort2*)d_X[0])[mid];
        X[tid][2] = tmp.y;
        X[tid][1] = tmp.x;
        X[tid][0] = ((ushort*)d_X[3])[mid];

        // compute total rate of reaction
        sumrate = lambda;
        sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) );  //11
        sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]);  //12
        sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6];  // 10
        sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4];  // 8
        sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3];  // 7
        sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0];  // 1
        sumrate += KAPPA_DSTAR * X[tid][3];  // 6
        sumrate += GAMMA_GAP * X[tid][2] * X[tid][3];  // 4
        sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);  // 3
        sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]);  // 5
        sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0];  // 2
        sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                   (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                   X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5 ;  // 9

        // choose the next reaction time
        dt_advanced = -logf(curand_uniform(&localstate))/(LA + sumrate);

        // If the reaction time is smaller than dt,
        // pick the reaction and update,
        // then compute the total rate and next reaction time again
        // until all dt_advanced is larger than dt.
        // Note that you don't have to compensate for
        // the last reaction time that exceeds dt.
        // The reason is that the exponential distribution is MEMORYLESS.
        while(dt_advanced <= dt)
        {
            reaction_ind = 0;
            sumrate = curand_uniform(&localstate) * sumrate;

            if(sumrate > 2e-5)
            {

                sumrate -= lambda;
                reaction_ind = (sumrate<=2e-5) * 13;

                if(!reaction_ind)
                {

                    sumrate -= mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) );
                    reaction_ind = (sumrate<=2e-5) * 11;

                    if(!reaction_ind)
                    {
                        sumrate -= mM_to_num(K_R) * num_to_mM(X[tid][5]);
                        reaction_ind = (sumrate<=2e-5) * 12;
                        if(!reaction_ind)
                        {
                            sumrate -= GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6];
                            reaction_ind = (sumrate<=2e-5) * 10;
                            if(!reaction_ind)
                            {
                                sumrate -= GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4];
                                reaction_ind = (sumrate<=2e-5) * 8;

                                if(!reaction_ind)
                                {
                                    sumrate -= GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3];
                                    reaction_ind = (sumrate<=2e-5) * 7;
                                    if(!reaction_ind)
                                    {
                                        sumrate -= GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0];
                                        reaction_ind = (sumrate<=2e-5) * 1;
                                        if(!reaction_ind)
                                        {
                                            sumrate -= KAPPA_DSTAR * X[tid][3];
                                            reaction_ind = (sumrate<=2e-5) * 6;
                                            if(!reaction_ind)
                                            {
                                                sumrate -= GAMMA_GAP * X[tid][2] * X[tid][3];
                                                reaction_ind = (sumrate<=2e-5) * 4;

                                                if(!reaction_ind)
                                                {
                                                    sumrate -= KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);
                                                    reaction_ind = (sumrate<=2e-5) * 3;
                                                    if(!reaction_ind)
                                                    {
                                                        sumrate -= GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]);
                                                        reaction_ind = (sumrate<=2e-5) * 5;
                                                        if(!reaction_ind)
                                                        {
                                                            sumrate -= KAPPA_GSTAR * X[tid][1] * X[tid][0];
                                                            reaction_ind = (sumrate<=2e-5) * 2;
                                                            if(!reaction_ind)
                                                            {
                                                                sumrate -= (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                                                                           (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                                                                           X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5;
                                                                reaction_ind = (sumrate<=2e-5) * 9;
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            int ind;

            // only up to two state variables are needed to be updated
            // update the first one.
            ind = change_ind1[reaction_ind];
            X[tid][ind] += change1[reaction_ind];

            //if(reaction_ind == 9)
            //{
            //    X[tid][ind] = max(X[tid][ind], 0);
            //}

            ind = change_ind2[reaction_ind];
            //update the second one
            if(ind != 0)
            {
                X[tid][ind] += change2[reaction_ind];
            }

            // compute the advance time again
            Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
            fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns );
            //fp[tid] = compute_fp( Ca[tid] );

            sumrate = lambda;
            sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11
            sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12
            sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10
            sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8
            sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7
            sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1
            sumrate += KAPPA_DSTAR * X[tid][3]; // 6
            sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4
            sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);  // 3
            sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5
            sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2
            sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                       (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                       X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; // 9

            dt_advanced -= logf(curand_uniform(&localstate))/(LA + sumrate);

        } // end while

        ((ushort*)d_X[3])[mid] = X[tid][0];
        ((ushort2*)d_X[0])[mid] = make_ushort2(X[tid][1], X[tid][2]);
        ((ushort2*)d_X[1])[mid] = make_ushort2(X[tid][3], X[tid][4]);
        ((ushort2*)d_X[2])[mid] = make_ushort2(X[tid][5], X[tid][6]);
        
        if(wid == 0)
        {
            mi = atomicAdd(count, 32);
        }
        mid = mi + wid;
    }
    // copy the updated random generator state back to global memory
    state[gid] = localstate;
}

}
"""

    template_run = """

#include "curand_kernel.h"

extern "C" {
#include "stdio.h"

#define BLOCK_SIZE %(block_size)d
#define LA 0.5

__device__ __constant__ long long int d_X[5];
__device__ __constant__ int change_ind1[13];
__device__ __constant__ int change1[13];
__device__ __constant__ int change_ind2[13];
__device__ __constant__ int change2[13];


__device__ float num_to_mM(int n)
{
    return n * 5.5353e-4; // n/1806.6;
}

__device__ float mM_to_num(float cc)
{
    return rintf(cc * 1806.6);
}

__device__ float compute_fp( float ca_cc)
{
    float tmp = ca_cc*3.3333333333;
    tmp *= tmp;
    return tmp/(1+tmp);
}

__device__ float compute_fn( float Cstar_cc, float ns)
{
    float tmp = Cstar_cc*5.55555555;
    tmp *= tmp*tmp;
    return ns*tmp/(1+tmp);
}

__device__ float compute_ca(int Tstar, float cstar_cc, float Vm)
{
    float I_in = Tstar*8*fmaxf(-Vm,0);
    float denom = (1060 - 120*cstar_cc + 179.0952 * expf(-39.60793*Vm));
    float numer = I_in * 690.9537 + 0.0795979 + 22*cstar_cc;

    return fmaxf(1.6e-4, numer/denom);
}

__global__ void
transduction(curandStateXORWOW_t *state, float dt, %(type)s* d_Vm,
             %(type)s* g_ns, %(type)s* input,
             int* num_microvilli, int total_microvilli, int* count)
{
    int tid = threadIdx.x;
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    int wid = tid %% 32;

    __shared__ int X[BLOCK_SIZE][7];  // number of molecules
    __shared__ float Ca[BLOCK_SIZE];
    __shared__ float fn[BLOCK_SIZE];
    
    float Vm, ns, lambda;

    float sumrate, dt_advanced;
    int reaction_ind;
    ushort2 tmp;

    // copy random generator state locally to avoid accessing global memory
    curandStateXORWOW_t localstate = state[gid];


    int mid; // microvilli ID
    __shared__ volatile int mi; // starting point of mid per ward
    
    // use atomicAdd to obtain the starting mid for the warp
    if(wid == 0)
    {
        mi = atomicAdd(count, 32);
    }
    mid = mi + wid;
    int ind;

    while(mid < total_microvilli)
    {
        ind = ((ushort*)d_X[4])[mid];
        
        // load variables that are needed for computing calcium concentration
        tmp = ((ushort2*)d_X[2])[mid];
        X[tid][5] = tmp.x;
        X[tid][6] = tmp.y;
        
        Vm = d_Vm[ind]*1e-3;
        ns = g_ns[ind];

        // update calcium concentration
        Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
        fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns);
        
        lambda = input[ind]/num_microvilli[ind];

        // load the rest of variables
        tmp = ((ushort2*)d_X[1])[mid];
        X[tid][4] = tmp.y;
        X[tid][3] = tmp.x;
        tmp = ((ushort2*)d_X[0])[mid];
        X[tid][2] = tmp.y;
        X[tid][1] = tmp.x;
        X[tid][0] = ((ushort*)d_X[3])[mid];

        sumrate = lambda + 54198 * Ca[tid] * (0.5 - X[tid][5] * 5.5353e-4) + 5.5 * X[tid][5]; // 11, 12
        sumrate += 25 * (1+10*fn[tid]) * X[tid][6]; // 10
        sumrate += 4 * (1+37.8*fn[tid]) * X[tid][4] ; // 8
        sumrate += (1444+1598.4*fn[tid]) * X[tid][3] ; // 7, 6
        sumrate += (3.7*(1+40*fn[tid]) + 7.05 * X[tid][1]) * X[tid][0] ; // 1, 2
        sumrate += (1560 - 12.6 * X[tid][3]) * X[tid][2]; // 3, 4
        sumrate += 3.5 * (50 - X[tid][2] - X[tid][1] - X[tid][3]) ; // 5
        sumrate += 0.015 * (1+11.5*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(25-X[tid][6])*0.5 ; // 9

        dt_advanced = -logf(curand_uniform(&localstate))/(LA+sumrate);

        // If the reaction time is smaller than dt,
        // pick the reaction and update,
        // then compute the total rate and next reaction time again
        // until all dt_advanced is larger than dt.
        // Note that you don't have to compensate for
        // the last reaction time that exceeds dt.
        // The reason is that the exponential distribution is MEMORYLESS.
        while (dt_advanced <= dt) {
            reaction_ind = 0;
            sumrate = curand_uniform(&localstate) * sumrate;

            if (sumrate > 2e-5) {
                sumrate -= lambda;
                reaction_ind = (sumrate<=2e-5) * 13;

                if (!reaction_ind) {
                    sumrate -= mM_to_num(30) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) );
                    reaction_ind = (sumrate<=2e-5) * 11;

                    if (!reaction_ind) {
                        sumrate -= mM_to_num(5.5) * num_to_mM(X[tid][5]);
                        reaction_ind = (sumrate<=2e-5) * 12;

                        if (!reaction_ind) {
                            sumrate -= 25 * (1+10*fn[tid]) * X[tid][6];
                            reaction_ind = (sumrate<=2e-5) * 10;

                            if (!reaction_ind) {
                                sumrate -= 4 * (1+37.8*fn[tid]) * X[tid][4];
                                reaction_ind = (sumrate<=2e-5) * 8;

                                if (!reaction_ind) {
                                    sumrate -= 144 * (1+11.1*fn[tid]) * X[tid][3];
                                    reaction_ind = (sumrate<=2e-5) * 7;

                                    if (!reaction_ind) {
                                        sumrate -= 3.7*(1+40*fn[tid]) * X[tid][0];
                                        reaction_ind = (sumrate<=2e-5) * 1;

                                        if (!reaction_ind) {
                                            sumrate -= 1300 * X[tid][3];
                                            reaction_ind = (sumrate<=2e-5) * 6;

                                            if (!reaction_ind) {
                                                sumrate -= 3.0 * X[tid][2] * X[tid][3];
                                                reaction_ind = (sumrate<=2e-5) * 4;

                                                if (!reaction_ind) {
                                                    sumrate -= 15.6 * X[tid][2]
                                                        * (100-X[tid][3]);
                                                    reaction_ind = (sumrate<=2e-5) * 3;

                                                    if (!reaction_ind) {
                                                        sumrate -= 3.5 * (50 - X[tid][2]
                                                            - X[tid][1] - X[tid][3]);
                                                        reaction_ind = (sumrate<=2e-5) * 5;

                                                        if(!reaction_ind) {
                                                            sumrate -= 7.05 * X[tid][1]
                                                                * X[tid][0];
                                                            reaction_ind = (sumrate<=2e-5)
                                                                * 2;

                                                            if(!reaction_ind) {
                                                                sumrate -= 0.015 *
                                                                    (1+11.5*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(25-X[tid][6])*0.5;
                                                                reaction_ind = (sumrate<=2e-5) * 9;
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            int ind;

            // only up to two state variables are needed to be updated
            // update the first one.
            ind = change_ind1[reaction_ind];
            X[tid][ind] += change1[reaction_ind];

            //update the second one
            ind = change_ind2[reaction_ind];
            if (ind != 0)
                X[tid][ind] += change2[reaction_ind];

            // compute the advance time again
            Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
            fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns );

            sumrate = lambda + 54198*Ca[tid]*(0.5 - X[tid][5]*5.5353e-4)
                + 5.5*X[tid][5]; // 11, 12
            sumrate += 25*(1 + 10*fn[tid])*X[tid][6]; // 10
            sumrate += 4*(1 + 37.8*fn[tid])*X[tid][4]; // 8
            sumrate += (1444 + 1598.4*fn[tid])*X[tid][3]; // 7, 6
            sumrate += (3.7*(1 + 40*fn[tid]) + 7.05*X[tid][1])*X[tid][0]; // 1, 2
            sumrate += (1560 - 12.6*X[tid][3])*X[tid][2]; // 3, 4
            sumrate += 3.5*(50 - X[tid][2] - X[tid][1] - X[tid][3]); // 5
            sumrate += 0.015*(1 + 11.5*compute_fp( Ca[tid] ))
                *X[tid][4]*(X[tid][4] - 1)*(25 - X[tid][6])*0.5; // 9

            dt_advanced -= logf(curand_uniform(&localstate))/(LA+sumrate);

        } // end while

        ((ushort*)d_X[3])[mid] = X[tid][0];
        ((ushort2*)d_X[0])[mid] = make_ushort2(X[tid][1], X[tid][2]);
        ((ushort2*)d_X[1])[mid] = make_ushort2(X[tid][3], X[tid][4]);
        ((ushort2*)d_X[2])[mid] = make_ushort2(X[tid][5], X[tid][6]);
        
        if(wid == 0)
        {
            mi = atomicAdd(count, 32);
        }
        mid = mi + wid;
    }
    // copy the updated random generator state back to global memory
    state[gid] = localstate;
}

}
"""
    try:
        co = [compile_options[0]+' --maxrregcount=54']
    except IndexError:
        co = ['--maxrregcount=54']
    
    scalartype = dtype.type if isinstance(dtype, np.dtype) else dtype
    mod = SourceModule(
        template_run % {
            "type": dtype_to_ctype(dtype),
            "block_size": block_size,
            "fletter": 'f' if scalartype == np.float32 else ''
        },
        options = co,
        no_extern_c = True)
    func = mod.get_function('transduction')
    d_X_address, d_X_nbytes = mod.get_global("d_X")
    cuda.memcpy_htod(d_X_address, Xaddress)
    d_change_ind1_address, d_change_ind1_nbytes = mod.get_global("change_ind1")
    d_change_ind2_address, d_change_ind2_nbytes = mod.get_global("change_ind2")
    d_change1_address, d_change1_nbytes = mod.get_global("change1")
    d_change2_address, d_change2_nbytes = mod.get_global("change2")
    cuda.memcpy_htod(d_change_ind1_address, change_ind1)
    cuda.memcpy_htod(d_change_ind2_address, change_ind2)
    cuda.memcpy_htod(d_change1_address, change1)
    cuda.memcpy_htod(d_change2_address, change2)

    func.prepare('PfPPPPiP')
    func.set_cache_config(cuda.func_cache.PREFER_SHARED)
    return func

Example #19

Show file

File: main.py Project: schroeder-dewitt/polyomino-self-assembly

def main():

    #FourPermutations set-up
    FourPermutations = numpy.array([ [1,2,3,4],
                                  [1,2,4,3],
                                  [1,3,2,4],
                                  [1,3,4,2],
                                  [1,4,2,3],
                                  [1,4,3,2],
                                  [2,1,3,4],
                                  [2,1,4,3],
                                  [2,3,1,4],
                                  [2,3,4,1],
                                  [2,4,1,3],
                                  [2,4,3,1],
                                  [3,2,1,4],
                                  [3,2,4,1],
                                  [3,1,2,4],
                                  [3,1,4,2],
                                  [3,4,2,1],
                                  [3,4,1,2],
                                  [4,2,3,1],
                                  [4,2,1,3],
                                  [4,3,2,1],
                                  [4,3,1,2],
                                  [4,1,2,3],
                                  [4,1,3,2],]).astype(numpy.uint8)

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":8,
                 "bit_length_edge_type":3, 
                 "curand_nr_threads_per_block":256,
                 "nr_tile_types":4,
                 "nr_edge_types":8,
                 "warpsize":32,
                 "fit_dim_thread_x":1,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":1,
                 "fit_dim_grid_x":19,
                 "fit_dim_grid_y":19,
                 "fit_nr_four_permutations":24,
                 "fit_length_movelist":244,
                 "fit_nr_redundancy_grid_depth":2,
                 "fit_nr_redundancy_assemblies":10,
                 "fit_tile_index_starting_tile":0,
                 "glob_nr_tile_orientations":4
                }

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))

    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("TestAssemblyKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")

    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix
 
    #Set-up genomes
    dest = numpy.arange(256).astype(numpy.uint8)
    for i in range(0, 256/8):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
        dest[i*8 + 0] = 36
        dest[i*8 + 1] = 151
        dest[i*8 + 2] = 90
        dest[i*8 + 3] = 109
        dest[i*8 + 4] = 224
        dest[i*8 + 5] = 4
        dest[i*8 + 6] = 0
        dest[i*8 + 7] = 0

    dest[0] = 40
    dest[1] = 0
    dest[2] = 0
    dest[3] = 0
    dest[4] = 0
    dest[5] = 0
    dest[6] = 0
    dest[7] = 0

    dest_h = drv.mem_alloc(dest.nbytes)
    drv.memcpy_htod(dest_h, dest)
    print "Genomes before: "
    print dest

    #Set-up grids
    grids = numpy.zeros((32, 19, 19)).astype(numpy.uint8)
    grids_h = drv.mem_alloc(grids.nbytes)
    drv.memcpy_htod(grids_h, grids)
    print "Grids:"
    print grids    

    #Set-up fitness values
    fitness = numpy.zeros(256).astype(numpy.float32)
    fitness_h = drv.mem_alloc(fitness.nbytes)
    drv.memcpy_htod(fitness_h, fitness)
    print "Fitness values:"
    print fitness

    #Set-up curand
    curand = numpy.zeros(40*256).astype(numpy.uint8);
    curand_h = drv.mem_alloc(curand.nbytes)

    #Set-up four permutations
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)

    #Set-up timers
    #start = drv.Event()
    #stop = drv.Event()
    #start.record()        

    #Call kernels
    CurandKernel(curand_h, block=(32,1,1), grid=(1,1))
    Kernel(dest_h, fitness_h, grids_h, curand_h, block=(32,1,1), grid=(1,1))
 
    #drv.Context.synchronize()
    #Clean-up timer
    #stop.synchronize()
    #print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
    #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
    #pass    

    #Output
    drv.memcpy_dtoh(dest, dest_h)
    print "Genomes after: "
    print dest
    drv.memcpy_dtoh(fitness, fitness_h)
    print "Fitness after: "
    print fitness
    drv.memcpy_dtoh(grids, grids_h)
    print "Grids[0] after: "
    print grids[0]
    print "Grids[31] after:"
    print grids[31]

Example #20

Show file

File: gpu_kernels.py Project: ivmreg/pyKMLib

 g_y    = cuda.to_device(y_map_bin)
 g_out = cuda.to_device(results)
 
 g_num_el = np.int32(num_el)
 
 align = np.ceil( 1.0*sliceSize*threadsPerRow/minAlign)*minAlign
 g_align = np.int32(align)
 g_i = np.int32(i)
 g_j = np.int32(j)
 g_i_ds= np.int32(i)
 g_j_ds= np.int32(j)
 
 g_cls1N_aligned = np.int32(align_cls1_n)
 
 #gamma, copy to constant memory
 (g_gamma,gsize)=module.get_global('GAMMA')       
 cuda.memcpy_htod(g_gamma, np.float32(gamma) )
 
 g_cls_start = cuda.to_device(start_cls)
 g_cls_count = cuda.to_device(count_cls)
 
 
 g_cls = cuda.to_device(bin_cls)
 
 #start_event = cuda.Event()
 #stop_event = cuda.Event()
 
 start_event.record()
 
 func(g_val,
      g_col,

Example #21

Show file

class RimeEKBSqrt(Node):
    def __init__(self):
        super(RimeEKBSqrt, self).__init__()

    def initialise(self, solver, stream=None):
        slvr = solver
        ntime, na, npolchan = slvr.dim_local_size('ntime', 'na', 'npolchan')

        # Get a property dictionary off the solver
        D = slvr.template_dict()
        # Include our kernel parameters
        D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS)
        D['rime_const_data_struct'] = slvr.const_data().string_def()

        D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \
            mbu.redistribute_threads(
                D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'],
                npolchan, na, ntime)

        regs = str(FLOAT_PARAMS['maxregs'] \
                if slvr.is_float() else DOUBLE_PARAMS['maxregs'])

        kname = 'rime_jones_EKBSqrt_float' \
            if slvr.is_float() is True else \
            'rime_jones_EKBSqrt_double'

        kernel_string = KERNEL_TEMPLATE.substitute(**D)
        self.mod = SourceModule(kernel_string,
            options=['-lineinfo','-maxrregcount', regs],
            include_dirs=[montblanc.get_source_path()],
            no_extern_c=True)

        self.rime_const_data = self.mod.get_global('C')
        self.kernel = self.mod.get_function(kname)
        self.launch_params = self.get_launch_params(slvr, D)

    def shutdown(self, solver, stream=None):
        pass

    def pre_execution(self, solver, stream=None):
        pass

    def get_launch_params(self, slvr, D):
        polchans_per_block = D['BLOCKDIMX']
        ants_per_block = D['BLOCKDIMY']
        times_per_block = D['BLOCKDIMZ']

        ntime, na, npolchan = slvr.dim_local_size('ntime', 'na', 'npolchan')
        polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block)
        ant_blocks = mbu.blocks_required(na, ants_per_block)
        time_blocks = mbu.blocks_required(ntime, times_per_block)

        return {
            'block' : (polchans_per_block, ants_per_block, times_per_block),
            'grid'  : (polchan_blocks, ant_blocks, time_blocks),
        }

    def execute(self, solver, stream=None):
        slvr = solver

        if stream is not None:
            cuda.memcpy_htod_async(
                self.rime_const_data[0],
                slvr.const_data().ndary(),
                stream=stream)
        else:
            cuda.memcpy_htod(
                self.rime_const_data[0],
                slvr.const_data().ndary())

        self.kernel(slvr.uvw, slvr.lm, slvr.frequency,
            slvr.B_sqrt, slvr.jones,
            stream=stream, **self.launch_params)

    def post_execution(self, solver, stream=None):
        pass

Example #22

Show file

File: Convolution.py Project: eddienko/EuclidVisibleInstrument

    }
}
'''

template = string.Template(template)
code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS,
                           KERNEL_W=KERNEL_W,
                           COLUMN_TILE_H=COLUMN_TILE_H,
                           COLUMN_TILE_W=COLUMN_TILE_W,
                           ROW_TILE_W=ROW_TILE_W,
                           KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)

module = SourceModule(code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
d_Kernel_columns = module.get_global('d_Kernel_columns')[0]

# Helper functions for computing alignment...
def iDivUp(a, b):
    # Round a / b to nearest higher integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a / b + 1) if (a % b != 0) else (a / b)

def iDivDown(a, b):
    # Round a / b to nearest lower integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a / b

Example #23

Show file

def get_transduction_func(dtype, block_size, num_microvilli, Xaddress,
                          change_ind1, change_ind2, change1, change2):
    template = """

#include "curand_kernel.h"

extern "C" {
#include "stdio.h"

#define NUM_MICROVILLI %(num_microvilli)d
#define BLOCK_SIZE %(block_size)d
#define LA 0.5

/* Simulation Constants */
#define C_T     0.5     /* Total concentration of calmodulin */
#define G_T     50      /* Total number of G-protein */
#define PLC_T   100     /* Total number of PLC */
#define T_T     25      /* Total number of TRP/TRPL channels */
#define I_TSTAR 0.68    /* Average current through one opened TRP/TRPL channel (pA)*/

#define GAMMA_DSTAR     4.0 /* s^(-1) rate constant*/
#define GAMMA_GAP       3.0 /* s^(-1) rate constant*/
#define GAMMA_GSTAR     3.5 /* s^(-1) rate constant*/
#define GAMMA_MSTAR     3.7 /* s^(-1) rate constant*/
#define GAMMA_PLCSTAR   144 /* s^(-1) rate constant */
#define GAMMA_TSTAR     25  /* s^(-1) rate constant */

#define H_DSTAR         37.8    /* strength constant */
#define H_MSTAR         40      /* strength constant */
#define H_PLCSTAR       11.1    /* strength constant */
#define H_TSTARP        11.5    /* strength constant */
#define H_TSTARN        10      /* strength constant */

#define K_P     0.3     /* Dissociation coefficient for calcium positive feedback */
#define K_P_INV 3.3333  /* K_P inverse ( too many decimals are not important) */
#define K_N     0.18    /* Dissociation coefficient for calmodulin negative feedback */
#define K_N_INV 5.5555  /* K_N inverse ( too many decimals are not important) */
#define K_U     30      /* (mM^(-1)s^(-1)) Rate of Ca2+ uptake by calmodulin */
#define K_R     5.5     /* (mM^(-1)s^(-1)) Rate of Ca2+ release by calmodulin */
#define K_CA    1000    /* s^(-1) diffusion from microvillus to somata (tuned) */

#define K_NACA  3e-8    /* Scaling factor for Na+/Ca2+ exchanger model */

#define KAPPA_DSTAR         1300.0  /* s^(-1) rate constant - there is also a capital K_DSTAR */
#define KAPPA_GSTAR         7.05    /* s^(-1) rate constant */
#define KAPPA_PLCSTAR       15.6    /* s^(-1) rate constant */
#define KAPPA_TSTAR         150.0   /* s^(-1) rate constant */
#define K_DSTAR             100.0   /* rate constant */

#define F                   96485   /* (mC/mol) Faraday constant (changed from paper)*/
#define N                   4       /* Binding sites for calcium on calmodulin */
#define R                   8.314   /* (J*K^-1*mol^-1)Gas constant */
#define T                   293     /* (K) Absolute temperature */
#define VOL                 3e-9    /* changed from 3e-12microlitres to nlitres
                                     * microvillus volume so that units agree */

#define N_S0_DIM        1   /* initial condition */
#define N_S0_BRIGHT     2

#define A_N_S0_DIM      4   /* upper bound for dynamic increase (of negetive feedback) */
#define A_N_S0_BRIGHT   200

#define TAU_N_S0_DIM    3000    /* time constant for negative feedback */
#define TAU_N_S0_BRIGHT 1000

#define NA_CO           120     /* (mM) Extracellular sodium concentration */
#define NA_CI           8       /* (mM) Intracellular sodium concentration */
#define CA_CO           1.5     /* (mM) Extracellular calcium concentration */

#define G_TRP           8       /* conductance of a TRP channel */
#define TRP_REV         0       /* TRP channel reversal potential */

__device__ __constant__ long long int d_X[4];
__device__ __constant__ int change_ind1[13];
__device__ __constant__ int change1[13];
__device__ __constant__ int change_ind2[13];
__device__ __constant__ int change2[13];

/* cc = n/(NA*VOL) [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */
__device__ float num_to_mM(int n)
{
    return n * 5.5353e-4; // n/1806.6;
}

/* n = cc*VOL*NA [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */
__device__ float mM_to_num(float cc)
{
    return rintf(cc * 1806.6);
}

/* Assumes Hill constant (=2) for positive calcium feedback */
__device__ float compute_fp(float Ca_cc)
{
    float tmp = Ca_cc*K_P_INV;
    tmp *= tmp;
    return tmp/(1 + tmp);
}

/* Assumes Hill constant(=3) for negative calmodulin feedback */
__device__ float compute_fn(float Cstar_cc, float ns)
{
    float tmp = Cstar_cc*K_N_INV;
    tmp *= tmp*tmp;
    return ns*tmp/(1 + tmp);
}

/* Vm [V] */
__device__ float compute_ca(int Tstar, float Cstar_cc, float Vm)
{
    float I_in = Tstar*G_TRP*fmaxf(-Vm, -TRP_REV);
    /* CaM = C_T - Cstar_cc */
    float denom = (K_CA + (N*K_U*C_T) - (N*K_U)*Cstar_cc + 179.0952 * expf(-(F/(R*T))*Vm));  // (K_NACA*NA_CO^3/VOL*F)
    /* I_Ca ~= 0.4*I_in */
    float numer = (0.4*I_in)/(2*VOL*F) +
                  ((K_NACA*CA_CO*NA_CI*NA_CI*NA_CI)/(VOL*F)) +  // in paper it's -K_NACA... due to different conventions
                  N*K_R*Cstar_cc;

    return fmaxf(1.6e-4, numer/denom);
}

__global__ void
transduction(curandStateXORWOW_t *state, int ld1,
             float dt, %(type)s* d_Vm, float ns)
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;

    __shared__ int X[BLOCK_SIZE][7];  // number of molecules
    __shared__ float Ca[BLOCK_SIZE];
    __shared__ float Vm;  // membrane voltage, shared over all threads
    __shared__ float fn[BLOCK_SIZE];

    if(tid == 0)
    {
        Vm = d_Vm[bid] * 0.001;  // V
    }

    __syncthreads();


    float sumrate;
    float dt_advanced;
    int reaction_ind;
    short2 tmp;

    // copy random generator state locally to avoid accessing global memory
    curandStateXORWOW_t localstate = state[BLOCK_SIZE*bid + tid];

    // iterate over all microvilli in one photoreceptor
    for(int i = tid; i < NUM_MICROVILLI; i += BLOCK_SIZE)
    {
        // load variables that are needed for computing calcium concentration
        //Ca[tid] = ((%(type)s*)d_X[7])[bid*ld2 + i]; // no need to store calcium
        tmp = ((short2*)d_X[2])[bid*ld1 + i];
        X[tid][5] = tmp.x;
        X[tid][6] = tmp.y;

        // update calcium concentration
        Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
        fn[tid] = compute_fn(num_to_mM(X[tid][5]), ns);

        // load the rest of variables
        tmp = ((short2*)d_X[1])[bid*ld1 + i];
        X[tid][4] = tmp.y;
        X[tid][3] = tmp.x;
        tmp = ((short2*)d_X[0])[bid*ld1 + i];
        X[tid][2] = tmp.y;
        X[tid][1] = tmp.x;
        X[tid][0] = ((short*)d_X[3])[bid*ld1 + i];

        // compute total rate of reaction
        sumrate = 0;
        sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) );  //11
        sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]);  //12
        sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6];  // 10
        sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4];  // 8
        sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3];  // 7
        sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0];  // 1
        sumrate += KAPPA_DSTAR * X[tid][3];  // 6
        sumrate += GAMMA_GAP * X[tid][2] * X[tid][3];  // 4
        sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);  // 3
        sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]);  // 5
        sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0];  // 2
        sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                   (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                   X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5 ;  // 9

        // choose the next reaction time
        dt_advanced = -logf(curand_uniform(&localstate))/(LA + sumrate);

        // If the reaction time is smaller than dt,
        // pick the reaction and update,
        // then compute the total rate and next reaction time again
        // until all dt_advanced is larger than dt.
        // Note that you don't have to compensate for
        // the last reaction time that exceeds dt.
        // The reason is that the exponential distribution is MEMORYLESS.
        while(dt_advanced <= dt)
        {
            reaction_ind = 0;
            sumrate = curand_uniform(&localstate) * sumrate;

            sumrate -= mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) );
            reaction_ind = (sumrate<=2e-5) * 11;

            if(!reaction_ind)
            {
                sumrate -= mM_to_num(K_R) * num_to_mM(X[tid][5]);
                reaction_ind = (sumrate<=2e-5) * 12;
                if(!reaction_ind)
                {
                    sumrate -= GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6];
                    reaction_ind = (sumrate<=2e-5) * 10;
                    if(!reaction_ind)
                    {
                        sumrate -= GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4];
                        reaction_ind = (sumrate<=2e-5) * 8;

                        if(!reaction_ind)
                        {
                            sumrate -= GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3];
                            reaction_ind = (sumrate<=2e-5) * 7;
                            if(!reaction_ind)
                            {
                                sumrate -= GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0];
                                reaction_ind = (sumrate<=2e-5) * 1;
                                if(!reaction_ind)
                                {
                                    sumrate -= KAPPA_DSTAR * X[tid][3];
                                    reaction_ind = (sumrate<=2e-5) * 6;
                                    if(!reaction_ind)
                                    {
                                        sumrate -= GAMMA_GAP * X[tid][2] * X[tid][3];
                                        reaction_ind = (sumrate<=2e-5) * 4;

                                        if(!reaction_ind)
                                        {
                                            sumrate -= KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);
                                            reaction_ind = (sumrate<=2e-5) * 3;
                                            if(!reaction_ind)
                                            {
                                                sumrate -= GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]);
                                                reaction_ind = (sumrate<=2e-5) * 5;
                                                if(!reaction_ind)
                                                {
                                                    sumrate -= KAPPA_GSTAR * X[tid][1] * X[tid][0];
                                                    reaction_ind = (sumrate<=2e-5) * 2;
                                                    if(!reaction_ind)
                                                    {
                                                        sumrate -= (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                                                                   (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                                                                   X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5;
                                                        reaction_ind = (sumrate<=2e-5) * 9;
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            int ind;

            // only up to two state variables are needed to be updated
            // update the first one.
            ind = change_ind1[reaction_ind];
            X[tid][ind] += change1[reaction_ind];

            //if(reaction_ind == 9)
            //{
            //    X[tid][ind] = max(X[tid][ind], 0);
            //}

            ind = change_ind2[reaction_ind];
            //update the second one
            if(ind != 0)
            {
                X[tid][ind] += change2[reaction_ind];
            }

            // compute the advance time again
            Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
            fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns );
            //fp[tid] = compute_fp( Ca[tid] );

            sumrate = 0;
            sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11
            sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12
            sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10
            sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8
            sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7
            sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1
            sumrate += KAPPA_DSTAR * X[tid][3]; // 6
            sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4
            sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);  // 3
            sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5
            sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2
            sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                       (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                       X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; // 9

            dt_advanced -= logf(curand_uniform(&localstate))/(LA + sumrate);

        } // end while

        ((short*)d_X[3])[bid*ld1 + i] = X[tid][0];
        ((short2*)d_X[0])[bid*ld1 + i] = make_short2(X[tid][1], X[tid][2]);
        ((short2*)d_X[1])[bid*ld1 + i] = make_short2(X[tid][3], X[tid][4]);
        ((short2*)d_X[2])[bid*ld1 + i] = make_short2(X[tid][5], X[tid][6]);
    }
    // copy the updated random generator state back to global memory
    state[BLOCK_SIZE*bid + tid] = localstate;
}

}
"""
    #ptxas info    : 77696 bytes gmem, 336 bytes cmem[3]
    #ptxas info    : Compiling entry function 'transduction' for 'sm_35'
    #ptxas info    : Function properties for transduction
    #    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
    #ptxas info    : Used 60 registers, 7176 bytes smem, 352 bytes cmem[0], 324 bytes cmem[2]
    #float : Used 65 registers, 7172 bytes smem, 344 bytes cmem[0], 168 bytes cmem[2]

    scalartype = dtype.type if isinstance(dtype, np.dtype) else dtype
    mod = SourceModule(template % {
        "type": dtype_to_ctype(dtype),
        "block_size": block_size,
        "num_microvilli": num_microvilli,
        "fletter": 'f' if scalartype == np.float32 else ''
    },
                       options=["--ptxas-options=-v --maxrregcount=56"],
                       no_extern_c=True)
    func = mod.get_function('transduction')
    d_X_address, d_X_nbytes = mod.get_global("d_X")
    cuda.memcpy_htod(d_X_address, Xaddress)
    d_change_ind1_address, d_change_ind1_nbytes = mod.get_global("change_ind1")
    d_change_ind2_address, d_change_ind2_nbytes = mod.get_global("change_ind2")
    d_change1_address, d_change1_nbytes = mod.get_global("change1")
    d_change2_address, d_change2_nbytes = mod.get_global("change2")
    cuda.memcpy_htod(d_change_ind1_address, change_ind1)
    cuda.memcpy_htod(d_change_ind2_address, change_ind2)
    cuda.memcpy_htod(d_change1_address, change1)
    cuda.memcpy_htod(d_change2_address, change2)

    func.prepare([np.intp, np.int32, np.float32, np.intp, np.float32])
    func.set_cache_config(cuda.func_cache.PREFER_SHARED)
    return func

Example #24

Show file

File: 044-2d-barrier-gpu-h5.py Project: wbkifun/my_research

	while ( tid < TID_MAX && j >= VY0 && j < VY1 ) {
		rvc = vc[0]; 
		rpsi = psi[tid]; 

		psi[tid].x = rvc.x * rpsi.x - rvc.y * rpsi.y;
		psi[tid].y = rvc.x * rpsi.y + rvc.y * rpsi.x;

		tid += gridDim.x * blockDim.x;
	}
}
'''.replace('NY2',str(ny*2)).replace('NXY',str(nx*ny)).replace('NX',str(nx)).replace('NY',str(ny)).replace('TID0',str(vx0*ny)).replace('TID_MAX',str(vx1*ny)).replace('VY0',str(vy0)).replace('VY1',str(vy1))
#print kernels
mod = SourceModule(kernels)
lcf = mod.get_function('lcf')
vcf = mod.get_function('vcf')
lcx_const, _ = mod.get_global('lcx')
vc_const, _ = mod.get_global('vc')

cuda.memcpy_htod(lcx_const, lcx_sqrt)
cuda.memcpy_htod(vc_const, vc)

tpb = 256
bpg1, bpg2 = 0, 0
for bpg in xrange(65535, 0, -1):
	if (nx * ny / tpb) % bpg == 0: bpg1 = bpg
	if (vwidth * ny / tpb) % bpg == 0: bpg2 = bpg
	if bpg1 * bpg2 != 0: break
print 'tpb = %d, bpg1 = %g, bpg2 = %g' % (tpb, bpg1, bpg2)

# save to the h5 file
f['data'].create_dataset('psi0', data=psi_gpu.get(), compression='gzip')

Example #25

Show file

File: trace.py Project: znah/yoxel-voxel

    ])
    render_params = np.zeros((1,), render_params_t)[0] # how to do it easier

    render_params["hintTreeRoot"] = hint_grids.shape[0]-1
    render_params["viewSize"][:] = (512, 512)
    render_params["fovCoef"] = np.tan(np.radians( 45.0 / 2 ))

    eyePos = np.array([1.5, 1.5, 1.5])
    targetPos = np.array([0.5, 0.5, 0.5])
    v2wMtx = makeViewToWldMtx(eyePos, targetPos, np.array([0, 0, 1]))
    w2vMtx = np.linalg.inv(v2wMtx)
    render_params["eyePos"][:] = eyePos
    render_params["viewToWldMtx"][:] = v2wMtx[:3]
    render_params["wldToViewMtx"][:] = w2vMtx[:3]

    cuda.memcpy_htod(mod.get_global("rp")[0], render_params)

    hint_grid_tex = mod.get_texref("hint_grid_tex")
    hint_brick_tex = mod.get_texref("hint_brick_tex")
    hint_grid_tex.set_address(d_hint_grids, len(hint_grids.data))
    hint_brick_tex.set_address(d_hint_bricks, len(hint_bricks.data))
    hint_brick_tex.set_format(cuda.array_format.UNSIGNED_INT32, 2)

    dst = np.zeros((512, 512), np.float32)
    print "running kernel"
    #TestFetch(np.float32(0.6), cuda.Out(dst), block = (8, 8, 1), grid=(32, 32), texrefs = [hint_grid_tex, hint_brick_tex])
    Trace(cuda.Out(dst), block = (16, 16, 1), grid=(32, 32), texrefs = [hint_grid_tex, hint_brick_tex])

    def vis():
        import pylab
        pylab.imshow(dst, origin="bottom")

Example #26

Show file

File: pycuda.py Project: ar4/wave_2d_fd_perf_gpu

    def __init__(self, model, batch_size, dx, dt=None):

        source = """
__constant__ float fd_d[3];

__global__ void step_d(const float *const model,
                float *wfc,
                float *wfp,
                const int nb, const int nz, const int nx)
{
        int zblocks_per_shot = gridDim.y / nb;
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int z = blockDim.y * (blockIdx.y % zblocks_per_shot) + threadIdx.y;
        int b = blockIdx.y / zblocks_per_shot;
        int i = z * nx + x;
        int ib = b * nz * nx + i;
        float lap;
        bool in_domain = (x > 1) && (x < nx - 2)
                && (z > 1) && (z < nz - 2)
                && (b < nb);

        if (in_domain)
        {
                /* Laplacian */
                lap = (fd_d[0] * wfc[ib] +
                                fd_d[1] *
                                (wfc[ib + 1] +
                                 wfc[ib - 1] +
                                 wfc[ib + nx] +
                                 wfc[ib - nx]) +
                                fd_d[2] *
                                (wfc[ib + 2] +
                                 wfc[ib - 2] +
                                 wfc[ib + 2 * nx] +
                                 wfc[ib - 2 * nx]));

                /* Main evolution equation */
                wfp[ib] = model[i] * lap + 2 * wfc[ib] - wfp[ib];

        }
}

__global__ void add_sources_d(const float *const model,
                float *wfp,
                const float *const source_amplitude,
                const int *const sources_z,
                const int *const sources_x,
                const int nz, const int nx,
                const int nt, const int ns, const int it)
{

        int x = threadIdx.x;
        int b = blockIdx.x;
        int i = sources_z[b * ns + x] * nx + sources_x[b * ns + x];
        int ib = b * nz * nx + i;
        wfp[ib] += source_amplitude[b * ns * nt + x * nt + it] * model[i];
}
"""

        mod = SourceModule(source,
                           options=[
                               '-ccbin', 'clang-3.8', '--restrict',
                               '--use_fast_math', '-O3'
                           ])
        jitfunc1 = mod.get_function('step_d')
        jitfunc2 = mod.get_function('add_sources_d')
        fd_d = mod.get_global('fd_d')[0]

        pad = 3
        super(VPycuda1, self).__init__(jitfunc1, jitfunc2, fd_d, model,
                                       batch_size, pad, dx, dt)

Example #27

Show file

  def __init__(self,instance,algorithm,verbose=False):
    """Initializes a direct parallel worker

    :param instance: an instance of class from_scatterers_direct(\
cctbx.xray.structure_factors.manager.managed_calculation_base)
    :type instance: cctbx.xray.structure_factors.from_scatterers_direct
    :param algorithm: an instance of class direct_summation_cuda_platform(\
direct_summation_simple) with algorithm set to "simple" or "pycuda"
    :type algorithm: cctbx.xray.structure_factors.direct_summation_cuda_platform
    """
    self.scatterers = instance._xray_structure.scatterers()
    self.registry = instance._xray_structure.scattering_type_registry()
    self.miller_indices = instance._miller_set.indices()
    self.unit_cell = instance._miller_set.unit_cell()
    self.space_group = instance._miller_set.space_group().make_tidy()

    if verbose: self.print_diagnostics() # some diagnostics used for development

    if hasattr(algorithm,"simple"):
      instance._results = ext.structure_factors_simple(
      self.unit_cell,
      instance._miller_set.space_group(),
      self.miller_indices,
      self.scatterers,
      self.registry); return

    if hasattr(algorithm,"pycuda"):
      import pycuda.driver as cuda
      from pycuda.compiler import SourceModule

      self.validate_the_inputs(instance,cuda,algorithm)

      self.prepare_miller_arrays_for_cuda(algorithm)

      self.prepare_scattering_sites_for_cuda(algorithm)

      self.prepare_gaussians_symmetries_cell(algorithm)

      assert cuda.Device.count() >= 1

      device = cuda.Device(0)
      WARPSIZE=device.get_attribute(cuda.device_attribute.WARP_SIZE) # 32
      MULTIPROCESSOR_COUNT=device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)

      sort_mod = SourceModule((mod_fhkl_sorted%(self.gaussians.shape[0],
                         self.symmetry.shape[0],
                         self.sym_stride,
                         self.g_stride,
                         int(self.use_debye_waller),
                         self.order_z,self.order_p)).replace("floating_point_t",algorithm.float_t)
                         )

      r_m_m_address = sort_mod.get_global("reciprocal_metrical_matrix")[0]
      cuda.memcpy_htod(r_m_m_address, self.reciprocal_metrical_matrix)

      gaussian_address = sort_mod.get_global("gaussians")[0]
      cuda.memcpy_htod(gaussian_address, self.gaussians)

      symmetry_address = sort_mod.get_global("symmetry")[0]
      cuda.memcpy_htod(symmetry_address, self.symmetry)

      CUDA_fhkl = sort_mod.get_function("CUDA_fhkl")

      intermediate_real = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t)
      intermediate_imag = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t)
      for x in range(self.scatterers.number_of_types):
        fhkl_real = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t)
        fhkl_imag = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t)

        CUDA_fhkl(cuda.InOut(fhkl_real),
                 cuda.InOut(fhkl_imag),
                 cuda.In(self.flat_sites),
                 cuda.In(self.weights),
                 cuda.In(self.u_iso),
                 algorithm.numpy.uint32(self.scatterers.increasing_order[x]),
                 algorithm.numpy.uint32(self.scatterers.sorted_ranges[x][0]),
                 algorithm.numpy.uint32(self.scatterers.sorted_ranges[x][1]),
                 cuda.In(self.flat_mix),
                 block=(FHKL_BLOCKSIZE,1,1),
                 grid=((self.n_flat_hkl//FHKL_BLOCKSIZE,1)))

        intermediate_real += fhkl_real
        intermediate_imag += fhkl_imag

      flex_fhkl_real = flex.double(intermediate_real[0:len(self.miller_indices)].astype(algorithm.numpy.float64))
      flex_fhkl_imag = flex.double(intermediate_imag[0:len(self.miller_indices)].astype(algorithm.numpy.float64))

      instance._results = fcalc_container(flex.complex_double(flex_fhkl_real,flex_fhkl_imag))

      return

Example #28

Show file

class RimeSumCoherencies(Node):
    def __init__(self):
        super(RimeSumCoherencies, self).__init__()

    def initialise(self, solver, stream=None):
        slvr = solver
        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')

        # Get a property dictionary off the solver
        D = slvr.template_dict()
        # Include our kernel parameters
        D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS)
        D['rime_const_data_struct'] = slvr.const_data().string_def()

        D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \
            mbu.redistribute_threads(
                D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'],
                npolchan, nbl, ntime)

        regs = str(FLOAT_PARAMS['maxregs'] \
            if slvr.is_float() else DOUBLE_PARAMS['maxregs'])

        # Create the signature of the call to the function stamping macro
        stamp_args = ', '.join([
            'float' if slvr.is_float() else 'double',
            'float2' if slvr.is_float() else 'double2',
            'float3' if slvr.is_float() else 'double3',
            'true' if slvr.use_weight_vector() else 'false',
            '1' if slvr.outputs_residuals() else '0'])
        stamp_fn = ''.join(['stamp_sum_coherencies_fn(', stamp_args, ')'])
        D['stamp_function'] = stamp_fn

        kname = 'rime_sum_coherencies'

        self.mod = SourceModule(
            KERNEL_TEMPLATE.substitute(**D),
            options=['-lineinfo','-maxrregcount', regs],
            include_dirs=[montblanc.get_source_path()],
            no_extern_c=True)

        self.rime_const_data = self.mod.get_global('C')
        self.kernel = self.mod.get_function(kname)
        self.launch_params = self.get_launch_params(slvr, D)

    def shutdown(self, solver, stream=None):
        pass

    def pre_execution(self, solver, stream=None):
        pass

    def get_launch_params(self, slvr, D):
        polchans_per_block = D['BLOCKDIMX']
        bl_per_block = D['BLOCKDIMY']
        times_per_block = D['BLOCKDIMZ']

        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')
        polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block)
        bl_blocks = mbu.blocks_required(nbl, bl_per_block)
        time_blocks = mbu.blocks_required(ntime, times_per_block)

        return {
            'block' : (polchans_per_block, bl_per_block, times_per_block),
            'grid'  : (polchan_blocks, bl_blocks, time_blocks),
        }

    def execute(self, solver, stream=None):
        slvr = solver

        if stream is not None:
            cuda.memcpy_htod_async(
                self.rime_const_data[0],
                slvr.const_data().ndary(),
                stream=stream)
        else:
            cuda.memcpy_htod(
                self.rime_const_data[0],
                slvr.const_data().ndary())

        # The gaussian shape array can be empty if
        # no gaussian sources were specified.
        gauss = np.intp(0) if np.product(slvr.gauss_shape.shape) == 0 \
            else slvr.gauss_shape

        sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
            else slvr.sersic_shape

        self.kernel(slvr.uvw, gauss, sersic,
            slvr.frequency, slvr.antenna1, slvr.antenna2,
            slvr.jones, slvr.flag, slvr.weight_vector,
            slvr.observed_vis, slvr.G_term,
            slvr.model_vis, slvr.chi_sqrd_result,
            stream=stream, **self.launch_params)

        # Call the pycuda reduction kernel.
        # Divide by the single sigma squared value if a weight vector
        # is not required. Otherwise the kernel will incorporate the
        # individual sigma squared values into the sum
        gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get()

        if not slvr.use_weight_vector():
            slvr.set_X2(gpu_sum/slvr.sigma_sqrd)
        else:
            slvr.set_X2(gpu_sum)

    def post_execution(self, solver, stream=None):
        pass

Example #29

Show file

phi = np.linspace(0, 2 * math.pi, nsamps).astype(np.float64)
psi = np.linspace(0, 2 * math.pi, nsamps).astype(np.float64)

tref = np.array([24715.581890875823 for item in theta]).astype(np.float64)

rhoTS = np.ones((nmodes, ntimes)).astype(np.complex128)
rhoTS[0, :] = np.arange(ntimes) + 1.0j * np.arange(ntimes)
rhoTS[1, :] = 2.0 * np.arange(ntimes) + 1.0j * 2.0 * np.arange(ntimes)
rhoTS[2, :] = 3.0 * np.arange(ntimes) + 1.0j * 3.0 * np.arange(ntimes)

#_____________________
#   Pass down data
#_____________________

# **-- constants --**
nmodes_gpu = mod.get_global("nmodes")[0]
nsamps_gpu = mod.get_global("nsamps")[0]
ntimes_gpu = mod.get_global("ntimes")[0]
nclmns_gpu = mod.get_global("nclmns")[0]
detector_tensor_gpu = mod.get_global("det_tns")[0]

cuda.memcpy_htod(nmodes_gpu, nmodes)
cuda.memcpy_htod(nsamps_gpu, nsamps)
cuda.memcpy_htod(ntimes_gpu, ntimes)
cuda.memcpy_htod(nclmns_gpu, nclmns)
cuda.memcpy_htod(detector_tensor_gpu, detector_tensor)

# **---- data -----**

selected_modes_gpu = gpuarray.to_gpu(mlist_sort)
theta_gpu = gpuarray.to_gpu(theta)

Example #30

Show file

File: cuda_debug.py Project: svistunov/cubvis

normals = numpy.loadtxt("data/normals.dat").astype(numpy.float32)
indexes = numpy.loadtxt("data/indexes.dat").astype(numpy.ushort)
vis = numpy.zeros((1, verts.size/3), numpy.float32)
template_params = {'dN' : design.size, 'visN': vis.size, 'vN' : verts.size, 'kernelStep' : kernelStep}
kernel_code = Template(
    file = 'vis.cu', 
    searchList = [template_params],
  )
cuda_module = SourceModule(kernel_code)
cuda_call = cuda_module.get_function("vis")
cuda_call.prepare("P", (256, 1, 1))
N =  vis.size
grid_dimensions   =  (min(32, (N+256-1) // 256 ), 1)

vis_gpu = cuda_driver.mem_alloc(vis.nbytes)
design_gpu = cuda_module.get_global('design')[0]
cuda_driver.memcpy_htod(design_gpu, design)
cuda_driver.memcpy_htod(vis_gpu, vis)
kernel_n = cuda_module.get_global('kernelN')[0]


v_tex = cuda_module.get_texref('v_tex')
v_tex_gpu = cuda_driver.to_device(verts)
v_tex.set_address(v_tex_gpu, verts.nbytes)
v_tex.set_format(cuda_driver.array_format.FLOAT, 1)

n_tex = cuda_module.get_texref('n_tex')
n_tex_gpu = cuda_driver.to_device(normals)
n_tex.set_address(n_tex_gpu, normals.nbytes)
n_tex.set_format(cuda_driver.array_format.FLOAT, 1)

Example #31

Show file

File: 028-1d-barrier-gpu-h5.py Project: wbkifun/my_research

	while ( tid < TID_MAX ) {
		rpsi = psi[tid]; 

		psi[tid].x = vc_real * rpsi.x - vc_imag * rpsi.y;
		psi[tid].y = vc_real * rpsi.y + vc_imag * rpsi.x;

		tid += gridDim.x * blockDim.x;
	}
}
'''.replace('HNX',str(nx/2)).replace('NX',str(nx)).replace('TID0',str(vx0)).replace('TID_MAX',str(vx1))
print kernels
mod = SourceModule(kernels)
lcf = mod.get_function('lcf')
vcf = mod.get_function('vcf')
lcx_const, _ = mod.get_global('lcx')
cuda.memcpy_htod(lcx_const, lcx_sqrt[1:nx/2+1])

tpb = 256
bpg = 30 * 4
print 'tpb = %d, bpg = %g' % (tpb, bpg)

# save to the h5 file

# plot & save to the h5 file
import matplotlib.pyplot as plt
plt.ion()
fig = plt.figure()
ax1 = fig.add_subplot(2,1,1)
ax2 = fig.add_subplot(2,1,2)

Example #32

Show file

File: likelihood_v2.py Project: brandonbm00/rapidpe_gpu

psi   = np.linspace(0, 2*math.pi, nsamps).astype(np.float64)

tref = np.array([24715.581890875823 for item in theta]).astype(np.float64)

rhoTS = np.ones((nmodes, ntimes)).astype(np.complex128)
rhoTS[0,:] = np.arange(ntimes) + 1.0j*np.arange(ntimes)
rhoTS[1,:] = 2.0*np.arange(ntimes) + 1.0j*2.0*np.arange(ntimes)
rhoTS[2,:] = 3.0*np.arange(ntimes) + 1.0j*3.0*np.arange(ntimes)

#_____________________
#   Pass down data 
#_____________________


# **-- constants --**
nmodes_gpu = mod.get_global("nmodes")[0]
nsamps_gpu = mod.get_global("nsamps")[0]
ntimes_gpu = mod.get_global("ntimes")[0]
nclmns_gpu = mod.get_global("nclmns")[0]
detector_tensor_gpu = mod.get_global("det_tns")[0] 

cuda.memcpy_htod(nmodes_gpu, nmodes)
cuda.memcpy_htod(nsamps_gpu, nsamps)
cuda.memcpy_htod(ntimes_gpu, ntimes)
cuda.memcpy_htod(nclmns_gpu, nclmns)
cuda.memcpy_htod(detector_tensor_gpu, detector_tensor)



# **---- data -----**

Example #33

Show file

class GPURBFEll(object):
    """RBF Kernel with ellpack format"""

    cache_size = 100

    Gamma = 1.0

    #template
    func_name = 'rbfEllpackILPcol2multi'

    #template
    module_file = os.path.dirname(__file__) + '/cu/KernelsEllpackCol2.cu'

    #template
    texref_nameI = 'VecI_TexRef'
    texref_nameJ = 'VecJ_TexRef'

    max_concurrent_kernels = 1

    def __init__(self, gamma=1.0, cache_size=100):
        """
        Initialize object
        
        Parameters
        -------------
        
        max_kernel_nr: int
            determines maximal concurrent kernel column gpu computation
        """
        self.cache_size = cache_size

        self.threadsPerRow = 1
        self.prefetch = 2

        self.tpb = 128
        self.Gamma = gamma

    def init_cuda(self, X, Y, cls_start, max_kernels=1):

        #assert X.shape[0]==Y.shape[0]
        self.max_concurrent_kernels = max_kernels

        self.X = X
        self.Y = Y

        self.cls_start = cls_start.astype(np.int32)

        #handle to gpu memory for y for each concurrent classifier
        self.g_y = []
        #handle to gpu memory for results for each concurrent classifier
        self.g_out = []  #gpu kernel out
        self.kernel_out = []  #cpu kernel out
        #blocks per grid for each concurrent classifier
        self.bpg = []

        #function reference
        self.func = []

        #texture references for each concurrent kernel
        self.tex_ref = []

        #main vectors
        #gpu
        self.g_vecI = []
        self.g_vecJ = []
        #cpu
        self.main_vecI = []
        self.main_vecJ = []

        #cpu class
        self.cls_count = []
        self.cls = []
        #gpu class
        self.g_cls_count = []
        self.g_cls = []

        self.sum_cls = []

        for i in range(max_kernels):
            self.bpg.append(0)
            self.g_y.append(0)
            self.g_out.append(0)
            self.kernel_out.append(0)
            self.cls_count.append(0)
            self.cls.append(0)
            self.g_cls_count.append(0)
            self.g_cls.append(0)
            #            self.func.append(0)
            #            self.tex_ref.append(0)
            self.g_vecI.append(0)
            self.g_vecJ.append(0)
            #            self.main_vecI.append(0)
            #            self.main_vecJ.append(0)
            self.sum_cls.append(0)

        self.N, self.Dim = X.shape
        column_size = self.N * 4
        cacheMB = self.cache_size * 1024 * 1024  #100MB for cache size

        #how many kernel colums will be stored in cache
        cache_items = np.floor(cacheMB / column_size).astype(int)

        cache_items = min(self.N, cache_items)
        self.kernel_cache = pylru.lrucache(cache_items)

        self.compute_diag()

        #cuda initialization
        cuda.init()

        self.dev = cuda.Device(0)
        self.ctx = self.dev.make_context()

        #reade cuda .cu file with module code
        with open(self.module_file, "r") as CudaFile:
            module_code = CudaFile.read()

        #compile module
        self.module = SourceModule(module_code, keep=True, no_extern_c=True)

        (g_gamma, gsize) = self.module.get_global('GAMMA')
        cuda.memcpy_htod(g_gamma, np.float32(self.Gamma))

        #get functions reference

        Dim = self.Dim
        vecBytes = Dim * 4
        for f in range(self.max_concurrent_kernels):
            gfun = self.module.get_function(self.func_name)
            self.func.append(gfun)

            #init texture for vector I
            vecI_tex = self.module.get_texref('VecI_TexRef')
            self.g_vecI[f] = cuda.mem_alloc(vecBytes)
            vecI_tex.set_address(self.g_vecI[f], vecBytes)

            #init texture for vector J
            vecJ_tex = self.module.get_texref('VecJ_TexRef')
            self.g_vecJ[f] = cuda.mem_alloc(vecBytes)
            vecJ_tex.set_address(self.g_vecJ[f], vecBytes)

            self.tex_ref.append((vecI_tex, vecJ_tex))

            self.main_vecI.append(np.zeros((1, Dim), dtype=np.float32))
            self.main_vecJ.append(np.zeros((1, Dim), dtype=np.float32))

            texReflist = list(self.tex_ref[f])

            #function definition P-pointer i-int
            gfun.prepare("PPPPPPiiiiiiPPP", texrefs=texReflist)

        #transform X to particular format
        v, c, r = spf.csr2ellpack(self.X, align=self.prefetch)
        #copy format data structure to gpu memory

        self.g_val = cuda.to_device(v)
        self.g_col = cuda.to_device(c)
        self.g_len = cuda.to_device(r)
        self.g_sdot = cuda.to_device(self.Xsquare)

        self.g_cls_start = cuda.to_device(self.cls_start)

    def cls_init(self, kernel_nr, y_cls, cls1, cls2, cls1_n, cls2_n):
        """
        Prepare cuda kernel call for kernel_nr, copy data for particular binary classifier, between class 1 vs 2.
         
        Parameters
        ------------
        kernel_nr : int
            concurrent kernel number
        y_cls : array-like
            binary class labels (1,-1)
        cls1: int
            first class number
        cls2: int
            second class number
        cls1_n : int
            number of elements of class 1
        cls2_n : int
            number of elements of class 2
        kernel_out : array-like
            array for gpu kernel result, size=2*len(y_cls)
        
        """
        warp = 32
        align_cls1_n = cls1_n + (warp - cls1_n % warp) % warp
        align_cls2_n = cls2_n + (warp - cls2_n % warp) % warp

        self.cls1_N_aligned = align_cls1_n

        sum_cls = align_cls1_n + align_cls2_n
        self.sum_cls[kernel_nr] = sum_cls

        self.cls_count[kernel_nr] = np.array([cls1_n, cls2_n], dtype=np.int32)
        self.cls[kernel_nr] = np.array([cls1, cls2], dtype=np.int32)

        self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr])

        self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr])

        self.bpg[kernel_nr] = int(
            np.ceil((self.threadsPerRow * sum_cls + 0.0) / self.tpb))

        self.g_y[kernel_nr] = cuda.to_device(y_cls)

        self.kernel_out[kernel_nr] = np.zeros(2 * y_cls.shape[0],
                                              dtype=np.float32)

        ker_out = self.kernel_out[kernel_nr]
        self.g_out[kernel_nr] = cuda.to_device(
            ker_out)  # cuda.mem_alloc_like(ker_out)

        #add prepare for device functions

    def K2Col(self, i, j, i_ds, j_ds, kernel_nr):
        """ 
        computes i-th and j-th kernel column 

        Parameters
        ---------------
        i: int
            i-th kernel column number in subproblem
        j: int
            j-th kernel column number in subproblem

        i_ds: int
            i-th kernel column number in whole dataset
        j_ds: int
            j-th kernel column number in  whole dataset

        kernel_nr : int
            number of concurrent kernel
            
        ker2ColOut: array like
            array for output
        
        Returns
        -------
        ker2Col
        
        """

        #make i-th and j-the main vectors
        vecI = self.main_vecI[kernel_nr]
        vecJ = self.main_vecJ[kernel_nr]

        #        self.X[i_ds,:].todense(out=vecI)
        #        self.X[j_ds,:].todense(out=vecJ)

        #vecI.fill(0)
        #vecJ.fill(0)

        #self.X[i_ds,:].toarray(out=vecI)
        #self.X[j_ds,:].toarray(out=vecJ)

        vecI = self.X.getrow(i_ds).todense()
        vecJ = self.X.getrow(j_ds).todense()

        #copy them to texture
        cuda.memcpy_htod(self.g_vecI[kernel_nr], vecI)
        cuda.memcpy_htod(self.g_vecJ[kernel_nr], vecJ)

        #        temp = np.empty_like(vecI)
        #        cuda.memcpy_dtoh(temp,self.g_vecI[kernel_nr])
        #        print 'temp',temp
        #lauch kernel

        gfunc = self.func[kernel_nr]
        gy = self.g_y[kernel_nr]
        gout = self.g_out[kernel_nr]
        gN = np.int32(self.N)
        g_i = np.int32(i)
        g_j = np.int32(j)
        g_ids = np.int32(i_ds)
        g_jds = np.int32(j_ds)
        gNalign = np.int32(self.cls1_N_aligned)
        gcs = self.g_cls_start
        gcc = self.g_cls_count[kernel_nr]
        gc = self.g_cls[kernel_nr]
        bpg = self.bpg[kernel_nr]

        #print 'start gpu i,j,kernel_nr ',i,j,kernel_nr
        #texReflist = list(self.tex_ref[kernel_nr])
        #gfunc(self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc,block=(self.tpb,1,1),grid=(bpg,1),texrefs=texReflist)
        #print 'end gpu',i,j
        #copy the results

        #grid=(bpg,1),block=(self.tpb,1,1)
        gfunc.prepared_call((bpg, 1), (self.tpb, 1, 1), self.g_val, self.g_col,
                            self.g_len, self.g_sdot, gy, gout, gN, g_i, g_j,
                            g_ids, g_jds, gNalign, gcs, gcc, gc)

        cuda.memcpy_dtoh(self.kernel_out[kernel_nr], gout)

        return self.kernel_out[kernel_nr]

    def K_vec(self, vec):
        '''
        vec - array-like, row ordered data, should be not to big
        '''

        dot = self.X.dot(vec.T)
        x2 = self.Xsquare.reshape((self.Xsquare.shape[0], 1))
        if (sp.issparse(vec)):
            v2 = vec.multiply(vec).sum(1).reshape((1, vec.shape[0]))
        else:
            v2 = np.einsum('...i,...i', vec, vec)

        return np.exp(-self.Gamma * (x2 + v2 - 2 * dot))

    def compute_diag(self):
        """
        Computes kernel matrix diagonal
        """

        #for rbf diagonal consists of ones exp(0)==1
        self.Diag = np.ones(self.X.shape[0], dtype=np.float32)

        if (sp.issparse(self.X)):
            # result as matrix
            self.Xsquare = self.X.multiply(self.X).sum(1)
            #result as array
            self.Xsquare = np.asarray(self.Xsquare).flatten()
        else:
            self.Xsquare = np.einsum('...i,...i', self.X, self.X)

    def clean(self, kernel_nr):
        """ clean the kernel cache """
        #self.kernel_cache.clear()

        self.bpg[kernel_nr] = 0

    def clean_cuda(self):
        '''
        clean all cuda resources
        '''

        for f in range(self.max_concurrent_kernels):

            #vecI_tex=??
            #self.g_vecI[f].free()
            del self.g_vecI[f]

            #init texture for vector J
            #vecJ_tex=??
            #self.g_vecJ[f].free()
            del self.g_vecJ[f]
            self.g_cls_count[f].free()
            self.g_cls[f].free()
            self.g_y[f].free()
            self.g_out[f].free()

        #test it
        #del self.g_out[f] ??

        #copy format data structure to gpu memory

        self.g_val.free()
        self.g_col.free()
        self.g_len.free()
        self.g_sdot.free()
        self.g_cls_start.free()

        print self.ctx
        self.ctx.pop()

        print self.ctx
        del self.ctx

    def predict_init(self, SV):
        """
        Init the classifier for prediction
        """

        self.X = SV
        self.compute_diag()

Example #34

Show file

class RimeSumCoherencies(Node):
    def __init__(self):
        super(RimeSumCoherencies, self).__init__()

    def initialise(self, solver, stream=None):
        slvr = solver
        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')

        # Get a property dictionary off the solver
        D = slvr.template_dict()
        # Include our kernel parameters
        D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS)
        D['rime_const_data_struct'] = slvr.const_data().string_def()

        D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \
            mbu.redistribute_threads(
                D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'],
                npolchan, nbl, ntime)

        regs = str(FLOAT_PARAMS['maxregs'] \
            if slvr.is_float() else DOUBLE_PARAMS['maxregs'])

        # Create the signature of the call to the function stamping macro
        stamp_args = ', '.join([
            'float' if slvr.is_float() else 'double',
            'float2' if slvr.is_float() else 'double2',
            'float3' if slvr.is_float() else 'double3',
            'true' if slvr.use_weight_vector() else 'false',
            '1' if slvr.outputs_residuals() else '0'
        ])
        stamp_fn = ''.join(['stamp_sum_coherencies_fn(', stamp_args, ')'])
        D['stamp_function'] = stamp_fn

        kname = 'rime_sum_coherencies'

        self.mod = SourceModule(KERNEL_TEMPLATE.substitute(**D),
                                options=['-lineinfo', '-maxrregcount', regs],
                                include_dirs=[montblanc.get_source_path()],
                                no_extern_c=True)

        self.rime_const_data = self.mod.get_global('C')
        self.kernel = self.mod.get_function(kname)
        self.launch_params = self.get_launch_params(slvr, D)

    def shutdown(self, solver, stream=None):
        pass

    def pre_execution(self, solver, stream=None):
        pass

    def get_launch_params(self, slvr, D):
        polchans_per_block = D['BLOCKDIMX']
        bl_per_block = D['BLOCKDIMY']
        times_per_block = D['BLOCKDIMZ']

        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')
        polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block)
        bl_blocks = mbu.blocks_required(nbl, bl_per_block)
        time_blocks = mbu.blocks_required(ntime, times_per_block)

        return {
            'block': (polchans_per_block, bl_per_block, times_per_block),
            'grid': (polchan_blocks, bl_blocks, time_blocks),
        }

    def execute(self, solver, stream=None):
        slvr = solver

        if stream is not None:
            cuda.memcpy_htod_async(self.rime_const_data[0],
                                   slvr.const_data().ndary(),
                                   stream=stream)
        else:
            cuda.memcpy_htod(self.rime_const_data[0],
                             slvr.const_data().ndary())

        # The gaussian shape array can be empty if
        # no gaussian sources were specified.
        gauss = np.intp(0) if np.product(slvr.gauss_shape.shape) == 0 \
            else slvr.gauss_shape

        sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
            else slvr.sersic_shape

        self.kernel(slvr.uvw,
                    gauss,
                    sersic,
                    slvr.frequency,
                    slvr.antenna1,
                    slvr.antenna2,
                    slvr.jones,
                    slvr.flag,
                    slvr.weight_vector,
                    slvr.observed_vis,
                    slvr.G_term,
                    slvr.model_vis,
                    slvr.chi_sqrd_result,
                    stream=stream,
                    **self.launch_params)

        # Call the pycuda reduction kernel.
        # Divide by the single sigma squared value if a weight vector
        # is not required. Otherwise the kernel will incorporate the
        # individual sigma squared values into the sum
        gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get()

        if not slvr.use_weight_vector():
            slvr.set_X2(gpu_sum / slvr.sigma_sqrd)
        else:
            slvr.set_X2(gpu_sum)

    def post_execution(self, solver, stream=None):
        pass

Example #35

Show file

File: riemanntheta_omegas.py Project: abelfunctions/abelfunctions

    def func1(self, g, TILEWIDTH, TILEHEIGHT):
        template = """
        #include <stdlib.h>
        #include <stdio.h>
        #include <math.h>
        
        #define GENUS %d
        #define TILEHEIGHT %d
        #define TILEWIDTH %d

        __device__ __constant__ double xd[GENUS];
        __device__ __constant__ double yd[GENUS];
        
        /***************************************************************************

        normpart
        --------

        A helper function for the finite sum functions. Computes:

        -pi * ||T*(n + fracshift)||^2

        = -pi * ||T * (n + (shift - intshift))||^2

        = -pi * ||T * (n + Yinv*y - round(Yinv*y))||^2

        ***************************************************************************/
        
        __device__ double normpart(int g, double* Yinvd_s, double* Td_s, double* Sd_s)
        {
          int tx = threadIdx.x;
          int ty = threadIdx.y;
          double norm = 0;
          int i,j,k;
          for (i = 0; i < g; i++) {
            double sum = 0;
            for (j = 0; j < g; j++) {
              double T_ij = Td_s[ty*g*g + i*g + j];
              double n_j = Sd_s[tx*g + j];
              double shift_j = 0;
              for (k = 0; k < g; k++) {
                shift_j += Yinvd_s[ty*g*g + g*j + k]*yd[k];
              }
            sum += T_ij * (n_j + shift_j - round(shift_j));
            }
          norm += sum*sum;
          }
          return -M_PI * norm;
        }

        /*************************************************************************
        exppart
        -------

        A helper function for the finite sum functions. Computes:

        2pi * <(n - intshift), (1/2)X(n - intshift) + x>

        =2pi * <n - round(shift), (1/2)X(n - round(shift) + x>

        =2pi * <n - round(Yinv*y), (1/2)X(n - round(Yinv*y) + x>

        ***************************************************************************/
        
        __device__ double exppart(int g, double *Xd_s, double *Yinvd_s, double* Sd_s)
        {
          int tx = threadIdx.x;
          int ty = threadIdx.y;
          double exppart = 0;
          int i,j,k,h;
          for (i = 0; i < g; i++) {
            double n_i = Sd_s[tx*g + i];
            double shift_i = 0;
            for (k = 0; k < g; k++) {
              shift_i += Yinvd_s[ty*g*g + i*g + k] * yd[k];
            }
            double A = n_i - round(shift_i);
            double Xshift_i = 0;
            for (j = 0; j < g; j++) {
              double X_ij = Xd_s[ty*g*g + i*g + j];
              double shift_j = 0;
              for (h = 0; h < g; h++) {
                shift_j += Yinvd_s[ty*g*g + j*g + h] * yd[h];
              }
              Xshift_i += .5 * (X_ij * (Sd_s[tx*g + j] - round(shift_j)));
            }
            double B = Xshift_i + xd[i];
            exppart += A*B;
          }
          return 2* M_PI * exppart;
        }

        /****************************************************************************
        Derivative Product
        
        Computes:
        
           ___
           | | 
           | | 2*pi*I <d, n - intshift>
        d in derivs
        ****************************************************************************/
        __device__ void deriv_prod(int g, double *Sd_s, double* Yinvd_s,
                                   double* dpr, double* dpi, double* deriv_real, 
                                   double* deriv_imag, int nderivs)
        {
          int tx = threadIdx.x;
          int ty = threadIdx.y;
          double total_real = 1;
          double total_imag = 0;
          
          int i,j,k;
          for (i = 0; i < nderivs; i++) {
            double term_real = 0;
            double term_imag = 0;
            for (j = 0; i < g; i++) {
              double shift_j = 0;
              for (k = 0; k < g; k++) {
                shift_j += Yinvd_s[ty*g*g + j*g + k] * yd[k];
              }
              double intshift = round(shift_j);
              double nmintshift = Sd_s[tx*g + j] - intshift;
              term_real += deriv_real[j + g*i] * nmintshift;
              term_imag += deriv_imag[j + g*i] * nmintshift;
            }
            
            total_real = total_real * term_real - total_imag * term_imag;
            total_imag = total_real * term_imag + total_imag * term_real;
          }

          //Computes: (2*pi*i)^(nderivs) * (total_real + total_imag*i)
          double pi_mult = pow(2*M_PI, nderivs);
          /*Determines what the result of i^nderivs is, and performs the 
          correct multiplication afterwards.*/
          if (nderivs %% 4 == 0) {
            dpr[0] = pi_mult*total_real;
            dpi[0] = pi_mult*total_imag;
          }
          else if (nderivs %% 4 == 1) {
            dpr[0] = -pi_mult * total_imag;
            dpi[0] = pi_mult * total_real;
          }
          else if (nderivs %% 4 == 2) {
            dpr[0] = -pi_mult * total_real;
            dpi[0] = -pi_mult * total_imag;
          }
          else if (nderivs %% 4 == 3) {
            dpr[0] = pi_mult * total_imag;
            dpi[0] = -pi_mult * total_real;
          }
        }


        /**************************************************************************
        
        Finite Sum Without Derivatives Kernel Function
        
        **************************************************************************/
        __global__ void riemann_theta(double *fsum_reald, double *fsum_imagd, double *Xd,
                                      double *Yinvd, double* Td, double *Sd, int g, int N, int K)
        {
          /*Built in variables to be used, x variable denotes the summation index
          while the y variable denotes the Omega index*/
          int bx = blockIdx.x;
          int by = blockIdx.y;
          int tx = threadIdx.x;
          int ty = threadIdx.y;
          
          __shared__ double Sd_s[TILEWIDTH * GENUS];
          __shared__ double Xd_s[TILEHEIGHT * GENUS * GENUS];
          __shared__ double Yinvd_s[TILEHEIGHT * GENUS * GENUS];
          __shared__ double Td_s[TILEHEIGHT * GENUS * GENUS]; 

          /*Determine n_0, the start of the summation vector,
          the full vector is of the form, n_0, n_1, n_2, n_g*/
          int n_start = (bx * TILEWIDTH + tx) * g;
          /* Now n = S[n_start], S[n_start + 1], S[n_start + 2]...S[n_start + (g-1)] */
        
          /*Determine the Omega to evaluate on*/
          int omega_start = (by*TILEHEIGHT + ty)*g*g;
          /* Now omega is Omega[omega_start], ... Omega[omega_start + g*g-1]
          where Omega is Xd, Yinvd, and Td */
          
          /*Load data into shared arrays */
          int i;
          for (i = 0; i < g; i++){
            Sd_s[tx*g + i] = Sd[n_start + i];
          }
          for (i = 0; i < g*g; i++) {
            Xd_s[ty*g*g + i] = Xd[omega_start + i];
            Yinvd_s[ty*g*g + i] = Yinvd[omega_start + i];
            Td_s[ty*g*g + i] = Td[omega_start + i];
          }

          __syncthreads();
          
          if (n_start < N*g && omega_start < K*g*g) {
            /*Compute the 'cosine' and 'sine' parts of the summand*/
            double ept, npt, cpt, spt;
            ept = exppart(g, Xd_s, Yinvd_s, Sd_s);
            npt = exp(normpart(g, Yinvd_s, Td_s, Sd_s));
            cpt = npt*cos(ept);
            spt = npt*sin(ept);

            fsum_reald[n_start/g + omega_start/g/g * N] = cpt;
            fsum_imagd[n_start/g + omega_start/g/g * N] = spt;
          }
       }

       /***********************************************************************
       
       Finite Sum with Derivatives Kernel Function
       
       ************************************************************************/
       __global__ void riemann_theta_derivatives(double* fsum_reald, double* fsum_imagd, 
                                                 double* Xd, double *Yinvd, double *Td, 
                                                 double *Sd, double *deriv_reald, 
                                                 double *deriv_imagd, 
                                                 int nderivs, int g, int N, int K)
        {
           /*Built in variables to be used, x variable denotes the summation index
          while the y variable denotes the Omega index*/
          int bx = blockIdx.x;
          int by = blockIdx.y;
          int tx = threadIdx.x;
          int ty = threadIdx.y;
          
          __shared__ double Sd_s[TILEWIDTH * GENUS];
          __shared__ double Xd_s[TILEHEIGHT * GENUS * GENUS];
          __shared__ double Yinvd_s[TILEHEIGHT * GENUS * GENUS];
          __shared__ double Td_s[TILEHEIGHT * GENUS * GENUS]; 

          /*Determine n_0, the start of the summation vector,
          the full vector is of the form, n_0, n_1, n_2, n_g*/
          int n_start = (bx * TILEWIDTH + tx) * g;
          /* Now n = S[n_start], S[n_start + 1], S[n_start + 2]...S[n_start + (g-1)] */
          
          /*Determine the Omega to evaluate on*/
          int omega_start = (by*TILEHEIGHT + ty)*g*g;
          /* Now omega is Omega[omega_start], ... Omega[omega_start + g*g-1]
          where Omega is Xd, Yinvd, and Td */
          
          /*Load data into shared arrays */
          int i;
          for (i = 0; i < g; i++){
            Sd_s[tx*g + i] = Sd[n_start + i];
          }
          for (i = 0; i < g*g; i++) {
            Xd_s[ty*g*g + i] = Xd[omega_start + i];
            Yinvd_s[ty*g*g + i] = Yinvd[omega_start + i];
            Td_s[ty*g*g + i] = Td[omega_start + i];
          }

          __syncthreads();

          if (n_start < N*g && omega_start < K*g*g) {
            /*Compute the 'cosine' and 'sine' parts of the summand */
            double dpr[1];
            double dpi[1];
            dpr[0] = 0;
            dpi[0] = 0;
            double ept, npt, cpt, spt;            
            ept = exppart(g, Xd_s, Yinvd_s, Sd_s);
            npt = exp(normpart(g, Yinvd_s, Td_s, Sd_s));
            cpt = npt*cos(ept);
            spt = npt*sin(ept);
            deriv_prod(g,Sd_s,Yinvd_s,dpr,dpi, deriv_reald,deriv_imagd, nderivs);
            fsum_reald[n_start/g + omega_start/g/g * N] = dpr[0] * cpt - dpi[0] * spt;
            fsum_imagd[n_start/g + omega_start/g/g * N] = dpi[0] * cpt + dpr[0] * spt;
          }
        } 
        
       """ %(g, TILEHEIGHT, TILEWIDTH)
        mod = SourceModule(template)
        func = mod.get_function("riemann_theta")
        deriv_func = mod.get_function("riemann_theta_derivatives")
        xd = mod.get_global("xd")[0]
        yd = mod.get_global("yd")[0]
        return func, deriv_func, xd, yd

Example #36

Show file

import os
import math
import numpy
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

from .utils import gpu_func
from .enums import MAX_BLOCK_SIZE, CUR_DIR, CACHE_DIR

mod = SourceModule(open(os.path.join(CUR_DIR, 'kernel/maxpool.cu')).read(),
                   cache_dir=CACHE_DIR)
maxpool_kernel = mod.get_function('maxpool_kernel')
maxpool_back_kernel = mod.get_function('maxpool_back_kernel')
d_a_size = mod.get_global('d_a_size')[0]
d_out_size = mod.get_global('d_out_size')[0]


@gpu_func
def maxpool(d_a, window_shape):
    h, w = window_shape
    in_z, in_y, in_x = d_a.shape
    out_z = in_z
    out_y = in_y / h
    out_x = in_x / w

    assert h * w < MAX_BLOCK_SIZE

    cuda.memcpy_htod(d_a_size, numpy.array(d_a.shape, dtype=numpy.int32))
    cuda.memcpy_htod(d_out_size,

Example #37

Show file

        gmemPos += gmemStride;
    }
}
'''
template = string.Template(template)
code = template.substitute(KERNEL_RADIUS = KERNEL_RADIUS,
                           KERNEL_W = KERNEL_W,
                           COLUMN_TILE_H=COLUMN_TILE_H,
                           COLUMN_TILE_W=COLUMN_TILE_W,
                           ROW_TILE_W=ROW_TILE_W,
                           KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)

module = SourceModule(code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
d_Kernel_columns = module.get_global('d_Kernel_columns')[0]

# Helper functions for computing alignment...
def iDivUp(a, b):
    # Round a / b to nearest higher integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a / b + 1) if (a % b != 0) else (a / b)

def iDivDown(a, b):
    # Round a / b to nearest lower integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a / b;

Example #38

Show file

File: main_adaptation.py Project: schroeder-dewitt/polyomino-self-assembly

def main():

    #Initialise InteractionMatrix
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Initialise GPU (equivalent of autoinit)
    drv.init()
    assert drv.Device.count() >= 1
    dev = drv.Device(0)
    ctx = dev.make_context(0)

    #Convert GlobalParams to List
    GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32)
    count = 0
    for x in GlobalParamsDict.keys():
        GlobalParams[count] = GlobalParamsDict[x]
        count += 1

    #Convert FitnessParams to List
    FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32)
    count = 0
    for x in FitnessParamsDict.keys():
        FitnessParams[count] = FitnessParamsDict[x]
        count += 1

    #Convert GAParams to List
    GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32)
    count = 0
    for x in GAParamsDict.keys():
        GAParams[count] = GAParamsDict[x]
        count += 1

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main_discoverytime', './templates'))

    # Load source code from file
    Source = env.get_template('./kernel.cu') #Template( file(KernelFile).read() )

    #Create dictionary argument for rendering
    RenderArgs= {"params_size":GlobalParams.nbytes,\
                "fitnessparams_size":FitnessParams.nbytes,\
                "gaparams_size":GAParams.nbytes,\
                "genome_bytelength":int(ByteLengthGenome),\
                "genome_bitlength":int(BitLengthGenome),\
                "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\
                "textures":range( 0, NrFitnessFunctionGrids ),\
                "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\
                "with_mixed_crossover":WithMixedCrossover,
                "with_bank_conflict":WithBankConflict,
                "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection,
                "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues,
                "with_uniform_crossover":WithUniformCrossover,
                "with_single_point_crossover":WithSinglePointCrossover,
                "with_surefire_mutation":WithSurefireMutation,
                "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory,
                "ga_threaddimx":int(GA_ThreadDim),
                "glob_nr_tiletypes":int(NrTileTypes),
                "glob_nr_edgetypes":int(NrEdgeTypes),
                "glob_nr_tileorientations":int(NrTileOrientations),
                "fit_dimgridx":int(DimGridX),
                "fit_dimgridy":int(DimGridY),
                "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids),
                "fit_nr_fourpermutations":int(NrFourPermutations),
                "fit_assembly_redundancy":int(NrAssemblyRedundancy),
                "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock),
                "sort_threaddimx":int(Sort_ThreadDimX),
                "glob_nr_genomes":int(NrGenomes),
                "fit_dimthreadx":int(ThreadDimX),
                "fit_dimthready":int(ThreadDimY),
                "fit_dimsubgridx":int(SubgridDimX),
                "fit_dimsubgridy":int(SubgridDimY),
                "fit_nr_subgridsperbank":int(NrSubgridsPerBank),
                "glob_bitlength_edgetype":int(EdgeTypeBitLength),
                "fitness_break_value":int(BitLengthGenome),   # ADAPTED FOR DISCOVERY KERNEL
                "fitness_flag_index":int(NrGenomes)
                }

    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_20", code="sm_20", cache_dir=None)

    #Allocate values on GPU
    Genomes_h = drv.mem_alloc(Genomes.nbytes)
    FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes)
    FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes)
    AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes)
    Mutexe_h = drv.mem_alloc(Mutexe.nbytes)
    #ReductionList_h = drv.mem_alloc(ReductionList.nbytes)

    #Copy values to global memory
    drv.memcpy_htod(Genomes_h, Genomes)
    drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums)
    drv.memcpy_htod(FitnessValues_h, FitnessValues)
    drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
    drv.memcpy_htod(Mutexe_h, Mutexe)

    #Copy values to constant / texture memory
    for id in range(0, NrFitnessFunctionGrids):
        FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) )
        drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C")
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")

    GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address
    drv.memcpy_htod(GlobalParams_h[0], GlobalParams)
    FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address
    drv.memcpy_htod(FitnessParams_h[0], FitnessParams)
    GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address
    drv.memcpy_htod(GAParams_h[0], GAParams)
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)
    FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst")
    FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst")

    #Set up curandStates
    curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper)
    CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes)

    #Compile kernels
    curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel")
    #fitness_fnc = KernelSourceModule.get_function("FitnessKernel")
    sorting_fnc = KernelSourceModule.get_function("SortingKernel")
    ga_fnc = KernelSourceModule.get_function("GAKernel")

    #Initialise Curand
    curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1))

    #Build parameter lists for FitnessKernel and GAKernel
    FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h);
    SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h)
    GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h);

    #TEST ONLY
    #return #ADAPTED
    #TEST ONLY

    #START ADAPTED
    print "GENOMES NOW:\n"
    print Genomes
    print ":::STARTING KERNEL EXECUTION:::"
    #STOP ADAPTED

    #Discovery time parameters
    min_fitness_value = BitLengthGenome # Want all bits set
    mutation_rate = -2.0 #normally: -2


    #Define Numpy construct to sideways join arrays (glue columns together)
    #Taken from: http://stackoverflow.com/questions/5355744/numpy-joining-structured-arrays
    #def join_struct_arrays(arrays):
    #    sizes = np.array([a.itemsize for a in arrays])
    #    offsets = np.r_[0, sizes.cumsum()]
    #    n = len(arrays[0])
    #    joint = np.empty((n, offsets[-1]), dtype=np.int32)
    #    for a, size, offset in zip(arrays, sizes, offsets):
    #        joint[:,offset:offset+size] = a.view(np.int32).reshape(n,size)
    #    dtype = sum((a.dtype.descr for a in arrays), [])
    #    return joint.ravel().view(dtype)
    #Test join_struct_arrays:
    #a = np.array([[1, 2], [11, 22],  [111, 222]]).astype(np.int32);
    #b = np.array([[3, 4], [33, 44],  [333, 444]]).astype(np.int32);
    #c = np.array([[5, 6], [55, 66],  [555, 666]]).astype(np.int32);
    #print "Test join_struct_arrays:"
    #print join_struct_arrays([a, b, c]) #FAILED
    #Set up PYTABLES
    #class GAGenome(IsDescription):
        #gen_id = Int32Col()
        #fitness_val = Float32Col()
        #genome = StringCol(mByteLengthGenome)
        #last_nr_mutations = Int32Col() # Contains the Nr of mutations genome underwent during this generation
        #mother_id = Int32Col() # Contains the crossover "mother"
        #father_id = Int32Col()  # Contains the crossover "father" (empty if no crossing over)
        #assembledgrid      = StringCol(DimGridX*DimGridY)   # 16-character String
    #class GAGenerations(IsDescription):
    #    nr_generations = Int32Col()
    #    nr_genomes = Int32Col()
    #    mutation_rate = Float32Col() # Contains the Nr of mutations genome underwent during this generation

    #from datetime import datetime

    #filename = "fujiama_"+str(NrGenomes)+"_"+str(RateMutation)+"_"+".h5"
    #print filename
    #h5file = openFile(filename, mode = "w", title = "GA FILE")
    #group = h5file.createGroup("/", 'fujiama_ga', 'Fujiama Genetic Algorithm output')
    #table = h5file.createTable(group, 'GaGenerations', GAGenerations, "Raw data")
    #atom = Atom.from_dtype(np.float32)

    #Initialise File I/O
    FILE = open("fujiamakernel_nrgen-" + str(NrGenomes) + "_adaptation.plot", "w")

    #ADAPTED FOR TESTING HISTOGRAM
    #TestValues = [13,24,26,31,32,14]
    #print np.histogram(TestValues,  bins=[0, 24, 32])[0][1]
    #quit()

    #Initialise CUDA timers
    start = drv.Event()
    stop = drv.Event()


    while mutation_rate < 1: # normally: 1

        #ds = h5file.createArray(f.root, 'ga_raw_'+str(mutation_rate), atom, x.shape)
        mutation_rate += 0.1
        GAParams[0]  = 10.0 ** mutation_rate
        drv.memcpy_htod(GAParams_h[0], GAParams)
        print "Mutation rate: ", GAParams[0]

        #ADAPTED: Initialise global memory (absolutely necessary!!)
        drv.memcpy_htod(Genomes_h, Genomes)
        drv.memcpy_htod(FitnessValues_h, FitnessValues)
        drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
        drv.memcpy_htod(Mutexe_h, Mutexe)

        #execute kernels for specified number of generations
        start.record()

        biggest_fit = 0
	reprange = 100
        average_breakup = np.zeros((reprange)).astype(np.float32)
        
        for rep in range(0, reprange):
            breakup_generation = GlobalParamsDict["NrGenerations"]
            dontcount = 0
            #ADAPTED: Initialise global memory (absolutely necessary!!)
            drv.memcpy_htod(Genomes_h, Genomes)
            drv.memcpy_htod(FitnessValues_h, FitnessValues)
            drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
            drv.memcpy_htod(Mutexe_h, Mutexe)

            #execute kernels for specified number of generations
            start.record()
            for gen in range(0, GlobalParamsDict["NrGenerations"]):
                #print "Processing Generation: %d"%(gen)

                #Launch CPU processing (should be asynchroneous calls)

                sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel
                drv.memcpy_dtoh(FitnessPartialSums, FitnessPartialSums_h) #Copy from Device to Host and finish sorting
                FitnessSumConst = FitnessPartialSums.sum()
                drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory
                #drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitnessValues from Device to Device Const #TEST

                ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids) #TEST
                #Note: Fitness Function is here integrated into GA kernel!

                drv.memcpy_dtoh(Genomes_res, Genomes_h) #Copy data from GPU
                drv.memcpy_dtoh(FitnessValues_res, FitnessValues_h)
                #drv.memcpy_dtoh(AssembledGrids_res, AssembledGrids_h) #Takes about as much time as the whole simulation!

                #print FitnessValues_res

                #maxxie = FitnessValues_res.max()
                #if maxxie > biggest_fit:
            	#    biggest_fit = maxxie

                #print "max fitness:", maxxie
                #if maxxie >= 25.0 and breakup_generation == -1:
	        if np.histogram(FitnessValues_res,  (0, 24, 32))[0][1]  >= NrGenomes/2:
                    breakup_generation = gen
                    break
                # else:
                #    breakup_generation = -1
                #if FitnessValues[NrGenomes]  == float(1):
                #    breakup_generation = i
                #    break
                #else:
                #    breakup_generation = -1

                #maxxie = FitnessValues_res.max()
                #if maxxie >= 30:
                #    print "Max fitness value: ", FitnessValues_res.max()
                #ds[:]  = FitnessValues #join_struct_arrays(Genomes,  FitnessValues,  AssembledGrids);
                #trow = table.row
                #trow['nr_generations'] = NrGenerations
                #trow['nr_genomes'] = NrGenomes
                #trow['mutation_rate'] = mutation_rate
                #trow.append()
                #trow.flush()

            stop.record()
            stop.synchronize()
            print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
            print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / breakup_generation)
            print "Discovery time (generations) for mutation rate %f: %d"%(GAParams[0],  breakup_generation)
            #print "Max:", biggest_fit

	    #TEST MODE
	    #print "Printing all FitnessValues now:"
	    #print FitnessValues_res
            #raw_input("Next redundancy with keystroke!...");

            #if breakup_generation==0:
            #    print FitnessValues_res
            #    print "Genomes: "
            #    print Genomes_res
            average_breakup[rep] = breakup_generation / reprange
            #if breakup_generation == -1:
            #    dontcount = 1
            #    break

        #if dontcount == 1:
        #    average_breakup.fill(20000)
        FILE.write( str(GAParams[0]) + " " + str(np.median(average_breakup)) + " " + str(np.std(average_breakup)) + "\n");
        FILE.flush()

    #Clean-up pytables
    #h5file.close()
    #Clean up File I/O
    FILE.close()

Example #39

Show file

def bloom_gpu(img, threshold, sigma, sigma_n):
    kernel_radius = np.int32(sigma_n * sigma + 0.5)
    poly = np.polynomial.Polynomial([0, 0, -0.5 / (sigma * sigma)])
    x = np.arange(-kernel_radius, kernel_radius + 1)
    kernel = np.exp(poly(x), dtype=np.float32)
    kernel /= kernel.sum()
    bloom_cuda_source_template = """
        /* 
         * Function:    luminance
         * ----------------------   
         * Performs perceptual luminance-preserving conversion of sRGB image to grayscale image.
         * @param lum: resulting grayscale (1-channel) image as 1D float array;
         *             1D-mapping: from left to right, from top to bottom.
         * @param img: sRGB (3-channel) image as 1D float array;
         *             1D-mapping: from R to B, from left to right, from top to bottom.
         * @param n: total number of pixels in image (length of lum array) as single-element int array.
         */ 
        __global__ void luminance(float *lum, float *img, int *n)
        {
            // Luminance perception coefficients for sRGB.
            const float c_r = 0.2126;
            const float c_g = 0.7152;
            const float c_b = 0.0722;
            
            // Get 1D-mapped index of pixel.
            int g_idx = blockIdx.x * blockDim.x + threadIdx.x;
            
            // Perform perceptual luminance-preserving conversion of sRGB image to grayscale image
            // (one grayscale pixel per thread)
            // after checking that 1D-mapped index of pixel is less than total number of pixels.
            if (g_idx < n[0])
                lum[g_idx] = c_r * img[g_idx * 3 + 0] + c_g * img[g_idx * 3 + 1] + c_b * img[g_idx * 3 + 2];
        }

        /*
         * Function:    array_max
         * ----------------------
         * Finds the maximum value in 1D array using reduction technique.
         * @param max_val: resulting maximum value in array as single-element float array.
         * @param array: data as 1D float array.
         * @param n: number of elements in array as single-element int array.
         * @param mutex: mutex value (need to be zero) as single-element int array.
         */
        __global__ void array_max(float *max_val, float *array, int *n, int *mutex)
        {
            extern __shared__ float sdata[];
            
            // Load array into shared memory.
            // The number of blocks is halved, so it is possible to load the maximum of two values:
            // the element within current block and the element with index shifted by size of the block.
            const int g_idx = __mul24(blockIdx.x, blockDim.x << 1) + threadIdx.x; 
            float x = -1.0;
            if(g_idx + blockDim.x < n[0])
                x = fmaxf(array[g_idx], array[g_idx + blockDim.x]);
            sdata[threadIdx.x] = x;
            __syncthreads();
            
            // Perform reduction within one block.
            if(threadIdx.x < 512 && sdata[threadIdx.x + 512] > sdata[threadIdx.x])
            {
                sdata[threadIdx.x] = sdata[threadIdx.x + 512];
            }
            __syncthreads();
            if(threadIdx.x < 256 && sdata[threadIdx.x + 256] > sdata[threadIdx.x])
            {
                sdata[threadIdx.x] = sdata[threadIdx.x + 256];
            }
            __syncthreads();
            if(threadIdx.x < 128 && sdata[threadIdx.x + 128] > sdata[threadIdx.x])
            {
                sdata[threadIdx.x] = sdata[threadIdx.x + 128];
            }
            __syncthreads();
            if(threadIdx.x < 64 && sdata[threadIdx.x + 64] > sdata[threadIdx.x])
            {
                sdata[threadIdx.x] = sdata[threadIdx.x + 64];
            }
            __syncthreads();
            
            // Single-warp threads in use now, no thread synchronisation needed. 
            if (threadIdx.x < 32)
            {
                sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 32]);
                sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 16]);
                sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 8]);
                sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 4]);
                sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 2]);
                sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 1]);
            }
            
            // The first thread updates maximum value using mutex to ensure update is correct.
            if(threadIdx.x == 0)
            {
                // Lock mutex
                while(atomicCAS(mutex,0,1) != 0);
                // Write result
                max_val[0] = fmaxf(max_val[0], sdata[0]);
                // Unlock mutex
                atomicExch(mutex, 0);
            }
        }

        /*
         * Function:    array_highpass
         * ---------------------------
         * Zeros elements of array which are less or equal to threshold.
         * @param array: data as 1D float array.
         * @param n: number of elements in array as single-element int array.
         * @param threshold: the maxmum value to zero as single-element float array.
         */
        __global__ void array_highpass(float *array, int *n, float *threshold)
        {
            // Get 1D-mapped index of pixel.
            int g_idx = blockIdx.x * blockDim.x + threadIdx.x;
            
            // Perform threshold-based zeroing
            // after checking that 1D-mapped index of pixel is less than total number of pixels.
            if (g_idx < n[0])
                if (array[g_idx] <= threshold[0])
                    array[g_idx] = 0.0;
        }

        
        #define kernel_radius           $kernel_radius
        #define kernel_radius_aligned   $kernel_radius_aligned
        #define kernel_w                $kernel_w
        #define row_tile_w              $row_tile_w
        #define col_tile_w              $col_tile_w
        #define col_tile_h              $col_tile_h

        __device__ __constant__ float kernel[kernel_w];

        /*
         * Function:    convolution_row
         * ---------------------------
         * Performs 1D convolution for image in horizontal direction using predefined above kernel.
         * @param input: data as 1D float array.
         * @param dataW: width of image in pixels as pointer to int.
         * @param dataH: height of image in pixels as pointer to int.
         */
        __global__ void convolution_row(float *input,
                                        int *dataW,
                                        int *dataH)
        {
            extern __shared__ float data[];
            // Define working area
            const int tile_start = __mul24(blockIdx.x, row_tile_w);
            const int tile_end = tile_start + row_tile_w;
            const int apron_start = tile_start - kernel_radius;
            const int apron_end = tile_end + kernel_radius;
            
            // tile_start is clamped by definition!
            const int tile_end_clamped = min(tile_end, dataW[0] - 1);
            const int apron_start_clamped = max(apron_start, 0);
            const int apron_end_clamped = min(apron_end, dataW[0] - 1);

            const int row_start = __mul24(blockIdx.y, dataW[0]);
            const int apron_start_aligned = tile_start - kernel_radius_aligned;
            const int load_pos = apron_start_aligned + threadIdx.x;
            
            // Transfer data to shared memory
            if (load_pos >= apron_start)
            {
                data[load_pos - apron_start] = 
                    (apron_start_clamped <= load_pos && load_pos <= apron_end_clamped) ?
                        input[row_start + load_pos] : 0;
            }

            __syncthreads();
            // Perform convolution and write result to global memory
            const int write_pos = tile_start + threadIdx.x;
            if (write_pos <= tile_end_clamped)
            {
                const int smem_pos = write_pos - apron_start;
                float sum = 0;
                """
    for k in range(-kernel_radius, kernel_radius + 1):
        bloom_cuda_source_template += string.Template(
            'sum += data[smem_pos + $k] * kernel[kernel_radius - $k];\n').substitute(k=k)
    bloom_cuda_source_template += """
                input[row_start + write_pos] = sum;
            }    
        }

        /*
         * Function:    convolution_column
         * ---------------------------
         * Performs 1D convolution for image in vertical direction using predefined above kernel.
         * @param input: data as 1D float array.
         * @param dataW: width of image in pixels as pointer to int.
         * @param dataH: height of image in pixels as pointer to int.
         * @param smem_stride: stride in shared memory array as pointer to int.
         * @param gmem_stride: stride in global memory array as pointer to int.
         */
        __global__ void convolution_column(float *input,
                                           int *dataW,
                                           int *dataH,
                                           int *smem_stride,
                                           int *gmem_stride)
        {
            extern __shared__ float data[];
            // Define working area
            const int tile_start = __mul24(blockIdx.y, col_tile_h);
            const int tile_end = tile_start + col_tile_h - 1;
            const int apron_start = tile_start - kernel_radius;
            const int apron_end = tile_end + kernel_radius;

            const int tile_end_clamped = min(tile_end, dataH[0] - 1);
            const int apron_start_clamped = max(apron_start, 0);
            const int apron_end_clamped = min(apron_end, dataH[0] - 1);

            const int column_start = __mul24(blockIdx.x, col_tile_w) + threadIdx.x;

            int smem_pos = __mul24(threadIdx.y, col_tile_w) + threadIdx.x;
            int gmem_pos = __mul24(apron_start + threadIdx.y, dataW[0]) + column_start;
            
            // Transfer data from global to shared memory.
            for (int y = apron_start + threadIdx.y; y < apron_end; 
                y += blockDim.y, smem_pos += smem_stride[0], gmem_pos += gmem_stride[0])
            {
                data[smem_pos] = 
                    (apron_start_clamped <= y && y <= apron_end_clamped) ? 
                        input[gmem_pos] : 0;
            }

            __syncthreads();
            // Perform convolution (each thread performs convolution several times to different parts of image)
            smem_pos = __mul24(threadIdx.y + kernel_radius, col_tile_w) + threadIdx.x;
            gmem_pos = __mul24(tile_start + threadIdx.y, dataW[0]) + column_start;
            for (int y = tile_start + threadIdx.y; y <= tile_end_clamped;
                y += blockDim.y, smem_pos += smem_stride[0], gmem_pos += gmem_stride[0])
            {
                float sum = 0;
                """
    for k in range(-kernel_radius, kernel_radius + 1):
        bloom_cuda_source_template += string.Template(
            'sum += data[smem_pos + __mul24($k, col_tile_w)] * kernel[kernel_radius - $k];\n').substitute(k=k)
    bloom_cuda_source_template += """
                input[gmem_pos] = sum;
            }
        }
        /*
         * Function:    array_add_sat255
         * -----------------------------
         * Performs addition with saturation to 255.0 of sRGB (3-channel) image with grayscale (1-channel) image.
         * @param array3: initial and resulting sRGB (3-channel) image as 1D float array;
         *                1D-mapping: from R to B, from left to right, from top to bottom.
         * @param array1: grayscale (1-channel) image as 1D float array;
         *                1D-mapping: from left to right, from top to bottom.
         * @param w: width of image in pixels as single-element int array.
         * @param n: total number of pixels in image (length of array1) as single-element int array.
         */
        __global__ void arrays_add_sat255(float *array3, float *array1, int *w, int *h)
        {
            int px_x = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
            int px_y = __mul24(blockIdx.y, blockDim.y) + threadIdx.y;
            int px = px_y * w[0] + px_x;
            array3[px * 3 + 0] = array3[px * 3 + 0] + array1[px];
            array3[px * 3 + 1] = array3[px * 3 + 1] + array1[px];
            array3[px * 3 + 2] = array3[px * 3 + 2] + array1[px];
            if (array3[px * 3 + 0] > 255.0)
                array3[px * 3 + 0] = 255.0;
            if (array3[px * 3 + 1] > 255.0)
                array3[px * 3 + 1] = 255.0;
            if (array3[px * 3 + 2] > 255.0)
                array3[px * 3 + 2] = 255.0;
        }
    """
    kernel_radius_aligned = int(16)
    row_tile_w = int(128)
    col_tile_w = int(16)
    col_tile_h = int(48)
    bloom_cuda_source = string.Template(bloom_cuda_source_template). \
        substitute(kernel_radius=kernel_radius,
                   kernel_radius_aligned=kernel_radius_aligned,
                   kernel_w=kernel.size,
                   row_tile_w=row_tile_w,
                   col_tile_w=col_tile_w,
                   col_tile_h=col_tile_h)
    bloom_cuda_module = SourceModule(bloom_cuda_source)

    img = img.astype(np.float32)
    sat = np.empty_like(img)
    w = np.int32(img.shape[1])
    h = np.int32(img.shape[0])
    n = np.int32(w * h);
    w_al = np.int32(int_align_up(w, 16))
    smem_stride = np.int32(col_tile_w * 8)
    gmem_stride = np.int32(w_al * 8)
    lum = np.zeros((h, w), np.float32)
    img_g = drv.mem_alloc(img.nbytes)
    lum_g = drv.mem_alloc(lum.nbytes)
    kernel_g = bloom_cuda_module.get_global('kernel')[0]
    w_g = drv.mem_alloc(4)
    h_g = drv.mem_alloc(4)
    n_g = drv.mem_alloc(4)
    w_al_g = drv.mem_alloc(4)
    smem_stride_g = drv.mem_alloc(4)
    gmem_stride_g = drv.mem_alloc(4)
    drv.memcpy_htod(img_g, img)
    drv.memcpy_htod(lum_g, lum)
    drv.memcpy_htod(kernel_g, kernel)
    drv.memcpy_htod(w_g, w)
    drv.memcpy_htod(h_g, h)
    drv.memcpy_htod(n_g, n)
    drv.memcpy_htod(w_al_g, w_al)
    drv.memcpy_htod(smem_stride_g, smem_stride)
    drv.memcpy_htod(gmem_stride_g, gmem_stride)

    block_size = 1024
    grid_size = int_div_up(n, block_size)
    bloom_cuda_module.get_function("luminance")(
        lum_g, img_g, n_g,
        block=(block_size, 1, 1),
        grid=(grid_size, 1, 1))
    lum_max = np.zeros((1, 1), np.float32)
    mutex = np.zeros((1, 1), np.int32)
    bloom_cuda_module.get_function("array_max")(
        drv.Out(lum_max), lum_g, n_g, drv.In(mutex),
        block=(block_size, 1, 1),
        grid=(grid_size >> 1, 1),
        shared=block_size * 4)
    lum_threshold = np.float32(lum_max * threshold)
    bloom_cuda_module.get_function("array_highpass")(
        lum_g, n_g, drv.In(lum_threshold),
        block=(block_size, 1, 1),
        grid=(grid_size, 1))
    block_size_x = int(kernel_radius_aligned + row_tile_w + kernel_radius)
    grid_size_x = int(int_div_up(w_al, row_tile_w))
    grid_size_y = int(h)
    shared_size = int((kernel_radius + row_tile_w + kernel_radius) * 4)
    bloom_cuda_module.get_function("convolution_row")(
        lum_g, w_al_g, h_g,
        block=(block_size_x, 1, 1),
        grid=(grid_size_x, grid_size_y),
        shared=shared_size)
    grid_size_x = int(int_div_up(w_al, col_tile_w))
    grid_size_y = int(int_div_up(h, col_tile_h))
    shared_size = int(col_tile_w * (kernel_radius + row_tile_w + kernel_radius) * 4)
    bloom_cuda_module.get_function("convolution_column")(
        lum_g, w_al_g, h_g, smem_stride_g, gmem_stride_g,
        block=(col_tile_w, 8, 1),
        grid=(grid_size_x, grid_size_y),
        shared=shared_size)
    bloom_cuda_module.get_function("arrays_add_sat255")(
        img_g, lum_g, w_g, h_g,
        block=(block_size, 1, 1),
        grid=(grid_size, 1)
    )
    drv.memcpy_dtoh(sat, img_g)
    sat = sat.astype(np.uint8)
    return sat

Example #40

Show file

                *pivot = pivot_local;
                atomicExch(&lock, 0);
                needlock = 0;
            }
        }
    }
}
""" % (size)

mod = SourceModule(kernel, options=["--ptxas-options=-v"])

start = timer()

phase_adj = mod.get_function("phase_adj")

entropia_gpu = mod.get_global('entropia')[0]
device_real = mod.get_global('device_real')[0]
device_imag = mod.get_global('device_imag')[0]

cuda.memcpy_htod(device_real, FFT.real.astype(np.float32))
cuda.memcpy_htod(device_imag, FFT.imag.astype(np.float32))
entropia_cpu = array([2147483647])
cuda.memcpy_htod(entropia_gpu, entropia_cpu.astype(np.float32))
phc0_cpu = array([0])
phc0_gpu = cuda.to_device(phc0_cpu.astype(np.int32))
phc1_cpu = array([0])
phc1_gpu = cuda.to_device(phc1_cpu.astype(np.int32))
pivot_cpu = array([0])
pivot_gpu = cuda.to_device(pivot_cpu.astype(np.int32))

phase_adj(

Example #41

Show file

File: main.py Project: schroeder-dewitt/polyomino-self-assembly

def main():

    #Initialise InteractionMatrix
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Initialise GPU (equivalent of autoinit)
    drv.init()
    assert drv.Device.count() >= 1
    dev = drv.Device(0)
    ctx = dev.make_context(0)

    #Convert GlobalParams to List
    GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32)
    count = 0
    for x in GlobalParamsDict.keys():
        GlobalParams[count] = GlobalParamsDict[x]
        count += 1

    #Convert FitnessParams to List
    FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32)
    count = 0
    for x in FitnessParamsDict.keys():
        FitnessParams[count] = FitnessParamsDict[x]
        count += 1

    #Convert GAParams to List
    GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32)
    count = 0
    for x in GAParamsDict.keys():
        GAParams[count] = GAParamsDict[x]
        count += 1

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', 'cuda'))

    # Load source code from file
    Source = env.get_template('kernel.cu') #Template( file(KernelFile).read() )

    #Create dictionary argument for rendering
    RenderArgs= {"params_size":GlobalParams.nbytes,\
                "fitnessparams_size":FitnessParams.nbytes,\
                "gaparams_size":GAParams.nbytes,\
                "genome_bytelength":int(ByteLengthGenome),\
                "genome_bitlength":int(BitLengthGenome),\
                "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\
                "textures":range( 0, NrFitnessFunctionGrids ),\
                "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\
                "with_mixed_crossover":WithMixedCrossover,
                "with_bank_conflict":WithBankConflict,
                "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection,
                "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues,
                "with_uniform_crossover":WithUniformCrossover,
                "with_single_point_crossover":WithSinglePointCrossover,
                "with_surefire_mutation":WithSurefireMutation,
                "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory,
                "ga_threaddimx":int(ThreadDim),
                "glob_nr_tiletypes":int(NrTileTypes),
                "glob_nr_edgetypes":int(NrEdgeTypes),
                "glob_nr_tileorientations":int(NrTileOrientations),
                "fit_dimgridx":int(DimGridX),
                "fit_dimgridy":int(DimGridY),
                "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids),
                "fit_nr_fourpermutations":int(NrFourPermutations),
                "fit_assembly_redundancy":int(NrAssemblyRedundancy),
                "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock),
                "sort_threaddimx":int(Sort_ThreadDimX),
                "glob_nr_genomes":int(NrGenomes),
                "fit_dimthreadx":int(ThreadDimX),
                "fit_dimthready":int(ThreadDimY),
                "fit_dimsubgridx":int(SubgridDimX),
                "fit_dimsubgridy":int(SubgridDimY),
                "fit_nr_subgridsperbank":int(NrSubgridsPerBank),
                "glob_bitlength_edgetype":int(EdgeTypeBitLength)
                }

    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_11", code="sm_11", cache_dir=None)

    #Allocate values on GPU
    Genomes_h = drv.mem_alloc(Genomes.nbytes)
    FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes)
    FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes)
    AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes)
    Mutexe_h = drv.mem_alloc(Mutexe.nbytes)
    ReductionList_h = drv.mem_alloc(ReductionList.nbytes)

    #Copy values to global memory
    drv.memcpy_htod(Genomes_h, Genomes)
    drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums)
    drv.memcpy_htod(FitnessValues_h, FitnessValues)
    drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
    drv.memcpy_htod(Mutexe_h, Mutexe)

    #Copy values to constant / texture memory
    for id in range(0, NrFitnessFunctionGrids):
        FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) )
        drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C")
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")

    GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address
    drv.memcpy_htod(GlobalParams_h[0], GlobalParams)
    FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address
    drv.memcpy_htod(FitnessParams_h[0], FitnessParams)
    GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address
    drv.memcpy_htod(GAParams_h[0], GAParams)
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)
    FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst")
    FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst")

    #Set up curandStates
    curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper)
    CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes)

    #Compile kernels
    curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel")
    fitness_fnc = KernelSourceModule.get_function("FitnessKernel")
    sorting_fnc = KernelSourceModule.get_function("SortingKernel")
    ga_fnc = KernelSourceModule.get_function("GAKernel")

    #Initialise Curand
    curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1))

    #Build parameter lists for FitnessKernel and GAKernel
    FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h);
    SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h)
    GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h);

    #TEST ONLY
    return
    #TEST ONLY

    #Initialise CUDA timers
    start = drv.Event()
    stop = drv.Event()

    #execute kernels for specified number of generations
    start.record()
    for gen in range(0, GlobalParamsDict["NrGenerations"]):
        #print "Processing Generation: %d"%(gen)

        #fitness_fnc(*(FitnessKernelParams), block=fit_blocks, grid=fit_grid)

        #Launch CPU processing (should be asynchroneous calls)

        sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel

        drv.memcpy_dtoh(ReductionList, ReductionList_h) #Copy from Device to Host and finish sorting
        FitnessSumConst = ReductionList.sum()
        drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory
        drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitneValues from Device to Device Const

        ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids)

        drv.memcpy_dtoh(Genomes, Genomes_h) #Copy data from GPU
        drv.memcpy_dtoh(FitnessValues, FitnessValues_h)
        drv.memcpy_dtoh(AssembledGrids, AssembledGrids_h)

    stop.record()
    stop.synchronize()
    print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
    print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
    pass

Example #42

Show file

File: GPUKernels.py Project: ksopyla/pyKMLib

class GPURBFEll(object):
    """RBF Kernel with ellpack format"""
    
    cache_size =100
    
    Gamma=1.0
    
    #template
    func_name='rbfEllpackILPcol2multi'
    
    #template    
    module_file = os.path.dirname(__file__)+'/cu/KernelsEllpackCol2.cu'
    
    #template
    texref_nameI='VecI_TexRef'
    texref_nameJ='VecJ_TexRef'
    
    max_concurrent_kernels=1
   
    def __init__(self,gamma=1.0,cache_size=100):
        """
        Initialize object
        
        Parameters
        -------------
        
        max_kernel_nr: int
            determines maximal concurrent kernel column gpu computation
        """
        self.cache_size=cache_size
  
        self.threadsPerRow=1
        self.prefetch=2        
        
        self.tpb=128
        self.Gamma = gamma
       
        
        
        
        
    def init_cuda(self,X,Y, cls_start, max_kernels=1 ):
        
        #assert X.shape[0]==Y.shape[0]
        self.max_concurrent_kernels = max_kernels 
        
        self.X =X
        self.Y = Y
        
        self.cls_start=cls_start.astype(np.int32)
        
        #handle to gpu memory for y for each concurrent classifier
        self.g_y=[]
        #handle to gpu memory for results for each concurrent classifier
        self.g_out=[] #gpu kernel out
        self.kernel_out=[] #cpu kernel out
        #blocks per grid for each concurrent classifier    
        self.bpg=[]
        
        #function reference
        self.func=[]
        
        #texture references for each concurrent kernel
        self.tex_ref=[]

        #main vectors 
        #gpu        
        self.g_vecI=[]
        self.g_vecJ=[]
        #cpu
        self.main_vecI=[]
        self.main_vecJ=[]    
        
        #cpu class 
        self.cls_count=[]
        self.cls=[]
        #gpu class
        self.g_cls_count=[]
        self.g_cls=[]
        
        self.sum_cls=[]
        
        for i in range(max_kernels):
            self.bpg.append(0)
            self.g_y.append(0)
            self.g_out.append(0)
            self.kernel_out.append(0)
            self.cls_count.append(0)
            self.cls.append(0)
            self.g_cls_count.append(0)
            self.g_cls.append(0)            
#            self.func.append(0)
#            self.tex_ref.append(0)
            self.g_vecI.append(0)
            self.g_vecJ.append(0)
#            self.main_vecI.append(0)
#            self.main_vecJ.append(0)
            self.sum_cls.append(0)
            
            
        self.N,self.Dim = X.shape
        column_size = self.N*4
        cacheMB = self.cache_size*1024*1024 #100MB for cache size   
        
        #how many kernel colums will be stored in cache
        cache_items = np.floor(cacheMB/column_size).astype(int)
        
        cache_items = min(self.N,cache_items)
        self.kernel_cache = pylru.lrucache(cache_items)        
        
        self.compute_diag()
        
        #cuda initialization
        cuda.init()        
        
        self.dev = cuda.Device(0)
        self.ctx = self.dev.make_context()

        #reade cuda .cu file with module code        
        with open (self.module_file,"r") as CudaFile:
            module_code = CudaFile.read();
        
        #compile module
        self.module = SourceModule(module_code,keep=True,no_extern_c=True)
        
        (g_gamma,gsize)=self.module.get_global('GAMMA')       
        cuda.memcpy_htod(g_gamma, np.float32(self.Gamma) )
        
        #get functions reference

        Dim =self.Dim        
        vecBytes = Dim*4
        for f in range(self.max_concurrent_kernels):
            gfun = self.module.get_function(self.func_name)
            self.func.append(gfun)

            #init texture for vector I
            vecI_tex=self.module.get_texref('VecI_TexRef')
            self.g_vecI[f]=cuda.mem_alloc( vecBytes)           
            vecI_tex.set_address(self.g_vecI[f],vecBytes)

            #init texture for vector J
            vecJ_tex=self.module.get_texref('VecJ_TexRef')
            self.g_vecJ[f]=cuda.mem_alloc( vecBytes)     
            vecJ_tex.set_address(self.g_vecJ[f],vecBytes)
            
            self.tex_ref.append((vecI_tex,vecJ_tex) )
            
            self.main_vecI.append(np.zeros((1,Dim),dtype=np.float32))
            self.main_vecJ.append(np.zeros((1,Dim),dtype=np.float32))
            
            texReflist = list(self.tex_ref[f])
            
            #function definition P-pointer i-int
            gfun.prepare("PPPPPPiiiiiiPPP",texrefs=texReflist)
            
        
        #transform X to particular format
        v,c,r=spf.csr2ellpack(self.X,align=self.prefetch)
        #copy format data structure to gpu memory
        
        self.g_val = cuda.to_device(v)
        self.g_col = cuda.to_device(c)
        self.g_len = cuda.to_device(r)
        self.g_sdot = cuda.to_device(self.Xsquare)
        
        self.g_cls_start = cuda.to_device(self.cls_start)
        
        
        
        
    def cls_init(self,kernel_nr,y_cls,cls1,cls2,cls1_n,cls2_n):
        """
        Prepare cuda kernel call for kernel_nr, copy data for particular binary classifier, between class 1 vs 2.
         
        Parameters
        ------------
        kernel_nr : int
            concurrent kernel number
        y_cls : array-like
            binary class labels (1,-1)
        cls1: int
            first class number
        cls2: int
            second class number
        cls1_n : int
            number of elements of class 1
        cls2_n : int
            number of elements of class 2
        kernel_out : array-like
            array for gpu kernel result, size=2*len(y_cls)
        
        """
        warp=32
        align_cls1_n =  cls1_n+(warp-cls1_n%warp)%warp
        align_cls2_n =  cls2_n+(warp-cls2_n%warp)%warp
        
        self.cls1_N_aligned=align_cls1_n

        sum_cls= align_cls1_n+align_cls2_n   
        self.sum_cls[kernel_nr] = sum_cls
              
        
        self.cls_count[kernel_nr] = np.array([cls1_n,cls2_n],dtype=np.int32)
        self.cls[kernel_nr] = np.array([cls1,cls2],dtype=np.int32)  
        
        self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr])
        
        self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr])
        
        self.bpg[kernel_nr] =int( np.ceil( (self.threadsPerRow*sum_cls+0.0)/self.tpb ))
        
        self.g_y[kernel_nr] =  cuda.to_device(y_cls)
        
        self.kernel_out[kernel_nr] = np.zeros(2*y_cls.shape[0],dtype=np.float32)
        
        ker_out = self.kernel_out[kernel_nr]      
        self.g_out[kernel_nr] = cuda.to_device(ker_out) # cuda.mem_alloc_like(ker_out)
        
    
        #add prepare for device functions
        
    
    
    def K2Col(self,i,j,i_ds,j_ds,kernel_nr):
        """ 
        computes i-th and j-th kernel column 

        Parameters
        ---------------
        i: int
            i-th kernel column number in subproblem
        j: int
            j-th kernel column number in subproblem

        i_ds: int
            i-th kernel column number in whole dataset
        j_ds: int
            j-th kernel column number in  whole dataset

        kernel_nr : int
            number of concurrent kernel
            
        ker2ColOut: array like
            array for output
        
        Returns
        -------
        ker2Col
        
        """ 
        
        #make i-th and j-the main vectors
        vecI= self.main_vecI[kernel_nr]
        vecJ= self.main_vecJ[kernel_nr]
        
#        self.X[i_ds,:].todense(out=vecI)        
#        self.X[j_ds,:].todense(out=vecJ)  
        
        #vecI.fill(0)
        #vecJ.fill(0)
        
        
        
        #self.X[i_ds,:].toarray(out=vecI)        
        #self.X[j_ds,:].toarray(out=vecJ)        
        
        vecI=self.X.getrow(i_ds).todense()
        vecJ=self.X.getrow(j_ds).todense()
        
        
        #copy them to texture
        cuda.memcpy_htod(self.g_vecI[kernel_nr],vecI)
        cuda.memcpy_htod(self.g_vecJ[kernel_nr],vecJ)
        
#        temp = np.empty_like(vecI)
#        cuda.memcpy_dtoh(temp,self.g_vecI[kernel_nr])        
#        print 'temp',temp
        #lauch kernel
        
        gfunc=self.func[kernel_nr]
        gy = self.g_y[kernel_nr]
        gout = self.g_out[kernel_nr]
        gN = np.int32(self.N)
        g_i = np.int32(i)
        g_j = np.int32(j)
        g_ids = np.int32(i_ds)
        g_jds = np.int32(j_ds)
        gNalign = np.int32(self.cls1_N_aligned)
        gcs = self.g_cls_start
        gcc = self.g_cls_count[kernel_nr]
        gc  = self.g_cls[kernel_nr]
        bpg=self.bpg[kernel_nr]
        
        
        #print 'start gpu i,j,kernel_nr ',i,j,kernel_nr
        #texReflist = list(self.tex_ref[kernel_nr])                
        #gfunc(self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc,block=(self.tpb,1,1),grid=(bpg,1),texrefs=texReflist)
        #print 'end gpu',i,j
        #copy the results
       
        #grid=(bpg,1),block=(self.tpb,1,1)
        gfunc.prepared_call((bpg,1),(self.tpb,1,1),self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc)
        
        cuda.memcpy_dtoh(self.kernel_out[kernel_nr],gout)

                
        
        return self.kernel_out[kernel_nr]
        
    def K_vec(self,vec):
        '''
        vec - array-like, row ordered data, should be not to big
        '''
        
        dot=self.X.dot(vec.T)  
        x2=self.Xsquare.reshape((self.Xsquare.shape[0],1))
        if(sp.issparse(vec)):        
            v2 = vec.multiply(vec).sum(1).reshape((1,vec.shape[0]))        
        else:
            v2 =  np.einsum('...i,...i',vec,vec)
        
        return np.exp(-self.Gamma*(x2+v2-2*dot))
        
    def compute_diag(self):
        """
        Computes kernel matrix diagonal
        """
        
        #for rbf diagonal consists of ones exp(0)==1
        self.Diag = np.ones(self.X.shape[0],dtype=np.float32)

        if(sp.issparse(self.X)):
            # result as matrix
            self.Xsquare = self.X.multiply(self.X).sum(1)
            #result as array
            self.Xsquare = np.asarray(self.Xsquare).flatten()
        else:
            self.Xsquare =np.einsum('...i,...i',self.X,self.X)
        
        
    def clean(self,kernel_nr):
        """ clean the kernel cache """
        #self.kernel_cache.clear()

        self.bpg[kernel_nr]=0

          
        
        
        


    def clean_cuda(self):
        '''
        clean all cuda resources
        '''
        
        
        for f in range(self.max_concurrent_kernels):
            
            #vecI_tex=??
            #self.g_vecI[f].free()     
            del self.g_vecI[f]

            #init texture for vector J
            #vecJ_tex=??
            #self.g_vecJ[f].free()
            del self.g_vecJ[f]
            self.g_cls_count[f].free()
            self.g_cls[f].free()
            self.g_y[f].free()
            self.g_out[f].free()

        #test it
        #del self.g_out[f] ??
        
        #copy format data structure to gpu memory
        
        self.g_val.free()
        self.g_col.free()
        self.g_len.free()
        self.g_sdot.free()
        self.g_cls_start.free()
         
        print self.ctx 
        self.ctx.pop()
        
        print self.ctx
        del self.ctx
        
        
        

    def predict_init(self, SV):
        """
        Init the classifier for prediction
        """        
        
        self.X =SV
        self.compute_diag()

Example #43

Show file

File: 030-sourcemodule-complex.py Project: wbkifun/my_stuff

kernels = '''
__constant__ float2 vc[1];

__global__ void vcf(float2 *psi) {
	int tid = threadIdx.x;

	__shared__ float2 spsi[TID_MAX];
	spsi[tid] = psi[tid];

	if ( tid < TID_MAX ) {
		psi[tid].x = vc[0].x * spsi[tid].x - vc[0].y * spsi[tid].y;
		psi[tid].y = vc[0].x * spsi[tid].y + vc[0].y * spsi[tid].x;
		//psi[tid].x = vc[0].x;
		//psi[tid].y = vc[0].y;
	}
}'''.replace('TID_MAX', str(35))
print kernels
mod = SourceModule(kernels)
vcf = mod.get_function('vcf')
vc_const, _ = mod.get_global('vc')
cuda.memcpy_htod(vc_const, vc)

psi_gpu = gpuarray.to_gpu(psi)
vcf(psi_gpu, block=(256,1,1), grid=(1,1))

assert np.linalg.norm(psi_gpu.get() - vc*psi) < 1e-6


ctx.pop()

Example #44

Show file

File: riemanntheta_cuda.py Project: tolsonMDA/abelfunctions

    def finite_sum_d(self, TILEWIDTH, TILEHEIGHT, g):
        template = """
 
    #include <stdlib.h>
    #include <stdio.h>
    #include <math.h>

    #define GENUS %d
    #define TILEHEIGHT %d
    #define TILEWIDTH %d

    __device__ __constant__ double Xd[GENUS*GENUS];
    __device__ __constant__ double Yinvd[GENUS * GENUS];
    __device__ __constant__ double Td[GENUS*GENUS];

    /***************************************************************************

    normpart
    --------

    A helper function for the finite sum functions. Computes:

    -pi * ||T*(n + fracshift)||^2

    = -pi * ||T * (n + (shift - intshift))||^2

    = -pi * ||T * (n + Yinv*y - round(Yinv*y))||^2

    ***************************************************************************/

    __device__ double normpart(int g, double* Sd_s, double* yd_s)
    {
      int tx = threadIdx.x;
      int ty = threadIdx.y;
      double norm = 0;
      int i,j,k;
      for (i = 0; i < g; i++) {
        double sum = 0;
        for (j = 0; j < g; j++) {
          double T_ij = Td[i*g + j];
          double n_j = Sd_s[tx*g + j];
          double shift_j = 0;
          for (k = 0; k < g; k++) {
            shift_j += Yinvd[g*j + k]*yd_s[ty*g + k];
          }
          sum += T_ij * (n_j + shift_j - round(shift_j));
        }
        norm += sum * sum;
      }
      return -M_PI * norm;
    }

    /*************************************************************************
    exppart
    -------

    A helper function for the finite sum functions. Computes:

    2pi * <(n - intshift), (1/2)X(n - intshift) + x>

    =2pi * <n - round(shift), (1/2)X(n - round(shift) + x>

    =2pi * <n - round(Yinv*y), (1/2)X(n - round(Yinv*y) + x>



    ***************************************************************************/

    __device__ double exppart(int g, double* Sd_s, 
                              double* xd_s, double* yd_s)
    {
      int tx = threadIdx.x;
      int ty = threadIdx.y;
      double exppart = 0;
      int i,j,k,h;
      for (i = 0; i < g; i++) {
        double n_i = Sd_s[tx*g + i];
        double shift_i = 0;
        for (k = 0; k < g; k++) {
          shift_i += Yinvd[k + i*g] * yd_s[ty*g + k];
        }
        double A = n_i - round(shift_i);
        double Xshift_i = 0;
        for (j = 0; j < g; j++) {
          double X_ij = Xd[j + i * g];
          double shift_j = 0;
          for (h = 0; h < g; h++) {
            shift_j += Yinvd[h + j * g] * yd_s[ty*g + h];
          }
          Xshift_i += (.5) * (X_ij * (Sd_s[tx*g + j] - round(shift_j)));
        }
        double B = Xshift_i + xd_s[ty*g + i];
        exppart += A * B;
      }
      return 2 * M_PI * exppart;
    }

    /**********************************************************************

    Derivative Product

    Computes: 
                       ___
                       | |    2*pi*I <d, n-intshift>
                       | |
                   d in derivs

    =                  ___
                       | |    2*pi*I <d, n-round(shift)>
                       | |
                   d in derivs
    =                  ___
                       | |    2*pi*I <d, n-round(Yinv*y)>
                       | |
                   d in derivs
    ************************************************************************/
    __device__ void deriv_prod(int g, double* Sd_s, double* yd_s, double* dpr, double* dpi,
                                 double* deriv_real, double* deriv_imag, int nderivs)
    {
      int tx = threadIdx.x;
      int ty = threadIdx.y;

      double total_real = 1;
      double total_imag = 0;

      int i,j,k;
      for (i = 0; i < nderivs; i++){
        double term_real = 0;
        double term_imag = 0;
        for (j = 0; j < g; j++){
          double shift_j = 0;
          for (k = 0; k < g; k++){
            shift_j += Yinvd[j*g + k] * yd_s[ty*g + k];
          }
          double intshift = round(shift_j);
          double nmintshift = Sd_s[tx*g + j] - intshift;
          term_real += deriv_real[j + g*i] * nmintshift;
          term_imag += deriv_imag[j + g*i] * nmintshift;
        }

        total_real = total_real * term_real - total_imag * term_imag;
        total_imag = total_real * term_imag + total_imag * term_real;
      }

        //Computes: (2*pi*i)^(nderivs) * (total_real + total_imag*i)
        double pi_mult = pow(2*M_PI, nderivs);
        /*Determines what the result of i^nderivs is, and performs the 
          correct multiplication afterwards.*/
        if (nderivs %% 4 == 0) {
            dpr[0] = pi_mult*total_real;
            dpi[0] = pi_mult*total_imag;
        }
        else if (nderivs %% 4 == 1) {
            dpr[0] = -pi_mult * total_imag;
            dpi[0] = pi_mult * total_real;
        }
        else if (nderivs %% 4 == 2) {
            dpr[0] = -pi_mult * total_real;
            dpi[0] = -pi_mult * total_imag;
        }
        else if (nderivs %% 4 == 3) {
            dpr[0] = pi_mult * total_imag;
            dpi[0] = -pi_mult * total_real;
        }
    }



    /***********************************************************************

    Finite Sum Without Derivatives Kernel Function

    ************************************************************************/
    __global__ void riemann_theta(double* fsum_reald, double* fsum_imagd,
                          double* xd, double* yd, double* Sd,
                          int g, int N, int K)
    {
      /*Built in variables to be used, br is block row, bc is
      block column, and similiarly for tr and tc.*/
      int bx = blockIdx.x;
      int by = blockIdx.y;
      int tx = threadIdx.x;
      int ty = threadIdx.y;

      __shared__ double Sd_s[TILEWIDTH * GENUS];
      __shared__ double xd_s[TILEHEIGHT * GENUS];
      __shared__ double yd_s[TILEHEIGHT * GENUS];

      /*Determine n_1, the start of the summation vector,
      the full vector is of the form n_1, n_2, ..., n_g*/
      int n_start = (bx * TILEWIDTH + tx) * g;
      /*Now n = S[n_start], S[n_start + 1], ..., S[n_start + (g - 1)]*/

      /*Determine z the point of evaluation*/
      int z_start = (by * TILEHEIGHT + ty) * g;
      /*Now x = (x[z_start], x[z_start + 1], ... , x[z_start + (g-1)],
      and similiarly for y.*/

      /*Load data into shared arrays*/
      int i;
      for (i = 0; i < g; i++) {
        Sd_s[tx*g + i] = Sd[n_start + i];
        xd_s[ty*g + i] = xd[z_start + i];
        yd_s[ty*g + i] = yd[z_start + i];
      }

      __syncthreads();

      if (n_start < N*g && z_start < K*g) {
        /*Compute the "cosine" and "sine" parts of the summand*/
        double ept, npt, cpt, spt;
        ept = exppart(g,Sd_s, xd_s, yd_s);
        npt = exp(normpart(g, Sd_s, yd_s));
        cpt = npt * cos(ept);
        spt = npt * sin(ept);

        fsum_reald[n_start/g + z_start/g * N] = cpt;
        fsum_imagd[n_start/g + z_start/g * N] = spt;
      }
    }

    /************************************************************************************

    Finite Sum with Derivatives Kernel Function

    ************************************************************************************/
    __global__ void riemann_theta_derivatives(double* fsum_reald, double* fsum_imagd,
                          double* xd, double* yd, double* Sd, double* deriv_real,
                          double* deriv_imag, int nderivs, int g, int N, int K)
    {
      /*Built in variables to be used, br is block row, bc is
      block column, and similiarly for tr and tc.*/
      int bx = blockIdx.x;
      int by = blockIdx.y;
      int tx = threadIdx.x;
      int ty = threadIdx.y;

      __shared__ double Sd_s[TILEWIDTH * GENUS];
      __shared__ double xd_s[TILEHEIGHT * GENUS];
      __shared__ double yd_s[TILEHEIGHT * GENUS];

      /*Determine n_1, the start of the summation vector,
      the full vector is of the form n_1, n_2, ..., n_g*/
      int n_start = (bx * TILEWIDTH + tx) * g;
      /*Now n = S[n_start], S[n_start + 1], ..., S[n_start + (g - 1)]*/

      /*Determine z the point of evaluation*/
      int z_start = (by * TILEHEIGHT + ty) * g;
      /*Now x = (x[z_start], x[z_start + 1], ... , x[z_start + (g-1)],
      and similiarly for y.*/

      /*Load data into shared arrays*/
      int i;
      for (i = 0; i < g; i++) {
        Sd_s[tx*g + i] = Sd[n_start + i];
        xd_s[ty*g + i] = xd[z_start + i];
        yd_s[ty*g + i] = yd[z_start + i];
      }

      __syncthreads();

      if (n_start < N*g && z_start < K*g) {
        /*Compute the "cosine" and "sine" parts of the summand*/
        double dpr[1];
        double dpi[1];
        dpr[0] = 0;
        dpi[0] = 0;
        double ept, npt, cpt, spt;
        ept = exppart(g,Sd_s, xd_s, yd_s);
        npt = exp(normpart(g, Sd_s, yd_s));
        cpt = npt * cos(ept);
        spt = npt * sin(ept);
        deriv_prod(g, Sd_s, yd_s, dpr, dpi, deriv_real, deriv_imag, nderivs);
        fsum_reald[n_start/g + z_start/g * N] = dpr[0] * cpt - dpi[0] * spt;
        fsum_imagd[n_start/g + z_start/g * N] = dpi[0] * cpt + dpr[0] * spt;
      }
    }

    """ %(g, TILEHEIGHT, TILEWIDTH)
        mod = SourceModule(template)
        func = mod.get_function("riemann_theta")
        deriv_func = mod.get_function("riemann_theta_derivatives")
        Xd = mod.get_global("Xd")[0]
        Yinvd = mod.get_global("Yinvd")[0]
        Td = mod.get_global("Td")[0]
        return (func, deriv_func, Xd, Yinvd, Td)

Example #45

Show file

            for k in range(farthest_index+1, NUM_CHARGERS):
                current_assignments[k] = current_assignments[k-1]+1
        
    sys.exit()

    """
else:
    defines += "#define NUM_THREADS " + str(999) + "\n" +\
               "#define TOTAL_WORK " + str(0) + "\n" +\
               "#define NUM_WORK_PER_THREAD " + str(0) + "\n" +\
               "#define LAST_THREAD_START_OFFSET " + str(0) + "\n"

mod = SourceModule(preamble + defines + kernel_approx_src, no_extern_c=True)

(routes_lengths_gpu, size_in_bytes) = mod.get_global("routes_lengths")
(stops_lengths_gpu, size_in_bytes) = mod.get_global("stops_lengths")

cuda.memcpy_htod(routes_lengths_gpu, routes_lengths_np)
cuda.memcpy_htod(stops_lengths_gpu, stops_lengths_np)

routes_gpu = cuda.mem_alloc(routes_np.nbytes)
cuda.memcpy_htod(routes_gpu, routes_np)

stops_gpu = cuda.mem_alloc(stops_np.nbytes)
cuda.memcpy_htod(stops_gpu, stops_np)

final_utilities_np = np.zeros((NUM_RUNS), dtype=np.float32)
final_utilities_gpu = cuda.mem_alloc(final_utilities_np.nbytes)

final_chargers_np = np.zeros((NUM_RUNS, NUM_CHARGER_INTS), dtype=np.uint32)

Example #46

Show file

File: model_build_run.py Project: zeticabrian/gprMax

def solve_gpu(currentmodelrun, modelend, G):
    """Solving using FDTD method on GPU. Implemented using Nvidia CUDA.

    Args:
        currentmodelrun (int): Current model run number.
        modelend (int): Number of last model to run.
        G (class): Grid class instance - holds essential parameters describing the model.

    Returns:
        tsolve (float): Time taken to execute solving
        memsolve (int): memory usage on final iteration in bytes
    """

    import pycuda.driver as drv
    from pycuda.compiler import SourceModule
    drv.init()

    # Suppress nvcc warnings on Windows
    if sys.platform == 'win32':
        compiler_opts = ['-w']
    else:
        compiler_opts = None

    # Create device handle and context on specifc GPU device (and make it current context)
    dev = drv.Device(G.gpu.deviceID)
    ctx = dev.make_context()

    # Electric and magnetic field updates - prepare kernels, and get kernel functions
    if Material.maxpoles > 0:
        kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3]), options=compiler_opts)
    else:   # Set to one any substitutions for dispersive materials
        kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1), options=compiler_opts)
    update_e_gpu = kernels_fields.get_function("update_e")
    update_h_gpu = kernels_fields.get_function("update_h")

    # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels
    updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0]
    updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0]
    if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem:
        raise GeneralError('Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'.format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name))
    else:
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)

    # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU
    if Material.maxpoles > 0:  # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values).
        update_e_dispersive_A_gpu = kernels_fields.get_function("update_e_dispersive_A")
        update_e_dispersive_B_gpu = kernels_fields.get_function("update_e_dispersive_B")
        G.gpu_initialise_dispersive_arrays()

    # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU
    G.gpu_set_blocks_per_grid()
    G.gpu_initialise_arrays()

    # PML updates
    if G.pmls:
        # Prepare kernels
        pmlmodulelectric = 'gprMax.pml_updates.pml_updates_electric_' + G.pmlformulation + '_gpu'
        kernelelectricfunc = getattr(import_module(pmlmodulelectric), 'kernels_template_pml_electric_' + G.pmlformulation)
        pmlmodulemagnetic = 'gprMax.pml_updates.pml_updates_magnetic_' + G.pmlformulation + '_gpu'
        kernelmagneticfunc = getattr(import_module(pmlmodulemagnetic), 'kernels_template_pml_magnetic_' + G.pmlformulation)
        kernels_pml_electric = SourceModule(kernelelectricfunc.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts)
        kernels_pml_magnetic = SourceModule(kernelmagneticfunc.substitute(REAL=cudafloattype, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsH.shape[1], NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts)
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels
        updatecoeffsE = kernels_pml_electric.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_pml_magnetic.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        # Set block per grid, initialise arrays on GPU, and get kernel functions
        for pml in G.pmls:
            pml.gpu_initialise_arrays()
            pml.gpu_get_update_funcs(kernels_pml_electric, kernels_pml_magnetic)
            pml.gpu_set_blocks_per_grid(G)

    # Receivers
    if G.rxs:
        # Initialise arrays on GPU
        rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G)
        # Prepare kernel and get kernel function
        kernel_store_outputs = SourceModule(kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1), options=compiler_opts)
        store_outputs_gpu = kernel_store_outputs.get_function("store_outputs")

    # Sources - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.voltagesources + G.hertziandipoles + G.magneticdipoles:
        kernels_sources = SourceModule(kernels_template_sources.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts)
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels
        updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        if G.hertziandipoles:
            srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(G.hertziandipoles, G)
            update_hertzian_dipole_gpu = kernels_sources.get_function("update_hertzian_dipole")
        if G.magneticdipoles:
            srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(G.magneticdipoles, G)
            update_magnetic_dipole_gpu = kernels_sources.get_function("update_magnetic_dipole")
        if G.voltagesources:
            srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(G.voltagesources, G)
            update_voltage_source_gpu = kernels_sources.get_function("update_voltage_source")

    # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.snapshots:
        # Initialise arrays on GPU
        snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(G)
        # Prepare kernel and get kernel function
        kernel_store_snapshot = SourceModule(kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1), options=compiler_opts)
        store_snapshot_gpu = kernel_store_snapshot.get_function("store_snapshot")

    # Iteration loop timer
    iterstart = drv.Event()
    iterend = drv.Event()
    iterstart.record()

    for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=not G.progressbars):

        # Get GPU memory usage on final iteration
        if iteration == G.iterations - 1:
            memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0]

        # Store field component values for every receiver
        if G.rxs:
            store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration),
                              rxcoords_gpu.gpudata, rxs_gpu.gpudata,
                              G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                              G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                              block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1))

        # Store any snapshots
        for i, snap in enumerate(G.snapshots):
            if snap.time == iteration + 1:
                if not G.snapsgpu2cpu:
                    store_snapshot_gpu(np.int32(i), np.int32(snap.xs),
                                       np.int32(snap.xf), np.int32(snap.ys),
                                       np.int32(snap.yf), np.int32(snap.zs),
                                       np.int32(snap.zf), np.int32(snap.dx),
                                       np.int32(snap.dy), np.int32(snap.dz),
                                       G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                       G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                       snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata,
                                       snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata,
                                       block=Snapshot.tpb, grid=Snapshot.bpg)
                else:
                    store_snapshot_gpu(np.int32(0), np.int32(snap.xs),
                                       np.int32(snap.xf), np.int32(snap.ys),
                                       np.int32(snap.yf), np.int32(snap.zs),
                                       np.int32(snap.zf), np.int32(snap.dx),
                                       np.int32(snap.dy), np.int32(snap.dz),
                                       G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                       G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                       snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata,
                                       snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata,
                                       block=Snapshot.tpb, grid=Snapshot.bpg)
                    gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(),
                                           snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), 0, snap)

        # Update magnetic field components
        update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                     G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata,
                     G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata,
                     G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg)

        # Update magnetic field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_magnetic(G)

        # Update magnetic field components for magetic dipole sources
        if G.magneticdipoles:
            update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration),
                                       floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                       srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata,
                                       srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata,
                                       G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                       block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1))

        # Update electric field components
        # If all materials are non-dispersive do standard update
        if Material.maxpoles == 0:
            update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata,
                         G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                         G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                         block=G.tpb, grid=G.bpg)
        # If there are any dispersive materials do 1st part of dispersive update
        # (it is split into two parts as it requires present and updated electric field values).
        else:
            update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                                      np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                      block=G.tpb, grid=G.bpg)

        # Update electric field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_electric(G)

        # Update electric field components for voltage sources
        if G.voltagesources:
            update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration),
                                      floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                      srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata,
                                      srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1))

        # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last)
        if G.hertziandipoles:
            update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration),
                                       floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                       srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata,
                                       srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata,
                                       G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                       block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1))

        # If there are any dispersive materials do 2nd part of dispersive update (it is split into two parts as it requires present and updated electric field values). Therefore it can only be completely updated after the electric field has been updated by the PML and source updates.
        if Material.maxpoles > 0:
            update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                                      np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      block=G.tpb, grid=G.bpg)

    # Copy output from receivers array back to correct receiver objects
    if G.rxs:
        gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G)

    # Copy data from any snapshots back to correct snapshot objects
    if G.snapshots and not G.snapsgpu2cpu:
        for i, snap in enumerate(G.snapshots):
            gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(),
                                   snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap)

    iterend.record()
    iterend.synchronize()
    tsolve = iterstart.time_till(iterend) * 1e-3

    # Remove context from top of stack and delete
    ctx.pop()
    del ctx

    return tsolve, memsolve

Example #47

Show file

File: cuda.py Project: svistunov/cubvis

class CudaRender:
    """Main class. Do all stuff"""
    def __init__(self, w = 800, h = 800, name = "CudaRender"):
        self.w = w
        self.h = h
        self.name = name
        self.buffers = {}
        self.pointers = {}
        self.scn = Blender.Scene.GetCurrent()
        self.mouse_states = {}
        self.cuda_buffers = {}
        self.mouse_coords = {GLUT_LEFT_BUTTON : (0,0)}
        self.frame = 0
        self.timebase = 0
        self.calculated_vis = False
        self.from_files = {'model' : False, 'cudaRes': False}
        self.kernel_step = 4
        self.secs = 0
        self.cuda_stop = False
        self.live = True
        self.memory_type = '1dtex' # 'global'
        
    def create_events(self):
        self.start = cuda_driver.Event()
        self.end = cuda_driver.Event()
        
    def load_models_from_blender(self):
        self.objs = Blender.Object.GetSelected()
        if self.objs:
            self.verts = []
            self.normals = []
            self.indexes = []
            index_offset = 0
            for obj in self.objs:
                mesh = obj.getData(mesh=True)
                mesh.quadToTriangle()
                mesh.transform(obj.getMatrix())
                counter = 0
                for face in mesh.faces:
                    self.indexes.extend(range(index_offset + counter,index_offset + counter + 3))
                    counter += 3
                    for v in face.v:
                        self.verts.extend(v.co)
                        self.normals.extend(face.no)
                index_offset = max(self.indexes) + 1
            self.verts = numpy.asarray(self.verts).astype(numpy.float32)
            self.indexes = numpy.asarray(self.indexes).astype(numpy.ushort)
            self.normals = numpy.asarray(self.normals).astype(numpy.float32)
        
    def save_models_data_to_files(self):
        numpy.savetxt(os.path.join(p, 'data', 'verts.dat'), self.verts)
        numpy.savetxt(os.path.join(p, 'data', 'normals.dat'), self.normals)
        numpy.savetxt(os.path.join(p, 'data', 'indexes.dat'), self.indexes)
    
    def save_cuda_res_to_files(self):
        numpy.savetxt(os.path.join(p, 'data', 'vis.dat'), self.vis)
        
    def load_cuda_res_from_files(self):
        self.vis = numpy.loadtxt(os.path.join(p, 'data', 'vis.dat')).astype(numpy.float32)
        
    def load_models_from_files(self):
        self.verts = numpy.loadtxt(os.path.join(p, 'data', 'verts.dat')).astype(numpy.float32)
        self.normals = numpy.loadtxt(os.path.join(p, 'data', 'normals.dat')).astype(numpy.float32)
        self.indexes = numpy.loadtxt(os.path.join(p, 'data', 'indexes.dat')).astype(numpy.ushort)
    
    def load(self):
        self.load_cam()
        self.design = numpy.loadtxt(os.path.join(p, 'data', 'des3d_240_21.txt')).astype(numpy.float32)
        if (self.from_files['model']):
            self.load_models_from_files()
        else:
            self.load_models_from_blender()
            self.save_models_data_to_files()
        if (self.from_files['cudaRes']):
            self.load_cuda_res_from_files()
        else:
            self.vis = numpy.zeros((1, self.verts.size/3), numpy.float32)
    
    def create_glut_window(self):
        glutInit(sys.argv)
        glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH)
        glutInitWindowSize(self.w,self.h)
        glutCreateWindow(self.name)
        glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_CONTINUE_EXECUTION)
        glEnable(GL_DEPTH_TEST)
        #TODO: get bgColor form blender
        glutDisplayFunc(self.display)
        glutReshapeFunc(self.reshape)
        glutKeyboardFunc(self.keyboard)
        glutMouseFunc(self.mouse_button)
        glutMotionFunc(self.mouse_move)
        if (load_cuda):
            import pycuda.gl.autoinit
            import pycuda.driver as cuda_driver
            self.create_events()
    
    def mouse_button(self, button, state, x, y):
        self.mouse_states[button] = state
        self.mouse_coords[button] = (x,y)
        
    def mouse_move(self, x, y):
        cam_move_button = GLUT_LEFT_BUTTON
        f = 0.01
        if cam_move_button in self.mouse_states and self.mouse_states[cam_move_button] == GLUT_DOWN:
            mod =  glutGetModifiers()
            dx = self.mouse_coords[cam_move_button][0] - x
            dy = self.mouse_coords[cam_move_button][1] - y
            self.mouse_coords[cam_move_button] = (x,y)
            if mod <> GLUT_ACTIVE_CTRL:
                self.cam['phi'] += f*dx
                self.cam['theta'] += f*dy
            if mod == GLUT_ACTIVE_CTRL:
                self.cam['r'] += f*dy
                if self.cam['r'] < 4: self.cam['r'] = 4
                if self.cam['r'] > 20 : self.cam['r'] = 20
            self.set_cam()
            glutPostRedisplay()
            
    def keyboard(self, key, x, y):
        code = ord(key)
        if (code == 27):#Esc
            glutLeaveMainLoop()
        if (key == ' '):#Space
            self.cuda_stop = not self.cuda_stop

    def reshape(self, width, height):
        self.w = width
        self.h = height
        glutPostRedisplay()
        
    def cuda_from_display(self):
        #TODO: cuda streams ?
        if (not self.calculated_vis and load_cuda and not self.cuda_stop):
            try:
                step = self.kernel_iterator.next()
                self.cuda_map()
                self.cuda_call_kernal_step(step)
                glutSetWindowTitle("%s kernel steps: %d" % (self.name, step))
                self.cuda_unmap()
            except StopIteration:
                self.cuda_print_secs()
                self.cuda_free_memory()
                self.calculated_vis = True
                self.save_cuda_res_to_files()
    
    def display(self):
        self.cuda_from_display()
        glViewport(0, 0, self.w, self.h)
        #TODO: do smoothing
        glEnable(GL_POLYGON_SMOOTH)
        glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT)
        self.put_buffer('aPos', 3)
        #self.put_buffer('aNorm', 3)
        self.put_buffer('aVis')
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, self.buffers['ind'])
        glDrawElements(GL_TRIANGLES, self.indexes.size, GL_UNSIGNED_SHORT, None)
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0)
        if (self.calculated_vis or self.cuda_stop): self.fps()
        glutSwapBuffers()
        if (not self.calculated_vis): glutPostRedisplay()
    
    def fps(self):
        self.frame += 1;
        time = glutGet(GLUT_ELAPSED_TIME)
        if (time - self.timebase > 1000):
            s = "FPS:%4.2f" % (self.frame*1000/(time - self.timebase))
            self.timebase = time
            self.frame = 0
            s = self.name + " " + s
            glutSetWindowTitle(s)
        
    def put_buffer(self, name, size = 1, type=GL_FLOAT):
        glBindBuffer(GL_ARRAY_BUFFER, self.buffers[name])
        glVertexAttribPointer(self.pointers[name], size, type, GL_FALSE, 0, None)
        glBindBuffer(GL_ARRAY_BUFFER, 0)
        
    def make_buffer(self, name, data, target=GL_ARRAY_BUFFER, usage=GL_STATIC_DRAW):
        self.buffers[name] = glGenBuffers(1)
        glBindBuffer(target, self.buffers[name])
        glBufferData(target, data.size*data.itemsize, data, usage)
        glBindBuffer(target, 0)
    
    def make_cuda_buffer(self, name):
        self.cuda_buffers[name] = cuda_gl.BufferObject(long(self.buffers[name]))

    def init_buffers(self):
        self.make_buffer('aVis', self.vis, GL_ARRAY_BUFFER, GL_DYNAMIC_DRAW)
        self.make_buffer('aPos', self.verts)
        #self.make_buffer('aNorm', self.normals)
        self.make_buffer('ind', self.indexes, GL_ELEMENT_ARRAY_BUFFER)
        if (not self.calculated_vis):
            self.make_cuda_buffer('aVis')
            if (self.memory_type == 'global'):
                self.make_cuda_buffer('aPos')
            #self.make_cuda_buffer('ind')
        
    def create_shaders(self):
        #TODO: get errors
        self.shaders = {}
        self.shaders['vertex'] = glCreateShader(GL_VERTEX_SHADER)
        self.shaders['fragment'] = glCreateShader(GL_FRAGMENT_SHADER)
        for (name,shader) in self.shaders.iteritems():
            source = open(os.path.join(p, 'glsl', name + '.glsl')).read()
            glShaderSource(shader, source)
            glCompileShader(shader)
        self.program = glCreateProgram()
        glAttachShader(self.program, self.shaders['vertex'])
        glAttachShader(self.program, self.shaders['fragment'])
        glLinkProgram(self.program)
        glUseProgram(self.program)
        #self.program.mvMatrixUniform = glGetUniformLocation(self.program, "uMVMatrix");
        
    def make_pointer(self, name):
        self.pointers[name] = glGetAttribLocation(self.program, name)
        glEnableVertexAttribArray(self.pointers[name])
        
    def init_pointers(self):
        self.make_pointer('aVis')
        self.make_pointer('aPos')
        #self.make_pointer('aNorm')
        
    def load_cam(self):
        self.cam_obj = self.scn.objects.camera
        cam = self.cam_obj.getData()
        self.cam = {}
        matrix = self.cam_obj.getMatrix()
        self.cam['pos'] = pos = matrix[3]
        self.cam['forwards'] = -matrix[2]
        self.cam['target'] = matrix[3] - matrix[2]
        self.cam['up'] = matrix[1]
        self.cam['fov'] = cam.angle
        self.cam['start'] = cam.clipStart
        self.cam['end'] = cam.clipEnd
        #TODO: FIX
        self.cam['r'] = math.sqrt(numpy.dot(pos,pos))
        self.cam['theta'] = math.acos(pos[2]/self.cam['r'])
        self.cam['phi'] = math.atan(pos[1]/pos[0])
    
    def set_cam(self):
        r = self.cam['r']; theta = self.cam['theta']; phi = self.cam['phi']
        self.cam['pos'][0] = r*math.sin(theta)*math.cos(phi)
        self.cam['pos'][1] =  r*math.sin(theta)*math.sin(phi)
        self.cam['pos'][2] = r*math.cos(theta)
        glMatrixMode(GL_PROJECTION)
        glLoadIdentity()
        gluPerspective(self.cam['fov'], self.w / self.h, self.cam['start'], self.cam['end']) 
        gluLookAt(self.cam['pos'][0], self.cam['pos'][1], self.cam['pos'][2],\
            0, 0, 0,\
            0, 0, 1)
    
    def set_matrix(self):
        self.set_cam()
        glMatrixMode(GL_MODELVIEW)
        glLoadIdentity()
        
    def cuda_print_secs(self):
        print "cuda time: %fs" % self.secs
        
    #TODO: optim block size and grid size
    def init_cuda(self):
        inc = {'common': os.path.join(p,'cu', 'common.cu')}
        template_params = {'inc': inc, 'dN' : self.design.size,
                           'visN': self.vis.size, 'vN' : self.verts.size,
                           'kernelStep' : self.kernel_step}
        kernel_code = Template(
            file = os.path.join(p, 'cu', 'vis_%s.cu' % (self.memory_type)), 
            searchList = [template_params],
          )
        self.cuda_module = SourceModule(kernel_code)
        self.cuda_call = self.cuda_module.get_function("vis")
        self.block_seize = (256, 1, 1)
        if (self.memory_type == 'global'):
             self.cuda_call.prepare("PPP", self.block_seize)
        if (self.memory_type == '1dtex'):
            self.cuda_call.prepare("P", self.block_seize)
        
    def cuda_get_memory(self):
        self.grid_dimensions =  (min(32, (self.vis.size+256-1) // 256 ), 1)
        self.cuda_mem = {}
        if (self.memory_type == 'global'):
            self.cuda_mem['normals_gpu'] = cuda_driver.mem_alloc(self.normals.nbytes)
            cuda_driver.memcpy_htod(self.cuda_mem['normals_gpu'], self.normals)
        self.cuda_mem['design_gpu'] = self.cuda_module.get_global('design')[0]
        cuda_driver.memcpy_htod(self.cuda_mem['design_gpu'], self.design)
        self.cuda_mem['kernel_n'] = self.cuda_module.get_global('kernelN')[0]
        if (self.memory_type == '1dtex'):
            self.put_data_to_cuda1dtex(self.normals, 'n_tex')
            self.put_data_to_cuda1dtex(self.verts, 'v_tex')
        
    def put_data_to_cuda1dtex(self, data, name):
        if (not name in self.cuda_mem):
            self.cuda_mem[name] = self.cuda_module.get_texref(name)
        self.cuda_mem[name + '_gpu'] = cuda_driver.to_device(data)
        self.cuda_mem[name].set_address(self.cuda_mem[name + '_gpu'], data.nbytes)
        self.cuda_mem[name].set_format(cuda_driver.array_format.FLOAT, 1)
        
    def cuda_map(self):
        self.cuda_mem['map_vis'] = self.cuda_buffers['aVis'].map()
        if (self.memory_type == 'global'):
            self.cuda_mem['map_pos'] = self.cuda_buffers['aPos'].map()
        
    def cuda_unmap(self):
        cuda_driver.Context.synchronize()
        self.cuda_mem['map_vis'].unmap()
        if (self.memory_type == 'global'):
            self.cuda_mem['map_pos'].unmap()
    
    def cuda_free_memory(self):
        pass

    def cuda_call_kernel_global(self):
        self.cuda_call.prepared_call(self.grid_dimensions, self.cuda_mem['map_vis'].device_ptr(),
            self.cuda_mem['map_pos'].device_ptr(), self.cuda_mem['normals_gpu']
            )
        
    def cuda_call_kernel_1dtex(self):
        self.cuda_call.prepared_call(self.grid_dimensions, self.cuda_mem['map_vis'].device_ptr())

    def cuda_call_kernal_step(self, step):
        self.start.record()
        cuda_driver.memcpy_htod(self.cuda_mem['kernel_n'],  numpy.array([step]).astype(numpy.int32))
        getattr(self, 'cuda_call_kernel_' + self.memory_type)()
        self.end.record()
        self.end.synchronize()
        #cuda_driver.Context.synchronize() # ????/???/
        self.secs += self.start.time_till(self.end)*1e-3
        
    def cuda_kernel_iterator(self):
        self.kernel_iterator = iter(range(0, self.design.size, self.kernel_step))
        
    def call_cuda(self):
        self.secs = 0
        for i in range(0, self.design.size, self.kernel_step):
            self.cuda_call_kernal_step(i)
        self.cuda_print_secs()
        
    def run(self):
        self.load()
        self.create_glut_window()
        self.set_matrix()
        self.init_buffers()
        self.create_shaders()
        self.init_pointers()
        if (not self.calculated_vis and load_cuda):
            self.init_cuda()
            self.cuda_kernel_iterator()
            self.cuda_get_memory()
            if (not self.live):
                self.cuda_map()
                self.call_cuda()
                self.cuda_free_memory()
                self.cuda_unmap()
                self.calculated_vis = True
        glutMainLoop()

Example #48

Show file

File: main.py Project: schroeder-dewitt/polyomino-self-assembly

def main():

    #FourPermutations set-up
    FourPermutations = numpy.array([ [1,2,3,4],
                                  [1,2,4,3],
                                  [1,3,2,4],
                                  [1,3,4,2],
                                  [1,4,2,3],
                                  [1,4,3,2],
                                  [2,1,3,4],
                                  [2,1,4,3],
                                  [2,3,1,4],
                                  [2,3,4,1],
                                  [2,4,1,3],
                                  [2,4,3,1],
                                  [3,2,1,4],
                                  [3,2,4,1],
                                  [3,1,2,4],
                                  [3,1,4,2],
                                  [3,4,2,1],
                                  [3,4,1,2],
                                  [4,2,3,1],
                                  [4,2,1,3],
                                  [4,3,2,1],
                                  [4,3,1,2],
                                  [4,1,2,3],
                                  [4,1,3,2],]).astype(numpy.uint8)

    BankSize = 8 # Do not go beyond 8!

    #Define constants
    DimGridX = 19
    DimGridY = 19
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
    BlockDimX = 100
    BlockDimY = 100
    SearchSpaceSize = BlockDimX * BlockDimY  * 32
    #BlockDimX = 600
    #BlockDimY = 600
    FitnessValDim = SearchSpaceSize
    GenomeDim = SearchSpaceSize

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":4,
                 "bit_length_edge_type":3,
                 "curand_nr_threads_per_block":32,
                 "nr_tile_types":2,
                 "nr_edge_types":8,
                 "warpsize":32,
                 "fit_dim_thread_x":32*BankSize,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":BlockDimX,
                 "fit_dim_grid_x":19,
                 "fit_dim_grid_y":19,
                 "fit_nr_four_permutations":24,
                 "fit_length_movelist":244,
                 "fit_nr_redundancy_grid_depth":2,
                 "fit_nr_redundancy_assemblies":10,
                 "fit_tile_index_starting_tile":0,
                 "glob_nr_tile_orientations":4,
                 "banksize":BankSize,
                 "curand_dim_block_x":BlockDimX
                }
    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))

    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("SearchSpaceKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")


    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix



    #Set-up genomes
    dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
    #for i in range(0, GenomeDim/4):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
    #    dest[i*4 + 0] = 40
    #    dest[i*4 + 1] = 0
    #    dest[i*4 + 2] = 0
    #    dest[i*4 + 3] = 0

    dest_h = drv.mem_alloc(GenomeDim*4) #dest.nbytes)
    #drv.memcpy_htod(dest_h, dest)
    #print "Genomes before: "
    #print dest

    #Set-up grids
    #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids    

    #Set-up fitness values
    #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32)
    #fitness_h = drv.mem_alloc(fitness.nbytes)
    fitness_left = numpy.zeros(FitnessValDim).astype(numpy.float32)
    fitness_left_h = drv.mem_alloc(fitness_left.nbytes)
    fitness_bottom = numpy.zeros(FitnessValDim).astype(numpy.float32)
    fitness_bottom_h = drv.mem_alloc(fitness_bottom.nbytes)
    #drv.memcpy_htod(fitness_h, fitness)
    #print "Fitness values:"
    #print fitness

    #Set-up grids
    grids = numpy.zeros((10000*32, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids 

    #Set-up curand
    #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8);
    #curand_h = drv.mem_alloc(curand.nbytes)
    curand_h = drv.mem_alloc(40*GenomeDim)

    #Set-up four permutations
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)

    #SearchSpace control
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
   
    #Schedule kernel calls
    #MaxBlockDim = 100
    OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*32)
    MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*32)
    BlockCounter=0
    print "Will do that many kernels a ",BlockDimX,"x",BlockDimY,":", MaxBlockCycles

    for i in range(MaxBlockCycles):
        #Set-up timer
        start = drv.Event()
        stop = drv.Event()
        start.record()
        print "Start kernel:"

        #Call kernels
        CurandKernel(curand_h, block=(32,1,1), grid=(BlockDimX, BlockDimY))
        print "Finished Curand kernel, starting main kernel..."
        Kernel(dest_h, grids_h, fitness_left_h, fitness_bottom_h, curand_h, block=(32*BankSize,1,1), grid=(BlockDimX,BlockDimY))

        #End timer
        stop.record()
        stop.synchronize()
        print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
        #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
        pass

    #Output
    #drv.memcpy_dtoh(dest, dest_h)
    #print "Genomes after: "
    #print dest[0:4]
    
    drv.memcpy_dtoh(fitness_left, fitness_left_h)
    print "FitnessLeft after: "
    print fitness_left#[1000000:1000500]

    drv.memcpy_dtoh(fitness_bottom, fitness_bottom_h)
    print "FitnessBottom after: "
    print fitness_bottom#[1000000:1000500]

    drv.memcpy_dtoh(grids, grids_h)
    print "Grids[0] after: "
    for i in range(0,5):
        print "Grid ",i,": "
        print grids[i]

Example #49

Show file

def get_transduction_func(dtype, block_size, Xaddress, change_ind1,
                          change_ind2, change1, change2, compile_options):
    template = """
/* This is kept for documentation purposes the actual code used is after the end
 * of this template */
#include "curand_kernel.h"

extern "C" {
#include "stdio.h"

#define BLOCK_SIZE %(block_size)d
#define LA 0.5

/* Simulation Constants */
#define C_T     0.5     /* Total concentration of calmodulin */
#define G_T     50      /* Total number of G-protein */
#define PLC_T   100     /* Total number of PLC */
#define T_T     25      /* Total number of TRP/TRPL channels */
#define I_TSTAR 0.68    /* Average current through one opened TRP/TRPL channel (pA)*/

#define GAMMA_DSTAR     4.0 /* s^(-1) rate constant*/
#define GAMMA_GAP       3.0 /* s^(-1) rate constant*/
#define GAMMA_GSTAR     3.5 /* s^(-1) rate constant*/
#define GAMMA_MSTAR     3.7 /* s^(-1) rate constant*/
#define GAMMA_PLCSTAR   144 /* s^(-1) rate constant */
#define GAMMA_TSTAR     25  /* s^(-1) rate constant */

#define H_DSTAR         37.8    /* strength constant */
#define H_MSTAR         40      /* strength constant */
#define H_PLCSTAR       11.1    /* strength constant */
#define H_TSTARP        11.5    /* strength constant */
#define H_TSTARN        10      /* strength constant */

#define K_P     0.3     /* Dissociation coefficient for calcium positive feedback */
#define K_P_INV 3.3333  /* K_P inverse ( too many decimals are not important) */
#define K_N     0.18    /* Dissociation coefficient for calmodulin negative feedback */
#define K_N_INV 5.5555  /* K_N inverse ( too many decimals are not important) */
#define K_U     30      /* (mM^(-1)s^(-1)) Rate of Ca2+ uptake by calmodulin */
#define K_R     5.5     /* (mM^(-1)s^(-1)) Rate of Ca2+ release by calmodulin */
#define K_CA    1000    /* s^(-1) diffusion from microvillus to somata (tuned) */

#define K_NACA  3e-8    /* Scaling factor for Na+/Ca2+ exchanger model */

#define KAPPA_DSTAR         1300.0  /* s^(-1) rate constant - there is also a capital K_DSTAR */
#define KAPPA_GSTAR         7.05    /* s^(-1) rate constant */
#define KAPPA_PLCSTAR       15.6    /* s^(-1) rate constant */
#define KAPPA_TSTAR         150.0   /* s^(-1) rate constant */
#define K_DSTAR             100.0   /* rate constant */

#define F                   96485   /* (mC/mol) Faraday constant (changed from paper)*/
#define N                   4       /* Binding sites for calcium on calmodulin */
#define R                   8.314   /* (J*K^-1*mol^-1)Gas constant */
#define T                   293     /* (K) Absolute temperature */
#define VOL                 3e-9    /* changed from 3e-12microlitres to nlitres
                                     * microvillus volume so that units agree */

#define N_S0_DIM        1   /* initial condition */
#define N_S0_BRIGHT     2

#define A_N_S0_DIM      4   /* upper bound for dynamic increase (of negetive feedback) */
#define A_N_S0_BRIGHT   200

#define TAU_N_S0_DIM    3000    /* time constant for negative feedback */
#define TAU_N_S0_BRIGHT 1000

#define NA_CO           120     /* (mM) Extracellular sodium concentration */
#define NA_CI           8       /* (mM) Intracellular sodium concentration */
#define CA_CO           1.5     /* (mM) Extracellular calcium concentration */

#define G_TRP           8       /* conductance of a TRP channel */
#define TRP_REV         0       /* TRP channel reversal potential (mV) */

__device__ __constant__ long long int d_X[5];
__device__ __constant__ int change_ind1[14];
__device__ __constant__ int change1[14];
__device__ __constant__ int change_ind2[14];
__device__ __constant__ int change2[14];

/* cc = n/(NA*VOL) [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */
__device__ float num_to_mM(int n)
{
    return n * 5.5353e-4; // n/1806.6;
}

/* n = cc*VOL*NA [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */
__device__ float mM_to_num(float cc)
{
    return rintf(cc * 1806.6);
}

/* Assumes Hill constant (=2) for positive calcium feedback */
__device__ float compute_fp(float Ca_cc)
{
    float tmp = Ca_cc*K_P_INV;
    tmp *= tmp;
    return tmp/(1 + tmp);
}

/* Assumes Hill constant(=3) for negative calmodulin feedback */
__device__ float compute_fn(float Cstar_cc, float ns)
{
    float tmp = Cstar_cc*K_N_INV;
    tmp *= tmp*tmp;
    return ns*tmp/(1 + tmp);
}

/* Vm [V] */
__device__ float compute_ca(int Tstar, float Cstar_cc, float Vm)
{
    float I_in = Tstar*G_TRP*fmaxf(-Vm + 0.001*TRP_REV, 0);
    /* CaM = C_T - Cstar_cc */
    float denom = (K_CA + (N*K_U*C_T) - (N*K_U)*Cstar_cc + 179.0952 * expf(-(F/(R*T))*Vm));  // (K_NACA*NA_CO^3/VOL*F)
    /* I_Ca ~= 0.4*I_in */
    float numer = (0.4*I_in)/(2*VOL*F) +
                  ((K_NACA*CA_CO*NA_CI*NA_CI*NA_CI)/(VOL*F)) +  // in paper it's -K_NACA... due to different conventions
                  N*K_R*Cstar_cc;

    return fmaxf(1.6e-4, numer/denom);
}

__global__ void
transduction(curandStateXORWOW_t *state, float dt, %(type)s* d_Vm,
             %(type)s* g_ns, %(type)s* input,
             int* num_microvilli, int total_microvilli, int* count)
{
    int tid = threadIdx.x;
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    int wid = tid %% 32;
    int wrp = tid >> 5;

    __shared__ int X[BLOCK_SIZE][7];  // number of molecules
    __shared__ float Ca[BLOCK_SIZE];
    __shared__ float fn[BLOCK_SIZE];

    float Vm, ns, lambda;

    float sumrate, dt_advanced;
    int reaction_ind;
    ushort2 tmp;


    // copy random generator state locally to avoid accessing global memory
    curandStateXORWOW_t localstate = state[gid];


    int mid; // microvilli ID
    volatile __shared__ int mi[4]; // starting point of mid per ward

    // use atomicAdd to obtain the starting mid for the warp
    if(wid == 0)
    {
        mi[wrp] = atomicAdd(count, 32);
    }
    mid = mi[wrp] + wid;
    int ind;

    while(mid < total_microvilli)
    {
        // load photoreceptor index of the microvilli
        ind = ((ushort*)d_X[4])[mid];

        // load variables that are needed for computing calcium concentration
        tmp = ((ushort2*)d_X[2])[mid];
        X[tid][5] = tmp.x;
        X[tid][6] = tmp.y;

        Vm = d_Vm[ind]*1e-3;
        ns = g_ns[ind];

        // update calcium concentration
        Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
        fn[tid] = compute_fn(num_to_mM(X[tid][5]), ns);

        lambda = input[ind]/num_microvilli[ind];

        // load the rest of variables
        tmp = ((ushort2*)d_X[1])[mid];
        X[tid][4] = tmp.y;
        X[tid][3] = tmp.x;
        tmp = ((ushort2*)d_X[0])[mid];
        X[tid][2] = tmp.y;
        X[tid][1] = tmp.x;
        X[tid][0] = ((ushort*)d_X[3])[mid];

        // compute total rate of reaction
        sumrate = lambda;
        sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) );  //11
        sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]);  //12
        sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6];  // 10
        sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4];  // 8
        sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3];  // 7
        sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0];  // 1
        sumrate += KAPPA_DSTAR * X[tid][3];  // 6
        sumrate += GAMMA_GAP * X[tid][2] * X[tid][3];  // 4
        sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);  // 3
        sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]);  // 5
        sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0];  // 2
        sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                   (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                   X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5 ;  // 9

        // choose the next reaction time
        dt_advanced = -logf(curand_uniform(&localstate))/(LA + sumrate);

        // If the reaction time is smaller than dt,
        // pick the reaction and update,
        // then compute the total rate and next reaction time again
        // until all dt_advanced is larger than dt.
        // Note that you don't have to compensate for
        // the last reaction time that exceeds dt.
        // The reason is that the exponential distribution is MEMORYLESS.
        while(dt_advanced <= dt)
        {
            reaction_ind = 0;
            sumrate = curand_uniform(&localstate) * sumrate;

            if(sumrate > 2e-5)
            {

                sumrate -= lambda;
                reaction_ind = (sumrate<=2e-5) * 13;

                if(!reaction_ind)
                {

                    sumrate -= mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) );
                    reaction_ind = (sumrate<=2e-5) * 11;

                    if(!reaction_ind)
                    {
                        sumrate -= mM_to_num(K_R) * num_to_mM(X[tid][5]);
                        reaction_ind = (sumrate<=2e-5) * 12;
                        if(!reaction_ind)
                        {
                            sumrate -= GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6];
                            reaction_ind = (sumrate<=2e-5) * 10;
                            if(!reaction_ind)
                            {
                                sumrate -= GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4];
                                reaction_ind = (sumrate<=2e-5) * 8;

                                if(!reaction_ind)
                                {
                                    sumrate -= GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3];
                                    reaction_ind = (sumrate<=2e-5) * 7;
                                    if(!reaction_ind)
                                    {
                                        sumrate -= GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0];
                                        reaction_ind = (sumrate<=2e-5) * 1;
                                        if(!reaction_ind)
                                        {
                                            sumrate -= KAPPA_DSTAR * X[tid][3];
                                            reaction_ind = (sumrate<=2e-5) * 6;
                                            if(!reaction_ind)
                                            {
                                                sumrate -= GAMMA_GAP * X[tid][2] * X[tid][3];
                                                reaction_ind = (sumrate<=2e-5) * 4;

                                                if(!reaction_ind)
                                                {
                                                    sumrate -= KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);
                                                    reaction_ind = (sumrate<=2e-5) * 3;
                                                    if(!reaction_ind)
                                                    {
                                                        sumrate -= GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]);
                                                        reaction_ind = (sumrate<=2e-5) * 5;
                                                        if(!reaction_ind)
                                                        {
                                                            sumrate -= KAPPA_GSTAR * X[tid][1] * X[tid][0];
                                                            reaction_ind = (sumrate<=2e-5) * 2;
                                                            if(!reaction_ind)
                                                            {
                                                                sumrate -= (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                                                                           (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                                                                           X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5;
                                                                reaction_ind = (sumrate<=2e-5) * 9;
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            int ind;

            // only up to two state variables are needed to be updated
            // update the first one.
            ind = change_ind1[reaction_ind];
            X[tid][ind] += change1[reaction_ind];

            //if(reaction_ind == 9)
            //{
            //    X[tid][ind] = max(X[tid][ind], 0);
            //}

            ind = change_ind2[reaction_ind];
            //update the second one
            if(ind != 0)
            {
                X[tid][ind] += change2[reaction_ind];
            }

            // compute the advance time again
            Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
            fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns );
            //fp[tid] = compute_fp( Ca[tid] );

            sumrate = lambda;
            sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11
            sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12
            sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10
            sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8
            sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7
            sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1
            sumrate += KAPPA_DSTAR * X[tid][3]; // 6
            sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4
            sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]);  // 3
            sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5
            sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2
            sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) *
                       (1 + H_TSTARP*compute_fp( Ca[tid] )) *
                       X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; // 9

            dt_advanced -= logf(curand_uniform(&localstate))/(LA + sumrate);

        } // end while

        ((ushort*)d_X[3])[mid] = X[tid][0];
        ((ushort2*)d_X[0])[mid] = make_ushort2(X[tid][1], X[tid][2]);
        ((ushort2*)d_X[1])[mid] = make_ushort2(X[tid][3], X[tid][4]);
        ((ushort2*)d_X[2])[mid] = make_ushort2(X[tid][5], X[tid][6]);

        if(wid == 0)
        {
            mi[wrp] = atomicAdd(count, 32);
        }
        mid = mi[wrp] + wid;
    }

    // copy the updated random generator state back to global memory
    state[gid] = localstate;

}

}
"""

    template_run = """

#include "curand_kernel.h"

extern "C" {
#include "stdio.h"

#define BLOCK_SIZE %(block_size)d
#define LA 0.5

__device__ __constant__ long long int d_X[5];
__device__ __constant__ int change_ind1[14];
__device__ __constant__ int change1[14];
__device__ __constant__ int change_ind2[14];
__device__ __constant__ int change2[14];


__device__ float num_to_mM(int n)
{
    return n * 5.5353e-4; // n/1806.6;
}

__device__ float mM_to_num(float cc)
{
    return rintf(cc * 1806.6);
}

__device__ float compute_fp( float ca_cc)
{
    float tmp = ca_cc*3.3333333333;
    tmp *= tmp;
    return tmp/(1+tmp);
}

__device__ float compute_fn( float Cstar_cc, float ns)
{
    float tmp = Cstar_cc*5.55555555;
    tmp *= tmp*tmp;
    return ns*tmp/(1+tmp);
}

__device__ float compute_ca(int Tstar, float cstar_cc, float Vm)
{
    float I_in = Tstar*8*fmaxf(-Vm,0);
    float denom = (1060 - 120*cstar_cc + 179.0952 * expf(-39.60793*Vm));
    float numer = I_in * 690.9537 + 0.0795979 + 22*cstar_cc;

    return fmaxf(1.6e-4, numer/denom);
}

__global__ void
transduction(curandStateXORWOW_t *state, float dt, %(type)s* d_Vm,
             %(type)s* g_ns, %(type)s* input,
             int* num_microvilli, int total_microvilli, int* count)
{
    int tid = threadIdx.x;
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    int wid = tid %% 32;
    int wrp = tid >> 5;

    __shared__ int X[BLOCK_SIZE][7];  // number of molecules
    __shared__ float Ca[BLOCK_SIZE];
    __shared__ float fn[BLOCK_SIZE];

    float Vm, ns, lambda;

    float sumrate, dt_advanced;
    int reaction_ind;
    ushort2 tmp;

    // copy random generator state locally to avoid accessing global memory
    curandStateXORWOW_t localstate = state[gid];


    int mid; // microvilli ID
    volatile __shared__ int mi[4]; // starting point of mid per ward, blocksize must be 128

    // use atomicAdd to obtain the starting mid for the warp
    if(wid == 0)
    {
        mi[wrp] = atomicAdd(count, 32);
    }
    mid = mi[wrp] + wid;
    int ind;

    while(mid < total_microvilli)
    {
        ind = ((ushort*)d_X[4])[mid];
        // load variables that are needed for computing calcium concentration
        tmp = ((ushort2*)d_X[2])[mid];
        X[tid][5] = tmp.x;
        X[tid][6] = tmp.y;

        Vm = d_Vm[ind]*1e-3;
        ns = g_ns[ind];

        // update calcium concentration
        Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
        fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns);

        lambda = input[ind]/(double)num_microvilli[ind];

        // load the rest of variables
        tmp = ((ushort2*)d_X[1])[mid];
        X[tid][4] = tmp.y;
        X[tid][3] = tmp.x;
        tmp = ((ushort2*)d_X[0])[mid];
        X[tid][2] = tmp.y;
        X[tid][1] = tmp.x;
        X[tid][0] = ((ushort*)d_X[3])[mid];

        sumrate = lambda + 54198 * Ca[tid] * (0.5 - X[tid][5] * 5.5353e-4) + 5.5 * X[tid][5]; // 11, 12
        sumrate += 25 * (1+10*fn[tid]) * X[tid][6]; // 10
        sumrate += 4 * (1+37.8*fn[tid]) * X[tid][4] ; // 8
        sumrate += (1444+1598.4*fn[tid]) * X[tid][3] ; // 7, 6
        sumrate += (3.7*(1+40*fn[tid]) + 7.05 * X[tid][1]) * X[tid][0] ; // 1, 2
        sumrate += (1560 - 12.6 * X[tid][3]) * X[tid][2]; // 3, 4
        sumrate += 3.5 * (50 - X[tid][2] - X[tid][1] - X[tid][3]) ; // 5
        sumrate += 0.015 * (1+11.5*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(25-X[tid][6])*0.5 ; // 9

        dt_advanced = -logf(curand_uniform(&localstate))/(LA+sumrate);

        // If the reaction time is smaller than dt,
        // pick the reaction and update,
        // then compute the total rate and next reaction time again
        // until all dt_advanced is larger than dt.
        // Note that you don't have to compensate for
        // the last reaction time that exceeds dt.
        // The reason is that the exponential distribution is MEMORYLESS.
        while (dt_advanced <= dt) {
            reaction_ind = 0;
            sumrate = curand_uniform(&localstate) * sumrate;

            if (sumrate > 2e-5) {
                sumrate -= lambda;
                reaction_ind = (sumrate<=2e-5) * 13;

                if (!reaction_ind) {
                    sumrate -= mM_to_num(30) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) );
                    reaction_ind = (sumrate<=2e-5) * 11;

                    if (!reaction_ind) {
                        sumrate -= mM_to_num(5.5) * num_to_mM(X[tid][5]);
                        reaction_ind = (sumrate<=2e-5) * 12;

                        if (!reaction_ind) {
                            sumrate -= 25 * (1+10*fn[tid]) * X[tid][6];
                            reaction_ind = (sumrate<=2e-5) * 10;

                            if (!reaction_ind) {
                                sumrate -= 4 * (1+37.8*fn[tid]) * X[tid][4];
                                reaction_ind = (sumrate<=2e-5) * 8;

                                if (!reaction_ind) {
                                    sumrate -= 144 * (1+11.1*fn[tid]) * X[tid][3];
                                    reaction_ind = (sumrate<=2e-5) * 7;

                                    if (!reaction_ind) {
                                        sumrate -= 3.7*(1+40*fn[tid]) * X[tid][0];
                                        reaction_ind = (sumrate<=2e-5) * 1;

                                        if (!reaction_ind) {
                                            sumrate -= 1300 * X[tid][3];
                                            reaction_ind = (sumrate<=2e-5) * 6;

                                            if (!reaction_ind) {
                                                sumrate -= 3.0 * X[tid][2] * X[tid][3];
                                                reaction_ind = (sumrate<=2e-5) * 4;

                                                if (!reaction_ind) {
                                                    sumrate -= 15.6 * X[tid][2]
                                                        * (100-X[tid][3]);
                                                    reaction_ind = (sumrate<=2e-5) * 3;

                                                    if (!reaction_ind) {
                                                        sumrate -= 3.5 * (50 - X[tid][2]
                                                            - X[tid][1] - X[tid][3]);
                                                        reaction_ind = (sumrate<=2e-5) * 5;

                                                        if(!reaction_ind) {
                                                            sumrate -= 7.05 * X[tid][1]
                                                                * X[tid][0];
                                                            reaction_ind = (sumrate<=2e-5)
                                                                * 2;

                                                            if(!reaction_ind) {
                                                                sumrate -= 0.015 *
                                                                    (1+11.5*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(25-X[tid][6])*0.5;
                                                                reaction_ind = (sumrate<=2e-5) * 9;
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            //int ind;

            // only up to two state variables are needed to be updated
            // update the first one.
            ind = change_ind1[reaction_ind];
            X[tid][ind] += change1[reaction_ind];

            //update the second one
            ind = change_ind2[reaction_ind];
            if (ind != 0)
                X[tid][ind] += change2[reaction_ind];

            // compute the advance time again
            Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm);
            fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns );

            sumrate = lambda + 54198*Ca[tid]*(0.5 - X[tid][5]*5.5353e-4)
                + 5.5*X[tid][5]; // 11, 12
            sumrate += 25*(1 + 10*fn[tid])*X[tid][6]; // 10
            sumrate += 4*(1 + 37.8*fn[tid])*X[tid][4]; // 8
            sumrate += (1444 + 1598.4*fn[tid])*X[tid][3]; // 7, 6
            sumrate += (3.7*(1 + 40*fn[tid]) + 7.05*X[tid][1])*X[tid][0]; // 1, 2
            sumrate += (1560 - 12.6*X[tid][3])*X[tid][2]; // 3, 4
            sumrate += 3.5*(50 - X[tid][2] - X[tid][1] - X[tid][3]); // 5
            sumrate += 0.015*(1 + 11.5*compute_fp( Ca[tid] ))
                *X[tid][4]*(X[tid][4] - 1)*(25 - X[tid][6])*0.5; // 9

            dt_advanced -= logf(curand_uniform(&localstate))/(LA+sumrate);

        } // end while

        ((ushort*)d_X[3])[mid] = X[tid][0];
        ((ushort2*)d_X[0])[mid] = make_ushort2(X[tid][1], X[tid][2]);
        ((ushort2*)d_X[1])[mid] = make_ushort2(X[tid][3], X[tid][4]);
        ((ushort2*)d_X[2])[mid] = make_ushort2(X[tid][5], X[tid][6]);

        if(wid == 0)
        {
            mi[wrp] = atomicAdd(count, 32);
        }
        mid = mi[wrp] + wid;
    }
    // copy the updated random generator state back to global memory
    state[gid] = localstate;
}

}
"""
    try:
        co = [compile_options[0] + ' --maxrregcount=54']
    except IndexError:
        co = ['--maxrregcount=54']

    scalartype = dtype.type if isinstance(dtype, np.dtype) else dtype
    mod = SourceModule(template_run % {
        "type": dtype_to_ctype(dtype),
        "block_size": block_size,
        "fletter": 'f' if scalartype == np.float32 else ''
    },
                       options=co,
                       no_extern_c=True)
    func = mod.get_function('transduction')
    d_X_address, d_X_nbytes = mod.get_global("d_X")
    cuda.memcpy_htod(d_X_address, Xaddress)
    d_change_ind1_address, d_change_ind1_nbytes = mod.get_global("change_ind1")
    d_change_ind2_address, d_change_ind2_nbytes = mod.get_global("change_ind2")
    d_change1_address, d_change1_nbytes = mod.get_global("change1")
    d_change2_address, d_change2_nbytes = mod.get_global("change2")
    cuda.memcpy_htod(d_change_ind1_address, change_ind1)
    cuda.memcpy_htod(d_change_ind2_address, change_ind2)
    cuda.memcpy_htod(d_change1_address, change1)
    cuda.memcpy_htod(d_change2_address, change2)

    func.prepare('PfPPPPiP')
    func.set_cache_config(cuda.func_cache.PREFER_SHARED)
    return func

Example #50

Show file

            fin_g4[i] = -feq_g2[i] + feq_g4[i] + fin_g2[i];
            fin_g7[i] = -feq_g5[i] + feq_g7[i] + fin_g5[i];
            fin_g8[i] = -feq_g6[i] + feq_g8[i] + fin_g6[i];
        } 
        
          
                    
    }
""" % (omega))

#    funRT = funRT % (uLB, omega, turb)

#mod         = SourceModule(funRT+funBC)
funRT = mod.get_function("funRT")
#funBC        = mod.get_function("funBC")
t_c, _ = mod.get_global("t_c")
c_c, _ = mod.get_global("c_c")
#
cuda.memcpy_htod(t_c, t_h)
cuda.memcpy_htod(c_c, c_h)

# Time Loop
for It in range(maxIt):

    #cuda.memcpy_htod(fin_g, fin)
    #cuda.memcpy_htod(fpost_g,fpost)

    #    start.record()
    #start.synchronize()

    funRT(fin_g0,

Example #51

Show file

#!/usr/bin/python

import sys
import pycuda.driver as cuda
import pycuda.gpuarray
import numpy as np
from pycuda.compiler import SourceModule
import pycuda.autoinit
import time

dtype = np.double
src = ''.join(open('some-tests.cu').readlines())
#mod = SourceModule(src, arch='sm_13', no_extern_c = True)
mod = SourceModule(src, no_extern_c=True)

mod.get_global('str')

fun = mod.get_function('test')

#jm1 = np.matrix(range(6), dtype).reshape(2,3)
m1 = np.zeros((2, 3), dtype)

fun(cuda.InOut(m1), np.uint32(time.time()), block=(10, 1, 1), grid=(1, 1))

print m1

Example #52

Show file

def initialize(function_number, threads = 1, grid = (1,1)):
    total_threads = threads * grid[0] * grid[1]
    # initialize only once for subsequent calls of the same benchmark function
    global initialized_parameters
    if initialized_parameters == (function_number, threads, grid):
        return
    initialized_parameters = (function_number, threads, grid)

    print '--- Allocating memmory, initializing benchmark function(s) ---'

    global module
    # compile the kernel source for given dimension and proper blocksize
    module = SourceModule(src, arch='sm_13', no_extern_c = True, options= \
            ['--use_fast_math', '--ptxas-options=-v', \
            '-D NREAL=%d'%nreal, '-D NFUNC=%d'%nfunc, '-D BLOCKSIZE=%d'%threads])

    # PRNG initialization
    global rngStates, initRNG
    rngStates = cuda.mem_alloc(40 * total_threads)  # sizeof(curandState) = 40 bytes
    cuda.memcpy_htod(module.get_global('rngStates')[0], np.intp(rngStates)) # set pointer in kernel code
    initRNG = module.get_function('initRNG')
    initRNG(np.uint32(time.time()), block=(threads,1,1), grid = grid)

    # rw data structures (separate for each thread)
    global g_trans_x, g_temp_x1, g_temp_x2, g_temp_x3, g_temp_x4, \
            g_norm_x,  g_basic_f, g_weight, g_norm_f

    # constant (same for each thread)
    global sigma, lambd, bias, o, g, l, o_gpu, o_rows, g_gpu, g_rows, l_gpu
    
    # constant scalars
    cuda.memcpy_htod(module.get_global('nreal')[0], np.int32(nreal))
    cuda.memcpy_htod(module.get_global('nfunc')[0], np.int32(nfunc))
    cuda.memcpy_htod(module.get_global('C')[0], np.double(2000))
    cuda.memcpy_htod(module.get_global('global_bias')[0], np.double(0))

    # rw arrays (memmory allocation only, no initialization)
    g_trans_x = np.zeros(nreal * total_threads).astype(dtype)
    g_temp_x1 = np.zeros(nreal * total_threads).astype(dtype)
    g_temp_x2 = np.zeros(nreal * total_threads).astype(dtype)
    g_temp_x3 = np.zeros(nreal * total_threads).astype(dtype)
    g_temp_x4 = np.zeros(nreal * total_threads).astype(dtype)
    g_norm_x  = np.zeros(nreal * total_threads).astype(dtype)
    g_basic_f = np.zeros(nfunc * total_threads).astype(dtype)
    g_weight  = np.zeros(nfunc * total_threads).astype(dtype)
    g_norm_f  = np.zeros(nfunc * total_threads).astype(dtype)

    # constant arrays
    sigma = np.zeros(nfunc).astype(dtype)
    lambd = np.ones(nfunc).astype(dtype)
    bias = np.zeros(nfunc).astype(dtype)

    # 2d arrays
    o = np.zeros((nfunc,nreal)).astype(dtype)
    g = np.eye(nreal,nreal).astype(dtype)

    if function_number == 1:
        # wczytanie 'o' mozna bedzie pewnie zamienic na 1-linijkowy kod (o = np.matrix(''.join... itp
        fpt = ''.join(open('input_data/sphere_func_data.txt', 'r').readlines()).strip().split()
        for i in xrange(nfunc):
            for j in xrange(nreal):
                o[i,j] = dtype(fpt.pop(0))
        bias[0] = -450.0
    elif function_number == 2:
        fpt = ''.join(open('input_data/schwefel_102_data.txt', 'r').readlines()).strip().split()
        for i in xrange(nfunc):
            for j in xrange(nreal):
                o[i,j] = dtype(fpt.pop(0))
        bias[0] = -450.0
    elif function_number == 3:
        fpt = ''.join(open('input_data/elliptic_M_D%d.txt' % nreal, 'r').readlines()).strip().split()
        for i in xrange(nreal):
            for j in xrange(nreal):
                g[i,j] = dtype(fpt.pop(0))

        fpt = ''.join(open('input_data/high_cond_elliptic_rot_data.txt', 'r').readlines()).strip().split()
        for i in xrange(nfunc):
            for j in xrange(nreal):
                o[i,j] = dtype(fpt.pop(0))
        bias[0] = -450.0
    elif function_number == 4:
        fpt = ''.join(open('input_data/schwefel_102_data.txt', 'r').readlines()).strip().split()
        for i in xrange(nfunc):
            for j in xrange(nreal):
                o[i,j] = dtype(fpt.pop(0))
        bias[0] = -450.0
    elif function_number == 5:
        global A, B
        fpt = open('input_data/schwefel_206_data.txt', 'r')
        o = np.matrix(fpt.readline().strip(), dtype)[0,:nreal]
        A = np.matrix(np.matrix(';'.join(fpt.readlines()))[:nreal,:nreal], dtype)
        B = np.zeros(nreal).astype(dtype)
        if nreal % 4 == 0:
            index = nreal / 4
        else:
            index = nreal / 4 + 1
        for i in xrange(index):
            o[0,i] = -100
        index = (3 * nreal) / 4 - 1
        for i in xrange(index, nreal):
            o[0,i] = 100
        for i in xrange(nreal):
            for j in xrange(nreal):
                B[i] += A[i,j] * o[0,j]
        A = cuda.to_device(A)
        B = cuda.to_device(B)
        cuda.memcpy_htod(module.get_global('A')[0], np.intp(A))
        cuda.memcpy_htod(module.get_global('B')[0], np.intp(B))
        bias[0] = -310.0
    elif function_number == 6:
        fpt = ''.join(open('input_data/rosenbrock_func_data.txt', 'r').readlines()).strip().split()
        for i in xrange(nfunc):
            for j in xrange(nreal):
                o[i,j] = dtype(fpt.pop(0)) - 1
        bias[0] = 390

    # 6 1-dimensional arrays
    arrays = ['g_trans_x', 'g_temp_x1', 'g_temp_x2', 'g_temp_x3', 'g_temp_x4', 'g_norm_x']
    for var in arrays:
        globals()[var] = cuda.to_device(globals()[var]) # send to device and save the pointer
        cuda.memcpy_htod(module.get_global(var)[0], np.intp(globals()[var])) # set pointer in kernel code

    # 6 1-dimensional arrays
    arrays = ['g_basic_f', 'g_weight', 'sigma', 'g_norm_f']
    for var in arrays:
        globals()[var] = cuda.to_device(globals()[var])
        cuda.memcpy_htod(module.get_global(var)[0], np.intp(globals()[var]))
    #lambd = cuda.to_device(lambd)
    cuda.memcpy_htod(module.get_global('lambda')[0], lambd)
    cuda.memcpy_htod(module.get_global('bias')[0], bias)

    # 2 2-dimensional arrays (o, g)
    # MAYBE IT SHOULD BE CASTED to 1-d ARRAY for performance?

    print 'o:', o
    #o_gpu = cuda.to_device(np.zeros(nfunc).astype(np.intp))
    #o_rows = []
    for i in xrange(o.shape[0]):
        cuda.memcpy_htod(module.get_global('o')[0] + i * 50 * 8, o[i,:])
        #row = cuda.to_device(o[i,:])
        #cuda.memcpy_htod(int(o_gpu) + np.intp().nbytes * i, np.intp(row))
        #o_rows.append(row)
        #cuda.memcpy_htod(int(o_gpu) + np.intp().nbytes * i, np.intp(row))
    #cuda.memcpy_htod(module.get_global('o')[0], np.intp(o_gpu))

    # watch out! 'g' is nreal x nreal
    #g = np.linspace(21,21 + nreal*nreal-1,nreal*nreal).reshape((nreal,nreal))
    print 'g:',g
    #g_gpu = cuda.to_device(np.zeros(nreal).astype(np.intp))
    #g_rows = []
    for i in xrange(g.shape[0]):
        cuda.memcpy_htod(module.get_global('g')[0] + i * 50 * 8, g[i,:])
        #row = cuda.to_device(g[i,:])
        #g_rows.append(row)
        #cuda.memcpy_htod(int(g_gpu) + np.intp().nbytes * i, np.intp(row))
    #cuda.memcpy_htod(module.get_global('g')[0], np.intp(g_gpu))

    # 'l' (3d array) -- flatten to 1d
    l = np.zeros((nfunc,nreal,nreal)).astype(dtype)
    #l = np.linspace(3, 3 + nfunc*nreal*nreal-1,nfunc*nreal*nreal).reshape((nfunc,nreal,nreal))
    for i in xrange(nfunc):
        l[i,:,:] = np.eye(nreal)
    print 'l:',l
    print 'l flatten:', l.flatten()
    #l_gpu = cuda.to_device(l.flatten().astype(dtype))
    #print l_gpu
    cuda.memcpy_htod(module.get_global('l_flat')[0], l);

    print '--- initialization done ---'

Example #53

Show file

File: ar_missed_obs.py Project: LinZhiyuan/parallelKF

        kalman_gain_ar, 
        &fltr_stt_cov_ar[offset_cov+wid], dim_state_ar, temp, temp1, tid);
      }
      
      logpdf[tid] = log_pdf(pred_obs_mean_ar[tid], pred_obs_cov_ar[tid],tid);
    }
""")


start = time.time()


context.set_cache_config(cuda.func_cache.PREFER_L1)

filter = mod.get_function("filter")
init_stt_mean_ar =  mod.get_global('init_stt_mean_ar')[0] 
init_stt_cov_ar =  mod.get_global('init_stt_cov_ar')[0] 
tran_mat_ar =  mod.get_global('tran_mat_ar')[0] 
observations_ar =  mod.get_global('observations_ar')[0] 

cuda.memcpy_htod(init_stt_mean_ar, init_stt_mean_ar_const)
cuda.memcpy_htod(init_stt_cov_ar, init_stt_cov_ar_const)
cuda.memcpy_htod(tran_mat_ar, tran_mat_ar_const)
cuda.memcpy_htod(observations_ar, observations_ar_const)



start = time.time()

filter(tran_cov_mat_ar_gpu,
  fltr_stt_mean_ar_gpu,

Example #54

Show file

    def __init__(self,
                 model,
                 dx,
                 source_dt,
                 sources,
                 pml_width=None,
                 nvcc_options=None):

        source = """
__constant__ float fd1_d[2];
__constant__ float fd2_d[3];

__global__ void step_d(const float *const wfc,
                float *wfp,
                const float *const phix,
                float *phixp,
                const float *const sigmax,
                const float *const model2_dt2,
                const float dt,
                const int nx)
{
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int b = blockIdx.y;
        int bx = b * nx + x;
        float wfc_xx;
        float wfc_x;
        float phix_x;
        bool in_domain = (x > 1) && (x < nx - 2);

        if (in_domain)
        {
                wfc_xx = (fd2_d[0] * wfc[bx] +
                                fd2_d[1] *
                                (wfc[bx + 1] +
                                 wfc[bx - 1]) +
                                fd2_d[2] *
                                (wfc[bx + 2] +
                                 wfc[bx - 2]));

                wfc_x = (fd1_d[0] *
                                (wfc[bx + 1] -
                                 wfc[bx - 1]) + 
                                 fd1_d[1] * 
                                 (wfc[bx + 2] -
                                 wfc[bx - 2]));

                phix_x = (fd1_d[0] *
                                (phix[bx + 1] -
                                 phix[bx - 1]) + 
                                 fd1_d[1] * 
                                 (phix[bx + 2] -
                                 phix[bx - 2]));

                wfp[bx] = 1 / (1 + dt * sigmax[x] / 2) *
                        (model2_dt2[x] *
                        (wfc_xx + phix_x) +
                        dt * sigmax[x] * wfp[bx] / 2 +
                        (2 * wfc[bx] - wfp[bx]));

                phixp[bx] = phix[bx] -
                        dt * sigmax[x] * (wfc_x + phix[bx]);

        }
}

__global__ void add_sources_d(float *wfp,
                const float *const model2_dt2,
                const float *const source_amplitude,
                const int *const sources_x,
                const int step,
                const int nx,
                const int nt, const int ns)
{
        int s = threadIdx.x;
        int b = blockIdx.x;
        int x = sources_x[b * ns + s];
        int bx = b * nx + x;

        wfp[bx] += source_amplitude[b * ns * nt + s * nt + step] *
            model2_dt2[x];
}
"""

        if nvcc_options is None:
            nvcc_options = ['--restrict', '--use_fast_math', '-O3']

        mod = SourceModule(source, options=nvcc_options)

        jitfunc1 = mod.get_function('step_d')
        jitfunc2 = mod.get_function('add_sources_d')
        fd1_d = mod.get_global('fd1_d')[0]
        fd2_d = mod.get_global('fd2_d')[0]

        pad_width = 2
        super(Scalar1D, self).__init__(jitfunc1,
                                       jitfunc2,
                                       fd1_d,
                                       fd2_d,
                                       model,
                                       dx,
                                       source_dt,
                                       sources,
                                       pad_width,
                                       pml_width=pml_width)

Example #55

Show file

File: aes.py Project: fantapsody/pycuda_aes

	def init_cuda(self):
		if self.cuda_inited:
			return
		cuda_kernel = """
#include <stdio.h>

__device__ __constant__ unsigned int keySchedule[44];
__device__ __constant__ unsigned int Te0[256];
__device__ __constant__ unsigned int Te1[256];
__device__ __constant__ unsigned int Te2[256];
__device__ __constant__ unsigned int Te3[256];
__device__ __constant__ unsigned int length;
__device__ __constant__ unsigned int threadMax;


__global__ void printKeySchedule(){
	for(int i = 0; i < 11; i++){
		for(int j = 0; j < 4; j++){
			printf("%08x", keySchedule[i * 4 + j]);
		}
		printf("\\n");
	}
}

__device__ unsigned int bytestoword(unsigned char* b){
	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
}

__device__ void wordtobytes(unsigned char* b, unsigned int w){
	b[0] = (w >> 24);
	b[1] = (w >> 16) & 0xff;
	b[2] = (w >> 8) & 0xff;
	b[3] = w & 0xff;
}

__device__ void addRoundKey(unsigned int *s, unsigned int *k){
	s[0] ^= k[0];
	s[1] ^= k[1];
	s[2] ^= k[2];
	s[3] ^= k[3];
}

__global__ void encrypt(unsigned char* in){
	int p = blockIdx.x * 1024 + threadIdx.x;
	if(p * 16 >= length)
		return;
	unsigned char* block = in + p * 16;
	unsigned int s[4], t[4];
	unsigned int *rk;
	s[0] = bytestoword(block);
	s[1] = bytestoword(block + 4);
	s[2] = bytestoword(block + 8);
	s[3] = bytestoword(block + 12);

	addRoundKey(s, keySchedule);

	for(int i = 1; i < 10; i++){
		rk = keySchedule + i * 4;
		t[0] = Te0[s[0] >> 24] ^ Te1[(s[1] >> 16) & 0xff] ^ Te2[(s[2] >> 8 ) & 0xff] ^ Te3[(s[3]) & 0xff] ^ rk[0];
		t[1] = Te0[s[1] >> 24] ^ Te1[(s[2] >> 16) & 0xff] ^ Te2[(s[3] >> 8 ) & 0xff] ^ Te3[(s[0]) & 0xff] ^ rk[1];
		t[2] = Te0[s[2] >> 24] ^ Te1[(s[3] >> 16) & 0xff] ^ Te2[(s[0] >> 8 ) & 0xff] ^ Te3[(s[1]) & 0xff] ^ rk[2];
		t[3] = Te0[s[3] >> 24] ^ Te1[(s[0] >> 16) & 0xff] ^ Te2[(s[1] >> 8 ) & 0xff] ^ Te3[(s[2]) & 0xff] ^ rk[3];

		for(int j = 0; j < 4; j++)
			s[j] = t[j];
	}

	rk = keySchedule + 4 * 10;
	s[0] = (Te2[(t[0] >> 24)] & 0xff000000) ^ (Te3[(t[1] >> 16) & 0xff] & 0x00ff0000) ^ (Te0[(t[2] >> 8) & 0xff] & 0x0000ff00) ^ (Te1[t[3] & 0xff] & 0x000000ff) ^ rk[0];
	s[1] = (Te2[(t[1] >> 24)] & 0xff000000) ^ (Te3[(t[2] >> 16) & 0xff] & 0x00ff0000) ^ (Te0[(t[3] >> 8) & 0xff] & 0x0000ff00) ^ (Te1[t[0] & 0xff] & 0x000000ff) ^ rk[1];
	s[2] = (Te2[(t[2] >> 24)] & 0xff000000) ^ (Te3[(t[3] >> 16) & 0xff] & 0x00ff0000) ^ (Te0[(t[0] >> 8) & 0xff] & 0x0000ff00) ^ (Te1[t[1] & 0xff] & 0x000000ff) ^ rk[2];
	s[3] = (Te2[(t[3] >> 24)] & 0xff000000) ^ (Te3[(t[0] >> 16) & 0xff] & 0x00ff0000) ^ (Te0[(t[1] >> 8) & 0xff] & 0x0000ff00) ^ (Te1[t[2] & 0xff] & 0x000000ff) ^ rk[3];

	wordtobytes(block, s[0]);
	wordtobytes(block + 4, s[1]);
	wordtobytes(block + 8, s[2]);
	wordtobytes(block + 12, s[3]);
}

	"""
		
		mod = SourceModule(cuda_kernel)
		dKeySchedule = mod.get_global("keySchedule")[0]
		cuda.memcpy_htod(dKeySchedule, self.keySchedule)
		dThreadMax = mod.get_global("threadMax")[0]
		cuda.memcpy_htod(dThreadMax, numpy.array([self.threadMax], numpy.uint32))
		self.dLength = mod.get_global('length')[0]

		dTe0 = mod.get_global("Te0")[0]
		cuda.memcpy_htod(dTe0, self.Te[0])
		dTe1 = mod.get_global("Te1")[0]
		cuda.memcpy_htod(dTe1, self.Te[1])
		dTe2 = mod.get_global("Te2")[0]
		cuda.memcpy_htod(dTe2, self.Te[2])
		dTe3 = mod.get_global("Te3")[0]
		cuda.memcpy_htod(dTe3, self.Te[3])

		self.mod = mod

		self.cuda_buf = cuda.mem_alloc(self.cuda_buf_size)

		self.batchMax = self.threadMax * self.blockMax * 16

		self.cuda_inited = True