def make_GPU_gradient(mesh, context): '''Prepare to compute gradient on the GPU w.r.t. the given mesh. Return gradient function. ''' mx = int(getattr(mesh, 'nx', 1)) my = int(getattr(mesh, 'ny', 1)) mz = int(getattr(mesh, 'nz', 1)) dxInv = np.array(1./getattr(mesh, 'dx', 1), dtype=np.float64) dyInv = np.array(1./getattr(mesh, 'dy', 1), dtype=np.float64) dzInv = np.array(1./getattr(mesh, 'dz', 1), dtype=np.float64) sizeof_double = 8 with open(where + 'gradient2.cu') as fdlib: source = fdlib.read() module = SourceModule(source) mx_ptr = module.get_global("mx")[0] my_ptr = module.get_global("my")[0] mz_ptr = module.get_global("mz")[0] cuda.memcpy_htod(mx_ptr, np.array(mx, dtype=np.int32)) cuda.memcpy_htod(my_ptr, np.array(my, dtype=np.int32)) cuda.memcpy_htod(mz_ptr, np.array(mz, dtype=np.int32)) dxInv_ptr = module.get_global("dxInv")[0] dyInv_ptr = module.get_global("dyInv")[0] dzInv_ptr = module.get_global("dzInv")[0] cuda.memcpy_htod(dxInv_ptr, dxInv) cuda.memcpy_htod(dyInv_ptr, dyInv) cuda.memcpy_htod(dzInv_ptr, dzInv) deriv_x = module.get_function("gradient_x") deriv_y = module.get_function("gradient_y") deriv_z = module.get_function("gradient_z") block, grid = mesh.get_domain_decomposition(DeviceData().max_threads) d_deriv_x = gpuarray.empty(shape=(1, mesh.n_nodes), dtype=np.float64) d_deriv_y = gpuarray.empty_like(d_deriv_x) d_deriv_z = gpuarray.empty_like(d_deriv_x) def _gradient(scalar_values): '''Calculate three-dimensional gradient for GPUArray scalar_values. ''' deriv_x(scalar_values, d_deriv_x, block=block, grid=grid) deriv_y(scalar_values, d_deriv_y, block=block, grid=grid) deriv_z(scalar_values, d_deriv_z, block=block, grid=grid) context.synchronize() return (d_deriv_x, d_deriv_y, d_deriv_z)[:mesh.dimension] return _gradient
def test_constant_memory(self): # contributed by Andrew Wagner module = SourceModule(""" __constant__ float const_array[32]; __global__ void copy_constant_into_global(float* global_result_array) { global_result_array[threadIdx.x] = const_array[threadIdx.x]; } """) copy_constant_into_global = module.get_function("copy_constant_into_global") const_array, _ = module.get_global('const_array') host_array = np.random.randint(0,255,(32,)).astype(np.float32) global_result_array = drv.mem_alloc_like(host_array) drv.memcpy_htod(const_array, host_array) copy_constant_into_global( global_result_array, grid=(1, 1), block=(32, 1, 1)) host_result_array = np.zeros_like(host_array) drv.memcpy_dtoh(host_result_array, global_result_array) assert (host_result_array == host_array).all
def initCUDA(): global plotData_dArray global tex, transferTex global transferFuncArray_d global c_invViewMatrix global renderKernel #print "Compiling CUDA code for volumeRender" cudaCodeFile = open(volRenderDirectory + "/CUDAvolumeRender.cu","r") cudaCodeString = cudaCodeFile.read() cudaCodeStringComplete = cudaCodeString cudaCode = SourceModule(cudaCodeStringComplete, no_extern_c=True, include_dirs=[volRenderDirectory] ) tex = cudaCode.get_texref("tex") transferTex = cudaCode.get_texref("transferTex") c_invViewMatrix = cudaCode.get_global('c_invViewMatrix')[0] renderKernel = cudaCode.get_function("d_render") if not plotData_dArray: plotData_dArray = np3DtoCudaArray( plotData_h ) tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) tex.set_filter_mode(cuda.filter_mode.LINEAR) tex.set_address_mode(0, cuda.address_mode.CLAMP) tex.set_address_mode(1, cuda.address_mode.CLAMP) tex.set_array(plotData_dArray) set_transfer_function( cmap_indx_0, trans_ramp_0, trans_center_0 ) print "CUDA volumeRender initialized\n"
def edgetaper_gpu(y_gpu, sf, win='barthann'): shape = np.array(y_gpu.shape).astype(np.uint32) dtype = y_gpu.dtype block_size = (16,16,1) grid_size = (int(np.ceil(float(shape[1])/block_size[0])), int(np.ceil(float(shape[0])/block_size[1]))) # Ensure that sf is odd sf = sf+(1-np.mod(sf,2)) wx = scipy.signal.get_window(win, sf[1]) wy = scipy.signal.get_window(win, sf[0]) maxw = wx.max() * wy.max() hsf = np.floor(sf/2) wx = (wx[0:hsf[1]] / maxw).astype(dtype) wy = (wy[0:hsf[0]] / maxw).astype(dtype) preproc = _generate_preproc(dtype, shape) preproc += '#define wx_size %d\n' % wx.size preproc += '#define wy_size %d\n' % wy.size mod = SourceModule(preproc + edgetaper_code, keep=True) edgetaper_gpu = mod.get_function("edgetaper") wx_gpu, wx_size = mod.get_global('wx') wy_gpu, wy_size = mod.get_global('wy') cu.memcpy_htod(wx_gpu, wx) cu.memcpy_htod(wy_gpu, wy) edgetaper_gpu(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]), block=block_size, grid=grid_size)
def test_constant_memory(self): # contributed by Andrew Wagner module = SourceModule(""" __constant__ float const_array[32]; __global__ void copy_constant_into_global(float* global_result_array) { global_result_array[threadIdx.x] = const_array[threadIdx.x]; } """) copy_constant_into_global = module.get_function( "copy_constant_into_global") const_array, _ = module.get_global('const_array') host_array = np.random.randint(0, 255, (32, )).astype(np.float32) global_result_array = drv.mem_alloc_like(host_array) drv.memcpy_htod(const_array, host_array) copy_constant_into_global(global_result_array, grid=(1, 1), block=(32, 1, 1)) host_result_array = np.zeros_like(host_array) drv.memcpy_dtoh(host_result_array, global_result_array) assert (host_result_array == host_array).all
def __init__(self, pyr_scale=0.9, levels=15, winsize=9, num_iterations=5, poly_n=5, poly_sigma=1.2, use_gaussian_kernel: bool = True, use_initial_flow=None, quit_at_level=None, use_gpu=True, upscale_on_termination=True, fast_gpu_scaling=True, vesselmask_gpu=None): self.pyr_scale = pyr_scale self.levels = levels self.winsize = winsize self.num_iterations = num_iterations self.poly_n = poly_n self.poly_sigma = poly_sigma self.use_gaussian_kernel = use_gaussian_kernel self.use_initial_flow = use_initial_flow self.use_gpu = use_gpu self.upscale_on_termination = upscale_on_termination self._fast_gpu_scaling = fast_gpu_scaling self.quit_at_level = quit_at_level self._dump_everything = False self._show_everything = False self._vesselmask_gpu = vesselmask_gpu self._resize_kernel_size_factor = 4 self._max_resize_kernel_size = 9 with open( os.path.join(os.path.dirname(__file__), 'farneback_kernels.cu')) as f: read_data = f.read() f.closed mod = SourceModule(read_data) self._update_matrices_kernel = mod.get_function( 'FarnebackUpdateMatrices') self._invG_gpu = mod.get_global('invG')[0] self._weights_gpu = mod.get_global('weights')[0] self._poly_expansion_kernel = mod.get_function('calcPolyCoeficients') self._warp_kernel = mod.get_function('warpByFlowField') self._r1_texture = mod.get_texref('sourceTex') self._solve_equations_kernel = mod.get_function('solveEquationsCramer')
class BornThread(threading.Thread): def __init__(self, gpu, work_queue, result, density, x, y, z, Qx, Qy, Qz): threading.Thread.__init__(self) self.born_args = density, x, y, z, Qx, Qy, Qz, result self.work_queue = work_queue self.gpu = gpu self.precision = Qx.dtype self.defines = dict(XSIZE=len(x), YSIZE=len(y), ZSIZE=len(z), QXSIZE=len(Qx), QYSIZE=len(Qy), QZSIZE=len(Qz)) def run(self): self.dev = cuda.Device(self.gpu) self.ctx = self.dev.make_context() src = loadkernelsrc("kernelconstg.c", precision=self.precision, defines=self.defines) #print src self.cudamod = SourceModule(src) self.cudaBorn = self.cudamod.get_function("cudaBorn") self.kernel() self.ctx.pop() del self.ctx del self.dev def kernel(self): density, x, y, z, Qx, Qy, Qz, result = self.born_args nx, ny, nz, nqx, nqy, nqz = [ numpy.int32(len(v)) for v in x, y, z, Qx, Qy, Qz ] cdensity = gpuarray.to_gpu(density) for v, t in (x, "x"), (y, "y"), (z, "z"), (Qx, "Qx"), (Qy, "Qy"), (Qz, "Qz"): cv = self.cudamod.get_global(t)[0] cuda.memcpy_htod(cv, v) cframe = cuda.mem_alloc(result[0].nbytes) n = int(1 * nqy * nqz) while True: try: qxi = numpy.int32(self.work_queue.get(block=False)) except Queue.Empty: break #print "%d of %d on %d\n"%(qxi,nqx,self.gpu), self.cudaBorn(cdensity, qxi, cframe, **cuda_partition(n)) ## Delay fetching result until the kernel is complete cuda_sync() ## Fetch result back to the CPU cuda.memcpy_dtoh(result[qxi], cframe)
class CudaCalculator(object): def __init__(self, eta, beta, L, no_angles, no_pulses, order=5): self.mod_K = SourceModule(RADON_KERNEL.format(order, no_angles, no_pulses)) self.K_gpu = self.mod_K.get_function("K_l") self.mod_reduction = SourceModule(REDUCTION_KERNEL) self.reduction_gpu = self.mod_reduction.get_function("reduction") self.eta = eta self.gamma = gamma(eta) self.beta = beta self.L = L self.h = calc_h(L, beta, eta) drv.memcpy_htod(self.mod_K.get_global("rsq4pi")[0], scipy.array([1./sqrt(4.*pi)], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("sqeta")[0], scipy.array([sqrt(self.eta)], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("h")[0], scipy.array([self.h], dtype=scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("four_pi_gamma")[0], scipy.array([4.*pi*self.gamma], dtype=scipy.float32)) y = sqrt(self.gamma)/self.h drv.memcpy_htod(self.mod_K.get_global("y")[0], scipy.array([y], dtype=scipy.float32)) n = scipy.arange(1, order+1, dtype=scipy.float32) n2 = n**2 ex = exp(-n2/4.) pre_s2 = ex*cosh(n*y) pre_s3 = ex*n*sinh(n*y) drv.memcpy_htod(self.mod_K.get_global("n2")[0], n2) drv.memcpy_htod(self.mod_K.get_global("pre_s1")[0], ex) drv.memcpy_htod(self.mod_K.get_global("pre_s2")[0], pre_s2) drv.memcpy_htod(self.mod_K.get_global("pre_s3")[0], pre_s3) def K(self, Q, P, angles, quadratures): drv.memcpy_htod(self.mod_K.get_global("cos_phi")[0], cos(angles).astype(scipy.float32)) drv.memcpy_htod(self.mod_K.get_global("sin_phi")[0], sin(angles).astype(scipy.float32)) Nx = Q.shape[0] Ny = int(floor(quadratures.size / 1024.)) K = scipy.empty((Nx,), dtype=scipy.float32) Kb = drv.mem_alloc(4*Ny*Nx) Q_gpu = drv.to_device(Q) P_gpu = drv.to_device(P) self.K_gpu(drv.In(quadratures), Q_gpu, P_gpu, Kb, block=(1, 1024, 1), grid=(Nx, Ny), shared=1024*4) self.reduction_gpu(Kb, drv.Out(K), block=(1, Ny, 1), grid=(Nx, 1), shared=Ny*4) return K/self.L def reconstruct_wigner(self, angles_quadratures, Nq, Np): angles, quadratures = angles_quadratures q_mean, p_mean, s_max = estimate_position_from_quadratures(self.eta, angles, quadratures) q, p, Q, P = build_mesh(q_mean, p_mean, s_max, Nq, Np) W = self.K(Q.ravel(), P.ravel(), angles, quadratures) return q_mean, p_mean, Q, P, W.reshape(Q.shape)
def _load_kernel(self): path = os.path.join(os.path.dirname(__file__), "deskew.cu") with open(path, 'r') as fd: tpl = Template(fd.read()) source = tpl.render(dst_type=dtype_to_ctype(self._dtype)) module = SourceModule(source) self._kernel = module.get_function("deskew_kernel") self._d_px_shift, _ = module.get_global('px_shift') self._d_vsin, _ = module.get_global('vsin') self._d_vcos, _ = module.get_global('vcos') self._texture = module.get_texref("ref_vol") self._texture.set_address_mode(0, cuda.address_mode.BORDER) self._texture.set_address_mode(1, cuda.address_mode.BORDER) self._texture.set_address_mode(2, cuda.address_mode.BORDER) self._texture.set_filter_mode(cuda.filter_mode.LINEAR) self._kernel.prepare('Piiiii',texrefs=[self._texture])
def initCuda(self): """ Initialize CUDA environment - Create CUDA C program, and compile it - Copy destination PointCloud to CUDA global and constant value - Create handler of find_closest function in CUDA :return: None """ mod = SourceModule(""" #define ROW (""" + str(self.src.num) + """) #define OFFSET (""" + str(self.numCore) + """) __constant__ float dst[ROW][3]; __global__ void get_dst(float* ret){ int idx = threadIdx.x; ret[idx*3] = dst[idx][0]; ret[idx*3+1] = dst[idx][1]; ret[idx*3+2] = dst[idx][2]; } __global__ void find_closest(float* result, float *ret, float* distances) { for (int idx = threadIdx.x; idx < ROW; idx += OFFSET) { float x_src = result[idx*3]; float y_src = result[idx*3+1]; float z_src = result[idx*3+2]; float x_dst, y_dst, z_dst, minDist, dist; int minIdx = -1; for ( int i = 0; i < ROW; i++){ x_dst = dst[i][0]; y_dst = dst[i][1]; z_dst = dst[i][2]; dist = (x_src-x_dst) * (x_src-x_dst) + (y_src-y_dst) * (y_src-y_dst) + (z_src-z_dst) * (z_src-z_dst); if ( dist < minDist || minIdx < 0 ) { minDist = dist; minIdx = i; } } ret[idx*3] = dst[minIdx][0]; ret[idx*3+1] = dst[minIdx][1]; ret[idx*3+2] = dst[minIdx][2]; distances[idx] = sqrt(minDist); } } """) dstCuda, _ = mod.get_global('dst') assert self.dst.points.dtype == np.float32 cuda.memcpy_htod(dstCuda, self.dst.points) distances = np.zeros(self.src.num, dtype=np.float32) self.distances_gpu = gpuarray.to_gpu(distances) self.computeCorrespondenceCuda = mod.get_function('find_closest') getDstCuda = mod.get_function('get_dst')
def exponential_growth_d(self, g, TILEHEIGHT): template = """ #include <stdlib.h> #include <stdio.h> #include <math.h> #define GENUS %d #define TILEHEIGHT %d __device__ __constant__ double Yinvd[GENUS*GENUS]; __global__ void kernel(double* yd, double* u, int g, int y_len) { int tdy = threadIdx.y; int bdy = blockIdx.y; int tdx = threadIdx.x; __shared__ double yd_s[GENUS*TILEHEIGHT]; yd_s[tdy*g + tdx] = 0; if (bdy*TILEHEIGHT + tdy < y_len) { yd_s[tdy*g + tdx] = yd[(bdy*TILEHEIGHT + tdy) * g + tdx]; } __syncthreads(); if (bdy*TILEHEIGHT + tdy < y_len) { int i,j; double dot = 0; double Yinvy_i; for (i = 0; i < g; i++) { Yinvy_i = 0; for (j = 0; j < g; j++) { Yinvy_i += Yinvd[g*i + j] * yd_s[tdy*g+j]; } dot += yd_s[tdy*g+i] * Yinvy_i; } u[bdy*TILEHEIGHT + tdy] = M_PI * dot; } } """ %(g, TILEHEIGHT) mod = SourceModule(template) func = mod.get_function("kernel") Yinvd = mod.get_global("Yinvd")[0] return (func, Yinvd)
def set_barrier(s, vmax, vwidth): s.vwidth = vwidth s.vmax = vmax s.vx0 = s.nx / 2 - s.vwidth / 2 s.vx1 = s.nx / 2 + s.vwidth / 2 s.vc = np.complex64(np.exp(-1j * s.vmax * s.dt)) kern = ( kernels.replace("HNX", str(s.nx / 2)) .replace("NX", str(s.nx)) .replace("TID0", str(s.vx0)) .replace("TID_MAX", str(s.vx1)) ) print kern mod = SourceModule(kern) s.mul_l = mod.get_function("mul_l") s.mul_v = mod.get_function("mul_v") s.lcx_const, _ = mod.get_global("lcx")
def compile(steps, jinja_env, fields): """ Combine all the operation source codes into one large string, compile it, load the constant values, and store runtime functions into each operation instance. """ print 'Compiling all operations...' # Get the jinja2 template containing the source code. template = jinja_env.get_template('update.cu') # Assemble all the source into one long string. source = jinja_env.get_template('field_access_macros.cu').render( \ field_names=sorted(fields), fields=fields) for step in steps: for op in step.operations: for step_dir in range(3): op.render(jinja_env, step_dir) source += str(op.cuda_source) # Write out source code, for debugging purposes. f = open('source_code.debug', 'w') f.write(source) f.close() # Compile the cuda source code. mod = SourceModule(source) # Load the location (on the GPU) of all the fields. dest, size = mod.get_global("M_FIELD") field_locations = \ np.array([int(fields[fname].d_data) for fname in sorted(fields)]) drv.memcpy_htod(dest, field_locations) # Store ready-to-run functions back into each My_Operation class instance. for step in steps: for op in step.operations: # Create the list to store runtimes for all 3 step directions. op.runtime = [] for step_dir in range(3): op.runtime.append(mod.get_function(op.name + 'XYZ'[step_dir])) op.runtime[step_dir].set_cache_config(drv.func_cache.PREFER_L1) # Compiling finished. print '... compiling complete.', '\n'
def l1_wvd(N, block_size=16, use_double=False): Nb = block_size Ng = int(np.ceil(float(N) / Nb)) Nb2 = Nb Nbd = 2 * N + 1 Nwrite = int(np.ceil(float(Nbd) / Nb2)) cfg = dict(N=N, Ng=Ng, Nb=Nb, Nb2=Nb2, Nbd=Nbd, Nwrite=Nwrite, use_double=use_double) mod = SourceModule(src % cfg) kernel = mod.get_function('kernel') dtype = np.float64 if use_double else np.float32 w0 = np.exp(-2j * np.pi / N) ws = np.array([w0**k for k in range(N)]) wsr_cmem = mod.get_global('wsr')[0] wsi_cmem = mod.get_global('wsi')[0] drv.memcpy_htod(wsr_cmem, np.ascontiguousarray(ws.real.astype(dtype))) drv.memcpy_htod(wsi_cmem, np.ascontiguousarray(ws.imag.astype(dtype))) def run(z, zi=None): if zi is None: indata = np.concatenate([z.real, z.imag]) else: indata = np.concatenate([z, zi]) indata = drv.In(np.ascontiguousarray(indata.astype(dtype))) outdata = drv.InOut(np.zeros(Nbd, dtype=dtype)) kernel(outdata, indata, block=(Nb, 1, 1), grid=(Ng, N)) cost = outdata.array[0] grad_r = outdata.array[1:N + 1] grad_i = outdata.array[N + 1:] return cost, grad_r, grad_i return run
def go(self): import qimage2ndarray image = QtGui.QImage(self.rows, self.columns, QtGui.QImage.Format_RGB32) self.data = qimage2ndarray.rgb_view(image) # Needs a contiguos buffer self.data = numpy.copy(self.data) self.spheres = numpy.array(self.CreateSpheres()) # Init CUDA cuda.init() # Create CUDA Context ctx = pycuda.tools.make_default_context() # Declare event(s) startEvent = cuda.Event() stopEvent = cuda.Event() # Memory on Device gpu_alloc = cuda.mem_alloc(self.data.nbytes) gpu_rows = cuda.mem_alloc(self.rows.nbytes) gpu_columns = cuda.mem_alloc(self.columns.nbytes) # Copy data from Host to Device cuda.memcpy_htod(gpu_rows, self.rows) cuda.memcpy_htod(gpu_columns, self.columns) # Execute on host mod = SourceModule(code) gpu_spheres = mod.get_global("spheres") cuda.memcpy_htod(gpu_spheres[0], self.spheres) kernel = mod.get_function("RayTracer") startEvent.record() kernel(gpu_alloc, gpu_rows, gpu_columns, block=(self.threads, self.threads, 1), grid=(int(self.rows / self.threads), int(self.columns / self.threads))) stopEvent.record() stopEvent.synchronize() print("Time elapsed: %fms" % startEvent.time_till(stopEvent)) # Copy data from Device to Host cuda.memcpy_dtoh(self.data, gpu_alloc) ctx.pop() self.SetImage(self.data)
def get_transduction_func(dtype, block_size, Xaddress, change_ind1, change_ind2, change1, change2, compile_options): template = """ /* This is kept for documentation purposes the actual code used is after the end * of this template */ #include "curand_kernel.h" extern "C" { #include "stdio.h" #define BLOCK_SIZE %(block_size)d #define LA 0.5 /* Simulation Constants */ #define C_T 0.5 /* Total concentration of calmodulin */ #define G_T 50 /* Total number of G-protein */ #define PLC_T 100 /* Total number of PLC */ #define T_T 25 /* Total number of TRP/TRPL channels */ #define I_TSTAR 0.68 /* Average current through one opened TRP/TRPL channel (pA)*/ #define GAMMA_DSTAR 4.0 /* s^(-1) rate constant*/ #define GAMMA_GAP 3.0 /* s^(-1) rate constant*/ #define GAMMA_GSTAR 3.5 /* s^(-1) rate constant*/ #define GAMMA_MSTAR 3.7 /* s^(-1) rate constant*/ #define GAMMA_PLCSTAR 144 /* s^(-1) rate constant */ #define GAMMA_TSTAR 25 /* s^(-1) rate constant */ #define H_DSTAR 37.8 /* strength constant */ #define H_MSTAR 40 /* strength constant */ #define H_PLCSTAR 11.1 /* strength constant */ #define H_TSTARP 11.5 /* strength constant */ #define H_TSTARN 10 /* strength constant */ #define K_P 0.3 /* Dissociation coefficient for calcium positive feedback */ #define K_P_INV 3.3333 /* K_P inverse ( too many decimals are not important) */ #define K_N 0.18 /* Dissociation coefficient for calmodulin negative feedback */ #define K_N_INV 5.5555 /* K_N inverse ( too many decimals are not important) */ #define K_U 30 /* (mM^(-1)s^(-1)) Rate of Ca2+ uptake by calmodulin */ #define K_R 5.5 /* (mM^(-1)s^(-1)) Rate of Ca2+ release by calmodulin */ #define K_CA 1000 /* s^(-1) diffusion from microvillus to somata (tuned) */ #define K_NACA 3e-8 /* Scaling factor for Na+/Ca2+ exchanger model */ #define KAPPA_DSTAR 1300.0 /* s^(-1) rate constant - there is also a capital K_DSTAR */ #define KAPPA_GSTAR 7.05 /* s^(-1) rate constant */ #define KAPPA_PLCSTAR 15.6 /* s^(-1) rate constant */ #define KAPPA_TSTAR 150.0 /* s^(-1) rate constant */ #define K_DSTAR 100.0 /* rate constant */ #define F 96485 /* (mC/mol) Faraday constant (changed from paper)*/ #define N 4 /* Binding sites for calcium on calmodulin */ #define R 8.314 /* (J*K^-1*mol^-1)Gas constant */ #define T 293 /* (K) Absolute temperature */ #define VOL 3e-9 /* changed from 3e-12microlitres to nlitres * microvillus volume so that units agree */ #define N_S0_DIM 1 /* initial condition */ #define N_S0_BRIGHT 2 #define A_N_S0_DIM 4 /* upper bound for dynamic increase (of negetive feedback) */ #define A_N_S0_BRIGHT 200 #define TAU_N_S0_DIM 3000 /* time constant for negative feedback */ #define TAU_N_S0_BRIGHT 1000 #define NA_CO 120 /* (mM) Extracellular sodium concentration */ #define NA_CI 8 /* (mM) Intracellular sodium concentration */ #define CA_CO 1.5 /* (mM) Extracellular calcium concentration */ #define G_TRP 8 /* conductance of a TRP channel */ #define TRP_REV 0 /* TRP channel reversal potential (mV) */ __device__ __constant__ long long int d_X[5]; __device__ __constant__ int change_ind1[13]; __device__ __constant__ int change1[13]; __device__ __constant__ int change_ind2[13]; __device__ __constant__ int change2[13]; /* cc = n/(NA*VOL) [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */ __device__ float num_to_mM(int n) { return n * 5.5353e-4; // n/1806.6; } /* n = cc*VOL*NA [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */ __device__ float mM_to_num(float cc) { return rintf(cc * 1806.6); } /* Assumes Hill constant (=2) for positive calcium feedback */ __device__ float compute_fp(float Ca_cc) { float tmp = Ca_cc*K_P_INV; tmp *= tmp; return tmp/(1 + tmp); } /* Assumes Hill constant(=3) for negative calmodulin feedback */ __device__ float compute_fn(float Cstar_cc, float ns) { float tmp = Cstar_cc*K_N_INV; tmp *= tmp*tmp; return ns*tmp/(1 + tmp); } /* Vm [V] */ __device__ float compute_ca(int Tstar, float Cstar_cc, float Vm) { float I_in = Tstar*G_TRP*fmaxf(-Vm + 0.001*TRP_REV, 0); /* CaM = C_T - Cstar_cc */ float denom = (K_CA + (N*K_U*C_T) - (N*K_U)*Cstar_cc + 179.0952 * expf(-(F/(R*T))*Vm)); // (K_NACA*NA_CO^3/VOL*F) /* I_Ca ~= 0.4*I_in */ float numer = (0.4*I_in)/(2*VOL*F) + ((K_NACA*CA_CO*NA_CI*NA_CI*NA_CI)/(VOL*F)) + // in paper it's -K_NACA... due to different conventions N*K_R*Cstar_cc; return fmaxf(1.6e-4, numer/denom); } __global__ void transduction(curandStateXORWOW_t *state, float dt, %(type)s* d_Vm, %(type)s* g_ns, %(type)s* input, int* num_microvilli, int total_microvilli, int* count) { int tid = threadIdx.x; int gid = threadIdx.x + blockIdx.x * blockDim.x; int wid = tid %% 32; __shared__ int X[BLOCK_SIZE][7]; // number of molecules __shared__ float Ca[BLOCK_SIZE]; __shared__ float fn[BLOCK_SIZE]; float Vm, ns, lambda; float sumrate, dt_advanced; int reaction_ind; ushort2 tmp; // copy random generator state locally to avoid accessing global memory curandStateXORWOW_t localstate = state[gid]; int mid; // microvilli ID __shared__ volatile int mi; // starting point of mid per ward // use atomicAdd to obtain the starting mid for the warp if(wid == 0) { mi = atomicAdd(count, 32); } mid = mi + wid; int mid; while(mid < total_microvilli) { // load photoreceptor index of the microvilli ind = ((ushort*)d_X[4])[mid]; // load variables that are needed for computing calcium concentration tmp = ((ushort2*)d_X[2])[mid]; X[tid][5] = tmp.x; X[tid][6] = tmp.y; Vm = d_Vm[ind]*1e-3; ns = g_ns[ind]; // update calcium concentration Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn(num_to_mM(X[tid][5]), ns); lambda = input[ind]/num_microvilli[ind]; // load the rest of variables tmp = ((ushort2*)d_X[1])[mid]; X[tid][4] = tmp.y; X[tid][3] = tmp.x; tmp = ((ushort2*)d_X[0])[mid]; X[tid][2] = tmp.y; X[tid][1] = tmp.x; X[tid][0] = ((ushort*)d_X[3])[mid]; // compute total rate of reaction sumrate = lambda; sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11 sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12 sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10 sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8 sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7 sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1 sumrate += KAPPA_DSTAR * X[tid][3]; // 6 sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4 sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); // 3 sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5 sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2 sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5 ; // 9 // choose the next reaction time dt_advanced = -logf(curand_uniform(&localstate))/(LA + sumrate); // If the reaction time is smaller than dt, // pick the reaction and update, // then compute the total rate and next reaction time again // until all dt_advanced is larger than dt. // Note that you don't have to compensate for // the last reaction time that exceeds dt. // The reason is that the exponential distribution is MEMORYLESS. while(dt_advanced <= dt) { reaction_ind = 0; sumrate = curand_uniform(&localstate) * sumrate; if(sumrate > 2e-5) { sumrate -= lambda; reaction_ind = (sumrate<=2e-5) * 13; if(!reaction_ind) { sumrate -= mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); reaction_ind = (sumrate<=2e-5) * 11; if(!reaction_ind) { sumrate -= mM_to_num(K_R) * num_to_mM(X[tid][5]); reaction_ind = (sumrate<=2e-5) * 12; if(!reaction_ind) { sumrate -= GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; reaction_ind = (sumrate<=2e-5) * 10; if(!reaction_ind) { sumrate -= GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; reaction_ind = (sumrate<=2e-5) * 8; if(!reaction_ind) { sumrate -= GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 7; if(!reaction_ind) { sumrate -= GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 1; if(!reaction_ind) { sumrate -= KAPPA_DSTAR * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 6; if(!reaction_ind) { sumrate -= GAMMA_GAP * X[tid][2] * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 4; if(!reaction_ind) { sumrate -= KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); reaction_ind = (sumrate<=2e-5) * 3; if(!reaction_ind) { sumrate -= GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); reaction_ind = (sumrate<=2e-5) * 5; if(!reaction_ind) { sumrate -= KAPPA_GSTAR * X[tid][1] * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 2; if(!reaction_ind) { sumrate -= (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; reaction_ind = (sumrate<=2e-5) * 9; } } } } } } } } } } } } } int ind; // only up to two state variables are needed to be updated // update the first one. ind = change_ind1[reaction_ind]; X[tid][ind] += change1[reaction_ind]; //if(reaction_ind == 9) //{ // X[tid][ind] = max(X[tid][ind], 0); //} ind = change_ind2[reaction_ind]; //update the second one if(ind != 0) { X[tid][ind] += change2[reaction_ind]; } // compute the advance time again Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns ); //fp[tid] = compute_fp( Ca[tid] ); sumrate = lambda; sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11 sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12 sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10 sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8 sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7 sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1 sumrate += KAPPA_DSTAR * X[tid][3]; // 6 sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4 sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); // 3 sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5 sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2 sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; // 9 dt_advanced -= logf(curand_uniform(&localstate))/(LA + sumrate); } // end while ((ushort*)d_X[3])[mid] = X[tid][0]; ((ushort2*)d_X[0])[mid] = make_ushort2(X[tid][1], X[tid][2]); ((ushort2*)d_X[1])[mid] = make_ushort2(X[tid][3], X[tid][4]); ((ushort2*)d_X[2])[mid] = make_ushort2(X[tid][5], X[tid][6]); if(wid == 0) { mi = atomicAdd(count, 32); } mid = mi + wid; } // copy the updated random generator state back to global memory state[gid] = localstate; } } """ template_run = """ #include "curand_kernel.h" extern "C" { #include "stdio.h" #define BLOCK_SIZE %(block_size)d #define LA 0.5 __device__ __constant__ long long int d_X[5]; __device__ __constant__ int change_ind1[13]; __device__ __constant__ int change1[13]; __device__ __constant__ int change_ind2[13]; __device__ __constant__ int change2[13]; __device__ float num_to_mM(int n) { return n * 5.5353e-4; // n/1806.6; } __device__ float mM_to_num(float cc) { return rintf(cc * 1806.6); } __device__ float compute_fp( float ca_cc) { float tmp = ca_cc*3.3333333333; tmp *= tmp; return tmp/(1+tmp); } __device__ float compute_fn( float Cstar_cc, float ns) { float tmp = Cstar_cc*5.55555555; tmp *= tmp*tmp; return ns*tmp/(1+tmp); } __device__ float compute_ca(int Tstar, float cstar_cc, float Vm) { float I_in = Tstar*8*fmaxf(-Vm,0); float denom = (1060 - 120*cstar_cc + 179.0952 * expf(-39.60793*Vm)); float numer = I_in * 690.9537 + 0.0795979 + 22*cstar_cc; return fmaxf(1.6e-4, numer/denom); } __global__ void transduction(curandStateXORWOW_t *state, float dt, %(type)s* d_Vm, %(type)s* g_ns, %(type)s* input, int* num_microvilli, int total_microvilli, int* count) { int tid = threadIdx.x; int gid = threadIdx.x + blockIdx.x * blockDim.x; int wid = tid %% 32; __shared__ int X[BLOCK_SIZE][7]; // number of molecules __shared__ float Ca[BLOCK_SIZE]; __shared__ float fn[BLOCK_SIZE]; float Vm, ns, lambda; float sumrate, dt_advanced; int reaction_ind; ushort2 tmp; // copy random generator state locally to avoid accessing global memory curandStateXORWOW_t localstate = state[gid]; int mid; // microvilli ID __shared__ volatile int mi; // starting point of mid per ward // use atomicAdd to obtain the starting mid for the warp if(wid == 0) { mi = atomicAdd(count, 32); } mid = mi + wid; int ind; while(mid < total_microvilli) { ind = ((ushort*)d_X[4])[mid]; // load variables that are needed for computing calcium concentration tmp = ((ushort2*)d_X[2])[mid]; X[tid][5] = tmp.x; X[tid][6] = tmp.y; Vm = d_Vm[ind]*1e-3; ns = g_ns[ind]; // update calcium concentration Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns); lambda = input[ind]/num_microvilli[ind]; // load the rest of variables tmp = ((ushort2*)d_X[1])[mid]; X[tid][4] = tmp.y; X[tid][3] = tmp.x; tmp = ((ushort2*)d_X[0])[mid]; X[tid][2] = tmp.y; X[tid][1] = tmp.x; X[tid][0] = ((ushort*)d_X[3])[mid]; sumrate = lambda + 54198 * Ca[tid] * (0.5 - X[tid][5] * 5.5353e-4) + 5.5 * X[tid][5]; // 11, 12 sumrate += 25 * (1+10*fn[tid]) * X[tid][6]; // 10 sumrate += 4 * (1+37.8*fn[tid]) * X[tid][4] ; // 8 sumrate += (1444+1598.4*fn[tid]) * X[tid][3] ; // 7, 6 sumrate += (3.7*(1+40*fn[tid]) + 7.05 * X[tid][1]) * X[tid][0] ; // 1, 2 sumrate += (1560 - 12.6 * X[tid][3]) * X[tid][2]; // 3, 4 sumrate += 3.5 * (50 - X[tid][2] - X[tid][1] - X[tid][3]) ; // 5 sumrate += 0.015 * (1+11.5*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(25-X[tid][6])*0.5 ; // 9 dt_advanced = -logf(curand_uniform(&localstate))/(LA+sumrate); // If the reaction time is smaller than dt, // pick the reaction and update, // then compute the total rate and next reaction time again // until all dt_advanced is larger than dt. // Note that you don't have to compensate for // the last reaction time that exceeds dt. // The reason is that the exponential distribution is MEMORYLESS. while (dt_advanced <= dt) { reaction_ind = 0; sumrate = curand_uniform(&localstate) * sumrate; if (sumrate > 2e-5) { sumrate -= lambda; reaction_ind = (sumrate<=2e-5) * 13; if (!reaction_ind) { sumrate -= mM_to_num(30) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); reaction_ind = (sumrate<=2e-5) * 11; if (!reaction_ind) { sumrate -= mM_to_num(5.5) * num_to_mM(X[tid][5]); reaction_ind = (sumrate<=2e-5) * 12; if (!reaction_ind) { sumrate -= 25 * (1+10*fn[tid]) * X[tid][6]; reaction_ind = (sumrate<=2e-5) * 10; if (!reaction_ind) { sumrate -= 4 * (1+37.8*fn[tid]) * X[tid][4]; reaction_ind = (sumrate<=2e-5) * 8; if (!reaction_ind) { sumrate -= 144 * (1+11.1*fn[tid]) * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 7; if (!reaction_ind) { sumrate -= 3.7*(1+40*fn[tid]) * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 1; if (!reaction_ind) { sumrate -= 1300 * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 6; if (!reaction_ind) { sumrate -= 3.0 * X[tid][2] * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 4; if (!reaction_ind) { sumrate -= 15.6 * X[tid][2] * (100-X[tid][3]); reaction_ind = (sumrate<=2e-5) * 3; if (!reaction_ind) { sumrate -= 3.5 * (50 - X[tid][2] - X[tid][1] - X[tid][3]); reaction_ind = (sumrate<=2e-5) * 5; if(!reaction_ind) { sumrate -= 7.05 * X[tid][1] * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 2; if(!reaction_ind) { sumrate -= 0.015 * (1+11.5*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(25-X[tid][6])*0.5; reaction_ind = (sumrate<=2e-5) * 9; } } } } } } } } } } } } } int ind; // only up to two state variables are needed to be updated // update the first one. ind = change_ind1[reaction_ind]; X[tid][ind] += change1[reaction_ind]; //update the second one ind = change_ind2[reaction_ind]; if (ind != 0) X[tid][ind] += change2[reaction_ind]; // compute the advance time again Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns ); sumrate = lambda + 54198*Ca[tid]*(0.5 - X[tid][5]*5.5353e-4) + 5.5*X[tid][5]; // 11, 12 sumrate += 25*(1 + 10*fn[tid])*X[tid][6]; // 10 sumrate += 4*(1 + 37.8*fn[tid])*X[tid][4]; // 8 sumrate += (1444 + 1598.4*fn[tid])*X[tid][3]; // 7, 6 sumrate += (3.7*(1 + 40*fn[tid]) + 7.05*X[tid][1])*X[tid][0]; // 1, 2 sumrate += (1560 - 12.6*X[tid][3])*X[tid][2]; // 3, 4 sumrate += 3.5*(50 - X[tid][2] - X[tid][1] - X[tid][3]); // 5 sumrate += 0.015*(1 + 11.5*compute_fp( Ca[tid] )) *X[tid][4]*(X[tid][4] - 1)*(25 - X[tid][6])*0.5; // 9 dt_advanced -= logf(curand_uniform(&localstate))/(LA+sumrate); } // end while ((ushort*)d_X[3])[mid] = X[tid][0]; ((ushort2*)d_X[0])[mid] = make_ushort2(X[tid][1], X[tid][2]); ((ushort2*)d_X[1])[mid] = make_ushort2(X[tid][3], X[tid][4]); ((ushort2*)d_X[2])[mid] = make_ushort2(X[tid][5], X[tid][6]); if(wid == 0) { mi = atomicAdd(count, 32); } mid = mi + wid; } // copy the updated random generator state back to global memory state[gid] = localstate; } } """ try: co = [compile_options[0]+' --maxrregcount=54'] except IndexError: co = ['--maxrregcount=54'] scalartype = dtype.type if isinstance(dtype, np.dtype) else dtype mod = SourceModule( template_run % { "type": dtype_to_ctype(dtype), "block_size": block_size, "fletter": 'f' if scalartype == np.float32 else '' }, options = co, no_extern_c = True) func = mod.get_function('transduction') d_X_address, d_X_nbytes = mod.get_global("d_X") cuda.memcpy_htod(d_X_address, Xaddress) d_change_ind1_address, d_change_ind1_nbytes = mod.get_global("change_ind1") d_change_ind2_address, d_change_ind2_nbytes = mod.get_global("change_ind2") d_change1_address, d_change1_nbytes = mod.get_global("change1") d_change2_address, d_change2_nbytes = mod.get_global("change2") cuda.memcpy_htod(d_change_ind1_address, change_ind1) cuda.memcpy_htod(d_change_ind2_address, change_ind2) cuda.memcpy_htod(d_change1_address, change1) cuda.memcpy_htod(d_change2_address, change2) func.prepare('PfPPPPiP') func.set_cache_config(cuda.func_cache.PREFER_SHARED) return func
def main(): #FourPermutations set-up FourPermutations = numpy.array([ [1,2,3,4], [1,2,4,3], [1,3,2,4], [1,3,4,2], [1,4,2,3], [1,4,3,2], [2,1,3,4], [2,1,4,3], [2,3,1,4], [2,3,4,1], [2,4,1,3], [2,4,3,1], [3,2,1,4], [3,2,4,1], [3,1,2,4], [3,1,4,2], [3,4,2,1], [3,4,1,2], [4,2,3,1], [4,2,1,3], [4,3,2,1], [4,3,1,2], [4,1,2,3], [4,1,3,2],]).astype(numpy.uint8) #Create dictionary argument for rendering RenderArgs= {"safe_memory_mapping":1, "aligned_byte_length_genome":8, "bit_length_edge_type":3, "curand_nr_threads_per_block":256, "nr_tile_types":4, "nr_edge_types":8, "warpsize":32, "fit_dim_thread_x":1, "fit_dim_thread_y":1, "fit_dim_block_x":1, "fit_dim_grid_x":19, "fit_dim_grid_y":19, "fit_nr_four_permutations":24, "fit_length_movelist":244, "fit_nr_redundancy_grid_depth":2, "fit_nr_redundancy_assemblies":10, "fit_tile_index_starting_tile":0, "glob_nr_tile_orientations":4 } # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', './')) # Load source code from file Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() ) # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20") Kernel = KernelSourceModule.get_function("TestAssemblyKernel") CurandKernel = KernelSourceModule.get_function("CurandInitKernel") #Initialise InteractionMatrix InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32) def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Set up our InteractionMatrix InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") print InteractionMatrix #Set-up genomes dest = numpy.arange(256).astype(numpy.uint8) for i in range(0, 256/8): #dest[i*8 + 0] = int('0b00100101',2) #CRASHES #dest[i*8 + 1] = int('0b00010000',2) #CRASHES #dest[i*8 + 0] = int('0b00101000',2) #dest[i*8 + 1] = int('0b00000000',2) #dest[i*8 + 2] = int('0b00000000',2) #dest[i*8 + 3] = int('0b00000000',2) #dest[i*8 + 4] = int('0b00000000',2) #dest[i*8 + 5] = int('0b00000000',2) #dest[i*8 + 6] = int('0b00000000',2) #dest[i*8 + 7] = int('0b00000000',2) dest[i*8 + 0] = 36 dest[i*8 + 1] = 151 dest[i*8 + 2] = 90 dest[i*8 + 3] = 109 dest[i*8 + 4] = 224 dest[i*8 + 5] = 4 dest[i*8 + 6] = 0 dest[i*8 + 7] = 0 dest[0] = 40 dest[1] = 0 dest[2] = 0 dest[3] = 0 dest[4] = 0 dest[5] = 0 dest[6] = 0 dest[7] = 0 dest_h = drv.mem_alloc(dest.nbytes) drv.memcpy_htod(dest_h, dest) print "Genomes before: " print dest #Set-up grids grids = numpy.zeros((32, 19, 19)).astype(numpy.uint8) grids_h = drv.mem_alloc(grids.nbytes) drv.memcpy_htod(grids_h, grids) print "Grids:" print grids #Set-up fitness values fitness = numpy.zeros(256).astype(numpy.float32) fitness_h = drv.mem_alloc(fitness.nbytes) drv.memcpy_htod(fitness_h, fitness) print "Fitness values:" print fitness #Set-up curand curand = numpy.zeros(40*256).astype(numpy.uint8); curand_h = drv.mem_alloc(curand.nbytes) #Set-up four permutations FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) #Set-up timers #start = drv.Event() #stop = drv.Event() #start.record() #Call kernels CurandKernel(curand_h, block=(32,1,1), grid=(1,1)) Kernel(dest_h, fitness_h, grids_h, curand_h, block=(32,1,1), grid=(1,1)) #drv.Context.synchronize() #Clean-up timer #stop.synchronize() #print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations) #pass #Output drv.memcpy_dtoh(dest, dest_h) print "Genomes after: " print dest drv.memcpy_dtoh(fitness, fitness_h) print "Fitness after: " print fitness drv.memcpy_dtoh(grids, grids_h) print "Grids[0] after: " print grids[0] print "Grids[31] after:" print grids[31]
g_y = cuda.to_device(y_map_bin) g_out = cuda.to_device(results) g_num_el = np.int32(num_el) align = np.ceil( 1.0*sliceSize*threadsPerRow/minAlign)*minAlign g_align = np.int32(align) g_i = np.int32(i) g_j = np.int32(j) g_i_ds= np.int32(i) g_j_ds= np.int32(j) g_cls1N_aligned = np.int32(align_cls1_n) #gamma, copy to constant memory (g_gamma,gsize)=module.get_global('GAMMA') cuda.memcpy_htod(g_gamma, np.float32(gamma) ) g_cls_start = cuda.to_device(start_cls) g_cls_count = cuda.to_device(count_cls) g_cls = cuda.to_device(bin_cls) #start_event = cuda.Event() #stop_event = cuda.Event() start_event.record() func(g_val, g_col,
class RimeEKBSqrt(Node): def __init__(self): super(RimeEKBSqrt, self).__init__() def initialise(self, solver, stream=None): slvr = solver ntime, na, npolchan = slvr.dim_local_size('ntime', 'na', 'npolchan') # Get a property dictionary off the solver D = slvr.template_dict() # Include our kernel parameters D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS) D['rime_const_data_struct'] = slvr.const_data().string_def() D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \ mbu.redistribute_threads( D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'], npolchan, na, ntime) regs = str(FLOAT_PARAMS['maxregs'] \ if slvr.is_float() else DOUBLE_PARAMS['maxregs']) kname = 'rime_jones_EKBSqrt_float' \ if slvr.is_float() is True else \ 'rime_jones_EKBSqrt_double' kernel_string = KERNEL_TEMPLATE.substitute(**D) self.mod = SourceModule(kernel_string, options=['-lineinfo','-maxrregcount', regs], include_dirs=[montblanc.get_source_path()], no_extern_c=True) self.rime_const_data = self.mod.get_global('C') self.kernel = self.mod.get_function(kname) self.launch_params = self.get_launch_params(slvr, D) def shutdown(self, solver, stream=None): pass def pre_execution(self, solver, stream=None): pass def get_launch_params(self, slvr, D): polchans_per_block = D['BLOCKDIMX'] ants_per_block = D['BLOCKDIMY'] times_per_block = D['BLOCKDIMZ'] ntime, na, npolchan = slvr.dim_local_size('ntime', 'na', 'npolchan') polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block) ant_blocks = mbu.blocks_required(na, ants_per_block) time_blocks = mbu.blocks_required(ntime, times_per_block) return { 'block' : (polchans_per_block, ants_per_block, times_per_block), 'grid' : (polchan_blocks, ant_blocks, time_blocks), } def execute(self, solver, stream=None): slvr = solver if stream is not None: cuda.memcpy_htod_async( self.rime_const_data[0], slvr.const_data().ndary(), stream=stream) else: cuda.memcpy_htod( self.rime_const_data[0], slvr.const_data().ndary()) self.kernel(slvr.uvw, slvr.lm, slvr.frequency, slvr.B_sqrt, slvr.jones, stream=stream, **self.launch_params) def post_execution(self, solver, stream=None): pass
} } ''' template = string.Template(template) code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS, KERNEL_W=KERNEL_W, COLUMN_TILE_H=COLUMN_TILE_H, COLUMN_TILE_W=COLUMN_TILE_W, ROW_TILE_W=ROW_TILE_W, KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED) module = SourceModule(code) convolutionRowGPU = module.get_function('convolutionRowGPU') convolutionColumnGPU = module.get_function('convolutionColumnGPU') d_Kernel_rows = module.get_global('d_Kernel_rows')[0] d_Kernel_columns = module.get_global('d_Kernel_columns')[0] # Helper functions for computing alignment... def iDivUp(a, b): # Round a / b to nearest higher integer value a = numpy.int32(a) b = numpy.int32(b) return (a / b + 1) if (a % b != 0) else (a / b) def iDivDown(a, b): # Round a / b to nearest lower integer value a = numpy.int32(a) b = numpy.int32(b) return a / b
def get_transduction_func(dtype, block_size, num_microvilli, Xaddress, change_ind1, change_ind2, change1, change2): template = """ #include "curand_kernel.h" extern "C" { #include "stdio.h" #define NUM_MICROVILLI %(num_microvilli)d #define BLOCK_SIZE %(block_size)d #define LA 0.5 /* Simulation Constants */ #define C_T 0.5 /* Total concentration of calmodulin */ #define G_T 50 /* Total number of G-protein */ #define PLC_T 100 /* Total number of PLC */ #define T_T 25 /* Total number of TRP/TRPL channels */ #define I_TSTAR 0.68 /* Average current through one opened TRP/TRPL channel (pA)*/ #define GAMMA_DSTAR 4.0 /* s^(-1) rate constant*/ #define GAMMA_GAP 3.0 /* s^(-1) rate constant*/ #define GAMMA_GSTAR 3.5 /* s^(-1) rate constant*/ #define GAMMA_MSTAR 3.7 /* s^(-1) rate constant*/ #define GAMMA_PLCSTAR 144 /* s^(-1) rate constant */ #define GAMMA_TSTAR 25 /* s^(-1) rate constant */ #define H_DSTAR 37.8 /* strength constant */ #define H_MSTAR 40 /* strength constant */ #define H_PLCSTAR 11.1 /* strength constant */ #define H_TSTARP 11.5 /* strength constant */ #define H_TSTARN 10 /* strength constant */ #define K_P 0.3 /* Dissociation coefficient for calcium positive feedback */ #define K_P_INV 3.3333 /* K_P inverse ( too many decimals are not important) */ #define K_N 0.18 /* Dissociation coefficient for calmodulin negative feedback */ #define K_N_INV 5.5555 /* K_N inverse ( too many decimals are not important) */ #define K_U 30 /* (mM^(-1)s^(-1)) Rate of Ca2+ uptake by calmodulin */ #define K_R 5.5 /* (mM^(-1)s^(-1)) Rate of Ca2+ release by calmodulin */ #define K_CA 1000 /* s^(-1) diffusion from microvillus to somata (tuned) */ #define K_NACA 3e-8 /* Scaling factor for Na+/Ca2+ exchanger model */ #define KAPPA_DSTAR 1300.0 /* s^(-1) rate constant - there is also a capital K_DSTAR */ #define KAPPA_GSTAR 7.05 /* s^(-1) rate constant */ #define KAPPA_PLCSTAR 15.6 /* s^(-1) rate constant */ #define KAPPA_TSTAR 150.0 /* s^(-1) rate constant */ #define K_DSTAR 100.0 /* rate constant */ #define F 96485 /* (mC/mol) Faraday constant (changed from paper)*/ #define N 4 /* Binding sites for calcium on calmodulin */ #define R 8.314 /* (J*K^-1*mol^-1)Gas constant */ #define T 293 /* (K) Absolute temperature */ #define VOL 3e-9 /* changed from 3e-12microlitres to nlitres * microvillus volume so that units agree */ #define N_S0_DIM 1 /* initial condition */ #define N_S0_BRIGHT 2 #define A_N_S0_DIM 4 /* upper bound for dynamic increase (of negetive feedback) */ #define A_N_S0_BRIGHT 200 #define TAU_N_S0_DIM 3000 /* time constant for negative feedback */ #define TAU_N_S0_BRIGHT 1000 #define NA_CO 120 /* (mM) Extracellular sodium concentration */ #define NA_CI 8 /* (mM) Intracellular sodium concentration */ #define CA_CO 1.5 /* (mM) Extracellular calcium concentration */ #define G_TRP 8 /* conductance of a TRP channel */ #define TRP_REV 0 /* TRP channel reversal potential */ __device__ __constant__ long long int d_X[4]; __device__ __constant__ int change_ind1[13]; __device__ __constant__ int change1[13]; __device__ __constant__ int change_ind2[13]; __device__ __constant__ int change2[13]; /* cc = n/(NA*VOL) [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */ __device__ float num_to_mM(int n) { return n * 5.5353e-4; // n/1806.6; } /* n = cc*VOL*NA [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */ __device__ float mM_to_num(float cc) { return rintf(cc * 1806.6); } /* Assumes Hill constant (=2) for positive calcium feedback */ __device__ float compute_fp(float Ca_cc) { float tmp = Ca_cc*K_P_INV; tmp *= tmp; return tmp/(1 + tmp); } /* Assumes Hill constant(=3) for negative calmodulin feedback */ __device__ float compute_fn(float Cstar_cc, float ns) { float tmp = Cstar_cc*K_N_INV; tmp *= tmp*tmp; return ns*tmp/(1 + tmp); } /* Vm [V] */ __device__ float compute_ca(int Tstar, float Cstar_cc, float Vm) { float I_in = Tstar*G_TRP*fmaxf(-Vm, -TRP_REV); /* CaM = C_T - Cstar_cc */ float denom = (K_CA + (N*K_U*C_T) - (N*K_U)*Cstar_cc + 179.0952 * expf(-(F/(R*T))*Vm)); // (K_NACA*NA_CO^3/VOL*F) /* I_Ca ~= 0.4*I_in */ float numer = (0.4*I_in)/(2*VOL*F) + ((K_NACA*CA_CO*NA_CI*NA_CI*NA_CI)/(VOL*F)) + // in paper it's -K_NACA... due to different conventions N*K_R*Cstar_cc; return fmaxf(1.6e-4, numer/denom); } __global__ void transduction(curandStateXORWOW_t *state, int ld1, float dt, %(type)s* d_Vm, float ns) { int tid = threadIdx.x; int bid = blockIdx.x; __shared__ int X[BLOCK_SIZE][7]; // number of molecules __shared__ float Ca[BLOCK_SIZE]; __shared__ float Vm; // membrane voltage, shared over all threads __shared__ float fn[BLOCK_SIZE]; if(tid == 0) { Vm = d_Vm[bid] * 0.001; // V } __syncthreads(); float sumrate; float dt_advanced; int reaction_ind; short2 tmp; // copy random generator state locally to avoid accessing global memory curandStateXORWOW_t localstate = state[BLOCK_SIZE*bid + tid]; // iterate over all microvilli in one photoreceptor for(int i = tid; i < NUM_MICROVILLI; i += BLOCK_SIZE) { // load variables that are needed for computing calcium concentration //Ca[tid] = ((%(type)s*)d_X[7])[bid*ld2 + i]; // no need to store calcium tmp = ((short2*)d_X[2])[bid*ld1 + i]; X[tid][5] = tmp.x; X[tid][6] = tmp.y; // update calcium concentration Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn(num_to_mM(X[tid][5]), ns); // load the rest of variables tmp = ((short2*)d_X[1])[bid*ld1 + i]; X[tid][4] = tmp.y; X[tid][3] = tmp.x; tmp = ((short2*)d_X[0])[bid*ld1 + i]; X[tid][2] = tmp.y; X[tid][1] = tmp.x; X[tid][0] = ((short*)d_X[3])[bid*ld1 + i]; // compute total rate of reaction sumrate = 0; sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11 sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12 sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10 sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8 sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7 sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1 sumrate += KAPPA_DSTAR * X[tid][3]; // 6 sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4 sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); // 3 sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5 sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2 sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5 ; // 9 // choose the next reaction time dt_advanced = -logf(curand_uniform(&localstate))/(LA + sumrate); // If the reaction time is smaller than dt, // pick the reaction and update, // then compute the total rate and next reaction time again // until all dt_advanced is larger than dt. // Note that you don't have to compensate for // the last reaction time that exceeds dt. // The reason is that the exponential distribution is MEMORYLESS. while(dt_advanced <= dt) { reaction_ind = 0; sumrate = curand_uniform(&localstate) * sumrate; sumrate -= mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); reaction_ind = (sumrate<=2e-5) * 11; if(!reaction_ind) { sumrate -= mM_to_num(K_R) * num_to_mM(X[tid][5]); reaction_ind = (sumrate<=2e-5) * 12; if(!reaction_ind) { sumrate -= GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; reaction_ind = (sumrate<=2e-5) * 10; if(!reaction_ind) { sumrate -= GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; reaction_ind = (sumrate<=2e-5) * 8; if(!reaction_ind) { sumrate -= GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 7; if(!reaction_ind) { sumrate -= GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 1; if(!reaction_ind) { sumrate -= KAPPA_DSTAR * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 6; if(!reaction_ind) { sumrate -= GAMMA_GAP * X[tid][2] * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 4; if(!reaction_ind) { sumrate -= KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); reaction_ind = (sumrate<=2e-5) * 3; if(!reaction_ind) { sumrate -= GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); reaction_ind = (sumrate<=2e-5) * 5; if(!reaction_ind) { sumrate -= KAPPA_GSTAR * X[tid][1] * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 2; if(!reaction_ind) { sumrate -= (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; reaction_ind = (sumrate<=2e-5) * 9; } } } } } } } } } } } int ind; // only up to two state variables are needed to be updated // update the first one. ind = change_ind1[reaction_ind]; X[tid][ind] += change1[reaction_ind]; //if(reaction_ind == 9) //{ // X[tid][ind] = max(X[tid][ind], 0); //} ind = change_ind2[reaction_ind]; //update the second one if(ind != 0) { X[tid][ind] += change2[reaction_ind]; } // compute the advance time again Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns ); //fp[tid] = compute_fp( Ca[tid] ); sumrate = 0; sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11 sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12 sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10 sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8 sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7 sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1 sumrate += KAPPA_DSTAR * X[tid][3]; // 6 sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4 sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); // 3 sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5 sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2 sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; // 9 dt_advanced -= logf(curand_uniform(&localstate))/(LA + sumrate); } // end while ((short*)d_X[3])[bid*ld1 + i] = X[tid][0]; ((short2*)d_X[0])[bid*ld1 + i] = make_short2(X[tid][1], X[tid][2]); ((short2*)d_X[1])[bid*ld1 + i] = make_short2(X[tid][3], X[tid][4]); ((short2*)d_X[2])[bid*ld1 + i] = make_short2(X[tid][5], X[tid][6]); } // copy the updated random generator state back to global memory state[BLOCK_SIZE*bid + tid] = localstate; } } """ #ptxas info : 77696 bytes gmem, 336 bytes cmem[3] #ptxas info : Compiling entry function 'transduction' for 'sm_35' #ptxas info : Function properties for transduction # 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads #ptxas info : Used 60 registers, 7176 bytes smem, 352 bytes cmem[0], 324 bytes cmem[2] #float : Used 65 registers, 7172 bytes smem, 344 bytes cmem[0], 168 bytes cmem[2] scalartype = dtype.type if isinstance(dtype, np.dtype) else dtype mod = SourceModule(template % { "type": dtype_to_ctype(dtype), "block_size": block_size, "num_microvilli": num_microvilli, "fletter": 'f' if scalartype == np.float32 else '' }, options=["--ptxas-options=-v --maxrregcount=56"], no_extern_c=True) func = mod.get_function('transduction') d_X_address, d_X_nbytes = mod.get_global("d_X") cuda.memcpy_htod(d_X_address, Xaddress) d_change_ind1_address, d_change_ind1_nbytes = mod.get_global("change_ind1") d_change_ind2_address, d_change_ind2_nbytes = mod.get_global("change_ind2") d_change1_address, d_change1_nbytes = mod.get_global("change1") d_change2_address, d_change2_nbytes = mod.get_global("change2") cuda.memcpy_htod(d_change_ind1_address, change_ind1) cuda.memcpy_htod(d_change_ind2_address, change_ind2) cuda.memcpy_htod(d_change1_address, change1) cuda.memcpy_htod(d_change2_address, change2) func.prepare([np.intp, np.int32, np.float32, np.intp, np.float32]) func.set_cache_config(cuda.func_cache.PREFER_SHARED) return func
while ( tid < TID_MAX && j >= VY0 && j < VY1 ) { rvc = vc[0]; rpsi = psi[tid]; psi[tid].x = rvc.x * rpsi.x - rvc.y * rpsi.y; psi[tid].y = rvc.x * rpsi.y + rvc.y * rpsi.x; tid += gridDim.x * blockDim.x; } } '''.replace('NY2',str(ny*2)).replace('NXY',str(nx*ny)).replace('NX',str(nx)).replace('NY',str(ny)).replace('TID0',str(vx0*ny)).replace('TID_MAX',str(vx1*ny)).replace('VY0',str(vy0)).replace('VY1',str(vy1)) #print kernels mod = SourceModule(kernels) lcf = mod.get_function('lcf') vcf = mod.get_function('vcf') lcx_const, _ = mod.get_global('lcx') vc_const, _ = mod.get_global('vc') cuda.memcpy_htod(lcx_const, lcx_sqrt) cuda.memcpy_htod(vc_const, vc) tpb = 256 bpg1, bpg2 = 0, 0 for bpg in xrange(65535, 0, -1): if (nx * ny / tpb) % bpg == 0: bpg1 = bpg if (vwidth * ny / tpb) % bpg == 0: bpg2 = bpg if bpg1 * bpg2 != 0: break print 'tpb = %d, bpg1 = %g, bpg2 = %g' % (tpb, bpg1, bpg2) # save to the h5 file f['data'].create_dataset('psi0', data=psi_gpu.get(), compression='gzip')
]) render_params = np.zeros((1,), render_params_t)[0] # how to do it easier render_params["hintTreeRoot"] = hint_grids.shape[0]-1 render_params["viewSize"][:] = (512, 512) render_params["fovCoef"] = np.tan(np.radians( 45.0 / 2 )) eyePos = np.array([1.5, 1.5, 1.5]) targetPos = np.array([0.5, 0.5, 0.5]) v2wMtx = makeViewToWldMtx(eyePos, targetPos, np.array([0, 0, 1])) w2vMtx = np.linalg.inv(v2wMtx) render_params["eyePos"][:] = eyePos render_params["viewToWldMtx"][:] = v2wMtx[:3] render_params["wldToViewMtx"][:] = w2vMtx[:3] cuda.memcpy_htod(mod.get_global("rp")[0], render_params) hint_grid_tex = mod.get_texref("hint_grid_tex") hint_brick_tex = mod.get_texref("hint_brick_tex") hint_grid_tex.set_address(d_hint_grids, len(hint_grids.data)) hint_brick_tex.set_address(d_hint_bricks, len(hint_bricks.data)) hint_brick_tex.set_format(cuda.array_format.UNSIGNED_INT32, 2) dst = np.zeros((512, 512), np.float32) print "running kernel" #TestFetch(np.float32(0.6), cuda.Out(dst), block = (8, 8, 1), grid=(32, 32), texrefs = [hint_grid_tex, hint_brick_tex]) Trace(cuda.Out(dst), block = (16, 16, 1), grid=(32, 32), texrefs = [hint_grid_tex, hint_brick_tex]) def vis(): import pylab pylab.imshow(dst, origin="bottom")
def __init__(self, model, batch_size, dx, dt=None): source = """ __constant__ float fd_d[3]; __global__ void step_d(const float *const model, float *wfc, float *wfp, const int nb, const int nz, const int nx) { int zblocks_per_shot = gridDim.y / nb; int x = blockDim.x * blockIdx.x + threadIdx.x; int z = blockDim.y * (blockIdx.y % zblocks_per_shot) + threadIdx.y; int b = blockIdx.y / zblocks_per_shot; int i = z * nx + x; int ib = b * nz * nx + i; float lap; bool in_domain = (x > 1) && (x < nx - 2) && (z > 1) && (z < nz - 2) && (b < nb); if (in_domain) { /* Laplacian */ lap = (fd_d[0] * wfc[ib] + fd_d[1] * (wfc[ib + 1] + wfc[ib - 1] + wfc[ib + nx] + wfc[ib - nx]) + fd_d[2] * (wfc[ib + 2] + wfc[ib - 2] + wfc[ib + 2 * nx] + wfc[ib - 2 * nx])); /* Main evolution equation */ wfp[ib] = model[i] * lap + 2 * wfc[ib] - wfp[ib]; } } __global__ void add_sources_d(const float *const model, float *wfp, const float *const source_amplitude, const int *const sources_z, const int *const sources_x, const int nz, const int nx, const int nt, const int ns, const int it) { int x = threadIdx.x; int b = blockIdx.x; int i = sources_z[b * ns + x] * nx + sources_x[b * ns + x]; int ib = b * nz * nx + i; wfp[ib] += source_amplitude[b * ns * nt + x * nt + it] * model[i]; } """ mod = SourceModule(source, options=[ '-ccbin', 'clang-3.8', '--restrict', '--use_fast_math', '-O3' ]) jitfunc1 = mod.get_function('step_d') jitfunc2 = mod.get_function('add_sources_d') fd_d = mod.get_global('fd_d')[0] pad = 3 super(VPycuda1, self).__init__(jitfunc1, jitfunc2, fd_d, model, batch_size, pad, dx, dt)
def __init__(self,instance,algorithm,verbose=False): """Initializes a direct parallel worker :param instance: an instance of class from_scatterers_direct(\ cctbx.xray.structure_factors.manager.managed_calculation_base) :type instance: cctbx.xray.structure_factors.from_scatterers_direct :param algorithm: an instance of class direct_summation_cuda_platform(\ direct_summation_simple) with algorithm set to "simple" or "pycuda" :type algorithm: cctbx.xray.structure_factors.direct_summation_cuda_platform """ self.scatterers = instance._xray_structure.scatterers() self.registry = instance._xray_structure.scattering_type_registry() self.miller_indices = instance._miller_set.indices() self.unit_cell = instance._miller_set.unit_cell() self.space_group = instance._miller_set.space_group().make_tidy() if verbose: self.print_diagnostics() # some diagnostics used for development if hasattr(algorithm,"simple"): instance._results = ext.structure_factors_simple( self.unit_cell, instance._miller_set.space_group(), self.miller_indices, self.scatterers, self.registry); return if hasattr(algorithm,"pycuda"): import pycuda.driver as cuda from pycuda.compiler import SourceModule self.validate_the_inputs(instance,cuda,algorithm) self.prepare_miller_arrays_for_cuda(algorithm) self.prepare_scattering_sites_for_cuda(algorithm) self.prepare_gaussians_symmetries_cell(algorithm) assert cuda.Device.count() >= 1 device = cuda.Device(0) WARPSIZE=device.get_attribute(cuda.device_attribute.WARP_SIZE) # 32 MULTIPROCESSOR_COUNT=device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT) sort_mod = SourceModule((mod_fhkl_sorted%(self.gaussians.shape[0], self.symmetry.shape[0], self.sym_stride, self.g_stride, int(self.use_debye_waller), self.order_z,self.order_p)).replace("floating_point_t",algorithm.float_t) ) r_m_m_address = sort_mod.get_global("reciprocal_metrical_matrix")[0] cuda.memcpy_htod(r_m_m_address, self.reciprocal_metrical_matrix) gaussian_address = sort_mod.get_global("gaussians")[0] cuda.memcpy_htod(gaussian_address, self.gaussians) symmetry_address = sort_mod.get_global("symmetry")[0] cuda.memcpy_htod(symmetry_address, self.symmetry) CUDA_fhkl = sort_mod.get_function("CUDA_fhkl") intermediate_real = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t) intermediate_imag = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t) for x in range(self.scatterers.number_of_types): fhkl_real = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t) fhkl_imag = algorithm.numpy.zeros((self.n_flat_hkl,),algorithm.numpy_t) CUDA_fhkl(cuda.InOut(fhkl_real), cuda.InOut(fhkl_imag), cuda.In(self.flat_sites), cuda.In(self.weights), cuda.In(self.u_iso), algorithm.numpy.uint32(self.scatterers.increasing_order[x]), algorithm.numpy.uint32(self.scatterers.sorted_ranges[x][0]), algorithm.numpy.uint32(self.scatterers.sorted_ranges[x][1]), cuda.In(self.flat_mix), block=(FHKL_BLOCKSIZE,1,1), grid=((self.n_flat_hkl//FHKL_BLOCKSIZE,1))) intermediate_real += fhkl_real intermediate_imag += fhkl_imag flex_fhkl_real = flex.double(intermediate_real[0:len(self.miller_indices)].astype(algorithm.numpy.float64)) flex_fhkl_imag = flex.double(intermediate_imag[0:len(self.miller_indices)].astype(algorithm.numpy.float64)) instance._results = fcalc_container(flex.complex_double(flex_fhkl_real,flex_fhkl_imag)) return
class RimeSumCoherencies(Node): def __init__(self): super(RimeSumCoherencies, self).__init__() def initialise(self, solver, stream=None): slvr = solver ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan') # Get a property dictionary off the solver D = slvr.template_dict() # Include our kernel parameters D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS) D['rime_const_data_struct'] = slvr.const_data().string_def() D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \ mbu.redistribute_threads( D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'], npolchan, nbl, ntime) regs = str(FLOAT_PARAMS['maxregs'] \ if slvr.is_float() else DOUBLE_PARAMS['maxregs']) # Create the signature of the call to the function stamping macro stamp_args = ', '.join([ 'float' if slvr.is_float() else 'double', 'float2' if slvr.is_float() else 'double2', 'float3' if slvr.is_float() else 'double3', 'true' if slvr.use_weight_vector() else 'false', '1' if slvr.outputs_residuals() else '0']) stamp_fn = ''.join(['stamp_sum_coherencies_fn(', stamp_args, ')']) D['stamp_function'] = stamp_fn kname = 'rime_sum_coherencies' self.mod = SourceModule( KERNEL_TEMPLATE.substitute(**D), options=['-lineinfo','-maxrregcount', regs], include_dirs=[montblanc.get_source_path()], no_extern_c=True) self.rime_const_data = self.mod.get_global('C') self.kernel = self.mod.get_function(kname) self.launch_params = self.get_launch_params(slvr, D) def shutdown(self, solver, stream=None): pass def pre_execution(self, solver, stream=None): pass def get_launch_params(self, slvr, D): polchans_per_block = D['BLOCKDIMX'] bl_per_block = D['BLOCKDIMY'] times_per_block = D['BLOCKDIMZ'] ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan') polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block) bl_blocks = mbu.blocks_required(nbl, bl_per_block) time_blocks = mbu.blocks_required(ntime, times_per_block) return { 'block' : (polchans_per_block, bl_per_block, times_per_block), 'grid' : (polchan_blocks, bl_blocks, time_blocks), } def execute(self, solver, stream=None): slvr = solver if stream is not None: cuda.memcpy_htod_async( self.rime_const_data[0], slvr.const_data().ndary(), stream=stream) else: cuda.memcpy_htod( self.rime_const_data[0], slvr.const_data().ndary()) # The gaussian shape array can be empty if # no gaussian sources were specified. gauss = np.intp(0) if np.product(slvr.gauss_shape.shape) == 0 \ else slvr.gauss_shape sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \ else slvr.sersic_shape self.kernel(slvr.uvw, gauss, sersic, slvr.frequency, slvr.antenna1, slvr.antenna2, slvr.jones, slvr.flag, slvr.weight_vector, slvr.observed_vis, slvr.G_term, slvr.model_vis, slvr.chi_sqrd_result, stream=stream, **self.launch_params) # Call the pycuda reduction kernel. # Divide by the single sigma squared value if a weight vector # is not required. Otherwise the kernel will incorporate the # individual sigma squared values into the sum gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get() if not slvr.use_weight_vector(): slvr.set_X2(gpu_sum/slvr.sigma_sqrd) else: slvr.set_X2(gpu_sum) def post_execution(self, solver, stream=None): pass
phi = np.linspace(0, 2 * math.pi, nsamps).astype(np.float64) psi = np.linspace(0, 2 * math.pi, nsamps).astype(np.float64) tref = np.array([24715.581890875823 for item in theta]).astype(np.float64) rhoTS = np.ones((nmodes, ntimes)).astype(np.complex128) rhoTS[0, :] = np.arange(ntimes) + 1.0j * np.arange(ntimes) rhoTS[1, :] = 2.0 * np.arange(ntimes) + 1.0j * 2.0 * np.arange(ntimes) rhoTS[2, :] = 3.0 * np.arange(ntimes) + 1.0j * 3.0 * np.arange(ntimes) #_____________________ # Pass down data #_____________________ # **-- constants --** nmodes_gpu = mod.get_global("nmodes")[0] nsamps_gpu = mod.get_global("nsamps")[0] ntimes_gpu = mod.get_global("ntimes")[0] nclmns_gpu = mod.get_global("nclmns")[0] detector_tensor_gpu = mod.get_global("det_tns")[0] cuda.memcpy_htod(nmodes_gpu, nmodes) cuda.memcpy_htod(nsamps_gpu, nsamps) cuda.memcpy_htod(ntimes_gpu, ntimes) cuda.memcpy_htod(nclmns_gpu, nclmns) cuda.memcpy_htod(detector_tensor_gpu, detector_tensor) # **---- data -----** selected_modes_gpu = gpuarray.to_gpu(mlist_sort) theta_gpu = gpuarray.to_gpu(theta)
normals = numpy.loadtxt("data/normals.dat").astype(numpy.float32) indexes = numpy.loadtxt("data/indexes.dat").astype(numpy.ushort) vis = numpy.zeros((1, verts.size/3), numpy.float32) template_params = {'dN' : design.size, 'visN': vis.size, 'vN' : verts.size, 'kernelStep' : kernelStep} kernel_code = Template( file = 'vis.cu', searchList = [template_params], ) cuda_module = SourceModule(kernel_code) cuda_call = cuda_module.get_function("vis") cuda_call.prepare("P", (256, 1, 1)) N = vis.size grid_dimensions = (min(32, (N+256-1) // 256 ), 1) vis_gpu = cuda_driver.mem_alloc(vis.nbytes) design_gpu = cuda_module.get_global('design')[0] cuda_driver.memcpy_htod(design_gpu, design) cuda_driver.memcpy_htod(vis_gpu, vis) kernel_n = cuda_module.get_global('kernelN')[0] v_tex = cuda_module.get_texref('v_tex') v_tex_gpu = cuda_driver.to_device(verts) v_tex.set_address(v_tex_gpu, verts.nbytes) v_tex.set_format(cuda_driver.array_format.FLOAT, 1) n_tex = cuda_module.get_texref('n_tex') n_tex_gpu = cuda_driver.to_device(normals) n_tex.set_address(n_tex_gpu, normals.nbytes) n_tex.set_format(cuda_driver.array_format.FLOAT, 1)
while ( tid < TID_MAX ) { rpsi = psi[tid]; psi[tid].x = vc_real * rpsi.x - vc_imag * rpsi.y; psi[tid].y = vc_real * rpsi.y + vc_imag * rpsi.x; tid += gridDim.x * blockDim.x; } } '''.replace('HNX',str(nx/2)).replace('NX',str(nx)).replace('TID0',str(vx0)).replace('TID_MAX',str(vx1)) print kernels mod = SourceModule(kernels) lcf = mod.get_function('lcf') vcf = mod.get_function('vcf') lcx_const, _ = mod.get_global('lcx') cuda.memcpy_htod(lcx_const, lcx_sqrt[1:nx/2+1]) tpb = 256 bpg = 30 * 4 print 'tpb = %d, bpg = %g' % (tpb, bpg) # save to the h5 file # plot & save to the h5 file import matplotlib.pyplot as plt plt.ion() fig = plt.figure() ax1 = fig.add_subplot(2,1,1) ax2 = fig.add_subplot(2,1,2)
psi = np.linspace(0, 2*math.pi, nsamps).astype(np.float64) tref = np.array([24715.581890875823 for item in theta]).astype(np.float64) rhoTS = np.ones((nmodes, ntimes)).astype(np.complex128) rhoTS[0,:] = np.arange(ntimes) + 1.0j*np.arange(ntimes) rhoTS[1,:] = 2.0*np.arange(ntimes) + 1.0j*2.0*np.arange(ntimes) rhoTS[2,:] = 3.0*np.arange(ntimes) + 1.0j*3.0*np.arange(ntimes) #_____________________ # Pass down data #_____________________ # **-- constants --** nmodes_gpu = mod.get_global("nmodes")[0] nsamps_gpu = mod.get_global("nsamps")[0] ntimes_gpu = mod.get_global("ntimes")[0] nclmns_gpu = mod.get_global("nclmns")[0] detector_tensor_gpu = mod.get_global("det_tns")[0] cuda.memcpy_htod(nmodes_gpu, nmodes) cuda.memcpy_htod(nsamps_gpu, nsamps) cuda.memcpy_htod(ntimes_gpu, ntimes) cuda.memcpy_htod(nclmns_gpu, nclmns) cuda.memcpy_htod(detector_tensor_gpu, detector_tensor) # **---- data -----**
class GPURBFEll(object): """RBF Kernel with ellpack format""" cache_size = 100 Gamma = 1.0 #template func_name = 'rbfEllpackILPcol2multi' #template module_file = os.path.dirname(__file__) + '/cu/KernelsEllpackCol2.cu' #template texref_nameI = 'VecI_TexRef' texref_nameJ = 'VecJ_TexRef' max_concurrent_kernels = 1 def __init__(self, gamma=1.0, cache_size=100): """ Initialize object Parameters ------------- max_kernel_nr: int determines maximal concurrent kernel column gpu computation """ self.cache_size = cache_size self.threadsPerRow = 1 self.prefetch = 2 self.tpb = 128 self.Gamma = gamma def init_cuda(self, X, Y, cls_start, max_kernels=1): #assert X.shape[0]==Y.shape[0] self.max_concurrent_kernels = max_kernels self.X = X self.Y = Y self.cls_start = cls_start.astype(np.int32) #handle to gpu memory for y for each concurrent classifier self.g_y = [] #handle to gpu memory for results for each concurrent classifier self.g_out = [] #gpu kernel out self.kernel_out = [] #cpu kernel out #blocks per grid for each concurrent classifier self.bpg = [] #function reference self.func = [] #texture references for each concurrent kernel self.tex_ref = [] #main vectors #gpu self.g_vecI = [] self.g_vecJ = [] #cpu self.main_vecI = [] self.main_vecJ = [] #cpu class self.cls_count = [] self.cls = [] #gpu class self.g_cls_count = [] self.g_cls = [] self.sum_cls = [] for i in range(max_kernels): self.bpg.append(0) self.g_y.append(0) self.g_out.append(0) self.kernel_out.append(0) self.cls_count.append(0) self.cls.append(0) self.g_cls_count.append(0) self.g_cls.append(0) # self.func.append(0) # self.tex_ref.append(0) self.g_vecI.append(0) self.g_vecJ.append(0) # self.main_vecI.append(0) # self.main_vecJ.append(0) self.sum_cls.append(0) self.N, self.Dim = X.shape column_size = self.N * 4 cacheMB = self.cache_size * 1024 * 1024 #100MB for cache size #how many kernel colums will be stored in cache cache_items = np.floor(cacheMB / column_size).astype(int) cache_items = min(self.N, cache_items) self.kernel_cache = pylru.lrucache(cache_items) self.compute_diag() #cuda initialization cuda.init() self.dev = cuda.Device(0) self.ctx = self.dev.make_context() #reade cuda .cu file with module code with open(self.module_file, "r") as CudaFile: module_code = CudaFile.read() #compile module self.module = SourceModule(module_code, keep=True, no_extern_c=True) (g_gamma, gsize) = self.module.get_global('GAMMA') cuda.memcpy_htod(g_gamma, np.float32(self.Gamma)) #get functions reference Dim = self.Dim vecBytes = Dim * 4 for f in range(self.max_concurrent_kernels): gfun = self.module.get_function(self.func_name) self.func.append(gfun) #init texture for vector I vecI_tex = self.module.get_texref('VecI_TexRef') self.g_vecI[f] = cuda.mem_alloc(vecBytes) vecI_tex.set_address(self.g_vecI[f], vecBytes) #init texture for vector J vecJ_tex = self.module.get_texref('VecJ_TexRef') self.g_vecJ[f] = cuda.mem_alloc(vecBytes) vecJ_tex.set_address(self.g_vecJ[f], vecBytes) self.tex_ref.append((vecI_tex, vecJ_tex)) self.main_vecI.append(np.zeros((1, Dim), dtype=np.float32)) self.main_vecJ.append(np.zeros((1, Dim), dtype=np.float32)) texReflist = list(self.tex_ref[f]) #function definition P-pointer i-int gfun.prepare("PPPPPPiiiiiiPPP", texrefs=texReflist) #transform X to particular format v, c, r = spf.csr2ellpack(self.X, align=self.prefetch) #copy format data structure to gpu memory self.g_val = cuda.to_device(v) self.g_col = cuda.to_device(c) self.g_len = cuda.to_device(r) self.g_sdot = cuda.to_device(self.Xsquare) self.g_cls_start = cuda.to_device(self.cls_start) def cls_init(self, kernel_nr, y_cls, cls1, cls2, cls1_n, cls2_n): """ Prepare cuda kernel call for kernel_nr, copy data for particular binary classifier, between class 1 vs 2. Parameters ------------ kernel_nr : int concurrent kernel number y_cls : array-like binary class labels (1,-1) cls1: int first class number cls2: int second class number cls1_n : int number of elements of class 1 cls2_n : int number of elements of class 2 kernel_out : array-like array for gpu kernel result, size=2*len(y_cls) """ warp = 32 align_cls1_n = cls1_n + (warp - cls1_n % warp) % warp align_cls2_n = cls2_n + (warp - cls2_n % warp) % warp self.cls1_N_aligned = align_cls1_n sum_cls = align_cls1_n + align_cls2_n self.sum_cls[kernel_nr] = sum_cls self.cls_count[kernel_nr] = np.array([cls1_n, cls2_n], dtype=np.int32) self.cls[kernel_nr] = np.array([cls1, cls2], dtype=np.int32) self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr]) self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr]) self.bpg[kernel_nr] = int( np.ceil((self.threadsPerRow * sum_cls + 0.0) / self.tpb)) self.g_y[kernel_nr] = cuda.to_device(y_cls) self.kernel_out[kernel_nr] = np.zeros(2 * y_cls.shape[0], dtype=np.float32) ker_out = self.kernel_out[kernel_nr] self.g_out[kernel_nr] = cuda.to_device( ker_out) # cuda.mem_alloc_like(ker_out) #add prepare for device functions def K2Col(self, i, j, i_ds, j_ds, kernel_nr): """ computes i-th and j-th kernel column Parameters --------------- i: int i-th kernel column number in subproblem j: int j-th kernel column number in subproblem i_ds: int i-th kernel column number in whole dataset j_ds: int j-th kernel column number in whole dataset kernel_nr : int number of concurrent kernel ker2ColOut: array like array for output Returns ------- ker2Col """ #make i-th and j-the main vectors vecI = self.main_vecI[kernel_nr] vecJ = self.main_vecJ[kernel_nr] # self.X[i_ds,:].todense(out=vecI) # self.X[j_ds,:].todense(out=vecJ) #vecI.fill(0) #vecJ.fill(0) #self.X[i_ds,:].toarray(out=vecI) #self.X[j_ds,:].toarray(out=vecJ) vecI = self.X.getrow(i_ds).todense() vecJ = self.X.getrow(j_ds).todense() #copy them to texture cuda.memcpy_htod(self.g_vecI[kernel_nr], vecI) cuda.memcpy_htod(self.g_vecJ[kernel_nr], vecJ) # temp = np.empty_like(vecI) # cuda.memcpy_dtoh(temp,self.g_vecI[kernel_nr]) # print 'temp',temp #lauch kernel gfunc = self.func[kernel_nr] gy = self.g_y[kernel_nr] gout = self.g_out[kernel_nr] gN = np.int32(self.N) g_i = np.int32(i) g_j = np.int32(j) g_ids = np.int32(i_ds) g_jds = np.int32(j_ds) gNalign = np.int32(self.cls1_N_aligned) gcs = self.g_cls_start gcc = self.g_cls_count[kernel_nr] gc = self.g_cls[kernel_nr] bpg = self.bpg[kernel_nr] #print 'start gpu i,j,kernel_nr ',i,j,kernel_nr #texReflist = list(self.tex_ref[kernel_nr]) #gfunc(self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc,block=(self.tpb,1,1),grid=(bpg,1),texrefs=texReflist) #print 'end gpu',i,j #copy the results #grid=(bpg,1),block=(self.tpb,1,1) gfunc.prepared_call((bpg, 1), (self.tpb, 1, 1), self.g_val, self.g_col, self.g_len, self.g_sdot, gy, gout, gN, g_i, g_j, g_ids, g_jds, gNalign, gcs, gcc, gc) cuda.memcpy_dtoh(self.kernel_out[kernel_nr], gout) return self.kernel_out[kernel_nr] def K_vec(self, vec): ''' vec - array-like, row ordered data, should be not to big ''' dot = self.X.dot(vec.T) x2 = self.Xsquare.reshape((self.Xsquare.shape[0], 1)) if (sp.issparse(vec)): v2 = vec.multiply(vec).sum(1).reshape((1, vec.shape[0])) else: v2 = np.einsum('...i,...i', vec, vec) return np.exp(-self.Gamma * (x2 + v2 - 2 * dot)) def compute_diag(self): """ Computes kernel matrix diagonal """ #for rbf diagonal consists of ones exp(0)==1 self.Diag = np.ones(self.X.shape[0], dtype=np.float32) if (sp.issparse(self.X)): # result as matrix self.Xsquare = self.X.multiply(self.X).sum(1) #result as array self.Xsquare = np.asarray(self.Xsquare).flatten() else: self.Xsquare = np.einsum('...i,...i', self.X, self.X) def clean(self, kernel_nr): """ clean the kernel cache """ #self.kernel_cache.clear() self.bpg[kernel_nr] = 0 def clean_cuda(self): ''' clean all cuda resources ''' for f in range(self.max_concurrent_kernels): #vecI_tex=?? #self.g_vecI[f].free() del self.g_vecI[f] #init texture for vector J #vecJ_tex=?? #self.g_vecJ[f].free() del self.g_vecJ[f] self.g_cls_count[f].free() self.g_cls[f].free() self.g_y[f].free() self.g_out[f].free() #test it #del self.g_out[f] ?? #copy format data structure to gpu memory self.g_val.free() self.g_col.free() self.g_len.free() self.g_sdot.free() self.g_cls_start.free() print self.ctx self.ctx.pop() print self.ctx del self.ctx def predict_init(self, SV): """ Init the classifier for prediction """ self.X = SV self.compute_diag()
class RimeSumCoherencies(Node): def __init__(self): super(RimeSumCoherencies, self).__init__() def initialise(self, solver, stream=None): slvr = solver ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan') # Get a property dictionary off the solver D = slvr.template_dict() # Include our kernel parameters D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS) D['rime_const_data_struct'] = slvr.const_data().string_def() D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \ mbu.redistribute_threads( D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'], npolchan, nbl, ntime) regs = str(FLOAT_PARAMS['maxregs'] \ if slvr.is_float() else DOUBLE_PARAMS['maxregs']) # Create the signature of the call to the function stamping macro stamp_args = ', '.join([ 'float' if slvr.is_float() else 'double', 'float2' if slvr.is_float() else 'double2', 'float3' if slvr.is_float() else 'double3', 'true' if slvr.use_weight_vector() else 'false', '1' if slvr.outputs_residuals() else '0' ]) stamp_fn = ''.join(['stamp_sum_coherencies_fn(', stamp_args, ')']) D['stamp_function'] = stamp_fn kname = 'rime_sum_coherencies' self.mod = SourceModule(KERNEL_TEMPLATE.substitute(**D), options=['-lineinfo', '-maxrregcount', regs], include_dirs=[montblanc.get_source_path()], no_extern_c=True) self.rime_const_data = self.mod.get_global('C') self.kernel = self.mod.get_function(kname) self.launch_params = self.get_launch_params(slvr, D) def shutdown(self, solver, stream=None): pass def pre_execution(self, solver, stream=None): pass def get_launch_params(self, slvr, D): polchans_per_block = D['BLOCKDIMX'] bl_per_block = D['BLOCKDIMY'] times_per_block = D['BLOCKDIMZ'] ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan') polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block) bl_blocks = mbu.blocks_required(nbl, bl_per_block) time_blocks = mbu.blocks_required(ntime, times_per_block) return { 'block': (polchans_per_block, bl_per_block, times_per_block), 'grid': (polchan_blocks, bl_blocks, time_blocks), } def execute(self, solver, stream=None): slvr = solver if stream is not None: cuda.memcpy_htod_async(self.rime_const_data[0], slvr.const_data().ndary(), stream=stream) else: cuda.memcpy_htod(self.rime_const_data[0], slvr.const_data().ndary()) # The gaussian shape array can be empty if # no gaussian sources were specified. gauss = np.intp(0) if np.product(slvr.gauss_shape.shape) == 0 \ else slvr.gauss_shape sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \ else slvr.sersic_shape self.kernel(slvr.uvw, gauss, sersic, slvr.frequency, slvr.antenna1, slvr.antenna2, slvr.jones, slvr.flag, slvr.weight_vector, slvr.observed_vis, slvr.G_term, slvr.model_vis, slvr.chi_sqrd_result, stream=stream, **self.launch_params) # Call the pycuda reduction kernel. # Divide by the single sigma squared value if a weight vector # is not required. Otherwise the kernel will incorporate the # individual sigma squared values into the sum gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get() if not slvr.use_weight_vector(): slvr.set_X2(gpu_sum / slvr.sigma_sqrd) else: slvr.set_X2(gpu_sum) def post_execution(self, solver, stream=None): pass
def func1(self, g, TILEWIDTH, TILEHEIGHT): template = """ #include <stdlib.h> #include <stdio.h> #include <math.h> #define GENUS %d #define TILEHEIGHT %d #define TILEWIDTH %d __device__ __constant__ double xd[GENUS]; __device__ __constant__ double yd[GENUS]; /*************************************************************************** normpart -------- A helper function for the finite sum functions. Computes: -pi * ||T*(n + fracshift)||^2 = -pi * ||T * (n + (shift - intshift))||^2 = -pi * ||T * (n + Yinv*y - round(Yinv*y))||^2 ***************************************************************************/ __device__ double normpart(int g, double* Yinvd_s, double* Td_s, double* Sd_s) { int tx = threadIdx.x; int ty = threadIdx.y; double norm = 0; int i,j,k; for (i = 0; i < g; i++) { double sum = 0; for (j = 0; j < g; j++) { double T_ij = Td_s[ty*g*g + i*g + j]; double n_j = Sd_s[tx*g + j]; double shift_j = 0; for (k = 0; k < g; k++) { shift_j += Yinvd_s[ty*g*g + g*j + k]*yd[k]; } sum += T_ij * (n_j + shift_j - round(shift_j)); } norm += sum*sum; } return -M_PI * norm; } /************************************************************************* exppart ------- A helper function for the finite sum functions. Computes: 2pi * <(n - intshift), (1/2)X(n - intshift) + x> =2pi * <n - round(shift), (1/2)X(n - round(shift) + x> =2pi * <n - round(Yinv*y), (1/2)X(n - round(Yinv*y) + x> ***************************************************************************/ __device__ double exppart(int g, double *Xd_s, double *Yinvd_s, double* Sd_s) { int tx = threadIdx.x; int ty = threadIdx.y; double exppart = 0; int i,j,k,h; for (i = 0; i < g; i++) { double n_i = Sd_s[tx*g + i]; double shift_i = 0; for (k = 0; k < g; k++) { shift_i += Yinvd_s[ty*g*g + i*g + k] * yd[k]; } double A = n_i - round(shift_i); double Xshift_i = 0; for (j = 0; j < g; j++) { double X_ij = Xd_s[ty*g*g + i*g + j]; double shift_j = 0; for (h = 0; h < g; h++) { shift_j += Yinvd_s[ty*g*g + j*g + h] * yd[h]; } Xshift_i += .5 * (X_ij * (Sd_s[tx*g + j] - round(shift_j))); } double B = Xshift_i + xd[i]; exppart += A*B; } return 2* M_PI * exppart; } /**************************************************************************** Derivative Product Computes: ___ | | | | 2*pi*I <d, n - intshift> d in derivs ****************************************************************************/ __device__ void deriv_prod(int g, double *Sd_s, double* Yinvd_s, double* dpr, double* dpi, double* deriv_real, double* deriv_imag, int nderivs) { int tx = threadIdx.x; int ty = threadIdx.y; double total_real = 1; double total_imag = 0; int i,j,k; for (i = 0; i < nderivs; i++) { double term_real = 0; double term_imag = 0; for (j = 0; i < g; i++) { double shift_j = 0; for (k = 0; k < g; k++) { shift_j += Yinvd_s[ty*g*g + j*g + k] * yd[k]; } double intshift = round(shift_j); double nmintshift = Sd_s[tx*g + j] - intshift; term_real += deriv_real[j + g*i] * nmintshift; term_imag += deriv_imag[j + g*i] * nmintshift; } total_real = total_real * term_real - total_imag * term_imag; total_imag = total_real * term_imag + total_imag * term_real; } //Computes: (2*pi*i)^(nderivs) * (total_real + total_imag*i) double pi_mult = pow(2*M_PI, nderivs); /*Determines what the result of i^nderivs is, and performs the correct multiplication afterwards.*/ if (nderivs %% 4 == 0) { dpr[0] = pi_mult*total_real; dpi[0] = pi_mult*total_imag; } else if (nderivs %% 4 == 1) { dpr[0] = -pi_mult * total_imag; dpi[0] = pi_mult * total_real; } else if (nderivs %% 4 == 2) { dpr[0] = -pi_mult * total_real; dpi[0] = -pi_mult * total_imag; } else if (nderivs %% 4 == 3) { dpr[0] = pi_mult * total_imag; dpi[0] = -pi_mult * total_real; } } /************************************************************************** Finite Sum Without Derivatives Kernel Function **************************************************************************/ __global__ void riemann_theta(double *fsum_reald, double *fsum_imagd, double *Xd, double *Yinvd, double* Td, double *Sd, int g, int N, int K) { /*Built in variables to be used, x variable denotes the summation index while the y variable denotes the Omega index*/ int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double Sd_s[TILEWIDTH * GENUS]; __shared__ double Xd_s[TILEHEIGHT * GENUS * GENUS]; __shared__ double Yinvd_s[TILEHEIGHT * GENUS * GENUS]; __shared__ double Td_s[TILEHEIGHT * GENUS * GENUS]; /*Determine n_0, the start of the summation vector, the full vector is of the form, n_0, n_1, n_2, n_g*/ int n_start = (bx * TILEWIDTH + tx) * g; /* Now n = S[n_start], S[n_start + 1], S[n_start + 2]...S[n_start + (g-1)] */ /*Determine the Omega to evaluate on*/ int omega_start = (by*TILEHEIGHT + ty)*g*g; /* Now omega is Omega[omega_start], ... Omega[omega_start + g*g-1] where Omega is Xd, Yinvd, and Td */ /*Load data into shared arrays */ int i; for (i = 0; i < g; i++){ Sd_s[tx*g + i] = Sd[n_start + i]; } for (i = 0; i < g*g; i++) { Xd_s[ty*g*g + i] = Xd[omega_start + i]; Yinvd_s[ty*g*g + i] = Yinvd[omega_start + i]; Td_s[ty*g*g + i] = Td[omega_start + i]; } __syncthreads(); if (n_start < N*g && omega_start < K*g*g) { /*Compute the 'cosine' and 'sine' parts of the summand*/ double ept, npt, cpt, spt; ept = exppart(g, Xd_s, Yinvd_s, Sd_s); npt = exp(normpart(g, Yinvd_s, Td_s, Sd_s)); cpt = npt*cos(ept); spt = npt*sin(ept); fsum_reald[n_start/g + omega_start/g/g * N] = cpt; fsum_imagd[n_start/g + omega_start/g/g * N] = spt; } } /*********************************************************************** Finite Sum with Derivatives Kernel Function ************************************************************************/ __global__ void riemann_theta_derivatives(double* fsum_reald, double* fsum_imagd, double* Xd, double *Yinvd, double *Td, double *Sd, double *deriv_reald, double *deriv_imagd, int nderivs, int g, int N, int K) { /*Built in variables to be used, x variable denotes the summation index while the y variable denotes the Omega index*/ int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double Sd_s[TILEWIDTH * GENUS]; __shared__ double Xd_s[TILEHEIGHT * GENUS * GENUS]; __shared__ double Yinvd_s[TILEHEIGHT * GENUS * GENUS]; __shared__ double Td_s[TILEHEIGHT * GENUS * GENUS]; /*Determine n_0, the start of the summation vector, the full vector is of the form, n_0, n_1, n_2, n_g*/ int n_start = (bx * TILEWIDTH + tx) * g; /* Now n = S[n_start], S[n_start + 1], S[n_start + 2]...S[n_start + (g-1)] */ /*Determine the Omega to evaluate on*/ int omega_start = (by*TILEHEIGHT + ty)*g*g; /* Now omega is Omega[omega_start], ... Omega[omega_start + g*g-1] where Omega is Xd, Yinvd, and Td */ /*Load data into shared arrays */ int i; for (i = 0; i < g; i++){ Sd_s[tx*g + i] = Sd[n_start + i]; } for (i = 0; i < g*g; i++) { Xd_s[ty*g*g + i] = Xd[omega_start + i]; Yinvd_s[ty*g*g + i] = Yinvd[omega_start + i]; Td_s[ty*g*g + i] = Td[omega_start + i]; } __syncthreads(); if (n_start < N*g && omega_start < K*g*g) { /*Compute the 'cosine' and 'sine' parts of the summand */ double dpr[1]; double dpi[1]; dpr[0] = 0; dpi[0] = 0; double ept, npt, cpt, spt; ept = exppart(g, Xd_s, Yinvd_s, Sd_s); npt = exp(normpart(g, Yinvd_s, Td_s, Sd_s)); cpt = npt*cos(ept); spt = npt*sin(ept); deriv_prod(g,Sd_s,Yinvd_s,dpr,dpi, deriv_reald,deriv_imagd, nderivs); fsum_reald[n_start/g + omega_start/g/g * N] = dpr[0] * cpt - dpi[0] * spt; fsum_imagd[n_start/g + omega_start/g/g * N] = dpi[0] * cpt + dpr[0] * spt; } } """ %(g, TILEHEIGHT, TILEWIDTH) mod = SourceModule(template) func = mod.get_function("riemann_theta") deriv_func = mod.get_function("riemann_theta_derivatives") xd = mod.get_global("xd")[0] yd = mod.get_global("yd")[0] return func, deriv_func, xd, yd
import os import math import numpy import pycuda.gpuarray as gpuarray import pycuda.autoinit import pycuda.driver as cuda from pycuda.compiler import SourceModule from .utils import gpu_func from .enums import MAX_BLOCK_SIZE, CUR_DIR, CACHE_DIR mod = SourceModule(open(os.path.join(CUR_DIR, 'kernel/maxpool.cu')).read(), cache_dir=CACHE_DIR) maxpool_kernel = mod.get_function('maxpool_kernel') maxpool_back_kernel = mod.get_function('maxpool_back_kernel') d_a_size = mod.get_global('d_a_size')[0] d_out_size = mod.get_global('d_out_size')[0] @gpu_func def maxpool(d_a, window_shape): h, w = window_shape in_z, in_y, in_x = d_a.shape out_z = in_z out_y = in_y / h out_x = in_x / w assert h * w < MAX_BLOCK_SIZE cuda.memcpy_htod(d_a_size, numpy.array(d_a.shape, dtype=numpy.int32)) cuda.memcpy_htod(d_out_size,
gmemPos += gmemStride; } } ''' template = string.Template(template) code = template.substitute(KERNEL_RADIUS = KERNEL_RADIUS, KERNEL_W = KERNEL_W, COLUMN_TILE_H=COLUMN_TILE_H, COLUMN_TILE_W=COLUMN_TILE_W, ROW_TILE_W=ROW_TILE_W, KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED) module = SourceModule(code) convolutionRowGPU = module.get_function('convolutionRowGPU') convolutionColumnGPU = module.get_function('convolutionColumnGPU') d_Kernel_rows = module.get_global('d_Kernel_rows')[0] d_Kernel_columns = module.get_global('d_Kernel_columns')[0] # Helper functions for computing alignment... def iDivUp(a, b): # Round a / b to nearest higher integer value a = numpy.int32(a) b = numpy.int32(b) return (a / b + 1) if (a % b != 0) else (a / b) def iDivDown(a, b): # Round a / b to nearest lower integer value a = numpy.int32(a) b = numpy.int32(b) return a / b;
def main(): #Initialise InteractionMatrix def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Initialise GPU (equivalent of autoinit) drv.init() assert drv.Device.count() >= 1 dev = drv.Device(0) ctx = dev.make_context(0) #Convert GlobalParams to List GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32) count = 0 for x in GlobalParamsDict.keys(): GlobalParams[count] = GlobalParamsDict[x] count += 1 #Convert FitnessParams to List FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32) count = 0 for x in FitnessParamsDict.keys(): FitnessParams[count] = FitnessParamsDict[x] count += 1 #Convert GAParams to List GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32) count = 0 for x in GAParamsDict.keys(): GAParams[count] = GAParamsDict[x] count += 1 # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main_discoverytime', './templates')) # Load source code from file Source = env.get_template('./kernel.cu') #Template( file(KernelFile).read() ) #Create dictionary argument for rendering RenderArgs= {"params_size":GlobalParams.nbytes,\ "fitnessparams_size":FitnessParams.nbytes,\ "gaparams_size":GAParams.nbytes,\ "genome_bytelength":int(ByteLengthGenome),\ "genome_bitlength":int(BitLengthGenome),\ "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\ "textures":range( 0, NrFitnessFunctionGrids ),\ "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\ "with_mixed_crossover":WithMixedCrossover, "with_bank_conflict":WithBankConflict, "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection, "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues, "with_uniform_crossover":WithUniformCrossover, "with_single_point_crossover":WithSinglePointCrossover, "with_surefire_mutation":WithSurefireMutation, "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory, "ga_threaddimx":int(GA_ThreadDim), "glob_nr_tiletypes":int(NrTileTypes), "glob_nr_edgetypes":int(NrEdgeTypes), "glob_nr_tileorientations":int(NrTileOrientations), "fit_dimgridx":int(DimGridX), "fit_dimgridy":int(DimGridY), "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids), "fit_nr_fourpermutations":int(NrFourPermutations), "fit_assembly_redundancy":int(NrAssemblyRedundancy), "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock), "sort_threaddimx":int(Sort_ThreadDimX), "glob_nr_genomes":int(NrGenomes), "fit_dimthreadx":int(ThreadDimX), "fit_dimthready":int(ThreadDimY), "fit_dimsubgridx":int(SubgridDimX), "fit_dimsubgridy":int(SubgridDimY), "fit_nr_subgridsperbank":int(NrSubgridsPerBank), "glob_bitlength_edgetype":int(EdgeTypeBitLength), "fitness_break_value":int(BitLengthGenome), # ADAPTED FOR DISCOVERY KERNEL "fitness_flag_index":int(NrGenomes) } # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_20", code="sm_20", cache_dir=None) #Allocate values on GPU Genomes_h = drv.mem_alloc(Genomes.nbytes) FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes) FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes) AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes) Mutexe_h = drv.mem_alloc(Mutexe.nbytes) #ReductionList_h = drv.mem_alloc(ReductionList.nbytes) #Copy values to global memory drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #Copy values to constant / texture memory for id in range(0, NrFitnessFunctionGrids): FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) ) drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C") InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address drv.memcpy_htod(GlobalParams_h[0], GlobalParams) FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address drv.memcpy_htod(FitnessParams_h[0], FitnessParams) GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address drv.memcpy_htod(GAParams_h[0], GAParams) FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst") FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst") #Set up curandStates curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper) CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes) #Compile kernels curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel") #fitness_fnc = KernelSourceModule.get_function("FitnessKernel") sorting_fnc = KernelSourceModule.get_function("SortingKernel") ga_fnc = KernelSourceModule.get_function("GAKernel") #Initialise Curand curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1)) #Build parameter lists for FitnessKernel and GAKernel FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h); SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h) GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h); #TEST ONLY #return #ADAPTED #TEST ONLY #START ADAPTED print "GENOMES NOW:\n" print Genomes print ":::STARTING KERNEL EXECUTION:::" #STOP ADAPTED #Discovery time parameters min_fitness_value = BitLengthGenome # Want all bits set mutation_rate = -2.0 #normally: -2 #Define Numpy construct to sideways join arrays (glue columns together) #Taken from: http://stackoverflow.com/questions/5355744/numpy-joining-structured-arrays #def join_struct_arrays(arrays): # sizes = np.array([a.itemsize for a in arrays]) # offsets = np.r_[0, sizes.cumsum()] # n = len(arrays[0]) # joint = np.empty((n, offsets[-1]), dtype=np.int32) # for a, size, offset in zip(arrays, sizes, offsets): # joint[:,offset:offset+size] = a.view(np.int32).reshape(n,size) # dtype = sum((a.dtype.descr for a in arrays), []) # return joint.ravel().view(dtype) #Test join_struct_arrays: #a = np.array([[1, 2], [11, 22], [111, 222]]).astype(np.int32); #b = np.array([[3, 4], [33, 44], [333, 444]]).astype(np.int32); #c = np.array([[5, 6], [55, 66], [555, 666]]).astype(np.int32); #print "Test join_struct_arrays:" #print join_struct_arrays([a, b, c]) #FAILED #Set up PYTABLES #class GAGenome(IsDescription): #gen_id = Int32Col() #fitness_val = Float32Col() #genome = StringCol(mByteLengthGenome) #last_nr_mutations = Int32Col() # Contains the Nr of mutations genome underwent during this generation #mother_id = Int32Col() # Contains the crossover "mother" #father_id = Int32Col() # Contains the crossover "father" (empty if no crossing over) #assembledgrid = StringCol(DimGridX*DimGridY) # 16-character String #class GAGenerations(IsDescription): # nr_generations = Int32Col() # nr_genomes = Int32Col() # mutation_rate = Float32Col() # Contains the Nr of mutations genome underwent during this generation #from datetime import datetime #filename = "fujiama_"+str(NrGenomes)+"_"+str(RateMutation)+"_"+".h5" #print filename #h5file = openFile(filename, mode = "w", title = "GA FILE") #group = h5file.createGroup("/", 'fujiama_ga', 'Fujiama Genetic Algorithm output') #table = h5file.createTable(group, 'GaGenerations', GAGenerations, "Raw data") #atom = Atom.from_dtype(np.float32) #Initialise File I/O FILE = open("fujiamakernel_nrgen-" + str(NrGenomes) + "_adaptation.plot", "w") #ADAPTED FOR TESTING HISTOGRAM #TestValues = [13,24,26,31,32,14] #print np.histogram(TestValues, bins=[0, 24, 32])[0][1] #quit() #Initialise CUDA timers start = drv.Event() stop = drv.Event() while mutation_rate < 1: # normally: 1 #ds = h5file.createArray(f.root, 'ga_raw_'+str(mutation_rate), atom, x.shape) mutation_rate += 0.1 GAParams[0] = 10.0 ** mutation_rate drv.memcpy_htod(GAParams_h[0], GAParams) print "Mutation rate: ", GAParams[0] #ADAPTED: Initialise global memory (absolutely necessary!!) drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #execute kernels for specified number of generations start.record() biggest_fit = 0 reprange = 100 average_breakup = np.zeros((reprange)).astype(np.float32) for rep in range(0, reprange): breakup_generation = GlobalParamsDict["NrGenerations"] dontcount = 0 #ADAPTED: Initialise global memory (absolutely necessary!!) drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #execute kernels for specified number of generations start.record() for gen in range(0, GlobalParamsDict["NrGenerations"]): #print "Processing Generation: %d"%(gen) #Launch CPU processing (should be asynchroneous calls) sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel drv.memcpy_dtoh(FitnessPartialSums, FitnessPartialSums_h) #Copy from Device to Host and finish sorting FitnessSumConst = FitnessPartialSums.sum() drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory #drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitnessValues from Device to Device Const #TEST ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids) #TEST #Note: Fitness Function is here integrated into GA kernel! drv.memcpy_dtoh(Genomes_res, Genomes_h) #Copy data from GPU drv.memcpy_dtoh(FitnessValues_res, FitnessValues_h) #drv.memcpy_dtoh(AssembledGrids_res, AssembledGrids_h) #Takes about as much time as the whole simulation! #print FitnessValues_res #maxxie = FitnessValues_res.max() #if maxxie > biggest_fit: # biggest_fit = maxxie #print "max fitness:", maxxie #if maxxie >= 25.0 and breakup_generation == -1: if np.histogram(FitnessValues_res, (0, 24, 32))[0][1] >= NrGenomes/2: breakup_generation = gen break # else: # breakup_generation = -1 #if FitnessValues[NrGenomes] == float(1): # breakup_generation = i # break #else: # breakup_generation = -1 #maxxie = FitnessValues_res.max() #if maxxie >= 30: # print "Max fitness value: ", FitnessValues_res.max() #ds[:] = FitnessValues #join_struct_arrays(Genomes, FitnessValues, AssembledGrids); #trow = table.row #trow['nr_generations'] = NrGenerations #trow['nr_genomes'] = NrGenomes #trow['mutation_rate'] = mutation_rate #trow.append() #trow.flush() stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / breakup_generation) print "Discovery time (generations) for mutation rate %f: %d"%(GAParams[0], breakup_generation) #print "Max:", biggest_fit #TEST MODE #print "Printing all FitnessValues now:" #print FitnessValues_res #raw_input("Next redundancy with keystroke!..."); #if breakup_generation==0: # print FitnessValues_res # print "Genomes: " # print Genomes_res average_breakup[rep] = breakup_generation / reprange #if breakup_generation == -1: # dontcount = 1 # break #if dontcount == 1: # average_breakup.fill(20000) FILE.write( str(GAParams[0]) + " " + str(np.median(average_breakup)) + " " + str(np.std(average_breakup)) + "\n"); FILE.flush() #Clean-up pytables #h5file.close() #Clean up File I/O FILE.close()
def bloom_gpu(img, threshold, sigma, sigma_n): kernel_radius = np.int32(sigma_n * sigma + 0.5) poly = np.polynomial.Polynomial([0, 0, -0.5 / (sigma * sigma)]) x = np.arange(-kernel_radius, kernel_radius + 1) kernel = np.exp(poly(x), dtype=np.float32) kernel /= kernel.sum() bloom_cuda_source_template = """ /* * Function: luminance * ---------------------- * Performs perceptual luminance-preserving conversion of sRGB image to grayscale image. * @param lum: resulting grayscale (1-channel) image as 1D float array; * 1D-mapping: from left to right, from top to bottom. * @param img: sRGB (3-channel) image as 1D float array; * 1D-mapping: from R to B, from left to right, from top to bottom. * @param n: total number of pixels in image (length of lum array) as single-element int array. */ __global__ void luminance(float *lum, float *img, int *n) { // Luminance perception coefficients for sRGB. const float c_r = 0.2126; const float c_g = 0.7152; const float c_b = 0.0722; // Get 1D-mapped index of pixel. int g_idx = blockIdx.x * blockDim.x + threadIdx.x; // Perform perceptual luminance-preserving conversion of sRGB image to grayscale image // (one grayscale pixel per thread) // after checking that 1D-mapped index of pixel is less than total number of pixels. if (g_idx < n[0]) lum[g_idx] = c_r * img[g_idx * 3 + 0] + c_g * img[g_idx * 3 + 1] + c_b * img[g_idx * 3 + 2]; } /* * Function: array_max * ---------------------- * Finds the maximum value in 1D array using reduction technique. * @param max_val: resulting maximum value in array as single-element float array. * @param array: data as 1D float array. * @param n: number of elements in array as single-element int array. * @param mutex: mutex value (need to be zero) as single-element int array. */ __global__ void array_max(float *max_val, float *array, int *n, int *mutex) { extern __shared__ float sdata[]; // Load array into shared memory. // The number of blocks is halved, so it is possible to load the maximum of two values: // the element within current block and the element with index shifted by size of the block. const int g_idx = __mul24(blockIdx.x, blockDim.x << 1) + threadIdx.x; float x = -1.0; if(g_idx + blockDim.x < n[0]) x = fmaxf(array[g_idx], array[g_idx + blockDim.x]); sdata[threadIdx.x] = x; __syncthreads(); // Perform reduction within one block. if(threadIdx.x < 512 && sdata[threadIdx.x + 512] > sdata[threadIdx.x]) { sdata[threadIdx.x] = sdata[threadIdx.x + 512]; } __syncthreads(); if(threadIdx.x < 256 && sdata[threadIdx.x + 256] > sdata[threadIdx.x]) { sdata[threadIdx.x] = sdata[threadIdx.x + 256]; } __syncthreads(); if(threadIdx.x < 128 && sdata[threadIdx.x + 128] > sdata[threadIdx.x]) { sdata[threadIdx.x] = sdata[threadIdx.x + 128]; } __syncthreads(); if(threadIdx.x < 64 && sdata[threadIdx.x + 64] > sdata[threadIdx.x]) { sdata[threadIdx.x] = sdata[threadIdx.x + 64]; } __syncthreads(); // Single-warp threads in use now, no thread synchronisation needed. if (threadIdx.x < 32) { sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 32]); sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 16]); sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 8]); sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 4]); sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 2]); sdata[threadIdx.x] = fmaxf(sdata[threadIdx.x], sdata[threadIdx.x + 1]); } // The first thread updates maximum value using mutex to ensure update is correct. if(threadIdx.x == 0) { // Lock mutex while(atomicCAS(mutex,0,1) != 0); // Write result max_val[0] = fmaxf(max_val[0], sdata[0]); // Unlock mutex atomicExch(mutex, 0); } } /* * Function: array_highpass * --------------------------- * Zeros elements of array which are less or equal to threshold. * @param array: data as 1D float array. * @param n: number of elements in array as single-element int array. * @param threshold: the maxmum value to zero as single-element float array. */ __global__ void array_highpass(float *array, int *n, float *threshold) { // Get 1D-mapped index of pixel. int g_idx = blockIdx.x * blockDim.x + threadIdx.x; // Perform threshold-based zeroing // after checking that 1D-mapped index of pixel is less than total number of pixels. if (g_idx < n[0]) if (array[g_idx] <= threshold[0]) array[g_idx] = 0.0; } #define kernel_radius $kernel_radius #define kernel_radius_aligned $kernel_radius_aligned #define kernel_w $kernel_w #define row_tile_w $row_tile_w #define col_tile_w $col_tile_w #define col_tile_h $col_tile_h __device__ __constant__ float kernel[kernel_w]; /* * Function: convolution_row * --------------------------- * Performs 1D convolution for image in horizontal direction using predefined above kernel. * @param input: data as 1D float array. * @param dataW: width of image in pixels as pointer to int. * @param dataH: height of image in pixels as pointer to int. */ __global__ void convolution_row(float *input, int *dataW, int *dataH) { extern __shared__ float data[]; // Define working area const int tile_start = __mul24(blockIdx.x, row_tile_w); const int tile_end = tile_start + row_tile_w; const int apron_start = tile_start - kernel_radius; const int apron_end = tile_end + kernel_radius; // tile_start is clamped by definition! const int tile_end_clamped = min(tile_end, dataW[0] - 1); const int apron_start_clamped = max(apron_start, 0); const int apron_end_clamped = min(apron_end, dataW[0] - 1); const int row_start = __mul24(blockIdx.y, dataW[0]); const int apron_start_aligned = tile_start - kernel_radius_aligned; const int load_pos = apron_start_aligned + threadIdx.x; // Transfer data to shared memory if (load_pos >= apron_start) { data[load_pos - apron_start] = (apron_start_clamped <= load_pos && load_pos <= apron_end_clamped) ? input[row_start + load_pos] : 0; } __syncthreads(); // Perform convolution and write result to global memory const int write_pos = tile_start + threadIdx.x; if (write_pos <= tile_end_clamped) { const int smem_pos = write_pos - apron_start; float sum = 0; """ for k in range(-kernel_radius, kernel_radius + 1): bloom_cuda_source_template += string.Template( 'sum += data[smem_pos + $k] * kernel[kernel_radius - $k];\n').substitute(k=k) bloom_cuda_source_template += """ input[row_start + write_pos] = sum; } } /* * Function: convolution_column * --------------------------- * Performs 1D convolution for image in vertical direction using predefined above kernel. * @param input: data as 1D float array. * @param dataW: width of image in pixels as pointer to int. * @param dataH: height of image in pixels as pointer to int. * @param smem_stride: stride in shared memory array as pointer to int. * @param gmem_stride: stride in global memory array as pointer to int. */ __global__ void convolution_column(float *input, int *dataW, int *dataH, int *smem_stride, int *gmem_stride) { extern __shared__ float data[]; // Define working area const int tile_start = __mul24(blockIdx.y, col_tile_h); const int tile_end = tile_start + col_tile_h - 1; const int apron_start = tile_start - kernel_radius; const int apron_end = tile_end + kernel_radius; const int tile_end_clamped = min(tile_end, dataH[0] - 1); const int apron_start_clamped = max(apron_start, 0); const int apron_end_clamped = min(apron_end, dataH[0] - 1); const int column_start = __mul24(blockIdx.x, col_tile_w) + threadIdx.x; int smem_pos = __mul24(threadIdx.y, col_tile_w) + threadIdx.x; int gmem_pos = __mul24(apron_start + threadIdx.y, dataW[0]) + column_start; // Transfer data from global to shared memory. for (int y = apron_start + threadIdx.y; y < apron_end; y += blockDim.y, smem_pos += smem_stride[0], gmem_pos += gmem_stride[0]) { data[smem_pos] = (apron_start_clamped <= y && y <= apron_end_clamped) ? input[gmem_pos] : 0; } __syncthreads(); // Perform convolution (each thread performs convolution several times to different parts of image) smem_pos = __mul24(threadIdx.y + kernel_radius, col_tile_w) + threadIdx.x; gmem_pos = __mul24(tile_start + threadIdx.y, dataW[0]) + column_start; for (int y = tile_start + threadIdx.y; y <= tile_end_clamped; y += blockDim.y, smem_pos += smem_stride[0], gmem_pos += gmem_stride[0]) { float sum = 0; """ for k in range(-kernel_radius, kernel_radius + 1): bloom_cuda_source_template += string.Template( 'sum += data[smem_pos + __mul24($k, col_tile_w)] * kernel[kernel_radius - $k];\n').substitute(k=k) bloom_cuda_source_template += """ input[gmem_pos] = sum; } } /* * Function: array_add_sat255 * ----------------------------- * Performs addition with saturation to 255.0 of sRGB (3-channel) image with grayscale (1-channel) image. * @param array3: initial and resulting sRGB (3-channel) image as 1D float array; * 1D-mapping: from R to B, from left to right, from top to bottom. * @param array1: grayscale (1-channel) image as 1D float array; * 1D-mapping: from left to right, from top to bottom. * @param w: width of image in pixels as single-element int array. * @param n: total number of pixels in image (length of array1) as single-element int array. */ __global__ void arrays_add_sat255(float *array3, float *array1, int *w, int *h) { int px_x = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; int px_y = __mul24(blockIdx.y, blockDim.y) + threadIdx.y; int px = px_y * w[0] + px_x; array3[px * 3 + 0] = array3[px * 3 + 0] + array1[px]; array3[px * 3 + 1] = array3[px * 3 + 1] + array1[px]; array3[px * 3 + 2] = array3[px * 3 + 2] + array1[px]; if (array3[px * 3 + 0] > 255.0) array3[px * 3 + 0] = 255.0; if (array3[px * 3 + 1] > 255.0) array3[px * 3 + 1] = 255.0; if (array3[px * 3 + 2] > 255.0) array3[px * 3 + 2] = 255.0; } """ kernel_radius_aligned = int(16) row_tile_w = int(128) col_tile_w = int(16) col_tile_h = int(48) bloom_cuda_source = string.Template(bloom_cuda_source_template). \ substitute(kernel_radius=kernel_radius, kernel_radius_aligned=kernel_radius_aligned, kernel_w=kernel.size, row_tile_w=row_tile_w, col_tile_w=col_tile_w, col_tile_h=col_tile_h) bloom_cuda_module = SourceModule(bloom_cuda_source) img = img.astype(np.float32) sat = np.empty_like(img) w = np.int32(img.shape[1]) h = np.int32(img.shape[0]) n = np.int32(w * h); w_al = np.int32(int_align_up(w, 16)) smem_stride = np.int32(col_tile_w * 8) gmem_stride = np.int32(w_al * 8) lum = np.zeros((h, w), np.float32) img_g = drv.mem_alloc(img.nbytes) lum_g = drv.mem_alloc(lum.nbytes) kernel_g = bloom_cuda_module.get_global('kernel')[0] w_g = drv.mem_alloc(4) h_g = drv.mem_alloc(4) n_g = drv.mem_alloc(4) w_al_g = drv.mem_alloc(4) smem_stride_g = drv.mem_alloc(4) gmem_stride_g = drv.mem_alloc(4) drv.memcpy_htod(img_g, img) drv.memcpy_htod(lum_g, lum) drv.memcpy_htod(kernel_g, kernel) drv.memcpy_htod(w_g, w) drv.memcpy_htod(h_g, h) drv.memcpy_htod(n_g, n) drv.memcpy_htod(w_al_g, w_al) drv.memcpy_htod(smem_stride_g, smem_stride) drv.memcpy_htod(gmem_stride_g, gmem_stride) block_size = 1024 grid_size = int_div_up(n, block_size) bloom_cuda_module.get_function("luminance")( lum_g, img_g, n_g, block=(block_size, 1, 1), grid=(grid_size, 1, 1)) lum_max = np.zeros((1, 1), np.float32) mutex = np.zeros((1, 1), np.int32) bloom_cuda_module.get_function("array_max")( drv.Out(lum_max), lum_g, n_g, drv.In(mutex), block=(block_size, 1, 1), grid=(grid_size >> 1, 1), shared=block_size * 4) lum_threshold = np.float32(lum_max * threshold) bloom_cuda_module.get_function("array_highpass")( lum_g, n_g, drv.In(lum_threshold), block=(block_size, 1, 1), grid=(grid_size, 1)) block_size_x = int(kernel_radius_aligned + row_tile_w + kernel_radius) grid_size_x = int(int_div_up(w_al, row_tile_w)) grid_size_y = int(h) shared_size = int((kernel_radius + row_tile_w + kernel_radius) * 4) bloom_cuda_module.get_function("convolution_row")( lum_g, w_al_g, h_g, block=(block_size_x, 1, 1), grid=(grid_size_x, grid_size_y), shared=shared_size) grid_size_x = int(int_div_up(w_al, col_tile_w)) grid_size_y = int(int_div_up(h, col_tile_h)) shared_size = int(col_tile_w * (kernel_radius + row_tile_w + kernel_radius) * 4) bloom_cuda_module.get_function("convolution_column")( lum_g, w_al_g, h_g, smem_stride_g, gmem_stride_g, block=(col_tile_w, 8, 1), grid=(grid_size_x, grid_size_y), shared=shared_size) bloom_cuda_module.get_function("arrays_add_sat255")( img_g, lum_g, w_g, h_g, block=(block_size, 1, 1), grid=(grid_size, 1) ) drv.memcpy_dtoh(sat, img_g) sat = sat.astype(np.uint8) return sat
*pivot = pivot_local; atomicExch(&lock, 0); needlock = 0; } } } } """ % (size) mod = SourceModule(kernel, options=["--ptxas-options=-v"]) start = timer() phase_adj = mod.get_function("phase_adj") entropia_gpu = mod.get_global('entropia')[0] device_real = mod.get_global('device_real')[0] device_imag = mod.get_global('device_imag')[0] cuda.memcpy_htod(device_real, FFT.real.astype(np.float32)) cuda.memcpy_htod(device_imag, FFT.imag.astype(np.float32)) entropia_cpu = array([2147483647]) cuda.memcpy_htod(entropia_gpu, entropia_cpu.astype(np.float32)) phc0_cpu = array([0]) phc0_gpu = cuda.to_device(phc0_cpu.astype(np.int32)) phc1_cpu = array([0]) phc1_gpu = cuda.to_device(phc1_cpu.astype(np.int32)) pivot_cpu = array([0]) pivot_gpu = cuda.to_device(pivot_cpu.astype(np.int32)) phase_adj(
def main(): #Initialise InteractionMatrix def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Initialise GPU (equivalent of autoinit) drv.init() assert drv.Device.count() >= 1 dev = drv.Device(0) ctx = dev.make_context(0) #Convert GlobalParams to List GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32) count = 0 for x in GlobalParamsDict.keys(): GlobalParams[count] = GlobalParamsDict[x] count += 1 #Convert FitnessParams to List FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32) count = 0 for x in FitnessParamsDict.keys(): FitnessParams[count] = FitnessParamsDict[x] count += 1 #Convert GAParams to List GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32) count = 0 for x in GAParamsDict.keys(): GAParams[count] = GAParamsDict[x] count += 1 # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', 'cuda')) # Load source code from file Source = env.get_template('kernel.cu') #Template( file(KernelFile).read() ) #Create dictionary argument for rendering RenderArgs= {"params_size":GlobalParams.nbytes,\ "fitnessparams_size":FitnessParams.nbytes,\ "gaparams_size":GAParams.nbytes,\ "genome_bytelength":int(ByteLengthGenome),\ "genome_bitlength":int(BitLengthGenome),\ "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\ "textures":range( 0, NrFitnessFunctionGrids ),\ "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\ "with_mixed_crossover":WithMixedCrossover, "with_bank_conflict":WithBankConflict, "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection, "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues, "with_uniform_crossover":WithUniformCrossover, "with_single_point_crossover":WithSinglePointCrossover, "with_surefire_mutation":WithSurefireMutation, "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory, "ga_threaddimx":int(ThreadDim), "glob_nr_tiletypes":int(NrTileTypes), "glob_nr_edgetypes":int(NrEdgeTypes), "glob_nr_tileorientations":int(NrTileOrientations), "fit_dimgridx":int(DimGridX), "fit_dimgridy":int(DimGridY), "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids), "fit_nr_fourpermutations":int(NrFourPermutations), "fit_assembly_redundancy":int(NrAssemblyRedundancy), "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock), "sort_threaddimx":int(Sort_ThreadDimX), "glob_nr_genomes":int(NrGenomes), "fit_dimthreadx":int(ThreadDimX), "fit_dimthready":int(ThreadDimY), "fit_dimsubgridx":int(SubgridDimX), "fit_dimsubgridy":int(SubgridDimY), "fit_nr_subgridsperbank":int(NrSubgridsPerBank), "glob_bitlength_edgetype":int(EdgeTypeBitLength) } # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_11", code="sm_11", cache_dir=None) #Allocate values on GPU Genomes_h = drv.mem_alloc(Genomes.nbytes) FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes) FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes) AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes) Mutexe_h = drv.mem_alloc(Mutexe.nbytes) ReductionList_h = drv.mem_alloc(ReductionList.nbytes) #Copy values to global memory drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #Copy values to constant / texture memory for id in range(0, NrFitnessFunctionGrids): FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) ) drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C") InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address drv.memcpy_htod(GlobalParams_h[0], GlobalParams) FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address drv.memcpy_htod(FitnessParams_h[0], FitnessParams) GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address drv.memcpy_htod(GAParams_h[0], GAParams) FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst") FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst") #Set up curandStates curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper) CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes) #Compile kernels curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel") fitness_fnc = KernelSourceModule.get_function("FitnessKernel") sorting_fnc = KernelSourceModule.get_function("SortingKernel") ga_fnc = KernelSourceModule.get_function("GAKernel") #Initialise Curand curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1)) #Build parameter lists for FitnessKernel and GAKernel FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h); SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h) GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h); #TEST ONLY return #TEST ONLY #Initialise CUDA timers start = drv.Event() stop = drv.Event() #execute kernels for specified number of generations start.record() for gen in range(0, GlobalParamsDict["NrGenerations"]): #print "Processing Generation: %d"%(gen) #fitness_fnc(*(FitnessKernelParams), block=fit_blocks, grid=fit_grid) #Launch CPU processing (should be asynchroneous calls) sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel drv.memcpy_dtoh(ReductionList, ReductionList_h) #Copy from Device to Host and finish sorting FitnessSumConst = ReductionList.sum() drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitneValues from Device to Device Const ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids) drv.memcpy_dtoh(Genomes, Genomes_h) #Copy data from GPU drv.memcpy_dtoh(FitnessValues, FitnessValues_h) drv.memcpy_dtoh(AssembledGrids, AssembledGrids_h) stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations) pass
class GPURBFEll(object): """RBF Kernel with ellpack format""" cache_size =100 Gamma=1.0 #template func_name='rbfEllpackILPcol2multi' #template module_file = os.path.dirname(__file__)+'/cu/KernelsEllpackCol2.cu' #template texref_nameI='VecI_TexRef' texref_nameJ='VecJ_TexRef' max_concurrent_kernels=1 def __init__(self,gamma=1.0,cache_size=100): """ Initialize object Parameters ------------- max_kernel_nr: int determines maximal concurrent kernel column gpu computation """ self.cache_size=cache_size self.threadsPerRow=1 self.prefetch=2 self.tpb=128 self.Gamma = gamma def init_cuda(self,X,Y, cls_start, max_kernels=1 ): #assert X.shape[0]==Y.shape[0] self.max_concurrent_kernels = max_kernels self.X =X self.Y = Y self.cls_start=cls_start.astype(np.int32) #handle to gpu memory for y for each concurrent classifier self.g_y=[] #handle to gpu memory for results for each concurrent classifier self.g_out=[] #gpu kernel out self.kernel_out=[] #cpu kernel out #blocks per grid for each concurrent classifier self.bpg=[] #function reference self.func=[] #texture references for each concurrent kernel self.tex_ref=[] #main vectors #gpu self.g_vecI=[] self.g_vecJ=[] #cpu self.main_vecI=[] self.main_vecJ=[] #cpu class self.cls_count=[] self.cls=[] #gpu class self.g_cls_count=[] self.g_cls=[] self.sum_cls=[] for i in range(max_kernels): self.bpg.append(0) self.g_y.append(0) self.g_out.append(0) self.kernel_out.append(0) self.cls_count.append(0) self.cls.append(0) self.g_cls_count.append(0) self.g_cls.append(0) # self.func.append(0) # self.tex_ref.append(0) self.g_vecI.append(0) self.g_vecJ.append(0) # self.main_vecI.append(0) # self.main_vecJ.append(0) self.sum_cls.append(0) self.N,self.Dim = X.shape column_size = self.N*4 cacheMB = self.cache_size*1024*1024 #100MB for cache size #how many kernel colums will be stored in cache cache_items = np.floor(cacheMB/column_size).astype(int) cache_items = min(self.N,cache_items) self.kernel_cache = pylru.lrucache(cache_items) self.compute_diag() #cuda initialization cuda.init() self.dev = cuda.Device(0) self.ctx = self.dev.make_context() #reade cuda .cu file with module code with open (self.module_file,"r") as CudaFile: module_code = CudaFile.read(); #compile module self.module = SourceModule(module_code,keep=True,no_extern_c=True) (g_gamma,gsize)=self.module.get_global('GAMMA') cuda.memcpy_htod(g_gamma, np.float32(self.Gamma) ) #get functions reference Dim =self.Dim vecBytes = Dim*4 for f in range(self.max_concurrent_kernels): gfun = self.module.get_function(self.func_name) self.func.append(gfun) #init texture for vector I vecI_tex=self.module.get_texref('VecI_TexRef') self.g_vecI[f]=cuda.mem_alloc( vecBytes) vecI_tex.set_address(self.g_vecI[f],vecBytes) #init texture for vector J vecJ_tex=self.module.get_texref('VecJ_TexRef') self.g_vecJ[f]=cuda.mem_alloc( vecBytes) vecJ_tex.set_address(self.g_vecJ[f],vecBytes) self.tex_ref.append((vecI_tex,vecJ_tex) ) self.main_vecI.append(np.zeros((1,Dim),dtype=np.float32)) self.main_vecJ.append(np.zeros((1,Dim),dtype=np.float32)) texReflist = list(self.tex_ref[f]) #function definition P-pointer i-int gfun.prepare("PPPPPPiiiiiiPPP",texrefs=texReflist) #transform X to particular format v,c,r=spf.csr2ellpack(self.X,align=self.prefetch) #copy format data structure to gpu memory self.g_val = cuda.to_device(v) self.g_col = cuda.to_device(c) self.g_len = cuda.to_device(r) self.g_sdot = cuda.to_device(self.Xsquare) self.g_cls_start = cuda.to_device(self.cls_start) def cls_init(self,kernel_nr,y_cls,cls1,cls2,cls1_n,cls2_n): """ Prepare cuda kernel call for kernel_nr, copy data for particular binary classifier, between class 1 vs 2. Parameters ------------ kernel_nr : int concurrent kernel number y_cls : array-like binary class labels (1,-1) cls1: int first class number cls2: int second class number cls1_n : int number of elements of class 1 cls2_n : int number of elements of class 2 kernel_out : array-like array for gpu kernel result, size=2*len(y_cls) """ warp=32 align_cls1_n = cls1_n+(warp-cls1_n%warp)%warp align_cls2_n = cls2_n+(warp-cls2_n%warp)%warp self.cls1_N_aligned=align_cls1_n sum_cls= align_cls1_n+align_cls2_n self.sum_cls[kernel_nr] = sum_cls self.cls_count[kernel_nr] = np.array([cls1_n,cls2_n],dtype=np.int32) self.cls[kernel_nr] = np.array([cls1,cls2],dtype=np.int32) self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr]) self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr]) self.bpg[kernel_nr] =int( np.ceil( (self.threadsPerRow*sum_cls+0.0)/self.tpb )) self.g_y[kernel_nr] = cuda.to_device(y_cls) self.kernel_out[kernel_nr] = np.zeros(2*y_cls.shape[0],dtype=np.float32) ker_out = self.kernel_out[kernel_nr] self.g_out[kernel_nr] = cuda.to_device(ker_out) # cuda.mem_alloc_like(ker_out) #add prepare for device functions def K2Col(self,i,j,i_ds,j_ds,kernel_nr): """ computes i-th and j-th kernel column Parameters --------------- i: int i-th kernel column number in subproblem j: int j-th kernel column number in subproblem i_ds: int i-th kernel column number in whole dataset j_ds: int j-th kernel column number in whole dataset kernel_nr : int number of concurrent kernel ker2ColOut: array like array for output Returns ------- ker2Col """ #make i-th and j-the main vectors vecI= self.main_vecI[kernel_nr] vecJ= self.main_vecJ[kernel_nr] # self.X[i_ds,:].todense(out=vecI) # self.X[j_ds,:].todense(out=vecJ) #vecI.fill(0) #vecJ.fill(0) #self.X[i_ds,:].toarray(out=vecI) #self.X[j_ds,:].toarray(out=vecJ) vecI=self.X.getrow(i_ds).todense() vecJ=self.X.getrow(j_ds).todense() #copy them to texture cuda.memcpy_htod(self.g_vecI[kernel_nr],vecI) cuda.memcpy_htod(self.g_vecJ[kernel_nr],vecJ) # temp = np.empty_like(vecI) # cuda.memcpy_dtoh(temp,self.g_vecI[kernel_nr]) # print 'temp',temp #lauch kernel gfunc=self.func[kernel_nr] gy = self.g_y[kernel_nr] gout = self.g_out[kernel_nr] gN = np.int32(self.N) g_i = np.int32(i) g_j = np.int32(j) g_ids = np.int32(i_ds) g_jds = np.int32(j_ds) gNalign = np.int32(self.cls1_N_aligned) gcs = self.g_cls_start gcc = self.g_cls_count[kernel_nr] gc = self.g_cls[kernel_nr] bpg=self.bpg[kernel_nr] #print 'start gpu i,j,kernel_nr ',i,j,kernel_nr #texReflist = list(self.tex_ref[kernel_nr]) #gfunc(self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc,block=(self.tpb,1,1),grid=(bpg,1),texrefs=texReflist) #print 'end gpu',i,j #copy the results #grid=(bpg,1),block=(self.tpb,1,1) gfunc.prepared_call((bpg,1),(self.tpb,1,1),self.g_val,self.g_col,self.g_len,self.g_sdot,gy,gout,gN,g_i,g_j,g_ids,g_jds,gNalign,gcs,gcc,gc) cuda.memcpy_dtoh(self.kernel_out[kernel_nr],gout) return self.kernel_out[kernel_nr] def K_vec(self,vec): ''' vec - array-like, row ordered data, should be not to big ''' dot=self.X.dot(vec.T) x2=self.Xsquare.reshape((self.Xsquare.shape[0],1)) if(sp.issparse(vec)): v2 = vec.multiply(vec).sum(1).reshape((1,vec.shape[0])) else: v2 = np.einsum('...i,...i',vec,vec) return np.exp(-self.Gamma*(x2+v2-2*dot)) def compute_diag(self): """ Computes kernel matrix diagonal """ #for rbf diagonal consists of ones exp(0)==1 self.Diag = np.ones(self.X.shape[0],dtype=np.float32) if(sp.issparse(self.X)): # result as matrix self.Xsquare = self.X.multiply(self.X).sum(1) #result as array self.Xsquare = np.asarray(self.Xsquare).flatten() else: self.Xsquare =np.einsum('...i,...i',self.X,self.X) def clean(self,kernel_nr): """ clean the kernel cache """ #self.kernel_cache.clear() self.bpg[kernel_nr]=0 def clean_cuda(self): ''' clean all cuda resources ''' for f in range(self.max_concurrent_kernels): #vecI_tex=?? #self.g_vecI[f].free() del self.g_vecI[f] #init texture for vector J #vecJ_tex=?? #self.g_vecJ[f].free() del self.g_vecJ[f] self.g_cls_count[f].free() self.g_cls[f].free() self.g_y[f].free() self.g_out[f].free() #test it #del self.g_out[f] ?? #copy format data structure to gpu memory self.g_val.free() self.g_col.free() self.g_len.free() self.g_sdot.free() self.g_cls_start.free() print self.ctx self.ctx.pop() print self.ctx del self.ctx def predict_init(self, SV): """ Init the classifier for prediction """ self.X =SV self.compute_diag()
kernels = ''' __constant__ float2 vc[1]; __global__ void vcf(float2 *psi) { int tid = threadIdx.x; __shared__ float2 spsi[TID_MAX]; spsi[tid] = psi[tid]; if ( tid < TID_MAX ) { psi[tid].x = vc[0].x * spsi[tid].x - vc[0].y * spsi[tid].y; psi[tid].y = vc[0].x * spsi[tid].y + vc[0].y * spsi[tid].x; //psi[tid].x = vc[0].x; //psi[tid].y = vc[0].y; } }'''.replace('TID_MAX', str(35)) print kernels mod = SourceModule(kernels) vcf = mod.get_function('vcf') vc_const, _ = mod.get_global('vc') cuda.memcpy_htod(vc_const, vc) psi_gpu = gpuarray.to_gpu(psi) vcf(psi_gpu, block=(256,1,1), grid=(1,1)) assert np.linalg.norm(psi_gpu.get() - vc*psi) < 1e-6 ctx.pop()
def finite_sum_d(self, TILEWIDTH, TILEHEIGHT, g): template = """ #include <stdlib.h> #include <stdio.h> #include <math.h> #define GENUS %d #define TILEHEIGHT %d #define TILEWIDTH %d __device__ __constant__ double Xd[GENUS*GENUS]; __device__ __constant__ double Yinvd[GENUS * GENUS]; __device__ __constant__ double Td[GENUS*GENUS]; /*************************************************************************** normpart -------- A helper function for the finite sum functions. Computes: -pi * ||T*(n + fracshift)||^2 = -pi * ||T * (n + (shift - intshift))||^2 = -pi * ||T * (n + Yinv*y - round(Yinv*y))||^2 ***************************************************************************/ __device__ double normpart(int g, double* Sd_s, double* yd_s) { int tx = threadIdx.x; int ty = threadIdx.y; double norm = 0; int i,j,k; for (i = 0; i < g; i++) { double sum = 0; for (j = 0; j < g; j++) { double T_ij = Td[i*g + j]; double n_j = Sd_s[tx*g + j]; double shift_j = 0; for (k = 0; k < g; k++) { shift_j += Yinvd[g*j + k]*yd_s[ty*g + k]; } sum += T_ij * (n_j + shift_j - round(shift_j)); } norm += sum * sum; } return -M_PI * norm; } /************************************************************************* exppart ------- A helper function for the finite sum functions. Computes: 2pi * <(n - intshift), (1/2)X(n - intshift) + x> =2pi * <n - round(shift), (1/2)X(n - round(shift) + x> =2pi * <n - round(Yinv*y), (1/2)X(n - round(Yinv*y) + x> ***************************************************************************/ __device__ double exppart(int g, double* Sd_s, double* xd_s, double* yd_s) { int tx = threadIdx.x; int ty = threadIdx.y; double exppart = 0; int i,j,k,h; for (i = 0; i < g; i++) { double n_i = Sd_s[tx*g + i]; double shift_i = 0; for (k = 0; k < g; k++) { shift_i += Yinvd[k + i*g] * yd_s[ty*g + k]; } double A = n_i - round(shift_i); double Xshift_i = 0; for (j = 0; j < g; j++) { double X_ij = Xd[j + i * g]; double shift_j = 0; for (h = 0; h < g; h++) { shift_j += Yinvd[h + j * g] * yd_s[ty*g + h]; } Xshift_i += (.5) * (X_ij * (Sd_s[tx*g + j] - round(shift_j))); } double B = Xshift_i + xd_s[ty*g + i]; exppart += A * B; } return 2 * M_PI * exppart; } /********************************************************************** Derivative Product Computes: ___ | | 2*pi*I <d, n-intshift> | | d in derivs = ___ | | 2*pi*I <d, n-round(shift)> | | d in derivs = ___ | | 2*pi*I <d, n-round(Yinv*y)> | | d in derivs ************************************************************************/ __device__ void deriv_prod(int g, double* Sd_s, double* yd_s, double* dpr, double* dpi, double* deriv_real, double* deriv_imag, int nderivs) { int tx = threadIdx.x; int ty = threadIdx.y; double total_real = 1; double total_imag = 0; int i,j,k; for (i = 0; i < nderivs; i++){ double term_real = 0; double term_imag = 0; for (j = 0; j < g; j++){ double shift_j = 0; for (k = 0; k < g; k++){ shift_j += Yinvd[j*g + k] * yd_s[ty*g + k]; } double intshift = round(shift_j); double nmintshift = Sd_s[tx*g + j] - intshift; term_real += deriv_real[j + g*i] * nmintshift; term_imag += deriv_imag[j + g*i] * nmintshift; } total_real = total_real * term_real - total_imag * term_imag; total_imag = total_real * term_imag + total_imag * term_real; } //Computes: (2*pi*i)^(nderivs) * (total_real + total_imag*i) double pi_mult = pow(2*M_PI, nderivs); /*Determines what the result of i^nderivs is, and performs the correct multiplication afterwards.*/ if (nderivs %% 4 == 0) { dpr[0] = pi_mult*total_real; dpi[0] = pi_mult*total_imag; } else if (nderivs %% 4 == 1) { dpr[0] = -pi_mult * total_imag; dpi[0] = pi_mult * total_real; } else if (nderivs %% 4 == 2) { dpr[0] = -pi_mult * total_real; dpi[0] = -pi_mult * total_imag; } else if (nderivs %% 4 == 3) { dpr[0] = pi_mult * total_imag; dpi[0] = -pi_mult * total_real; } } /*********************************************************************** Finite Sum Without Derivatives Kernel Function ************************************************************************/ __global__ void riemann_theta(double* fsum_reald, double* fsum_imagd, double* xd, double* yd, double* Sd, int g, int N, int K) { /*Built in variables to be used, br is block row, bc is block column, and similiarly for tr and tc.*/ int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double Sd_s[TILEWIDTH * GENUS]; __shared__ double xd_s[TILEHEIGHT * GENUS]; __shared__ double yd_s[TILEHEIGHT * GENUS]; /*Determine n_1, the start of the summation vector, the full vector is of the form n_1, n_2, ..., n_g*/ int n_start = (bx * TILEWIDTH + tx) * g; /*Now n = S[n_start], S[n_start + 1], ..., S[n_start + (g - 1)]*/ /*Determine z the point of evaluation*/ int z_start = (by * TILEHEIGHT + ty) * g; /*Now x = (x[z_start], x[z_start + 1], ... , x[z_start + (g-1)], and similiarly for y.*/ /*Load data into shared arrays*/ int i; for (i = 0; i < g; i++) { Sd_s[tx*g + i] = Sd[n_start + i]; xd_s[ty*g + i] = xd[z_start + i]; yd_s[ty*g + i] = yd[z_start + i]; } __syncthreads(); if (n_start < N*g && z_start < K*g) { /*Compute the "cosine" and "sine" parts of the summand*/ double ept, npt, cpt, spt; ept = exppart(g,Sd_s, xd_s, yd_s); npt = exp(normpart(g, Sd_s, yd_s)); cpt = npt * cos(ept); spt = npt * sin(ept); fsum_reald[n_start/g + z_start/g * N] = cpt; fsum_imagd[n_start/g + z_start/g * N] = spt; } } /************************************************************************************ Finite Sum with Derivatives Kernel Function ************************************************************************************/ __global__ void riemann_theta_derivatives(double* fsum_reald, double* fsum_imagd, double* xd, double* yd, double* Sd, double* deriv_real, double* deriv_imag, int nderivs, int g, int N, int K) { /*Built in variables to be used, br is block row, bc is block column, and similiarly for tr and tc.*/ int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double Sd_s[TILEWIDTH * GENUS]; __shared__ double xd_s[TILEHEIGHT * GENUS]; __shared__ double yd_s[TILEHEIGHT * GENUS]; /*Determine n_1, the start of the summation vector, the full vector is of the form n_1, n_2, ..., n_g*/ int n_start = (bx * TILEWIDTH + tx) * g; /*Now n = S[n_start], S[n_start + 1], ..., S[n_start + (g - 1)]*/ /*Determine z the point of evaluation*/ int z_start = (by * TILEHEIGHT + ty) * g; /*Now x = (x[z_start], x[z_start + 1], ... , x[z_start + (g-1)], and similiarly for y.*/ /*Load data into shared arrays*/ int i; for (i = 0; i < g; i++) { Sd_s[tx*g + i] = Sd[n_start + i]; xd_s[ty*g + i] = xd[z_start + i]; yd_s[ty*g + i] = yd[z_start + i]; } __syncthreads(); if (n_start < N*g && z_start < K*g) { /*Compute the "cosine" and "sine" parts of the summand*/ double dpr[1]; double dpi[1]; dpr[0] = 0; dpi[0] = 0; double ept, npt, cpt, spt; ept = exppart(g,Sd_s, xd_s, yd_s); npt = exp(normpart(g, Sd_s, yd_s)); cpt = npt * cos(ept); spt = npt * sin(ept); deriv_prod(g, Sd_s, yd_s, dpr, dpi, deriv_real, deriv_imag, nderivs); fsum_reald[n_start/g + z_start/g * N] = dpr[0] * cpt - dpi[0] * spt; fsum_imagd[n_start/g + z_start/g * N] = dpi[0] * cpt + dpr[0] * spt; } } """ %(g, TILEHEIGHT, TILEWIDTH) mod = SourceModule(template) func = mod.get_function("riemann_theta") deriv_func = mod.get_function("riemann_theta_derivatives") Xd = mod.get_global("Xd")[0] Yinvd = mod.get_global("Yinvd")[0] Td = mod.get_global("Td")[0] return (func, deriv_func, Xd, Yinvd, Td)
for k in range(farthest_index+1, NUM_CHARGERS): current_assignments[k] = current_assignments[k-1]+1 sys.exit() """ else: defines += "#define NUM_THREADS " + str(999) + "\n" +\ "#define TOTAL_WORK " + str(0) + "\n" +\ "#define NUM_WORK_PER_THREAD " + str(0) + "\n" +\ "#define LAST_THREAD_START_OFFSET " + str(0) + "\n" mod = SourceModule(preamble + defines + kernel_approx_src, no_extern_c=True) (routes_lengths_gpu, size_in_bytes) = mod.get_global("routes_lengths") (stops_lengths_gpu, size_in_bytes) = mod.get_global("stops_lengths") cuda.memcpy_htod(routes_lengths_gpu, routes_lengths_np) cuda.memcpy_htod(stops_lengths_gpu, stops_lengths_np) routes_gpu = cuda.mem_alloc(routes_np.nbytes) cuda.memcpy_htod(routes_gpu, routes_np) stops_gpu = cuda.mem_alloc(stops_np.nbytes) cuda.memcpy_htod(stops_gpu, stops_np) final_utilities_np = np.zeros((NUM_RUNS), dtype=np.float32) final_utilities_gpu = cuda.mem_alloc(final_utilities_np.nbytes) final_chargers_np = np.zeros((NUM_RUNS, NUM_CHARGER_INTS), dtype=np.uint32)
def solve_gpu(currentmodelrun, modelend, G): """Solving using FDTD method on GPU. Implemented using Nvidia CUDA. Args: currentmodelrun (int): Current model run number. modelend (int): Number of last model to run. G (class): Grid class instance - holds essential parameters describing the model. Returns: tsolve (float): Time taken to execute solving memsolve (int): memory usage on final iteration in bytes """ import pycuda.driver as drv from pycuda.compiler import SourceModule drv.init() # Suppress nvcc warnings on Windows if sys.platform == 'win32': compiler_opts = ['-w'] else: compiler_opts = None # Create device handle and context on specifc GPU device (and make it current context) dev = drv.Device(G.gpu.deviceID) ctx = dev.make_context() # Electric and magnetic field updates - prepare kernels, and get kernel functions if Material.maxpoles > 0: kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3]), options=compiler_opts) else: # Set to one any substitutions for dispersive materials kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1), options=compiler_opts) update_e_gpu = kernels_fields.get_function("update_e") update_h_gpu = kernels_fields.get_function("update_h") # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0] if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem: raise GeneralError('Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'.format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name)) else: drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU if Material.maxpoles > 0: # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values). update_e_dispersive_A_gpu = kernels_fields.get_function("update_e_dispersive_A") update_e_dispersive_B_gpu = kernels_fields.get_function("update_e_dispersive_B") G.gpu_initialise_dispersive_arrays() # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU G.gpu_set_blocks_per_grid() G.gpu_initialise_arrays() # PML updates if G.pmls: # Prepare kernels pmlmodulelectric = 'gprMax.pml_updates.pml_updates_electric_' + G.pmlformulation + '_gpu' kernelelectricfunc = getattr(import_module(pmlmodulelectric), 'kernels_template_pml_electric_' + G.pmlformulation) pmlmodulemagnetic = 'gprMax.pml_updates.pml_updates_magnetic_' + G.pmlformulation + '_gpu' kernelmagneticfunc = getattr(import_module(pmlmodulemagnetic), 'kernels_template_pml_magnetic_' + G.pmlformulation) kernels_pml_electric = SourceModule(kernelelectricfunc.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts) kernels_pml_magnetic = SourceModule(kernelmagneticfunc.substitute(REAL=cudafloattype, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsH.shape[1], NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels updatecoeffsE = kernels_pml_electric.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_pml_magnetic.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Set block per grid, initialise arrays on GPU, and get kernel functions for pml in G.pmls: pml.gpu_initialise_arrays() pml.gpu_get_update_funcs(kernels_pml_electric, kernels_pml_magnetic) pml.gpu_set_blocks_per_grid(G) # Receivers if G.rxs: # Initialise arrays on GPU rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G) # Prepare kernel and get kernel function kernel_store_outputs = SourceModule(kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1), options=compiler_opts) store_outputs_gpu = kernel_store_outputs.get_function("store_outputs") # Sources - initialise arrays on GPU, prepare kernel and get kernel functions if G.voltagesources + G.hertziandipoles + G.magneticdipoles: kernels_sources = SourceModule(kernels_template_sources.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1, NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) if G.hertziandipoles: srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(G.hertziandipoles, G) update_hertzian_dipole_gpu = kernels_sources.get_function("update_hertzian_dipole") if G.magneticdipoles: srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(G.magneticdipoles, G) update_magnetic_dipole_gpu = kernels_sources.get_function("update_magnetic_dipole") if G.voltagesources: srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(G.voltagesources, G) update_voltage_source_gpu = kernels_sources.get_function("update_voltage_source") # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions if G.snapshots: # Initialise arrays on GPU snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(G) # Prepare kernel and get kernel function kernel_store_snapshot = SourceModule(kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.nx + 1, NY_FIELDS=G.ny + 1, NZ_FIELDS=G.nz + 1), options=compiler_opts) store_snapshot_gpu = kernel_store_snapshot.get_function("store_snapshot") # Iteration loop timer iterstart = drv.Event() iterend = drv.Event() iterstart.record() for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=not G.progressbars): # Get GPU memory usage on final iteration if iteration == G.iterations - 1: memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0] # Store field component values for every receiver if G.rxs: store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration), rxcoords_gpu.gpudata, rxs_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1)) # Store any snapshots for i, snap in enumerate(G.snapshots): if snap.time == iteration + 1: if not G.snapsgpu2cpu: store_snapshot_gpu(np.int32(i), np.int32(snap.xs), np.int32(snap.xf), np.int32(snap.ys), np.int32(snap.yf), np.int32(snap.zs), np.int32(snap.zf), np.int32(snap.dx), np.int32(snap.dy), np.int32(snap.dz), G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata, snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata, block=Snapshot.tpb, grid=Snapshot.bpg) else: store_snapshot_gpu(np.int32(0), np.int32(snap.xs), np.int32(snap.xf), np.int32(snap.ys), np.int32(snap.yf), np.int32(snap.zs), np.int32(snap.zf), np.int32(snap.dx), np.int32(snap.dy), np.int32(snap.dz), G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata, snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata, block=Snapshot.tpb, grid=Snapshot.bpg) gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), 0, snap) # Update magnetic field components update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update magnetic field components with the PML correction for pml in G.pmls: pml.gpu_update_magnetic(G) # Update magnetic field components for magetic dipole sources if G.magneticdipoles: update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata, srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1)) # Update electric field components # If all materials are non-dispersive do standard update if Material.maxpoles == 0: update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # If there are any dispersive materials do 1st part of dispersive update # (it is split into two parts as it requires present and updated electric field values). else: update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update electric field components with the PML correction for pml in G.pmls: pml.gpu_update_electric(G) # Update electric field components for voltage sources if G.voltagesources: update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata, srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1)) # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last) if G.hertziandipoles: update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata, srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1)) # If there are any dispersive materials do 2nd part of dispersive update (it is split into two parts as it requires present and updated electric field values). Therefore it can only be completely updated after the electric field has been updated by the PML and source updates. if Material.maxpoles > 0: update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Copy output from receivers array back to correct receiver objects if G.rxs: gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G) # Copy data from any snapshots back to correct snapshot objects if G.snapshots and not G.snapsgpu2cpu: for i, snap in enumerate(G.snapshots): gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) iterend.record() iterend.synchronize() tsolve = iterstart.time_till(iterend) * 1e-3 # Remove context from top of stack and delete ctx.pop() del ctx return tsolve, memsolve
class CudaRender: """Main class. Do all stuff""" def __init__(self, w = 800, h = 800, name = "CudaRender"): self.w = w self.h = h self.name = name self.buffers = {} self.pointers = {} self.scn = Blender.Scene.GetCurrent() self.mouse_states = {} self.cuda_buffers = {} self.mouse_coords = {GLUT_LEFT_BUTTON : (0,0)} self.frame = 0 self.timebase = 0 self.calculated_vis = False self.from_files = {'model' : False, 'cudaRes': False} self.kernel_step = 4 self.secs = 0 self.cuda_stop = False self.live = True self.memory_type = '1dtex' # 'global' def create_events(self): self.start = cuda_driver.Event() self.end = cuda_driver.Event() def load_models_from_blender(self): self.objs = Blender.Object.GetSelected() if self.objs: self.verts = [] self.normals = [] self.indexes = [] index_offset = 0 for obj in self.objs: mesh = obj.getData(mesh=True) mesh.quadToTriangle() mesh.transform(obj.getMatrix()) counter = 0 for face in mesh.faces: self.indexes.extend(range(index_offset + counter,index_offset + counter + 3)) counter += 3 for v in face.v: self.verts.extend(v.co) self.normals.extend(face.no) index_offset = max(self.indexes) + 1 self.verts = numpy.asarray(self.verts).astype(numpy.float32) self.indexes = numpy.asarray(self.indexes).astype(numpy.ushort) self.normals = numpy.asarray(self.normals).astype(numpy.float32) def save_models_data_to_files(self): numpy.savetxt(os.path.join(p, 'data', 'verts.dat'), self.verts) numpy.savetxt(os.path.join(p, 'data', 'normals.dat'), self.normals) numpy.savetxt(os.path.join(p, 'data', 'indexes.dat'), self.indexes) def save_cuda_res_to_files(self): numpy.savetxt(os.path.join(p, 'data', 'vis.dat'), self.vis) def load_cuda_res_from_files(self): self.vis = numpy.loadtxt(os.path.join(p, 'data', 'vis.dat')).astype(numpy.float32) def load_models_from_files(self): self.verts = numpy.loadtxt(os.path.join(p, 'data', 'verts.dat')).astype(numpy.float32) self.normals = numpy.loadtxt(os.path.join(p, 'data', 'normals.dat')).astype(numpy.float32) self.indexes = numpy.loadtxt(os.path.join(p, 'data', 'indexes.dat')).astype(numpy.ushort) def load(self): self.load_cam() self.design = numpy.loadtxt(os.path.join(p, 'data', 'des3d_240_21.txt')).astype(numpy.float32) if (self.from_files['model']): self.load_models_from_files() else: self.load_models_from_blender() self.save_models_data_to_files() if (self.from_files['cudaRes']): self.load_cuda_res_from_files() else: self.vis = numpy.zeros((1, self.verts.size/3), numpy.float32) def create_glut_window(self): glutInit(sys.argv) glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH) glutInitWindowSize(self.w,self.h) glutCreateWindow(self.name) glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_CONTINUE_EXECUTION) glEnable(GL_DEPTH_TEST) #TODO: get bgColor form blender glutDisplayFunc(self.display) glutReshapeFunc(self.reshape) glutKeyboardFunc(self.keyboard) glutMouseFunc(self.mouse_button) glutMotionFunc(self.mouse_move) if (load_cuda): import pycuda.gl.autoinit import pycuda.driver as cuda_driver self.create_events() def mouse_button(self, button, state, x, y): self.mouse_states[button] = state self.mouse_coords[button] = (x,y) def mouse_move(self, x, y): cam_move_button = GLUT_LEFT_BUTTON f = 0.01 if cam_move_button in self.mouse_states and self.mouse_states[cam_move_button] == GLUT_DOWN: mod = glutGetModifiers() dx = self.mouse_coords[cam_move_button][0] - x dy = self.mouse_coords[cam_move_button][1] - y self.mouse_coords[cam_move_button] = (x,y) if mod <> GLUT_ACTIVE_CTRL: self.cam['phi'] += f*dx self.cam['theta'] += f*dy if mod == GLUT_ACTIVE_CTRL: self.cam['r'] += f*dy if self.cam['r'] < 4: self.cam['r'] = 4 if self.cam['r'] > 20 : self.cam['r'] = 20 self.set_cam() glutPostRedisplay() def keyboard(self, key, x, y): code = ord(key) if (code == 27):#Esc glutLeaveMainLoop() if (key == ' '):#Space self.cuda_stop = not self.cuda_stop def reshape(self, width, height): self.w = width self.h = height glutPostRedisplay() def cuda_from_display(self): #TODO: cuda streams ? if (not self.calculated_vis and load_cuda and not self.cuda_stop): try: step = self.kernel_iterator.next() self.cuda_map() self.cuda_call_kernal_step(step) glutSetWindowTitle("%s kernel steps: %d" % (self.name, step)) self.cuda_unmap() except StopIteration: self.cuda_print_secs() self.cuda_free_memory() self.calculated_vis = True self.save_cuda_res_to_files() def display(self): self.cuda_from_display() glViewport(0, 0, self.w, self.h) #TODO: do smoothing glEnable(GL_POLYGON_SMOOTH) glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT) self.put_buffer('aPos', 3) #self.put_buffer('aNorm', 3) self.put_buffer('aVis') glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, self.buffers['ind']) glDrawElements(GL_TRIANGLES, self.indexes.size, GL_UNSIGNED_SHORT, None) glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0) if (self.calculated_vis or self.cuda_stop): self.fps() glutSwapBuffers() if (not self.calculated_vis): glutPostRedisplay() def fps(self): self.frame += 1; time = glutGet(GLUT_ELAPSED_TIME) if (time - self.timebase > 1000): s = "FPS:%4.2f" % (self.frame*1000/(time - self.timebase)) self.timebase = time self.frame = 0 s = self.name + " " + s glutSetWindowTitle(s) def put_buffer(self, name, size = 1, type=GL_FLOAT): glBindBuffer(GL_ARRAY_BUFFER, self.buffers[name]) glVertexAttribPointer(self.pointers[name], size, type, GL_FALSE, 0, None) glBindBuffer(GL_ARRAY_BUFFER, 0) def make_buffer(self, name, data, target=GL_ARRAY_BUFFER, usage=GL_STATIC_DRAW): self.buffers[name] = glGenBuffers(1) glBindBuffer(target, self.buffers[name]) glBufferData(target, data.size*data.itemsize, data, usage) glBindBuffer(target, 0) def make_cuda_buffer(self, name): self.cuda_buffers[name] = cuda_gl.BufferObject(long(self.buffers[name])) def init_buffers(self): self.make_buffer('aVis', self.vis, GL_ARRAY_BUFFER, GL_DYNAMIC_DRAW) self.make_buffer('aPos', self.verts) #self.make_buffer('aNorm', self.normals) self.make_buffer('ind', self.indexes, GL_ELEMENT_ARRAY_BUFFER) if (not self.calculated_vis): self.make_cuda_buffer('aVis') if (self.memory_type == 'global'): self.make_cuda_buffer('aPos') #self.make_cuda_buffer('ind') def create_shaders(self): #TODO: get errors self.shaders = {} self.shaders['vertex'] = glCreateShader(GL_VERTEX_SHADER) self.shaders['fragment'] = glCreateShader(GL_FRAGMENT_SHADER) for (name,shader) in self.shaders.iteritems(): source = open(os.path.join(p, 'glsl', name + '.glsl')).read() glShaderSource(shader, source) glCompileShader(shader) self.program = glCreateProgram() glAttachShader(self.program, self.shaders['vertex']) glAttachShader(self.program, self.shaders['fragment']) glLinkProgram(self.program) glUseProgram(self.program) #self.program.mvMatrixUniform = glGetUniformLocation(self.program, "uMVMatrix"); def make_pointer(self, name): self.pointers[name] = glGetAttribLocation(self.program, name) glEnableVertexAttribArray(self.pointers[name]) def init_pointers(self): self.make_pointer('aVis') self.make_pointer('aPos') #self.make_pointer('aNorm') def load_cam(self): self.cam_obj = self.scn.objects.camera cam = self.cam_obj.getData() self.cam = {} matrix = self.cam_obj.getMatrix() self.cam['pos'] = pos = matrix[3] self.cam['forwards'] = -matrix[2] self.cam['target'] = matrix[3] - matrix[2] self.cam['up'] = matrix[1] self.cam['fov'] = cam.angle self.cam['start'] = cam.clipStart self.cam['end'] = cam.clipEnd #TODO: FIX self.cam['r'] = math.sqrt(numpy.dot(pos,pos)) self.cam['theta'] = math.acos(pos[2]/self.cam['r']) self.cam['phi'] = math.atan(pos[1]/pos[0]) def set_cam(self): r = self.cam['r']; theta = self.cam['theta']; phi = self.cam['phi'] self.cam['pos'][0] = r*math.sin(theta)*math.cos(phi) self.cam['pos'][1] = r*math.sin(theta)*math.sin(phi) self.cam['pos'][2] = r*math.cos(theta) glMatrixMode(GL_PROJECTION) glLoadIdentity() gluPerspective(self.cam['fov'], self.w / self.h, self.cam['start'], self.cam['end']) gluLookAt(self.cam['pos'][0], self.cam['pos'][1], self.cam['pos'][2],\ 0, 0, 0,\ 0, 0, 1) def set_matrix(self): self.set_cam() glMatrixMode(GL_MODELVIEW) glLoadIdentity() def cuda_print_secs(self): print "cuda time: %fs" % self.secs #TODO: optim block size and grid size def init_cuda(self): inc = {'common': os.path.join(p,'cu', 'common.cu')} template_params = {'inc': inc, 'dN' : self.design.size, 'visN': self.vis.size, 'vN' : self.verts.size, 'kernelStep' : self.kernel_step} kernel_code = Template( file = os.path.join(p, 'cu', 'vis_%s.cu' % (self.memory_type)), searchList = [template_params], ) self.cuda_module = SourceModule(kernel_code) self.cuda_call = self.cuda_module.get_function("vis") self.block_seize = (256, 1, 1) if (self.memory_type == 'global'): self.cuda_call.prepare("PPP", self.block_seize) if (self.memory_type == '1dtex'): self.cuda_call.prepare("P", self.block_seize) def cuda_get_memory(self): self.grid_dimensions = (min(32, (self.vis.size+256-1) // 256 ), 1) self.cuda_mem = {} if (self.memory_type == 'global'): self.cuda_mem['normals_gpu'] = cuda_driver.mem_alloc(self.normals.nbytes) cuda_driver.memcpy_htod(self.cuda_mem['normals_gpu'], self.normals) self.cuda_mem['design_gpu'] = self.cuda_module.get_global('design')[0] cuda_driver.memcpy_htod(self.cuda_mem['design_gpu'], self.design) self.cuda_mem['kernel_n'] = self.cuda_module.get_global('kernelN')[0] if (self.memory_type == '1dtex'): self.put_data_to_cuda1dtex(self.normals, 'n_tex') self.put_data_to_cuda1dtex(self.verts, 'v_tex') def put_data_to_cuda1dtex(self, data, name): if (not name in self.cuda_mem): self.cuda_mem[name] = self.cuda_module.get_texref(name) self.cuda_mem[name + '_gpu'] = cuda_driver.to_device(data) self.cuda_mem[name].set_address(self.cuda_mem[name + '_gpu'], data.nbytes) self.cuda_mem[name].set_format(cuda_driver.array_format.FLOAT, 1) def cuda_map(self): self.cuda_mem['map_vis'] = self.cuda_buffers['aVis'].map() if (self.memory_type == 'global'): self.cuda_mem['map_pos'] = self.cuda_buffers['aPos'].map() def cuda_unmap(self): cuda_driver.Context.synchronize() self.cuda_mem['map_vis'].unmap() if (self.memory_type == 'global'): self.cuda_mem['map_pos'].unmap() def cuda_free_memory(self): pass def cuda_call_kernel_global(self): self.cuda_call.prepared_call(self.grid_dimensions, self.cuda_mem['map_vis'].device_ptr(), self.cuda_mem['map_pos'].device_ptr(), self.cuda_mem['normals_gpu'] ) def cuda_call_kernel_1dtex(self): self.cuda_call.prepared_call(self.grid_dimensions, self.cuda_mem['map_vis'].device_ptr()) def cuda_call_kernal_step(self, step): self.start.record() cuda_driver.memcpy_htod(self.cuda_mem['kernel_n'], numpy.array([step]).astype(numpy.int32)) getattr(self, 'cuda_call_kernel_' + self.memory_type)() self.end.record() self.end.synchronize() #cuda_driver.Context.synchronize() # ????/???/ self.secs += self.start.time_till(self.end)*1e-3 def cuda_kernel_iterator(self): self.kernel_iterator = iter(range(0, self.design.size, self.kernel_step)) def call_cuda(self): self.secs = 0 for i in range(0, self.design.size, self.kernel_step): self.cuda_call_kernal_step(i) self.cuda_print_secs() def run(self): self.load() self.create_glut_window() self.set_matrix() self.init_buffers() self.create_shaders() self.init_pointers() if (not self.calculated_vis and load_cuda): self.init_cuda() self.cuda_kernel_iterator() self.cuda_get_memory() if (not self.live): self.cuda_map() self.call_cuda() self.cuda_free_memory() self.cuda_unmap() self.calculated_vis = True glutMainLoop()
def main(): #FourPermutations set-up FourPermutations = numpy.array([ [1,2,3,4], [1,2,4,3], [1,3,2,4], [1,3,4,2], [1,4,2,3], [1,4,3,2], [2,1,3,4], [2,1,4,3], [2,3,1,4], [2,3,4,1], [2,4,1,3], [2,4,3,1], [3,2,1,4], [3,2,4,1], [3,1,2,4], [3,1,4,2], [3,4,2,1], [3,4,1,2], [4,2,3,1], [4,2,1,3], [4,3,2,1], [4,3,1,2], [4,1,2,3], [4,1,3,2],]).astype(numpy.uint8) BankSize = 8 # Do not go beyond 8! #Define constants DimGridX = 19 DimGridY = 19 #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" BlockDimX = 100 BlockDimY = 100 SearchSpaceSize = BlockDimX * BlockDimY * 32 #BlockDimX = 600 #BlockDimY = 600 FitnessValDim = SearchSpaceSize GenomeDim = SearchSpaceSize #Create dictionary argument for rendering RenderArgs= {"safe_memory_mapping":1, "aligned_byte_length_genome":4, "bit_length_edge_type":3, "curand_nr_threads_per_block":32, "nr_tile_types":2, "nr_edge_types":8, "warpsize":32, "fit_dim_thread_x":32*BankSize, "fit_dim_thread_y":1, "fit_dim_block_x":BlockDimX, "fit_dim_grid_x":19, "fit_dim_grid_y":19, "fit_nr_four_permutations":24, "fit_length_movelist":244, "fit_nr_redundancy_grid_depth":2, "fit_nr_redundancy_assemblies":10, "fit_tile_index_starting_tile":0, "glob_nr_tile_orientations":4, "banksize":BankSize, "curand_dim_block_x":BlockDimX } # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', './')) # Load source code from file Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() ) # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20") Kernel = KernelSourceModule.get_function("SearchSpaceKernel") CurandKernel = KernelSourceModule.get_function("CurandInitKernel") #Initialise InteractionMatrix InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32) def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Set up our InteractionMatrix InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") print InteractionMatrix #Set-up genomes dest = numpy.arange(GenomeDim*4).astype(numpy.uint8) #for i in range(0, GenomeDim/4): #dest[i*8 + 0] = int('0b00100101',2) #CRASHES #dest[i*8 + 1] = int('0b00010000',2) #CRASHES #dest[i*8 + 0] = int('0b00101000',2) #dest[i*8 + 1] = int('0b00000000',2) #dest[i*8 + 2] = int('0b00000000',2) #dest[i*8 + 3] = int('0b00000000',2) #dest[i*8 + 4] = int('0b00000000',2) #dest[i*8 + 5] = int('0b00000000',2) #dest[i*8 + 6] = int('0b00000000',2) #dest[i*8 + 7] = int('0b00000000',2) # dest[i*4 + 0] = 40 # dest[i*4 + 1] = 0 # dest[i*4 + 2] = 0 # dest[i*4 + 3] = 0 dest_h = drv.mem_alloc(GenomeDim*4) #dest.nbytes) #drv.memcpy_htod(dest_h, dest) #print "Genomes before: " #print dest #Set-up grids #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up fitness values #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32) #fitness_h = drv.mem_alloc(fitness.nbytes) fitness_left = numpy.zeros(FitnessValDim).astype(numpy.float32) fitness_left_h = drv.mem_alloc(fitness_left.nbytes) fitness_bottom = numpy.zeros(FitnessValDim).astype(numpy.float32) fitness_bottom_h = drv.mem_alloc(fitness_bottom.nbytes) #drv.memcpy_htod(fitness_h, fitness) #print "Fitness values:" #print fitness #Set-up grids grids = numpy.zeros((10000*32, DimGridX, DimGridY)).astype(numpy.uint8) #TEST grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up curand #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8); #curand_h = drv.mem_alloc(curand.nbytes) curand_h = drv.mem_alloc(40*GenomeDim) #Set-up four permutations FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) #SearchSpace control #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" #Schedule kernel calls #MaxBlockDim = 100 OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*32) MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*32) BlockCounter=0 print "Will do that many kernels a ",BlockDimX,"x",BlockDimY,":", MaxBlockCycles for i in range(MaxBlockCycles): #Set-up timer start = drv.Event() stop = drv.Event() start.record() print "Start kernel:" #Call kernels CurandKernel(curand_h, block=(32,1,1), grid=(BlockDimX, BlockDimY)) print "Finished Curand kernel, starting main kernel..." Kernel(dest_h, grids_h, fitness_left_h, fitness_bottom_h, curand_h, block=(32*BankSize,1,1), grid=(BlockDimX,BlockDimY)) #End timer stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations) pass #Output #drv.memcpy_dtoh(dest, dest_h) #print "Genomes after: " #print dest[0:4] drv.memcpy_dtoh(fitness_left, fitness_left_h) print "FitnessLeft after: " print fitness_left#[1000000:1000500] drv.memcpy_dtoh(fitness_bottom, fitness_bottom_h) print "FitnessBottom after: " print fitness_bottom#[1000000:1000500] drv.memcpy_dtoh(grids, grids_h) print "Grids[0] after: " for i in range(0,5): print "Grid ",i,": " print grids[i]
def get_transduction_func(dtype, block_size, Xaddress, change_ind1, change_ind2, change1, change2, compile_options): template = """ /* This is kept for documentation purposes the actual code used is after the end * of this template */ #include "curand_kernel.h" extern "C" { #include "stdio.h" #define BLOCK_SIZE %(block_size)d #define LA 0.5 /* Simulation Constants */ #define C_T 0.5 /* Total concentration of calmodulin */ #define G_T 50 /* Total number of G-protein */ #define PLC_T 100 /* Total number of PLC */ #define T_T 25 /* Total number of TRP/TRPL channels */ #define I_TSTAR 0.68 /* Average current through one opened TRP/TRPL channel (pA)*/ #define GAMMA_DSTAR 4.0 /* s^(-1) rate constant*/ #define GAMMA_GAP 3.0 /* s^(-1) rate constant*/ #define GAMMA_GSTAR 3.5 /* s^(-1) rate constant*/ #define GAMMA_MSTAR 3.7 /* s^(-1) rate constant*/ #define GAMMA_PLCSTAR 144 /* s^(-1) rate constant */ #define GAMMA_TSTAR 25 /* s^(-1) rate constant */ #define H_DSTAR 37.8 /* strength constant */ #define H_MSTAR 40 /* strength constant */ #define H_PLCSTAR 11.1 /* strength constant */ #define H_TSTARP 11.5 /* strength constant */ #define H_TSTARN 10 /* strength constant */ #define K_P 0.3 /* Dissociation coefficient for calcium positive feedback */ #define K_P_INV 3.3333 /* K_P inverse ( too many decimals are not important) */ #define K_N 0.18 /* Dissociation coefficient for calmodulin negative feedback */ #define K_N_INV 5.5555 /* K_N inverse ( too many decimals are not important) */ #define K_U 30 /* (mM^(-1)s^(-1)) Rate of Ca2+ uptake by calmodulin */ #define K_R 5.5 /* (mM^(-1)s^(-1)) Rate of Ca2+ release by calmodulin */ #define K_CA 1000 /* s^(-1) diffusion from microvillus to somata (tuned) */ #define K_NACA 3e-8 /* Scaling factor for Na+/Ca2+ exchanger model */ #define KAPPA_DSTAR 1300.0 /* s^(-1) rate constant - there is also a capital K_DSTAR */ #define KAPPA_GSTAR 7.05 /* s^(-1) rate constant */ #define KAPPA_PLCSTAR 15.6 /* s^(-1) rate constant */ #define KAPPA_TSTAR 150.0 /* s^(-1) rate constant */ #define K_DSTAR 100.0 /* rate constant */ #define F 96485 /* (mC/mol) Faraday constant (changed from paper)*/ #define N 4 /* Binding sites for calcium on calmodulin */ #define R 8.314 /* (J*K^-1*mol^-1)Gas constant */ #define T 293 /* (K) Absolute temperature */ #define VOL 3e-9 /* changed from 3e-12microlitres to nlitres * microvillus volume so that units agree */ #define N_S0_DIM 1 /* initial condition */ #define N_S0_BRIGHT 2 #define A_N_S0_DIM 4 /* upper bound for dynamic increase (of negetive feedback) */ #define A_N_S0_BRIGHT 200 #define TAU_N_S0_DIM 3000 /* time constant for negative feedback */ #define TAU_N_S0_BRIGHT 1000 #define NA_CO 120 /* (mM) Extracellular sodium concentration */ #define NA_CI 8 /* (mM) Intracellular sodium concentration */ #define CA_CO 1.5 /* (mM) Extracellular calcium concentration */ #define G_TRP 8 /* conductance of a TRP channel */ #define TRP_REV 0 /* TRP channel reversal potential (mV) */ __device__ __constant__ long long int d_X[5]; __device__ __constant__ int change_ind1[14]; __device__ __constant__ int change1[14]; __device__ __constant__ int change_ind2[14]; __device__ __constant__ int change2[14]; /* cc = n/(NA*VOL) [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */ __device__ float num_to_mM(int n) { return n * 5.5353e-4; // n/1806.6; } /* n = cc*VOL*NA [6.0221413e+23 mol^-1 * 3*10e-21 m^3] */ __device__ float mM_to_num(float cc) { return rintf(cc * 1806.6); } /* Assumes Hill constant (=2) for positive calcium feedback */ __device__ float compute_fp(float Ca_cc) { float tmp = Ca_cc*K_P_INV; tmp *= tmp; return tmp/(1 + tmp); } /* Assumes Hill constant(=3) for negative calmodulin feedback */ __device__ float compute_fn(float Cstar_cc, float ns) { float tmp = Cstar_cc*K_N_INV; tmp *= tmp*tmp; return ns*tmp/(1 + tmp); } /* Vm [V] */ __device__ float compute_ca(int Tstar, float Cstar_cc, float Vm) { float I_in = Tstar*G_TRP*fmaxf(-Vm + 0.001*TRP_REV, 0); /* CaM = C_T - Cstar_cc */ float denom = (K_CA + (N*K_U*C_T) - (N*K_U)*Cstar_cc + 179.0952 * expf(-(F/(R*T))*Vm)); // (K_NACA*NA_CO^3/VOL*F) /* I_Ca ~= 0.4*I_in */ float numer = (0.4*I_in)/(2*VOL*F) + ((K_NACA*CA_CO*NA_CI*NA_CI*NA_CI)/(VOL*F)) + // in paper it's -K_NACA... due to different conventions N*K_R*Cstar_cc; return fmaxf(1.6e-4, numer/denom); } __global__ void transduction(curandStateXORWOW_t *state, float dt, %(type)s* d_Vm, %(type)s* g_ns, %(type)s* input, int* num_microvilli, int total_microvilli, int* count) { int tid = threadIdx.x; int gid = threadIdx.x + blockIdx.x * blockDim.x; int wid = tid %% 32; int wrp = tid >> 5; __shared__ int X[BLOCK_SIZE][7]; // number of molecules __shared__ float Ca[BLOCK_SIZE]; __shared__ float fn[BLOCK_SIZE]; float Vm, ns, lambda; float sumrate, dt_advanced; int reaction_ind; ushort2 tmp; // copy random generator state locally to avoid accessing global memory curandStateXORWOW_t localstate = state[gid]; int mid; // microvilli ID volatile __shared__ int mi[4]; // starting point of mid per ward // use atomicAdd to obtain the starting mid for the warp if(wid == 0) { mi[wrp] = atomicAdd(count, 32); } mid = mi[wrp] + wid; int ind; while(mid < total_microvilli) { // load photoreceptor index of the microvilli ind = ((ushort*)d_X[4])[mid]; // load variables that are needed for computing calcium concentration tmp = ((ushort2*)d_X[2])[mid]; X[tid][5] = tmp.x; X[tid][6] = tmp.y; Vm = d_Vm[ind]*1e-3; ns = g_ns[ind]; // update calcium concentration Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn(num_to_mM(X[tid][5]), ns); lambda = input[ind]/num_microvilli[ind]; // load the rest of variables tmp = ((ushort2*)d_X[1])[mid]; X[tid][4] = tmp.y; X[tid][3] = tmp.x; tmp = ((ushort2*)d_X[0])[mid]; X[tid][2] = tmp.y; X[tid][1] = tmp.x; X[tid][0] = ((ushort*)d_X[3])[mid]; // compute total rate of reaction sumrate = lambda; sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11 sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12 sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10 sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8 sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7 sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1 sumrate += KAPPA_DSTAR * X[tid][3]; // 6 sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4 sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); // 3 sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5 sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2 sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5 ; // 9 // choose the next reaction time dt_advanced = -logf(curand_uniform(&localstate))/(LA + sumrate); // If the reaction time is smaller than dt, // pick the reaction and update, // then compute the total rate and next reaction time again // until all dt_advanced is larger than dt. // Note that you don't have to compensate for // the last reaction time that exceeds dt. // The reason is that the exponential distribution is MEMORYLESS. while(dt_advanced <= dt) { reaction_ind = 0; sumrate = curand_uniform(&localstate) * sumrate; if(sumrate > 2e-5) { sumrate -= lambda; reaction_ind = (sumrate<=2e-5) * 13; if(!reaction_ind) { sumrate -= mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); reaction_ind = (sumrate<=2e-5) * 11; if(!reaction_ind) { sumrate -= mM_to_num(K_R) * num_to_mM(X[tid][5]); reaction_ind = (sumrate<=2e-5) * 12; if(!reaction_ind) { sumrate -= GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; reaction_ind = (sumrate<=2e-5) * 10; if(!reaction_ind) { sumrate -= GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; reaction_ind = (sumrate<=2e-5) * 8; if(!reaction_ind) { sumrate -= GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 7; if(!reaction_ind) { sumrate -= GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 1; if(!reaction_ind) { sumrate -= KAPPA_DSTAR * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 6; if(!reaction_ind) { sumrate -= GAMMA_GAP * X[tid][2] * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 4; if(!reaction_ind) { sumrate -= KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); reaction_ind = (sumrate<=2e-5) * 3; if(!reaction_ind) { sumrate -= GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); reaction_ind = (sumrate<=2e-5) * 5; if(!reaction_ind) { sumrate -= KAPPA_GSTAR * X[tid][1] * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 2; if(!reaction_ind) { sumrate -= (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; reaction_ind = (sumrate<=2e-5) * 9; } } } } } } } } } } } } } int ind; // only up to two state variables are needed to be updated // update the first one. ind = change_ind1[reaction_ind]; X[tid][ind] += change1[reaction_ind]; //if(reaction_ind == 9) //{ // X[tid][ind] = max(X[tid][ind], 0); //} ind = change_ind2[reaction_ind]; //update the second one if(ind != 0) { X[tid][ind] += change2[reaction_ind]; } // compute the advance time again Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns ); //fp[tid] = compute_fp( Ca[tid] ); sumrate = lambda; sumrate += mM_to_num(K_U) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); //11 sumrate += mM_to_num(K_R) * num_to_mM(X[tid][5]); //12 sumrate += GAMMA_TSTAR * (1 + H_TSTARN*fn[tid]) * X[tid][6]; // 10 sumrate += GAMMA_DSTAR * (1 + H_DSTAR*fn[tid]) * X[tid][4]; // 8 sumrate += GAMMA_PLCSTAR * (1 + H_PLCSTAR*fn[tid]) * X[tid][3]; // 7 sumrate += GAMMA_MSTAR * (1 + H_MSTAR*fn[tid]) * X[tid][0]; // 1 sumrate += KAPPA_DSTAR * X[tid][3]; // 6 sumrate += GAMMA_GAP * X[tid][2] * X[tid][3]; // 4 sumrate += KAPPA_PLCSTAR * X[tid][2] * (PLC_T-X[tid][3]); // 3 sumrate += GAMMA_GSTAR * (G_T - X[tid][2] - X[tid][1] - X[tid][3]); // 5 sumrate += KAPPA_GSTAR * X[tid][1] * X[tid][0]; // 2 sumrate += (KAPPA_TSTAR/(K_DSTAR*K_DSTAR)) * (1 + H_TSTARP*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(T_T-X[tid][6])*0.5; // 9 dt_advanced -= logf(curand_uniform(&localstate))/(LA + sumrate); } // end while ((ushort*)d_X[3])[mid] = X[tid][0]; ((ushort2*)d_X[0])[mid] = make_ushort2(X[tid][1], X[tid][2]); ((ushort2*)d_X[1])[mid] = make_ushort2(X[tid][3], X[tid][4]); ((ushort2*)d_X[2])[mid] = make_ushort2(X[tid][5], X[tid][6]); if(wid == 0) { mi[wrp] = atomicAdd(count, 32); } mid = mi[wrp] + wid; } // copy the updated random generator state back to global memory state[gid] = localstate; } } """ template_run = """ #include "curand_kernel.h" extern "C" { #include "stdio.h" #define BLOCK_SIZE %(block_size)d #define LA 0.5 __device__ __constant__ long long int d_X[5]; __device__ __constant__ int change_ind1[14]; __device__ __constant__ int change1[14]; __device__ __constant__ int change_ind2[14]; __device__ __constant__ int change2[14]; __device__ float num_to_mM(int n) { return n * 5.5353e-4; // n/1806.6; } __device__ float mM_to_num(float cc) { return rintf(cc * 1806.6); } __device__ float compute_fp( float ca_cc) { float tmp = ca_cc*3.3333333333; tmp *= tmp; return tmp/(1+tmp); } __device__ float compute_fn( float Cstar_cc, float ns) { float tmp = Cstar_cc*5.55555555; tmp *= tmp*tmp; return ns*tmp/(1+tmp); } __device__ float compute_ca(int Tstar, float cstar_cc, float Vm) { float I_in = Tstar*8*fmaxf(-Vm,0); float denom = (1060 - 120*cstar_cc + 179.0952 * expf(-39.60793*Vm)); float numer = I_in * 690.9537 + 0.0795979 + 22*cstar_cc; return fmaxf(1.6e-4, numer/denom); } __global__ void transduction(curandStateXORWOW_t *state, float dt, %(type)s* d_Vm, %(type)s* g_ns, %(type)s* input, int* num_microvilli, int total_microvilli, int* count) { int tid = threadIdx.x; int gid = threadIdx.x + blockIdx.x * blockDim.x; int wid = tid %% 32; int wrp = tid >> 5; __shared__ int X[BLOCK_SIZE][7]; // number of molecules __shared__ float Ca[BLOCK_SIZE]; __shared__ float fn[BLOCK_SIZE]; float Vm, ns, lambda; float sumrate, dt_advanced; int reaction_ind; ushort2 tmp; // copy random generator state locally to avoid accessing global memory curandStateXORWOW_t localstate = state[gid]; int mid; // microvilli ID volatile __shared__ int mi[4]; // starting point of mid per ward, blocksize must be 128 // use atomicAdd to obtain the starting mid for the warp if(wid == 0) { mi[wrp] = atomicAdd(count, 32); } mid = mi[wrp] + wid; int ind; while(mid < total_microvilli) { ind = ((ushort*)d_X[4])[mid]; // load variables that are needed for computing calcium concentration tmp = ((ushort2*)d_X[2])[mid]; X[tid][5] = tmp.x; X[tid][6] = tmp.y; Vm = d_Vm[ind]*1e-3; ns = g_ns[ind]; // update calcium concentration Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns); lambda = input[ind]/(double)num_microvilli[ind]; // load the rest of variables tmp = ((ushort2*)d_X[1])[mid]; X[tid][4] = tmp.y; X[tid][3] = tmp.x; tmp = ((ushort2*)d_X[0])[mid]; X[tid][2] = tmp.y; X[tid][1] = tmp.x; X[tid][0] = ((ushort*)d_X[3])[mid]; sumrate = lambda + 54198 * Ca[tid] * (0.5 - X[tid][5] * 5.5353e-4) + 5.5 * X[tid][5]; // 11, 12 sumrate += 25 * (1+10*fn[tid]) * X[tid][6]; // 10 sumrate += 4 * (1+37.8*fn[tid]) * X[tid][4] ; // 8 sumrate += (1444+1598.4*fn[tid]) * X[tid][3] ; // 7, 6 sumrate += (3.7*(1+40*fn[tid]) + 7.05 * X[tid][1]) * X[tid][0] ; // 1, 2 sumrate += (1560 - 12.6 * X[tid][3]) * X[tid][2]; // 3, 4 sumrate += 3.5 * (50 - X[tid][2] - X[tid][1] - X[tid][3]) ; // 5 sumrate += 0.015 * (1+11.5*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(25-X[tid][6])*0.5 ; // 9 dt_advanced = -logf(curand_uniform(&localstate))/(LA+sumrate); // If the reaction time is smaller than dt, // pick the reaction and update, // then compute the total rate and next reaction time again // until all dt_advanced is larger than dt. // Note that you don't have to compensate for // the last reaction time that exceeds dt. // The reason is that the exponential distribution is MEMORYLESS. while (dt_advanced <= dt) { reaction_ind = 0; sumrate = curand_uniform(&localstate) * sumrate; if (sumrate > 2e-5) { sumrate -= lambda; reaction_ind = (sumrate<=2e-5) * 13; if (!reaction_ind) { sumrate -= mM_to_num(30) * Ca[tid] * (0.5 - num_to_mM(X[tid][5]) ); reaction_ind = (sumrate<=2e-5) * 11; if (!reaction_ind) { sumrate -= mM_to_num(5.5) * num_to_mM(X[tid][5]); reaction_ind = (sumrate<=2e-5) * 12; if (!reaction_ind) { sumrate -= 25 * (1+10*fn[tid]) * X[tid][6]; reaction_ind = (sumrate<=2e-5) * 10; if (!reaction_ind) { sumrate -= 4 * (1+37.8*fn[tid]) * X[tid][4]; reaction_ind = (sumrate<=2e-5) * 8; if (!reaction_ind) { sumrate -= 144 * (1+11.1*fn[tid]) * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 7; if (!reaction_ind) { sumrate -= 3.7*(1+40*fn[tid]) * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 1; if (!reaction_ind) { sumrate -= 1300 * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 6; if (!reaction_ind) { sumrate -= 3.0 * X[tid][2] * X[tid][3]; reaction_ind = (sumrate<=2e-5) * 4; if (!reaction_ind) { sumrate -= 15.6 * X[tid][2] * (100-X[tid][3]); reaction_ind = (sumrate<=2e-5) * 3; if (!reaction_ind) { sumrate -= 3.5 * (50 - X[tid][2] - X[tid][1] - X[tid][3]); reaction_ind = (sumrate<=2e-5) * 5; if(!reaction_ind) { sumrate -= 7.05 * X[tid][1] * X[tid][0]; reaction_ind = (sumrate<=2e-5) * 2; if(!reaction_ind) { sumrate -= 0.015 * (1+11.5*compute_fp( Ca[tid] )) * X[tid][4]*(X[tid][4]-1)*(25-X[tid][6])*0.5; reaction_ind = (sumrate<=2e-5) * 9; } } } } } } } } } } } } } //int ind; // only up to two state variables are needed to be updated // update the first one. ind = change_ind1[reaction_ind]; X[tid][ind] += change1[reaction_ind]; //update the second one ind = change_ind2[reaction_ind]; if (ind != 0) X[tid][ind] += change2[reaction_ind]; // compute the advance time again Ca[tid] = compute_ca(X[tid][6], num_to_mM(X[tid][5]), Vm); fn[tid] = compute_fn( num_to_mM(X[tid][5]), ns ); sumrate = lambda + 54198*Ca[tid]*(0.5 - X[tid][5]*5.5353e-4) + 5.5*X[tid][5]; // 11, 12 sumrate += 25*(1 + 10*fn[tid])*X[tid][6]; // 10 sumrate += 4*(1 + 37.8*fn[tid])*X[tid][4]; // 8 sumrate += (1444 + 1598.4*fn[tid])*X[tid][3]; // 7, 6 sumrate += (3.7*(1 + 40*fn[tid]) + 7.05*X[tid][1])*X[tid][0]; // 1, 2 sumrate += (1560 - 12.6*X[tid][3])*X[tid][2]; // 3, 4 sumrate += 3.5*(50 - X[tid][2] - X[tid][1] - X[tid][3]); // 5 sumrate += 0.015*(1 + 11.5*compute_fp( Ca[tid] )) *X[tid][4]*(X[tid][4] - 1)*(25 - X[tid][6])*0.5; // 9 dt_advanced -= logf(curand_uniform(&localstate))/(LA+sumrate); } // end while ((ushort*)d_X[3])[mid] = X[tid][0]; ((ushort2*)d_X[0])[mid] = make_ushort2(X[tid][1], X[tid][2]); ((ushort2*)d_X[1])[mid] = make_ushort2(X[tid][3], X[tid][4]); ((ushort2*)d_X[2])[mid] = make_ushort2(X[tid][5], X[tid][6]); if(wid == 0) { mi[wrp] = atomicAdd(count, 32); } mid = mi[wrp] + wid; } // copy the updated random generator state back to global memory state[gid] = localstate; } } """ try: co = [compile_options[0] + ' --maxrregcount=54'] except IndexError: co = ['--maxrregcount=54'] scalartype = dtype.type if isinstance(dtype, np.dtype) else dtype mod = SourceModule(template_run % { "type": dtype_to_ctype(dtype), "block_size": block_size, "fletter": 'f' if scalartype == np.float32 else '' }, options=co, no_extern_c=True) func = mod.get_function('transduction') d_X_address, d_X_nbytes = mod.get_global("d_X") cuda.memcpy_htod(d_X_address, Xaddress) d_change_ind1_address, d_change_ind1_nbytes = mod.get_global("change_ind1") d_change_ind2_address, d_change_ind2_nbytes = mod.get_global("change_ind2") d_change1_address, d_change1_nbytes = mod.get_global("change1") d_change2_address, d_change2_nbytes = mod.get_global("change2") cuda.memcpy_htod(d_change_ind1_address, change_ind1) cuda.memcpy_htod(d_change_ind2_address, change_ind2) cuda.memcpy_htod(d_change1_address, change1) cuda.memcpy_htod(d_change2_address, change2) func.prepare('PfPPPPiP') func.set_cache_config(cuda.func_cache.PREFER_SHARED) return func
fin_g4[i] = -feq_g2[i] + feq_g4[i] + fin_g2[i]; fin_g7[i] = -feq_g5[i] + feq_g7[i] + fin_g5[i]; fin_g8[i] = -feq_g6[i] + feq_g8[i] + fin_g6[i]; } } """ % (omega)) # funRT = funRT % (uLB, omega, turb) #mod = SourceModule(funRT+funBC) funRT = mod.get_function("funRT") #funBC = mod.get_function("funBC") t_c, _ = mod.get_global("t_c") c_c, _ = mod.get_global("c_c") # cuda.memcpy_htod(t_c, t_h) cuda.memcpy_htod(c_c, c_h) # Time Loop for It in range(maxIt): #cuda.memcpy_htod(fin_g, fin) #cuda.memcpy_htod(fpost_g,fpost) # start.record() #start.synchronize() funRT(fin_g0,
#!/usr/bin/python import sys import pycuda.driver as cuda import pycuda.gpuarray import numpy as np from pycuda.compiler import SourceModule import pycuda.autoinit import time dtype = np.double src = ''.join(open('some-tests.cu').readlines()) #mod = SourceModule(src, arch='sm_13', no_extern_c = True) mod = SourceModule(src, no_extern_c=True) mod.get_global('str') fun = mod.get_function('test') #jm1 = np.matrix(range(6), dtype).reshape(2,3) m1 = np.zeros((2, 3), dtype) fun(cuda.InOut(m1), np.uint32(time.time()), block=(10, 1, 1), grid=(1, 1)) print m1
def initialize(function_number, threads = 1, grid = (1,1)): total_threads = threads * grid[0] * grid[1] # initialize only once for subsequent calls of the same benchmark function global initialized_parameters if initialized_parameters == (function_number, threads, grid): return initialized_parameters = (function_number, threads, grid) print '--- Allocating memmory, initializing benchmark function(s) ---' global module # compile the kernel source for given dimension and proper blocksize module = SourceModule(src, arch='sm_13', no_extern_c = True, options= \ ['--use_fast_math', '--ptxas-options=-v', \ '-D NREAL=%d'%nreal, '-D NFUNC=%d'%nfunc, '-D BLOCKSIZE=%d'%threads]) # PRNG initialization global rngStates, initRNG rngStates = cuda.mem_alloc(40 * total_threads) # sizeof(curandState) = 40 bytes cuda.memcpy_htod(module.get_global('rngStates')[0], np.intp(rngStates)) # set pointer in kernel code initRNG = module.get_function('initRNG') initRNG(np.uint32(time.time()), block=(threads,1,1), grid = grid) # rw data structures (separate for each thread) global g_trans_x, g_temp_x1, g_temp_x2, g_temp_x3, g_temp_x4, \ g_norm_x, g_basic_f, g_weight, g_norm_f # constant (same for each thread) global sigma, lambd, bias, o, g, l, o_gpu, o_rows, g_gpu, g_rows, l_gpu # constant scalars cuda.memcpy_htod(module.get_global('nreal')[0], np.int32(nreal)) cuda.memcpy_htod(module.get_global('nfunc')[0], np.int32(nfunc)) cuda.memcpy_htod(module.get_global('C')[0], np.double(2000)) cuda.memcpy_htod(module.get_global('global_bias')[0], np.double(0)) # rw arrays (memmory allocation only, no initialization) g_trans_x = np.zeros(nreal * total_threads).astype(dtype) g_temp_x1 = np.zeros(nreal * total_threads).astype(dtype) g_temp_x2 = np.zeros(nreal * total_threads).astype(dtype) g_temp_x3 = np.zeros(nreal * total_threads).astype(dtype) g_temp_x4 = np.zeros(nreal * total_threads).astype(dtype) g_norm_x = np.zeros(nreal * total_threads).astype(dtype) g_basic_f = np.zeros(nfunc * total_threads).astype(dtype) g_weight = np.zeros(nfunc * total_threads).astype(dtype) g_norm_f = np.zeros(nfunc * total_threads).astype(dtype) # constant arrays sigma = np.zeros(nfunc).astype(dtype) lambd = np.ones(nfunc).astype(dtype) bias = np.zeros(nfunc).astype(dtype) # 2d arrays o = np.zeros((nfunc,nreal)).astype(dtype) g = np.eye(nreal,nreal).astype(dtype) if function_number == 1: # wczytanie 'o' mozna bedzie pewnie zamienic na 1-linijkowy kod (o = np.matrix(''.join... itp fpt = ''.join(open('input_data/sphere_func_data.txt', 'r').readlines()).strip().split() for i in xrange(nfunc): for j in xrange(nreal): o[i,j] = dtype(fpt.pop(0)) bias[0] = -450.0 elif function_number == 2: fpt = ''.join(open('input_data/schwefel_102_data.txt', 'r').readlines()).strip().split() for i in xrange(nfunc): for j in xrange(nreal): o[i,j] = dtype(fpt.pop(0)) bias[0] = -450.0 elif function_number == 3: fpt = ''.join(open('input_data/elliptic_M_D%d.txt' % nreal, 'r').readlines()).strip().split() for i in xrange(nreal): for j in xrange(nreal): g[i,j] = dtype(fpt.pop(0)) fpt = ''.join(open('input_data/high_cond_elliptic_rot_data.txt', 'r').readlines()).strip().split() for i in xrange(nfunc): for j in xrange(nreal): o[i,j] = dtype(fpt.pop(0)) bias[0] = -450.0 elif function_number == 4: fpt = ''.join(open('input_data/schwefel_102_data.txt', 'r').readlines()).strip().split() for i in xrange(nfunc): for j in xrange(nreal): o[i,j] = dtype(fpt.pop(0)) bias[0] = -450.0 elif function_number == 5: global A, B fpt = open('input_data/schwefel_206_data.txt', 'r') o = np.matrix(fpt.readline().strip(), dtype)[0,:nreal] A = np.matrix(np.matrix(';'.join(fpt.readlines()))[:nreal,:nreal], dtype) B = np.zeros(nreal).astype(dtype) if nreal % 4 == 0: index = nreal / 4 else: index = nreal / 4 + 1 for i in xrange(index): o[0,i] = -100 index = (3 * nreal) / 4 - 1 for i in xrange(index, nreal): o[0,i] = 100 for i in xrange(nreal): for j in xrange(nreal): B[i] += A[i,j] * o[0,j] A = cuda.to_device(A) B = cuda.to_device(B) cuda.memcpy_htod(module.get_global('A')[0], np.intp(A)) cuda.memcpy_htod(module.get_global('B')[0], np.intp(B)) bias[0] = -310.0 elif function_number == 6: fpt = ''.join(open('input_data/rosenbrock_func_data.txt', 'r').readlines()).strip().split() for i in xrange(nfunc): for j in xrange(nreal): o[i,j] = dtype(fpt.pop(0)) - 1 bias[0] = 390 # 6 1-dimensional arrays arrays = ['g_trans_x', 'g_temp_x1', 'g_temp_x2', 'g_temp_x3', 'g_temp_x4', 'g_norm_x'] for var in arrays: globals()[var] = cuda.to_device(globals()[var]) # send to device and save the pointer cuda.memcpy_htod(module.get_global(var)[0], np.intp(globals()[var])) # set pointer in kernel code # 6 1-dimensional arrays arrays = ['g_basic_f', 'g_weight', 'sigma', 'g_norm_f'] for var in arrays: globals()[var] = cuda.to_device(globals()[var]) cuda.memcpy_htod(module.get_global(var)[0], np.intp(globals()[var])) #lambd = cuda.to_device(lambd) cuda.memcpy_htod(module.get_global('lambda')[0], lambd) cuda.memcpy_htod(module.get_global('bias')[0], bias) # 2 2-dimensional arrays (o, g) # MAYBE IT SHOULD BE CASTED to 1-d ARRAY for performance? print 'o:', o #o_gpu = cuda.to_device(np.zeros(nfunc).astype(np.intp)) #o_rows = [] for i in xrange(o.shape[0]): cuda.memcpy_htod(module.get_global('o')[0] + i * 50 * 8, o[i,:]) #row = cuda.to_device(o[i,:]) #cuda.memcpy_htod(int(o_gpu) + np.intp().nbytes * i, np.intp(row)) #o_rows.append(row) #cuda.memcpy_htod(int(o_gpu) + np.intp().nbytes * i, np.intp(row)) #cuda.memcpy_htod(module.get_global('o')[0], np.intp(o_gpu)) # watch out! 'g' is nreal x nreal #g = np.linspace(21,21 + nreal*nreal-1,nreal*nreal).reshape((nreal,nreal)) print 'g:',g #g_gpu = cuda.to_device(np.zeros(nreal).astype(np.intp)) #g_rows = [] for i in xrange(g.shape[0]): cuda.memcpy_htod(module.get_global('g')[0] + i * 50 * 8, g[i,:]) #row = cuda.to_device(g[i,:]) #g_rows.append(row) #cuda.memcpy_htod(int(g_gpu) + np.intp().nbytes * i, np.intp(row)) #cuda.memcpy_htod(module.get_global('g')[0], np.intp(g_gpu)) # 'l' (3d array) -- flatten to 1d l = np.zeros((nfunc,nreal,nreal)).astype(dtype) #l = np.linspace(3, 3 + nfunc*nreal*nreal-1,nfunc*nreal*nreal).reshape((nfunc,nreal,nreal)) for i in xrange(nfunc): l[i,:,:] = np.eye(nreal) print 'l:',l print 'l flatten:', l.flatten() #l_gpu = cuda.to_device(l.flatten().astype(dtype)) #print l_gpu cuda.memcpy_htod(module.get_global('l_flat')[0], l); print '--- initialization done ---'
kalman_gain_ar, &fltr_stt_cov_ar[offset_cov+wid], dim_state_ar, temp, temp1, tid); } logpdf[tid] = log_pdf(pred_obs_mean_ar[tid], pred_obs_cov_ar[tid],tid); } """) start = time.time() context.set_cache_config(cuda.func_cache.PREFER_L1) filter = mod.get_function("filter") init_stt_mean_ar = mod.get_global('init_stt_mean_ar')[0] init_stt_cov_ar = mod.get_global('init_stt_cov_ar')[0] tran_mat_ar = mod.get_global('tran_mat_ar')[0] observations_ar = mod.get_global('observations_ar')[0] cuda.memcpy_htod(init_stt_mean_ar, init_stt_mean_ar_const) cuda.memcpy_htod(init_stt_cov_ar, init_stt_cov_ar_const) cuda.memcpy_htod(tran_mat_ar, tran_mat_ar_const) cuda.memcpy_htod(observations_ar, observations_ar_const) start = time.time() filter(tran_cov_mat_ar_gpu, fltr_stt_mean_ar_gpu,
def __init__(self, model, dx, source_dt, sources, pml_width=None, nvcc_options=None): source = """ __constant__ float fd1_d[2]; __constant__ float fd2_d[3]; __global__ void step_d(const float *const wfc, float *wfp, const float *const phix, float *phixp, const float *const sigmax, const float *const model2_dt2, const float dt, const int nx) { int x = blockIdx.x * blockDim.x + threadIdx.x; int b = blockIdx.y; int bx = b * nx + x; float wfc_xx; float wfc_x; float phix_x; bool in_domain = (x > 1) && (x < nx - 2); if (in_domain) { wfc_xx = (fd2_d[0] * wfc[bx] + fd2_d[1] * (wfc[bx + 1] + wfc[bx - 1]) + fd2_d[2] * (wfc[bx + 2] + wfc[bx - 2])); wfc_x = (fd1_d[0] * (wfc[bx + 1] - wfc[bx - 1]) + fd1_d[1] * (wfc[bx + 2] - wfc[bx - 2])); phix_x = (fd1_d[0] * (phix[bx + 1] - phix[bx - 1]) + fd1_d[1] * (phix[bx + 2] - phix[bx - 2])); wfp[bx] = 1 / (1 + dt * sigmax[x] / 2) * (model2_dt2[x] * (wfc_xx + phix_x) + dt * sigmax[x] * wfp[bx] / 2 + (2 * wfc[bx] - wfp[bx])); phixp[bx] = phix[bx] - dt * sigmax[x] * (wfc_x + phix[bx]); } } __global__ void add_sources_d(float *wfp, const float *const model2_dt2, const float *const source_amplitude, const int *const sources_x, const int step, const int nx, const int nt, const int ns) { int s = threadIdx.x; int b = blockIdx.x; int x = sources_x[b * ns + s]; int bx = b * nx + x; wfp[bx] += source_amplitude[b * ns * nt + s * nt + step] * model2_dt2[x]; } """ if nvcc_options is None: nvcc_options = ['--restrict', '--use_fast_math', '-O3'] mod = SourceModule(source, options=nvcc_options) jitfunc1 = mod.get_function('step_d') jitfunc2 = mod.get_function('add_sources_d') fd1_d = mod.get_global('fd1_d')[0] fd2_d = mod.get_global('fd2_d')[0] pad_width = 2 super(Scalar1D, self).__init__(jitfunc1, jitfunc2, fd1_d, fd2_d, model, dx, source_dt, sources, pad_width, pml_width=pml_width)
def init_cuda(self): if self.cuda_inited: return cuda_kernel = """ #include <stdio.h> __device__ __constant__ unsigned int keySchedule[44]; __device__ __constant__ unsigned int Te0[256]; __device__ __constant__ unsigned int Te1[256]; __device__ __constant__ unsigned int Te2[256]; __device__ __constant__ unsigned int Te3[256]; __device__ __constant__ unsigned int length; __device__ __constant__ unsigned int threadMax; __global__ void printKeySchedule(){ for(int i = 0; i < 11; i++){ for(int j = 0; j < 4; j++){ printf("%08x", keySchedule[i * 4 + j]); } printf("\\n"); } } __device__ unsigned int bytestoword(unsigned char* b){ return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; } __device__ void wordtobytes(unsigned char* b, unsigned int w){ b[0] = (w >> 24); b[1] = (w >> 16) & 0xff; b[2] = (w >> 8) & 0xff; b[3] = w & 0xff; } __device__ void addRoundKey(unsigned int *s, unsigned int *k){ s[0] ^= k[0]; s[1] ^= k[1]; s[2] ^= k[2]; s[3] ^= k[3]; } __global__ void encrypt(unsigned char* in){ int p = blockIdx.x * 1024 + threadIdx.x; if(p * 16 >= length) return; unsigned char* block = in + p * 16; unsigned int s[4], t[4]; unsigned int *rk; s[0] = bytestoword(block); s[1] = bytestoword(block + 4); s[2] = bytestoword(block + 8); s[3] = bytestoword(block + 12); addRoundKey(s, keySchedule); for(int i = 1; i < 10; i++){ rk = keySchedule + i * 4; t[0] = Te0[s[0] >> 24] ^ Te1[(s[1] >> 16) & 0xff] ^ Te2[(s[2] >> 8 ) & 0xff] ^ Te3[(s[3]) & 0xff] ^ rk[0]; t[1] = Te0[s[1] >> 24] ^ Te1[(s[2] >> 16) & 0xff] ^ Te2[(s[3] >> 8 ) & 0xff] ^ Te3[(s[0]) & 0xff] ^ rk[1]; t[2] = Te0[s[2] >> 24] ^ Te1[(s[3] >> 16) & 0xff] ^ Te2[(s[0] >> 8 ) & 0xff] ^ Te3[(s[1]) & 0xff] ^ rk[2]; t[3] = Te0[s[3] >> 24] ^ Te1[(s[0] >> 16) & 0xff] ^ Te2[(s[1] >> 8 ) & 0xff] ^ Te3[(s[2]) & 0xff] ^ rk[3]; for(int j = 0; j < 4; j++) s[j] = t[j]; } rk = keySchedule + 4 * 10; s[0] = (Te2[(t[0] >> 24)] & 0xff000000) ^ (Te3[(t[1] >> 16) & 0xff] & 0x00ff0000) ^ (Te0[(t[2] >> 8) & 0xff] & 0x0000ff00) ^ (Te1[t[3] & 0xff] & 0x000000ff) ^ rk[0]; s[1] = (Te2[(t[1] >> 24)] & 0xff000000) ^ (Te3[(t[2] >> 16) & 0xff] & 0x00ff0000) ^ (Te0[(t[3] >> 8) & 0xff] & 0x0000ff00) ^ (Te1[t[0] & 0xff] & 0x000000ff) ^ rk[1]; s[2] = (Te2[(t[2] >> 24)] & 0xff000000) ^ (Te3[(t[3] >> 16) & 0xff] & 0x00ff0000) ^ (Te0[(t[0] >> 8) & 0xff] & 0x0000ff00) ^ (Te1[t[1] & 0xff] & 0x000000ff) ^ rk[2]; s[3] = (Te2[(t[3] >> 24)] & 0xff000000) ^ (Te3[(t[0] >> 16) & 0xff] & 0x00ff0000) ^ (Te0[(t[1] >> 8) & 0xff] & 0x0000ff00) ^ (Te1[t[2] & 0xff] & 0x000000ff) ^ rk[3]; wordtobytes(block, s[0]); wordtobytes(block + 4, s[1]); wordtobytes(block + 8, s[2]); wordtobytes(block + 12, s[3]); } """ mod = SourceModule(cuda_kernel) dKeySchedule = mod.get_global("keySchedule")[0] cuda.memcpy_htod(dKeySchedule, self.keySchedule) dThreadMax = mod.get_global("threadMax")[0] cuda.memcpy_htod(dThreadMax, numpy.array([self.threadMax], numpy.uint32)) self.dLength = mod.get_global('length')[0] dTe0 = mod.get_global("Te0")[0] cuda.memcpy_htod(dTe0, self.Te[0]) dTe1 = mod.get_global("Te1")[0] cuda.memcpy_htod(dTe1, self.Te[1]) dTe2 = mod.get_global("Te2")[0] cuda.memcpy_htod(dTe2, self.Te[2]) dTe3 = mod.get_global("Te3")[0] cuda.memcpy_htod(dTe3, self.Te[3]) self.mod = mod self.cuda_buf = cuda.mem_alloc(self.cuda_buf_size) self.batchMax = self.threadMax * self.blockMax * 16 self.cuda_inited = True