def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. mod = drv.SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") import numpy shape = (32,8) a = drv.pagelocked_zeros(shape, dtype=numpy.float32) b = drv.pagelocked_zeros(shape, dtype=numpy.float32) a[:] = numpy.random.randn(*shape) b[:] = numpy.random.randn(*shape) strm = drv.Stream() dest = drv.pagelocked_empty_like(a) multiply_them( drv.Out(dest), drv.In(a), drv.In(b), block=shape+(1,), stream=strm) strm.synchronize() self.assert_(la.norm(dest-a*b) == 0)
def __call__(self): spikes = self.collected_spikes[:self.nspikes] total_neurons = self.net.total_neurons if self.use_gpu: if not hasattr(self, 'spikes_gpu'): spikes_bool = drv.pagelocked_zeros(total_neurons, dtype=uint32) spikes_bool[spikes] = True spikes_gpu = pycuda.gpuarray.to_gpu(spikes_bool) spikes_gpu_ptr = int(int(spikes_gpu.gpudata)) self.spikes_bool = spikes_bool self.spikes_gpu = spikes_gpu self.spikes_gpu_ptr = spikes_gpu_ptr else: spikes_bool = self.spikes_bool spikes_bool[:] = False spikes_bool[spikes] = True spikes_gpu = self.spikes_gpu pycuda.driver.memcpy_htod(spikes_gpu.gpudata, spikes_bool) spikes_gpu_ptr = self.spikes_gpu_ptr acc_ptr = self.net.nemo_sim.propagate(self.synapse_type, spikes_gpu_ptr, total_neurons) if not hasattr(self, 'acc'): self.acc = acc = drv.pagelocked_zeros(total_neurons, dtype=float32) else: acc = self.acc pycuda.driver.memcpy_dtoh(acc, acc_ptr) else: spikes_ptr = spikes.ctypes.data spikes_len = len(spikes) acc_ptr = self.net.nemo_sim.propagate(self.synapse_type, spikes_ptr, spikes_len) acc = numpy_array_from_memory(acc_ptr, total_neurons, float32) for _, targetvar, targetslice in self.net.nemo_propagate_targets: targetvar += acc[targetslice] self.nspikes = 0
def _allocate_arrays(self): #allocate gpu arrays and numpy arrays. if self.max_features < 4: imp_size = 4 else: imp_size = self.max_features #allocate gpu arrays self.impurity_left = gpuarray.empty(imp_size, dtype = np.float32) self.impurity_right = gpuarray.empty(self.max_features, dtype = np.float32) self.min_split = gpuarray.empty(self.max_features, dtype = self.dtype_counts) self.label_total = gpuarray.empty(self.n_labels, self.dtype_indices) self.label_total_2d = gpuarray.zeros(self.max_features * (self.MAX_BLOCK_PER_FEATURE + 1) * self.n_labels, self.dtype_indices) self.impurity_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE * 2, np.float32) self.min_split_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE, self.dtype_counts) self.features_array_gpu = gpuarray.empty(self.n_features, np.uint16) self.mark_table = gpuarray.empty(self.stride, np.uint8) #allocate numpy arrays self.idx_array = np.zeros(2 * self.n_samples, dtype = np.uint32) self.si_idx_array = np.zeros(self.n_samples, dtype = np.uint8) self.nid_array = np.zeros(self.n_samples, dtype = np.uint32) self.values_idx_array = np.zeros(2 * self.n_samples, dtype = self.dtype_indices) self.values_si_idx_array = np.zeros(2 * self.n_samples, dtype = np.uint8) self.threshold_value_idx = np.zeros(2, self.dtype_indices) self.min_imp_info = driver.pagelocked_zeros(4, dtype = np.float32) self.features_array = driver.pagelocked_zeros(self.n_features, dtype = np.uint16) self.features_array[:] = np.arange(self.n_features, dtype = np.uint16)
def prepare(self, P): n = len(P.state_(self.eqs._diffeq_names_nonzero[0])) var_len = len(dict.fromkeys( self.eqs._diffeq_names)) + 1 # +1 needed to store t for index, varname in enumerate(self.eqs._diffeq_names): self.index_to_varname.append(varname) self.varname_to_index[varname] = index if varname in self.eqs._diffeq_names_nonzero: self.index_nonzero.append(index) self.S_in = cuda.pagelocked_zeros((n, var_len), numpy.float64) self.S_out = cuda.pagelocked_zeros((n, var_len), numpy.float64) nbytes = n * var_len * numpy.dtype(numpy.float64).itemsize self.S_in_gpu = cuda.mem_alloc(nbytes) self.S_out_gpu = cuda.mem_alloc(nbytes) Z = zeros((n, var_len)) self.A_gpu = cuda.mem_alloc(nbytes) cuda.memcpy_htod(self.A_gpu, Z) self.B_gpu = cuda.mem_alloc(nbytes) cuda.memcpy_htod(self.B_gpu, Z) self.S_temp_gpu = cuda.mem_alloc(nbytes) modFun = {} self.applyFun = {} for x in self.index_nonzero: s = self.eqs._function_C_String[self.index_to_varname[x]] args_fun = [] for i in xrange(var_len): args_fun.append("S_temp[" + str(i) + " + blockIdx.x * var_len]") modFun[x] = SourceModule(""" __device__ double f""" + s + """ __global__ void applyFun(double *A,double *B,double *S_in,double *S_temp, int x, int var_len) { int idx = x + blockIdx.x * var_len; S_temp[idx] = 0; B[idx] = f(""" + ",".join(args_fun) + """); S_temp[idx] = 1; A[idx] = f(""" + ",".join(args_fun) + """) - B[idx]; B[idx] /= A[idx]; S_temp[idx] = S_in[idx]; } """) self.applyFun[x] = modFun[x].get_function("applyFun") self.applyFun[x].prepare(['P', 'P', 'P', 'P', 'i', 'i'], block=(1, 1, 1)) self.calc_dict = {} self.already_calc = {}
def getRT(self, s_map, srt_gpu, srt_nsamp, srt_npairs, npairs, store_rt=False):
    """
    Computes the rank template
    s_map(Sample Map) - a list of 1s and 0s of length nsamples where 1 means use this sample to compute the rank template
    srt_gpu - cuda memory object containing srt(sample rank template) array on gpu
    srt_nsamp, srt_npairs - shape(buffered) of srt_gpu object
    npairs - true number of gene pairs being compared
    b_size - size of the blocks for computation
    store_rt - determines the RETURN value
        False(default) = returns a numpy array shape(npairs) of the rank template
        True = returns the rt_gpu object and the padded size of the rt_gpu object
            (rt_obj, npairs_padded)
    """
    b_size = self.b_size
    s_map_buff = self.s_map_buff = cuda.pagelocked_zeros((int(srt_nsamp),), np.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    s_map_buff[:len(s_map)] = np.array(s_map, dtype=np.int32)
    s_map_gpu = np.intp(s_map_buff.base.get_device_pointer())
    #cuda.memcpy_htod(s_map_gpu, s_map_buff)
    #sample blocks
    g_y_sz = self.getGrid( srt_nsamp)
    #pair blocks
    g_x_sz = self.getGrid( srt_npairs )
    block_rt_gpu = cuda.mem_alloc(int(g_y_sz*srt_npairs*(np.uint32(1).nbytes)) )
    grid = (g_x_sz, g_y_sz)
    func1,func2 = self.getrtKern(g_y_sz)
    shared_size = b_size*b_size*np.uint32(1).nbytes
    func1( srt_gpu, np.uint32(srt_nsamp), np.uint32(srt_npairs), s_map_gpu, block_rt_gpu, np.uint32(g_y_sz), block=(b_size,b_size,1), grid=grid, shared=shared_size)
    rt_buffer = self.rt_buffer = cuda.pagelocked_zeros((int(srt_npairs),), np.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    rt_gpu = np.intp(rt_buffer.base.get_device_pointer())
    func2( block_rt_gpu, rt_gpu, np.int32(s_map_buff.sum()), block=(b_size,1,1), grid=(g_x_sz,))
    if store_rt:
        #this is in case we want to run further stuff without
        #transferring back and forth
        return (rt_gpu, srt_npairs)
    else:
        #rt_buffer = np.zeros((srt_npairs ,), dtype=np.int32)
        #cuda.memcpy_dtoh(rt_buffer, rt_gpu)
        #rt_gpu.free()
        return rt_buffer[:npairs]
def prepare(self, P): n = len(P.state_(self.eqs._diffeq_names_nonzero[0])) var_len = len(dict.fromkeys(self.eqs._diffeq_names))+1 # +1 needed to store t for index,varname in enumerate(self.eqs._diffeq_names): self.index_to_varname.append(varname) self.varname_to_index[varname]= index if varname in self.eqs._diffeq_names_nonzero : self.index_nonzero.append(index) self.S_in = cuda.pagelocked_zeros((n,var_len),numpy.float64) self.S_out = cuda.pagelocked_zeros((n,var_len),numpy.float64) nbytes = n * var_len * numpy.dtype(numpy.float64).itemsize self.S_in_gpu = cuda.mem_alloc(nbytes) self.S_out_gpu = cuda.mem_alloc(nbytes) Z = zeros((n,var_len)) self.A_gpu = cuda.mem_alloc(nbytes) cuda.memcpy_htod(self.A_gpu, Z) self.B_gpu = cuda.mem_alloc(nbytes) cuda.memcpy_htod(self.B_gpu, Z) self.S_temp_gpu = cuda.mem_alloc(nbytes) modFun={} self.applyFun = {} for x in self.index_nonzero: s = self.eqs._function_C_String[self.index_to_varname[x]] args_fun =[] for i in xrange(var_len): args_fun.append("S_temp["+str(i)+" + blockIdx.x * var_len]") modFun[x] = SourceModule(""" __device__ double f"""+ s +""" __global__ void applyFun(double *A,double *B,double *S_in,double *S_temp, int x, int var_len) { int idx = x + blockIdx.x * var_len; S_temp[idx] = 0; B[idx] = f("""+",".join(args_fun)+"""); S_temp[idx] = 1; A[idx] = f("""+",".join(args_fun)+""") - B[idx]; B[idx] /= A[idx]; S_temp[idx] = S_in[idx]; } """) self.applyFun[x] = modFun[x].get_function("applyFun") self.applyFun[x].prepare(['P','P','P','P','i','i'],block=(1,1,1)) self.calc_dict = {} self.already_calc = {}
def _initialize_gpu_ds(self): """ Setup GPU arrays. """ self.synapse_state = garray.zeros( max(int(self.total_synapses) + len(self.input_neuron_list), 1), np.float64) if self.total_num_gpot_neurons>0: # self.V = garray.zeros( # int(self.total_num_gpot_neurons), # np.float64) self.V_host = drv.pagelocked_zeros( int(self.total_num_gpot_neurons), np.float64, mem_flags=drv.host_alloc_flags.DEVICEMAP) self.V = garray.GPUArray(self.V_host.shape, self.V_host.dtype, gpudata=self.V_host.base.get_device_pointer()) else: self.V = None if self.total_num_spike_neurons > 0: # self.spike_state = garray.zeros(int(self.total_num_spike_neurons), # np.int32) self.spike_state_host = drv.pagelocked_zeros(int(self.total_num_spike_neurons), np.int32, mem_flags=drv.host_alloc_flags.DEVICEMAP) self.spike_state = garray.GPUArray(self.spike_state_host.shape, self.spike_state_host.dtype, gpudata=self.spike_state_host.base.get_device_pointer()) self.block_extract = (256, 1, 1) if len(self.out_ports_ids_gpot) > 0: self.out_ports_ids_gpot_g = garray.to_gpu(self.out_ports_ids_gpot) self.sel_out_gpot_ids_g = garray.to_gpu(self.sel_out_gpot_ids) self._extract_gpot = self._extract_projection_gpot_func() if len(self.out_ports_ids_spk) > 0: self.out_ports_ids_spk_g = garray.to_gpu( (self.out_ports_ids_spk).astype(np.int32)) self.sel_out_spk_ids_g = garray.to_gpu(self.sel_out_spk_ids) self._extract_spike = self._extract_projection_spike_func() if self.ports_in_gpot_mem_ind is not None: inds = self.sel_in_gpot_ids self.inds_gpot = garray.to_gpu(inds) if self.ports_in_spk_mem_ind is not None: inds = self.sel_in_spk_ids self.inds_spike = garray.to_gpu(inds)
def allocate(n, dtype=numpy.float32): """ allocate context-portable device mapped host memory """ return drv.pagelocked_zeros(int(n), dtype, order='C', mem_flags=drv.host_alloc_flags.PORTABLE | drv.host_alloc_flags.DEVICEMAP)
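# Hedged usage sketch for the allocate() helper above (the GPUArray wrapping, the size and
# the element-wise update are illustrative assumptions, not part of the original code).
# Because the memory is device-mapped, the GPU can read and write it through its device
# pointer with no explicit memcpy; this assumes the current context supports mapped host
# memory (e.g. it was created with ctx_flags.MAP_HOST, as in other snippets here).
import numpy
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

n = 1024
host_buf = allocate(n, dtype=numpy.float32)        # page-locked, portable, device-mapped
host_buf[:] = numpy.random.rand(n).astype(numpy.float32)

# zero-copy view of the same memory from the device side
dev_view = gpuarray.GPUArray(host_buf.shape, host_buf.dtype,
                             gpudata=host_buf.base.get_device_pointer())
dev_view += 1.0                                    # GPU writes land directly in host_buf
drv.Context.synchronize()
assert numpy.allclose(host_buf, dev_view.get())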
def getRMS(self, rt_gpu, srt_gpu, padded_samples, padded_npairs, samp_id, npairs): """ Returns the rank matching score rt_gpu - rank template gpu object (padded_npairs,) srt_gpu - sample rank template gpu object (padded_npairs, padded_samples) samp_id - the sample id to compare srt to rt npairs - true number of pairs b_size - the block size for gpu computation. """ b_size = self.b_size gsize = int(padded_npairs / b_size) result = self.result = cuda.pagelocked_zeros( (gsize, ), dtype=np.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) result_gpu = np.intp( result.base.get_device_pointer()) #cuda.mem_alloc(result.nbytes) func = self.getrmsKern() func(rt_gpu, srt_gpu, np.int32(samp_id), np.int32(padded_samples), np.int32(npairs), result_gpu, block=(b_size, 1, 1), grid=(int(gsize), ), shared=b_size * np.uint32(1).nbytes) self.ctx.synchronize() return result.sum() / float(npairs)
def getBuff(self, frm, new_r, new_c, b_dtype):
    """
    Generates a numpy array sized (new_r,new_c) of dtype b_dtype that
    contains the np array frm, i.e. new[i,j] == frm[i,j] for in-bounds
    indices and new is zero wherever frm is out of bounds.
    """
    try:
        old_r,old_c = frm.shape
        buff = cuda.pagelocked_zeros((new_r,new_c),b_dtype, mem_flags=cuda.host_alloc_flags.DEVICEMAP)#np.zeros((new_r,new_c),dtype=b_dtype)
        buff[:old_r,:old_c] = frm
    except ValueError:
        #oned
        old_r = frm.shape[0]
        buff = cuda.pagelocked_zeros((new_r,), b_dtype,mem_flags=cuda.host_alloc_flags.DEVICEMAP)# np.zeros((new_r,),dtype=b_dtype)
        buff[:old_r] = frm
    return buff
def GenerateFractal(dimensions,position,zoom,iterations,block=(20,20,1), report=False, silent=False): chunkSize = numpy.array([dimensions[0]/block[0],dimensions[1]/block[1]],dtype=numpy.int32) zoom = numpy.float32(zoom) iterations = numpy.int32(iterations) blockDim = numpy.array([block[0],block[1]],dtype=numpy.int32) result = numpy.zeros(dimensions,dtype=numpy.int32) #Center position position = Vector(position[0]*zoom,position[1]*zoom) position = position - (Vector(result.shape[0],result.shape[1])/2) position = numpy.array([int(position.x),int(position.y)]).astype(numpy.float32) #For progress reporting: ppc = cuda.pagelocked_zeros((1,1),numpy.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) #pagelocked progress counter ppc[0,0] = 0 ppc_ptr = numpy.intp(ppc.base.get_device_pointer()) #pagelocked memory counter, device pointer to #End progress reporting #Copy parameters over to device chunkS = In(chunkSize) posit = In(position) blockD = In(blockDim) zoo = In(zoom) iters = In(iterations) res = In(result) if not silent: print("Calling CUDA function. Starting timer. progress starting at: "+str(ppc[0,0])) start_time = time.time() genChunk(chunkS, posit, blockD, zoo, iters, res, ppc_ptr, block=(1,1,1), grid=block) if report: total = (dimensions[0]*dimensions[1]) print "Reporting up to "+str(total)+", "+str(ppc[0,0]) while ppc[0,0] < ((dimensions[0]*dimensions[1])): pct = (ppc[0,0]*100)/(total) hashes = "#"*pct dashes = "-"*(100-pct) print "\r["+hashes+dashes+"] "+locale.format("%i",ppc[0,0],grouping=True)+"/"+locale.format("%i",total,grouping=True), time.sleep(0.00001) cuda.Context.synchronize() if not silent: print "Done. "+str(ppc[0,0]) #Copy result back from device cuda.memcpy_dtoh(result, res) if not silent: end_time = time.time() elapsed_time = end_time-start_time print("Done with call. Took "+str(elapsed_time)+" seconds. Here's the repr'd arary:\n") print(result) result[result.shape[0]/2,result.shape[1]/2]=iterations+1 #mark center of image return result
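# Minimal, self-contained sketch of the mapped progress-counter pattern used in
# GenerateFractal above (the kernel, sizes and names are illustrative assumptions):
# the counter lives in page-locked, device-mapped host memory, so the host can poll it
# while the kernel is still running, assuming the context supports mapped host memory.
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void count_up(int *progress, int niters)
{
    for (int i = 0; i < niters; i++)
        atomicAdd(progress, 1);   // increments become visible to the host via mapped memory
}
""")
count_up = mod.get_function("count_up")

counter = cuda.pagelocked_zeros((1,), numpy.int32,
                                mem_flags=cuda.host_alloc_flags.DEVICEMAP)
counter_ptr = numpy.intp(counter.base.get_device_pointer())

niters = 1 << 20
count_up(counter_ptr, numpy.int32(niters), block=(1, 1, 1), grid=(1, 1))
while counter[0] < niters:        # busy-poll the host-visible counter, as GenerateFractal does
    pass
cuda.Context.synchronize()
print(counter[0])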
def find_component_device(d_v, d_D, length): """ :param d_v: :param d_D: :param ecount: :return: """ import eulercuda.pyencode as enc logger = logging.getLogger('eulercuda.pycomponent.find_component_device') logger.info("started.") mem_size = length d_prevD = np.zeros(mem_size, dtype=np.uintc) d_Q = np.zeros_like(d_prevD) d_t1 = np.zeros_like(d_prevD) d_t2 = np.zeros_like(d_prevD) d_val1 = np.zeros_like(d_prevD) d_val2 = np.zeros_like(d_prevD) sp = np.uintc(0) s = np.uintc d_D, d_Q = component_step_init(d_v, d_D, d_Q, length) s, sp = 1, 1 sptemp = drv.pagelocked_zeros(4, dtype=np.intc, mem_flags=drv.host_alloc_flags.DEVICEMAP) d_sptemp = np.intp(sptemp.base.get_device_pointer()) while s == sp: d_D, d_prevD = d_prevD, d_D d_D = component_step1_shortcutting_p1(d_v, d_prevD, d_D, d_Q, length, s) d_Q = component_step1_shortcutting_p2(d_v, d_prevD, d_D, d_Q, length, s) d_t1, d_t2, d_val1, d_val2 = component_Step2_P1(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s) d_D, d_Q = component_Step2_P2(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s) d_t1, d_t2, d_val1, d_val2 = component_Step3_P1(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s) d_D = component_Step3_P2(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s) d_val1 = component_step4_P1(d_v, d_D, d_val1, length) d_D = component_step4_P2(d_v, d_D, d_val1, length) sptemp[0] = 0 d_sptemp = (d_Q, length, d_sptemp, s) sp += sptemp[0] s += 1 logger.info("Finished. Leaving.") return d_D
def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") import numpy shape = (32, 8) a = drv.pagelocked_zeros(shape, dtype=numpy.float32) b = drv.pagelocked_zeros(shape, dtype=numpy.float32) a[:] = numpy.random.randn(*shape) b[:] = numpy.random.randn(*shape) a_gpu = drv.mem_alloc(a.nbytes) b_gpu = drv.mem_alloc(b.nbytes) strm = drv.Stream() drv.memcpy_htod_async(a_gpu, a, strm) drv.memcpy_htod_async(b_gpu, b, strm) strm.synchronize() dest = drv.pagelocked_empty_like(a) multiply_them(drv.Out(dest), a_gpu, b_gpu, block=shape + (1, ), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() assert la.norm(dest - a * b) == 0
def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") shape = (32, 8) a = drv.pagelocked_zeros(shape, dtype=np.float32) b = drv.pagelocked_zeros(shape, dtype=np.float32) a[:] = np.random.randn(*shape) b[:] = np.random.randn(*shape) a_gpu = drv.mem_alloc(a.nbytes) b_gpu = drv.mem_alloc(b.nbytes) strm = drv.Stream() drv.memcpy_htod_async(a_gpu, a, strm) drv.memcpy_htod_async(b_gpu, b, strm) strm.synchronize() dest = drv.pagelocked_empty_like(a) multiply_them( drv.Out(dest), a_gpu, b_gpu, block=shape+(1,), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() assert la.norm(dest-a*b) == 0
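# Sketch of a fully asynchronous finish for the test above (an assumed variant, not part of
# the original test suite): instead of handing dest to the kernel through drv.Out, the
# result is staged in device memory and copied back with memcpy_dtoh_async on the same
# stream; the copy can overlap only because dest is page-locked.
dest = drv.pagelocked_empty_like(a)
dest_gpu = drv.mem_alloc(dest.nbytes)
multiply_them(dest_gpu, a_gpu, b_gpu, block=shape + (1,), stream=strm)
drv.memcpy_dtoh_async(dest, dest_gpu, strm)
strm.synchronize()
assert la.norm(dest - a * b) == 0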
def getBuff(self, frm, new_r, new_c, b_dtype):
    """
    Generates a numpy array sized (new_r,new_c) of dtype b_dtype that
    contains the np array frm, i.e. new[i,j] == frm[i,j] for in-bounds
    indices and new is zero wherever frm is out of bounds.
    """
    try:
        old_r, old_c = frm.shape
        buff = cuda.pagelocked_zeros(
            (new_r, new_c), b_dtype,
            mem_flags=cuda.host_alloc_flags.DEVICEMAP
        )  #np.zeros((new_r,new_c),dtype=b_dtype)
        buff[:old_r, :old_c] = frm
    except ValueError:
        #oned
        old_r = frm.shape[0]
        buff = cuda.pagelocked_zeros(
            (new_r, ), b_dtype,
            mem_flags=cuda.host_alloc_flags.DEVICEMAP
        )  # np.zeros((new_r,),dtype=b_dtype)
        buff[:old_r] = frm
    return buff
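# Illustrative use of getBuff above (the instance name `rt` and the shapes are hypothetical):
# pad a (3, 5) array into a (4, 8) page-locked, device-mapped buffer so that kernels which
# expect block-aligned (padded) dimensions can consume it directly.
import numpy as np
frm = np.arange(15, dtype=np.float32).reshape(3, 5)
buff = rt.getBuff(frm, 4, 8, np.float32)
assert buff.shape == (4, 8)
assert np.all(buff[:3, :5] == frm) and buff[3:, :].sum() == 0 and buff[:, 5:].sum() == 0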
def getRMS(self, rt_gpu, srt_gpu, padded_samples, padded_npairs, samp_id, npairs): """ Returns the rank matching score rt_gpu - rank template gpu object (padded_npairs,) srt_gpu - sample rank template gpu object (padded_npairs, padded_samples) samp_id - the sample id to compare srt to rt npairs - true number of pairs b_size - the block size for gpu computation. """ b_size = self.b_size gsize = int(padded_npairs/b_size) result = self.result= cuda.pagelocked_zeros((gsize,), dtype=np.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) result_gpu = np.intp(result.base.get_device_pointer()) #cuda.mem_alloc(result.nbytes) func = self.getrmsKern() func( rt_gpu, srt_gpu, np.int32(samp_id), np.int32(padded_samples), np.int32(npairs), result_gpu, block=(b_size,1,1), grid=(int(gsize),), shared=b_size*np.uint32(1).nbytes ) self.ctx.synchronize() return result.sum()/float(npairs)
def main(): params0 = np.ones((args.dim, ), dtype=np.float32) / (np.sqrt(args.dim)) loss, params, grad_cached, grad_assign_op = create_net('net1', params0) sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) lr = 0.01 import pycuda.driver as drv drv.init() print("%d device(s) found." % drv.Device.count()) current_dev = drv.Device(0) #device we are working on ctx = current_dev.make_context() #make a working context ctx.push() #let context make the lead params1 = drv.pagelocked_zeros((args.dim, ), dtype=np.float32) for i in range(10): loss0 = loss.eval() print(loss0) with timeit('step'): sess.run(grad_assign_op) with timeit('fetch'): grad0 = sess.run(grad_cached) # takes 75ms, 33ms is on allocation, 16ms on multiplication with timeit('add'): params0 -= grad0 * lr with timeit('feed'): # params.load(params0) sess.run(params.initializer, feed_dict={params.initial_value: params1}) for key, times in global_timeit_dict.items(): summarize_time(key, times) assert abs(loss0 - 0.69513524) < 0.01 print('test passed')
def __init__(self, N, model, threshold=None, reset=NoReset(), init=None, refractory=0 * msecond, level=0, clock=None, order=1, implicit=False, unit_checking=True, max_delay=0 * msecond, compile=False, freeze=False, method=None, precision='double', maxblocksize=512, forcesync=False, pagelocked_mem=True, gpu_to_cpu_vars=None, cpu_to_gpu_vars=None): eqs = model eqs.prepare() NeuronGroup.__init__(self, N, eqs, threshold=threshold, reset=reset, init=init, refractory=refractory, level=level, clock=clock, order=order, compile=compile, freeze=freeze, method=method) self.precision = precision if self.precision == 'double': self.precision_dtype = float64 self.precision_nbytes = 8 else: self.precision_dtype = float32 self.precision_nbytes = 4 self.clock = guess_clock(clock) if gpu_to_cpu_vars is None and cpu_to_gpu_vars is None: self._state_updater = GPUNonlinearStateUpdater(eqs, clock=self.clock, precision=precision, maxblocksize=maxblocksize, forcesync=forcesync) else: cpu_to_gpu_vars = [(self.get_var_index(var) * len(self) * self.precision_nbytes, self.get_var_index(var) * len(self), (self.get_var_index(var) + 1) * len(self)) for var in cpu_to_gpu_vars] gpu_to_cpu_vars = [(self.get_var_index(var) * len(self) * self.precision_nbytes, self.get_var_index(var) * len(self), (self.get_var_index(var) + 1) * len(self)) for var in gpu_to_cpu_vars] self._state_updater = UserControlledGPUNonlinearStateUpdater(eqs, clock=self.clock, precision=precision, maxblocksize=maxblocksize, gpu_to_cpu_vars=gpu_to_cpu_vars, cpu_to_gpu_vars=cpu_to_gpu_vars) if pagelocked_mem: self._S = GPUBufferedArray(drv.pagelocked_zeros(self._S.shape, dtype=self.precision_dtype)) else: self._S = GPUBufferedArray(array(self._S, dtype=self.precision_dtype)) self._gpuneurongroup_init_finished = True
for i in range(1, ngpu): result = np.concatenate((result, mpi.world.recv(i, 10))) for i in xrange(ny): print result[:nx, i], '\t', result[nx:2 * nx, i], '\t', result[2 * nx:, i] if __name__ == '__main__': cuda.init() ngpu = cuda.Device.count() ctx = cuda.Device(mpi.rank).make_context(cuda.ctx_flags.MAP_HOST) nx, ny = 6, 5 a_side_f = cuda.pagelocked_zeros(ny, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) a = np.zeros((nx, ny), 'f') if mpi.rank == 0: a[-2, :] = 1.5 elif mpi.rank == 1: a[1, :] = 2.0 a[-2, :] = 2.5 elif mpi.rank == 2: a[1, :] = 3.0 a_gpu = cuda.to_device(a) if mpi.rank == 0: print 'dev 0', '\t' * 5, 'dev 1', '\t' * 5, 'dev 2' print_arr_gpus(ngpu, nx, ny, a_gpu)
# prepare for plot import matplotlib.pyplot as plt plt.ion() fig = plt.figure(figsize=(15,7)) ''' fig = plt.figure(figsize=(10,13)) ax1 = fig.add_subplot(3,1,1) ax1.imshow(fdtd.cex[nx/2,:,:].T, origin='lower', interpolation='nearest') ax2 = fig.add_subplot(3,1,2) ax2.imshow(fdtd.cey[nx/2,:,:].T, origin='lower', interpolation='nearest') ax3 = fig.add_subplot(3,1,3) ax3.imshow(fdtd.cez[nx/2,:,:].T, origin='lower', interpolation='nearest') plt.show() ''' #ez_tmp = np.ones((800,ny,nz), 'f') ez_tmp = cuda.pagelocked_zeros((800,ny,nz), 'f') ax1 = fig.add_subplot(4,1,1) imag1 = ax1.imshow(ez_tmp[:,:,nz/2].T, cmap=plt.cm.jet, origin='lower', vmin=-2, vmax=2., interpolation='nearest') ax2 = fig.add_subplot(4,1,2) imag2 = ax2.imshow(ez_tmp[:,ny/2,:].T, cmap=plt.cm.jet, origin='lower', vmin=-2, vmax=2., interpolation='nearest') ax3 = fig.add_subplot(2,2,3) imag3 = ax3.imshow(ez_tmp[500,:,:].T, cmap=plt.cm.jet, origin='lower', vmin=-2, vmax=2., interpolation='nearest') ax4 = fig.add_subplot(2,2,4) imag4 = ax4.imshow(ez_tmp[700,:,:].T, cmap=plt.cm.jet, origin='lower', vmin=-2, vmax=2, interpolation='nearest') # ez^2 sum s1 = np.zeros(200) s2 = np.zeros(200) ez_tmp1 = cuda.pagelocked_zeros((ny,nz), 'f') ez_tmp2 = cuda.pagelocked_zeros((ny,nz), 'f')
def run_simulation(self): # setup data#{{{ data = { 'weights': self.weights, 'lengths': self.lengths, 'params': self.params.T } base_shape = self.n_work_items, for name, shape in dict( tavg0=(self.exposures, self.args.n_regions,), tavg1=(self.exposures, self.args.n_regions,), state=(self.buf_len, self.states * self.args.n_regions), ).items(): # memory error exception for compute device try: data[name] = np.zeros(shape + base_shape, 'f') except MemoryError as e: self.logger.error('%s.\n\t Please check the parameter dimensions %d x %d, they are to large ' 'for this compute device', e, self.args.n_sweep_arg0, self.args.n_sweep_arg1) exit(1) gpu_data = self.make_gpu_data(data)#{{{ # setup CUDA stuff#{{{ step_fn = self.make_kernel( source_file=self.args.filename, warp_size=32, # block_dim_x=self.args.n_sweep_arg0, # ext_options=preproccesor_defines, # caching=args.caching, args=self.args, lineinfo=self.args.lineinfo, nh=self.buf_len, )#}}} # setup simulation#{{{ tic = time.time() n_streams = 32 streams = [drv.Stream() for i in range(n_streams)] events = [drv.Event() for i in range(n_streams)] tavg_unpinned = [] try: tavg = drv.pagelocked_zeros((n_streams,) + data['tavg0'].shape, dtype=np.float32) except drv.MemoryError as e: self.logger.error( '%s.\n\t Please check the parameter dimensions, %d parameters are too large for this GPU', e, self.params.size) exit(1) # determine optimal grid recursively def dog(fgd): maxgd, mingd = max(fgd), min(fgd) maxpos = fgd.index(max(fgd)) if (maxgd - 1) * mingd * bx * by >= nwi: fgd[maxpos] = fgd[maxpos] - 1 dog(fgd) else: return fgd # n_sweep_arg0 scales griddim.x, n_sweep_arg1 scales griddim.y # form an optimal grid recursively bx, by = self.args.blockszx, self.args.blockszy nwi = self.n_work_items rootnwi = int(np.ceil(np.sqrt(nwi))) gridx = int(np.ceil(rootnwi / bx)) gridy = int(np.ceil(rootnwi / by)) final_block_dim = bx, by, 1 fgd = [gridx, gridy] dog(fgd) final_grid_dim = fgd[0], fgd[1] assert gridx * gridy * bx * by >= nwi self.logger.info('history shape %r', gpu_data['state'].shape) self.logger.info('gpu_data %s', gpu_data['tavg0'].shape) self.logger.info('on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, )) self.logger.info('final block dim %r', final_block_dim) self.logger.info('final grid dim %r', final_grid_dim) # run simulation#{{{ nstep = self.args.n_time self.gpu_mem_info() if self.args.verbose else None try: for i in tqdm.trange(nstep, file=sys.stdout): try: event = events[i % n_streams] stream = streams[i % n_streams] if i > 0: stream.wait_for_event(events[(i - 1) % n_streams]) step_fn(np.uintc(i * self.n_inner_steps), np.uintc(self.args.n_regions), np.uintc(self.buf_len), np.uintc(self.n_inner_steps), np.uintc(self.n_work_items), np.float32(self.dt), gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'], gpu_data['tavg%d' % (i%2,)], block=final_block_dim, grid=final_grid_dim) event.record(streams[i % n_streams]) except drv.LaunchError as e: self.logger.error('%s', e) exit(1) tavgk = 'tavg%d' % ((i + 1) % 2,) # async wrt. other streams & host, but not this stream. 
if i >= n_streams: stream.synchronize() tavg_unpinned.append(tavg[i % n_streams].copy()) drv.memcpy_dtoh_async(tavg[i % n_streams], gpu_data[tavgk].ptr, stream=stream) # recover uncopied data from pinned buffer if nstep > n_streams: for i in range(nstep % n_streams, n_streams): stream.synchronize() tavg_unpinned.append(tavg[i].copy()) for i in range(nstep % n_streams): stream.synchronize() tavg_unpinned.append(tavg[i].copy()) except drv.LogicError as e: self.logger.error('%s. Check the number of states of the model or ' 'GPU block shape settings blockdim.x/y %r, griddim %r.', e, final_block_dim, final_grid_dim) exit(1) except drv.RuntimeError as e: self.logger.error('%s', e) exit(1) # self.logger.info('kernel finish..') # release pinned memory tavg = np.array(tavg_unpinned) # also release gpu_data self.release_gpumem(gpu_data) self.logger.info('kernel finished') return tavg
exec_time = { 'update_h': np.zeros(tmax), 'mpi_recv_h': np.zeros(tmax), 'memcpy_htod_h': np.zeros(tmax), 'mpi_send_h': np.zeros(tmax), 'memcpy_dtoh_h': np.zeros(tmax), 'update_e': np.zeros(tmax), 'mpi_recv_e': np.zeros(tmax), 'memcpy_htod_e': np.zeros(tmax), 'mpi_send_e': np.zeros(tmax), 'memcpy_dtoh_e': np.zeros(tmax), 'src_e': np.zeros(tmax) } # main loop ey_tmp = cuda.pagelocked_zeros((ny, nz), 'f') ez_tmp = cuda.pagelocked_zeros_like(ey_tmp) hy_tmp = cuda.pagelocked_zeros_like(ey_tmp) hz_tmp = cuda.pagelocked_zeros_like(ey_tmp) for tn in xrange(1, tmax + 1): if rank == 1: start.record() for i, bpg in enumerate(bpg_list): update_h.prepared_call(bpg, np.int32(i * MBy), *eh_args) if rank == 0: cuda.memcpy_dtoh( hy_tmp, int(hy_gpu) + (nx - 1) * ny * nz * np.nbytes['float32']) cuda.memcpy_dtoh( hz_tmp, int(hz_gpu) + (nx - 1) * ny * nz * np.nbytes['float32'])
def main(): #Set up global timer tot_time = time.time() #Define constants BankSize = 8 # Do not go beyond 8! WarpSize = 32 #Do not change... DimGridX = 19 DimGridY = 19 BlockDimX = 256 BlockDimY = 256 SearchSpaceSize = 2**24 #BlockDimX * BlockDimY * 32 FitnessValDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize GenomeDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize AlignedByteLengthGenome = 4 #Create dictionary argument for rendering RenderArgs= {"safe_memory_mapping":1, "aligned_byte_length_genome":AlignedByteLengthGenome, "bit_length_edge_type":3, "curand_nr_threads_per_block":32, "nr_tile_types":2, "nr_edge_types":8, "warpsize":WarpSize, "fit_dim_thread_x":32*BankSize, "fit_dim_thread_y":1, "fit_dim_block_x":BlockDimX, "fit_dim_grid_x":19, "fit_dim_grid_y":19, "fit_nr_four_permutations":24, "fit_length_movelist":244, "fit_nr_redundancy_grid_depth":2, "fit_nr_redundancy_assemblies":10, "fit_tile_index_starting_tile":0, "glob_nr_tile_orientations":4, "banksize":BankSize, "curand_dim_block_x":BlockDimX } # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', './')) # Load source code from file Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() ) # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20") Kernel = KernelSourceModule.get_function("SearchSpaceKernel") CurandKernel = KernelSourceModule.get_function("CurandInitKernel") #Initialise InteractionMatrix InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32) def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Set up our InteractionMatrix InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") print InteractionMatrix #Set-up genomes #dest = numpy.arange(GenomeDim*4).astype(numpy.uint8) #for i in range(0, GenomeDim/4): #dest[i*8 + 0] = int('0b00100101',2) #CRASHES #dest[i*8 + 1] = int('0b00010000',2) #CRASHES #dest[i*8 + 0] = int('0b00101000',2) #dest[i*8 + 1] = int('0b00000000',2) #dest[i*8 + 2] = int('0b00000000',2) #dest[i*8 + 3] = int('0b00000000',2) #dest[i*8 + 4] = int('0b00000000',2) #dest[i*8 + 5] = int('0b00000000',2) #dest[i*8 + 6] = int('0b00000000',2) #dest[i*8 + 7] = int('0b00000000',2) # dest[i*4 + 0] = 40 # dest[i*4 + 1] = 0 # dest[i*4 + 2] = 0 # dest[i*4 + 3] = 0 dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes) #drv.memcpy_htod(dest_h, dest) #print "Genomes before: " #print dest #Set-up grids #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up fitness values #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32) #fitness_h = drv.mem_alloc(fitness.nbytes) #fitness_size = numpy.zeros(FitnessValDim).astype(numpy.uint32) fitness_size = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0) fitness_size_h = drv.mem_alloc(fitness_size.nbytes) #fitness_hash = numpy.zeros(FitnessValDim).astype(numpy.uint32) fitness_hash = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, 
"C", 0) fitness_hash_h = drv.mem_alloc(fitness_hash.nbytes) #drv.memcpy_htod(fitness_h, fitness) #print "Fitness values:" #print fitness #Set-up grids #grids = numpy.zeros((GenomeDim, DimGridX, DimGridY)).astype(numpy.uint8) #TEST grids = drv.pagelocked_zeros((GenomeDim, DimGridX, DimGridY), numpy.uint8, "C", 0) grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up curand #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8); #curand_h = drv.mem_alloc(curand.nbytes) curand_h = drv.mem_alloc(40*GenomeDim) #SearchSpace control #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" #Schedule kernel calls #MaxBlockDim = 100 OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*WarpSize) MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*WarpSize) BlockCounter = 0 print "Will do that many kernels a ", BlockDimX,"x", BlockDimY,"x ", WarpSize, ":", MaxBlockCycles #quit() #SET UP PROCESSING histo = {} #INITIALISATION CurandKernel(curand_h, block=(WarpSize,1,1), grid=(BlockDimX, BlockDimY)) print "Finished Curand kernel, starting main kernel..." #FIRST GENERATION proc_time = time.time() print "Starting first generation..." start = drv.Event() stop = drv.Event() start.record() Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64(0), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY)) stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "Copying..." drv.memcpy_dtoh(fitness_size, fitness_size_h) drv.memcpy_dtoh(fitness_hash, fitness_hash_h) drv.memcpy_dtoh(grids, grids_h) #INTERMEDIATE GENERATION for i in range(MaxBlockCycles-1): print "Starting generation: ", i+1 start = drv.Event() stop = drv.Event() start.record() Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64((i+1)*BlockDimX*BlockDimY*WarpSize), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY)) "Processing..." for j in range(grids.shape[0]): # if (fitness_hash[j]!=33) and (fitness_hash[j]!=44) and (fitness_hash[j]!=22): if fitness_hash[j] in histo: histo[fitness_hash[j]] = (histo[fitness_hash[j]][0], histo[fitness_hash[j]][1]+1) else: histo[fitness_hash[j]] = (grids[j], 1) stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "This corresponds to %f polyomino classification a second."%((BlockDimX*BlockDimY*WarpSize)/(start.time_till(stop)*1e-3)) print "Copying..." drv.memcpy_dtoh(fitness_size, fitness_size_h) drv.memcpy_dtoh(fitness_hash, fitness_hash_h) drv.memcpy_dtoh(grids, grids_h) #FINAL PROCESSING "Processing..." for i in range(grids.shape[0]): if fitness_hash[i] in histo: histo[fitness_hash[i]] = (histo[fitness_hash[i]][0], histo[fitness_hash[i]][1]+1) else: histo[fitness_hash[i]] = (grids[i], 1) print "Done!" #TIMING RESULTS print "Total time including set-up: ", (time.time() - tot_time) print "Total Processing time: ", (time.time() - proc_time) #OUTPUT print histo
result = cuda.from_device(a_gpu, (nx,ny), 'float32') print ngpu for i in range(1,ngpu): result = np.concatenate((result, mpi.world.recv(i,10))) for i in xrange(ny): print result[:nx,i],'\t',result[nx:2*nx,i],'\t',result[2*nx:,i] if __name__ == '__main__': cuda.init() ngpu = cuda.Device.count() ctx = cuda.Device(mpi.rank).make_context(cuda.ctx_flags.MAP_HOST) nx, ny = 6, 5 a_side_f = cuda.pagelocked_zeros(ny, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) a = np.zeros((nx,ny),'f') if mpi.rank == 0: a[-2,:] = 1.5 elif mpi.rank == 1: a[1,:] = 2.0 a[-2,:] = 2.5 elif mpi.rank == 2: a[1,:] = 3.0 a_gpu = cuda.to_device(a) if mpi.rank == 0: print 'dev 0','\t'*5,'dev 1','\t'*5,'dev 2' print_arr_gpus(ngpu, nx, ny, a_gpu) if mpi.rank == 0:
from wavemoth.cuda.profile import cuda_profile N = 4092 nside = 2048 npix = 12*2048**2 nrings = 4 * nside - 1 lmax = 2 * nside #x = np.asarray(np.random.rand(N), np.float32) #xf = np.fft.fft(x) #x_gpu = gpuarray.to_gpu(x) #xf_gpu = gpuarray.empty(N/2+1, np.complex64) map = drv.pagelocked_zeros(npix, np.float64) buf = drv.pagelocked_zeros((nrings, (lmax + 1) // 2 + 1), np.complex128) map_gpu = drv.mem_alloc(npix * 8) buf_gpu = drv.mem_alloc(nrings * ((lmax + 1) // 2 + 1) * 16) drv.memcpy_htod(map_gpu, map) from wavemoth.cuda import cufft print 'ctoring plan' plan = cufft.HealpixCuFFTPlan(2048, 8) repeats = 1 print 'plan ctored' with cuda_profile() as prof:
ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) cex_gpu = cuda.to_device( set_c(f,(None,-1,-1)) ) cey_gpu = cuda.to_device( set_c(f,(-1,None,-1)) ) cez_gpu = cuda.to_device( set_c(f,(-1,-1,None)) ) chx_gpu = cuda.to_device( set_c(f,(None,0,0)) ) chy_gpu = cuda.to_device( set_c(f,(0,None,0)) ) chz_gpu = cuda.to_device( set_c(f,(0,0,None)) ) # pinned memory allocation for zero-copy if myrank != 1: ex_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP) ey_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP) hx_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP) hy_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP) ''' ex_send_map = ex_send.get_device_pointer() ey_send_map = ey_send.get_device_pointer() hx_recv_map = hx_recv.get_device_pointer() hy_recv_map = hy_recv.get_device_pointer() ''' if myrank != 3: ex_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP) ey_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP) hx_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP) hy_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP) '''
model_file = 'erfnet_nobn.uff' with trt.Builder(TRT_LOGGER) as builder, builder.create_network( ) as network, trt.UffParser() as parser: # (num_channels, h, w) parser.register_input("inputs/X", (1, 256, 256)) parser.register_output("up23/BiasAdd") parser.parse(model_file, network) builder.max_batch_size = max_batch_size builder.max_workspace_size = 1 << 20 # This determines the amount of memory available to the builder when building an optimized engine and should generally be set as high as possible. with builder.build_cuda_engine(network) as engine: with engine.create_execution_context() as context: # h_input = cuda.pagelocked_zeros(trt.volume(engine.get_binding_shape(0)), dtype=np.float32) h_input = np.ones((1, 256, 256)) h_output = cuda.pagelocked_zeros(trt.volume( engine.get_binding_shape(1)), dtype=np.float32) # Allocate device memory for inputs and outputs. d_input = cuda.mem_alloc(h_input.nbytes) d_output = cuda.mem_alloc(h_output.nbytes) # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() # Transfer input data to the GPU. cuda.memcpy_htod_async(d_input, h_input, stream) # Run inference. context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(h_output, d_output, stream)
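# Sketch of the remaining steps for the TensorRT snippet above (an assumed continuation; the
# reshape target is taken from the engine's output binding): the async D2H copy only
# completes once the stream is synchronized, after which the page-locked h_output holds the
# network output and can be reshaped for host-side post-processing.
stream.synchronize()
output = h_output.reshape(tuple(engine.get_binding_shape(1)))
print(output.shape)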
# Setup the kernel mod = SourceModule(""" __global__ void add(float *a, float *b, float *c, float *c_map) { int idx = blockIdx.x*blockDim.x + threadIdx.x; float val; val = a[idx] + b[idx]; c[idx] = val; c_map[idx] = val; } """) add = mod.get_function("add") # Memory allocation nx = 1024 a = np.random.randn(nx).astype(np.float32) b = np.random.randn(nx).astype(np.float32) c = np.zeros_like(a) a_gpu = cuda.to_device(a) b_gpu = cuda.to_device(b) # Page-locked host memory allocation for zero-copy c_map = cuda.pagelocked_zeros(nx, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) add( a_gpu, b_gpu, cuda.Out(c), cuda.Out(c_map), block=(256,1,1), grid=(4,1) ) assert( np.linalg.norm( (a+b)-c ) == 0 ) assert( np.linalg.norm( (a+b)-c_map ) == 0 ) ctx.pop()
# measure kernel execution time from datetime import datetime t1 = datetime.now() flop = 3*(nx*ny*nz*30)*tgap flops = np.zeros(tmax/tgap+1) start, stop = cuda.Event(), cuda.Event() start.record() elif rank == 1: start, stop = cuda.Event(), cuda.Event() exec_time = {'update_h':np.zeros(tmax), 'mpi_recv_h':np.zeros(tmax), 'memcpy_htod_h':np.zeros(tmax), 'mpi_send_h':np.zeros(tmax), 'memcpy_dtoh_h':np.zeros(tmax), 'update_e':np.zeros(tmax), 'mpi_recv_e':np.zeros(tmax), 'memcpy_htod_e':np.zeros(tmax), 'mpi_send_e':np.zeros(tmax), 'memcpy_dtoh_e':np.zeros(tmax), 'src_e':np.zeros(tmax)} # main loop ey_tmp = cuda.pagelocked_zeros((ny,nz),'f') ez_tmp = cuda.pagelocked_zeros_like(ey_tmp) hy_tmp = cuda.pagelocked_zeros_like(ey_tmp) hz_tmp = cuda.pagelocked_zeros_like(ey_tmp) for tn in xrange(1, tmax+1): if rank == 1: start.record() for i, bpg in enumerate(bpg_list): update_h.prepared_call(bpg, np.int32(i*MBy), *eh_args) if rank == 0: cuda.memcpy_dtoh(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32']) cuda.memcpy_dtoh(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32']) comm.Send(hy_tmp, 1, 20) comm.Send(hz_tmp, 1, 21) elif rank == 1: stop.record() stop.synchronize()
print("BGRA2NV12=%s" % BGRA2NV12) w = roundup(512, 32) h = roundup(512, 32) log("w=%s, h=%s", w, h) cudaInputBuffer, inputPitch = driver.mem_alloc_pitch(w, h * 3 / 2, 16) log("CUDA Input Buffer=%s, pitch=%s", hex(int(cudaInputBuffer)), inputPitch) #allocate CUDA NV12 buffer (on device): cudaNV12Buffer, NV12Pitch = driver.mem_alloc_pitch(w, h * 3 / 2, 16) log("CUDA NV12 Buffer=%s, pitch=%s", hex(int(cudaNV12Buffer)), NV12Pitch) #host buffers: inputBuffer = driver.pagelocked_zeros(inputPitch * h * 3 / 2, dtype=numpy.byte) log("inputBuffer=%s", inputBuffer) outputBuffer = driver.pagelocked_zeros(inputPitch * h * 3 / 2, dtype=numpy.byte) log("outputBuffer=%s", outputBuffer) #populate host buffer with random data: buf = inputBuffer.data for y in range(h * 3 / 2): dst = y * inputPitch #debug("%s: %s:%s (size=%s) <- %s:%s (size=%s)", y, dst, dst+w, len(buffer), src, src+w, len(Yplane)) for x in range(w): buf[dst + x] = numpy.byte((x + y) % 256) #copy input buffer to CUDA buffer:
def _pre_run(self): assert (self.LPU_obj) assert all([var in self.memory_manager.variables for var in self.variables.keys()]),\ (list(self.memory_manager.variables), list(self.variables.keys())) self.add_inds_func = {} if self.input_file is not None: self.input_file_handle = h5py.File(self.input_file, 'w') self.input_file_handle.create_dataset('metadata', (), 'i') self.input_file_handle['metadata'].attrs['dt'] = self.dt self.input_file_handle['metadata'].attrs[ 'sample_interval'] = self.input_interval self.input_file_handle['metadata'].attrs[ 'DateCreated'] = datetime.now().isoformat() for k, v in self.metadata.items(): self.input_file_handle['metadata'].attrs[k] = v for var, d in self.variables.items(): v_dict = self.memory_manager.variables[var] uids = [] inds = [] for uid in d['uids']: cd = self.LPU_obj.conn_dict[uid] assert (var in cd) pre = cd[var]['pre'][0] inds.append(v_dict['uids'][pre]) if isinstance(var, str): self.dest_inds[var] = garray.to_gpu(np.array(inds, np.int32)) self.dtypes[var] = v_dict['buffer'].dtype self._d_input[var] = garray.zeros(len(d['uids']), self.dtypes[var]) if self.memory_mode == 'cpu': self.variables[var]['input'] = cuda.pagelocked_zeros( len(d['uids']), self.dtypes[var]) elif self.memory_mode == 'gpu': self.variables[var]['input'] = self._d_input[var] self.add_inds_func[var] = get_inds_kernel( self.dest_inds[var].dtype, v_dict['buffer'].dtype) elif isinstance(var, tuple): n = len(var) new_inds = list( itertools.chain.from_iterable( [[ind * n + i for i in range(n)] for ind in inds])) self.dest_inds[var] = garray.to_gpu( np.array(new_inds, np.int32)) self.dtypes[var] = v_dict['buffer'].dtype self._d_input[var] = garray.zeros( len(d['uids']) * n, self.dtypes[var]) if self.memory_mode == 'cpu': self.variables[var]['input'] = cuda.pagelocked_zeros( len(d['uids']) * n, self.dtypes[var]) elif self.memory_mode == 'gpu': self.variables[var]['input'] = self._d_input[var] self.add_inds_func[var] = get_inds_kernel( self.dest_inds[var].dtype, v_dict['buffer'].dtype) else: raise TypeError( 'variable name must either be a str or a tuple of str') if self.input_file is not None: self.input_file_handle.create_dataset('{}/uids'.format(var), data=np.array(d['uids'], dtype='S')) if isinstance(var, str): self.input_file_handle.create_dataset( '{}/data'.format(var), (0, len(d['uids'])), d['input'].dtype, maxshape=(None, len(d['uids']))) elif isinstance(var, tuple): n = len(var) for ind_var in var: self.input_file_handle.create_dataset( '{}/data/{}'.format(var, ind_var), (0, len(d['uids'])), d['input'].dtype, maxshape=(None, len(d['uids']))) else: raise TypeError( 'variable name must either be a str or a tuple of str') self.record_count = 0 self.pre_run()
def prepare(self): ''' From Hines 1984 paper, discrete formula is: A_plus*V(i+1)-(A_plus+A_minus)*V(i)+A_minus*V(i-1)=Cm/dt*(V(i,t+dt)-V(i,t))+gtot(i)*V(i)-I0(i) A_plus: i->i+1 A_minus: i->i-1 This gives the following tridiagonal system: A_plus*V(i+1)-(Cm/dt+gtot(i)+A_plus+A_minus)*V(i)+A_minus*V(i-1)=-Cm/dt*V(i,t)-I0(i) Boundaries, one simple possibility (sealed ends): -(Cm/dt+gtot(n)+A_minus)*V(n)+A_minus*V(n-1)=-Cm/dt*V(n,t)-I0(n) A_plus*V(1)-(Cm/dt+gtot(0)+A_plus)*V(0)=-Cm/dt*V(0,t)-I0(0) ''' mid_diameter = zeros(len(self.neuron)) # mid(i) : (i-1) <-> i mid_diameter[1:] = .5 * (self.neuron.diameter[:-1] + self.neuron.diameter[1:]) self.Aplus = zeros(len(self.neuron)) # A+ i -> j = Aplus(j) self.Aminus = zeros(len(self.neuron)) # A- i <- j = Aminus(j) self.Aplus[1] = mid_diameter[1]**2 / (4 * self.neuron.diameter[1] * self.neuron.length[1]**2 * self.neuron.Ri) self.Aplus[2:] = mid_diameter[2:]**2 / ( 4 * self.neuron.diameter[1:-1] * self.neuron.length[1:-1]**2 * self.neuron.Ri) self.Aminus[1:] = mid_diameter[1:]**2 / (4 * self.neuron.diameter[1:] * self.neuron.length[1:]**2 * self.neuron.Ri) self.neuron.index = zeros( len(self.neuron), int ) # gives the index of the branch containing the current compartment self.neuron.branches = [] # (i,j,bp,ante,ante_index,pointType) # i is the first compartment # bp is the last, a branch point # j is the end of the "inner branch". j = bp-1 # ante is the branch point to which i is connected self.neuron.BPcount = 0 # number of branch points (or branches). = len(self.neuron.branches) self.neuron.long_branches_count = 0 # number of branches with len(branch) > 1 #self.vL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) #self.vR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) #self.d = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) self.bL = cuda.pagelocked_zeros((len(self.neuron)), numpy.float64) self.bR = cuda.pagelocked_zeros((len(self.neuron)), numpy.float64) #self.bd = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) self.ab = zeros((3, len(self.neuron))) self.ab0 = zeros(len(self.neuron)) self.ab1 = cuda.pagelocked_zeros((len(self.neuron)), numpy.float64) self.ab2 = zeros(len(self.neuron)) self.ab1_base = zeros(len(self.neuron)) #self.res = cuda.pagelocked_zeros((3 * len(self.neuron)),numpy.float64) self.mTrunc = 0 # used to truncate vL and vR self.delta_list = zeros(len(self.neuron)) #used to find mTrunc # prepare_branch : fill neuron.index, neuron.branches, changes Aplus & Aminus self.prepare_branch(self.neuron.morphology, mid_diameter, 0) # linear system P V = B used to deal with the voltage at branch points and take boundary conditions into account. 
self.P = zeros((self.neuron.BPcount, self.neuron.BPcount)) self.B = zeros(self.neuron.BPcount) self.solution_bp = zeros(self.neuron.BPcount) self.gtot = zeros(len(self.neuron)) self.I0 = zeros(len(self.neuron)) self.i_list = [] self.j_list = [] self.i_list_bis = [] self.j_list_bis = [] new_tridiag = True self.bp_list = [] self.pointType_list = [] self.pointTypeAnte_list = [] self.index_ante_list0 = [] self.index_ante_list1 = [] self.index_ante_list2 = [] self.ante_list = [] self.post_list = [] self.ante_list_idx = [] self.post_list_idx = [] self.id = [] self.test_list = [] temp = zeros(self.neuron.BPcount) self.ind0 = [] self.ind_bctype_0 = [] for index, (i, j, bp, ante, index_ante, pointType) in enumerate(self.neuron.branches): self.i_list.append(i) self.j_list.append(j) if new_tridiag: self.i_list_bis.append(i) ii = i else: ii = self.i_list[-1] if j - ii + 1 > 2: self.j_list_bis.append(j) new_tridiag = True else: new_tridiag = False self.bp_list.append(bp) self.pointType_list.append(max(1, pointType)) self.pointTypeAnte_list.append(max(1, self.neuron.bc[ante])) temp[index] = index_ante self.id.append(index) if (j - i + 2 > 1): self.test_list.append(1) else: self.test_list.append(0) for x in xrange(j - i + 2): self.ante_list.append(ante) self.post_list.append(bp) self.ante_list_idx.append(index_ante) self.post_list_idx.append(index) if index_ante == 0 and index != 0: self.ind0.append(index) if pointType == 0: self.ind_bctype_0.append(bp) self.ante_arr = numpy.array(self.ante_list_idx) self.post_arr = numpy.array(self.post_list_idx) self.index_ante_list1, self.ind1 = numpy.unique(temp, return_index=True) self.ind1 = numpy.sort(self.ind1) self.index_ante_list1 = temp[self.ind1] self.index_ante_list1 = list(self.index_ante_list1) self.ind2 = [] for x in xrange(self.neuron.BPcount): self.ind2.append(x) self.ind2 = numpy.delete(self.ind2, self.ind1, None) self.ind2 = numpy.setdiff1d(self.ind2, self.ind0, assume_unique=True) self.index_ante_list2 = temp[self.ind2] self.index_ante_list2 = list(self.index_ante_list2) self.index_ante_list = list(temp) self.Aminus_bp = self.Aminus[self.bp_list] self.Aminus_bp[:] *= self.pointType_list[:] self.Aplus_i = self.Aplus[self.i_list] self.Aplus_i[:] *= self.pointTypeAnte_list[:] #--------------------------------------------------------GPU------------------------ n = len(self.neuron) mod = SourceModule(""" __global__ void updateAB_gtot(double *ab1, double *ab1_base, double *gtot) { int idx = threadIdx.x + blockIdx.x * blockDim.x; ab1[idx] = ab1_base[idx] - gtot[idx]; ab1[idx+gridDim.x] = ab1_base[idx] - gtot[idx]; ab1[idx+2*gridDim.x] = ab1_base[idx] - gtot[idx]; } __global__ void updateBD(double *bd, double *Cm, double dt,double *v, double *I0) { int idx = threadIdx.x + blockIdx.x * blockDim.x; bd[idx] = - Cm[idx] / dt * v[idx] - I0[idx]; } __global__ void finalizeFun(double *v, double *v_bp, int *ante,int *post, double *b, int m) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx_a = ante[idx]; int idx_p = post[idx]; v[idx] = b[idx + m] * v_bp[idx_a] + b[idx + 2*m] * v_bp[idx_p] + b[idx]; // vL, vR, d } __global__ void finalizeFunBis(double *v, double *v_bp, int *GPU_data_int) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int bp = GPU_data_int[4 * idx + 3]; v[bp] = v_bp[idx]; } __global__ void initPB(double *P, double *B, int BPcount) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int idy = threadIdx.x + blockIdx.y * blockDim.y; P[idx + idy * BPcount] = 0.0; B[idx] = 0.0; } //GPU_data_int is : ante_list, i_list, j_list, bp_list 
//GPU_data_double is : test_list, Aplus, Aminus __global__ void fillPB(double *P, double *B, double *b, double *Cm_l, double *gtot_l, int *GPU_data_int, double *GPU_data_double, double *I0_l, double *v_l, int BPcount, double dt, int m) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx_ante = GPU_data_int[4 * idx]; int i = GPU_data_int[4 * idx + 1]; int j = GPU_data_int[4 * idx + 2]; int bp = GPU_data_int[4 * idx + 3]; double test = GPU_data_double[3 * idx]; double Aplus = GPU_data_double[3 * idx + 1]; double Aminus = GPU_data_double[3 * idx + 2]; double Cm = Cm_l[bp]; double gtot = gtot_l[bp]; double I0 = I0_l[bp]; double v_bp = v_l[bp]; double vLright = b[j + m] * test; double vRleft = b[i + 2*m] * test; double vRright = b[j + 2*m] * test; double dright = b[j] * test; B[idx] += -Cm/dt * v_bp - I0 -Aminus * dright; P[idx * BPcount + idx] += -Cm/dt - gtot + Aminus * (vRright - 1.0); P[idx * BPcount + idx_ante] += Aminus * vLright; P[idx_ante * BPcount + idx] += Aplus * vRleft; } __global__ void fillPB_bis(double *P, double *B, double *b, int *GPU_data_int, double *GPU_data_double, int BPcount, int *indices, int m) { int idx_temp = threadIdx.x + blockIdx.x * blockDim.x; int idx = indices[idx_temp]; int idx_ante = GPU_data_int[4 * idx]; int i = GPU_data_int[4 * idx + 1]; int test = GPU_data_double[3 * idx]; double Aplus = GPU_data_double[3 * idx + 1]; double vLleft = b[i + m] * test; double dleft = b[i] * test; B[idx_ante] += - Aplus * dleft; P[idx_ante * (BPcount + 1)] += Aplus * (vLleft - 1.0); } __global__ void badFillPB_0(double *P, double *B, double *b, int *GPU_data_int, double *GPU_data_double, int *indices, double *Cm_l, double *gtot_l, double *I0_l, double *v_l, int len_indices, int m, double dt) { double Cm = Cm_l[0]; double gtot = gtot_l[0]; double I0 = I0_l[0]; double v_0 = v_l[0]; B[0] = - Cm/dt * v_0 - I0; P[0] = - Cm/dt - gtot; int idx; int i; int test; double Aplus; double vLleft; double dleft; for (int idx_temp=0;idx_temp<len_indices;idx_temp++) { idx = indices[idx_temp]; i = GPU_data_int[4 * idx + 1]; test = GPU_data_double[3 * idx]; Aplus = GPU_data_double[3 * idx + 1]; vLleft = b[i + m] * test; dleft = b[i] * test; P[0] += Aplus * (vLleft - 1.0); B[0] += - Aplus * dleft; } } __global__ void resetPB_type0(double *P, double *B, double *v, int *indices,int BPcount) { int idx = indices[threadIdx.x] + blockIdx.x * blockDim.x; int idy = threadIdx.x + blockIdx.y * blockDim.y; P[idx + idy * BPcount] = 0.0; P[idx + idx * BPcount] = 1.0; B[idx] = v[idx]; } """) self.updateAB_gtot = mod.get_function("updateAB_gtot") self.updateAB_gtot.prepare(["P", "P", 'P'], block=(1, 1, 1)) self.updateBD = mod.get_function("updateBD") self.updateBD.prepare(["P", "P", 'd', 'P', 'P'], block=(1, 1, 1)) self.finalizeFun = mod.get_function("finalizeFun") self.finalizeFun.prepare(['P', 'P', 'P', 'P', 'P', 'i'], block=(1, 1, 1)) self.finalizeFunBis = mod.get_function("finalizeFunBis") self.finalizeFunBis.prepare(['P', 'P', 'P'], block=(1, 1, 1)) self.initPB = mod.get_function("initPB") self.initPB.prepare(['P', "P", 'i'], block=(1, 1, 1)) self.fillPB = mod.get_function("fillPB") self.fillPB.prepare( ["P", "P", 'P', "P", 'P', 'P', 'P', 'P', 'P', 'i', 'd', 'i'], block=(1, 1, 1)) self.fillPB_bis = mod.get_function("fillPB_bis") self.fillPB_bis.prepare(["P", "P", 'P', "P", 'P', 'i', 'P', 'i'], block=(1, 1, 1)) self.badFillPB_0 = mod.get_function("badFillPB_0") self.badFillPB_0.prepare( ["P", "P", 'P', "P", 'P', 'P', 'P', 'P', 'P', "P", 'i', 'i', 'd'], block=(1, 1, 1)) self.resetPB_type0 = 
mod.get_function("resetPB_type0") self.resetPB_type0.prepare(['P', "P", 'P', 'P', 'i'], block=(1, 1, 1)) dtype = numpy.dtype(numpy.float64) int_type = numpy.dtype(numpy.int32) self.P_gpu = cuda.mem_alloc(self.P.size * dtype.itemsize) self.B_gpu = cuda.mem_alloc(self.B.size * dtype.itemsize) GPU_data_int = zeros((self.neuron.BPcount, 4)) GPU_data_double = zeros((self.neuron.BPcount, 3)) GPU_data_int[:, 0] = self.index_ante_list[:] GPU_data_int[:, 1] = self.i_list[:] GPU_data_int[:, 2] = self.j_list[:] GPU_data_int[:, 3] = self.bp_list[:] GPU_data_double[:, 0] = self.test_list[:] GPU_data_double[:, 1] = self.Aplus_i[:] GPU_data_double[:, 2] = self.Aminus_bp[:] self.GPU_data_int = cuda.mem_alloc(4 * self.neuron.BPcount * int_type.itemsize) self.GPU_data_double = cuda.mem_alloc(3 * self.neuron.BPcount * dtype.itemsize) cuda.memcpy_htod(self.GPU_data_int, GPU_data_int.astype(numpy.int32)) cuda.memcpy_htod(self.GPU_data_double, GPU_data_double.astype(numpy.float64)) self.ind0_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize) cuda.memcpy_htod(self.ind0_gpu, numpy.array(self.ind0, numpy.int32)) self.ind1_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize) cuda.memcpy_htod(self.ind1_gpu, numpy.array(self.ind1, numpy.int32)) self.ind2_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize) cuda.memcpy_htod(self.ind2_gpu, numpy.array(self.ind2, numpy.int32)) self.ind_bctype_0_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize) cuda.memcpy_htod(self.ind_bctype_0_gpu, numpy.array(self.ind_bctype_0, numpy.int32)) self.ab1_base_gpu = cuda.mem_alloc(n * dtype.itemsize) self.ab1_gpu = cuda.mem_alloc(3 * n * dtype.itemsize) self.ab1_gpu_ptr = int(self.ab1_gpu) self.Cm_gpu = cuda.mem_alloc(n * dtype.itemsize) cuda.memcpy_htod(self.Cm_gpu, self.neuron.Cm.astype(numpy.float64)) self.gtot_gpu = cuda.mem_alloc(n * dtype.itemsize) self.I0_gpu = cuda.mem_alloc(n * dtype.itemsize) self.v_gpu = cuda.mem_alloc(n * dtype.itemsize) cuda.memcpy_htod(self.v_gpu, self.neuron.v) ab0 = zeros(3 * n).astype(numpy.float64) ab0[:n] = self.ab0[:] ab0[n:2 * n] = self.ab0[:] ab0[2 * n:3 * n] = self.ab0[:] ab2 = zeros(3 * n).astype(numpy.float64) ab2[:n] = self.ab2[:] ab2[n:2 * n] = self.ab2[:] ab2[2 * n:3 * n] = self.ab2[:] dtype = numpy.dtype(numpy.float64) self.ab0_gpu = cuda.mem_alloc(ab0.size * dtype.itemsize) self.ab0_gpu_ptr = int(self.ab0_gpu) self.ab2_gpu = cuda.mem_alloc(ab2.size * dtype.itemsize) self.ab2_gpu_ptr = int(self.ab2_gpu) self.bL_gpu = cuda.mem_alloc(self.bL.size * dtype.itemsize) self.bL_gpu_ptr = int(self.bL_gpu) self.bR_gpu = cuda.mem_alloc(self.bR.size * dtype.itemsize) self.bR_gpu_ptr = int(self.bR_gpu) self.b_gpu = cuda.mem_alloc(3 * self.bR.size * dtype.itemsize) self.b_gpu_ptr = int(self.b_gpu) # bd + bL + bR -> vd + vL + vR cuda.memcpy_htod(self.ab0_gpu, ab0) cuda.memcpy_htod(self.ab2_gpu, ab2) cuda.memcpy_htod(self.bL_gpu, self.bL) cuda.memcpy_htod(self.bR_gpu, self.bR) self.ante_gpu = cuda.mem_alloc(self.ante_arr.size * self.ante_arr.dtype.itemsize) self.post_gpu = cuda.mem_alloc(self.ante_arr.size * self.ante_arr.dtype.itemsize) self.v_old_gpu = cuda.mem_alloc(self.neuron.v.size * dtype.itemsize) cuda.memcpy_htod(self.ante_gpu, self.ante_arr) cuda.memcpy_htod(self.post_gpu, self.post_arr) #---------------------------------------------------------------------------------- self.v_branchpoints = zeros(self.neuron.BPcount) self.v_bp_gpu = cuda.mem_alloc(self.v_branchpoints.size * dtype.itemsize) self.timeDevice = [0] self.timeDeviceU = [0] self.timeDeviceT = [0] 
self.timeHost = [0] self.timeUpdater = [0] self.timeSolveHost = [0] self.timeFillHost = [0] self.timeFin = [0]
def alloc_exchange_boundaries(s):
    s.ey_tmp = cuda.pagelocked_zeros((s.ny, s.nz), 'f')
    s.ez_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
    s.hy_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
    s.hz_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
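This helper only allocates the pinned temporaries; a minimal, self-contained sketch of the pattern they enable (an asynchronous device-to-host copy on a stream) follows. Every name here, such as ey_gpu and the sizes, is illustrative and not taken from the original solver.

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

ny, nz = 64, 64
ey_gpu = gpuarray.to_gpu(np.random.rand(ny, nz).astype(np.float32))  # stand-in for a field slice on the device
ey_tmp = cuda.pagelocked_zeros((ny, nz), np.float32)                 # page-locked, so the async copy can overlap

stream = cuda.Stream()
cuda.memcpy_dtoh_async(ey_tmp, ey_gpu.gpudata, stream)  # returns immediately
# ... other kernels or MPI progress could run here ...
stream.synchronize()                                    # ensure ey_tmp is valid before it is used
assert np.allclose(ey_tmp, ey_gpu.get())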
def run_simulation(self, weights, lengths, params_matrix, speeds, logger, args, n_nodes, n_work_items, n_params, nstep, n_inner_steps, buf_len, states, dt, min_speed): # setup data#{{{ data = { 'weights': weights, 'lengths': lengths, 'params': params_matrix.T } base_shape = n_work_items, for name, shape in dict( tavg=(n_nodes,), state=(buf_len, states * n_nodes), ).items(): data[name] = np.zeros(shape + base_shape, 'f') gpu_data = self.make_gpu_data(data)#{{{ # logger.info('history shape %r', data['state'].shape) logger.info('on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, ))#}}} # setup CUDA stuff#{{{ step_fn = self.make_kernel( source_file=args.filename, warp_size=32, block_dim_x=args.n_coupling, # ext_options=preproccesor_defines, # caching=args.caching, args=args, lineinfo=args.lineinfo, nh=buf_len, # model=args.model, )#}}} # setup simulation#{{{ tic = time.time() # logger.info('nstep %i', nstep) streams = [drv.Stream() for i in range(32)] events = [drv.Event() for i in range(32)] tavg_unpinned = [] tavg = drv.pagelocked_zeros(data['tavg'].shape, dtype=np.float32) # logger.info('data[tavg].shape %s', data['tavg'].shape) #}}} gridx = args.n_coupling // args.blockszx gridy = args.n_speed // args.blockszy final_block_dim = args.blockszx, args.blockszy, 1 final_grid_dim = gridx, gridy # logger.info('final block dim %r', final_block_dim) logger.info('final grid dim %r', final_grid_dim) # assert n_coupling_per_block * n_coupling_blocks == args.n_coupling #}}} # logger.info('gpu_data[lengts] %s', gpu_data['lengths'].shape) # logger.info('nnodes %r', n_nodes) # logger.info('gpu_data[lengths] %r', gpu_data['lengths']) # run simulation#{{{ # logger.info('submitting work') import tqdm for i in tqdm.trange(nstep): # event = events[i % 32] # stream = streams[i % 32] # stream.wait_for_event(events[(i - 1) % 32]) step_fn(np.uintc(i * n_inner_steps), np.uintc(n_nodes), np.uintc(buf_len), np.uintc(n_inner_steps), np.uintc(n_params), np.float32(dt), np.float32(min_speed), gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'], gpu_data['tavg'], block=final_block_dim, grid=final_grid_dim) # event.record(streams[i % 32]) tavg_unpinned.append(tavg.copy()) drv.memcpy_dtoh( tavg, gpu_data['tavg'].ptr) # logger.info('kernel finish..') # release pinned memory tavg = np.array(tavg_unpinned) return tavg
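The loop above creates streams and events but, in this excerpt, drains the pinned tavg buffer with a blocking memcpy_dtoh. A hedged sketch of how such a pinned buffer could be read back asynchronously instead; the array names are stand-ins, not the original driver's objects.

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

gpu_tavg = gpuarray.zeros((16, 8), np.float32)           # stand-in for gpu_data['tavg']
tavg = drv.pagelocked_zeros(gpu_tavg.shape, np.float32)  # pinned host mirror
stream = drv.Stream()
done = drv.Event()

# a kernel writing gpu_tavg would be launched on `stream` here
drv.memcpy_dtoh_async(tavg, gpu_tavg.gpudata, stream)    # can overlap because tavg is page-locked
done.record(stream)
done.synchronize()            # or poll done.query() while doing host-side work
snapshot = tavg.copy()        # copy out before the pinned buffer is reused in the next step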
def __init__(self, source, b, a, samplerate=None, precision='double', forcesync=True, pagelocked_mem=True, unroll_filterorder=None): # Automatically duplicate mono input to fit the desired output shape if b.shape[0] != source.nchannels: if source.nchannels != 1: raise ValueError( 'Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.' ) source = RestructureFilterbank(source, b.shape[0]) Filterbank.__init__(self, source) if pycuda.context is None: set_gpu_device(0) self.precision = precision if self.precision == 'double': self.precision_dtype = float64 else: self.precision_dtype = float32 self.forcesync = forcesync self.pagelocked_mem = pagelocked_mem n, m, p = b.shape self.filt_b = b self.filt_a = a filt_b_gpu = array(b, dtype=self.precision_dtype) filt_a_gpu = array(a, dtype=self.precision_dtype) filt_state = zeros((n, m - 1, p), dtype=self.precision_dtype) if pagelocked_mem: filt_y = drv.pagelocked_zeros((n, ), dtype=self.precision_dtype) self.pre_x = drv.pagelocked_zeros((n, ), dtype=self.precision_dtype) else: filt_y = zeros(n, dtype=self.precision_dtype) self.pre_x = zeros(n, dtype=self.precision_dtype) self.filt_b_gpu = gpuarray.to_gpu(filt_b_gpu.T.flatten( )) # transform to Fortran order for better GPU mem self.filt_a_gpu = gpuarray.to_gpu( filt_a_gpu.T.flatten()) # access speeds self.filt_state = gpuarray.to_gpu(filt_state.T.flatten()) self.unroll_filterorder = unroll_filterorder if unroll_filterorder is None: if m <= 32: unroll_filterorder = True else: unroll_filterorder = False # TODO: improve code, check memory access patterns, maybe use local memory code = ''' #define x(s,i) _x[(s)*n+(i)] #define y(s,i) _y[(s)*n+(i)] #define a(i,j,k) _a[(i)+(j)*n+(k)*n*m] #define b(i,j,k) _b[(i)+(j)*n+(k)*n*m] #define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)] __global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples) { int j = blockIdx.x * blockDim.x + threadIdx.x; if(j>=n) return; for(int s=0; s<numsamples; s++) { ''' for k in range(p): loopcode = ''' y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k); ''' if unroll_filterorder: for i in range(m - 2): loopcode += re.sub( '\\bi\\b', str(i), ''' zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j); ''') else: loopcode += ''' for(int i=0;i<m-2;i++) zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j); ''' loopcode += ''' zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j); ''' if k < p - 1: loopcode += ''' x(s,j) = y(s,j); ''' loopcode = re.sub('\\bk\\b', str(k), loopcode) code += loopcode code += ''' } } ''' code = code.replace('SCALAR', self.precision) code = re.sub("\\bp\\b", str(p), code) #replace the variable by their values code = re.sub("\\bm\\b", str(m), code) code = re.sub("\\bn\\b", str(n), code) #print code self.gpu_mod = pycuda.compiler.SourceModule(code) self.gpu_filt_func = self.gpu_mod.get_function("filt") blocksize = 256 if n < blocksize: blocksize = n if n % blocksize == 0: gridsize = n / blocksize else: gridsize = n / blocksize + 1 self.block = (blocksize, 1, 1) self.grid = (gridsize, 1) self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32), self.block) self._has_run_once = False
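The prepare() call above follows the older PyCUDA convention in which prepare() also received the block size. In current PyCUDA releases, prepare() takes only the argument types and the block is passed to prepared_call(); a small, self-contained sketch of that newer pattern with a toy kernel (not the filterbank kernel) is below.

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(float *x, float a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
""")
scale = mod.get_function("scale")
scale.prepare("Pfi")                       # pointer, float, int

n = 1000
x = gpuarray.to_gpu(np.ones(n, np.float32))
block = (256, 1, 1)
grid = ((n + block[0] - 1) // block[0], 1)
# plain Python ints/floats are fine here because prepared_call packs them per the "Pfi" format
scale.prepared_call(grid, block, int(x.gpudata), 2.0, n)
assert np.allclose(x.get(), 2.0)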
def prepare(self): ''' From Hines 1984 paper, discrete formula is: A_plus*V(i+1)-(A_plus+A_minus)*V(i)+A_minus*V(i-1)=Cm/dt*(V(i,t+dt)-V(i,t))+gtot(i)*V(i)-I0(i) A_plus: i->i+1 A_minus: i->i-1 This gives the following tridiagonal system: A_plus*V(i+1)-(Cm/dt+gtot(i)+A_plus+A_minus)*V(i)+A_minus*V(i-1)=-Cm/dt*V(i,t)-I0(i) Boundaries, one simple possibility (sealed ends): -(Cm/dt+gtot(n)+A_minus)*V(n)+A_minus*V(n-1)=-Cm/dt*V(n,t)-I0(n) A_plus*V(1)-(Cm/dt+gtot(0)+A_plus)*V(0)=-Cm/dt*V(0,t)-I0(0) ''' mid_diameter = zeros(len(self.neuron)) # mid(i) : (i-1) <-> i mid_diameter[1:] = .5*(self.neuron.diameter[:-1]+self.neuron.diameter[1:]) self.Aplus = zeros(len(self.neuron)) # A+ i -> j = Aplus(j) self.Aminus = zeros(len(self.neuron)) # A- i <- j = Aminus(j) self.Aplus[1]= mid_diameter[1]**2/(4*self.neuron.diameter[1]*self.neuron.length[1]**2*self.neuron.Ri) self.Aplus[2:]=mid_diameter[2:]**2/(4*self.neuron.diameter[1:-1]*self.neuron.length[1:-1]**2*self.neuron.Ri) self.Aminus[1:]=mid_diameter[1:]**2/(4*self.neuron.diameter[1:]*self.neuron.length[1:]**2*self.neuron.Ri) self.neuron.index = zeros(len(self.neuron),int) # gives the index of the branch containing the current compartment self.neuron.branches = [] # (i,j,bp,ante,ante_index,pointType) # i is the first compartment # bp is the last, a branch point # j is the end of the "inner branch". j = bp-1 # ante is the branch point to which i is connected self.neuron.BPcount = 0 # number of branch points (or branches). = len(self.neuron.branches) self.neuron.long_branches_count = 0 # number of branches with len(branch) > 1 #self.vL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) #self.vR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) #self.d = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) self.bL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) self.bR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) #self.bd = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) self.ab = zeros((3,len(self.neuron))) self.ab0 = zeros(len(self.neuron)) self.ab1 = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64) self.ab2 = zeros(len(self.neuron)) self.ab1_base = zeros(len(self.neuron)) #self.res = cuda.pagelocked_zeros((3 * len(self.neuron)),numpy.float64) self.mTrunc = 0 # used to truncate vL and vR self.delta_list = zeros(len(self.neuron)) #used to find mTrunc # prepare_branch : fill neuron.index, neuron.branches, changes Aplus & Aminus self.prepare_branch(self.neuron.morphology, mid_diameter,0) # linear system P V = B used to deal with the voltage at branch points and take boundary conditions into account. 
self.P = zeros((self.neuron.BPcount,self.neuron.BPcount)) self.B = zeros(self.neuron.BPcount) self.solution_bp = zeros(self.neuron.BPcount) self.gtot = zeros(len(self.neuron)) self.I0 = zeros(len(self.neuron)) self.i_list = [] self.j_list = [] self.i_list_bis = [] self.j_list_bis = [] new_tridiag = True self.bp_list = [] self.pointType_list = [] self.pointTypeAnte_list = [] self.index_ante_list0 = [] self.index_ante_list1 = [] self.index_ante_list2 = [] self.ante_list = [] self.post_list = [] self.ante_list_idx = [] self.post_list_idx = [] self.id = [] self.test_list = [] temp = zeros(self.neuron.BPcount) self.ind0 = [] self.ind_bctype_0 = [] for index,(i,j,bp,ante,index_ante,pointType) in enumerate(self.neuron.branches) : self.i_list.append(i) self.j_list.append(j) if new_tridiag: self.i_list_bis.append(i) ii = i else: ii = self.i_list[-1] if j-ii+1>2: self.j_list_bis.append(j) new_tridiag = True else : new_tridiag = False self.bp_list.append(bp) self.pointType_list.append(max(1,pointType)) self.pointTypeAnte_list.append(max(1,self.neuron.bc[ante])) temp[index] = index_ante self.id.append(index) if (j-i+2>1): self.test_list.append(1) else : self.test_list.append(0) for x in xrange(j-i+2): self.ante_list.append(ante) self.post_list.append(bp) self.ante_list_idx.append(index_ante) self.post_list_idx.append(index) if index_ante == 0 and index != 0: self.ind0.append(index) if pointType==0 : self.ind_bctype_0.append(bp) self.ante_arr = numpy.array(self.ante_list_idx) self.post_arr = numpy.array(self.post_list_idx) self.index_ante_list1, self.ind1 = numpy.unique(temp,return_index=True) self.ind1 = numpy.sort(self.ind1) self.index_ante_list1 = temp[self.ind1] self.index_ante_list1 = list(self.index_ante_list1) self.ind2 = [] for x in xrange(self.neuron.BPcount): self.ind2.append(x) self.ind2 = numpy.delete(self.ind2,self.ind1,None) self.ind2 = numpy.setdiff1d(self.ind2, self.ind0, assume_unique=True) self.index_ante_list2 = temp[self.ind2] self.index_ante_list2 = list(self.index_ante_list2) self.index_ante_list = list(temp) self.Aminus_bp = self.Aminus[self.bp_list] self.Aminus_bp [:] *= self.pointType_list[:] self.Aplus_i = self.Aplus[self.i_list] self.Aplus_i[:] *= self.pointTypeAnte_list[:] #--------------------------------------------------------GPU------------------------ n = len(self.neuron) mod = SourceModule(""" __global__ void updateAB_gtot(double *ab1, double *ab1_base, double *gtot) { int idx = threadIdx.x + blockIdx.x * blockDim.x; ab1[idx] = ab1_base[idx] - gtot[idx]; ab1[idx+gridDim.x] = ab1_base[idx] - gtot[idx]; ab1[idx+2*gridDim.x] = ab1_base[idx] - gtot[idx]; } __global__ void updateBD(double *bd, double *Cm, double dt,double *v, double *I0) { int idx = threadIdx.x + blockIdx.x * blockDim.x; bd[idx] = - Cm[idx] / dt * v[idx] - I0[idx]; } __global__ void finalizeFun(double *v, double *v_bp, int *ante,int *post, double *b, int m) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx_a = ante[idx]; int idx_p = post[idx]; v[idx] = b[idx + m] * v_bp[idx_a] + b[idx + 2*m] * v_bp[idx_p] + b[idx]; // vL, vR, d } __global__ void finalizeFunBis(double *v, double *v_bp, int *GPU_data_int) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int bp = GPU_data_int[4 * idx + 3]; v[bp] = v_bp[idx]; } __global__ void initPB(double *P, double *B, int BPcount) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int idy = threadIdx.x + blockIdx.y * blockDim.y; P[idx + idy * BPcount] = 0.0; B[idx] = 0.0; } //GPU_data_int is : ante_list, i_list, j_list, bp_list //GPU_data_double is : test_list, 
Aplus, Aminus __global__ void fillPB(double *P, double *B, double *b, double *Cm_l, double *gtot_l, int *GPU_data_int, double *GPU_data_double, double *I0_l, double *v_l, int BPcount, double dt, int m) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx_ante = GPU_data_int[4 * idx]; int i = GPU_data_int[4 * idx + 1]; int j = GPU_data_int[4 * idx + 2]; int bp = GPU_data_int[4 * idx + 3]; double test = GPU_data_double[3 * idx]; double Aplus = GPU_data_double[3 * idx + 1]; double Aminus = GPU_data_double[3 * idx + 2]; double Cm = Cm_l[bp]; double gtot = gtot_l[bp]; double I0 = I0_l[bp]; double v_bp = v_l[bp]; double vLright = b[j + m] * test; double vRleft = b[i + 2*m] * test; double vRright = b[j + 2*m] * test; double dright = b[j] * test; B[idx] += -Cm/dt * v_bp - I0 -Aminus * dright; P[idx * BPcount + idx] += -Cm/dt - gtot + Aminus * (vRright - 1.0); P[idx * BPcount + idx_ante] += Aminus * vLright; P[idx_ante * BPcount + idx] += Aplus * vRleft; } __global__ void fillPB_bis(double *P, double *B, double *b, int *GPU_data_int, double *GPU_data_double, int BPcount, int *indices, int m) { int idx_temp = threadIdx.x + blockIdx.x * blockDim.x; int idx = indices[idx_temp]; int idx_ante = GPU_data_int[4 * idx]; int i = GPU_data_int[4 * idx + 1]; int test = GPU_data_double[3 * idx]; double Aplus = GPU_data_double[3 * idx + 1]; double vLleft = b[i + m] * test; double dleft = b[i] * test; B[idx_ante] += - Aplus * dleft; P[idx_ante * (BPcount + 1)] += Aplus * (vLleft - 1.0); } __global__ void badFillPB_0(double *P, double *B, double *b, int *GPU_data_int, double *GPU_data_double, int *indices, double *Cm_l, double *gtot_l, double *I0_l, double *v_l, int len_indices, int m, double dt) { double Cm = Cm_l[0]; double gtot = gtot_l[0]; double I0 = I0_l[0]; double v_0 = v_l[0]; B[0] = - Cm/dt * v_0 - I0; P[0] = - Cm/dt - gtot; int idx; int i; int test; double Aplus; double vLleft; double dleft; for (int idx_temp=0;idx_temp<len_indices;idx_temp++) { idx = indices[idx_temp]; i = GPU_data_int[4 * idx + 1]; test = GPU_data_double[3 * idx]; Aplus = GPU_data_double[3 * idx + 1]; vLleft = b[i + m] * test; dleft = b[i] * test; P[0] += Aplus * (vLleft - 1.0); B[0] += - Aplus * dleft; } } __global__ void resetPB_type0(double *P, double *B, double *v, int *indices,int BPcount) { int idx = indices[threadIdx.x] + blockIdx.x * blockDim.x; int idy = threadIdx.x + blockIdx.y * blockDim.y; P[idx + idy * BPcount] = 0.0; P[idx + idx * BPcount] = 1.0; B[idx] = v[idx]; } """) self.updateAB_gtot = mod.get_function("updateAB_gtot") self.updateAB_gtot.prepare(["P","P",'P'],block=(1,1,1)) self.updateBD = mod.get_function("updateBD") self.updateBD.prepare(["P","P",'d','P','P'],block=(1,1,1)) self.finalizeFun = mod.get_function("finalizeFun") self.finalizeFun.prepare(['P','P','P','P','P','i'],block=(1,1,1)) self.finalizeFunBis = mod.get_function("finalizeFunBis") self.finalizeFunBis.prepare(['P','P','P'],block=(1,1,1)) self.initPB = mod.get_function("initPB") self.initPB.prepare(['P',"P",'i'],block=(1,1,1)) self.fillPB = mod.get_function("fillPB") self.fillPB.prepare(["P","P",'P',"P",'P','P','P','P','P','i','d','i'],block=(1,1,1)) self.fillPB_bis = mod.get_function("fillPB_bis") self.fillPB_bis.prepare(["P","P",'P',"P",'P','i','P','i'],block=(1,1,1)) self.badFillPB_0 = mod.get_function("badFillPB_0") self.badFillPB_0.prepare(["P","P",'P',"P",'P','P','P','P','P',"P",'i','i','d'],block=(1,1,1)) self.resetPB_type0 = mod.get_function("resetPB_type0") self.resetPB_type0.prepare(['P',"P",'P','P','i'],block=(1,1,1)) dtype = 
numpy.dtype(numpy.float64) int_type = numpy.dtype(numpy.int32) self.P_gpu = cuda.mem_alloc(self.P.size * dtype.itemsize) self.B_gpu = cuda.mem_alloc(self.B.size * dtype.itemsize) GPU_data_int = zeros((self.neuron.BPcount,4)) GPU_data_double = zeros((self.neuron.BPcount,3)) GPU_data_int[:,0] = self.index_ante_list[:] GPU_data_int[:,1] = self.i_list[:] GPU_data_int[:,2] = self.j_list[:] GPU_data_int[:,3] = self.bp_list[:] GPU_data_double[:,0] = self.test_list[:] GPU_data_double[:,1] = self.Aplus_i[:] GPU_data_double[:,2] = self.Aminus_bp[:] self.GPU_data_int = cuda.mem_alloc(4 * self.neuron.BPcount * int_type.itemsize) self.GPU_data_double = cuda.mem_alloc(3 * self.neuron.BPcount * dtype.itemsize) cuda.memcpy_htod(self.GPU_data_int,GPU_data_int.astype(numpy.int32)) cuda.memcpy_htod(self.GPU_data_double,GPU_data_double.astype(numpy.float64)) self.ind0_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize) cuda.memcpy_htod(self.ind0_gpu,numpy.array(self.ind0,numpy.int32)) self.ind1_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize) cuda.memcpy_htod(self.ind1_gpu,numpy.array(self.ind1,numpy.int32)) self.ind2_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize) cuda.memcpy_htod(self.ind2_gpu,numpy.array(self.ind2,numpy.int32)) self.ind_bctype_0_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize) cuda.memcpy_htod(self.ind_bctype_0_gpu,numpy.array(self.ind_bctype_0,numpy.int32)) self.ab1_base_gpu = cuda.mem_alloc(n * dtype.itemsize) self.ab1_gpu = cuda.mem_alloc(3 * n * dtype.itemsize) self.ab1_gpu_ptr = int(self.ab1_gpu) self.Cm_gpu = cuda.mem_alloc(n * dtype.itemsize) cuda.memcpy_htod(self.Cm_gpu,self.neuron.Cm.astype(numpy.float64)) self.gtot_gpu = cuda.mem_alloc(n * dtype.itemsize) self.I0_gpu = cuda.mem_alloc(n * dtype.itemsize) self.v_gpu = cuda.mem_alloc(n * dtype.itemsize) cuda.memcpy_htod(self.v_gpu,self.neuron.v) ab0 = zeros(3*n).astype(numpy.float64) ab0[:n] = self.ab0[:] ab0[n:2*n] = self.ab0[:] ab0[2*n:3*n] = self.ab0[:] ab2 = zeros(3*n).astype(numpy.float64) ab2[:n] = self.ab2[:] ab2[n:2*n] = self.ab2[:] ab2[2*n:3*n] = self.ab2[:] dtype = numpy.dtype(numpy.float64) self.ab0_gpu = cuda.mem_alloc(ab0.size * dtype.itemsize) self.ab0_gpu_ptr = int(self.ab0_gpu) self.ab2_gpu = cuda.mem_alloc(ab2.size * dtype.itemsize) self.ab2_gpu_ptr = int(self.ab2_gpu) self.bL_gpu = cuda.mem_alloc(self.bL.size * dtype.itemsize) self.bL_gpu_ptr = int(self.bL_gpu) self.bR_gpu = cuda.mem_alloc(self.bR.size * dtype.itemsize) self.bR_gpu_ptr = int(self.bR_gpu) self.b_gpu = cuda.mem_alloc(3 * self.bR.size * dtype.itemsize) self.b_gpu_ptr = int(self.b_gpu) # bd + bL + bR -> vd + vL + vR cuda.memcpy_htod(self.ab0_gpu, ab0) cuda.memcpy_htod(self.ab2_gpu, ab2) cuda.memcpy_htod(self.bL_gpu, self.bL) cuda.memcpy_htod(self.bR_gpu, self.bR) self.ante_gpu = cuda.mem_alloc(self.ante_arr.size * self.ante_arr.dtype.itemsize) self.post_gpu = cuda.mem_alloc(self.ante_arr.size * self.ante_arr.dtype.itemsize) self.v_old_gpu = cuda.mem_alloc(self.neuron.v.size * dtype.itemsize) cuda.memcpy_htod(self.ante_gpu,self.ante_arr) cuda.memcpy_htod(self.post_gpu,self.post_arr) #---------------------------------------------------------------------------------- self.v_branchpoints = zeros(self.neuron.BPcount) self.v_bp_gpu = cuda.mem_alloc(self.v_branchpoints.size * dtype.itemsize) self.timeDevice = [0] self.timeDeviceU = [0] self.timeDeviceT = [0] self.timeHost = [0] self.timeUpdater = [0] self.timeSolveHost = [0] self.timeFillHost = [0] self.timeFin = [0]
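The ab array built in prepare() stores the system in a (3, n) banded layout (rows ab0, ab1, ab2). The sketch below only illustrates that storage convention with made-up coefficients and checks it against a dense reconstruction; whether the original solver feeds it to scipy.linalg.solve_banded or to a custom routine is not shown in this excerpt.

import numpy as np
from scipy.linalg import solve_banded

n = 5
ab = np.zeros((3, n))
ab[0, 1:] = 1.0        # upper diagonal (the A_plus-like terms)
ab[1, :] = -4.0        # main diagonal (the -(Cm/dt + gtot + A_plus + A_minus)-like terms)
ab[2, :-1] = 1.0       # lower diagonal (the A_minus-like terms)
rhs = -np.ones(n)      # right-hand side (the -Cm/dt*V - I0-like terms)

v = solve_banded((1, 1), ab, rhs)

# sanity check against a dense reconstruction of the same tridiagonal matrix
A = np.diag(ab[1]) + np.diag(ab[0, 1:], 1) + np.diag(ab[2, :-1], -1)
assert np.allclose(A @ v, rhs)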
hx_gpu = cuda.to_device(f)
hy_gpu = cuda.to_device(f)
hz_gpu = cuda.to_device(f)
cex_gpu = cuda.to_device(set_c(f, (None, -1, -1)))
cey_gpu = cuda.to_device(set_c(f, (-1, None, -1)))
cez_gpu = cuda.to_device(set_c(f, (-1, -1, None)))
chx_gpu = cuda.to_device(set_c(f, (None, 0, 0)))
chy_gpu = cuda.to_device(set_c(f, (0, None, 0)))
chz_gpu = cuda.to_device(set_c(f, (0, 0, None)))

# pinned memory allocation for zero-copy
if myrank != 1:
    ex_send = cuda.pagelocked_zeros((nx, ny), np.float32, order='F',
                                    mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    ey_send = cuda.pagelocked_zeros((nx, ny), np.float32, order='F',
                                    mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    hx_recv = cuda.pagelocked_zeros((nx, ny), np.float32, order='F',
                                    mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    hy_recv = cuda.pagelocked_zeros((nx, ny), np.float32,
def alloc_async_host_buf(self, shape, dtype):
    """Allocates a buffer that can be used for asynchronous data transfers."""
    return cuda.pagelocked_zeros(shape, dtype=dtype)
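A short illustration of what such a buffer is for: because it is page-locked, it can be handed to an asynchronous host-to-device copy. The sizes and the raw device allocation below are illustrative, not taken from the original class.

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

buf = cuda.pagelocked_zeros((1024,), dtype=np.float32)   # what alloc_async_host_buf would return
buf[:] = np.arange(1024, dtype=np.float32)

dev = cuda.mem_alloc(buf.nbytes)
stream = cuda.Stream()
cuda.memcpy_htod_async(dev, buf, stream)   # asynchronous because buf is page-locked
stream.synchronize()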
from wavemoth.cuda.profile import cuda_profile

N = 4092
nside = 2048
npix = 12 * 2048**2
nrings = 4 * nside - 1
lmax = 2 * nside

#x = np.asarray(np.random.rand(N), np.float32)
#xf = np.fft.fft(x)
#x_gpu = gpuarray.to_gpu(x)
#xf_gpu = gpuarray.empty(N/2+1, np.complex64)

map = drv.pagelocked_zeros(npix, np.float64)
buf = drv.pagelocked_zeros((nrings, (lmax + 1) // 2 + 1), np.complex128)
map_gpu = drv.mem_alloc(npix * 8)
buf_gpu = drv.mem_alloc(nrings * ((lmax + 1) // 2 + 1) * 16)
drv.memcpy_htod(map_gpu, map)

from wavemoth.cuda import cufft

print 'ctoring plan'
plan = cufft.HealpixCuFFTPlan(2048, 8)
repeats = 1
print 'plan ctored'

with cuda_profile() as prof:
sys.stdout.flush()
start.record()

if rank == 0:
    print "\navg: %1.2f GFLOPS" % flops[2:-2].mean()
if rank == 1:
    total = np.zeros(tmax)
    for key in exec_time.iterkeys():
        total[:] += exec_time[key][:]
    for key in exec_time.iterkeys():
        print key, ":\t %1.2f %%" % (exec_time[key][2:-2].sum() / total[2:-2].sum() * 100)
    print "%1.2f GFLOPS\r" % ((tmax - 4) * 3 * nx * ny * nz * 30 / total[2:-2].sum() * 1e-6)

g = cuda.pagelocked_zeros((nx, ny, nz), "f")
cuda.memcpy_dtoh(g, ez_gpu)
if rank != 0:
    comm.Send(g, 0, 24)
else:
    lg = np.zeros((3 * nx, ny), "f")
    lg[:nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 1, 24)
    lg[nx:-nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 2, 24)
    lg[2 * nx:, :] = g[:, :, nz / 2]
    imsh.set_array(lg.T ** 2)
    show()
    #draw()
    #savefig('./png-wave/%.5d.png' % tstep)

stop.record()
__global__ void increment(int* a, float* progress)
{
    for(int i=0; i<500; i++){
        atomicAdd(progress, 1.0f);
    }
}
""").get_function("increment")
print("Compiled and got function increment")

def In(thing):
    thing_pointer = cuda.mem_alloc(thing.nbytes)
    cuda.memcpy_htod(thing_pointer, thing)
    return thing_pointer

pagelocked_mem = cuda.pagelocked_zeros((1, 1), numpy.float32,
                                       mem_flags=cuda.host_alloc_flags.DEVICEMAP)
pagelocked_mem_ptr = numpy.intp(pagelocked_mem.base.get_device_pointer())
print(pagelocked_mem[0, 0])

a = numpy.int32(345)
a_gpu = In(a)

increment(a_gpu, pagelocked_mem_ptr, block=(1, 1, 1), grid=(50, 50, 1))

while pagelocked_mem[0, 0] < (50 * 50 * 500):
    print pagelocked_mem[0, 0]

cuda.Context.synchronize()
print pagelocked_mem[0, 0]
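This snippet relies on pycuda.autoinit and on the device mapping host memory. Depending on the driver and device, mapped (zero-copy) allocations may require the context to be created with the MAP_HOST flag; a hedged sketch of setting that up explicitly is below.

import numpy as np
import pycuda.driver as cuda

cuda.init()
dev = cuda.Device(0)
ctx = dev.make_context(cuda.ctx_flags.MAP_HOST)   # explicitly allow host-mapped allocations
try:
    progress = cuda.pagelocked_zeros((1, 1), np.float32,
                                     mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    progress_ptr = np.intp(progress.base.get_device_pointer())
    # progress_ptr can be passed to a kernel that atomicAdd()s into it, while the
    # host polls progress[0, 0] without any explicit memcpy.
finally:
    ctx.pop()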
def getRT(self, s_map, srt_gpu, srt_nsamp, srt_npairs, npairs, store_rt=False):
    """
    Computes the rank template.

    s_map (Sample Map) - a list of 1s and 0s of length nsamples, where 1 means
        use this sample to compute the rank template
    srt_gpu - cuda memory object containing the srt (sample rank template)
        array on the gpu
    srt_nsamp, srt_npairs - shape (buffered) of the srt_gpu object
    npairs - true number of gene pairs being compared
    b_size - size of the blocks for computation
    store_rt - determines the RETURN value
        False (default) = returns a numpy array of shape (npairs,) holding the
            rank template
        True = returns the rt_gpu object and the padded size of the rt_gpu
            object: (rt_obj, npairs_padded)
    """
    b_size = self.b_size

    s_map_buff = self.s_map_buff = cuda.pagelocked_zeros(
        (int(srt_nsamp), ), np.int32,
        mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    s_map_buff[:len(s_map)] = np.array(s_map, dtype=np.int32)
    s_map_gpu = np.intp(s_map_buff.base.get_device_pointer())
    #cuda.memcpy_htod(s_map_gpu, s_map_buff)

    #sample blocks
    g_y_sz = self.getGrid(srt_nsamp)
    #pair blocks
    g_x_sz = self.getGrid(srt_npairs)

    block_rt_gpu = cuda.mem_alloc(
        int(g_y_sz * srt_npairs * (np.uint32(1).nbytes)))

    grid = (g_x_sz, g_y_sz)

    func1, func2 = self.getrtKern(g_y_sz)

    shared_size = b_size * b_size * np.uint32(1).nbytes

    func1(srt_gpu, np.uint32(srt_nsamp), np.uint32(srt_npairs), s_map_gpu,
          block_rt_gpu, np.uint32(g_y_sz),
          block=(b_size, b_size, 1), grid=grid, shared=shared_size)

    rt_buffer = self.rt_buffer = cuda.pagelocked_zeros(
        (int(srt_npairs), ), np.int32,
        mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    rt_gpu = np.intp(rt_buffer.base.get_device_pointer())

    func2(block_rt_gpu, rt_gpu, np.int32(s_map_buff.sum()),
          block=(b_size, 1, 1), grid=(g_x_sz, ))

    if store_rt:
        #this is in case we want to run further stuff without
        #transferring back and forth
        return (rt_gpu, srt_npairs)
    else:
        #rt_buffer = np.zeros((srt_npairs ,), dtype=np.int32)
        #cuda.memcpy_dtoh(rt_buffer, rt_gpu)
        #rt_gpu.free()
        return rt_buffer[:npairs]
    include_dirs=["/usr/local/cuda/include", ],  # This is because we use mma.h, which isn't in pycuda's default include dir.
    no_extern_c=True)
# An explanation of this follows.
# PyCuda normally wraps the entirety of its included source in an extern "C" {...} block. This is to prevent the kernels
# from being compiled in a C++ fashion, and allows module.get_function() [see below] to actually find the function by its
# name (i.e. identifier).
# I need to use this option because if #include <mma.h> is inside the extern "C" block, compilation fails.
# You need to manually put this block around your kernel for the functionality to work.
# NOTE: I am not sure whether this prevents us from using any advanced, C++-like features (such as classes, inheritance,
# polymorphism, etc.) in kernels. Probably we won't need those things anyway.

simple_tc_matmul_kernel = module.get_function(
    "simple_tc_matmul")  # This looks for the "simple_tc_matmul" identifier in the output of nvcc.

# Set up the A matrix. Currently, it is mostly zeros.
a_host = cuda.pagelocked_zeros((tcm_size, tcm_size), dtype=np.float16)
# Manually insert some test data.
a_host[0, 0] = np.float16(num1)
#a_host[0,2] = np.float16(num2)
a_device = cuda.mem_alloc(a_host.nbytes)
cuda.memcpy_htod(a_device, a_host)
print("A:")
print_mat(a_host)

# B will be mostly zeros.
b_host = cuda.pagelocked_zeros((tcm_size, tcm_size), dtype=np.float16)
# Manually insert some test data.
b_host[0, 0] = np.float16(num2)
#b_host[2,0] = np.float16(num1)
b_device = cuda.mem_alloc(a_host.nbytes)
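A self-contained sketch of the manual extern "C" wrapping those comments describe: with no_extern_c=True the include stays outside the block and the kernel goes inside it, so get_function() can still find the unmangled name. The header and kernel here are toys, not the tensor-core matmul.

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
#include <math_constants.h>      // the include stays outside the extern "C" block

extern "C" {
__global__ void fill_pi(float *out)
{
    out[threadIdx.x] = CUDART_PI_F;
}
}
""", no_extern_c=True)

fill_pi = mod.get_function("fill_pi")      # found by its unmangled name
out = cuda.pagelocked_zeros((32,), np.float32)
fill_pi(cuda.Out(out), block=(32, 1, 1))
assert np.allclose(out, np.pi)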
BGRA2NV12 = get_BGRA2NV12()
print("BGRA2NV12=%s" % BGRA2NV12)
w = roundup(512, 32)
h = roundup(512, 32)
log("w=%s, h=%s", w, h)
cudaInputBuffer, inputPitch = driver.mem_alloc_pitch(w, h*3/2, 16)
log("CUDA Input Buffer=%s, pitch=%s", hex(int(cudaInputBuffer)), inputPitch)
#allocate CUDA NV12 buffer (on device):
cudaNV12Buffer, NV12Pitch = driver.mem_alloc_pitch(w, h*3/2, 16)
log("CUDA NV12 Buffer=%s, pitch=%s", hex(int(cudaNV12Buffer)), NV12Pitch)
#host buffers:
inputBuffer = driver.pagelocked_zeros(inputPitch*h*3/2, dtype=numpy.byte)
log("inputBuffer=%s", inputBuffer)
outputBuffer = driver.pagelocked_zeros(inputPitch*h*3/2, dtype=numpy.byte)
log("outputBuffer=%s", outputBuffer)
#populate host buffer with random data:
buf = inputBuffer.data
for y in range(h*3/2):
    dst = y * inputPitch
    #debug("%s: %s:%s (size=%s) <- %s:%s (size=%s)", y, dst, dst+w, len(buffer), src, src+w, len(Yplane))
    for x in range(w):
        buf[dst+x] = numpy.byte((x+y) % 256)
#copy input buffer to CUDA buffer:
driver.memcpy_htod(cudaInputBuffer, inputBuffer)
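This example sidesteps pitch handling by allocating the host buffer at the full device pitch and copying it with a flat memcpy_htod. For the general case, where host rows are tightly packed, the pitched copy can be expressed with Memcpy2D; the sketch below uses illustrative sizes, not the NV12 layout.

import numpy
import pycuda.autoinit
import pycuda.driver as driver

w, h = 512, 512
host = (numpy.arange(w * h) % 256).astype(numpy.uint8).reshape(h, w)   # tightly packed host image

dev_buf, pitch = driver.mem_alloc_pitch(w, h, 16)   # width in bytes, rows, access size

copy = driver.Memcpy2D()
copy.set_src_host(host)
copy.set_dst_device(dev_buf)
copy.width_in_bytes = w      # bytes copied per row
copy.src_pitch = w           # host rows are contiguous
copy.dst_pitch = pitch       # device rows are padded to the returned pitch
copy.height = h
copy(aligned=True)           # synchronous pitched host-to-device copy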
import pycuda.autoinit

# Setup the kernel
mod = SourceModule("""
__global__ void add(float *a, float *b, float *c, float *c_map)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    float val;

    val = a[idx] + b[idx];
    c[idx] = val;
    c_map[idx] = val;
}
""")
add = mod.get_function("add")

# Memory allocation
nx = 1024
a = np.random.randn(nx).astype(np.float32)
b = np.random.randn(nx).astype(np.float32)
c = np.zeros_like(a)
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)

# Page-locked host memory allocation for zero-copy
c_map = cuda.pagelocked_zeros(nx, np.float32)

add(a_gpu, b_gpu, cuda.Out(c), cuda.Out(c_map), block=(256, 1, 1), grid=(4, 1))

assert np.linalg.norm((a + b) - c) == 0
assert np.linalg.norm((a + b) - c_map) == 0
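Note that c_map above is page-locked but is still returned through cuda.Out, which performs an explicit copy. A hedged sketch of the true zero-copy variant, which hands the mapped device pointer straight to the kernel, is below; it assumes the context supports host-mapped memory (see the MAP_HOST sketch earlier).

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void add(float *a, float *b, float *c_map)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    c_map[idx] = a[idx] + b[idx];
}
""")
add = mod.get_function("add")

nx = 1024
a = np.random.randn(nx).astype(np.float32)
b = np.random.randn(nx).astype(np.float32)

c_map = cuda.pagelocked_zeros((nx,), np.float32,
                              mem_flags=cuda.host_alloc_flags.DEVICEMAP)
c_map_ptr = np.intp(c_map.base.get_device_pointer())

add(cuda.In(a), cuda.In(b), c_map_ptr, block=(256, 1, 1), grid=(nx // 256, 1))
cuda.Context.synchronize()                 # the kernel writes straight into c_map
assert np.linalg.norm((a + b) - c_map) == 0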