def __init__(self, volume, segmentation, voxelsize, origin=[0.0, 0.0, 0.0], stepsize=0.1, mode="linear"):
    # generate kernels
    self.mod = self.generateKernelModuleProjector()
    self.projKernel = self.mod.get_function("projectKernel")
    self.volumesize = volume.shape
    self.volume = np.moveaxis(volume, [0, 1, 2], [2, 1, 0]).copy()
    self.segmentation = np.moveaxis(segmentation.astype(np.float32), [0, 1, 2], [2, 1, 0]).copy()
    # print("done swap")

    # copy the volume and segmentation into CUDA arrays and bind them to textures
    self.volume_gpu = cuda.np_to_array(self.volume, order='C')
    self.texref_volume = self.mod.get_texref("tex_density")
    cuda.bind_array_to_texref(self.volume_gpu, self.texref_volume)

    self.segmentation_gpu = cuda.np_to_array(self.segmentation, order='C')
    self.texref_segmentation = self.mod.get_texref("tex_segmentation")
    cuda.bind_array_to_texref(self.segmentation_gpu, self.texref_segmentation)

    if mode == "linear":
        self.texref_volume.set_filter_mode(cuda.filter_mode.LINEAR)
        self.texref_segmentation.set_filter_mode(cuda.filter_mode.LINEAR)

    self.voxelsize = voxelsize
    self.stepsize = np.float32(stepsize)
    self.origin = origin
    self.initialized = False
    print("initialized projector")
def __init__(self, balljoint, texture):
    self.balljoint = balljoint
    self.tex = texture
    self.interpol = self.mod.get_function("MagneticFieldInterpolateKernel")
    self.texref = self.mod.get_texref('tex')
    drv.bind_array_to_texref(
        drv.make_multichannel_2d_array(self.tex, order="C"), self.texref)
    self.texref.set_flags(drv.TRSF_NORMALIZED_COORDINATES)
    self.texref.set_filter_mode(drv.filter_mode.LINEAR)
    self.texref.set_address_mode(0, drv.address_mode.WRAP)
    self.texref.set_address_mode(1, drv.address_mode.CLAMP)

    self.sensor_pos = balljoint.config['sensor_pos']
    self.number_of_sensors = len(self.sensor_pos)
    self.input = np.zeros((self.number_of_sensors, 3), dtype=np.float32, order='C')
    self.output = np.zeros((self.number_of_sensors, 3), dtype=np.float32, order='C')
    self.b_target = np.zeros((self.number_of_sensors, 3), dtype=np.float32, order='C')

    # round the grid up so every sensor gets a thread
    self.bdim = (16, 16, 1)
    dx, mx = divmod(self.number_of_sensors, self.bdim[0])
    dy, my = divmod(self.number_of_sensors, self.bdim[1])
    self.gdim = (int(dx + (mx > 0)), int(dy + (my > 0)))

    rospy.init_node('BallJointPoseestimator', anonymous=True)
    self.joint_state = rospy.Publisher('/external_joint_states',
                                       sensor_msgs.msg.JointState, queue_size=1)
def test_multichannel_2d_texture(self): mod = SourceModule(""" #define CHANNELS 4 texture<float4, 2, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int row = threadIdx.x; int col = threadIdx.y; int w = blockDim.y; float4 texval = tex2D(mtx_tex, row, col); dest[(row*w+col)*CHANNELS + 0] = texval.x; dest[(row*w+col)*CHANNELS + 1] = texval.y; dest[(row*w+col)*CHANNELS + 2] = texval.z; dest[(row*w+col)*CHANNELS + 3] = texval.w; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (5, 6) channels = 4 a = np.asarray(np.random.randn(*((channels, ) + shape)), dtype=np.float32, order="F") drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex) dest = np.zeros(shape + (channels, ), dtype=np.float32) copy_texture(drv.Out(dest), block=shape + (1, ), texrefs=[mtx_tex]) reshaped_a = a.transpose(1, 2, 0) #print reshaped_a #print dest assert la.norm(dest - reshaped_a) == 0
def create_2d_rgba_texture(a, module, variable, point_sampling=False):
    a = numpy.ascontiguousarray(a)
    out_texref = module.get_texref(variable)
    cuda.bind_array_to_texref(
        cuda.make_multichannel_2d_array(a, order='C'), out_texref)
    if point_sampling:
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else:
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
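# A minimal, hypothetical usage sketch for the helper above. The kernel name
# copy_rgba and the texture name rgba_tex are illustrative, not from the original
# source; it assumes create_2d_rgba_texture is in scope and a CUDA toolkit old
# enough to still support texture references (pre-CUDA 12). It binds a small
# height x width x RGBA float32 array and reads it back texel-for-texel via tex2D.
import numpy
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float4, 2, cudaReadModeElementType> rgba_tex;

__global__ void copy_rgba(float *dest)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    int w = blockDim.x;
    float4 v = tex2D(rgba_tex, x, y);  // point sampling: integer coords hit texel (x, y)
    dest[(y * w + x) * 4 + 0] = v.x;
    dest[(y * w + x) * 4 + 1] = v.y;
    dest[(y * w + x) * 4 + 2] = v.z;
    dest[(y * w + x) * 4 + 3] = v.w;
}
""")

h, w = 4, 8
a = numpy.random.rand(h, w, 4).astype(numpy.float32)  # height x width x RGBA
texref = create_2d_rgba_texture(a, mod, "rgba_tex", point_sampling=True)

dest = numpy.zeros((h, w, 4), dtype=numpy.float32)
mod.get_function("copy_rgba")(cuda.Out(dest), block=(w, h, 1), texrefs=[texref])
assert numpy.allclose(dest, a)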
def initialize(self):
    """Allocate GPU memory and transfer the volume and segmentations to the GPU."""
    if self.initialized:
        raise RuntimeError("Close projector before initializing again.")

    # allocate and transfer the volume texture to the GPU
    # TODO: this axis swap is messy and may actually be messing things up.
    # Maybe use a FrameTransform in the Volume class instead?
    volume = self.volume.data
    volume = np.moveaxis(volume, [0, 1, 2], [2, 1, 0]).copy()  # TODO: is this axis swap necessary?
    self.volume_gpu = cuda.np_to_array(volume, order='C')
    self.volume_texref = self.mod.get_texref("volume")
    cuda.bind_array_to_texref(self.volume_gpu, self.volume_texref)

    # set the interpolation mode
    if self.mode == 'linear':
        self.volume_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    else:
        raise RuntimeError(f"unsupported interpolation mode: {self.mode}")

    # allocate and transfer the segmentation textures to the GPU
    # TODO: remove axis swap?
    self.segmentations_gpu = [
        cuda.np_to_array(np.moveaxis(seg, [0, 1, 2], [2, 1, 0]).copy(), order='C')
        for mat, seg in self.volume.materials.items()
    ]
    self.segmentations_texref = [
        self.mod.get_texref(f"seg_{m}") for m, _ in enumerate(self.volume.materials)
    ]
    for seg, texref in zip(self.segmentations_gpu, self.segmentations_texref):
        cuda.bind_array_to_texref(seg, texref)
        if self.mode == 'linear':
            texref.set_filter_mode(cuda.filter_mode.LINEAR)
        else:
            raise RuntimeError(f"unsupported interpolation mode: {self.mode}")

    # allocate the output image array on the GPU (4 bytes per float32)
    self.output_gpu = cuda.mem_alloc(self.output_size * 4)

    # allocate the ijk_from_index matrix array on the GPU (3x3 floats, 4 bytes each)
    self.rt_kinv_gpu = cuda.mem_alloc(3 * 3 * 4)

    # mark self as initialized
    self.initialized = True
def run_function(function_package):
    # global variables
    global FD
    global tb_cnt

    # initialize variables
    fp = function_package
    func_output = fp.output
    u = func_output.unique_id
    ss = func_output.split_shape
    sp = func_output.split_position
    data_halo = func_output.data_halo

    function_name = fp.function_name
    args = fp.function_args
    work_range = fp.work_range

    tb_cnt = 0
    stream = stream_list[0]

    # rebind the transfer-function textures if they changed
    if fp.update_tf == 1:
        tf.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1, 256, 4), order='C'), tf)
        cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
    if fp.update_tf2 == 1:
        tf1.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1, 256, 4), order='C'), tf1)
        cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)

    cuda_args = []
    data_exist = True
    if u not in data_list:
        data_exist = False
    elif ss not in data_list[u]:
        data_exist = False
    elif sp not in data_list[u][ss]:
        data_exist = False

    if data_exist:
        # initialize variables
        data_package = data_list[u][ss][sp]
        dp = data_package
        if dp.devptr is None:
            wait_data_arrive(data_package, stream=stream)
        devptr = dp.devptr
        output_range = dp.data_range
        full_output_range = dp.full_data_range
        ad = data_range_to_cuda_in(output_range, full_output_range, stream=stream)
        cuda_args += [ad]
        output_package = dp
        FD.append(ad)
    else:
        bytes = func_output.buffer_bytes
        devptr, usage = malloc_with_swap_out(bytes)
        log("created output data bytes %s" % (str(func_output.buffer_bytes)), 'detail', log_type)

        data_range = func_output.data_range
        full_data_range = func_output.full_data_range
        buffer_range = func_output.buffer_range
        buffer_halo = func_output.buffer_halo

        ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range,
                                   buffer_halo=buffer_halo, stream=stream)
        cuda_args += [ad]
        output_package = func_output
        output_package.buffer_bytes = usage

        if False:
            print("OUTPUT")
            print("OUTPUT_RANGE", data_range)
            print("OUTPUT_FULL_RANGE", full_data_range)
        FD.append(ad)

    # set work range; block and grid derived from it
    block, grid = range_to_block_grid(work_range)
    cuda_args = [devptr] + cuda_args

    # receive data from other processes
    for data_package in args:
        u = data_package.unique_id
        data_name = data_package.data_name
        if data_name not in work_range and u != -1:
            wait_data_arrive(data_package, stream=stream)

    # set cuda arguments
    for data_package in args:
        data_name = data_package.data_name
        data_dtype = data_package.data_dtype
        data_contents_dtype = data_package.data_contents_dtype
        u = data_package.unique_id

        if data_name in work_range:
            cuda_args.append(numpy.int32(work_range[data_name][0]))
            cuda_args.append(numpy.int32(work_range[data_name][1]))
        elif u == -1:
            data = data_package.data
            dtype = type(data)
            if dtype in [int, float]:
                data = numpy.float32(data)
            cuda_args.append(numpy.float32(data))  # temp
        else:
            ss = data_package.split_shape
            sp = data_package.split_position
            dp = data_list[u][ss][sp]  # it must be fixed to data_package later

            memory_type = dp.memory_type
            if memory_type == 'devptr':
                cuda_args.append(dp.devptr)
                data_range = dp.data_range
                full_data_range = dp.full_data_range
                buffer_range = dp.buffer_range

                if False:
                    print("DATA_NAME", data_name)
                    print("DATA_RANGE", data_range)
                    print("FULL_DATA_RANGE", full_data_range)
                    print("BUFFER_RANGE", buffer_range)
                    print("DATA_HALO", dp.data_halo)
                    print("BUFFER_HALO", dp.buffer_halo)
                    print(dp)
                    print_devptr(dp.devptr, dp)

                ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range,
                                           data_halo=dp.data_halo,
                                           buffer_halo=dp.buffer_halo, stream=stream)
                cuda_args.append(ad)
                FD.append(ad)

    # set modelview matrix
    cuda.memcpy_htod_async(mmtx, fp.mmtx.reshape(16), stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, fp.inv_mmtx.reshape(16), stream=stream)

    try:
        if Debug:
            print("Function name: ", function_name)
        func = mod.get_function(function_name.strip())
    except:
        print("Function not found ERROR")
        print("Function name: " + function_name)
        assert False

    stream_list[0].synchronize()
    if log_type in ['time', 'all']:
        start = time.time()

    kernel_finish = cuda.Event()
    func(*cuda_args, block=block, grid=grid, stream=stream_list[0])
    kernel_finish.record(stream=stream_list[0])

    u = func_output.unique_id
    ss = func_output.split_shape
    sp = func_output.split_position
    target = (u, ss, sp)
    Event_dict[target] = kernel_finish
    if target not in valid_list:
        valid_list.append(target)

    # finish
    if log_type in ['time', 'all']:
        t = time.time() - start
        ms = 1000 * t
        log("rank%d, %s, \"%s\", u=%d, GPU%d function running,,, time: %.3f ms "
            % (rank, func_output.data_name, function_name, u, device_number, ms),
            'time', log_type)

    # decrease retain_count
    for data_package in args:
        u = data_package.unique_id
        if u != -1:
            mem_release(data_package)

    return devptr, output_package
def __init__(self, img_path):
    super(LFapplication, self).__init__()

    #
    # Load image data
    #
    base_path = os.path.splitext(img_path)[0]
    lenslet_path = base_path + '-lenslet.txt'
    optics_path = base_path + '-optics.txt'

    with open(lenslet_path, 'r') as f:
        tmp = eval(f.readline())
        x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \
            np.array(tmp, dtype=np.float32)

    with open(optics_path, 'r') as f:
        for line in f:
            name, val = line.strip().split()
            try:
                setattr(self, name, np.float32(val))
            except:
                pass

    max_angle = math.atan(self.pitch / 2 / self.flen)

    #
    # Prepare image
    #
    im_pil = Image.open(img_path)
    if im_pil.mode == 'RGB':
        self.NCHANNELS = 3
        w, h = im_pil.size
        im = np.zeros((h, w, 4), dtype=np.float32)
        im[:, :, :3] = np.array(im_pil).astype(np.float32)
        self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx), 3)
    else:
        self.NCHANNELS = 1
        im = np.array(im_pil.getdata()).reshape(im_pil.size[::-1]).astype(np.float32)
        h, w = im.shape
        self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx))

    x_start = x_offset - int(x_offset / right_dx) * right_dx
    y_start = y_offset - int(y_offset / down_dy) * down_dy
    x_ratio = self.flen * right_dx / self.pitch
    y_ratio = self.flen * down_dy / self.pitch

    #
    # Generate the cuda kernel
    #
    mod_LFview = pycuda.compiler.SourceModule(
        _kernel_tpl.render(
            newiw=self.LF_dim[1],
            newih=self.LF_dim[0],
            oldiw=w,
            oldih=h,
            x_start=x_start,
            y_start=y_start,
            x_ratio=x_ratio,
            y_ratio=y_ratio,
            x_step=right_dx,
            y_step=down_dy,
            NCHANNELS=self.NCHANNELS
        )
    )

    self.LFview_func = mod_LFview.get_function("LFview_kernel")
    self.texref = mod_LFview.get_texref("tex")

    #
    # Now generate the cuda texture
    #
    if self.NCHANNELS == 3:
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(im, order="C"),
            self.texref
        )
    else:
        cuda.matrix_to_texref(im, self.texref, order="C")

    #
    # We could set the next if we wanted to address the image
    # in normalized coordinates ( 0 <= coordinate < 1.)
    # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
    #
    self.texref.set_filter_mode(cuda.filter_mode.LINEAR)

    #
    # Prepare the traits
    #
    self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0))
    self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0))

    self.plotdata = ArrayPlotData(LF_img=self.sampleLF())
    self.LF_img = Plot(self.plotdata)
    if self.NCHANNELS == 3:
        self.LF_img.img_plot("LF_img")
    else:
        self.LF_img.img_plot("LF_img", colormap=gray)
def watershed(I, mask=None):
    kernel_source = open("Dwatershed.cu").read()
    main_module = nvcc.SourceModule(kernel_source)
    descent_kernel = main_module.get_function("descent_kernel")
    stabilize_kernel = main_module.get_function("stabilize_kernel")
    image_texture = main_module.get_texref("img")
    plateau_kernel = main_module.get_function("plateau_kernel")
    minima_kernel = main_module.get_function("minima_kernel")
    flood_kernel = main_module.get_function("flood_kernel")
    increment_kernel = main_module.get_function("increment_kernel")

    # Get contiguous image + shape.
    height, width, depth = I.shape
    I = np.float32(I.copy())
    if mask is None:
        mask = np.ones(I.shape)
    mask = np.int32(mask)

    # Get block/grid size for steps 1-3 (blocks overlap by a 1-voxel halo).
    block_size = (8, 8, 8)
    grid_size = (width // (block_size[0] - 2) + 1,
                 height // (block_size[0] - 2) + 1,
                 depth // (block_size[0] - 2) + 1)

    # Initialize variables.
    labeled = np.zeros([height, width, depth])
    labeled = np.float64(labeled)
    width = np.int32(width)
    height = np.int32(height)
    depth = np.int32(depth)
    count = np.int32([0])

    # Transfer labels asynchronously.
    labeled_d = gpu.to_gpu_async(labeled)
    counters_d = gpu.to_gpu_async(count)

    # Bind CUDA textures.
    I_cu = cu.np_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    # Step 1.
    descent_kernel(labeled_d, width, height, depth,
                   block=block_size, grid=grid_size)
    start_time = cu.Event()
    end_time = cu.Event()
    start_time.record()

    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    it = 0
    while old != new:
        it += 1
        old = new
        plateau_kernel(labeled_d, counters_d, width, height, depth,
                       block=block_size, grid=grid_size)
        new = counters_d.get()[0]
    print('plateau kernel', it - 2)

    # Step 2.
    increment_kernel(labeled_d, width, height, depth,
                     block=block_size, grid=grid_size)
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    it = 0
    while old != new:
        it += 1
        old = new
        minima_kernel(labeled_d, counters_d, width, height, depth,
                      block=block_size, grid=grid_size)
        new = counters_d.get()[0]
    print('minima kernel', it - 2)

    # Step 3 (a second plateau pass) is disabled in this version.

    # Step 4.
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    it = 0
    while old != new:
        it += 1
        old = new
        flood_kernel(labeled_d, counters_d, width, height, depth,
                     block=block_size, grid=grid_size)
        new = counters_d.get()[0]
    print('flood kernel', it - 2)

    labels = labeled_d.get()
    labels = labels * mask

    # End GPU timers.
    end_time.record()
    end_time.synchronize()
    gpu_time = start_time.time_till(end_time) * 1e-3
    # print(gpu_time)

    del counters_d
    return labels
def init():
    """Outputs the high-resolution k-box and the smoothed r-box."""
    N = np.int32(DIM)  # prepare for stitching
    f_pixel_factor = DIM / HII_DIM
    scale = np.float32(BOX_LEN) / DIM
    HII_scale = np.float32(BOX_LEN) / HII_DIM
    shape = (N, N, N)

    MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)

    kernel_source = open(cmd_folder + "/initialize.cu").read()
    kernel_code = kernel_source % {
        'DELTAK': DELTA_K,
        'VOLUME': VOLUME,
        'DIM': DIM
    }
    main_module = nvcc.SourceModule(kernel_code)
    init_kernel = main_module.get_function("init_kernel")
    HII_filter = main_module.get_function("HII_filter")
    adj_complex_conj = main_module.get_function("adj_complex_conj")
    subsample_kernel = main_module.get_function("subsample")
    velocity_kernel = main_module.get_function("set_velocity")
    pspec_texture = main_module.get_texref("pspec")

    # interpPspec contains both the k array and the P array
    interpPspec, interpSize = init_pspec()
    interp_cu = cuda.matrix_to_array(interpPspec, order='F')
    cuda.bind_array_to_texref(interp_cu, pspec_texture)

    # draw the random k-space box (real and imaginary parts)
    largebox_d = gpuarray.zeros(shape, dtype=np.float32)
    init_kernel(largebox_d, np.int32(DIM), block=block_size, grid=grid_size)
    largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
    init_kernel(largebox_d_imag, np.int32(DIM), block=block_size, grid=grid_size)

    largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
    largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
    largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag
    # adj_complex_conj(largebox_d, DIM, block=block_size, grid=grid_size)
    largebox = largebox_d.get()
    # np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox)

    # save the real-space box before smoothing
    plan = Plan(shape, dtype=np.complex64)
    plan.execute(largebox_d, inverse=True)  # FFT to real space
    largebox_d /= scale**3
    np.save(parent_folder + "/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN),
            largebox_d.real.get_async())

    # save the real-space box after smoothing and subsampling;
    # the host copy of largebox is still in k-space, so no need to reload from disk
    largebox_d = gpuarray.to_gpu(largebox)
    smoothR = np.float32(L_FACTOR * BOX_LEN / HII_DIM)
    HII_filter(largebox_d, N, ZERO, smoothR, block=block_size, grid=grid_size)
    plan.execute(largebox_d, inverse=True)  # FFT the smoothed box to real space
    largebox_d /= scale**3
    smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
    subsample_kernel(largebox_d.real, smallbox_d, N, HII_DIM, PIXEL_FACTOR,
                     block=block_size, grid=HII_grid_size)  # subsample in real space
    np.save(parent_folder + "/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN),
            smallbox_d.get_async())

    # reload the k-space box for the velocity boxes
    largebox_d = gpuarray.to_gpu(largebox)
    # largebox_d /= VOLUME  # divide by VOLUME if using fft (vs ifft)
    smoothR = np.float32(L_FACTOR * BOX_LEN / HII_DIM)
    largevbox_d = gpuarray.zeros((DIM, DIM, DIM), dtype=np.complex64)
    smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
    for num, mode in enumerate(['x', 'y', 'z']):
        velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(num),
                        block=block_size, grid=grid_size)
        HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=grid_size)
        plan.execute(largevbox_d, inverse=True)
        largevbox_d /= scale**3
        subsample_kernel(largevbox_d.real, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR,
                         block=block_size, grid=HII_grid_size)
        np.save(parent_folder + "/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN),
                smallbox_d.get())
    return
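# A minimal sketch of the lookup-table binding that init() performs with the
# "pspec" texture: a 2D float32 matrix is copied into a CUDA array with
# matrix_to_array and bound to a 2D texture, then sampled with tex2D. The names
# lut_tex and read_lut are illustrative, not from the original source.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2, cudaReadModeElementType> lut_tex;  // stands in for "pspec"

__global__ void read_lut(float *dest, int w)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    dest[y * w + x] = tex2D(lut_tex, x, y);  // point sampling hits texel (x, y)
}
""")

lut = np.arange(12, dtype=np.float32).reshape(3, 4)  # rows x cols
lut_tex = mod.get_texref("lut_tex")
cuda.bind_array_to_texref(cuda.matrix_to_array(lut, order="C"), lut_tex)

dest = np.zeros_like(lut)
mod.get_function("read_lut")(cuda.Out(dest), np.int32(lut.shape[1]),
                             block=(lut.shape[1], lut.shape[0], 1), texrefs=[lut_tex])
assert np.allclose(dest, lut)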
def init_stitch(N):
    """Outputs the high-resolution k-box and the smoothed r-box.

    Input
    -----
    N : int32
        Size of box to load onto the GPU; should be related to DIM by powers of 2.
    """
    if N is None:
        N = np.int32(HII_DIM)  # prepare for stitching
    META_GRID_SIZE = DIM // N
    M = np.int32(HII_DIM // META_GRID_SIZE)
    f_pixel_factor = DIM / HII_DIM
    scale = np.float32(BOX_LEN / DIM)
    print('scale', scale)
    HII_scale = np.float32(BOX_LEN / HII_DIM)
    shape = (DIM, DIM, N)
    stitch_grid_size = (DIM // block_size[0],
                        DIM // block_size[0],
                        N // block_size[0])
    HII_stitch_grid_size = (HII_DIM // block_size[0],
                            HII_DIM // block_size[0],
                            M // block_size[0])

    kernel_source = open(cmd_folder + "/initialize_stitch.cu").read()
    kernel_code = kernel_source % {
        'DELTAK': DELTA_K,
        'DIM': DIM,
        'VOLUME': VOLUME,
        'META_BLOCKDIM': N
    }
    main_module = nvcc.SourceModule(kernel_code)
    init_stitch = main_module.get_function("init_kernel")
    HII_filter = main_module.get_function("HII_filter")
    subsample_kernel = main_module.get_function("subsample")
    velocity_kernel = main_module.get_function("set_velocity")
    pspec_texture = main_module.get_texref("pspec")
    MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)
    plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64)
    plan1d = Plan((np.int64(DIM)), dtype=np.complex64)

    print("init pspec")
    # interpPspec contains both the k array and the P array
    interpPspec, interpSize = init_pspec()
    interp_cu = cuda.matrix_to_array(interpPspec, order='F')
    cuda.bind_array_to_texref(interp_cu, pspec_texture)

    hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64)
    hbox_small = np.zeros(HII_shape, dtype=np.float32)
    smoothR = np.float32(L_FACTOR * BOX_LEN / HII_DIM)

    # set up pinned memory for host<->device transfers
    largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32)
    largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64)

    largebox_d = gpuarray.zeros(shape, dtype=np.float32)
    largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
    print("init boxes")
    for meta_z in range(META_GRID_SIZE):
        init_stitch(largebox_d, DIM, np.int32(meta_z), block=block_size, grid=stitch_grid_size)
        init_stitch(largebox_d_imag, DIM, np.int32(meta_z), block=block_size, grid=stitch_grid_size)
        largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
        largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
        largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag
        cuda.memcpy_dtoh_async(largecbox_pin, largebox_d)
        hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin.copy()

    # the k-space box must be kept if the velocity boxes are wanted
    if True:
        print("saving kbox")
        np.save(parent_folder + "/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)
        print("Executing FFT on device")
        hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
        print(hbox_large.dtype)
        print("Finished FFT on device")
        np.save(parent_folder + "/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)

    if True:
        print("loading kbox")
        hbox_large = np.load(parent_folder + "/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))

    # smooth the k-space box slab by slab
    for meta_z in range(META_GRID_SIZE):
        largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
        HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR,
                   block=block_size, grid=stitch_grid_size)
        hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async()

    print("Executing FFT on host")
    hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
    print("Finished FFT on host")

    # this saves a large-resolution deltax
    print("downsampling")
    smallbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32)
    for meta_z in range(META_GRID_SIZE):
        largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()
        largebox_d = gpuarray.to_gpu_async(largebox_pin)
        largebox_d /= scale**3
        subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR,
                         block=block_size, grid=HII_stitch_grid_size)  # subsample in real space
        hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async()
    np.save(parent_folder + "/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN),
            hbox_small)

    # to get velocities: reload the k-space box
    hbox_large = np.load(parent_folder + "/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
    hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.float32)
    hvbox_small = np.zeros(HII_shape, dtype=np.float32)
    smoothR = np.float32(L_FACTOR * BOX_LEN / HII_DIM)
    largevbox_d = gpuarray.zeros((DIM, DIM, N), dtype=np.complex64)
    smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32)
    for num, mode in enumerate(['x', 'y', 'z']):
        for meta_z in range(META_GRID_SIZE):
            largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
            velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num),
                            block=block_size, grid=stitch_grid_size)
            HII_filter(largevbox_d, DIM, ZERO, smoothR,
                       block=block_size, grid=stitch_grid_size)
            hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async()
        hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real
        for meta_z in range(META_GRID_SIZE):
            largevbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
            subsample_kernel(largevbox_d.real, smallvbox_d, DIM, HII_DIM, PIXEL_FACTOR,
                             block=block_size, grid=HII_stitch_grid_size)
            hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async()
        np.save(parent_folder + "/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN),
                smallvbox_d.get())
    return
def run_function(function_package, function_name):
    # global variables
    global FD

    # initialize variables
    fp = function_package
    func_output = fp.output
    u, ss, sp = func_output.get_id()
    data_halo = func_output.data_halo

    args = fp.function_args
    work_range = fp.work_range

    stream = stream_list[0]
    mod = source_module_dict[function_name]

    tf = mod.get_texref('TFF')
    tf1 = mod.get_texref('TFF1')
    bandwidth, _ = mod.get_global('TF_bandwidth')

    if fp.Sliders is not None:
        sld, _ = mod.get_global('slider')
        sld_op, _ = mod.get_global('slider_opacity')
        cuda.memcpy_htod_async(sld, fp.Sliders, stream=stream)
        cuda.memcpy_htod_async(sld_op, fp.Slider_opacity, stream=stream)

    if fp.transN != 0:
        tf = mod.get_texref('TFF')
        tf1 = mod.get_texref('TFF1')
        bandwidth, _ = mod.get_global('TF_bandwidth')

        if fp.update_tf == 1 and fp.trans_tex is not None:
            global tfTex
            tfTex = fp.trans_tex
        if fp.update_tf2 == 1 and fp.trans_tex is not None:
            global tfTex2
            tfTex2 = fp.trans_tex

        # rebind both transfer-function textures
        tf.set_filter_mode(cuda.filter_mode.LINEAR)
        tf1.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(tfTex.reshape(1, 256, 4), order='C'), tf)
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(tfTex2.reshape(1, 256, 4), order='C'), tf1)
        cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)

    cuda_args = []
    data_exist = True
    if u not in data_list:
        data_exist = False
    elif ss not in data_list[u]:
        data_exist = False
    elif sp not in data_list[u][ss]:
        data_exist = False

    if data_exist:
        # initialize variables
        data_package = data_list[u][ss][sp]
        dp = data_package
        if dp.devptr is None:
            wait_data_arrive(data_package, stream=stream)
        devptr = dp.devptr
        output_range = dp.data_range
        full_output_range = dp.full_data_range
        buffer_range = dp.buffer_range
        buffer_halo = dp.buffer_halo
        ad = data_range_to_cuda_in(output_range, full_output_range, buffer_range,
                                   buffer_halo=buffer_halo, stream=stream)
        cuda_args += [ad]
        output_package = dp
        FD.append(ad)
    else:
        bytes = func_output.data_bytes
        devptr, usage = malloc_with_swap_out(bytes)
        log("created output data bytes %s" % (str(func_output.data_bytes)), 'detail', log_type)

        data_range = func_output.data_range
        full_data_range = func_output.full_data_range
        buffer_range = func_output.buffer_range
        buffer_halo = func_output.buffer_halo

        ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range,
                                   buffer_halo=buffer_halo, stream=stream)
        cuda_args += [ad]
        output_package = func_output
        output_package.set_usage(usage)

        if False:
            print("OUTPUT")
            print("OUTPUT_RANGE", data_range)
            print("OUTPUT_FULL_RANGE", full_data_range)
        FD.append(ad)

    # set work range
    block, grid = range_to_block_grid(work_range)
    cuda_args = [devptr] + cuda_args

    # receive data from other processes
    for data_package in args:
        u = data_package.get_unique_id()
        data_name = data_package.data_name
        if data_name not in work_range and u != '-1':
            wait_data_arrive(data_package, stream=stream)

    # set cuda arguments
    for data_package in args:
        data_name = data_package.data_name
        data_dtype = data_package.data_dtype
        data_contents_dtype = data_package.data_contents_dtype
        u = data_package.get_unique_id()

        if data_name in work_range:
            cuda_args.append(numpy.int32(work_range[data_name][0]))
            cuda_args.append(numpy.int32(work_range[data_name][1]))
        elif u == '-1':
            data = data_package.data
            dtype = data_package.data_contents_dtype
            if dtype == 'int':
                cuda_args.append(numpy.int32(data))
            elif dtype == 'float':
                cuda_args.append(numpy.float32(data))
            elif dtype == 'double':
                cuda_args.append(numpy.float64(data))
            else:
                cuda_args.append(numpy.float32(data))  # temp
        else:
            ss = data_package.get_split_shape()
            sp = data_package.get_split_position()
            dp = data_list[u][ss][sp]  # it must be fixed to data_package later

            memory_type = dp.memory_type
            if memory_type == 'devptr':
                cuda_args.append(dp.devptr)
                data_range = dp.data_range
                full_data_range = dp.full_data_range

                if False:
                    print("DP", dp.info())
                    print_devptr(dp.devptr, dp)

                ad = data_range_to_cuda_in(data_range, full_data_range,
                                           data_halo=dp.data_halo, stream=stream)
                cuda_args.append(ad)
                FD.append(ad)

    # set modelview matrix
    func = mod.get_function(function_name)
    mmtx, _ = mod.get_global('modelview')
    inv_mmtx, _ = mod.get_global('inv_modelview')
    inv_m = numpy.linalg.inv(fp.mmtx)
    cuda.memcpy_htod_async(mmtx, fp.mmtx.reshape(16), stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, inv_m.reshape(16), stream=stream)

    stream_list[0].synchronize()
    if log_type in ['time', 'all']:
        start = time.time()

    kernel_finish = cuda.Event()
    func(*cuda_args, block=block, grid=grid, stream=stream_list[0])
    kernel_finish.record(stream=stream_list[0])

    u, ss, sp = func_output.get_id()
    target = (u, ss, sp)
    Event_dict[target] = kernel_finish
    if target not in valid_list:
        valid_list.append(target)

    # finish
    if log_type in ['time', 'all']:
        t = time.time() - start
        ms = 1000 * t
        log("rank%d, %s, \"%s\", u=%d, GPU%d function running,,, time: %.3f ms "
            % (rank, func_output.data_name, function_name, u, device_number, ms),
            'time', log_type)

    # decrease retain_count
    for data_package in args:
        u = data_package.get_unique_id()
        if u != '-1':
            mem_release(data_package)

    return devptr, output_package
        sensor = ball.gen_sensors_custom(pos, [[0, 0, 0]])
        val = sensor[0].getB(magnets)
        tex[i, j, 0] = val[0]
        tex[i, j, 1] = val[1]
        tex[i, j, 2] = val[2]
        # print(val)
        x_angle_queries[k] = theta / 180.0
        y_angle_queries[k] = phi / 180.0
        k += 1

print(texture_shape)

interpol = mod.get_function("MagneticFieldInterpolateKernel")
texref = mod.get_texref('tex')
drv.bind_array_to_texref(drv.make_multichannel_2d_array(tex, order="C"), texref)
texref.set_flags(drv.TRSF_NORMALIZED_COORDINATES)
texref.set_filter_mode(drv.filter_mode.LINEAR)
texref.set_address_mode(0, drv.address_mode.WRAP)
texref.set_address_mode(1, drv.address_mode.WRAP)

# number_of_queries = 100
# x_angle_queries = np.random.rand(number_of_queries)
# y_angle_queries = np.random.rand(number_of_queries)
# x_angle_queries = x_angles
# y_angle_queries = y_angles
# x_angle_queries = np.float32(np.arange(0,1,1/number_of_queries))
# y_angle_queries = np.float32(np.arange(0,1,1/number_of_queries))
# x_angle_queries = np.zeros(number_of_samples*number_of_samples,dtype=np.float32)
# y_angle_queries = np.zeros(number_of_samples*number_of_samples,dtype=np.float32)
# k = 0
def watershed(I):
    # Get contiguous image + shape.
    height, width = I.shape
    I = np.float32(I.copy())

    # Get block/grid size for steps 1-3 (blocks overlap by a 1-pixel halo).
    block_size = (6, 6, 1)
    grid_size = (width // (block_size[0] - 2),
                 height // (block_size[0] - 2))

    # Get block/grid size for step 4.
    block_size2 = (16, 16, 1)
    grid_size2 = (width // (block_size2[0] - 2),
                  height // (block_size2[0] - 2))

    # Initialize variables.
    labeled = np.zeros([height, width])
    labeled = np.float32(labeled)
    width = np.int32(width)
    height = np.int32(height)
    count = np.int32([0])

    # Transfer labels asynchronously.
    labeled_d = gpu.to_gpu_async(labeled)
    counter_d = gpu.to_gpu_async(count)

    # Bind CUDA textures.
    I_cu = cu.matrix_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    # Step 1.
    descent_kernel(labeled_d, width, height, block=block_size, grid=grid_size)
    start_time = cu.Event()
    end_time = cu.Event()
    start_time.record()

    # Step 2.
    increment_kernel(labeled_d, width, height, block=block_size2, grid=grid_size2)
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    while old != new:
        old = new
        minima_kernel(labeled_d, counters_d, width, height,
                      block=block_size, grid=grid_size)
        new = counters_d.get()[0]

    # Step 3.
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    while old != new:
        old = new
        plateau_kernel(labeled_d, counters_d, width, height,
                       block=block_size, grid=grid_size)
        new = counters_d.get()[0]

    # Step 4.
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    while old != new:
        old = new
        flood_kernel(labeled_d, counters_d, width, height,
                     block=block_size2, grid=grid_size2)
        new = counters_d.get()[0]

    result = labeled_d.get()

    # End GPU timers.
    end_time.record()
    end_time.synchronize()
    gpu_time = start_time.time_till(end_time) * 1e-3
    # print(gpu_time)

    return result
def find_bubbles(I, scale=1., fil='kspace'):
    """Brute-force method: find bubbles for the deltax box I."""
    zeta = 40.
    Z = 12.
    RMAX = 30.
    RMIN = 1.
    mm = mmin(Z)
    smin = sig0(m2R(mm))
    deltac = Deltac(Z)
    fgrowth = deltac / 1.686
    # fgrowth = pb.fgrowth(Z, cosmo['omega_M_0'], unnormed=True)

    kernel_source = open("find_bubbles.cu").read()
    kernel_code = kernel_source % {
        'DELTAC': deltac,
        'RMIN': RMIN,
        'SMIN': smin,
        'ZETA': zeta
    }
    main_module = nvcc.SourceModule(kernel_code)
    if fil == 'rspace':
        kernel = main_module.get_function("real_tophat_kernel")
    elif fil == 'kspace':
        kernel = main_module.get_function("k_tophat_kernel")
    image_texture = main_module.get_texref("img")

    # Get contiguous image + shape.
    height, width, depth = I.shape
    I = np.float32(I.copy() * fgrowth)

    # Get block/grid size.
    block_size = (8, 8, 8)
    grid_size = (width // block_size[0] + 1,
                 height // block_size[0] + 1,
                 depth // block_size[0] + 1)

    # Initialize variables.
    ionized = np.zeros([height, width, depth])
    ionized = np.float32(ionized)
    width = np.int32(width)

    # Transfer the ionization box asynchronously and bind the image texture.
    ionized_d = gpuarray.to_gpu_async(ionized)
    I_cu = cu.np_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    # Sweep the smoothing radius from RMAX down to RMIN.
    R = RMAX
    while R > RMIN:
        print(R)
        Rpix = np.float32(R / scale)
        S0 = np.float32(sig0(R))
        start = cu.Event()
        end = cu.Event()
        start.record()
        kernel(ionized_d, width, Rpix, S0, block=block_size, grid=HII_grid_size)
        end.record()
        end.synchronize()
        R *= (1. / 1.5)

    ionized = ionized_d.get()
    return ionized
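# A minimal, self-contained sketch of the 3D pattern the watershed and
# find_bubbles functions use: np_to_array uploads a float32 volume into a CUDA
# array, bind_array_to_texref attaches it to a 3D texture, and the kernel
# samples it with tex3D. The names vol_tex and read_volume are illustrative,
# not from the original sources; texture references require a pre-CUDA-12 toolkit.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 3, cudaReadModeElementType> vol_tex;  // stands in for "img"

__global__ void read_volume(float *dest, int w, int h)
{
    int x = threadIdx.x, y = threadIdx.y, z = threadIdx.z;
    // point sampling with unnormalized coords: integer (x, y, z) hits texel (x, y, z)
    dest[(z * h + y) * w + x] = tex3D(vol_tex, x, y, z);
}
""")

vol = np.random.rand(2, 3, 4).astype(np.float32)  # depth x height x width, C order
vol_tex = mod.get_texref("vol_tex")
cuda.bind_array_to_texref(cuda.np_to_array(vol, order="C"), vol_tex)

d, h, w = vol.shape
dest = np.zeros_like(vol)
mod.get_function("read_volume")(cuda.Out(dest), np.int32(w), np.int32(h),
                                block=(w, h, d), texrefs=[vol_tex])
assert np.allclose(dest, vol)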