def __init__(self, balljoint, texture):
    """Set up CUDA interpolation of a precomputed magnetic-field texture
    and the ROS publisher for estimated joint states.

    balljoint: object whose ``config['sensor_pos']`` lists sensor positions.
    texture:   multichannel 2D array of field samples, uploaded as a CUDA texture.
    """
    self.balljoint = balljoint
    self.tex = texture
    # NOTE(review): self.mod must already hold a compiled SourceModule
    # (set elsewhere on the class) — confirm initialization order.
    self.interpol = self.mod.get_function("MagneticFieldInterpolateKernel")
    self.texref = self.mod.get_texref('tex')
    drv.bind_array_to_texref(
        drv.make_multichannel_2d_array(self.tex, order="C"), self.texref)
    # Normalized [0,1) texture coordinates with linear filtering;
    # axis 0 wraps, axis 1 clamps at the border.
    self.texref.set_flags(drv.TRSF_NORMALIZED_COORDINATES)
    self.texref.set_filter_mode(drv.filter_mode.LINEAR)
    self.texref.set_address_mode(0, drv.address_mode.WRAP)
    self.texref.set_address_mode(1, drv.address_mode.CLAMP)
    self.sensor_pos = balljoint.config['sensor_pos']
    self.number_of_sensors = len(self.sensor_pos)
    # Per-sensor 3-vectors: kernel input, kernel output, and target field.
    self.input = np.zeros((self.number_of_sensors, 3), dtype=np.float32, order='C')
    self.output = np.zeros((self.number_of_sensors, 3), dtype=np.float32, order='C')
    self.b_target = np.zeros((self.number_of_sensors, 3), dtype=np.float32, order='C')
    # Launch configuration: 16x16 thread blocks, grid sized to cover the sensors.
    # NOTE(review): BOTH grid dimensions are derived from number_of_sensors —
    # verify the kernel really indexes a 2D (N x N) domain and not a 1D list.
    self.bdim = (16, 16, 1)
    dx, mx = divmod(self.number_of_sensors, self.bdim[0])
    dy, my = divmod(self.number_of_sensors, self.bdim[1])
    self.gdim = (int((dx + (mx > 0))), int((dy + (my > 0))))
    rospy.init_node('BallJointPoseestimator', anonymous=True)
    self.joint_state = rospy.Publisher('/external_joint_states',
                                       sensor_msgs.msg.JointState, queue_size=1)
def test_multichannel_2d_texture(self):
    """Upload a random 4-channel array as a float4 texture and check that a
    kernel reads back exactly the same values."""
    mod = SourceModule("""
    #define CHANNELS 4
    texture<float4, 2, cudaReadModeElementType> mtx_tex;

    __global__ void copy_texture(float *dest)
    {
      int row = threadIdx.x;
      int col = threadIdx.y;
      int w = blockDim.y;
      float4 texval = tex2D(mtx_tex, row, col);
      dest[(row*w+col)*CHANNELS + 0] = texval.x;
      dest[(row*w+col)*CHANNELS + 1] = texval.y;
      dest[(row*w+col)*CHANNELS + 2] = texval.z;
      dest[(row*w+col)*CHANNELS + 3] = texval.w;
    }
    """)
    copy_texture = mod.get_function("copy_texture")
    mtx_tex = mod.get_texref("mtx_tex")

    shape = (5, 6)
    channels = 4
    # Channel-first, Fortran-ordered input, as make_multichannel_2d_array expects.
    src_data = np.asarray(np.random.randn(*((channels,) + shape)),
                          dtype=np.float32, order="F")
    drv.bind_array_to_texref(
        drv.make_multichannel_2d_array(src_data, order="F"), mtx_tex)

    result = np.zeros(shape + (channels,), dtype=np.float32)
    copy_texture(drv.Out(result), block=shape + (1,), texrefs=[mtx_tex])

    # The kernel interleaves channels last; rearrange the source to match.
    assert la.norm(result - src_data.transpose(1, 2, 0)) == 0
def test_multichannel_2d_texture(self):
    """Round-trip test: bind a (channels, h, w) array to a float4 2D texture,
    copy it out with a kernel, and demand an exact match."""
    kernel_src = """
    #define CHANNELS 4
    texture<float4, 2, cudaReadModeElementType> mtx_tex;

    __global__ void copy_texture(float *dest)
    {
      int row = threadIdx.x;
      int col = threadIdx.y;
      int w = blockDim.y;
      float4 texval = tex2D(mtx_tex, row, col);
      dest[(row*w+col)*CHANNELS + 0] = texval.x;
      dest[(row*w+col)*CHANNELS + 1] = texval.y;
      dest[(row*w+col)*CHANNELS + 2] = texval.z;
      dest[(row*w+col)*CHANNELS + 3] = texval.w;
    }
    """
    mod = SourceModule(kernel_src)
    copy_texture = mod.get_function("copy_texture")
    mtx_tex = mod.get_texref("mtx_tex")

    shape = (5, 6)
    channels = 4
    chan_first_shape = (channels,) + shape
    a = np.asarray(np.random.randn(*chan_first_shape), dtype=np.float32, order="F")

    cuda_array = drv.make_multichannel_2d_array(a, order="F")
    drv.bind_array_to_texref(cuda_array, mtx_tex)

    dest = np.zeros(shape + (channels,), dtype=np.float32)
    launch_block = shape + (1,)
    copy_texture(drv.Out(dest), block=launch_block, texrefs=[mtx_tex])

    # Source is channel-first; the kernel output is channel-last.
    expected = a.transpose(1, 2, 0)
    assert la.norm(dest - expected) == 0
def create_2d_rgba_texture(a, module, variable, point_sampling=False):
    """Bind array *a* as the multichannel 2D texture named *variable* in *module*.

    ``point_sampling`` selects nearest-neighbour filtering instead of the
    default linear filtering. Returns the configured texture reference.
    """
    contiguous = numpy.ascontiguousarray(a)
    out_texref = module.get_texref(variable)
    cuda.bind_array_to_texref(
        cuda.make_multichannel_2d_array(contiguous, order='C'), out_texref)
    # Choose the sampling mode once instead of branching over two setter calls.
    mode = cuda.filter_mode.POINT if point_sampling else cuda.filter_mode.LINEAR
    out_texref.set_filter_mode(mode)
    return out_texref
def yt_to_function(self, tf):
    """Pack a yt transfer function's RGBA channels into the CUDA 2D array
    used for colormap lookups."""
    size = tf.nbins
    channels = (tf.red.y, tf.green.y, tf.blue.y, tf.alpha.y)
    stacked = np.dstack([chan.reshape(1, size) for chan in channels])
    self.interleaved = np.asarray(stacked, dtype=np.float32, order='F')
    # make_multichannel_2d_array wants the channel axis first (Fortran order).
    channel_major = np.asarray(self.interleaved.transpose((2, 1, 0)),
                               dtype=np.float32, order='F')
    self.cuda_transfer_array = drv.make_multichannel_2d_array(channel_major,
                                                              order='F')
def tex2DToGPU(tex):
    """Upload a 2D texture (grayscale or RGB) to a CUDA array.

    A 2D array is treated as single-channel; an (H, W, 3) RGB array is padded
    with a constant fourth channel, since CUDA multichannel arrays support
    only 1, 2 or 4 channels.

    Returns the resulting pycuda device array.
    """
    nChannal = 1 if (len(tex.shape) == 2) else 3
    if (nChannal == 3):
        # Add padding channal (float4 layout requires 4 channels).
        tex = np.dstack((tex, np.ones((tex.shape[0], tex.shape[1]))))
    # Fix: normalize dtype/layout on BOTH paths. Previously only the 3-channel
    # branch converted to contiguous float32, so grayscale input was uploaded
    # with whatever dtype/strides the caller happened to pass.
    tex = np.ascontiguousarray(tex).astype(np.float32)
    if (nChannal == 3):
        texGPUArray = cuda.make_multichannel_2d_array(tex, 'C')
    else:
        texGPUArray = cuda.np_to_array(tex, 'C')
    return texGPUArray
def yt_to_function(self, tf):
    """Convert a yt transfer function into the interleaved CUDA 2D lookup array."""
    size = tf.nbins
    red = tf.red.y.reshape(1, size)
    green = tf.green.y.reshape(1, size)
    blue = tf.blue.y.reshape(1, size)
    alpha = tf.alpha.y.reshape(1, size)
    self.interleaved = np.asarray(np.dstack((red, green, blue, alpha)),
                                  dtype=np.float32, order='F')
    # Transpose to channel-first layout for make_multichannel_2d_array.
    self.cuda_transfer_array = drv.make_multichannel_2d_array(
        np.asarray(self.interleaved.transpose((2, 1, 0)),
                   dtype=np.float32, order='F'),
        order='F')
def arrays_to_transfer_function(self, arrays):
    """Build the CUDA transfer-function array from four per-channel 1D arrays
    (red, green, blue, alpha)."""
    (r_array, g_array, b_array, a_array) = arrays
    (size, ) = r_array.shape
    rows = [chan.reshape(1, size)
            for chan in (r_array, g_array, b_array, a_array)]
    self.interleaved = np.asarray(np.dstack(tuple(rows)),
                                  dtype=np.float32, order='F')
    # Channel-first layout as required by make_multichannel_2d_array.
    channel_major = np.asarray(self.interleaved.transpose((2, 1, 0)),
                               dtype=np.float32, order='F')
    self.cuda_transfer_array = drv.make_multichannel_2d_array(channel_major,
                                                              order='F')
def arrays_to_transfer_function(self, arrays):
    """Interleave four RGBA channel arrays and upload them as a CUDA 2D array."""
    (r_array, g_array, b_array, a_array) = arrays
    (size, ) = r_array.shape
    r_row = r_array.reshape(1, size)
    g_row = g_array.reshape(1, size)
    b_row = b_array.reshape(1, size)
    a_row = a_array.reshape(1, size)
    self.interleaved = np.asarray(np.dstack((r_row, g_row, b_row, a_row)),
                                  dtype=np.float32, order='F')
    self.cuda_transfer_array = drv.make_multichannel_2d_array(
        np.asarray(self.interleaved.transpose((2, 1, 0)),
                   dtype=np.float32, order='F'),
        order='F')
def __init__(self, img_path): super(LFapplication, self).__init__() # # Load image data # base_path = os.path.splitext(img_path)[0] lenslet_path = base_path + '-lenslet.txt' optics_path = base_path + '-optics.txt' with open(lenslet_path, 'r') as f: tmp = eval(f.readline()) x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \ np.array(tmp, dtype=np.float32) with open(optics_path, 'r') as f: for line in f: name, val = line.strip().split() try: setattr(self, name, np.float32(val)) except: pass max_angle = math.atan(self.pitch/2/self.flen) # # Prepare image # im_pil = Image.open(img_path) if im_pil.mode == 'RGB': self.NCHANNELS = 3 w, h = im_pil.size im = np.zeros((h, w, 4), dtype=np.float32) im[:, :, :3] = np.array(im_pil).astype(np.float32) self.LF_dim = (ceil(h/down_dy), ceil(w/right_dx), 3) else: self.NCHANNELS = 1 im = np.array(im_pil.getdata()).reshape(im_pil.size[::-1]).astype(np.float32) h, w = im.shape self.LF_dim = (ceil(h/down_dy), ceil(w/right_dx)) x_start = x_offset - int(x_offset / right_dx) * right_dx y_start = y_offset - int(y_offset / down_dy) * down_dy x_ratio = self.flen * right_dx / self.pitch y_ratio = self.flen * down_dy / self.pitch # # Generate the cuda kernel # mod_LFview = pycuda.compiler.SourceModule( _kernel_tpl.render( newiw=self.LF_dim[1], newih=self.LF_dim[0], oldiw=w, oldih=h, x_start=x_start, y_start=y_start, x_ratio=x_ratio, y_ratio=y_ratio, x_step=right_dx, y_step=down_dy, NCHANNELS=self.NCHANNELS ) ) self.LFview_func = mod_LFview.get_function("LFview_kernel") self.texref = mod_LFview.get_texref("tex") # # Now generate the cuda texture # if self.NCHANNELS == 3: cuda.bind_array_to_texref( cuda.make_multichannel_2d_array(im, order="C"), self.texref ) else: cuda.matrix_to_texref(im, self.texref, order="C") # # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) 
# texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) # self.texref.set_filter_mode(cuda.filter_mode.LINEAR) # # Prepare the traits # self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0)) self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0)) self.plotdata = ArrayPlotData(LF_img=self.sampleLF()) self.LF_img = Plot(self.plotdata) if self.NCHANNELS == 3: self.LF_img.img_plot("LF_img") else: self.LF_img.img_plot("LF_img", colormap=gray)
    # Tail of grabsource() (definition starts above this excerpt):
    # close the file and return the CUDA source text.
    f.close()
    return src


src = grabsource('PyCUDABackProjectionKernel.cu')
# NOTE(review): hard-coded, user-specific include paths — only valid on the
# original author's machine.
mod = SourceModule(src, include_dirs=['.','/home/aldebrn/NVIDIA_GPU_Computing_SDK/C/common/inc', '/home/aldebrn/matlab2009a/extern/include'])

def complex_to_2chan(x):
    # Stack real and imaginary parts into a trailing 2-channel axis
    # (float32, C order, copied).
    return numpy.array(numpy.dstack((x.real, x.imag)), dtype=numpy.float32, order='C', copy=True)

def four_to_4chan(x,y,z,w):
    # Interleave four arrays into a trailing 4-channel axis
    # (float32, C order, copied).
    return numpy.array(numpy.dstack((x,y,z,w)), dtype=numpy.float32, order='C',copy=True)

# Range-projection texture: complex samples uploaded as a 2-channel float array,
# with linear filtering enabled.
tex_projections = mod.get_texref('tex_projections')
arr_projections = drv.make_multichannel_2d_array(complex_to_2chan(rp), order='C')
tex_projections.set_filter_mode(drv.filter_mode.LINEAR)
tex_projections.set_array(arr_projections)

# Platform-info texture: antenna position components and R0 as 4 channels.
tex_platform_info = mod.get_texref('tex_platform_info')
arr_platform_info = drv.make_multichannel_2d_array(four_to_4chan( mdouble(data.AntX), mdouble(data.AntY), mdouble(data.AntZ), mdouble(data.R0)), order='C')
tex_platform_info.set_array(arr_platform_info)

platform_info = four_to_4chan(mdouble(data.AntX), mdouble(data.AntY), mdouble(data.AntZ), mdouble(data.R0))
# height, width, num_channels for order == 'C'
def complex_to_2chan(x):
    """Split a complex array into a trailing real/imag channel pair
    (float32, C-ordered copy)."""
    parts = (x.real, x.imag)
    return numpy.array(numpy.dstack(parts),
                       dtype=numpy.float32, order='C', copy=True)


def four_to_4chan(x, y, z, w):
    """Interleave four same-shaped arrays into a trailing 4-channel axis
    (float32, C-ordered copy)."""
    return numpy.array(numpy.dstack((x, y, z, w)),
                       dtype=numpy.float32, order='C', copy=True)


# Range-projection texture: complex data becomes a 2-channel float texture
# sampled with linear filtering.
tex_projections = mod.get_texref('tex_projections')
projection_channels = complex_to_2chan(rp)
arr_projections = drv.make_multichannel_2d_array(projection_channels, order='C')
tex_projections.set_filter_mode(drv.filter_mode.LINEAR)
tex_projections.set_array(arr_projections)

# Platform-info texture: per-pulse antenna position and reference range.
tex_platform_info = mod.get_texref('tex_platform_info')
platform_channels = four_to_4chan(mdouble(data.AntX), mdouble(data.AntY),
                                  mdouble(data.AntZ), mdouble(data.R0))
arr_platform_info = drv.make_multichannel_2d_array(platform_channels, order='C')
tex_platform_info.set_array(arr_platform_info)

platform_info = four_to_4chan(mdouble(data.AntX), mdouble(data.AntY),
                              mdouble(data.AntZ), mdouble(data.R0))
# height, width, num_channels for order == 'C'
def run_function(function_package, function_name):
    """Launch one CUDA kernel described by `function_package` (Python 2).

    Binds transfer-function textures, resolves or allocates the output device
    buffer, marshals kernel arguments, launches, records a completion event,
    and releases input packages. Returns (devptr, output_package).
    """
    # global variables
    global FD

    # initialize variables
    fp = function_package
    func_output = fp.output
    u, ss, sp = func_output.get_id()
    data_halo = func_output.data_halo
    args = fp.function_args
    work_range = fp.work_range

    stream = stream_list[0]
    mod = source_module_dict[function_name]

    # cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
    # cuda.memcpy_htod_async(front_back, numpy.int32(fp.front_back), stream=stream)

    tf = mod.get_texref('TFF')
    tf1 = mod.get_texref('TFF1')
    bandwidth,_ = mod.get_global('TF_bandwidth')

    # Upload slider values into module globals when provided.
    if fp.Sliders != None:
        sld,_ = mod.get_global('slider')
        sld_op,_ = mod.get_global('slider_opacity')
        cuda.memcpy_htod_async(sld, fp.Sliders, stream=stream)
        cuda.memcpy_htod_async(sld_op, fp.Slider_opacity, stream=stream)

    if fp.transN != 0:
        tf = mod.get_texref('TFF')
        tf1 = mod.get_texref('TFF1')
        bandwidth,_ = mod.get_global('TF_bandwidth')

        # Cache the latest transfer-function tables in module globals so they
        # can be re-bound on later calls that don't carry new tables.
        if fp.update_tf == 1 and fp.trans_tex != None:
            global tfTex
            tfTex = fp.trans_tex
        if fp.update_tf2 == 1 and fp.trans_tex != None:
            global tfTex2
            tfTex2 = fp.trans_tex

        # 1x256 RGBA lookup textures with linear filtering.
        tf.set_filter_mode(cuda.filter_mode.LINEAR)
        tf1.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(tfTex.reshape(1,256,4), order='C'), tf)
        cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(tfTex2.reshape(1,256,4), order='C'), tf1)
        cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)

    cuda_args = []
    data_exist = True
    if u not in data_list: data_exist = False
    elif ss not in data_list[u]: data_exist = False
    elif sp not in data_list[u][ss]: data_exist = False

    if data_exist:
        # Reuse the already-registered output buffer.
        data_package = data_list[u][ss][sp]
        dp = data_package
        if dp.devptr == None:
            wait_data_arrive(data_package, stream=stream)
        ###########################
        devptr = dp.devptr
        output_range = dp.data_range
        full_output_range = dp.full_data_range
        buffer_range = dp.buffer_range
        buffer_halo = dp.buffer_halo
        ad = data_range_to_cuda_in(output_range, full_output_range, buffer_range, buffer_halo=buffer_halo, stream=stream)
        cuda_args += [ad]
        output_package = dp
        FD.append(ad)
    else:
        # Allocate a fresh output buffer (may swap out other data).
        bytes = func_output.data_bytes
        devptr, usage = malloc_with_swap_out(bytes)
        log("created output data bytes %s"%(str(func_output.data_bytes)),'detail',log_type)
        data_range = func_output.data_range
        full_data_range = func_output.full_data_range
        buffer_range = func_output.buffer_range
        buffer_halo = func_output.buffer_halo
        ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range, buffer_halo=buffer_halo, stream=stream)
        cuda_args += [ad]
        output_package = func_output
        output_package.set_usage(usage)
        if False:
            print "OUTPUT"
            print "OUTPUT_RANGE", data_range
            print "OUTPUT_FULL_RANGE", full_data_range
        FD.append(ad)

    # set work range
    block, grid = range_to_block_grid(work_range)
    cuda_args = [devptr] + cuda_args

    # print "GPU", rank, "BEFORE RECV", time.time()
    # Recv data from other process
    for data_package in args:
        u = data_package.get_unique_id()
        data_name = data_package.data_name
        if data_name not in work_range and u != '-1':
            wait_data_arrive(data_package, stream=stream)
    # print "GPU", rank, "Recv Done", time.time()

    # set cuda arguments
    for data_package in args:
        data_name = data_package.data_name
        data_dtype = data_package.data_dtype
        data_contents_dtype = data_package.data_contents_dtype
        u = data_package.get_unique_id()
        if data_name in work_range:
            # Range arguments are passed as (start, end) int32 pairs.
            cuda_args.append(numpy.int32(work_range[data_name][0]))
            cuda_args.append(numpy.int32(work_range[data_name][1]))
        elif u == '-1':
            # Scalar argument: convert by declared contents dtype.
            data = data_package.data
            dtype = data_package.data_contents_dtype
            if dtype == 'int':
                cuda_args.append(numpy.int32(data))
            elif dtype == 'float':
                cuda_args.append(numpy.float32(data))
            elif dtype == 'double':
                cuda_args.append(numpy.float64(data))
            else:
                cuda_args.append(numpy.float32(data)) # temp
        else:
            # Device-resident input: pass its pointer plus range metadata.
            ss = data_package.get_split_shape()
            sp = data_package.get_split_position()
            dp = data_list[u][ss][sp] # it must be fixed to data_package later
            memory_type = dp.memory_type
            if memory_type == 'devptr':
                cuda_args.append(dp.devptr)
                data_range = dp.data_range
                full_data_range = dp.full_data_range
                if False:
                    print "DP", dp.info()
                    print_devptr(dp.devptr, dp)
                ad = data_range_to_cuda_in(data_range, full_data_range, data_halo=dp.data_halo, stream=stream)
                cuda_args.append(ad)
                FD.append(ad)

    # set modelview matrix
    func = mod.get_function(function_name)
    mmtx,_ = mod.get_global('modelview')
    inv_mmtx, _ = mod.get_global('inv_modelview')
    inv_m = numpy.linalg.inv(fp.mmtx)
    cuda.memcpy_htod_async(mmtx, fp.mmtx.reshape(16), stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, inv_m.reshape(16), stream=stream)
    stream_list[0].synchronize()

    if log_type in ['time','all']:
        start = time.time()

    # st = time.time()
    kernel_finish = cuda.Event()
    func( *cuda_args, block=block, grid=grid, stream=stream_list[0])
    kernel_finish.record(stream=stream_list[0])
    # ctx.synchronize()
    # print "GPU TIME", time.time() - st
    # print "FFFFOOo", func_output.info()
    # print_devptr(cuda_args[0], func_output)

    # Register the completion event so consumers can wait on this output.
    u, ss, sp = func_output.get_id()
    target = (u,ss,sp)
    Event_dict[target] = kernel_finish
    if target not in valid_list:
        valid_list.append(target)

    #################################################################################
    # finish
    if log_type in ['time','all']:
        t = (time.time() - start)
        ms = 1000*t
        log("rank%d, %s, \"%s\", u=%d, GPU%d function running,,, time: %.3f ms "%(rank, func_output.data_name, function_name, u, device_number, ms),'time',log_type)
    #log("rank%d, \"%s\", GPU%d function finish "%(rank, function_name, device_number),'general',log_type)
    ###################################################################################
    # decrease retain_count
    for data_package in args:
        u = data_package.get_unique_id()
        if u != '-1':
            mem_release(data_package)
    # print "Release", time.time()
    return devptr, output_package
        # --- continuation of the theta/phi sampling loops started above this
        # excerpt; indentation reconstructed, confirm against the full script ---
        # Sample the magnetic field of `magnets` at `pos` and store the
        # 3-vector in the texture cell (i, j).
        sensor = ball.gen_sensors_custom(pos, [[0, 0, 0]])
        val = sensor[0].getB(magnets)
        tex[i, j, 0] = val[0]
        tex[i, j, 1] = val[1]
        tex[i, j, 2] = val[2]
        # print(val)
        # Record the normalized query angles for this sample.
        x_angle_queries[k] = theta / 180.0
        y_angle_queries[k] = phi / 180.0
        k += 1

print(texture_shape)

interpol = mod.get_function("MagneticFieldInterpolateKernel")
texref = mod.get_texref('tex')
drv.bind_array_to_texref(drv.make_multichannel_2d_array(tex, order="C"), texref)
# Normalized [0,1) coordinates, linear interpolation, wrap on both axes.
texref.set_flags(drv.TRSF_NORMALIZED_COORDINATES)
texref.set_filter_mode(drv.filter_mode.LINEAR)
texref.set_address_mode(0, drv.address_mode.WRAP)
texref.set_address_mode(1, drv.address_mode.WRAP)

# number_of_queries = 100
# x_angle_queries = np.random.rand(number_of_queries)
# y_angle_queries = np.random.rand(number_of_queries)
# x_angle_queries = x_angles
# y_angle_queries = y_angles
# x_angle_queries = np.float32(np.arange(0,1,1/number_of_queries))
# y_angle_queries = np.float32(np.arange(0,1,1/number_of_queries))
# x_angle_queries = np.zeros(number_of_samples*number_of_samples,dtype=np.float32)
# y_angle_queries = np.zeros(number_of_samples*number_of_samples,dtype=np.float32)
def __init__(self, img_path): super(LFapplication, self).__init__() # # Load image data # base_path = os.path.splitext(img_path)[0] lenslet_path = base_path + '-lenslet.txt' optics_path = base_path + '-optics.txt' with open(lenslet_path, 'r') as f: tmp = eval(f.readline()) x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \ np.array(tmp, dtype=np.float32) with open(optics_path, 'r') as f: for line in f: name, val = line.strip().split() try: setattr(self, name, np.float32(val)) except: pass max_angle = math.atan(self.pitch / 2 / self.flen) # # Prepare image # im_pil = Image.open(img_path) if im_pil.mode == 'RGB': self.NCHANNELS = 3 w, h = im_pil.size im = np.zeros((h, w, 4), dtype=np.float32) im[:, :, :3] = np.array(im_pil).astype(np.float32) self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx), 3) else: self.NCHANNELS = 1 im = np.array(im_pil.getdata()).reshape(im_pil.size[::-1]).astype( np.float32) h, w = im.shape self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx)) x_start = x_offset - int(x_offset / right_dx) * right_dx y_start = y_offset - int(y_offset / down_dy) * down_dy x_ratio = self.flen * right_dx / self.pitch y_ratio = self.flen * down_dy / self.pitch # # Generate the cuda kernel # mod_LFview = pycuda.compiler.SourceModule( _kernel_tpl.render(newiw=self.LF_dim[1], newih=self.LF_dim[0], oldiw=w, oldih=h, x_start=x_start, y_start=y_start, x_ratio=x_ratio, y_ratio=y_ratio, x_step=right_dx, y_step=down_dy, NCHANNELS=self.NCHANNELS)) self.LFview_func = mod_LFview.get_function("LFview_kernel") self.texref = mod_LFview.get_texref("tex") # # Now generate the cuda texture # if self.NCHANNELS == 3: cuda.bind_array_to_texref( cuda.make_multichannel_2d_array(im, order="C"), self.texref) else: cuda.matrix_to_texref(im, self.texref, order="C") # # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) 
# texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) # self.texref.set_filter_mode(cuda.filter_mode.LINEAR) # # Prepare the traits # self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0)) self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0)) self.plotdata = ArrayPlotData(LF_img=self.sampleLF()) self.LF_img = Plot(self.plotdata) if self.NCHANNELS == 3: self.LF_img.img_plot("LF_img") else: self.LF_img.img_plot("LF_img", colormap=gray)
# NOTE(review): R0 here is derived from the frequency step deltaF
# (c / (2*deltaF)); presumably the unambiguous range extent — confirm against
# the backprojection kernel's expectations.
R0 = float64(clight) / (2.0 * float64(a2f(data.deltaF)))
Nfft = nint(a2f(data.Nfft))
rmin = -R0 / 2.0
rmax = (Nfft * 0.5 - 1.0) * R0 / float(Nfft)

platform_info = four_to_4chan(mdouble(data.AntX), mdouble(data.AntY),
                              mdouble(data.AntZ), mdouble(data.R0))

# Load CUDA source file
src = grabsource('PyCUDABackProjectionKernel.cu')
mod = SourceModule(src, include_dirs=['.'])

# Set up CUDA texture for range projections
# (complex samples become a 2-channel float texture, linearly filtered).
tex_projections = mod.get_texref('tex_projections')
arr_projections = drv.make_multichannel_2d_array(complex_to_2chan(rp), order='C')
tex_projections.set_filter_mode(drv.filter_mode.LINEAR)
tex_projections.set_array(arr_projections)

# Run!
backprojection_loop = mod.get_function('backprojection_loop')
backprojection_loop(
    drv.Out(im),
    nint(a2f(data.Np)),
    nint(Nimg_height),
    nfloat(delta_pixel_x),
    nfloat(delta_pixel_y),
    nint(a2f(data.Nfft)),
    drv.In(pi_4_f0__clight),
    nfloat(numpy.min(data.x_vec)),
    nfloat(numpy.min(data.y_vec)),
def run_function(function_package):
    """Launch the CUDA kernel described by `function_package` (Python 2).

    Rebinds the transfer-function textures when flagged, resolves or allocates
    the output device buffer, marshals arguments, launches, and records a
    completion event. Returns (devptr, output_package).
    """
    # global variables
    global FD
    global tb_cnt

    # initialize variables
    fp = function_package
    func_output = fp.output
    u = func_output.unique_id
    ss = func_output.split_shape
    sp = func_output.split_position
    data_halo = func_output.data_halo
    function_name = fp.function_name
    args = fp.function_args
    work_range = fp.work_range
    tb_cnt = 0

    stream = stream_list[0]

    # cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
    # cuda.memcpy_htod_async(front_back, numpy.int32(fp.front_back), stream=stream)

    # Re-upload the first transfer function as a 1x256 RGBA texture when flagged.
    if fp.update_tf == 1:
        tf.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1, 256, 4),
                                            order='C'), tf)
        cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth),
                               stream=stream)

    # Same for the second transfer-function texture.
    if fp.update_tf2 == 1:
        tf1.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1, 256, 4),
                                            order='C'), tf1)
        cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth),
                               stream=stream)

    cuda_args = []
    data_exist = True
    if u not in data_list: data_exist = False
    elif ss not in data_list[u]: data_exist = False
    elif sp not in data_list[u][ss]: data_exist = False

    if data_exist:
        # Reuse the registered output buffer.
        data_package = data_list[u][ss][sp]
        dp = data_package
        if dp.devptr == None:
            wait_data_arrive(data_package, stream=stream)
        ###########################
        devptr = dp.devptr
        output_range = dp.data_range
        full_output_range = dp.full_data_range
        ad = data_range_to_cuda_in(output_range, full_output_range,
                                   stream=stream)
        cuda_args += [ad]
        output_package = dp
        FD.append(ad)
    else:
        # Allocate a fresh output buffer (may swap out other data).
        bytes = func_output.buffer_bytes
        devptr, usage = malloc_with_swap_out(bytes)
        log("created output data bytes %s" % (str(func_output.buffer_bytes)),
            'detail', log_type)
        data_range = func_output.data_range
        full_data_range = func_output.full_data_range
        buffer_range = func_output.buffer_range
        buffer_halo = func_output.buffer_halo
        ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range,
                                   buffer_halo=buffer_halo, stream=stream)
        cuda_args += [ad]
        output_package = func_output
        output_package.buffer_bytes = usage
        if False:
            print "OUTPUT"
            print "OUTPUT_RANGE", data_range
            print "OUTPUT_FULL_RANGE", full_data_range
        FD.append(ad)

    # set work range
    block, grid = range_to_block_grid(work_range)

    # set block and grid
    # log("work_range "+str(work_range),'detail',log_type)
    # log("block %s grid %s"%(str(block),str(grid)),'detail',log_type)

    cuda_args = [devptr] + cuda_args

    # print "GPU", rank, "BEFORE RECV", time.time()
    # Recv data from other process
    for data_package in args:
        u = data_package.unique_id
        data_name = data_package.data_name
        if data_name not in work_range and u != -1:
            wait_data_arrive(data_package, stream=stream)
    # print "GPU", rank, "Recv Done", time.time()

    # set cuda arguments
    for data_package in args:
        data_name = data_package.data_name
        data_dtype = data_package.data_dtype
        data_contents_dtype = data_package.data_contents_dtype
        u = data_package.unique_id
        if data_name in work_range:
            # Range arguments become (start, end) int32 pairs.
            cuda_args.append(numpy.int32(work_range[data_name][0]))
            cuda_args.append(numpy.int32(work_range[data_name][1]))
        elif u == -1:
            # Scalar argument: always passed to the kernel as float32.
            data = data_package.data
            dtype = type(data)
            if dtype in [int]:
                data = numpy.float32(data)
            if dtype in [float]:
                data = numpy.float32(data)
            cuda_args.append(numpy.float32(data))  # temp
        else:
            # Device-resident input: pointer plus range metadata.
            ss = data_package.split_shape
            sp = data_package.split_position
            dp = data_list[u][ss][
                sp]  # it must be fixed to data_package latter
            memory_type = dp.memory_type
            if memory_type == 'devptr':
                cuda_args.append(dp.devptr)
                data_range = dp.data_range
                full_data_range = dp.full_data_range
                buffer_range = dp.buffer_range
                if False:
                    print "DATA_NAME", data_name
                    print "DATA_RANGE", data_range
                    print "FULL_DATA_RANGE", full_data_range
                    print "BUFFER_RANGE", buffer_range
                    print "DATA_HALO", dp.data_halo
                    print "BUFFER_HALO", dp.buffer_halo
                    print dp
                    print_devptr(dp.devptr, dp)
                ad = data_range_to_cuda_in(data_range, full_data_range,
                                           buffer_range,
                                           data_halo=dp.data_halo,
                                           buffer_halo=dp.buffer_halo,
                                           stream=stream)
                cuda_args.append(ad)
                FD.append(ad)

    # log("function cuda name %s"%(function_name),'detail',log_type)
    # if function_name in func_dict:
    #     func = func_dict[function_name]
    # else:

    # set modelview matrix
    cuda.memcpy_htod_async(mmtx, fp.mmtx.reshape(16), stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, fp.inv_mmtx.reshape(16), stream=stream)

    try:
        if Debug:
            print "Function name: ", function_name
        func = mod.get_function(function_name.strip())
    except:
        print "Function not found ERROR"
        print "Function name: " + function_name
        assert (False)

    stream_list[0].synchronize()
    if log_type in ['time', 'all']:
        start = time.time()

    kernel_finish = cuda.Event()
    func(*cuda_args, block=block, grid=grid, stream=stream_list[0])
    kernel_finish.record(stream=stream_list[0])
    """
    try:
        a = numpy.empty((30,30),dtype=numpy.int32)
        cuda.memcpy_dtoh(a, cuda_args[0])
        print a[10:-10,10:-10]
    except:
        print "Fail", function_name
        print "Fp.output", fp.output
        pass
    """
    # Register the completion event for this output's (id, shape, position).
    u = func_output.unique_id
    ss = func_output.split_shape
    sp = func_output.split_position
    target = (u, ss, sp)
    Event_dict[target] = kernel_finish
    if target not in valid_list:
        valid_list.append(target)

    #################################################################################
    # finish
    if log_type in ['time', 'all']:
        t = (time.time() - start)
        ms = 1000 * t
        log(
            "rank%d, %s, \"%s\", u=%d, GPU%d function running,,, time: %.3f ms "
            % (rank, func_output.data_name, function_name, u, device_number,
               ms), 'time', log_type)
    #log("rank%d, \"%s\", GPU%d function finish "%(rank, function_name, device_number),'general',log_type)
    ###################################################################################
    # decrease retain_count
    for data_package in args:
        u = data_package.unique_id
        if u != -1:
            mem_release(data_package)
    # print "Release", time.time()
    return devptr, output_package
# tex = np.float32(np.random.rand(number_of_samples, number_of_samples,4)) # tex = np.float32(np.arange(number_of_samples*number_of_samples*4).reshape(number_of_samples,number_of_samples,4)) tex = np.zeros((number_of_samples,number_of_samples,4),dtype=np.float32) for i in range(number_of_samples): for j in range(number_of_samples): tex[i,j,0] = sin(x_angles[i]*pi)*cos(y_angles[j]*pi) tex[i,j,1] = sin(x_angles[i]*pi)*sin(y_angles[j]*pi) tex[i,j,2] = cos(x_angles[i]*pi) # print(tex) interpol = mod.get_function("MagneticFieldInterpolateKernel") texref = mod.get_texref('tex') drv.bind_array_to_texref( drv.make_multichannel_2d_array(tex, order="C"), texref ) texref.set_flags(drv.TRSF_NORMALIZED_COORDINATES) texref.set_filter_mode(drv.filter_mode.LINEAR) # number_of_queries = 100 # x_angle_queries = np.random.rand(number_of_queries) # y_angle_queries = np.random.rand(number_of_queries) # x_angle_queries = x_angles # y_angle_queries = y_angles+0.1 # x_angle_queries = np.float32(np.arange(0,1,1/number_of_queries)) # y_angle_queries = np.float32(np.arange(0,1,1/number_of_queries)) x_angle_queries = np.zeros(number_of_samples*number_of_samples,dtype=np.float32) y_angle_queries = np.zeros(number_of_samples*number_of_samples,dtype=np.float32) k = 0
def run_function(function_package):
    """Launch the CUDA kernel described by `function_package` (Python 2).

    Updates transfer-function textures on demand, resolves or allocates the
    output device buffer, marshals kernel arguments, launches, and records a
    completion event. Returns (devptr, output_package).
    """
    # global variables
    global FD
    global tb_cnt

    # initialize variables
    fp = function_package
    func_output = fp.output
    u = func_output.unique_id
    ss = func_output.split_shape
    sp = func_output.split_position
    data_halo = func_output.data_halo
    function_name = fp.function_name
    args = fp.function_args
    work_range = fp.work_range
    tb_cnt = 0

    stream = stream_list[0]

    # cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
    # cuda.memcpy_htod_async(front_back, numpy.int32(fp.front_back), stream=stream)

    # Rebind transfer-function texture #1 as a 1x256 RGBA array when flagged.
    if fp.update_tf == 1:
        tf.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1,256,4), order='C'), tf)
        cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)

    # Rebind transfer-function texture #2 when flagged.
    if fp.update_tf2 == 1:
        tf1.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1,256,4), order='C'), tf1)
        cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)

    cuda_args = []
    data_exist = True
    if u not in data_list: data_exist = False
    elif ss not in data_list[u]: data_exist = False
    elif sp not in data_list[u][ss]: data_exist = False

    if data_exist:
        # Reuse the registered output buffer.
        data_package = data_list[u][ss][sp]
        dp = data_package
        if dp.devptr == None:
            wait_data_arrive(data_package, stream=stream)
        ###########################
        devptr = dp.devptr
        output_range = dp.data_range
        full_output_range = dp.full_data_range
        ad = data_range_to_cuda_in(output_range, full_output_range, stream=stream)
        cuda_args += [ad]
        output_package = dp
        FD.append(ad)
    else:
        # Allocate a fresh output buffer (may swap out other data).
        bytes = func_output.buffer_bytes
        devptr, usage = malloc_with_swap_out(bytes)
        log("created output data bytes %s"%(str(func_output.buffer_bytes)),'detail',log_type)
        data_range = func_output.data_range
        full_data_range = func_output.full_data_range
        buffer_range = func_output.buffer_range
        buffer_halo = func_output.buffer_halo
        ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range, buffer_halo=buffer_halo, stream=stream)
        cuda_args += [ad]
        output_package = func_output
        output_package.buffer_bytes = usage
        if False:
            print "OUTPUT"
            print "OUTPUT_RANGE", data_range
            print "OUTPUT_FULL_RANGE", full_data_range
        FD.append(ad)

    # set work range
    block, grid = range_to_block_grid(work_range)

    # set block and grid
    # log("work_range "+str(work_range),'detail',log_type)
    # log("block %s grid %s"%(str(block),str(grid)),'detail',log_type)

    cuda_args = [devptr] + cuda_args

    # print "GPU", rank, "BEFORE RECV", time.time()
    # Recv data from other process
    for data_package in args:
        u = data_package.unique_id
        data_name = data_package.data_name
        if data_name not in work_range and u != -1:
            wait_data_arrive(data_package, stream=stream)
    # print "GPU", rank, "Recv Done", time.time()

    # set cuda arguments
    for data_package in args:
        data_name = data_package.data_name
        data_dtype = data_package.data_dtype
        data_contents_dtype = data_package.data_contents_dtype
        u = data_package.unique_id
        if data_name in work_range:
            # Range arguments become (start, end) int32 pairs.
            cuda_args.append( numpy.int32(work_range[data_name][0]))
            cuda_args.append( numpy.int32(work_range[data_name][1]))
        elif u == -1:
            # Scalar argument: always passed to the kernel as float32.
            data = data_package.data
            dtype = type(data)
            if dtype in [int]: data = numpy.float32(data)
            if dtype in [float]: data = numpy.float32(data)
            cuda_args.append(numpy.float32(data)) # temp
        else:
            # Device-resident input: pointer plus range metadata.
            ss = data_package.split_shape
            sp = data_package.split_position
            dp = data_list[u][ss][sp] # it must be fixed to data_package latter
            memory_type = dp.memory_type
            if memory_type == 'devptr':
                cuda_args.append(dp.devptr)
                data_range = dp.data_range
                full_data_range = dp.full_data_range
                buffer_range = dp.buffer_range
                if False:
                    print "DATA_NAME", data_name
                    print "DATA_RANGE", data_range
                    print "FULL_DATA_RANGE", full_data_range
                    print "BUFFER_RANGE", buffer_range
                    print "DATA_HALO", dp.data_halo
                    print "BUFFER_HALO", dp.buffer_halo
                    print dp
                    print_devptr(dp.devptr, dp)
                ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range, data_halo=dp.data_halo, buffer_halo=dp.buffer_halo, stream=stream)
                cuda_args.append(ad)
                FD.append(ad)

    # log("function cuda name %s"%(function_name),'detail',log_type)
    # if function_name in func_dict:
    #     func = func_dict[function_name]
    # else:

    # set modelview matrix
    cuda.memcpy_htod_async(mmtx, fp.mmtx.reshape(16), stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, fp.inv_mmtx.reshape(16), stream=stream)

    try:
        if Debug:
            print "Function name: ", function_name
        func = mod.get_function(function_name.strip())
    except:
        print "Function not found ERROR"
        print "Function name: " + function_name
        assert(False)

    stream_list[0].synchronize()
    if log_type in ['time','all']:
        start = time.time()

    kernel_finish = cuda.Event()
    func( *cuda_args, block=block, grid=grid, stream=stream_list[0])
    kernel_finish.record(stream=stream_list[0])
    """
    try:
        a = numpy.empty((30,30),dtype=numpy.int32)
        cuda.memcpy_dtoh(a, cuda_args[0])
        print a[10:-10,10:-10]
    except:
        print "Fail", function_name
        print "Fp.output", fp.output
        pass
    """
    # Register the completion event for this output's (id, shape, position).
    u = func_output.unique_id
    ss = func_output.split_shape
    sp = func_output.split_position
    target = (u,ss,sp)
    Event_dict[target] = kernel_finish
    if target not in valid_list:
        valid_list.append(target)

    #################################################################################
    # finish
    if log_type in ['time','all']:
        t = (time.time() - start)
        ms = 1000*t
        log("rank%d, %s, \"%s\", u=%d, GPU%d function running,,, time: %.3f ms "%(rank, func_output.data_name, function_name, u, device_number, ms),'time',log_type)
    #log("rank%d, \"%s\", GPU%d function finish "%(rank, function_name, device_number),'general',log_type)
    ###################################################################################
    # decrease retain_count
    for data_package in args:
        u = data_package.unique_id
        if u != -1:
            mem_release(data_package)
    # print "Release", time.time()
    return devptr, output_package