Example #1
 def __init__(self,
              volume,
              segmentation,
              voxelsize,
              origin=[0.0, 0.0, 0.0],
              stepsize=0.1,
              mode="linear"):
     #generate kernels
     self.mod = self.generateKernelModuleProjector()
     self.projKernel = self.mod.get_function("projectKernel")
     self.volumesize = volume.shape
     self.volume = np.moveaxis(volume, [0, 1, 2], [2, 1, 0]).copy()
     self.segmentation = np.moveaxis(segmentation.astype(np.float32),
                                     [0, 1, 2], [2, 1, 0]).copy()
     # print("done swap")
     self.volume_gpu = cuda.np_to_array(self.volume, order='C')
     self.texref_volume = self.mod.get_texref("tex_density")
     cuda.bind_array_to_texref(self.volume_gpu, self.texref_volume)
     self.segmentation_gpu = cuda.np_to_array(self.segmentation, order='C')
     self.texref_segmentation = self.mod.get_texref("tex_segmentation")
     cuda.bind_array_to_texref(self.segmentation_gpu,
                               self.texref_segmentation)
     if mode == "linear":
         self.texref_volume.set_filter_mode(cuda.filter_mode.LINEAR)
         self.texref_segmentation.set_filter_mode(cuda.filter_mode.LINEAR)
     self.voxelsize = voxelsize
     self.stepsize = np.float32(stepsize)
     self.origin = origin
     self.initialized = False
     print("initialized projector")
    def __init__(self, balljoint, texture):
        self.balljoint = balljoint
        self.tex = texture
        self.interpol = self.mod.get_function("MagneticFieldInterpolateKernel")
        self.texref = self.mod.get_texref('tex')
        drv.bind_array_to_texref(
            drv.make_multichannel_2d_array(self.tex, order="C"), self.texref)
        self.texref.set_flags(drv.TRSF_NORMALIZED_COORDINATES)
        self.texref.set_filter_mode(drv.filter_mode.LINEAR)
        self.texref.set_address_mode(0, drv.address_mode.WRAP)
        self.texref.set_address_mode(1, drv.address_mode.CLAMP)
        self.sensor_pos = balljoint.config['sensor_pos']
        self.number_of_sensors = len(self.sensor_pos)
        self.input = np.zeros((self.number_of_sensors, 3),
                              dtype=np.float32,
                              order='C')
        self.output = np.zeros((self.number_of_sensors, 3),
                               dtype=np.float32,
                               order='C')
        self.b_target = np.zeros((self.number_of_sensors, 3),
                                 dtype=np.float32,
                                 order='C')

        self.bdim = (16, 16, 1)
        dx, mx = divmod(self.number_of_sensors, self.bdim[0])
        dy, my = divmod(self.number_of_sensors, self.bdim[1])
        self.gdim = (int((dx + (mx > 0))), int((dy + (my > 0))))
        rospy.init_node('BallJointPoseestimator', anonymous=True)
        self.joint_state = rospy.Publisher('/external_joint_states',
                                           sensor_msgs.msg.JointState,
                                           queue_size=1)
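
The divmod arithmetic above simply rounds the grid size up so that every sensor is covered by a 16x16 block. The same ceil-division appears implicitly in several later examples; a small helper expressing it (the names here are illustrative, not from the original code):

def blocks_needed(n_items, block_dim):
    """Smallest grid dimension covering n_items threads with blocks of block_dim."""
    return int((n_items + block_dim - 1) // block_dim)

number_of_sensors = 14  # illustrative
bdim = (16, 16, 1)
gdim = (blocks_needed(number_of_sensors, bdim[0]),
        blocks_needed(number_of_sensors, bdim[1]))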
Example #3
    def test_multichannel_2d_texture(self):
        mod = SourceModule("""
        #define CHANNELS 4
        texture<float4, 2, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int row = threadIdx.x;
          int col = threadIdx.y;
          int w = blockDim.y;
          float4 texval = tex2D(mtx_tex, row, col);
          dest[(row*w+col)*CHANNELS + 0] = texval.x;
          dest[(row*w+col)*CHANNELS + 1] = texval.y;
          dest[(row*w+col)*CHANNELS + 2] = texval.z;
          dest[(row*w+col)*CHANNELS + 3] = texval.w;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        shape = (5, 6)
        channels = 4
        a = np.asarray(np.random.randn(*((channels, ) + shape)),
                       dtype=np.float32,
                       order="F")
        drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"),
                                 mtx_tex)

        dest = np.zeros(shape + (channels, ), dtype=np.float32)
        copy_texture(drv.Out(dest), block=shape + (1, ), texrefs=[mtx_tex])
        reshaped_a = a.transpose(1, 2, 0)
        #print reshaped_a
        #print dest
        assert la.norm(dest - reshaped_a) == 0
Example #4
    def test_multichannel_2d_texture(self):
        mod = SourceModule(
            """
        #define CHANNELS 4
        texture<float4, 2, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int row = threadIdx.x;
          int col = threadIdx.y;
          int w = blockDim.y;
          float4 texval = tex2D(mtx_tex, row, col);
          dest[(row*w+col)*CHANNELS + 0] = texval.x;
          dest[(row*w+col)*CHANNELS + 1] = texval.y;
          dest[(row*w+col)*CHANNELS + 2] = texval.z;
          dest[(row*w+col)*CHANNELS + 3] = texval.w;
        }
        """
        )

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        shape = (5, 6)
        channels = 4
        a = np.asarray(np.random.randn(*((channels,) + shape)), dtype=np.float32, order="F")
        drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex)

        dest = np.zeros(shape + (channels,), dtype=np.float32)
        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
        reshaped_a = a.transpose(1, 2, 0)
        # print reshaped_a
        # print dest
        assert la.norm(dest - reshaped_a) == 0
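
The two tests above pin down the layout that make_multichannel_2d_array expects: with order="F" the channel axis comes first, which is why the result is compared against a.transpose(1, 2, 0); with order="C" (used in most examples below) the channel axis comes last, as (height, width, channels). Since a float4 texture always carries four channels, 3-channel RGB data has to be padded first. A small sketch of that preparation (the image array here is made up):

rgb = np.zeros((480, 640, 3), dtype=np.uint8)                  # hypothetical 8-bit RGB image
rgba = np.zeros(rgb.shape[:2] + (4,), dtype=np.float32)        # float4 textures need a 4th (unused) channel
rgba[:, :, :3] = rgb
rgba_array = drv.make_multichannel_2d_array(rgba, order="C")   # (height, width, channels)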
Example #5
def create_2d_rgba_texture(a, module, variable, point_sampling=False):
    a = numpy.ascontiguousarray(a)
    out_texref = module.get_texref(variable)
    cuda.bind_array_to_texref(
        cuda.make_multichannel_2d_array(a, order='C'), out_texref)    
    if point_sampling: out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else: out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
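
A possible call site for the helper above, assuming `module` was built with SourceModule and declares something like `texture<float4, 2, cudaReadModeElementType> sprite_tex;` (the names are illustrative):

rgba = numpy.random.rand(128, 128, 4).astype(numpy.float32)   # (height, width, 4) for order='C'
texref = create_2d_rgba_texture(rgba, module, 'sprite_tex', point_sampling=True)
# as the PyCUDA tests above do, pass texrefs=[texref] to the kernel launch that samples it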
Example #7
    def initialize(self):
        """Allocate GPU memory and transfer the volume, segmentations to GPU."""
        if self.initialized:
            raise RuntimeError("Close projector before initializing again.")

        # allocate and transfer volume texture to GPU
        # TODO: this axis-swap is messy and actually may be messing things up. Maybe use a FrameTransform in the Volume class instead?
        volume = self.volume.data
        volume = np.moveaxis(volume, [0, 1, 2], [2, 1, 0]).copy() # TODO: is this axis swap necessary?
        self.volume_gpu = cuda.np_to_array(volume, order='C')
        self.volume_texref = self.mod.get_texref("volume")
        cuda.bind_array_to_texref(self.volume_gpu, self.volume_texref)
        
        # set the (interpolation?) mode
        if self.mode == 'linear':
            self.volume_texref.set_filter_mode(cuda.filter_mode.LINEAR)
        else:
            raise RuntimeError

        # allocate and transfer segmentation texture to GPU
        # TODO: remove axis swap?
        # self.segmentations_gpu = [cuda.np_to_array(seg, order='C') for mat, seg in self.volume.materials.items()]
        self.segmentations_gpu = [cuda.np_to_array(np.moveaxis(seg, [0, 1, 2], [2, 1, 0]).copy(), order='C') for mat, seg in self.volume.materials.items()]
        self.segmentations_texref = [self.mod.get_texref(f"seg_{m}") for m, _ in enumerate(self.volume.materials)]
        for seg, texref in zip(self.segmentations_gpu, self.segmentations_texref):
            cuda.bind_array_to_texref(seg, texref)
            if self.mode == 'linear':
                texref.set_filter_mode(cuda.filter_mode.LINEAR)
            else:
                raise RuntimeError

        # allocate output image array on GPU (4 bytes to a float32)
        self.output_gpu = cuda.mem_alloc(self.output_size * 4)

        # allocate ijk_from_index matrix array on GPU (3x3 array x 4 bytes per float32)
        self.rt_kinv_gpu = cuda.mem_alloc(3 * 3 * 4)
        
        # Mark self as initialized.
        self.initialized = True
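
The two mem_alloc buffers above are raw device allocations rather than textures, so before a launch the projection matrix has to be copied in with memcpy_htod and the result copied back with memcpy_dtoh. A hedged sketch of that round trip inside some later method of the same class (rt_kinv and the output shape are assumptions; the actual projection code is not shown here):

rt_kinv = np.eye(3, dtype=np.float32)                 # hypothetical 3x3 ijk_from_index matrix
cuda.memcpy_htod(self.rt_kinv_gpu, rt_kinv.ravel())   # fills the 3*3*4-byte buffer allocated above
# ... launch the projection kernel here ...
image = np.empty(self.sensor_size, dtype=np.float32)  # hypothetical (height, width) output shape
cuda.memcpy_dtoh(image, self.output_gpu)              # read the projected image back to the host

Example #8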
def run_function(function_package):
	# global variables
	global FD
	global tb_cnt
	
	# initialize variables
	fp = function_package
	func_output = fp.output
	u = func_output.unique_id
	ss = func_output.split_shape
	sp = func_output.split_position
	data_halo = func_output.data_halo
	function_name = fp.function_name

	args = fp.function_args
	work_range = fp.work_range

	tb_cnt = 0
	
	stream = stream_list[0]

#	cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
#	cuda.memcpy_htod_async(front_back, numpy.int32(fp.front_back), stream=stream)
	if fp.update_tf == 1:
		tf.set_filter_mode(cuda.filter_mode.LINEAR)
		cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1,256,4), order='C'), tf)
		cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
	if fp.update_tf2 == 1:
		tf1.set_filter_mode(cuda.filter_mode.LINEAR)
		cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1,256,4), order='C'), tf1)
		cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
	
	cuda_args = []

	data_exist = True
	if u not in data_list: data_exist = False
	elif ss not in data_list[u]: data_exist = False
	elif sp not in data_list[u][ss]: data_exist = False

	if data_exist:
		# initialize variables
		data_package = data_list[u][ss][sp]
		dp = data_package

		if dp.devptr == None:
			wait_data_arrive(data_package, stream=stream)
		###########################
		devptr = dp.devptr
		output_range = dp.data_range
		full_output_range = dp.full_data_range
		ad = data_range_to_cuda_in(output_range, full_output_range, stream=stream)
		cuda_args += [ad]
		output_package = dp
		
		FD.append(ad)
	else:
		bytes = func_output.buffer_bytes
		devptr, usage = malloc_with_swap_out(bytes)
		log("created output data bytes %s"%(str(func_output.buffer_bytes)),'detail',log_type)
		data_range = func_output.data_range
		full_data_range = func_output.full_data_range
		buffer_range = func_output.buffer_range
		buffer_halo = func_output.buffer_halo
		ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range, buffer_halo=buffer_halo, stream=stream)

		cuda_args += [ad]
		output_package = func_output
		output_package.buffer_bytes = usage
		
		if False:
			print "OUTPUT"
			print "OUTPUT_RANGE", data_range
			print "OUTPUT_FULL_RANGE",  full_data_range
			
		FD.append(ad)

	# set work range
	block, grid = range_to_block_grid(work_range)
	# set block and grid
#	log("work_range "+str(work_range),'detail',log_type)
#	log("block %s grid %s"%(str(block),str(grid)),'detail',log_type)

	cuda_args = [devptr] + cuda_args

#	print "GPU", rank, "BEFORE RECV", time.time()
	# Recv data from other process
	for data_package in args:
		u = data_package.unique_id
		data_name = data_package.data_name
		if data_name not in work_range and u != -1:
			wait_data_arrive(data_package, stream=stream)

#	print "GPU", rank, "Recv Done", time.time()
	# set cuda arguments 
	for data_package in args:
		data_name = data_package.data_name
		data_dtype = data_package.data_dtype
		data_contents_dtype = data_package.data_contents_dtype

		u = data_package.unique_id

		if data_name in work_range:
			cuda_args.append( numpy.int32(work_range[data_name][0]))
			cuda_args.append( numpy.int32(work_range[data_name][1]))

		elif u == -1:
			data = data_package.data
			dtype = type(data)

			if dtype in [int]: data = numpy.float32(data)
			if dtype in [float]: data = numpy.float32(data)

			cuda_args.append(numpy.float32(data)) # temp
		else:
			ss = data_package.split_shape
			sp = data_package.split_position
			dp = data_list[u][ss][sp] # it must be fixed to data_package latter

			memory_type = dp.memory_type
			if memory_type == 'devptr':

				cuda_args.append(dp.devptr)
				data_range = dp.data_range
				full_data_range = dp.full_data_range
				buffer_range = dp.buffer_range
				
				if False:
					print "DATA_NAME", data_name
					print "DATA_RANGE", data_range
					print "FULL_DATA_RANGE", full_data_range
					print "BUFFER_RANGE", buffer_range
					print "DATA_HALO", dp.data_halo
					print "BUFFER_HALO", dp.buffer_halo
					print dp
	
					print_devptr(dp.devptr, dp)
				ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range, data_halo=dp.data_halo, buffer_halo=dp.buffer_halo, stream=stream)

				cuda_args.append(ad)
				FD.append(ad)
		
#	log("function cuda name %s"%(function_name),'detail',log_type)

#	if function_name in func_dict:
#		func = func_dict[function_name]
#	else:


	# set modelview matrix
	cuda.memcpy_htod_async(mmtx, fp.mmtx.reshape(16), stream=stream)
	cuda.memcpy_htod_async(inv_mmtx, fp.inv_mmtx.reshape(16), stream=stream)

	try:
		if Debug:
			print "Function name: ", function_name
		func = mod.get_function(function_name.strip())
	except:
		print "Function not found ERROR"
		print "Function name: " + function_name
		assert(False)
	
	stream_list[0].synchronize()

	if log_type in ['time','all']:
		start = time.time()

	kernel_finish = cuda.Event()
	func( *cuda_args, block=block, grid=grid, stream=stream_list[0])
	kernel_finish.record(stream=stream_list[0])

	"""
	try:
		a = numpy.empty((30,30),dtype=numpy.int32)
		cuda.memcpy_dtoh(a, cuda_args[0])
		print a[10:-10,10:-10]
	except:
		print "Fail", function_name
		print "Fp.output", fp.output
		pass
	"""
	
	u = func_output.unique_id
	ss = func_output.split_shape
	sp = func_output.split_position

	target = (u,ss,sp)

	Event_dict[target] = kernel_finish
	if target not in valid_list:
		valid_list.append(target)

	#################################################################################
	# finish
	if log_type in ['time','all']:
		t = (time.time() - start)
		ms = 1000*t

		log("rank%d, %s,  \"%s\", u=%d, GPU%d function running,,, time: %.3f ms "%(rank, func_output.data_name, function_name, u, device_number,  ms),'time',log_type)
	#log("rank%d, \"%s\", GPU%d function finish "%(rank, function_name, device_number),'general',log_type)

	###################################################################################
	# decrease retain_count
	for data_package in args:
		u = data_package.unique_id
		if u != -1:
			mem_release(data_package)

#	print "Release", time.time()
	return devptr, output_package
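
The update_tf branches above rebuild a transfer-function lookup table as a 1x256 RGBA texture. Pulled out of context, the binding step is just the following (tf stands for a texture reference fetched earlier with mod.get_texref, as Example #15 below does explicitly):

trans_tex = numpy.zeros((256, 4), dtype=numpy.float32)      # hypothetical RGBA transfer function
tf.set_filter_mode(cuda.filter_mode.LINEAR)
cuda.bind_array_to_texref(
    cuda.make_multichannel_2d_array(trans_tex.reshape(1, 256, 4), order='C'), tf)

Example #9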
    def __init__(self, img_path):
        super(LFapplication, self).__init__()

        #
        # Load image data
        #
        base_path = os.path.splitext(img_path)[0]
        lenslet_path = base_path + '-lenslet.txt'
        optics_path = base_path + '-optics.txt'

        with open(lenslet_path, 'r') as f:
            tmp = eval(f.readline())
            x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \
                     np.array(tmp, dtype=np.float32)

        with open(optics_path, 'r') as f:
            for line in f:
                name, val = line.strip().split()
                try:
                    setattr(self, name, np.float32(val))
                except:
                    pass

        max_angle = math.atan(self.pitch / 2 / self.flen)

        #
        # Prepare image
        #
        im_pil = Image.open(img_path)
        if im_pil.mode == 'RGB':
            self.NCHANNELS = 3
            w, h = im_pil.size
            im = np.zeros((h, w, 4), dtype=np.float32)
            im[:, :, :3] = np.array(im_pil).astype(np.float32)
            self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx), 3)
        else:
            self.NCHANNELS = 1
            im = np.array(im_pil.getdata()).reshape(im_pil.size[::-1]).astype(
                np.float32)
            h, w = im.shape
            self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx))

        x_start = x_offset - int(x_offset / right_dx) * right_dx
        y_start = y_offset - int(y_offset / down_dy) * down_dy
        x_ratio = self.flen * right_dx / self.pitch
        y_ratio = self.flen * down_dy / self.pitch

        #
        # Generate the cuda kernel
        #
        mod_LFview = pycuda.compiler.SourceModule(
            _kernel_tpl.render(newiw=self.LF_dim[1],
                               newih=self.LF_dim[0],
                               oldiw=w,
                               oldih=h,
                               x_start=x_start,
                               y_start=y_start,
                               x_ratio=x_ratio,
                               y_ratio=y_ratio,
                               x_step=right_dx,
                               y_step=down_dy,
                               NCHANNELS=self.NCHANNELS))

        self.LFview_func = mod_LFview.get_function("LFview_kernel")
        self.texref = mod_LFview.get_texref("tex")

        #
        # Now generate the cuda texture
        #
        if self.NCHANNELS == 3:
            cuda.bind_array_to_texref(
                cuda.make_multichannel_2d_array(im, order="C"), self.texref)
        else:
            cuda.matrix_to_texref(im, self.texref, order="C")

        #
        # We could set the next if we wanted to address the image
        # in normalized coordinates ( 0 <= coordinate < 1.)
        # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        #
        self.texref.set_filter_mode(cuda.filter_mode.LINEAR)

        #
        # Prepare the traits
        #
        self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0))
        self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0))

        self.plotdata = ArrayPlotData(LF_img=self.sampleLF())
        self.LF_img = Plot(self.plotdata)
        if self.NCHANNELS == 3:
            self.LF_img.img_plot("LF_img")
        else:
            self.LF_img.img_plot("LF_img", colormap=gray)
Example #11
def watershed(I, mask=None):
    kernel_source = open("Dwatershed.cu").read()
    main_module = nvcc.SourceModule(kernel_source)
    descent_kernel = main_module.get_function("descent_kernel")
    stabilize_kernel = main_module.get_function("stabilize_kernel")
    image_texture = main_module.get_texref("img")
    plateau_kernel = main_module.get_function("plateau_kernel")
    minima_kernel = main_module.get_function("minima_kernel")
    flood_kernel = main_module.get_function("flood_kernel")
    increment_kernel = main_module.get_function("increment_kernel")

    # Get contiguous image + shape.
    height, width, depth = I.shape
    I = np.float32(I.copy())
    if mask is None:
        mask = np.ones(I.shape)
    mask = np.int32(mask)

    # Get block/grid size for steps 1-3.
    block_size = (8, 8, 8)
    grid_size = (width / (block_size[0] - 2) + 1,
                 height / (block_size[0] - 2) + 1,
                 depth / (block_size[0] - 2) + 1)

    # # Get block/grid size for step 4.
    # block_size2 = (10,10,10)
    # grid_size2  = (width/(block_size2[0]-2)+1,
    #               height/(block_size2[0]-2)+1,
    #               depth/(block_size2[0]-2)+1)

    # Initialize variables.
    labeled = np.zeros([height, width, depth])
    labeled = np.float64(labeled)
    width = np.int32(width)
    height = np.int32(height)
    depth = np.int32(depth)
    count = np.int32([0])

    # Transfer labels asynchronously.
    labeled_d = gpu.to_gpu_async(labeled)
    counters_d = gpu.to_gpu_async(count)
    # mask_d = cu.np_to_array( mask, order='C' )
    # cu.bind_array_to_texref(mask_d, mask_texture)
    # Bind CUDA textures.
    #I_cu = cu.matrix_to_array(I, order='C')
    I_cu = cu.np_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    # Step 1.
    descent_kernel(labeled_d,
                   width,
                   height,
                   depth,
                   block=block_size,
                   grid=grid_size)
    start_time = cu.Event()
    end_time = cu.Event()
    start_time.record()

    counters_d = gpu.to_gpu(np.int32([0]))
    #counters_d = gpu.to_gpu_async(np.int32([0]))
    old, new = -1, -2
    it = 0
    while old != new:
        it += 1
        old = new
        plateau_kernel(labeled_d,
                       counters_d,
                       width,
                       height,
                       depth,
                       block=block_size,
                       grid=grid_size)
        new = counters_d.get()[0]
    print 'plateau kernel', it - 2

    # Step 2.
    increment_kernel(labeled_d,
                     width,
                     height,
                     depth,
                     block=block_size,
                     grid=grid_size)

    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    it = 0

    while old != new:
        it += 1
        old = new
        minima_kernel(labeled_d,
                      counters_d,
                      width,
                      height,
                      depth,
                      block=block_size,
                      grid=grid_size)
        new = counters_d.get()[0]
    print 'minima kernel', it - 2

    # Step 3.
    # counters_d = gpu.to_gpu(np.int32([0]))
    # old, new = -1, -2; it = 0
    # while old != new:
    #   it +=1
    #   old = new
    #   plateau_kernel(labeled_d, counters_d, width,
    #   height, depth, block=block_size, grid=grid_size)
    #   new = counters_d.get()[0]
    # print 'plateau kernel', it-2

    # Step 4
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    it = 0
    while old != new:
        it += 1
        old = new
        flood_kernel(labeled_d,
                     counters_d,
                     width,
                     height,
                     depth,
                     block=block_size,
                     grid=grid_size)
        new = counters_d.get()[0]
    print 'flood kernel', it - 2

    labels = labeled_d.get()
    labels = labels * mask

    # End GPU timers.
    end_time.record()
    end_time.synchronize()
    gpu_time = start_time.time_till(end_time) * 1e-3

    # print str(gpu_time)
    #cu.DeviceAllocation.free(counters_d)
    del counters_d

    return labels
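
This script (like several below) is Python 2: print statements, and integer '/' in the grid-size arithmetic. Under Python 3 the division would need to be made explicit, roughly as follows (a sketch, keeping the cubic 8x8x8 block used above):

block_size = (8, 8, 8)
grid_size = (width // (block_size[0] - 2) + 1,
             height // (block_size[0] - 2) + 1,
             depth // (block_size[0] - 2) + 1)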
Example #12
def init():
	"""outputs the high resolution k-box, and the smoothed r box"""
	N = np.int32(DIM) #prepare for stitching
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM;
	scale = np.float32(BOX_LEN)/DIM
	HII_scale = np.float32(BOX_LEN)/HII_DIM
	shape = (N,N,N)
	
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)

	kernel_source = open(cmd_folder+"/initialize.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'VOLUME': VOLUME,
		'DIM': DIM
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_kernel = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	adj_complex_conj = main_module.get_function("adj_complex_conj")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")

	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	init_kernel(largebox_d, np.int32(DIM), block=block_size, grid=grid_size)

	#import IPython; IPython.embed()
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	init_kernel(largebox_d_imag, np.int32(DIM), block=block_size, grid=grid_size)

	largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
	largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
	largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag

	#adj_complex_conj(largebox_d, DIM, block=block_size, grid=grid_size)
	largebox = largebox_d.get()
	#np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox)

	#save real space box before smoothing
	plan = Plan(shape, dtype=np.complex64)
	plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	largebox_d /= scale**3
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox_d.real.get_async())

	#save real space box after smoothing and subsampling
	# host largebox is still in k space, no need to reload from disk
	largebox_d = gpuarray.to_gpu(largebox)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	HII_filter(largebox_d, N, ZERO, smoothR, block=block_size, grid=grid_size);
	plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	largebox_d /= scale**3
	smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
	subsample_kernel(largebox_d.real, smallbox_d, N, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_grid_size) #subsample in real space
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), smallbox_d.get_async())

	# reload the k-space box for velocity boxes
	largebox_d = gpuarray.to_gpu(largebox)
	
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,DIM), dtype=np.complex64)
	smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(num), block=block_size, grid=grid_size)
		HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=grid_size)
		plan.execute(largevbox_d, inverse=True)
		largevbox_d /= scale**3
		#import IPython; IPython.embed()
		subsample_kernel(largevbox_d.real, smallbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_grid_size)
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallbox_d.get())

	return
Example #13
def init_stitch(N):
	"""outputs the high resolution k-box, and the smoothed r box

	Input
	-----------
	N:  int32
		size of box to load onto the GPU, should be related to DIM by powers of 2

	"""
	if N is None:
		N = np.int32(HII_DIM) #prepare for stitching
	META_GRID_SIZE = DIM/N
	M = np.int32(HII_DIM/META_GRID_SIZE)
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM;
	scale = np.float32(BOX_LEN/DIM)
	print 'scale', scale
	HII_scale = np.float32(BOX_LEN/HII_DIM)
	shape = (DIM,DIM,N)
	stitch_grid_size = (DIM/(block_size[0]),
						DIM/(block_size[0]),
						N/(block_size[0]))
	HII_stitch_grid_size = (HII_DIM/(block_size[0]),
						HII_DIM/(block_size[0]),
						M/(block_size[0]))
	#ratio of large box to small size
	kernel_source = open(cmd_folder+"/initialize_stitch.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'DIM': DIM, 
		'VOLUME': VOLUME,
		'META_BLOCKDIM': N
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_stitch = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)
	plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64)
	plan1d = Plan((np.int64(DIM)), dtype=np.complex64)
	print "init pspec"
	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)
	#hbox_large = pyfftw.empty_aligned((DIM, DIM, DIM), dtype='complex64')
	hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64)
	hbox_small = np.zeros(HII_shape, dtype=np.float32)  # filled in the downsampling loop below
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)

	# Set up pinned memory for transfer
	#largebox_hs = cuda.aligned_empty(shape=shape, dtype=np.float32, alignment=resource.getpagesize())
	largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32)
	largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	print "init boxes"
	for meta_z in xrange(META_GRID_SIZE):
		# MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=meta_x*N**3)
		init_stitch(largebox_d, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		init_stitch(largebox_d_imag, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag
		cuda.memcpy_dtoh_async(largecbox_pin, largebox_d)
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin.copy()
	#if want to get velocity need to use this
	if True:
		print "saving kbox"
		np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)

	print "Executing FFT on device"
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print hbox_large.dtype
	print "Finished FFT on device"
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)
	
	if True:
		print "loading kbox"
		hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	for meta_z in xrange(META_GRID_SIZE):
		largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()
		#cuda.memcpy_htod_async(largebox_d, largebox_pin)
		largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size);
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async()
	#import IPython; IPython.embed()
	print "Executing FFT on host"
	#hbox_large = hifft(hbox_large).astype(np.complex64).real
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print "Finished FFT on host"
	#import IPython; IPython.embed()

	# for meta_x in xrange(META_GRID_SIZE):
	# 	for meta_y in xrange(META_GRID_SIZE):
	# 		for meta_z in xrange(META_GRID_SIZE):
	# 			largebox_d = gpuarray.to_gpu(hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N])
	# 			HII_filter(largebox_d, N, np.int32(meta_x), np.int32(meta_y), np.int32(meta_z), ZERO, smoothR, block=block_size, grid=grid_size);
	# 			hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N] = largebox_d.get()
	#plan = Plan(shape, dtype=np.complex64)
	#plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)


	# This saves a large resolution deltax

	
	print "downsampling"
	smallbox_d = gpuarray.zeros((HII_DIM,HII_DIM,M), dtype=np.float32)
	for meta_z in xrange(META_GRID_SIZE):
		largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()
		cuda.memcpy_dtoh_async(largecbox_pin, largebox_d)
		#largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		largebox_d /= scale**3 #
		subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) #subsample in real space
		hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async()
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), hbox_small)
	#import IPython; IPython.embed()


	# To get velocities: reload the k-space box
	hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.float32)
	hvbox_small = np.zeros(HII_shape, dtype=np.float32)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,N), dtype=np.complex64)
	smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		for meta_z in xrange(META_GRID_SIZE):
			largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
			velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num), block=block_size, grid=stitch_grid_size)
			HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=stitch_grid_size)
			print hvbox_large.shape, largevbox_d.shape
			hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async()
		hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real
		for meta_z in xrange(META_GRID_SIZE):
			largevbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			subsample_kernel(largevbox_d.real, smallvbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size)
			hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async()
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallvbox_d.get())

	return
Example #14
def run_function(function_package):
    # global variables
    global FD
    global tb_cnt

    # initialize variables
    fp = function_package
    func_output = fp.output
    u = func_output.unique_id
    ss = func_output.split_shape
    sp = func_output.split_position
    data_halo = func_output.data_halo
    function_name = fp.function_name

    args = fp.function_args
    work_range = fp.work_range

    tb_cnt = 0

    stream = stream_list[0]

    #	cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
    #	cuda.memcpy_htod_async(front_back, numpy.int32(fp.front_back), stream=stream)
    if fp.update_tf == 1:
        tf.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1, 256, 4),
                                            order='C'), tf)
        cuda.memcpy_htod_async(bandwidth,
                               numpy.float32(fp.TF_bandwidth),
                               stream=stream)
    if fp.update_tf2 == 1:
        tf1.set_filter_mode(cuda.filter_mode.LINEAR)
        cuda.bind_array_to_texref(
            cuda.make_multichannel_2d_array(fp.trans_tex.reshape(1, 256, 4),
                                            order='C'), tf1)
        cuda.memcpy_htod_async(bandwidth,
                               numpy.float32(fp.TF_bandwidth),
                               stream=stream)

    cuda_args = []

    data_exist = True
    if u not in data_list: data_exist = False
    elif ss not in data_list[u]: data_exist = False
    elif sp not in data_list[u][ss]: data_exist = False

    if data_exist:
        # initialize variables
        data_package = data_list[u][ss][sp]
        dp = data_package

        if dp.devptr == None:
            wait_data_arrive(data_package, stream=stream)
        ###########################
        devptr = dp.devptr
        output_range = dp.data_range
        full_output_range = dp.full_data_range
        ad = data_range_to_cuda_in(output_range,
                                   full_output_range,
                                   stream=stream)
        cuda_args += [ad]
        output_package = dp

        FD.append(ad)
    else:
        bytes = func_output.buffer_bytes
        devptr, usage = malloc_with_swap_out(bytes)
        log("created output data bytes %s" % (str(func_output.buffer_bytes)),
            'detail', log_type)
        data_range = func_output.data_range
        full_data_range = func_output.full_data_range
        buffer_range = func_output.buffer_range
        buffer_halo = func_output.buffer_halo
        ad = data_range_to_cuda_in(data_range,
                                   full_data_range,
                                   buffer_range,
                                   buffer_halo=buffer_halo,
                                   stream=stream)

        cuda_args += [ad]
        output_package = func_output
        output_package.buffer_bytes = usage

        if False:
            print "OUTPUT"
            print "OUTPUT_RANGE", data_range
            print "OUTPUT_FULL_RANGE", full_data_range

        FD.append(ad)

    # set work range
    block, grid = range_to_block_grid(work_range)
    # set block and grid
    #	log("work_range "+str(work_range),'detail',log_type)
    #	log("block %s grid %s"%(str(block),str(grid)),'detail',log_type)

    cuda_args = [devptr] + cuda_args

    #	print "GPU", rank, "BEFORE RECV", time.time()
    # Recv data from other process
    for data_package in args:
        u = data_package.unique_id
        data_name = data_package.data_name
        if data_name not in work_range and u != -1:
            wait_data_arrive(data_package, stream=stream)

#	print "GPU", rank, "Recv Done", time.time()
# set cuda arguments
    for data_package in args:
        data_name = data_package.data_name
        data_dtype = data_package.data_dtype
        data_contents_dtype = data_package.data_contents_dtype

        u = data_package.unique_id

        if data_name in work_range:
            cuda_args.append(numpy.int32(work_range[data_name][0]))
            cuda_args.append(numpy.int32(work_range[data_name][1]))

        elif u == -1:
            data = data_package.data
            dtype = type(data)

            if dtype in [int]: data = numpy.float32(data)
            if dtype in [float]: data = numpy.float32(data)

            cuda_args.append(numpy.float32(data))  # temp
        else:
            ss = data_package.split_shape
            sp = data_package.split_position
            dp = data_list[u][ss][
                sp]  # it must be fixed to data_package latter

            memory_type = dp.memory_type
            if memory_type == 'devptr':

                cuda_args.append(dp.devptr)
                data_range = dp.data_range
                full_data_range = dp.full_data_range
                buffer_range = dp.buffer_range

                if False:
                    print "DATA_NAME", data_name
                    print "DATA_RANGE", data_range
                    print "FULL_DATA_RANGE", full_data_range
                    print "BUFFER_RANGE", buffer_range
                    print "DATA_HALO", dp.data_halo
                    print "BUFFER_HALO", dp.buffer_halo
                    print dp

                    print_devptr(dp.devptr, dp)
                ad = data_range_to_cuda_in(data_range,
                                           full_data_range,
                                           buffer_range,
                                           data_halo=dp.data_halo,
                                           buffer_halo=dp.buffer_halo,
                                           stream=stream)

                cuda_args.append(ad)
                FD.append(ad)

#	log("function cuda name %s"%(function_name),'detail',log_type)

#	if function_name in func_dict:
#		func = func_dict[function_name]
#	else:

# set modelview matrix
    cuda.memcpy_htod_async(mmtx, fp.mmtx.reshape(16), stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, fp.inv_mmtx.reshape(16), stream=stream)

    try:
        if Debug:
            print "Function name: ", function_name
        func = mod.get_function(function_name.strip())
    except:
        print "Function not found ERROR"
        print "Function name: " + function_name
        assert (False)

    stream_list[0].synchronize()

    if log_type in ['time', 'all']:
        start = time.time()

    kernel_finish = cuda.Event()
    func(*cuda_args, block=block, grid=grid, stream=stream_list[0])
    kernel_finish.record(stream=stream_list[0])
    """
	try:
		a = numpy.empty((30,30),dtype=numpy.int32)
		cuda.memcpy_dtoh(a, cuda_args[0])
		print a[10:-10,10:-10]
	except:
		print "Fail", function_name
		print "Fp.output", fp.output
		pass
	"""

    u = func_output.unique_id
    ss = func_output.split_shape
    sp = func_output.split_position

    target = (u, ss, sp)

    Event_dict[target] = kernel_finish
    if target not in valid_list:
        valid_list.append(target)

    #################################################################################
    # finish
    if log_type in ['time', 'all']:
        t = (time.time() - start)
        ms = 1000 * t

        log(
            "rank%d, %s,  \"%s\", u=%d, GPU%d function running,,, time: %.3f ms "
            %
            (rank, func_output.data_name, function_name, u, device_number, ms),
            'time', log_type)
    #log("rank%d, \"%s\", GPU%d function finish "%(rank, function_name, device_number),'general',log_type)

    ###################################################################################
    # decrease retain_count
    for data_package in args:
        u = data_package.unique_id
        if u != -1:
            mem_release(data_package)

#	print "Release", time.time()
    return devptr, output_package
Example #15
def run_function(function_package, function_name):
	# global variables
	global FD
	
	# initialize variables
	fp = function_package
	func_output = fp.output
	u, ss, sp = func_output.get_id()
	data_halo = func_output.data_halo
	
	args = fp.function_args
	work_range = fp.work_range

	stream = stream_list[0]
	mod = source_module_dict[function_name]

#	cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
#	cuda.memcpy_htod_async(front_back, numpy.int32(fp.front_back), stream=stream)

	tf = mod.get_texref('TFF')	
	tf1 = mod.get_texref('TFF1')
	bandwidth,_ = mod.get_global('TF_bandwidth')
	
	if fp.Sliders != None:
		sld,_ = mod.get_global('slider')
		sld_op,_ = mod.get_global('slider_opacity')

		cuda.memcpy_htod_async(sld, fp.Sliders, stream=stream)
		cuda.memcpy_htod_async(sld_op, fp.Slider_opacity, stream=stream)
	
	if fp.transN != 0:
		tf = mod.get_texref('TFF')
		tf1 = mod.get_texref('TFF1')
		bandwidth,_ = mod.get_global('TF_bandwidth')

		if fp.update_tf == 1 and fp.trans_tex != None:
			global tfTex
			tfTex = fp.trans_tex
		if fp.update_tf2 == 1 and fp.trans_tex != None:
			global tfTex2
			tfTex2  = fp.trans_tex

		tf.set_filter_mode(cuda.filter_mode.LINEAR)
		tf1.set_filter_mode(cuda.filter_mode.LINEAR)
		cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(tfTex.reshape(1,256,4), order='C'), tf)
		cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(tfTex2.reshape(1,256,4), order='C'), tf1)
		cuda.memcpy_htod_async(bandwidth, numpy.float32(fp.TF_bandwidth), stream=stream)
	
	cuda_args = []

	data_exist = True
	if u not in data_list: data_exist = False
	elif ss not in data_list[u]: data_exist = False
	elif sp not in data_list[u][ss]: data_exist = False

	if data_exist:
		# initialize variables
		data_package = data_list[u][ss][sp]
		dp = data_package

		if dp.devptr == None:
			wait_data_arrive(data_package, stream=stream)
		###########################
		devptr = dp.devptr
		output_range = dp.data_range
		full_output_range = dp.full_data_range
		buffer_range = dp.buffer_range
		buffer_halo = dp.buffer_halo
		
		ad = data_range_to_cuda_in(output_range, full_output_range, buffer_range, buffer_halo=buffer_halo, stream=stream)
		cuda_args += [ad]
		output_package = dp
		
		FD.append(ad)
	else:
		bytes = func_output.data_bytes
		devptr, usage = malloc_with_swap_out(bytes)
		log("created output data bytes %s"%(str(func_output.data_bytes)),'detail',log_type)
		data_range = func_output.data_range
		full_data_range = func_output.full_data_range
		buffer_range = func_output.buffer_range
		buffer_halo = func_output.buffer_halo
	
		ad = data_range_to_cuda_in(data_range, full_data_range, buffer_range, buffer_halo=buffer_halo, stream=stream)

		cuda_args += [ad]
		output_package = func_output
		output_package.set_usage(usage)
		
		if False:
			print "OUTPUT"
			print "OUTPUT_RANGE", data_range
			print "OUTPUT_FULL_RANGE",  full_data_range
			
		FD.append(ad)

	# set work range
	block, grid = range_to_block_grid(work_range)
	cuda_args = [devptr] + cuda_args
	
#	print "GPU", rank, "BEFORE RECV", time.time()
	# Recv data from other process
	for data_package in args:
		u = data_package.get_unique_id()
		data_name = data_package.data_name
		if data_name not in work_range and u != '-1':
			wait_data_arrive(data_package, stream=stream)

#	print "GPU", rank, "Recv Done", time.time()
	# set cuda arguments 
	for data_package in args:
		data_name = data_package.data_name
		data_dtype = data_package.data_dtype
		data_contents_dtype = data_package.data_contents_dtype

		u = data_package.get_unique_id()
		if data_name in work_range:
			cuda_args.append(numpy.int32(work_range[data_name][0]))
			cuda_args.append(numpy.int32(work_range[data_name][1]))
		elif u == '-1':
			data = data_package.data
			dtype = data_package.data_contents_dtype
			if dtype == 'int': 
				cuda_args.append(numpy.int32(data))
			elif dtype == 'float':
				cuda_args.append(numpy.float32(data))
			elif dtype == 'double':
				cuda_args.append(numpy.float64(data))
			else:
				cuda_args.append(numpy.float32(data)) # temp
		else:
			ss = data_package.get_split_shape()
			sp = data_package.get_split_position()
			dp = data_list[u][ss][sp] # it must be fixed to data_package later

			memory_type = dp.memory_type
			if memory_type == 'devptr':

				cuda_args.append(dp.devptr)
				data_range = dp.data_range
				full_data_range = dp.full_data_range
				if False:
					print "DP", dp.info()
					print_devptr(dp.devptr, dp)
				ad = data_range_to_cuda_in(data_range, full_data_range, data_halo=dp.data_halo, stream=stream)

				cuda_args.append(ad)
				FD.append(ad)

	# set modelview matrix
	func = mod.get_function(function_name)
	mmtx,_ = mod.get_global('modelview')
	inv_mmtx, _ = mod.get_global('inv_modelview')
	inv_m = numpy.linalg.inv(fp.mmtx)
	cuda.memcpy_htod_async(mmtx, fp.mmtx.reshape(16), stream=stream)
	cuda.memcpy_htod_async(inv_mmtx, inv_m.reshape(16), stream=stream)

	stream_list[0].synchronize()

	if log_type in ['time','all']:
		start = time.time()
	
#	st = time.time()
	kernel_finish = cuda.Event()
	func( *cuda_args, block=block, grid=grid, stream=stream_list[0])
	kernel_finish.record(stream=stream_list[0])
#	ctx.synchronize()
#	print "GPU TIME", time.time() - st
#	print "FFFFOOo", func_output.info()
#	print_devptr(cuda_args[0], func_output)
	u, ss, sp = func_output.get_id()
	target = (u,ss,sp)

	Event_dict[target] = kernel_finish
	if target not in valid_list:
		valid_list.append(target)

	#################################################################################
	# finish
	if log_type in ['time','all']:
		t = (time.time() - start)
		ms = 1000*t

		log("rank%d, %s,  \"%s\", u=%d, GPU%d function running,,, time: %.3f ms "%(rank, func_output.data_name, function_name, u, device_number,  ms),'time',log_type)
	#log("rank%d, \"%s\", GPU%d function finish "%(rank, function_name, device_number),'general',log_type)

	###################################################################################
	# decrease retain_count
	for data_package in args:
		u = data_package.get_unique_id()
		if u != '-1':
			mem_release(data_package)

#	print "Release", time.time()
	return devptr, output_package
Example #16
        sensor = ball.gen_sensors_custom(pos, [[0, 0, 0]])
        val = sensor[0].getB(magnets)
        tex[i, j, 0] = val[0]
        tex[i, j, 1] = val[1]
        tex[i, j, 2] = val[2]
        # print(val)
        x_angle_queries[k] = theta / 180.0
        y_angle_queries[k] = phi / 180.0
        k += 1

print(texture_shape)

interpol = mod.get_function("MagneticFieldInterpolateKernel")
texref = mod.get_texref('tex')

drv.bind_array_to_texref(drv.make_multichannel_2d_array(tex, order="C"),
                         texref)
texref.set_flags(drv.TRSF_NORMALIZED_COORDINATES)
texref.set_filter_mode(drv.filter_mode.LINEAR)
texref.set_address_mode(0, drv.address_mode.WRAP)
texref.set_address_mode(1, drv.address_mode.WRAP)

# number_of_queries = 100
# x_angle_queries = np.random.rand(number_of_queries)
# y_angle_queries = np.random.rand(number_of_queries)
# x_angle_queries = x_angles
# y_angle_queries = y_angles
# x_angle_queries = np.float32(np.arange(0,1,1/number_of_queries))
# y_angle_queries = np.float32(np.arange(0,1,1/number_of_queries))
# x_angle_queries = np.zeros(number_of_samples*number_of_samples,dtype=np.float32)
# y_angle_queries = np.zeros(number_of_samples*number_of_samples,dtype=np.float32)
# k = 0
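
Because TRSF_NORMALIZED_COORDINATES is set, the interpolation kernel samples the table with coordinates in [0, 1), which is why the query angles are divided by 180.0 above; the WRAP address mode on both axes then makes the lookup periodic in those angles, whereas Example #2 clamps the second axis instead.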
Example #17
def watershed(I):

    # Get contiguous image + shape.
    height, width = I.shape
    I = np.float32(I.copy())

    # Get block/grid size for steps 1-3.
    block_size = (6, 6, 1)
    grid_size = (width / (block_size[0] - 2), height / (block_size[0] - 2))

    # Get block/grid size for step 4.
    block_size2 = (16, 16, 1)
    grid_size2 = (width / (block_size2[0] - 2), height / (block_size2[0] - 2))

    # Initialize variables.
    labeled = np.zeros([height, width])
    labeled = np.float32(labeled)
    width = np.int32(width)
    height = np.int32(height)
    count = np.int32([0])

    # Transfer labels asynchronously.
    labeled_d = gpu.to_gpu_async(labeled)
    counter_d = gpu.to_gpu_async(count)

    # Bind CUDA textures.
    I_cu = cu.matrix_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    # Step 1.
    descent_kernel(labeled_d, width, height, block=block_size, grid=grid_size)

    start_time = cu.Event()
    end_time = cu.Event()
    start_time.record()

    # Step 2.
    increment_kernel(labeled_d,
                     width,
                     height,
                     block=block_size2,
                     grid=grid_size2)
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2

    while old != new:
        old = new
        minima_kernel(labeled_d,
                      counters_d,
                      width,
                      height,
                      block=block_size,
                      grid=grid_size)
        new = counters_d.get()[0]

    # Step 3.
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    while old != new:
        old = new
        plateau_kernel(labeled_d,
                       counters_d,
                       width,
                       height,
                       block=block_size,
                       grid=grid_size)
        new = counters_d.get()[0]

    # Step 4
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    while old != new:
        old = new
        flood_kernel(labeled_d,
                     counters_d,
                     width,
                     height,
                     block=block_size2,
                     grid=grid_size2)
        new = counters_d.get()[0]

    result = labeled_d.get()

    # End GPU timers.
    end_time.record()
    end_time.synchronize()
    gpu_time = start_time.time_till(end_time) * 1e-3

    # print str(gpu_time)

    return result
Example #18
def find_bubbles(I, scale=1., fil='kspace'):
	"""brute force method"""
	zeta = 40.
	Z = 12.
	RMAX = 30.
	RMIN = 1.
	mm = mmin(Z)
	smin = sig0(m2R(mm))
	deltac = Deltac(Z)
	fgrowth = deltac/1.686
	#fgrowth = pb.fgrowth(Z, cosmo['omega_M_0'], unnormed=True)
	"""find bubbbles for deltax box I"""
	kernel_source = open("find_bubbles.cu").read()
	kernel_code = kernel_source % {
        'DELTAC': deltac,
        'RMIN': RMIN,
        'SMIN': smin, 
        'ZETA': zeta
    }
	main_module = nvcc.SourceModule(kernel_code)
	if fil == 'rspace':
		kernel = main_module.get_function("real_tophat_kernel")
	elif fil == 'kspace':
		kernel = main_module.get_function("k_tophat_kernel")
	image_texture = main_module.get_texref("img")

	# Get contiguous image + shape.
	height, width, depth = I.shape
	I = np.float32(I.copy()*fgrowth)

	# Get block/grid size for steps 1-3.
	block_size =  (8,8,8)
	grid_size =   (width/(block_size[0])+1,
				height/(block_size[0])+1,
				depth/(block_size[0])+1)
	 # Initialize variables.
	ionized       = np.zeros([height,width,depth]) 
	ionized       = np.float32(ionized)
	width         = np.int32(width)

	# Transfer labels asynchronously.
	ionized_d = gpuarray.to_gpu_async(ionized)
	I_cu = cu.np_to_array(I, order='C')
	cu.bind_array_to_texref(I_cu, image_texture)

	
	R = RMAX
	while R > RMIN:
		print R
		Rpix = np.float32(R/scale)
		S0 = np.float32(sig0(R))
		start = cu.Event()
		end = cu.Event()
		start.record()
		kernel(ionized_d, width, Rpix, S0, block=block_size, grid=HII_grid_size)
		end.record()
		end.synchronize()
		R *= (1./1.5)

	ionized = ionized_d.get()
	return ionized
Example #19
def watershed(I):

  # Get contiguous image + shape.
  height, width = I.shape
  I = np.float32(I.copy())

  # Get block/grid size for steps 1-3.
  block_size =  (6,6,1)
  grid_size =   (width/(block_size[0]-2),
                height/(block_size[0]-2))

  # Get block/grid size for step 4.
  block_size2 = (16,16,1)
  grid_size2  = (width/(block_size2[0]-2),
                height/(block_size2[0]-2))

  # Initialize variables.
  labeled       = np.zeros([height,width]) 
  labeled       = np.float32(labeled)
  width         = np.int32(width)
  height        = np.int32(height)
  count         = np.int32([0])

  # Transfer labels asynchronously.
  labeled_d = gpu.to_gpu_async(labeled)
  counter_d = gpu.to_gpu_async(count)

  # Bind CUDA textures.
  I_cu = cu.matrix_to_array(I, order='C')
  cu.bind_array_to_texref(I_cu, image_texture)

  # Step 1.
  descent_kernel(labeled_d, width, 
  height, block=block_size, grid=grid_size)
  
  start_time = cu.Event()
  end_time = cu.Event()
  start_time.record()

  # Step 2.
  increment_kernel(labeled_d,width,height,
  block=block_size2,grid=grid_size2)
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2

  while old != new:
    old = new
    minima_kernel(labeled_d, counters_d,
    width, height, block=block_size, grid=grid_size)
    new = counters_d.get()[0]

  # Step 3.
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2
  while old != new:
    old = new
    plateau_kernel(labeled_d, counters_d, width,
    height, block=block_size, grid=grid_size)
    new = counters_d.get()[0]
  
  # Step 4
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2
  while old != new:
    old = new
    flood_kernel(labeled_d, counters_d, width,
    height, block=block_size2, grid=grid_size2)
    new = counters_d.get()[0]

  result = labeled_d.get()
  
  # End GPU timers.
  end_time.record()
  end_time.synchronize()
  gpu_time = start_time.time_till(end_time) * 1e-3

  # print str(gpu_time)

  return result