Example #1
0
def pytest_generate_tests_for_pyopencl(metafunc):
    """Parametrize a test function over the available OpenCL objects.

    Depending on which argument names the test function requests
    (``metafunc.funcargnames``), one call is registered per device
    (arguments ``device`` and/or ``ctx_getter``, optionally together with
    ``platform``) or, when only ``platform`` is requested, one call per
    platform.

    :param metafunc: the py.test metafunc object; only ``funcargnames`` and
        ``addcall`` are used here.
    """
    class ContextGetter:
        """Callable that creates a context for one device when invoked."""

        def __init__(self, device):
            self.device = device

        def __call__(self):
            # Use the device captured at construction time.  Referring to
            # the enclosing loop variable instead would make every getter
            # build a context for whichever device was enumerated last.
            return cl.Context([self.device])

        def __str__(self):
            return "<context getter for %s>" % self.device

    funcargnames = metafunc.funcargnames

    if "device" in funcargnames or "ctx_getter" in funcargnames:
        arg_dict = {}

        for platform in cl.get_platforms():
            if "platform" in funcargnames:
                arg_dict["platform"] = platform

            for device in platform.get_devices():
                if "device" in funcargnames:
                    arg_dict["device"] = device

                if "ctx_getter" in funcargnames:
                    arg_dict["ctx_getter"] = ContextGetter(device)

                # items() works on both Python 2 and 3 (iteritems() does not).
                metafunc.addcall(funcargs=arg_dict.copy(),
                        id=", ".join("%s=%s" % (arg, value)
                                for arg, value in arg_dict.items()))

    elif "platform" in funcargnames:
        for platform in cl.get_platforms():
            metafunc.addcall(
                    funcargs=dict(platform=platform),
                    id=str(platform))
Example #2
0
    def __init__(self, cl_mode = True, cl_device = None):
        """Initialize the class.

        When ``cl_mode`` is true an OpenCL context and command queue are
        created and basic facts about the first device of that context are
        cached on the instance.

        :param cl_mode: enable the OpenCL setup; when false only the plain
            Python attributes are initialised.
        :param cl_device: ``'gpu'`` builds a context over every GPU device of
            every platform, ``'cpu'`` uses the first CPU device found; any
            other value defers to ``cl.create_some_context()``.
        :raises RuntimeError: if ``cl_device`` names a device type for which
            no device could be found.
        """
        if cl_mode:
            import pyopencl as cl
            import pyopencl.array

            def devices_of_type(wanted_type):
                # Gather the matching devices of all platforms.
                found = []
                for platform in cl.get_platforms():
                    try:
                        found += platform.get_devices(device_type=wanted_type)
                    except cl.Error:
                        # This platform has no device of the wanted type.
                        continue
                if not found:
                    raise RuntimeError(
                        "no OpenCL %s device found" % cl_device)
                return found

            if cl_device == 'gpu':
                self.ctx = cl.Context(devices_of_type(cl.device_type.GPU))
            elif cl_device == 'cpu':
                # Only the first CPU device is used.
                self.ctx = cl.Context([devices_of_type(cl.device_type.CPU)[0]])
            else:
                self.ctx = cl.create_some_context()

            self.queue = cl.CommandQueue(self.ctx)
            self.mf = cl.mem_flags
            self.device = self.ctx.get_info(cl.context_info.DEVICES)[0]
            self.device_type = self.device.type
            self.device_compute_units = self.device.max_compute_units

        self.cl_mode = cl_mode
        self.obs = []
        self.samples = {}
Example #3
0
    def __init__(self, cl_mode=False, cl_device=None, sample_size=1000, cutoff=None,
                 output_to_stdout=False,
                 search=False, search_tolerance = 100, search_data_fit_only = False,
                 annealing = False, debug_mumble = False):
        """Initialize the class.

        When ``cl_mode`` is true an OpenCL context and command queue are
        created and basic facts about the first device of that context are
        cached on the instance.  All other arguments are stored on the
        instance under the same name.

        :param cl_mode: enable the OpenCL setup.
        :param cl_device: ``'gpu'`` builds a context over every GPU device of
            every platform, ``'cpu'`` uses the first CPU device found; any
            other value defers to ``cl.create_some_context()``.
        :param debug_mumble: when true, logging is configured at INFO level.
        :raises RuntimeError: if ``cl_device`` names a device type for which
            no device could be found.
        """
        if debug_mumble:
            logging.basicConfig(level=logging.INFO)

        if cl_mode:
            import pyopencl as cl
            import pyopencl.array, pyopencl.tools, pyopencl.clrandom

            def devices_of_type(wanted_type):
                # Gather the matching devices of all platforms.
                found = []
                for platform in cl.get_platforms():
                    try:
                        found += platform.get_devices(device_type=wanted_type)
                    except cl.Error:
                        # This platform has no device of the wanted type.
                        continue
                if not found:
                    raise RuntimeError(
                        "no OpenCL %s device found" % cl_device)
                return found

            if cl_device == 'gpu':
                self.ctx = cl.Context(devices_of_type(cl.device_type.GPU))
            elif cl_device == 'cpu':
                # Only the first CPU device is used.
                self.ctx = cl.Context([devices_of_type(cl.device_type.CPU)[0]])
            else:
                self.ctx = cl.create_some_context()

            self.queue = cl.CommandQueue(self.ctx)
            self.mf = cl.mem_flags
            self.device = self.ctx.get_info(cl.context_info.DEVICES)[0]
            self.device_type = self.device.type
            self.device_compute_units = self.device.max_compute_units

        self.cl_mode = cl_mode
        self.cutoff = cutoff
        self.data = []
        self.N = 0 # number of data points

        # sampling parameters
        self.sample_size = sample_size
        self.output_to_stdout = output_to_stdout
        self.iteration = 0
        self.thining = 1
        self.burnin = 0
        self.gpu_time = 0
        self.total_time = 0

        # stochastic search parameters
        self.best_sample = (None, None, None) # (sample, logprobability of model, loglikelihood of data)
        self.search = search
        self.search_data_fit_only = search_data_fit_only
        self.best_diff = []
        self.no_improv = 0
        self.search_tolerance = search_tolerance

        # annealing parameters, if used
        self.annealing = annealing
        self.annealing_temp = 1

        self.debug_mumble = debug_mumble
	def __init__(self, gpuOnly=True, sharedGlContext=False, hidePlatformDetails=False):
		"""Create an OpenCL context on the first platform.

		gpuOnly -- when true and the first platform has more than one
			device, the context is restricted to the device at index 1
			(presumably the GPU on the author's machine -- TODO confirm).
		sharedGlContext -- when true, the context is created with the
			GL-sharing properties; asserts that pyopencl has GL support.
		hidePlatformDetails -- when false, details of every platform/device
			pair are printed before the context is created.
		"""
		super(BaseCalculator, self).__init__()
		self.platform = cl.get_platforms()[0]
		self.devices = self.platform.get_devices()

		showDetails = not hidePlatformDetails
		if showDetails:
			for plat in cl.get_platforms():
				for dev in plat.get_devices():
					# Platform section
					print("===============================================================")
					print("Platform name:", plat.name)
					print("Platform profile:", plat.profile)
					print("Platform vendor:", plat.vendor)
					print("Platform version:", plat.version)
					# Device section
					print("---------------------------------------------------------------")
					print("Device name:", dev.name)
					print("Device type:", cl.device_type.to_string(dev.type))
					print("Device memory: ", dev.global_mem_size//1024//1024, 'MB')
					print("Device max clock speed:", dev.max_clock_frequency, 'MHz')
					print("Device compute units:", dev.max_compute_units)
					print("Device max work group size:", dev.max_work_group_size)
					print("Device max work item sizes:", dev.max_work_item_sizes)

		if sharedGlContext:
			assert cl.have_gl()
			contextProperties = get_gl_sharing_context_properties()
		else:
			contextProperties = None

		if gpuOnly and len(self.devices) > 1:
			chosenDevices = [self.devices[1]]
		else:
			chosenDevices = self.devices

		self.context = cl.Context(properties=contextProperties, devices=chosenDevices)

		# No command queue is created here.
		self.queue = None
Example #5
0
 def __getOpenClDevice(self, platformId, deviceId):
     """Look up an OpenCL device by platform index and device index.

     Returns None when pyopencl is unavailable (the module-level name is
     None) or when either index is out of range.
     """
     if pyopencl is None:
         return None

     availablePlatforms = pyopencl.get_platforms()
     platformInRange = 0 <= platformId < len(availablePlatforms)
     if not platformInRange:
         return None

     availableDevices = availablePlatforms[platformId].get_devices()
     deviceInRange = 0 <= deviceId < len(availableDevices)
     if not deviceInRange:
         return None

     return availableDevices[deviceId]
  def __init__(self,threads=0,platform_directory_string="Platforms/OpenCLGPU/opencl_code/",root_directory_string="../../..",platform_name="",device_type=pyopencl.device_type.GPU):
    """Select an OpenCL platform and device and create a context for it.

    The platform is chosen in three steps, stopping at the first hit:
      1. a platform whose string form contains ``platform_name``
         (case-insensitively) and that has a device of ``device_type``;
      2. any platform that has a device of ``device_type``;
      3. any platform that has a CPU device (``self.device_type`` then
         becomes CPU).

    After selection ``self.platform_name`` is replaced by the platform's
    vendor string, and ``self.device`` is the first device of the selected
    type on that platform.

    Raises RuntimeError when no platform qualifies.
    """
    self.threads = threads
    self.platform_directory_string = platform_directory_string
    self.root_directory_string = root_directory_string

    self.platform_name = platform_name

    def find_platform(wanted_type, name_fragment=""):
      # First platform matching the name fragment that has a device of
      # the wanted type, or None.
      for p in pyopencl.get_platforms():
        if name_fragment not in str(p).lower():
          continue
        for d in p.get_devices():
          if d.get_info(pyopencl.device_info.TYPE) == wanted_type:
            return p
      return None

    # The platform description is lower-cased for the comparison, so the
    # requested name has to be lower-cased as well.
    self.platform = find_platform(device_type, platform_name.lower())
    self.device_type = device_type

    if self.platform is None: #If the preferred platform isn't available, just take the first one with the preferred device type
      self.platform = find_platform(device_type)

    if self.platform is None: #Failing that, just take the first one that has a CPU and use that
      self.platform = find_platform(pyopencl.device_type.CPU)
      self.device_type = pyopencl.device_type.CPU

    if self.platform is None:
      raise RuntimeError("no OpenCL platform with a usable device was found")

    self.platform_name = self.platform.get_info(pyopencl.platform_info.VENDOR)
    self.device = self.platform.get_devices(self.device_type)[0] #Takes the first device available for the specified platform and type

    self.context = pyopencl.Context(devices=[self.device])

    # AMD GPUs additionally need a context on the platform's first CPU device.
    self.amd_gpu_flag = ("AMD" in self.platform_name) and (self.device_type == pyopencl.device_type.GPU)
    if self.amd_gpu_flag:
      self.cpu_device = self.platform.get_devices(pyopencl.device_type.CPU)[0] #Taking the first CPU available, needed for AMD GPUs
      self.cpu_context = pyopencl.Context(devices=[self.cpu_device])
Example #7
0
def compute(trans_matrix, config_vector, validapps, num_valid_apps, platform_index=1, device_index=0):
	"""Run the ``compute`` OpenCL kernel and return its result vector.

	For each work group the kernel multiplies one row of ``validapps`` by
	``trans_matrix`` and adds ``config_vector``; only the work item with
	local id 0 of each group does the work.

	trans_matrix, config_vector, validapps -- array-likes, converted to
		32-bit integer arrays to match the kernel's ``int`` arguments.
	num_valid_apps -- the result holds ``num_valid_apps * row`` entries.
	platform_index, device_index -- which OpenCL platform/device to use
		(defaults reproduce the previously hard-coded second platform,
		first device).

	Relies on the module-level names ``row`` and ``col``.
	Returns a 1-D numpy int32 array.
	"""
	# computation
	device = cl.get_platforms()[platform_index].get_devices()[device_index]
	ctx = cl.Context([device])
	queue = cl.CommandQueue(ctx)

	# OpenCL ``int`` is always 32 bits wide; an abstract ``np.integer``
	# dtype resolves to the platform's C long (often 64 bits), which the
	# kernel would misread.
	trans_np = np.array(trans_matrix, dtype=np.int32).flatten()
	config_vector_np = np.array(config_vector, dtype=np.int32)
	validapps_np = np.array(validapps, dtype=np.int32)
	result_config_vectors_np = np.empty(num_valid_apps * row, dtype=np.int32)

	kernel = """

		__kernel void compute(__global int* trans_matrix, __global int* config_vector, __global int* validapps, __global int* result_config_vectors){

		int dot_result[COL_SIZE];
		int grpid = get_group_id(0);

		if(get_local_id(0) == 0) {
			for(int j = 0; j < COL_SIZE; j ++) {
				int sum = 0;
				for(int i = 0; i < ROW_SIZE; i ++) sum += validapps[grpid * ROW_SIZE + i] * trans_matrix[i * COL_SIZE + j];
				dot_result[j] = sum;
			}
			for(int i = 0; i < COL_SIZE; i ++) result_config_vectors[grpid * COL_SIZE + i] = config_vector[i] + dot_result[i];
		}
	} 
	"""
	# NOTE(review): COL_SIZE is defined from ``row`` and ROW_SIZE from
	# ``col`` -- kept as in the original; confirm the naming is intended.
	mat_size = "#define MAT_SIZE " + str(len(trans_np)) + '\n'
	column_size = "#define COL_SIZE " + str(row) + '\n'
	row_size = "#define ROW_SIZE " + str(col) + '\n'

	kernel = mat_size + column_size + row_size + kernel
	program = cl.Program(ctx, kernel).build()

	# create memory buffers
	mf = cl.mem_flags
	trans_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = trans_np)
	config_vector_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = config_vector_np)
	validapps_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = validapps_np)
	result_config_vectors_buf = cl.Buffer(ctx, mf.WRITE_ONLY, result_config_vectors_np.nbytes)

	# execute the kernel
	program.compute(queue, validapps_np.shape, (col, ), trans_buf, config_vector_buf, validapps_buf, result_config_vectors_buf)
	cl.enqueue_copy(queue, result_config_vectors_np, result_config_vectors_buf)

	return result_config_vectors_np
	def __init__(self,threads=0,platform_directory_string="Platforms/OpenCLGPU/opencl_code",root_directory_string=None,platform_name="",device_type=pyopencl.device_type.GPU,ssh_alias="",remote=False,hostname=None):
		"""Constructor
		
		Parameters
			platform_directory_string, root_directory_String, ssh_alias, remote, hostname - same as Platform class
			platform_name - (string) name of OpenCL SDK to use
			device_type - (pyopencl.device_type) OpenCL device type to use
		"""
		self.threads = threads
		Platform.Platform.__init__(self,platform_directory_string,root_directory_string,ssh_alias,remote,hostname)

		self.platform_name = platform_name

		self.platform = None

		#Selecting the specified platform and device
		flag = False
    		for p in pyopencl.get_platforms():
			for d in p.get_devices():
	  			if(self.platform_name in str(p).lower() and d.get_info(pyopencl.device_info.TYPE)==device_type):
	    				self.platform = p
	    				self.device_type = device_type
	    				flag = True
	    				break
	
			if(flag): break
    
		if not(self.platform): #If the preferred platform isn't available, just take the first one with the preferred device type
      			for p in pyopencl.get_platforms():
				for d in p.get_devices():
	  				if(d.get_info(pyopencl.device_info.TYPE)==device_type):
	    					self.platform = p
	    					self.device_type = device_type
	    					flag = True
	    					break
				if(flag): break
	  
		if not(self.platform): #Failing that, just take the first one that has a CPU and use that
      			for p in pyopencl.get_platforms():
				for d in p.get_devices():
	  				if(d.get_info(pyopencl.device_info.TYPE)==pyopencl.device_type.CPU):
	    					self.platform = p
	    					self.device_type = pyopencl.device_type.CPU
	    					flag = True
	    					break
	
				if(flag): break
      
    		self.platform_name = self.platform.get_info(pyopencl.platform_info.VENDOR)
    		self.device = self.platform.get_devices(self.device_type)[0] #Takes the first device available for the specified platform and type
    
   		self.context = pyopencl.Context(devices=[self.device])
    	
		"""
Example #9
0
def get_devices():
    """Return the devices of the preferred platform, keyed by type name.

    When several platforms are present, one named 'NVIDIA CUDA' is
    preferred (the selection is printed); if no platform has that name, or
    only one platform exists, the first platform is used.

    The result maps ``cl.device_type.to_string(device.type)`` to the
    device; when several devices share a type, the last one enumerated
    wins.
    """
    platforms = cl.get_platforms()

    # Default to the first platform so the name is always bound, even when
    # several platforms exist and none of them is the NVIDIA one.
    my_platform = platforms[0]
    if len(platforms) > 1:
        for found_platform in platforms:
            if found_platform.name == 'NVIDIA CUDA':
                my_platform = found_platform
                print("Selected platform:", my_platform.name)

    devices = {}
    for device in my_platform.get_devices():
        devices[cl.device_type.to_string(device.type)] = device
    return devices
Example #10
0
def get_test_platforms_and_devices(plat_dev_string=None):
    """Parse a string of the form 'PYOPENCL_TEST=0:0,1;intel:i5'.

    Entries are separated by ';'.  Each entry is either a platform
    identifier alone (all of its devices are used) or
    ``platform:dev,dev,...``.  An identifier is an integer index or a
    case-insensitive substring of the object's "name vendor" text.  When no
    string is given, the PYOPENCL_TEST environment variable is consulted;
    when that is unset or empty too, every device of every platform is
    returned.

    :return: list of tuples (platform, [device, device, ...])
    """

    if plat_dev_string is None:
        import os
        plat_dev_string = os.environ.get("PYOPENCL_TEST", None)

    def find_cl_obj(objs, identifier):
        # Numeric identifiers are plain indices into objs.
        try:
            index = int(identifier)
        except Exception:
            index = None
        if index is not None:
            return objs[index]

        # Otherwise match against "name vendor", ignoring case.
        needle = identifier.lower()
        for candidate in objs:
            if needle in (candidate.name + ' ' + candidate.vendor).lower():
                return candidate
        raise RuntimeError("object '%s' not found" % identifier)

    if not plat_dev_string:
        return [
                (platform, platform.get_devices())
                for platform in cl.get_platforms()]

    result = []
    for entry in plat_dev_string.split(";"):
        parts = entry.split(":")
        if len(parts) > 2:
            raise RuntimeError("invalid syntax of PYOPENCL_TEST")

        platform = find_cl_obj(cl.get_platforms(), parts[0])
        available = platform.get_devices()
        if len(parts) == 1:
            selected = available
        else:
            selected = [find_cl_obj(available, dev_id)
                        for dev_id in parts[1].split(",")]
        result.append((platform, selected))

    return result
    def __init__(self, nBands, cType, isFloat):
        """Start one ProcessingUnit worker per available OpenCL device.

        Every device of every platform gets its own context, a program
        built from ``cSrc`` formatted with (nBands, cType, int(isFloat)),
        and its own command queue.  All workers share one input and one
        output queue, each created with maxsize equal to the device count.
        """
        # Flatten the devices of all platforms into one list
        devices = []
        for platform in cl.get_platforms():
            devices.extend(platform.get_devices())

        self.nDevices = len(devices)
        self.inBuffer = queue.Queue(self.nDevices)
        self.outBuffer = queue.Queue(self.nDevices)

        # One context per device
        contexts = [cl.Context([device]) for device in devices]

        # One program per context, created first and built afterwards
        cSrcCode = cSrc.format(nBands, cType, int(isFloat))
        programs = []
        for context in contexts:
            programs.append(cl.Program(context, cSrcCode))
        for program in programs:
            program.build()

        # One command queue per context
        cmdQueues = [cl.CommandQueue(context) for context in contexts]

        # One processing unit per program/context/queue triple
        self.workerExec = [
            ProcessingUnit(program, context, cmdQueue, self.inBuffer, self.outBuffer)
            for program, context, cmdQueue in zip(programs, contexts, cmdQueues)
            ]

        for worker in self.workerExec:
            worker.start()
Example #12
0
    def __init__(self):
        """Create a GL-sharing OpenCL context and the element-wise kernels.

        The context properties name the first OpenCL platform and the
        current GL context, plus the GLX display on Linux or the WGL device
        context on Windows.  Two ElementwiseKernel objects (``cross4`` and
        ``distance2``) are built on that context, then
        ``place_hyperspheres()`` is called.
        """
        plats = cl.get_platforms()
        ctx_props = cl.context_properties

        self.props = [(ctx_props.PLATFORM, plats[0]),
                      (ctx_props.GL_CONTEXT_KHR, platform.GetCurrentContext())]

        # sys.platform is "linux2" only on Python 2 (and Python 3 before
        # 3.3); newer interpreters report "linux".  startswith covers both.
        if sys.platform.startswith("linux"):
            self.props.append((ctx_props.GLX_DISPLAY_KHR,
                               GLX.glXGetCurrentDisplay()))
        elif sys.platform == "win32":
            self.props.append((ctx_props.WGL_HDC_KHR, WGL.wglGetCurrentDC()))
        self.ctx = cl.Context(properties=self.props)

        self.cross4 = ElementwiseKernel(
            self.ctx, "__global const float4 *u, "
            "__global const float4 *v, "
            "__global const float4 *w, "
            "__global       float4 *r",
            "r[i] = cross4(u[i],v[i],w[i])",
            "cross4_final",
            preamble=cross4_preamble)

        self.distance2 = ElementwiseKernel(
            self.ctx, "__global const float4 *a, "
            "__global const float4 *b, "
            "__global       float4 *d",
            "d[i] = distance2(a[i],b[i])",
            "distance_final",
            preamble=distance_preamble)
        self.place_hyperspheres()
Example #13
0
    def __init__(self, device_index, options):
        """Bind the miner to one OpenCL device and reset its counters.

        The platform is taken from ``options.platform`` (an index into
        ``cl.get_platforms()``) and the device from ``device_index`` within
        that platform.  ``gpu_amd`` is 1 for a GPU on a platform that
        ``is_amd`` accepts, else 0.  When ADL is available for such a
        device, the adapter index is looked up under ``adl_lock``.
        """
        super(OpenCLMiner, self).__init__(device_index, options)
        self.output_size = 0x100

        self.defspace = ''
        self.platform = cl.get_platforms()[options.platform]
        if self.platform.name == 'Apple':
            self.defspace = ' '

        self.device = self.platform.get_devices()[device_index]
        self.device_name = self.device.name.strip('\r\n \x00\t')

        amd_gpu = self.device.type == cl.device_type.GPU and is_amd(self.device.platform)
        self.gpu_amd = 1 if amd_gpu else 0
        self.frames = 30

        # Performance counters all start at zero.
        self.worksize = 0
        self.frameSleep = 0
        self.rate = 0
        self.estimated_rate = 0

        self.adapterIndex = None
        if ADL and is_amd(self.device.platform) and self.device.type == cl.device_type.GPU:
            with adl_lock:
                self.adapterIndex = self.get_adapter_info()
                if self.adapterIndex:
                    # Narrow the adapter list down to this device's index.
                    self.adapterIndex = self.adapterIndex[self.device_index].iAdapterIndex

        self.temperature = 0
        self.target6 = 0
        self.target7 = 0
Example #14
0
	def InitCL(self, DEVICE="CPU"):
		"""Create the OpenCL context, command queue and mem_flags shortcut.

		DEVICE -- device type name as produced by
			``cl.device_type.to_string`` (e.g. "CPU").  Every matching
			device is printed; the last one enumerated is used.  When no
			device matches, the first device of the first platform is
			used instead.
		"""
		my_device = None
		try:
			for platform in cl.get_platforms():
				for device in platform.get_devices():
					if cl.device_type.to_string(device.type) == DEVICE:
						my_device = device
						print("%s %s %s" % (my_device.name, "\t ", cl.device_type.to_string(my_device.type)))
		except cl.Error:
			# Enumeration failed part-way; keep whatever was found so far.
			pass

		if my_device is None:
			# Fall back to a single device (not the whole device list).
			my_device = cl.get_platforms()[0].get_devices()[0]
			print("%s %s %s" % (my_device.name, "\t ", cl.device_type.to_string(my_device.type)))

		self.ctx   = cl.Context([my_device])
		self.queue = cl.CommandQueue(self.ctx)
		self.mf	   = cl.mem_flags
		self.mf	   = cl.mem_flags
def test_opencl_0(zz, a, b, c_result):
    """Time a small arithmetic kernel on each platform and check its output.

    For every platform, the device at index 1 is described on stdout, the
    ``sum`` kernel is run over ``a`` and ``b`` with profiling enabled, the
    elapsed kernel time is printed, and the first ``zz`` results are
    compared with ``c_result``.

    zz -- used both as the kernel's inner loop count and as the number of
        leading elements compared against ``c_result``.
    a, b -- numpy arrays used as the kernel inputs (the kernel reads them
        as ``double``).
    c_result -- expected values, indexable by position.
    """
    for platform in cl.get_platforms():
        # NOTE(review): only the second device of each platform is used;
        # this raises IndexError on single-device platforms -- confirm intent.
        for device in [platform.get_devices()[1]]:
            print("===============================================================")
            print("Platform name:", platform.name)
            print("Platform profile:", platform.profile)
            print("Platform vendor:", platform.vendor)
            print("Platform version:", platform.version)
            print("---------------------------------------------------------------")
            print("Device name:", device.name)
            print("Device type:", cl.device_type.to_string(device.type))
            print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
            print("Device max clock speed:", device.max_clock_frequency, 'MHz')
            print("Device compute units:", device.max_compute_units)

            # Simple speed test
            ctx = cl.Context([device])
            queue = cl.CommandQueue(ctx,
                                    properties=cl.command_queue_properties.PROFILING_ENABLE)

            mf = cl.mem_flags
            a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
            b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
            dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

            prg = cl.Program(ctx, """
                __kernel void sum(__global const double *a,
                __global const double *b, __global double *c)
                {
                            int loop;
                            int gid = get_global_id(0);
                            for(loop=0; loop<%s;loop++)
                            {
                                    c[gid] = a[gid] + b[gid];
                                    c[gid] = c[gid] * (a[gid] + b[gid]);
                                    c[gid] = c[gid] * (a[gid] / 2);
                                    c[gid] = log(exp(c[gid]));
                            }
                }
            """ % (zz)).build()

            exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
            exec_evt.wait()
            # Profiling timestamps are in nanoseconds.
            elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)

            print("Execution time of test: %g s" % elapsed)

            c = numpy.empty_like(a)
            # enqueue_read_buffer is deprecated and gone from current
            # pyopencl; enqueue_copy(queue, dest, src) replaces it.
            cl.enqueue_copy(queue, c, dest_buf).wait()
            error = 0
            for i in range(zz):
                if c[i] != c_result[i]:
                        print("c_i: ", c[i], " c_results_i: ", c_result[i])
                        print("diff: ", numpy.abs(c[i] - c_result[i]))
                        error = 1
            if error:
                print("Results doesn't match!!")
            else:
                print("Results OK")
Example #16
0
def get_cl_context(gl_context):
    """Creates a CL context, with or without given GL context.

    In both modes the context is built on the first platform and restricted
    to that platform's last device.  With a GL context, the work happens
    inside ``with gl_context:`` and the GL-sharing properties are appended
    to the context properties.
    """
    def build_context(extra_properties):
        # extra_properties is a callable so that it is evaluated after the
        # platform lookup.
        first_platform = cl.get_platforms()[0]  # @UndefinedVariable
        properties = [(cl.context_properties.PLATFORM, first_platform)] + extra_properties()
        last_device = first_platform.get_devices()[-1]  # Only one is allowed!
        return cl.Context(properties=properties, devices=[last_device])

    if gl_context is None:
        # Stand-alone mode: no sharing properties to add.
        return build_context(list)

    # OpenGL interop mode.
    with gl_context:
        assert cl.have_gl(), "GL interoperability not enabled."
        from pyopencl.tools import get_gl_sharing_context_properties
        cl_context = build_context(get_gl_sharing_context_properties)
    return cl_context
Example #17
0
File: test.py Project: bazk/srs2d
    def run(self, args):
        device_type = cl.device_type.ALL
        if args.device_type == 'cpu':
            device_type = cl.device_type.CPU
        elif args.device_type == 'gpu':
            device_type = cl.device_type.GPU

        platform = cl.get_platforms()[0]
        devices = platform.get_devices(device_type=device_type)
        context = cl.Context(devices=devices)
        queue = cl.CommandQueue(context)

        simulator = physics.Simulator(context, queue, num_worlds=args.num_worlds, num_robots=args.num_robots, ta=args.ta, tb=args.tb, test=False, random_targets=args.random_targets)

        if args.params is not None:
            pos = args.params.decode('hex')
        else:
            pos = ''
            for i in xrange(physics.ANN_PARAMS_SIZE):
                pos += chr(random.randint(0,255))

        decoded = np.zeros(len(pos))
        for i in xrange(len(pos)):
            decoded[i] = float(ord(pos[i])) / 255

        if args.save is None:
            fitness = simulator.simulate([ decoded for i in xrange(args.num_worlds) ], targets_distance=args.targets_distance, targets_angle=args.targets_angle)

        else:
            fitness = simulator.simulate_and_save(args.save, [ decoded for i in xrange(args.num_worlds) ], targets_distance=args.targets_distance, targets_angle=args.targets_angle)

        print 'fitness = ', fitness
Example #18
0
    def __init__(self, filename, *args, **kwargs):
        """Create a GL-sharing context and queue, then load the program.

        On macOS ("darwin") the context is created from the GL-sharing
        properties alone with an empty device list; elsewhere the first
        platform is named explicitly and ``devices`` is left as None.
        ``kernelname`` is the part of ``filename`` before the first dot.
        """
        plats = cl.get_platforms()
        from pyopencl.tools import get_gl_sharing_context_properties
        import sys

        if sys.platform == "darwin":
            context_properties = get_gl_sharing_context_properties()
            context_devices = []
        else:
            context_properties = (
                [(cl.context_properties.PLATFORM, plats[0])]
                + get_gl_sharing_context_properties()
            )
            context_devices = None
        self.ctx = cl.Context(properties=context_properties, devices=context_devices)

        self.queue = cl.CommandQueue(self.ctx)

        self.loadProgram(filename)

        self.gl_objects = []
        # TODO get these from kwargs
        self.kernelargs = None
        self.global_size = (0,)
        self.local_size = None
        self.PreExecute = None
        self.PostExecute = None
        self.kernelname = filename.partition(".")[0]
Example #19
0
    def __init__(self, parent=None):
        """Build the window, wire up its widgets and set up the NLMeans filter.

        The user is asked to choose an OpenCL platform and device through
        ``choose``; an NLMeans instance is created on a context for that
        device.  "lena.jpg" is loaded as the initial image and the
        parameters are reset.
        """
        super().__init__(parent)
        self.setupUi(self)

        # The a/h sliders deliver integers; the setters get value/1000.
        def applyScaledA(value):
            self.setA(value/1000)

        def applyScaledH(value):
            self.setH(value/1000)

        def askAndLoadImage():
            self.loadImage(QFileDialog.getOpenFileName(self, "Open Image"))

        def askAndSaveImage():
            self.saveImage(QFileDialog.getSaveFileName(self, "Save Image", filter="*.png"))

        self.axSlider.valueChanged.connect(self.setAx)
        self.sxSlider.valueChanged.connect(self.setSx)
        self.aSlider.valueChanged.connect(applyScaledA)
        self.hSlider.valueChanged.connect(applyScaledH)

        self.showOriginal = False
        self.loadButton.clicked.connect(askAndLoadImage)
        self.saveButton.clicked.connect(askAndSaveImage)
        self.toggleOriginalButton.clicked.connect(self.toggleImage)

        # Let the user pick the OpenCL platform and device.
        chosenPlatform = choose(self, cl.get_platforms(), "OpenCL Platform", "Please choose OpenCL Platform")
        print(chosenPlatform)
        chosenDevice = choose(self, chosenPlatform.get_devices(), "OpenCL Device", "Please choose OpenCL Device")
        print(chosenDevice)
        self.nlmeans = NLMeans(cl.Context([chosenDevice]))

        self.imageLabel = QLabel()
        self.imageScrollarea.setWidget(self.imageLabel)
        self.maskScrollarea.setAlignment(Qt.AlignCenter)

        self.maskLabel = QLabel()
        self.maskScrollarea.setWidget(self.maskLabel)
        self.showMask()

        self.loadImage("lena.jpg")

        self.resetButton.clicked.connect(self.resetParameters)
        self.resetParameters()
Example #20
0
    def __init__(self, max_elements, cta_size, dtype):
        """Set up the OpenCL context, program and scratch buffers.

        max_elements -- capacity the temporary key/value buffers are sized for.
        cta_size -- work-group size; one block covers ``cta_size * 4`` elements.
        dtype -- numpy dtype; its ``itemsize`` is the per-element byte size.

        The context and queue use the first device of the first platform.
        ``mf`` is a module-level name (presumably ``cl.mem_flags`` --
        TODO confirm).
        """
        self.WARP_SIZE = 32
        self.SCAN_WG_SIZE = 256
        self.MIN_LARGE_ARRAY_SIZE = 4 * self.SCAN_WG_SIZE
        self.bit_step = 4
        self.cta_size = cta_size
        self.uintsz = dtype.itemsize

        plat = cl.get_platforms()[0]
        device = plat.get_devices()[0]
        self.ctx = cl.Context(devices=[device])
        self.queue = cl.CommandQueue(self.ctx, device)

        self.loadProgram()

        # Number of blocks needed to cover max_elements, rounded up.  Floor
        # division keeps every size an integer on Python 3 as well, where
        # "/" would produce floats that are not valid buffer sizes.
        elements_per_block = cta_size * 4
        num_blocks = (max_elements + elements_per_block - 1) // elements_per_block

        #print "num_blocks: ", num_blocks
        self.d_tempKeys = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.uintsz * max_elements)
        self.d_tempValues = cl.Buffer(self.ctx, mf.READ_WRITE, size=self.uintsz * max_elements)

        counters_size = self.uintsz * self.WARP_SIZE * num_blocks
        self.mCounters = cl.Buffer(self.ctx, mf.READ_WRITE, size=counters_size)
        self.mCountersSum = cl.Buffer(self.ctx, mf.READ_WRITE, size=counters_size)
        self.mBlockOffsets = cl.Buffer(self.ctx, mf.READ_WRITE, size=counters_size)

        numscan = max_elements // 2 // cta_size * 16
        #print "numscan", numscan
        if numscan >= self.MIN_LARGE_ARRAY_SIZE:
        #MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE 1024
            self.scan_buffer = cl.Buffer(self.ctx, mf.READ_WRITE, size = self.uintsz * numscan // 1024)
Example #21
0
    def init_cl(self, platnum, devnum):
        """Create the OpenCL context and queue for a platform/device index.

        On success ``self.CLContext`` and ``self.CLQueue`` are set, the
        chosen platform and device are printed, and True is returned.  If
        ``platnum`` or ``devnum`` is too large, the available options are
        printed and False is returned.
        """
        # Check that specified platform exists
        platforms = cl.get_platforms()
        if len(platforms) <= platnum:
            # The message has a %d placeholder, so platnum must be supplied.
            print("Specified OpenCL platform number (%d) does not exist." % platnum)
            print("Options are:")
            for p, candidate in enumerate(platforms):
                print("%d: %s" % (p, str(candidate)))
            return False
        platform = platforms[platnum]

        # Check that specified device exists on that platform
        devices = platform.get_devices()
        if len(devices) <= devnum:
            print("Specified OpenCL device number (%d) does not exist on platform %s." % (devnum, platform))
            print("Options are:")
            for d, candidate in enumerate(devices):
                print("%d: %s" % (d, str(candidate)))
            return False
        device = devices[devnum]

        # Create a context and queue
        self.CLContext = cl.Context(properties=[(cl.context_properties.PLATFORM, platform)], devices=[device])
        self.CLQueue = cl.CommandQueue(self.CLContext)
        print("Set up OpenCL context:")
        print("  Platform: %s" % (str(platform.name)))
        print("  Device: %s" % (str(device.name)))
        return True
Example #22
0
def _enumerate_cl_devices_for_ref_test():
    """Yield every OpenCL device, CPU devices first.

    Raises LoopyError when no device exists at all, and warns when there is
    no CPU device to run the reference kernel on.
    """
    import pyopencl as cl

    cpu_devs = []
    noncpu_devs = []

    all_devs = (dev for pf in cl.get_platforms() for dev in pf.get_devices())
    for dev in all_devs:
        target = cpu_devs if dev.type & cl.device_type.CPU else noncpu_devs
        target.append(dev)

    if not cpu_devs and not noncpu_devs:
        raise LoopyError("no CL device found for test")

    if not cpu_devs:
        warn("No CPU device found for running reference kernel. The reference "
                "computation will either fail because of a timeout "
                "or take a *very* long time.")

    # CPU devices come first, the rest afterwards.
    for dev in cpu_devs + noncpu_devs:
        yield dev
def get_OCL_context():
    """
    Retrieves the OpenCL context

    On macOS (``sys.platform == "darwin"``) a single context is created from
    the GL sharing properties with an empty device list.  On other systems
    each platform is tried in turn: first a platform-wide context, then, if
    that fails, one context per device until one can be created.

    Returns the first context that could be created, or None if every
    attempt failed.

    Raises RuntimeError if the ``pyopencl`` module is not available.
    """
    if not pyopencl:
        raise RuntimeError("OpenCL unuseable")
    ctx = None
    if sys.platform == "darwin":
        ctx = pyopencl.Context(properties=get_gl_sharing_context_properties(),
                               devices=[])
    else:
        # Some OSs prefer clCreateContextFromType, some prefer
        # clCreateContext. Try both and loop.
        for platform in pyopencl.get_platforms():
            try:
                ctx = pyopencl.Context(properties=[
                    (pyopencl.context_properties.PLATFORM, platform)]
                    + get_gl_sharing_context_properties())
            except Exception:
                # Catch Exception rather than everything, so Ctrl-C and
                # SystemExit are not swallowed while probing.
                for device in platform.get_devices():
                    try:
                        ctx = pyopencl.Context(properties=[
                            (pyopencl.context_properties.PLATFORM, platform)]
                            + get_gl_sharing_context_properties(),
                            devices=[device])
                    except Exception:
                        ctx = None
                    else:
                        break
            else:
                # Platform-wide context worked; stop searching.
                break
            if ctx:
                break
    return ctx
Example #24
0
    def init_opencl(self):
        platforms = cl.get_platforms()
        print 'The platforms detected are:'
        print '---------------------------'
        for platform in platforms:
            print platform.name, platform.vendor, 'version:', platform.version

        # List devices in each platform
        for platform in platforms:
            print 'The devices detected on platform', platform.name, 'are:'
            print '---------------------------'
            for device in platform.get_devices():
                print device.name, '[Type:', cl.device_type.to_string(device.type), ']'
                print 'Maximum clock Frequency:', device.max_clock_frequency, 'MHz'
                print 'Maximum allocable memory size:', int(device.max_mem_alloc_size / 1e6), 'MB'
                print 'Maximum work group size', device.max_work_group_size
                print 'Maximum work item dimensions', device.max_work_item_dimensions
                print 'Maximum work item size', device.max_work_item_sizes
                print '---------------------------'

        # Create a context with all the devices
        devices = platforms[0].get_devices()
        self.context = cl.Context(devices)
        print 'This context is associated with ', len(self.context.devices), 'devices'
        self.queue = cl.CommandQueue(self.context, self.context.devices[0],
                                     properties=cl.command_queue_properties.PROFILING_ENABLE)
        self.kernels = cl.Program(self.context, open(file_dir + '/D2Q9.cl').read()).build(options='')
Example #25
0
File: idt.py Project: Fuhji/TopoMC
    def __init__(self, coords, values, wantCL=True, platform_num=None):
        """
        Take the coordinates and values and build a KD tree.

        Keyword arguments:
        coords -- input coordinates (x, y)
        values -- input values

        """

        self.coords = np.asarray(coords, dtype=np.float32)
        self.values = np.asarray(values, dtype=np.int32)

        if self.coords.shape[0] != self.values.shape[0]:
            raise AssertionError('lencoords does not equal lenvalues')

        self.wantCL = wantCL
        self.canCL = False

        if hasCL and self.wantCL:
            try:
                platforms = cl.get_platforms()
                try:
                    platform = platforms[platform_num]
                    self.devices = self.platform.get_devices()
                    self.context = cl.Context(self.devices)
                except TypeError:
                    # The user may be asked to select a platform.
                    self.context = cl.create_some_context()
                    self.devices = self.context.devices
                except IndexError:
                    raise
                self.queue = cl.CommandQueue(self.context)
                filestr = ''.join(open('idt.cl', 'r').readlines())
                self.program = cl.Program(self.context, filestr).build(devices=self.devices)
                for device in self.devices:
                    buildlog = self.program.get_build_info(device, cl.program_build_info.LOG)
                    if (len(buildlog) > 1):
                        print 'Build log for device', device, ':\n', buildlog
                # Only the first kernel is used.
                self.kernel = self.program.all_kernels()[0]

                # Local and global sizes are device-dependent.
                self.local_size = {}
                self.global_size = {}
                # Groups should be overcommitted.
                # For now, use 3 (48 cores / 16 cores per halfwarp) * 2
                for device in self.devices:
                    work_group_size = self.kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
                    num_groups_for_1d = device.max_compute_units * 3 * 2
                    self.local_size[device] = (work_group_size,)
                    self.global_size[device] = (num_groups_for_1d * work_group_size,)
                self.canCL = True
            except cl.RuntimeError:
                print 'warning: unable to use pyopencl, defaulting to cKDTree'

        if self.canCL:
            self.tree = build_tree(coords)
        else:
            self.tree = KDTree(coords)
Example #26
0
def main():
    """Command-line entry point: pick a model and an OpenCL device, then simulate.

    Usage: ``<script> MODEL [PLATFORM_NUM DEVICE_NUM]``

    * With no model argument a hint is printed and the process exits with
      status 0.
    * With both PLATFORM_NUM and DEVICE_NUM given they are used directly.
    * Otherwise the platform and device are chosen interactively from the
      lists reported by pyopencl.
    """
    # Get module name to load
    if len(sys.argv) < 2:
        print("Please specify a model (.py) file")
        sys.exit(0)
    moduleName = sys.argv[1]

    # Get OpenCL platform/device numbers
    if len(sys.argv) >= 4:
        platnum = int(sys.argv[2])
        devnum = int(sys.argv[3])
    else:
        # Both numbers are required on the command line; a lone platform
        # number falls through to the interactive prompts instead of failing
        # on the missing device argument.
        try:
            # Python 2: plain input() would eval() whatever the user types.
            read_line = raw_input
        except NameError:
            read_line = input

        # User input of OpenCL setup
        import pyopencl as cl
        # Platform
        platforms = cl.get_platforms()
        print("Select OpenCL platform:")
        for i, platform in enumerate(platforms):
            print('press ' + str(i) + ' for ' + str(platform))
        platnum = int(read_line('Platform Number: '))

        # Device
        devices = platforms[platnum].get_devices()
        print("Select OpenCL device:")
        for i, device in enumerate(devices):
            print('press ' + str(i) + ' for ' + str(device))
        devnum = int(read_line('Device Number: '))

    # Set up complete, now run the simulation
    simulate(moduleName, platnum, devnum)
Example #27
0
def save_device_fetch(type):
	"""Return a one-element list holding the first device of the given type.

	Devices are collected platform by platform, in the order reported by
	``cl.get_platforms()``.  An IndexError is raised when no platform offers
	a device of the requested ``type``.
	"""
	found = []
	for platform in cl.get_platforms():
		found.extend(platform.get_devices(device_type=type))

	# Only the first matching device is handed back, wrapped in a list.
	first = found[0]
	return [first]
Example #28
0
    def __init__(self, locCard, plots, outPlotQueues, alarmQueue, idLoc):
        """Store the job description and, if enabled, prepare OpenCL objects.

        ``status`` is -1 while the run directory is being determined and 0
        afterwards.  For backends whose name starts with 'shadow' the run
        directory is ``<cwd>/tmp<idLoc>``.

        The OpenCL branch is switched off by the local ``isOpenCL`` flag, so
        ``cl_ctx`` stays None.  When enabled it picks the first device whose
        ``type`` equals 2 and builds the program from ``hist.cl``.
        """
        self.status = -1
        if locCard.backend.startswith('shadow'):
            self.runDir = os.sep.join([locCard.cwd, 'tmp' + str(idLoc)])
        self.idN = idLoc
        self.status = 0

        self.card = locCard
        self.plots = plots
        self.outPlotQueues = outPlotQueues
        self.alarmQueue = alarmQueue

        isOpenCL = False
        self.cl_ctx = None
        if not isOpenCL:
            return

        # Lazily walk platforms/devices and stop at the first match.
        matching = (dev
                    for plat in cl.get_platforms()
                    for dev in plat.get_devices()
                    if dev.type == 2)
        iDevice = next(matching, None)
        if iDevice is None:
            return

        self.cl_ctx = cl.Context(devices=[iDevice])
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        cl_file = os.path.join(__dir__, r'hist.cl')
        with open(cl_file, 'r') as source_file:
            kernelsource = source_file.read()
        self.cl_program = cl.Program(self.cl_ctx, kernelsource).build()
        self.cl_mf = cl.mem_flags
Example #29
0
def init(platform_name=None, device_index=None, profiling=False, profiling_file='profile.dat',
         loglevel=logging.INFO, logfile=None, double_precision=False):
    """Set up logging, precision and the OpenCL configuration.

    OpenCL defaults are created with *platform_name*, *device_index* and
    *profiling* when at least one platform is found.  A failure to query
    platforms is logged, not raised.  With *profiling* enabled, the profiler
    writes to *profiling_file* and is shut down at interpreter exit.
    """
    cfg.init_logging(level=loglevel, logger_file=logfile)
    cfg.PRECISION = cfg.Precision(double_precision)
    cfg.OPENCL = cfg.OpenCL()

    platforms = []
    try:
        platforms = cl.get_platforms()
    except Exception as error:
        LOG.exception(str(error))
    else:
        if platforms:
            make_opencl_defaults(platform_name=platform_name, device_index=device_index, profiling=profiling)
        else:
            LOG.warning('No OpenCL platforms found, GPU computing will not be available')

    if profiling:
        _wrap_opencl()
        prf.PROFILER = prf.Profiler(cfg.OPENCL.queues, profiling_file)
        prf.PROFILER.start()

        def shutdown_profiler():
            """Shutdown the profiler on exit."""
            # prf.PROFILER is looked up at exit time, not at registration.
            prf.PROFILER.shutdown()

        atexit.register(shutdown_profiler)

    if platforms:
        init_programs()
Example #30
0
File: lab1.py Project: spetz911/CL
def cl_init(type = 'GPU'):
	"""Create an OpenCL context and print a summary of its first device.

	For ``type`` 'GPU' or 'CPU' a context is created over all devices of that
	type on the first platform.  If that fails, or ``type`` is any other
	value, the context is obtained from
	``cl.create_some_context(interactive=True)``.

	The printed platform and device are always taken from the context that
	is actually returned, so the summary is valid on the fallback path too.

	Returns:
		The created ``cl.Context``.
	"""
	device_types = {
		'GPU': cl.device_type.GPU,
		'CPU': cl.device_type.CPU,
	}

	ctx = None
	my_type = device_types.get(type)
	if my_type is not None:
		try:
			first_platform = cl.get_platforms()[0]
			ctx = cl.Context(devices=first_platform.get_devices(device_type=my_type))
		except Exception:
			# No platform, no device of that type, or context creation
			# failed: let the user pick interactively below.
			ctx = None
	if ctx is None:
		ctx = cl.create_some_context(interactive=True)

	# Describe what the context really contains rather than relying on
	# locals that are only set when the first attempt succeeded.
	device = ctx.devices[0]
	platform = device.platform
	print("===============================================================")
	print("Platform name: " + platform.name)
	print("Platform vendor: " + platform.vendor)
	print("Platform version: " + platform.version)
	print("---------------------------------------------------------------")
	print("Device name: " + device.name)
	print("Device type: " + cl.device_type.to_string(device.type))
	print("Local memory: " + str(device.local_mem_size//1024) + ' KB')
	print("Device memory: " + str(device.global_mem_size//1024//1024) + ' MB')
	print("Device max clock speed:" + str(device.max_clock_frequency) + ' MHz')
	print("Device compute units:" + str(device.max_compute_units))

	return ctx
Example #31
0
def offsetData(data, offset=0):
    """Shift the columns of a 3-channel image by *offset* using OpenCL.

    *data* is indexed as ``data[row][column]``; three values per pixel are
    processed.  For each output column the kernel reads from column
    ``x + offset`` when ``x < offset`` and from ``x - offset`` otherwise.

    Returns a float32 array of shape ``(rows, columns, 3)``.
    """
    h = len(data)
    w = len(data[0])
    dim = 3
    shape = (h, w, dim)
    result = np.empty(h * w * dim, dtype=np.float32)

    # flat float32 view of the input for the device buffer
    data = np.array(data).astype(np.float32).reshape(-1)

    # the kernel function
    src = """
    __kernel void offsetData(__global float *dataIn, __global float *result){
        int w = %d;
        int dim = %d;
        int offsetX = %d;

        // get current position
        int posx = get_global_id(1);
        int posy = get_global_id(0);

        // convert position from 0,360 to -180,180
        int posxOffset = posx;
        if (offsetX > 0 || offsetX < 0) {
            if (posx < offsetX) {
                posxOffset = posxOffset + offsetX;
            } else {
                posxOffset = posxOffset - offsetX;
            }
        }

        // get indices
        int i = posy * w * dim + posxOffset * dim;
        int j = posy * w * dim + posx * dim;

        // set result
        result[j] = dataIn[i];
        result[j+1] = dataIn[i+1];
        result[j+2] = dataIn[i+2];
    }
    """ % (w, dim, offset)

    # Only the first platform is queried: once for GPUs, once for everything.
    first_platform = cl.get_platforms()[0]
    GPUs = first_platform.get_devices(device_type=cl.device_type.GPU)
    CPU = first_platform.get_devices()

    # prefer GPUs
    if GPUs:
        ctx = cl.Context(devices=GPUs)
    else:
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    queue = cl.CommandQueue(ctx)
    flags = cl.mem_flags

    program = cl.Program(ctx, src).build()

    in_buf = cl.Buffer(ctx, flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=data)
    out_buf = cl.Buffer(ctx, flags.WRITE_ONLY, result.nbytes)

    # one work item per pixel
    program.offsetData(queue, [h, w], None, in_buf, out_buf)

    # bring the result back to the host and restore the image shape
    cl.enqueue_copy(queue, result, out_buf)
    return result.reshape(shape)
Example #32
0
def get_platforms():
    """Return whatever ``cl.get_platforms()`` reports."""
    available = cl.get_platforms()
    return available
Example #33
0
from __future__ import print_function
import pyopencl as cl
from pyopencl import array

try:
    from pyopencl import cltypes
except ImportError:
    from ..utils import cltypes
import numpy as np

print(cl.get_platforms())
kernel_src = """
/**
 * Updates the table for every bit in every step
 *
**/
__kernel void learn(
    __global const uint* activeBitIdx, // work size is the number of activations, array of active bits indices in the input
    __global float* averages,
    __global uint* count,
    float const alpha, // moving average alpha
    float const actualValue, // actual input value from the PF
    uint  const bucketIdx, // bucket that actualValue falls into
    uint  const bucketCount // number of buckets
) {
    const int gid = get_global_id(0);
    const int n = activeBitIdx[gid]; // each job updates the table for a single active bit of the input

    const int nbI = n*bucketCount + bucketIdx;
    // increment the active count for this bit's bucket
    averages[nbI] = ((1-alpha)*averages[nbI]) + alpha * actualValue;
import pyopencl as cl
import pyopencl.array as cl_array
import pyopencl.cltypes as cltypes
import numpy
import pytest

import rk_pd_4d

# Module-level OpenCL setup, executed at import time.
# Pick the platform whose name is exactly 'Intel(R) OpenCL'; next() raises
# StopIteration when no such platform is installed.
platform = next(platform for platform in cl.get_platforms()
                if platform.name == 'Intel(R) OpenCL')

# NOTE: despite the singular name this is the list of all devices on that
# platform, as returned by get_devices().
device = platform.get_devices()

context = cl.Context(device)  # Initialize the Context
queue = cl.CommandQueue(context)  # Instantiate a Queue


@pytest.mark.parametrize(
    'initials, t0, t1, derived_function, expected, delta_absolute_error, absolute_error, relative_error, expected_error_runge_kutta',
    [
        (numpy.array([cltypes.make_double4(0.0, 0.0, 0.0, 0.0)
                      ]), 0.0, 1.0, '1.0, 1.0, 1.0, 1.0',
         numpy.array([cltypes.make_double4(1.0, 1.0, 1.0, 1.0)]), 1e-18, 1e-18,
         1e-18, numpy.array([numpy.double(0.0)])),
        (numpy.array([cltypes.make_double4(0.0, 0.0, 0.0, 0.0)
                      ]), 0.0, 1.0, '1.0, 1.0, 2.0 * Y->x, - 2.0 * Y->y',
         numpy.array([cltypes.make_double4(1.0, 1.0, 1.0, -1.0)]), 2.3e-16,
         2.3e-16, 1e-18, numpy.array([numpy.double(0.0)])),
        (numpy.array([cltypes.make_double4(0.0, 0.0, 0.0, 0.0)]), 0.0, 1.0,
         '1.0, 1.0, 3.0 * Y->x * Y->x, - 3.0 * Y->y * Y->y',
         numpy.array([cltypes.make_double4(1.0, 1.0, 1.0, -1.0)]), 3.2e-16,
Example #35
0
def getTemperatureImage(data, p):
    """Colour a temperature grid through a gradient lookup, using OpenCL.

    Args:
        data: array of shape ``(h, w, dim)``; the first value of every pixel
            is read as the temperature.  It is converted to float32 because
            the kernel declares its input as ``float*``.
        p: mapping with ``"temperature_range"`` (min, max) and ``"gradient"``
            (sequence of colour triples; the kernel multiplies them by 255).

    Temperatures outside (-99, 99) are treated as invalid and drawn with the
    fixed colour (45, 50, 55).

    Returns:
        The image built with ``Image.fromarray(..., mode="RGB")``.
    """
    tRange = p["temperature_range"]
    gradient = p["gradient"]

    dataG = np.array(gradient)
    dataG = dataG.astype(np.float32)

    shape = data.shape
    h, w, dim = shape

    # The device buffer is a raw byte copy of the host array, so the host
    # dtype must match the kernel's float pointer.
    data = np.ascontiguousarray(data, dtype=np.float32).reshape(-1)
    dataG = dataG.reshape(-1)

    # the kernel function
    src = """
    __kernel void lerpImage(__global float *d, __global float *grad, __global uchar *result){
        int w = %d;
        int dim = %d;
        int gradLen = %d;
        float minValue = %f;
        float maxValue = %f;

        // get current position
        int posx = get_global_id(1);
        int posy = get_global_id(0);

        // get index
        int i = posy * w * dim + posx * dim;
        float temperature = d[i];
        int r = 45;
        int g = 50;
        int b = 55;

        // assume large values are invalid
        if (temperature > -99.0 && temperature < 99.0) {
            // normalize the temperature
            float norm = (temperature - minValue) / (maxValue - minValue);
            // clamp
            if (norm > 1.0) {
                norm = 1.0;
            }
            if (norm < 0.0) {
                norm = 0.0;
            }
            // get color from gradient
            int gradientIndex = (int) round(norm * (gradLen-1));
            gradientIndex = gradientIndex * 3;
            r = (int) round(grad[gradientIndex] * 255);
            g = (int) round(grad[gradientIndex+1] * 255);
            b = (int) round(grad[gradientIndex+2] * 255);
        }

        // set the color
        result[i] = r;
        result[i+1] = g;
        result[i+2] = b;
    }
    """ % (w, dim, len(gradient), tRange[0], tRange[1])

    # Only the first platform is queried: once for GPUs, once for everything.
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs:
        ctx = cl.Context(devices=GPUs)
    else:
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    # One output byte per input value; allocate the host array directly
    # instead of converting full-size temporaries.
    result = np.empty(data.shape, dtype=np.uint8)

    inData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    inG = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=dataG)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    prg.lerpImage(queue, [h, w], None, inData, inG, outResult)

    # Copy result
    cl.enqueue_copy(queue, result, outResult)

    result = result.reshape(shape)
    imOut = Image.fromarray(result, mode="RGB")
    return imOut
Example #36
0
class MergeSort:
    """Merge sort on the host (serial) and on an OpenCL device (two kernels).

    All OpenCL objects are class attributes created when the class body is
    executed: the devices of the platform named ``NAME``, one context, one
    profiling-enabled queue and the two built programs.  Instances only hold
    ``a_gpu``, the device copy of the first array passed to ``prepare_data``.
    """

    NAME = 'NVIDIA CUDA'
    platforms = cl.get_platforms()
    # devs stays None when no platform is called NAME; cl.Context then
    # receives None as its device list.
    devs = None
    for platform in platforms:
        if platform.name == NAME:
            devs = platform.get_devices()
    ctx = cl.Context(devs)
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Plain ints: the np.int alias of the builtin no longer exists in
    # current NumPy releases.
    tile_x = 32
    tile_y = 1

    def merge_sort_serial(self, a_cpu):
        """Return the sorted contents of *a_cpu* using recursive merge sort.

        a_cpu: an array generated in cpu.
        Inputs of length 0 or 1 are returned unchanged (same object).
        """
        a_length = len(a_cpu)

        #Base case
        if a_length <= 1:
            return a_cpu

        #Recursive Case
        a_mid = a_length // 2
        left = self.merge_sort_serial(np.array(a_cpu[0:a_mid]))
        right = self.merge_sort_serial(np.array(a_cpu[a_mid:a_length]))

        return self.merge_serial(left, right)

    def merge_serial(self, left_cpu, right_cpu):
        """Merge two sorted sequences into one sorted numpy array.

        On ties the element from *left_cpu* is taken first.
        """
        result = []
        i = 0
        j = 0
        n_left = len(left_cpu)
        n_right = len(right_cpu)

        # Walk both inputs with indices instead of re-copying the remainder
        # after every element.
        while i < n_left and j < n_right:
            if left_cpu[i] <= right_cpu[j]:
                result.append(left_cpu[i])
                i += 1
            else:
                result.append(right_cpu[j])
                j += 1

        # consume other when one is empty
        if i == n_left:
            result = np.concatenate((result, right_cpu[j:]))
        else:
            result = np.concatenate((result, left_cpu[i:]))

        return result

    #%%

    # Kernel source strings are passed verbatim to the OpenCL compiler.
    merge_sort_naive_kernel_code = """
    __kernel void Merge_sort_naive(__global float* a, __global float* a_temp, __global float* c, const unsigned int a_length)
    {
        //-----initialize-----
        int tx = get_local_id(0);
        int bx = get_group_id(0); 
        int col = bx * get_local_size(0) + tx; 
        
        const int a_len = a_length;
        const int block_size = 32; 

        //-----iterate stride and tile_shift-----
        for (int stride =1; stride<a_len; stride*=2){
            int shift_count = (a_len-1)/(block_size*stride*2)+1;
            
            for (int tile_shift= 0; tile_shift < shift_count; tile_shift++){

                int beginning = col * stride *2 + tile_shift *  stride * 2 * block_size;
                int middle = beginning + stride;
                int end = middle + stride;

                if (beginning>= a_len) continue; 

                //alter middle and end if necessary
                if (end>a_len){
                    end = a_len;
                }

                if (middle > a_len){
                    middle = a_len;
                }

                int temp_distance_1 = middle - beginning;
                int temp_distance_2 = end - middle;

                //merge
                int m = 0;
                int n = 0;

                while (m<temp_distance_1 && n<temp_distance_2){
                    if (a[beginning+m] < a[middle+n]){
                        a_temp[beginning+m+n]=a[beginning+m];
                        m++;
                    }
                    else if (a[beginning+m] >= a[middle+n]){
                        a_temp[beginning+m + n]=a[middle+n];
                        n++;
                    }
                }

                //put in the rest of arr2
                if (n<temp_distance_2){
                    while (n<temp_distance_2){
                        a_temp[beginning+m+n] = a[middle+n];
                        n++;
                    }
                }

                if (m<temp_distance_1){
                    while (m<temp_distance_1){
                        a_temp[beginning+m+n] = a[beginning+m];
                        m++;
                    }
                }
                    
                barrier(CLK_LOCAL_MEM_FENCE);
                barrier(CLK_GLOBAL_MEM_FENCE);
                
                for (int j=0; j<end-beginning; j++){
                    a[beginning+j] = a_temp[beginning+j];
                    a_temp[beginning+j] = 0; //set to zero to clean
                }
                
                float min_temp = a[beginning]; 
                float max_temp = a[end]; 
                //printf("%d, %d, %d | %f, %f  \\n", beginning, middle, end, min_temp, max_temp); 
                
                barrier(CLK_LOCAL_MEM_FENCE);
                barrier(CLK_GLOBAL_MEM_FENCE);
                
            }
            
        }

        for (int k=0; k<a_len; k++){
            c[k] = a[k];
        }
        
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);
    
    }

    """
    #%%
    merge_sort_optimized1_kernel_code = """
    __kernel void Merge_sort_optimized1(__global float* a, __global float* c, const unsigned int a_length)
    {
        //initialize
        int tx = get_local_id(0);
        int bx = get_group_id(0); 
        int col = bx * get_local_size(0) + tx; 
        const int a_len = a_length;

        //-----load a array into shared memory-----
        __local float a_shared[1024];

        if (col<a_length){
            a_shared[col] = a[col];
        }
        
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);

        //-----sort-----
        //-----set stride-----
        for (int stride = 1; stride < a_len; stride *= 2){
            int beginning = col * stride *2; //test
            int middle = beginning + stride;
            int end = middle + stride;

            if(beginning>=a_len) continue;

            //-----watch for edge cases of beginning, middle, or end larger than a_length-----

            //alter middle and end if necessary
            if (end>a_len){
                end = a_len;
            }

            if (middle>a_len){
                middle = a_len;
            }

            int temp_distance_1 = middle-beginning;
            int temp_distance_2 = end - middle;

            //merge
            int m = 0;
            int n = 0;
            float a_temp[1024];

            while (m<temp_distance_1 && n<temp_distance_2){
                if (a_shared[beginning+m] < a_shared[middle+n]){
                    a_temp[beginning + m +n]=a_shared[beginning+m];
                    m++;
                }
                else if (a_shared[beginning+m] >= a_shared[middle+n]){
                    a_temp[beginning + m + n]=a_shared[middle+n];
                    n++;
                }
            }

            //put in the rest of arr2
            if (n<temp_distance_2){
                while (n<temp_distance_2){
                    a_temp[beginning+m+n] = a_shared[middle+n];
                    n++;
                }
            }

            if (m<temp_distance_1){
                while (m<temp_distance_1){
                    a_temp[beginning+m+n] = a_shared[beginning+m];
                    m++;
                }
            }
                
            //put temp into shared
            for (int j=beginning; j<end; j++){
                a_shared[j] = a_temp[j];
            }
            
            float min_temp = a_shared[beginning]; 
            float max_temp = a_shared[end]; 
            // printf("%d, %d, %d | %f, %f  \\n", beginning, middle, end, min_temp, max_temp); 
            
            barrier(CLK_LOCAL_MEM_FENCE);
            barrier(CLK_GLOBAL_MEM_FENCE);
        }
        for (int k=0; k<a_len; k++){
            c[k] = a_shared[k];
        }
    
    }
    """

    #%%

    prg_merge_sort_naive = cl.Program(ctx,
                                      merge_sort_naive_kernel_code).build()
    prg_merge_sort_optimized1 = cl.Program(
        ctx, merge_sort_optimized1_kernel_code).build()

    #%%

    def __init__(self):
        """Start without any device-side copy of the input."""
        self.a_gpu = None

    def prepare_data(self, a_cpu):
        """Upload *a_cpu* to the device unless an upload already exists.

        NOTE(review): only the first array ever passed is uploaded; later
        calls with different data reuse the existing ``a_gpu`` -- confirm
        this caching is intended.
        """
        if self.a_gpu is None:
            self.a_gpu = cl.array.to_device(MergeSort.queue, a_cpu)

    #%%

    def merge_sort_naive(self, a_cpu):
        """Run the naive kernel on *a_cpu*; return ``(sorted_array, time)``.

        The kernel is launched with a single work group of
        ``min(len(a_cpu), 32)`` work items.
        """
        print("-" * 80)
        print("Naive")

        a_length = len(a_cpu)
        minimum = min(a_length, 32)
        place_holder = a_cpu[0:minimum]

        self.prepare_data(a_cpu)
        # Only the shape of this array is used, as the launch size.
        place_holder_gpu = cl.array.empty(MergeSort.queue, place_holder.shape,
                                          a_cpu.dtype)
        c_naive_gpu = cl.array.empty(MergeSort.queue, a_cpu.shape, a_cpu.dtype)
        b_naive_gpu = cl.array.empty(MergeSort.queue, a_cpu.shape, a_cpu.dtype)
        evt = MergeSort.prg_merge_sort_naive.Merge_sort_naive(
            MergeSort.queue, place_holder_gpu.shape, place_holder_gpu.shape,
            self.a_gpu.data, b_naive_gpu.data, c_naive_gpu.data,
            np.int32(a_length))
        evt.wait()
        # NOTE(review): OpenCL profiling counters are in nanoseconds, so a
        # factor of 1e-9 would give seconds; 1e-10 is kept as found --
        # confirm the intended unit.
        time_naive = 1e-10 * (evt.profile.end - evt.profile.start)
        c_naive = c_naive_gpu.get()
        return c_naive, time_naive

    #%%

    def merge_sort_optimized1(self, a_cpu):
        """Run the local-memory kernel on *a_cpu*; return ``(sorted_array, time)``.

        The kernel is launched with one work group as large as the input;
        its local buffer holds 1024 floats.
        """
        print("-" * 80)
        print("Optimized")
        # different a_length version
        a_length = len(a_cpu)

        self.prepare_data(a_cpu)
        c_optimized_gpu = cl.array.empty(MergeSort.queue, a_cpu.shape,
                                         a_cpu.dtype)
        evt = MergeSort.prg_merge_sort_optimized1.Merge_sort_optimized1(
            MergeSort.queue, c_optimized_gpu.shape, c_optimized_gpu.shape,
            self.a_gpu.data, c_optimized_gpu.data, np.int32(a_length))
        evt.wait()
        # NOTE(review): same 1e-10 factor as in merge_sort_naive -- confirm
        # the intended unit.
        time_optimized = 1e-10 * (evt.profile.end - evt.profile.start)
        c_optimized = c_optimized_gpu.get()
        return c_optimized, time_optimized
Example #37
0
  knl = lp.split_iname(knl, "k", 8, outer_tag="g.2", inner_tag="l.2" )
  
  return knl


# Module-level constants; `dimension` is derived from `n`. The meaning of the
# other values is not visible in this excerpt.
n=128
r=3
k=0
norm2=1
norm=1
eps=1e-8
d=3
dimension=[n,n,n]


# NOTE(review): the platform is selected by the hard-coded index 1; the
# variable name suggests an NVIDIA platform is expected at that position --
# confirm on the target machine. The context covers every device of it.
plt = cl.get_platforms()
nvidia_plat = plt[1]
ctx = cl.Context(nvidia_plat.get_devices())


# Call the kernel-building helpers (defined outside this excerpt) with the
# shared context.
knl_get_tensor = get_tensor(ctx)
knl_r_U = Prav_U(ctx)
knl_r_V = Prav_V(ctx)
knl_r_W = Prav_W(ctx)
knl_l_U = left_U(ctx)
knl_l_V = left_V(ctx)
knl_l_W = left_W(ctx)
# Wrap the built kernels in lp.CompiledKernel for the same context.
cknl_r_U = lp.CompiledKernel(ctx, knl_r_U)
cknl_r_V = lp.CompiledKernel(ctx, knl_r_V)
cknl_r_W = lp.CompiledKernel(ctx, knl_r_W)
cknl_l_U = lp.CompiledKernel(ctx, knl_l_U)
Example #38
0
import pyopencl as cl
import pytest
import numpy as np

from pyclesperanto_prototype import create_image

# Every device of every OpenCL platform, flattened into one list at import
# time.
DEVICES = [
    dev
    for plat in cl.get_platforms()
    for dev in plat.get_devices()
]


@pytest.fixture(params=DEVICES, ids=lambda x: x.name)
def context(request):
    """Provide a fresh single-device context for each entry of DEVICES."""
    device = request.param
    return cl.Context(devices=[device])


# Set of dtype names; the entries that are commented out are currently not
# part of the set.
dtypes = {
    "int8",
    "int16",
    "int32",
    # 'int64',
    "uint8",
    "uint16",
    "uint32",
    # 'uint64',
    "float16",
    "float32",
    # 'float64',
    # "complex64",
}
Example #39
0
    def __init__(self, interface):
        """Select the OpenCL platform and device, load the kernel, allocate buffers.

        ``self.PLATFORM`` and ``self.DEVICE`` may each be an index or None.
        None is accepted only when exactly one candidate exists, in which
        case index 0 is used.  For any other invalid selection the choices
        are logged, ``interface.fatal()`` is called and initialisation stops
        early.

        ``self.AGGRESSION`` is raised by 16 and clamped to [16, 32];
        ``self.size`` becomes ``2 ** AGGRESSION``.
        """
        platforms = cl.get_platforms()

        # Initialize object attributes and retrieve command-line options...)
        self.device = None
        self.kernel = None
        self.interface = interface
        self.core = self.interface.addCore()
        self.defines = ''
        self.loopExponent = 0

        # Set the initial number of nonces to run per execution
        # 2^(16 + aggression)
        self.AGGRESSION += 16
        self.AGGRESSION = min(32, self.AGGRESSION)
        self.AGGRESSION = max(16, self.AGGRESSION)
        self.size = 1 << self.AGGRESSION

        # We need a QueueReader to efficiently provide our dedicated thread
        # with work.
        self.qr = QueueReader(self.core, lambda nr: self.preprocess(nr),
                              lambda x, y: self.size * 1 << self.loopExponent)

        # The platform selection must be valid to mine.
        # None is tested first: comparing None with an int is an error on
        # Python 3, and an empty list must not fall through to indexing.
        if self.PLATFORM is None:
            platform_ok = len(platforms) == 1
        else:
            platform_ok = self.PLATFORM < len(platforms)

        if not platform_ok:
            self.interface.log(
                'Wrong platform or more than one OpenCL platform found, '
                'use PLATFORM=ID to select one of the following\n', False,
                True)

            for i, p in enumerate(platforms):
                self.interface.log('    [%d]\t%s' % (i, p.name), False, False)

            # Since the platform is invalid, we can't mine.
            self.interface.fatal()
            return

        if self.PLATFORM is None:
            self.PLATFORM = 0

        devices = platforms[self.PLATFORM].get_devices()

        # The device selection must be valid to mine.
        if self.DEVICE is None:
            device_ok = len(devices) == 1
        else:
            device_ok = self.DEVICE < len(devices)

        if not device_ok:
            self.interface.log(
                'No device specified or device not found, '
                'use DEVICE=ID to specify one of the following\n', False, True)

            for i, d in enumerate(devices):
                self.interface.log('    [%d]\t%s' % (i, d.name), False, False)

            # Since the device selection is invalid, we can't mine.
            self.interface.fatal()
            return

        if self.DEVICE is None:
            self.DEVICE = 0

        self.device = devices[self.DEVICE]

        # We need the appropriate kernel for this device...
        try:
            self.loadKernel(self.device)
        except Exception:
            self.interface.fatal('Failed to load OpenCL kernel!')
            return

        # Initialize a command queue to send commands to the device, and a
        # buffer to collect results in...
        # NOTE(review): self.context is not assigned in this method;
        # presumably loadKernel() sets it -- confirm.
        self.commandQueue = cl.CommandQueue(self.context)
        self.output = np.zeros(self.OUTPUT_SIZE + 1, np.uint32)
        self.output_buf = cl.Buffer(self.context,
                                    cl.mem_flags.WRITE_ONLY
                                    | cl.mem_flags.USE_HOST_PTR,
                                    hostbuf=self.output)

        self.applyMeta()
Example #40
0
def threadedSimulation(time_delta,
                       time_steps,
                       objects,
                       queue_data,
                       queue_comm,
                       skip_n,
                       openCL=False,
                       method="first order leapfrog"):
    """
    Run the simulation and publish the sampled positions on ``queue_data``.

    The input is first transformed to numpy arrays if the simulation runs on
    the CPU, or to OpenCL ``double4`` vectors if OpenCL is being used.  Then
    an OpenCL environment is conditionally set up (first device of the first
    platform, kernel source read from a ``.cl`` file given by a relative
    path) and finally the simulation loop is run.

    Parameters:
        time_delta: step size handed to the integrator.
        time_steps: number of integration steps; used with ``range``.
        objects: sequence of objects offering ``getName()`` and
            ``getMass()``; positions and velocities are obtained through
            ``ao.getAstrObjPos`` / ``ao.getAstrObjVel``.
        queue_data: queue that receives
            ``[name, [time_delta, skip_n], position_out]`` at the end.
        queue_comm: control queue.  A ``"stop"`` message (polled every 10th
            step) aborts the loop; any other message is put back.  Whenever
            the queue is empty the current progress in percent is put on it.
        skip_n: only every ``skip_n``-th step is stored in the output.
        openCL: truthy to run on OpenCL, falsy to run on the CPU.
        method: ``"first order leapfrog"`` or ``"PEFRL"``.
    """
    # Decide once: the original compared against False and True separately,
    # so a truthy non-bool took the OpenCL data path without any OpenCL
    # initialisation.
    use_opencl = bool(openCL)

    def stop_requested():
        """Return True if a "stop" message is waiting on queue_comm."""
        if queue_comm.empty():
            return False
        message = queue_comm.get()
        if message == "stop":
            return True
        # Not a stop request (e.g. an unread progress value): put it back.
        queue_comm.put(message)
        return False

    def xyz_rows(vectors):
        """Return the first three components of every entry in row 0."""
        return np.array(
            [list(vectors[0][k])[0:3] for k in range(len(vectors[0]))],
            dtype=np.float64)

    #transformation-----------------------------------------------------
    name = [""] * len(objects)
    mass = np.zeros(len(objects), dtype=np.float64)
    position_out = np.zeros(
        (int(time_steps) // int(skip_n) + 1, len(objects), 3),
        dtype=np.float64)

    if not use_opencl:
        pos = np.zeros((len(objects), 3), dtype=np.float64)
        vel = np.zeros((len(objects), 3), dtype=np.float64)

        for elem, obj in enumerate(objects):
            name[elem] = obj.getName()
            mass[elem] = obj.getMass()
            pos[elem] = np.array([ao.getAstrObjPos(obj, objects)],
                                 dtype=np.float64)
            vel[elem] = np.array([ao.getAstrObjVel(obj, objects)],
                                 dtype=np.float64)

        position_out[0] = np.array(pos, dtype=np.float64)

    else:
        pos = np.zeros((1, len(objects)), cl.array.vec.double4)
        vel = np.zeros((1, len(objects)), cl.array.vec.double4)

        for elem, obj in enumerate(objects):
            name[elem] = obj.getName()
            mass[elem] = obj.getMass()
            # double4 needs a fourth component; it is padded with 0.
            pos[0, elem] = tuple(ao.getAstrObjPos(obj, objects)) + (0, )
            vel[0, elem] = tuple(ao.getAstrObjVel(obj, objects)) + (0, )

        position_out[0] = xyz_rows(pos)

    #OpenCL initialization----------------------------------------------
    if use_opencl:
        platform = cl.get_platforms()[0]
        device = platform.get_devices()[0]
        context = cl.Context([device])
        clqueue = cl.CommandQueue(context)

        if method == "first order leapfrog":
            with open("first_order_leapfrog.cl", 'r') as kernel_file:
                kernel = kernel_file.read()
            program = cl.Program(context, kernel).build()
            program.kick.set_scalar_arg_dtypes(
                [None, None, None, np.int32, np.float32])
            program.drift.set_scalar_arg_dtypes(
                [None, None, np.int32, np.float32])
        elif method == "PEFRL":
            with open("PEFRL.cl", 'r') as kernel_file:
                kernel = kernel_file.read()
            program = cl.Program(context, kernel).build()
            program.PEFRL_1.set_scalar_arg_dtypes(
                [None, None, np.int32, np.float32])
            program.PEFRL_2.set_scalar_arg_dtypes(
                [None, None, None, np.int32, np.float32])
            program.PEFRL_3.set_scalar_arg_dtypes(
                [None, None, np.int32, np.float32])
            program.PEFRL_4.set_scalar_arg_dtypes(
                [None, None, None, np.int32, np.float32])
            program.PEFRL_5.set_scalar_arg_dtypes(
                [None, None, np.int32, np.float32])

        mem_flags = cl.mem_flags

        buffer_position = cl.Buffer(context,
                                    mem_flags.READ_WRITE
                                    | mem_flags.COPY_HOST_PTR,
                                    hostbuf=pos)
        buffer_velocity = cl.Buffer(context,
                                    mem_flags.READ_WRITE
                                    | mem_flags.COPY_HOST_PTR,
                                    hostbuf=vel)
        buffer_mass = cl.Buffer(context,
                                mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                                hostbuf=mass)

    #simulation----------------------------------------------------
    if not use_opencl:
        if method == "first order leapfrog":
            #initial phase shift
            vel = ode.leapfrog_first_order_kick(pos, vel, mass,
                                                time_delta / 2)
        j = 1
        for i in range(time_steps):
            if i % 10 == 0 and stop_requested():
                break

            if method == "first order leapfrog":
                pos = ode.leapfrog_first_order_drift(pos, vel, time_delta)
                vel = ode.leapfrog_first_order_kick(pos, vel, mass, time_delta)
            elif method == "PEFRL":
                pos, vel = ode.PEFRL(pos, vel, mass, time_delta)

            # For skip_n == 1 this is true on every step and j == i + 1, so
            # one branch covers both the "store all" and the "skip" case.
            if (i + 1) % skip_n == 0:
                position_out[j] = np.array(pos, dtype=np.float64)
                j += 1
            if queue_comm.empty():
                queue_comm.put(i / time_steps * 100)
    else:
        dim = np.int32(len(objects))
        time_delta_CL = np.float32(time_delta)

        #send kernels to GPU--------------------------------------------------------------
        if method == "first order leapfrog":
            kernel_kick_built = program.kick
            kernel_kick_built.set_args(buffer_mass, buffer_position,
                                       buffer_velocity, dim,
                                       np.float32(time_delta / 2))
            #initial phase offset
            cl.enqueue_nd_range_kernel(clqueue, kernel_kick_built, vel.shape,
                                       None)

            kernel_drift_built = program.drift
            kernel_drift_built.set_args(buffer_position, buffer_velocity, dim,
                                        time_delta_CL)

            # From here on the kick uses the full time step.
            kernel_kick_built.set_args(buffer_mass, buffer_position,
                                       buffer_velocity, dim, time_delta_CL)
        elif method == "PEFRL":
            kernel_PEFRL1_built = program.PEFRL_1
            kernel_PEFRL1_built.set_args(buffer_position, buffer_velocity, dim,
                                         time_delta_CL)

            kernel_PEFRL2_built = program.PEFRL_2
            kernel_PEFRL2_built.set_args(buffer_mass, buffer_position,
                                         buffer_velocity, dim, time_delta_CL)

            kernel_PEFRL3_built = program.PEFRL_3
            kernel_PEFRL3_built.set_args(buffer_position, buffer_velocity, dim,
                                         time_delta_CL)

            kernel_PEFRL4_built = program.PEFRL_4
            kernel_PEFRL4_built.set_args(buffer_mass, buffer_position,
                                         buffer_velocity, dim, time_delta_CL)

            kernel_PEFRL5_built = program.PEFRL_5
            kernel_PEFRL5_built.set_args(buffer_position, buffer_velocity, dim,
                                         time_delta_CL)

            # One PEFRL step runs the five kernels forwards and then back.
            pefrl_sequence = (kernel_PEFRL1_built, kernel_PEFRL2_built,
                              kernel_PEFRL3_built, kernel_PEFRL4_built,
                              kernel_PEFRL5_built, kernel_PEFRL4_built,
                              kernel_PEFRL3_built, kernel_PEFRL2_built,
                              kernel_PEFRL1_built)

        #actual simulation loop-----------------------------------------------------------
        j = 1
        for i in range(time_steps):
            if i % 10 == 0 and stop_requested():
                break

            if method == "first order leapfrog":
                cl.enqueue_nd_range_kernel(clqueue, kernel_drift_built,
                                           vel.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_kick_built,
                                           vel.shape, None)
            elif method == "PEFRL":
                # pos and vel have the same shape, so one global size
                # serves every kernel of the sequence.
                for kernel_built in pefrl_sequence:
                    cl.enqueue_nd_range_kernel(clqueue, kernel_built,
                                               pos.shape, None)

            #prevents the queue from growing in some implementations of OpenCL
            if i % 100 == 0:
                clqueue.finish()

            # Same closed form as on the CPU path (see comment there).
            if (i + 1) % skip_n == 0:
                cl.enqueue_copy(clqueue, pos, buffer_position)
                position_out[j] = xyz_rows(pos)
                j += 1
            if queue_comm.empty():
                queue_comm.put(i / time_steps * 100)

    queue_data.put([name, [time_delta, skip_n], position_out])
Example #41
0
def buildGraph(ip):
    """Build the k-NN graph from the initial parameters.

    params:
    ------
    ip: initial params; the attributes ``k``, ``sigma`` and ``signal``
        (a 2-D array) are read.

    return:
    ------
    graph: ``Graph`` object built from the edge weights computed on the
        device, the unrolled neighbour indices and ``k``.
    """
    # find the nearest neighbors on the gpu
    start = time()
    nbrs = NearestNeighbors(n_neighbors=ip.k+1, algorithm="buffer_kd_tree", tree_depth=9, plat_dev_ids={0:[0]})
    nbrs.fit(ip.signal)
    dists, inds = nbrs.kneighbors(ip.signal)

    # Drop the first neighbour column (k+1 neighbours were requested;
    # presumably column 0 is the query point itself) and flatten.
    dists_gpu = unroll(dists[0:, 1:]).astype('float32')
    ngbrs_gpu = unroll(inds[0:, 1:]).astype('int32')

    k = ip.k
    scale = ip.sigma
    n, _ = ip.signal.shape

    # now build the graph using those nns using gpu
    platform = cl.get_platforms()[0]
    print(platform)
    device = platform.get_devices()[0]
    print(device)
    context = cl.Context([device])
    print(context)
    # Read the kernel source through a context manager so the file handle
    # is closed again instead of being left to the garbage collector.
    with open(mywf) as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build()
    print(program)
    queue = cl.CommandQueue(context)
    print(queue)

    # create the buffers on the device: distances in, weights out
    mem_flags = cl.mem_flags
    dists_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=dists_gpu)
    weight_vec = np.ndarray(shape=(n*k,), dtype=np.float32)
    weight_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, weight_vec.nbytes)

    # run the kernel to compute the weights (one work item per edge)
    program.compute_weights(queue, (n*k,), None, dists_buf, weight_buf, np.int32(k), np.float32(scale))
    queue.finish()

    # copy the weights to the host memory
    cl.enqueue_copy(queue, weight_vec, weight_buf)
    queue.finish()
    end = time() - start

    print('total time taken by the gpu python:', end)
    # save the graph
    graph = Graph(weight_vec, ngbrs_gpu, k)
    return graph
Example #42
0
    print("[INFO]: implement = %s" % (args.type))
    print("[INFO]: arch = %s" % (args.arch))
    print("[INFO]: kernel @ %s" % (args.kernel))
    print("[INFO]: repeat %d times" % (args.repeat))
    print("[INFO]: transA = %s" % (args.transA))
    print("[INFO]: transB = %s" % (args.transB))
    print("[INFO]: m = %d" % (args.m))
    print("[INFO]: n = %d" % (args.n))
    print("[INFO]: k = %d" % (args.k))
    print("[INFO]: alpha = %f" % (args.alpha))
    print("[INFO]: beta = %f" % (args.beta))
    print("[INFO]: verify = %s" % (args.verify))

    # create platform, cq
    platforms = filter(lambda p: 'AMD' in p.name, cl.get_platforms())
    devices = filter(lambda d: args.arch == d.name, platforms[0].get_devices())
    assert len(devices) == 1
    ctx = cl.Context(devices)
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    hgemm = Hgemm(args.kernel, ctx, devices)
    hgemm.tune(args.m, args.n, args.k,\
               args.m, args.n, args.m,\
               args.alpha, args.beta,\
               args.transA, args.transB,\
               implement = args.type)

    #A = np.asfortranarray(np.random.rand(args.m, args.k).astype(np.float16))
    #B = np.asfortranarray(np.random.rand(args.n, args.k).astype(np.float16))
    #A = np.asfortranarray(np.tril(np.full((args.m, args.k),1.0)).astype(np.float16))
Example #43
0
import numpy
import pyopencl             #@UnresolvedImport
from pyopencl import mem_flags  #@UnresolvedImport

from xpra.util import engs
from xpra.os_util import _memoryview

# Device selection preferences, each overridable through the environment.
PREFERRED_DEVICE_TYPE = os.environ.get("XPRA_OPENCL_DEVICE_TYPE", "GPU")
PREFERRED_DEVICE_NAME = os.environ.get("XPRA_OPENCL_DEVICE_NAME", "")
PREFERRED_DEVICE_PLATFORM = os.environ.get("XPRA_OPENCL_PLATFORM", "")

# Off unless XPRA_OPENCL_YUV2RGB is exactly "1".
OPENCL_YUV2RGB = os.environ.get("XPRA_OPENCL_YUV2RGB", "0") == "1"

# Starts out True (warning treated as already shown) unless
# XPRA_AMD_WARNING is "1", which is also the default.
AMD_WARNING_SHOWN = os.environ.get("XPRA_AMD_WARNING", "1") != "1"

# Fail the import early when no OpenCL platform is available.
opencl_platforms = pyopencl.get_platforms()
if not opencl_platforms:
    raise ImportError("no OpenCL platforms found!")

def roundup(n, m):
    """Apply the bit-mask round-up of ``n`` to a multiple of ``m``.

    The mask trick only yields a true multiple when ``m`` is a power of two.
    """
    mask = ~(m - 1)
    bumped = n + m - 1
    return bumped & mask

def dimdiv(dim, div):
    """Divide a dimension by the subsampling factor.

    The quotient is passed through ``roundup`` with the same factor; the
    intent stated by the original author is to round up so that the last
    pixel is included when the dimension is odd.
    """
    quotient = dim // div
    return roundup(quotient, div)

def device_type(d):
    try:
        return pyopencl.device_type.to_string(d.type)
Example #44
0
def getParticleData(data, p):
    """Render particle trails into a 2-D alpha image with an OpenCL kernel.

    Parameters:
        data: nested sequence indexed as ``data[row][col]``; the kernel reads
            three floats per cell and uses the second and third as the u/v
            components.
        p: dict of parameters.  Keys read here: ``particles``,
            ``points_per_particle``, ``fade_ms``, ``duration_ms``, ``ms``,
            ``debug``, ``animationProgress``, ``width``, ``height``,
            ``particleProperties`` (three floats per particle),
            ``mag_range``, ``alpha_range``, ``velocity_multiplier``,
            ``linewidth_range`` and ``linewidth_lat_range``.

    Returns:
        ``numpy.uint8`` array of shape ``(p["height"], p["width"])``.
    """
    h = p["particles"]
    w = p["points_per_particle"]

    # fade in and out
    fade_ms = p["fade_ms"]
    dur = p["duration_ms"]
    ms = p["ms"]
    fadeProgress = 1.0
    if ms < fade_ms:
        fadeProgress = 1.0 * ms / fade_ms
    elif ms > (dur - fade_ms):
        fadeProgress = 1.0 - 1.0 * (ms - (dur - fade_ms)) / fade_ms
    if p["debug"]:
        fadeProgress = 1.0

    offset = 1.0 - p["animationProgress"]
    tw = p["width"]
    th = p["height"]
    dh = len(data)
    dw = len(data[0])

    # Host-side image, all zeros; it also seeds the device buffer below.
    result = np.zeros(tw * th, dtype=np.float32)

    # Flatten the field data and the per-particle properties to float32.
    fData = np.array(data)
    fData = fData.astype(np.float32)
    fData = fData.reshape(-1)

    pData = np.array(p["particleProperties"])
    pData = pData.astype(np.float32)
    pData = pData.reshape(-1)

    # the kernel function
    src = """

    static float lerp(float a, float b, float mu) {
        return (b - a) * mu + a;
    }

    static float det(float a0, float a1, float b0, float b1) {
        return a0 * b1 - a1 * b0;
    }

    static float2 lineIntersection(float x0, float y0, float x1, float y1, float x2, float y2, float x3, float y3) {
        float xd0 = x0 - x1;
        float xd1 = x2 - x3;
        float yd0 = y0 - y1;
        float yd1 = y2 - y3;

        float div = det(xd0, xd1, yd0, yd1);

        float2 intersection;
        intersection.x = -1.0;
        intersection.y = -1.0;

        if (div != 0.0) {
            float d1 = det(x0, y0, x1, y1);
            float d2 = det(x2, y2, x3, y3);
            intersection.x = det(d1, d2, xd0, xd1) / div;
            intersection.y = det(d1, d2, yd0, yd1) / div;
        }

        return intersection;
    }


    static float norm(float value, float a, float b) {
        float n = (value - a) / (b - a);
        if (n > 1.0) {
            n = 1.0;
        }
        if (n < 0.0) {
            n = 0.0;
        }
        return n;
    }

    static float wrap(float value, float a, float b) {
        if (value < a) {
            value = b - (a - value);
        } else if (value > b) {
            value = a + (value - b);
        }
        return value;
    }

    void drawLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha, int thickness);
    void drawSingleLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha);

    void drawLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha, int thickness) {
        int dx = abs(x1-x0);
        int dy = abs(y1-y0);

        if (dx==0 && dy==0) {
            return;
        }

        // draw the first line
        drawSingleLine(p, x0, y0, x1, y1, w, h, alpha);

        thickness--;
        if (thickness < 1) return;

        int stepX = 0;
        int stepY = 0;
        if (dx > dy) stepY = 1;
        else stepX = 1;

        // loop through thickness
        int offset = 1;
        for (int i=0; i<thickness; i++) {
            int xd = stepX * offset;
            int yd = stepY * offset;

            drawSingleLine(p, x0+xd, y0+yd, x1+xd, y1+yd, w, h, alpha);

            // alternate above and below
            offset *= -1;
            if (offset > 0) {
                offset++;
            }
        }


    }

    void drawSingleLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha) {
        // clamp
        x0 = clamp(x0, 0, w-1);
        x1 = clamp(x1, 0, w-1);
        y0 = clamp(y0, 0, h-1);
        y1 = clamp(y1, 0, h-1);

        int dx = abs(x1-x0);
        int dy = abs(y1-y0);

        if (dx==0 && dy==0) {
            return;
        }

        int sy = 1;
        int sx = 1;
        if (y0>=y1) {
            sy = -1;
        }
        if (x0>=x1) {
            sx = -1;
        }
        int err = dx/2;
        if (dx<=dy) {
            err = -dy/2;
        }
        int e2 = err;

        int x = x0;
        int y = y0;
        for(int i=0; i<w; i++){
            p[y*w+x] = alpha;
            if (x==x1 && y==y1) {
                break;
            }
            e2 = err;
            if (e2 >-dx) {
                err -= dy;
                x += sx;
            }
            if (e2 < dy) {
                err += dx;
                y += sy;
            }
        }
    }

    __kernel void getParticles(__global float *data, __global float *pData, __global float *result){
        int points = %d;
        int dw = %d;
        int dh = %d;
        float tw = %f;
        float th = %f;
        float offset = %f;
        float magMin = %f;
        float magMax = %f;
        float alphaMin = %f;
        float alphaMax = %f;
        float velocityMult = %f;
        float fadeProgress = %f;
        float lineWidthMin = %f;
        float lineWidthMax = %f;
        float lineWidthLatMin = %f;
        float lineWidthLatMax = %f;

        // get current position
        int i = get_global_id(0);
        float dx = pData[i*3];
        float dy = pData[i*3+1];
        float doffset = pData[i*3+2];

        // set starting position
        float x = dx * (tw-1);
        float y = dy * (th-1);

        for(int j=0; j<points; j++) {
            // get UV value
            int lon = (int) round(dx * (dw-1));
            int lat = (int) round(dy * (dh-1));
            int dindex = lat * dw * 3 + lon * 3;
            float u = data[dindex+1];
            float v = data[dindex+2];

            // check for invalid values
            if (u >= 999.0 || u <= -999.0) {
                u = 0.0;
            }
            if (v >= 999.0 || v <= -999.0) {
                v = 0.0;
            }

            // calc magnitude
            float mag = sqrt(u * u + v * v);
            mag = norm(mag, magMin, magMax);

            // determine alpha transparency based on magnitude and offset
            float jp = (float) j / (float) (points-1);
            float progressMultiplier = (jp + offset + doffset) - floor(jp + offset + doffset);

            float alpha = lerp(alphaMin, alphaMax, mag * progressMultiplier);
            float thickness = lerp(lineWidthMin, lineWidthMax, mag * progressMultiplier);

            // adjust thickness based on latitude
            float latMultiplier = (float) abs(lat - (dh/2)) / (float) (dh/2);
            float thicknessMultiplier = lerp(lineWidthLatMin, lineWidthLatMax, latMultiplier);
            thickness *= thicknessMultiplier;
            if (thickness < 1.0) thickness = 1.0;

            // we are fading in/out
            if (fadeProgress < 1.0) {
                alpha = alpha * fadeProgress;
            }

            float x1 = x + u * velocityMult;
            float y1 = y + (-v) * velocityMult;

            // clamp y
            if (y1 < 0.0) {
                y1 = 0.0;
            }
            if (y1 > (th-1.0)) {
                y1 = th-1.0;
            }

            // check for no movement
            if (x==x1 && y==y1) {
                break;

            // check for invisible line
            } else if (alpha < 1.0) {
                // continue

            // wrap from left to right
            } else if (x1 < 0) {
                float2 intersection = lineIntersection(x, y, x1, y1, (float) 0.0, (float) 0.0, (float) 0.0, th);
                if (intersection.y > 0.0) {
                    drawLine(result, (int) round(x), (int) round(y), 0, (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                    drawLine(result, (int) round((float) (tw-1.0) + x1), (int) round(y), (int) (tw-1.0), (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                }

            // wrap from right to left
            } else if (x1 > tw-1.0) {
                float2 intersection = lineIntersection(x, y, x1, y1, (float) (tw-1.0), (float) 0.0, (float) (tw-1.0), th);
                if (intersection.y > 0.0) {
                    drawLine(result, (int) round(x), (int) round(y), (int) (tw-1.0), (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                    drawLine(result, (int) round((float) x1 - (float)(tw-1.0)), (int) round(y), 0, (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                }

            // draw it normally
            } else {
                drawLine(result, (int) round(x), (int) round(y), (int) round(x1), (int) round(y1), (int) tw, (int) th, round(alpha), (int) thickness);
            }

            // wrap x
            x1 = wrap(x1, 0.0, tw-1);
            dx = x1 / tw;
            dy = y1 / th;

            x = x1;
            y = y1;
        }
    }
    """ % (w, dw, dh, tw, th, offset, p["mag_range"][0], p["mag_range"][1],
           p["alpha_range"][0], p["alpha_range"][1], p["velocity_multiplier"],
           fadeProgress, p["linewidth_range"][0], p["linewidth_range"][1],
           p["linewidth_lat_range"][0], p["linewidth_lat_range"][1])

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs:
        ctx = cl.Context(devices=GPUs)
    else:
        # Function-call form so the module also parses on Python 3.
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=fData)
    inPData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pData)
    # The kernel only writes the pixels that lie on a drawn line.  Seed the
    # device buffer with the zeroed host array so every other pixel reads
    # back as 0 rather than as uninitialised device memory.
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=result)

    # One work item per particle.
    prg.getParticles(queue, (h, ), None, inData, inPData, outResult)

    # Copy result
    cl.enqueue_copy(queue, result, outResult)

    result = result.reshape((th, tw))
    result = result.astype(np.uint8)

    return result
Example #45
0
def get_all_cl_gpus():
    """Return one flat list with the GPU devices of every OpenCL platform."""
    return [
        gpu
        for platform in cl.get_platforms()
        for gpu in platform.get_devices(cl.device_type.GPU)
    ]
Example #46
0
def addParticlesToImage(baseImage, colorImage, particles, p):
    """Blend a colour image over a base image, weighted by a particle mask.

    Parameters:
        baseImage: image convertible with ``np.array``; converted to uint8.
        colorImage: image convertible with ``np.array``; converted to uint8.
            Its shape ``(h, w, dim)`` defines the output shape.
        particles: array with one value per pixel; the kernel reads it as
            unsigned bytes and divides by 255 to get the blend alpha.
        p: dict; only ``p["line_visibility"]`` is read.

    Returns:
        uint8 array with the shape of ``colorImage``.  The kernel writes the
        first three channels of every pixel.
    """
    basePx = np.array(baseImage)
    basePx = basePx.astype(np.uint8)

    colorPx = np.array(colorImage)
    colorPx = colorPx.astype(np.uint8)

    shape = colorPx.shape
    h, w, dim = shape

    basePx = basePx.reshape(-1)
    colorPx = colorPx.reshape(-1)
    particles = particles.reshape(-1)

    # the kernel function
    src = """
    __kernel void addParticles(__global uchar *base, __global uchar *colors, __global uchar *particles, __global uchar *result){
        int w = %d;
        int dim = %d;
        float power = 1.0 - %f; // lower number = more visible lines

        int posx = get_global_id(1);
        int posy = get_global_id(0);
        int i = posy * w * dim + posx * dim;
        int j = posy * w + posx;

        float alpha = (float) particles[j] / 255.0;
        int r = colors[i];
        int g = colors[i+1];
        int b = colors[i+2];

        if (alpha > 0) {
            alpha = pow(alpha*alpha + alpha*alpha, power);
            if (alpha > 1.0) {
                alpha = 1.0;
            }
            float inv = 1.0 - alpha;
            r = (int) round((r * alpha) + ((float) base[i] * inv));
            g = (int) round((g * alpha) + ((float) base[i+1] * inv));
            b = (int) round((b * alpha) + ((float) base[i+2] * inv));
        } else {
            r = base[i];
            g = base[i+1];
            b = base[i+2];
        }

        result[i] = r;
        result[i+1] = g;
        result[i+2] = b;
    }
    """ % (w, dim, p["line_visibility"])

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs:
        ctx = cl.Context(devices=GPUs)
    else:
        # Function-call form so the module also parses on Python 3.
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inA = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=basePx)
    inB = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=colorPx)
    inC = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=particles)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, colorPx.nbytes)

    # One work item per pixel: global id 0 is the row, id 1 the column.
    prg.addParticles(queue, [h, w], None, inA, inB, inC, outResult)

    # Copy result
    result = np.empty_like(colorPx)
    cl.enqueue_copy(queue, result, outResult)

    result = result.reshape(shape)
    return result
Example #47
0
    block = sixtracklib.cBlock.from_line(line)
    cbeam = bref.copy().reshape(-1)[:npart]
    st = time.time()
    block.track_cl(cbeam, nturn=nturn, turnbyturn=True)
    st = time.time() - st
    perfgpu = st / npart / nturn * 1e6
    print("GPU part %4d, turn %4d: %10.3f usec/part*turn" %
          (npart, nturn, perfgpu))

    block = sixtracklib.cBlock.from_line(line)
    npart2 = npart / 100
    cbeam = bref.copy().reshape(-1)[:npart2]
    st = time.time()
    block.track(cbeam, nturn=nturn, turnbyturn=True)
    st = time.time() - st
    perfcpu = st / npart2 / nturn * 1e6
    print("CPU part %4d, turn %4d: %10.3f usec/part*turn" %
          (npart2, nturn, perfcpu))

    print("GPU/CPU : %g" % (perfcpu / perfgpu))
    return st, npart, nturn, perfgpu, perfcpu


# Benchmark driver: runs mkbench over a grid of particle / turn counts and
# writes one result row per combination to a time-stamped text file.
# In strftime, %m is the month and %M the minute; the original format had the
# two swapped, so file names carried the minute in the date part.
with open(time.strftime("bench_%Y%m%dT%H%M%S.txt"), 'w') as out:
    # Header comment naming the device.  It is terminated with a newline so
    # the first data row is not glued onto the comment line.
    out.write("#%s\n" % pyopencl.get_platforms()[0].get_devices()[0])
    for npart in [100, 1000, 2000, 5000, 10000, 20000]:
        for nturn in [1, 2, 5, 10]:
            st, npart, nturn, perfgpu, perfcpu = mkbench(npart, nturn)
            fmt = "%5d %5d %10.3f %10.3f %10.3f\n"
            out.write(fmt % (npart, nturn, perfgpu, perfcpu, perfcpu / perfgpu))
Example #48
0
import pyopencl as cl
import numpy as np
from timeit import timeit_repeat
from pyopencl.algorithm import RadixSort
from pyopencl.bitonic_sort import BitonicSort
from pyopencl import clrandom
from pyopencl.scan import GenericScanKernel

# Benchmark setup: take the first device of the SECOND OpenCL platform
# (platform index 1 is hard-coded, so at least two platforms must exist).
device = cl.get_platforms()[1].get_devices()[0]
ctx = cl.Context([device])
queue = cl.CommandQueue(ctx)
# Argument handed to the timeit_repeat decorator on the benchmark functions
# below -- presumably the number of timed repetitions; confirm against
# timeit_repeat's definition.
reps = 64


@timeit_repeat(reps)
def test_radix_speed(buff, sorter):
    """Run ``sorter`` on ``buff`` and wait on the second item it returns.

    NOTE(review): presumably ``sorter`` is a RadixSort instance whose call
    returns the completion event at index 1 -- confirm against the caller.
    """
    outcome = sorter(buff)
    outcome[1].wait()


@timeit_repeat(reps)
def test_bitonic_speed(buff, sorter):
    """Run ``sorter`` on ``buff`` and wait on the second item it returns.

    NOTE(review): presumably ``sorter`` is a BitonicSort instance whose call
    returns the completion event at index 1 -- confirm against the caller.
    """
    outcome = sorter(buff)
    outcome[1].wait()


@timeit_repeat(reps)
def test_numpy_speed(buff):
    """Sort ``buff`` with ``np.sort`` as the host-side reference timing.

    The sorted copy is discarded; only the call itself matters here.
    """
    _ = np.sort(buff)


from collections import defaultdict
Example #49
0
#!/usr/bin/env python
"""
Basic 2d histogram.
"""

import time

import pyopencl as cl
import pyopencl.array
import numpy as np

# Select the desired OpenCL platform; you shouldn't need to change this:
NAME = 'NVIDIA CUDA'
platforms = cl.get_platforms()
# Stays None when no platform carries the name above; cl.Context is then
# called with None instead of an explicit device list.
devs = None
for platform in platforms:
    # No break: if several platforms share the name, the last one wins.
    if platform.name == NAME:
        devs = platform.get_devices()

# Create the context for the selected devices and set up a command queue:
ctx = cl.Context(devs)
queue = cl.CommandQueue(ctx)


# Compute histogram in Python:
def hist(x):
    """Count how often each value occurs in ``x`` using 256 bins.

    Every element of ``x.flat`` is used directly as a bin index, so the
    values must be valid indices into an array of length 256.  Returns a
    ``uint32`` array of length 256.
    """
    counts = np.zeros(256, dtype=np.uint32)
    for value in x.flat:
        counts[value] += 1
    return counts
Example #50
0
    def __init__(self,
                 descriptor,
                 geometry,
                 moments,
                 collide,
                 pop_eq_src='',
                 boundary_src='',
                 platform=0,
                 precision='single',
                 layout=None,
                 padding=None,
                 align=False,
                 opengl=False):
        """Set up the OpenCL context, queue and memory, then build the kernel.

        Parameters:
            descriptor: stored and handed on to ``Memory``.
            geometry: stored and used, with ``padding``, to build the ``Grid``.
            moments, collide, pop_eq_src, boundary_src: stored on the
                instance; presumably consumed by ``build_kernel`` -- confirm.
            platform: index into ``cl.get_platforms()``.
            precision: ``'single'`` or ``'double'``; selects the float type
                and the compiler arguments.  Any other value leaves both as
                ``None``, which fails below at ``self.float_type[0]``.
            layout: stored and passed as the third argument of the
                ``equilibrilize`` kernel call.
            padding: passed to ``Grid``.
            align: passed to ``Memory``.
            opengl: when truthy the context is created with GL sharing
                properties; the flag is also passed to ``Memory``.
        """
        self.descriptor = descriptor
        self.geometry = geometry
        self.grid = Grid(self.geometry, padding)

        self.time = 0

        self.float_type = {
            'single': (numpy.float32, 'float'),
            'double': (numpy.float64, 'double'),
        }.get(precision, None)

        self.mako_lookup = TemplateLookup(directories=[Path(__file__).parent])

        self.platform = cl.get_platforms()[platform]

        if opengl:
            try:
                self.context = cl.Context(
                    properties=[(cl.context_properties.PLATFORM,
                                 self.platform)] +
                    get_gl_sharing_context_properties())
            except Exception:
                # Retry with the context restricted to the platform's first
                # device.  Catching Exception rather than using a bare
                # except keeps KeyboardInterrupt / SystemExit propagating.
                self.context = cl.Context(
                    properties=[(cl.context_properties.PLATFORM, self.platform)
                                ] + get_gl_sharing_context_properties(),
                    devices=[self.platform.get_devices()[0]])
        else:
            self.context = cl.Context(
                properties=[(cl.context_properties.PLATFORM, self.platform)])

        self.queue = cl.CommandQueue(self.context)

        self.memory = Memory(self.descriptor, self.grid, self.context,
                             self.float_type[0], align, opengl)
        self.tick = False

        self.moments = moments
        self.collide = collide

        self.pop_eq_src = pop_eq_src
        self.boundary_src = boundary_src

        self.layout = layout

        self.compiler_args = {
            'single': '-cl-single-precision-constant -cl-fast-relaxed-math',
            'double': '-cl-fast-relaxed-math'
        }.get(precision, None)

        # Presumably sets self.program, which is used right below -- confirm.
        self.build_kernel()

        # Run the equilibrilize kernel on both population buffers and block
        # until it has finished.
        self.program.equilibrilize(self.queue, self.grid.size(), self.layout,
                                   self.memory.cl_pop_a,
                                   self.memory.cl_pop_b).wait()

        self.material = numpy.ndarray(shape=(self.memory.volume, 1),
                                      dtype=numpy.int32)
Example #51
0
def main3d(Run):
    """Accumulate every frame of every run into the enabled outputs and save them.

    Each frame is multiplied by the mask image, divided by the per-run
    ``all_C3`` / ``all_POLA`` arrays and combined with coordinates obtained
    from ``Run.all_Q0`` rotated by the sample rotation and ``Run.Umatrix``.
    Depending on the ``Run.making_*`` flags the result is accumulated into
    volumes, shells, slices and/or pole figures, using the OpenCL kernels
    built from ``kernelCode.cl`` when ``params.gpu_enable`` is non-zero and
    the ``fillvolume`` routines otherwise.  Finally every accumulator is
    divided by its companion mask array wherever that mask is non-zero and
    the result is written to disk.

    Args:
        Run: Run description object.  Besides being read, it is modified:
            ``Run.params.phi_step`` is converted to a float32 array and,
            for each disabled output, the matching count/size attributes
            are set to 1.
    """
    tbegin = time.time()
    params = Run.params
    params.phi_step = np.array(params.phi_step, dtype=np.float32)
    ascii_gen_list = params.symmetry_operators
    ops_list = genlist2oplist(ascii_gen_list)
    # Symmetry is only applied when more than one operator was generated.
    apply_sym = 0
    if len(ops_list) > 1:
        apply_sym = 1
    number_of_run = params.number_of_run
    Bmatrix = Run.Bmatrix
    Bi = np.linalg.inv(Bmatrix)

    flist = Run.flist
    total = 0
    for run in range(number_of_run):
        total += len(flist[run])
    p = ProgressBar(total)
    Filter = fabio.open(params.maskFile).data.astype(np.float32)
    (dim1, dim2) = Filter.shape
    last_run = 0
    # Disabled outputs get size 1; these values are substituted into the
    # kernel template below (presumably so the substitutions stay valid).
    if not Run.making_volume:
        Run.number_of_volume = int(1)
        Run.cube_dim = int(1)
    if not Run.making_shell:
        Run.number_of_shell = int(1)
        Run.shell_dim = int(1)
    if not Run.making_slice:
        Run.number_of_slice = int(1)
        Run.slice_dim = int(1)
    if not Run.making_pole_figure:
        Run.number_of_figure = int(1)
        Run.pole_size = int(1)

    # GPU setup: context, queue, program and the buffers shared by all kernels.
    gpu_enable = int(params.gpu_enable)
    if gpu_enable:
        platform = cl.get_platforms()[int(params.platform_id)]
        device = platform.get_devices()[int(params.device_id)]
        context = cl.Context([device])
        queue = cl.CommandQueue(context)
        mf = cl.mem_flags
        # Read the kernel template inside a context manager so the file
        # handle is closed deterministically.
        with open("kernelCode.cl", "r") as kernel_file:
            kernel_code = kernel_file.read()
        kernel_pars = {
            "number_of_volume": Run.number_of_volume,
            "nx": Run.cube_dim,
            "ny": Run.cube_dim,
            "nz": Run.cube_dim,
            "dim1": dim1,
            "dim2": dim2,
            "dimsym": ops_list.shape[0],
            "number_of_shell": Run.number_of_shell,
            "sx": Run.shell_dim,
            "sy": Run.shell_dim,
            "sz": Run.shell_dim,
            "number_of_figure": Run.number_of_figure,
            "px": Run.pole_size,
            "py": Run.pole_size,
            "slice_size": Run.slice_dim,
        }
        prog = cl.Program(context, kernel_code % kernel_pars).build()
        # data_gpu is only initialised from Filter; it is overwritten with
        # the actual frame before each kernel launch.
        data_gpu = cl.Buffer(context,
                             mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=Filter)
        Qfin_gpu = cl.Buffer(context,
                             mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=Run.all_Q0[0])
        Filter_gpu = cl.Buffer(context,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=Filter)
        symOps_gpu = cl.Buffer(context,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=ops_list)

    if Run.making_volume:
        Volume = np.zeros(
            (Run.number_of_volume, Run.cube_dim, Run.cube_dim, Run.cube_dim),
            dtype=np.float32)
        Mask = np.zeros(
            (Run.number_of_volume, Run.cube_dim, Run.cube_dim, Run.cube_dim),
            dtype=np.uint32)
        if gpu_enable:
            volCenter_gpu = cl.Buffer(context,
                                      mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=Run.volume_center)
            volExtent_gpu = cl.Buffer(context,
                                      mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=Run.volume_extent)
            Volume_gpu = cl.Buffer(context,
                                   mf.READ_WRITE | mf.COPY_HOST_PTR,
                                   hostbuf=Volume)
            Mask_gpu = cl.Buffer(context,
                                 mf.READ_WRITE | mf.COPY_HOST_PTR,
                                 hostbuf=Mask)

    if Run.making_shell:
        ShellVolume = np.zeros(
            (Run.number_of_shell, Run.shell_dim, Run.shell_dim, Run.shell_dim),
            dtype=np.float32)
        ShellMask = np.zeros(
            (Run.number_of_shell, Run.shell_dim, Run.shell_dim, Run.shell_dim),
            dtype=np.uint32)
        if gpu_enable:
            Q_shell_gpu = cl.Buffer(context,
                                    mf.READ_ONLY | mf.COPY_HOST_PTR,
                                    hostbuf=Run.Q_shell)
            shell_center_gpu = cl.Buffer(context,
                                         mf.READ_ONLY | mf.COPY_HOST_PTR,
                                         hostbuf=Run.shell_center)
            # Extent and thickness are passed by value, not as buffers.
            shell_extent_gpu = np.float32(Run.shell_extent)
            shell_thickness_gpu = np.float32(Run.shell_thickness)
            ShellVolume_gpu = cl.Buffer(context,
                                        mf.READ_WRITE | mf.COPY_HOST_PTR,
                                        hostbuf=ShellVolume)
            ShellMask_gpu = cl.Buffer(context,
                                      mf.READ_WRITE | mf.COPY_HOST_PTR,
                                      hostbuf=ShellMask)

    if Run.making_slice:
        SliceImage = np.zeros(
            (Run.number_of_slice, Run.slice_dim, Run.slice_dim),
            dtype=np.float32)
        SliceMask = np.zeros(
            (Run.number_of_slice, Run.slice_dim, Run.slice_dim),
            dtype=np.uint32)
        if gpu_enable:
            G_gpu = cl.Buffer(context,
                              mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=Run.G)
            dQ0_gpu = cl.Buffer(context,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=Run.dQ0)
            dQ1_gpu = cl.Buffer(context,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=Run.dQ1)
            dQ2_gpu = cl.Buffer(context,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=Run.dQ2)
            Qoff_gpu = cl.Buffer(context,
                                 mf.READ_ONLY | mf.COPY_HOST_PTR,
                                 hostbuf=Run.Qoff)
            SliceImage_gpu = cl.Buffer(context,
                                       mf.READ_WRITE | mf.COPY_HOST_PTR,
                                       hostbuf=SliceImage)
            SliceMask_gpu = cl.Buffer(context,
                                      mf.READ_WRITE | mf.COPY_HOST_PTR,
                                      hostbuf=SliceMask)

    if Run.making_pole_figure:
        PoleData = np.zeros(
            (Run.number_of_figure, Run.pole_size, Run.pole_size),
            dtype=np.float32)
        PoleMask = np.zeros(
            (Run.number_of_figure, Run.pole_size, Run.pole_size),
            dtype=np.uint32)
        if gpu_enable:
            PoleData_gpu = cl.Buffer(context,
                                     mf.READ_WRITE | mf.COPY_HOST_PTR,
                                     hostbuf=PoleData)
            PoleMask_gpu = cl.Buffer(context,
                                     mf.READ_WRITE | mf.COPY_HOST_PTR,
                                     hostbuf=PoleMask)
            Qpole_gpu = cl.Buffer(context,
                                  mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=Run.Qpole)
            # Thickness is passed by value, not as a buffer.
            pole_thickness_gpu = np.float32(Run.pole_thickness)

    print("Scanning motor: ", Run.params.scanning_motor)
    for run in range(number_of_run):
        nbfile = 0
        sample_angles = np.zeros(int(Run.params.sample_circles))
        print("Scanning motor: %s" % (Run.params.scanning_motor[run]))
        scanning_motor_index = Run.params.sample_axis.index(
            Run.params.scanning_motor[run])
        sample_rotation_dir = list(Run.params.rot_dir)
        print("Scanning motor: %s, index: %d" %
              (Run.params.scanning_motor[run], scanning_motor_index))
        for file_idx in range(len(flist[run])):
            data = Run.allData_allRun[run][file_idx]
            motors = Run.allMotor_allRun[run][file_idx]
            data = (data * Filter) / (Run.all_C3[run] * Run.all_POLA[run])
            for sc in range(int(Run.params.sample_circles)):
                sample_angles[sc] = motors[Run.params.sample_axis[sc]]
            U = Run.Umatrix
            if gpu_enable:
                cl.enqueue_copy(queue, data_gpu, data).wait()

            for j in range(Run.interp_factor):
                # NOTE(review): the scanning angle is updated in place, so
                # the offsets added for successive j accumulate -- confirm
                # this is the intended interpolation.
                interphi = sample_angles[
                    scanning_motor_index] + j / Run.interp_factor * params.phi_step[
                        run]
                sample_angles[scanning_motor_index] = interphi
                R = Sample_Rotation(sample_angles, sample_rotation_dir)
                Q = np.tensordot(Run.all_Q0[run], R.T, axes=([2], [1]))
                Qfin = np.tensordot(Q, U.T, axes=([2], [1]))
                if gpu_enable:
                    cl.enqueue_copy(queue, Qfin_gpu, Qfin).wait()
                if Run.making_volume:
                    if gpu_enable:
                        prog.volReconstruction(queue, data.shape, None,
                                               volCenter_gpu, volExtent_gpu,
                                               Volume_gpu, Mask_gpu, Qfin_gpu,
                                               data_gpu, Filter_gpu,
                                               np.int32(apply_sym),
                                               symOps_gpu).wait()
                    else:
                        fillvolume.volume(Run.volume_center, Run.volume_extent,
                                          Volume, Mask, Qfin, data, Filter,
                                          apply_sym, ops_list)

                if Run.making_shell:
                    if gpu_enable:
                        prog.extract_shell(
                            queue, data.shape, None, Q_shell_gpu,
                            shell_center_gpu, shell_extent_gpu,
                            shell_thickness_gpu, ShellVolume_gpu,
                            ShellMask_gpu, Qfin_gpu, data_gpu, Filter_gpu,
                            np.int32(apply_sym), symOps_gpu).wait()
                    else:
                        fillvolume.extract_shell(Run.Q_shell, Run.shell_center,
                                                 Run.shell_extent,
                                                 Run.shell_thickness,
                                                 ShellVolume, ShellMask, Qfin,
                                                 data, Filter, apply_sym,
                                                 ops_list)

                if Run.making_slice:
                    if gpu_enable:
                        prog.extract_slice(
                            queue, data.shape, None,
                            np.int32(Run.number_of_slice), dQ0_gpu, dQ1_gpu,
                            dQ2_gpu, Qoff_gpu, SliceImage_gpu, SliceMask_gpu,
                            Qfin_gpu, data_gpu, Filter_gpu,
                            np.int32(apply_sym), symOps_gpu, G_gpu).wait()
                    else:
                        fillvolume.extract_slice(Run.number_of_slice, Run.dQ0,
                                                 Run.dQ1, Run.dQ2, Run.Qoff,
                                                 SliceImage, SliceMask, Qfin,
                                                 data, Filter, apply_sym,
                                                 ops_list, Run.G)

                if Run.making_pole_figure:
                    if gpu_enable:
                        prog.stereo_projection(
                            queue, data.shape, None, Qpole_gpu,
                            pole_thickness_gpu, PoleData_gpu, PoleMask_gpu,
                            Qfin_gpu, data_gpu, Filter_gpu,
                            np.int32(apply_sym), symOps_gpu).wait()
                    else:
                        fillvolume.stereo_projection(Run.Qpole,
                                                     Run.pole_thickness,
                                                     PoleData, PoleMask, Qfin,
                                                     data, Filter, apply_sym,
                                                     ops_list)

                print('interpolation #%d on %d' % (j + 1, Run.interp_factor))
            nbfile += 1
            p.update_time(nbfile + last_run)
            print(
                '------------------------------------------------------------')
            print(p)
            print(
                '------------------------------------------------------------')
            print('\n')
        last_run += nbfile

    print('3D Intensity Distribution : Done')

    # Release the buffers shared by all kernels.
    if gpu_enable:
        Qfin_gpu.release()
        data_gpu.release()
        Filter_gpu.release()
        symOps_gpu.release()

    if Run.making_volume:
        if gpu_enable:
            # Copy the accumulators back to the host before releasing them.
            cl.enqueue_copy(queue, Volume, Volume_gpu).wait()
            cl.enqueue_copy(queue, Mask, Mask_gpu).wait()
            Volume_gpu.release()
            Mask_gpu.release()
            volExtent_gpu.release()
            volCenter_gpu.release()
        for v in range(Run.number_of_volume):
            filter_ids = np.where(Mask[v] != 0)
            Volume[v][filter_ids] = Volume[v][filter_ids] / Mask[v][filter_ids]
            save_cmap(Run.volumeName[v], Volume[v])

    if Run.making_shell:
        if gpu_enable:
            cl.enqueue_copy(queue, ShellVolume, ShellVolume_gpu).wait()
            cl.enqueue_copy(queue, ShellMask, ShellMask_gpu).wait()
            ShellVolume_gpu.release()
            ShellMask_gpu.release()
            Q_shell_gpu.release()
            shell_center_gpu.release()

        for sh in range(Run.number_of_shell):
            filter_ids = np.where(ShellMask[sh] != 0)
            ShellVolume[sh][filter_ids] = ShellVolume[sh][
                filter_ids] / ShellMask[sh][filter_ids]
            save_cmap(Run.shellName[sh], ShellVolume[sh])

    if Run.making_slice:
        if gpu_enable:
            cl.enqueue_copy(queue, SliceImage, SliceImage_gpu).wait()
            cl.enqueue_copy(queue, SliceMask, SliceMask_gpu).wait()
            SliceImage_gpu.release()
            SliceMask_gpu.release()
        for s in range(Run.number_of_slice):
            # Divide only where the mask is non-zero; other pixels stay 0.
            hit = SliceMask[s] != 0
            mapout = np.zeros_like(SliceImage[s])
            mapout[hit] = SliceImage[s][hit] / SliceMask[s][hit]

            tmp2 = mapout * params.scale_factor
            wi = fabio.cbfimage.cbfimage(data=tmp2.astype(np.int32))
            mapOutName = params.slice_outname[s]
            wi.write(mapOutName)
            Qoutname = mapOutName.split(".")[0] + "_hkl.h5"
            print("Slice %s saved." % mapOutName)

            # Build the per-pixel coordinate grid of the slice and convert
            # it with the inverses of Run.G[s] and Run.Bmatrix.
            Qoffset = np.dot(Run.Qoff[s], Run.G[s])
            x = np.linspace(-Run.dQ1[s], Run.dQ1[s], Run.slice_dim)
            y = np.linspace(-Run.dQ2[s], Run.dQ2[s], Run.slice_dim)
            x, y = np.meshgrid(x, y)
            z = np.zeros(x.shape)
            q = np.zeros((Run.slice_dim, Run.slice_dim, 3))
            q[:, :, 0] = x + Qoffset[0]
            q[:, :, 1] = y + Qoffset[1]
            q[:, :, 2] = z + Qoffset[2]
            Gi = np.linalg.inv(Run.G[s])
            Qn = np.tensordot(q, Gi, axes=([2], [1]))
            HKL = np.tensordot(Qn, Bi, axes=([2], [1]))
            # The context manager closes the file even if a write fails.
            with h5py.File(Qoutname, "w") as h5file:
                h5file.create_dataset("/Q",
                                      data=HKL,
                                      compression='gzip',
                                      compression_opts=9)
                h5file.create_dataset("/data",
                                      data=tmp2,
                                      compression='gzip',
                                      compression_opts=9)
            print("HKL coordinates saved.")

    if Run.making_pole_figure:
        if gpu_enable:
            cl.enqueue_copy(queue, PoleData, PoleData_gpu).wait()
            cl.enqueue_copy(queue, PoleMask, PoleMask_gpu).wait()
            PoleData_gpu.release()
            PoleMask_gpu.release()
        # A dedicated index name keeps the progress bar `p` intact.
        for fig in range(Run.number_of_figure):
            hit = PoleMask[fig] != 0
            mapout = np.zeros_like(PoleData[fig])
            mapout[hit] = PoleData[fig][hit] / PoleMask[fig][hit]

            tmp = mapout * params.scale_factor
            wi = fabio.cbfimage.cbfimage(data=tmp.astype(np.int32))
            mapOutName = Run.pole_name[fig]
            wi.write(mapOutName)
            print("Pole %s saved." % mapOutName)

    print('Normal END')
    gc.collect()
    tend = time.time()
    print("Total time for this operation: %.3f s" % (tend - tbegin))
Example #52
0
# data points must be a multiple of workers

# Random single-precision operands and a result array of the same shape.
a = numpy.random.rand(data_points).astype(numpy.float32)
b = numpy.random.rand(data_points).astype(numpy.float32)
c_result = numpy.empty_like(a)

# Reference timing on the CPU: ((a + b) squared) times (a / 2), element-wise.
time1 = time()
c_temp = a + b
c_temp = numpy.multiply(c_temp, c_temp)
c_result = c_temp * (a / 2.0)
time2 = time()

print("Execution time of test without OpenCL: ", time2 - time1, "s")

# Describe every device of every OpenCL platform, one print call per row.
for platform in cl.get_platforms():
    for device in platform.get_devices():
        report = (
            ("===============================================================",),
            ("Platform name:", platform.name),
            ("Platform profile:", platform.profile),
            ("Platform vendor:", platform.vendor),
            ("Platform version:", platform.version),
            ("---------------------------------------------------------------",),
            ("Device name:", device.name),
            ("Device type:", cl.device_type.to_string(device.type)),
            ("Device memory: ", device.global_mem_size // 1024 // 1024, 'MB'),
            ("Device max clock speed:", device.max_clock_frequency, 'MHz'),
            ("Device compute units:", device.max_compute_units),
            ("Device max work group size:", device.max_work_group_size),
        )
        for fields in report:
            print(*fields)
Example #53
0
    def __init__(self,
                 platform,
                 salt,
                 iter,
                 debug,
                 N=0,
                 r=0,
                 p=0,
                 length=0x20):
        """Validate the arguments and set up the OpenCL context and queue.

        Args:
            platform: Index into ``cl.get_platforms()``.
            salt: Salt as bytes; its length must be a multiple of 4 (it is
                reinterpreted as uint32 words) and at most 64.
            iter: Iteration count, stored as a uint32 array in ``n_iter``.
            debug: When true, enable PyOpenCL compiler output and use the
                small debug kernel source.
            N, r, p: Stored unchanged on the instance.
            length: Not used in the visible part of this method.

        Raises:
            TypeError: If ``salt`` is not bytes-like or ``iter`` is not an
                integer.
            IndexError: If ``platform`` is past the last available platform.
        """
        # The original `assert ("message")` statements asserted a non-empty
        # string and therefore never fired; raise explicitly instead.
        if not isinstance(salt, (bytes, bytearray)):
            raise TypeError("Parameter salt has to be type of bytes")
        if not isinstance(iter, (int, np.integer)):
            raise TypeError("Parameter Iteration has to be type of int")
        platforms = cl.get_platforms()
        # `>=` rather than `>`: an index equal to the platform count is
        # already out of range.
        if platform >= len(platforms):
            raise IndexError("Selected platform %d doesn't exist" % platform)

        if len(salt) > 64:
            print('Salt longer than 64 chars is not supported!')
            exit(0)
        # frombuffer replaces the deprecated binary mode of np.fromstring;
        # np.append below copies, so the read-only view is never written to.
        n_salt = np.frombuffer(salt, dtype=np.uint32)
        n_saltlen = np.array([len(salt)], dtype=np.uint32)
        self.n_iter = np.array(iter, dtype=np.uint32)
        # Layout: [salt length in bytes, salt words...]
        self.salt = np.append(n_saltlen, n_salt)
        self.N = N
        self.r = r
        self.p = p

        # One context spanning every device of the selected platform.
        devices = platforms[platform].get_devices()
        self.workgroupsize = 60000
        print("Using Platform %d:" % platform)
        self.ctx = cl.Context(devices)
        for device in devices:
            print(
                '--------------------------------------------------------------------------'
            )
            print(' Device - Name: ' + device.name)
            print(' Device - Type: ' + cl.device_type.to_string(device.type))
            print(' Device - Compute Units: {0}'.format(
                device.max_compute_units))
            print(' Device - Max Work Group Size: {0:.0f}'.format(
                device.max_work_group_size))
            # Use the smallest limit so the size is valid on every device.
            self.workgroupsize = min(self.workgroupsize,
                                     device.max_work_group_size)

        print("\nUsing work group size of %d\n" % self.workgroupsize)

        # Create queue for each kernel execution
        self.queue = cl.CommandQueue(self.ctx)

        # Kernel function
        src = ""
        if debug:
            os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
            # Debug kernel: echoes its inputs into the output buffer instead
            # of computing a real derivation.
            src = """
            typedef struct {
                unsigned int length;
                unsigned int buffer[32/4];
            } inbuf;
            
            typedef struct {
                unsigned int buffer[32/4];
            } outbuf;
            
            static void pbkdf(__global const unsigned int *pass, int pass_len, const unsigned int *salt, int salt_len, int iter, unsigned int* hash, unsigned int hash_len)
            {
                hash[0]=pass_len;
                hash[1]=pass[0];
                hash[2]=hash_len;
                hash[3]=iter;
                hash[4]=salt_len;
                hash[5]=salt[0];
            }
            
            __kernel void func_pbkdf2(__global const inbuf * inbuffer, __global outbuf * outbuffer, __global const inbuf * salt, const int iterations)
            {
                unsigned int idx = get_global_id(0);
                unsigned int hash[32/4]={0};
                unsigned int ssalt[32/4]={0};
                ssalt[0]=salt[0].buffer[0];
                ssalt[1]=salt[0].buffer[1];
                ssalt[2]=salt[0].buffer[2];
                ssalt[3]=salt[0].buffer[3];
                ssalt[4]=salt[0].buffer[4];
                ssalt[5]=salt[0].buffer[5];
                ssalt[6]=salt[0].buffer[6];
                ssalt[7]=salt[0].buffer[7];
                int salt_len=salt[0].length;
                pbkdf(inbuffer[idx].buffer, inbuffer[idx].length, ssalt, salt_len, iterations, hash,32);
                outbuffer[idx].buffer[0]=hash[0];
                outbuffer[idx].buffer[1]=hash[1];
                outbuffer[idx].buffer[2]=hash[2];
                outbuffer[idx].buffer[3]=hash[3];
                outbuffer[idx].buffer[4]=hash[4];
                outbuffer[idx].buffer[5]=hash[5];
                outbuffer[idx].buffer[6]=hash[6];
                outbuffer[idx].buffer[7]=hash[7];
            }
            """
        else:
            os.environ['PYOPENCL_COMPILER_OUTPUT'] = '0'
Example #54
0
    def set_bfast_parameters(self,
                             start_monitor,
                             end_monitor,
                             start_hist,
                             freq,
                             k,
                             hfrac,
                             trend,
                             level,
                             backend='opencl',
                             verbose=0,
                             device_id=0):
        '''Store the BFAST settings and build the ``bfast.BFASTMonitor`` model.

        Every argument is stored on the instance under the same name; all
        except ``end_monitor`` and ``start_hist`` are also forwarded to
        ``bfast.BFASTMonitor``, whose instance is kept in ``self.model``.

        Parameters
        ----------
        start_monitor : datetime object
            Start of the monitoring phase.

        end_monitor : datetime object
            End of the monitoring phase.

        start_hist : datetime object
            Start of the history phase.

        freq : int
            The frequency for the seasonal model.

        k : int
            The number of harmonic terms.

        hfrac : float
            Float in the interval (0,1) specifying the bandwidth relative
            to the sample size in the MOSUM/ME monitoring processes.

        trend : bool
            Whether a trend offset term shall be used or not.

        level : float
            Significance level of the monitoring (and ROC, if selected)
            procedure, i.e., probability of type I error.

        backend : string, either 'opencl' or 'python' (default='opencl')
            Chooses what backend to use. opencl uses the GPU
            implementation, which is much faster.

        verbose : int, optional (default=0)
            The verbosity level (0=no output, 1=output)

        device_id : int, optional (default=0)
            Forwarded unchanged to ``bfast.BFASTMonitor``.
        '''

        self.start_monitor = start_monitor
        self.end_monitor = end_monitor
        self.start_hist = start_hist
        self.freq = freq
        self.k = k
        self.hfrac = hfrac
        self.trend = trend
        self.level = level
        self.backend = backend
        self.verbose = verbose
        self.device_id = device_id

        self.model = bfast.BFASTMonitor(
            self.start_monitor,
            freq=freq,
            k=k,
            hfrac=hfrac,
            trend=trend,
            level=level,
            backend=backend,
            verbose=verbose,
            device_id=device_id,
        )

        # Only probe for OpenCL devices when that backend was requested;
        # otherwise the "You selected openCL" warning would be misleading.
        if backend == 'opencl':
            try:
                print("device: ", pyopencl.get_platforms()[0].get_devices())
            except Exception:
                # Best-effort diagnostic only: `Exception` (not a bare
                # except) so KeyboardInterrupt/SystemExit still propagate.
                print(
                    "You selected  openCL, but no device was found, are you sure you set up a gpu session?"
                )
Example #55
0
	def __init__(self, batchSize, maxT, maxC, kernelVariant=1, enableGPUDebug=False):
		"""Set up the OpenCL context, buffers, program and kernels.

		Args:
			batchSize: number of batch elements.
			maxT: number of time-steps.
			maxC: number of characters.
			kernelVariant: 1 (single pass) or 2 (two passes).
			enableGPUDebug: set to True to debug the kernel via CodeXL; this
				enables compiler output and disables the PyOpenCL cache.

		Raises:
			AssertionError: if kernelVariant is not 1 or 2, no platform or
				GPU device is found, or the required size exceeds the
				kernel's work-group size on the selected device.
		"""

		# force rebuild of program such that GPU debugger can attach to kernel
		self.enableGPUDebug = enableGPUDebug
		if enableGPUDebug:
			os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
			os.environ['PYOPENCL_NO_CACHE'] = '1'

		# consts
		self.batchSize = batchSize
		self.maxT = maxT
		self.maxC = maxC
		assert kernelVariant in [1, 2]
		self.kernelVariant = kernelVariant

		# platform, context, queue
		platforms = cl.get_platforms()
		assert platforms
		self.platform = platforms[0] # take first platform
		devices = self.platform.get_devices(cl.device_type.GPU) # get GPU devices
		assert devices
		self.device = devices[0] # take first GPU
		self.context = cl.Context([self.device]) # context contains the first GPU
		self.queue = cl.CommandQueue(self.context, self.device) # command queue to first GPU

		# buffers: float32 input batch, int32 result and an int32 scratch
		# buffer of the same size as the result
		sizeOfFloat32 = 4
		batchBufSize = batchSize * maxC * maxT * sizeOfFloat32
		self.batchBuf = cl.Buffer(self.context, cl.mem_flags.READ_ONLY, size=batchBufSize, hostbuf=None)
		self.res = np.zeros([batchSize, maxT]).astype(np.int32)
		self.resBuf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY, self.res.nbytes)
		self.tmpBuf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY, self.res.nbytes)

		# compile program and use defines for program-constants to avoid passing private variables
		# (STEP_BEGIN is maxT rounded up to the next power of two)
		buildOptions = '-D STEP_BEGIN={} -D MAX_T={} -D MAX_C={}'.format(2 ** math.ceil(math.log2(maxT)), maxT, maxC)
		# read the kernel source inside a context manager so the file handle is closed
		with open('BestPathCL.cl') as kernelFile:
			kernelSource = kernelFile.read()
		self.program = cl.Program(self.context, kernelSource).build(buildOptions)

		# variant 1: single pass
		if kernelVariant == 1:
			self.kernel1 = cl.Kernel(self.program, 'bestPathAndCollapse')
			self.kernel1.set_arg(0, self.batchBuf)
			self.kernel1.set_arg(1, self.resBuf)

			# all time-steps must fit into a work-group
			assert maxT <= self.kernel1.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)

		# variant 2: two passes
		else:
			# kernel1: calculate best path
			self.kernel1 = cl.Kernel(self.program, 'bestPath')
			self.kernel1.set_arg(0, self.batchBuf)
			self.kernel1.set_arg(1, self.tmpBuf)

			# kernel2: collapse best path
			self.kernel2 = cl.Kernel(self.program, 'collapsePath')
			self.kernel2.set_arg(0, self.tmpBuf)
			self.kernel2.set_arg(1, self.resBuf)

			# all chars must fit into a work-group
			assert maxC <= self.kernel1.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)
Example #56
0
 def printplatforms(self):
     """Print index, name and vendor of each platform from cl.get_platforms()."""
     for index, platform in enumerate(cl.get_platforms()):
         summary = (index, platform.name, platform.vendor)
         print('Platform %d - Name %s, Vendor %s' % summary)
Example #57
0
import pyopencl as cl
import numpy as np
from pyopencl import cltypes
import os
from nncl import nn, losses
from nncl.layers import layer

if __name__ == "__main__":
    # Context and queue on the first device of the second OpenCL platform.
    device = cl.get_platforms()[1].get_devices()[0]
    ctx = cl.Context([device])
    queue = cl.CommandQueue(ctx)
    net = nn.Network(ctx)

    # Load the iris table (header row skipped) relative to this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    iris = np.loadtxt(script_dir + '/../data/iris.csv',
                      skiprows=1,
                      delimiter=',',
                      dtype=cltypes.float)

    # Fixed seed so the row shuffle is reproducible.
    np.random.seed(420)
    np.random.shuffle(iris)

    # The last column goes to y (kept 2-D); all other columns go to x.
    x, y = iris[:, :-1], iris[:, -1:]

    # The first 33% of the shuffled rows form the *_train arrays, the
    # remainder the *_test arrays.
    split_idx = int(0.33 * x.shape[0])
    x_train, x_test = x[:split_idx], x[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
Example #58
0
# Run parameters. NOTE(review): names suggest a CFL number and an end
# time for the loop below -- confirm against step().
cfl = .5
time_max = .214

p4p1 = 10.  # pressure ratio
r4r1 = 8.  # density ratio
gamma = 1.4  # ratio of specific heat

# Grid from the mesh() helper; dx is the spacing between its first two points.
x = mesh(xmin, xmax, imax)
dx = x[1] - x[0]

# initial condition
u = ic(imax)

t = 0.

# OpenCL setup: second device of the first platform (hard-coded indices).
platform = cl.get_platforms()[0]
device = platform.get_devices()[1]
ctx = cl.Context([device])
#ctx = cl.create_some_context()

queue = cl.CommandQueue(ctx)

mf = cl.mem_flags

# Main loop. `t` is not advanced in the lines visible here -- presumably
# step()/lax() or code following this excerpt updates it; verify.
start_time = time.time()
while (t < time_max):
    # time step
    dt = step()

    # solver
    lax()
Example #59
0
import numpy as np
import os
from matplotlib import pyplot as plt
import cv2 as cv
import random
import pickle
import sys
import logging
import time
import datetime
import pyscreenshot as ImageGrab
import ctypes
import pyopencl as cl

# (1) setup OpenCL: first GPU of the first platform
# A platform corresponds to a driver (e.g. AMD).
platforms = cl.get_platforms()
platform = platforms[0]
# Only the GPU devices of the selected platform are considered.
devices = platform.get_devices(cl.device_type.GPU)
device = devices[0]
# The context holds the selected GPU; the queue is bound to both.
context = cl.Context([device])
queue = cl.CommandQueue(context, device)

print("Platform: {} Device:{}".format(platform, device))

# Parse the screen size (GetSystemMetrics indices 0 and 1) through the
# Windows user32 library.
user32 = ctypes.windll.user32
screensize = tuple(user32.GetSystemMetrics(metric) for metric in (0, 1))

# Initialize the parameters: confidence threshold and non-maximum
# suppression threshold.
confThreshold, nmsThreshold = 0.20, 0.40
Example #60
0
# Run the simulation with a TaskGenerator (dt=0.1) and the CUDA solver, then
# fit and plot the result. NOTE(review): the meaning of the positional
# arguments (True, 16, 1000, 20) is defined by run_simulation -- confirm there.
result = run_simulation(TaskGenerator(dt=0.1), CudaSolver(), True, 16, 1000, 20)
fits = plot_and_fit(result)
q = fits.plot()

# Bare expression: the first element of check()'s result is not stored
# (presumably displayed as notebook cell output).
check(TaskGenerator(), CudaSolver(), True)[0]

# test_fits receives the solver class itself, not an instance.
test_fits(CudaSolver)

"""# OpenCL"""

import pyopencl as cl  # Import the OpenCL GPU computing API
import pyopencl.array as cl_array

# Report every OpenCL platform on this computer and each of its devices;
# each group of lines is assembled first and emitted with one print call.
banner = '=' * 60
print('\n' + banner + '\nOpenCL Platforms and Devices')
for platform in cl.get_platforms():
    platform_lines = [
        banner,
        'Platform - Name:  ' + platform.name,
        'Platform - Vendor:  ' + platform.vendor,
        'Platform - Version:  ' + platform.version,
        'Platform - Profile:  ' + platform.profile,
    ]
    print('\n'.join(platform_lines))
    for device in platform.get_devices():
        device_lines = [
            '    ' + '-' * 56,
            '    Device - Name:  ' + device.name,
            '    Device - Type:  ' + cl.device_type.to_string(device.type),
            '    Device - Max Clock Speed:  {0} Mhz'.format(device.max_clock_frequency),
            '    Device - Compute Units:  {0}'.format(device.max_compute_units),
            '    Device - Local Memory:  {0:.0f} KB'.format(device.local_mem_size/1024),
            '    Device - Constant Memory:  {0:.0f} KB'.format(device.max_constant_buffer_size/1024),
            '    Device - Global Memory: {0:.0f} GB'.format(device.global_mem_size/1073741824.0),
        ]
        print('\n'.join(device_lines))
print('\n')