def setup_opencl(data, cube_size):
    """Run the ``score_matrix_to_rms`` kernel on ``data`` and return the scores.

    The kernel is built from the module-level ``SOURCE``, the input arrays
    are uploaded to the device, the kernel is launched with
    ``.on(queue, cube_size ** 3)`` and the score buffer is read back.
    Each phase is wrapped in a ``timeify`` block.

    Args:
        data: mapping providing the ``'in_r'``, ``'in_g'``, ``'in_b'`` and
            ``'out_r'`` arrays.
        cube_size: passed to the kernel as its last argument; the score
            array holds ``cube_size ** 3`` floats.

    Returns:
        The array produced by ``pycl.buffer_to_pyarray`` for the score
        buffer.
    """
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        ctx = pycl.clCreateContext(devices=pycl.clGetDeviceIDs())
        queue = pycl.clCreateCommandQueue(ctx)
        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()
        kernel = program['score_matrix_to_rms']
        # Five buffer arguments followed by two integer arguments.
        kernel.argtypes = (pycl.cl_mem,) * 5 + (pycl.cl_int,) * 2

    cell_count = cube_size ** 3

    def upload(values):
        # Copy a Python array to the device; returns (buffer, event).
        return pycl.buffer_from_pyarray(queue, values, blocking=blocking)

    with timeify("Creating buffers"):
        red_buf, red_ready = upload(data['in_r'])
        green_buf, green_ready = upload(data['in_g'])
        blue_buf, blue_ready = upload(data['in_b'])
        out_r_buf, out_r_ready = upload(data['out_r'])
        score = array.array('f', [0] * cell_count)
        score_buf, score_ready = upload(score)

    with timeify("Run kernel r"):
        pending = [red_ready, green_ready, blue_ready, out_r_ready, score_ready]
        # NOTE(review): the fourth kernel argument is the red input buffer,
        # not out_r_buf; an earlier (commented-out) call passed out_r_buf
        # here -- confirm which one is intended.
        launch = kernel(
            red_buf, green_buf, blue_buf, red_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for=pending,
        )
        run_evt = launch.on(queue, cell_count)

    with timeify("Retrive data"):
        score_from_gpu, _ = pycl.buffer_to_pyarray(
            queue, score_buf, wait_for=run_evt, like=score)
    return score_from_gpu
def get_queue(cls, device=None):
    """Return the cached command queue for *device*, creating it on first use.

    Queues are cached in ``cls.queues`` keyed by ``device.value``.

    Args:
        cls: object holding the ``queues`` cache dictionary.
        device: OpenCL device handle exposing a ``.value`` attribute. When
            omitted (``None``), ``get_gpu()`` is called to choose one.

    Returns:
        The command queue stored in ``cls.queues`` for the device.
    """
    # Resolve the default at call time. A ``device=get_gpu()`` default is
    # evaluated once, when the function is defined, which makes merely
    # importing the module depend on an OpenCL device being available.
    if device is None:
        device = get_gpu()

    key = device.value
    if key in cls.queues:
        return cls.queues[key]

    ctx = pycl.clCreateContext(devices=[device])
    queue = pycl.clCreateCommandQueue(context=ctx, device=device)
    cls.queues[key] = queue
    return queue
def get_context_and_queue_from_devices(devices):
    """Return the ``(context, queue)`` pair cached for *devices*.

    The pair is looked up in ``devices_context_queue_map`` under a key made
    of each device's ``vendor_id``. On a miss, a context is created for the
    devices, a command queue is created on that context, and the pair is
    stored before being returned.

    NOTE(review): the key uses ``vendor_id`` only, so two different device
    lists with the same vendor ids would share one cache entry -- confirm
    that this is intended.
    """
    cache_key = tuple(dev.vendor_id for dev in devices)
    if cache_key not in devices_context_queue_map:
        ctx = pycl.clCreateContext(devices)
        cmd_queue = pycl.clCreateCommandQueue(ctx)
        devices_context_queue_map[cache_key] = (ctx, cmd_queue)
    return devices_context_queue_map[cache_key]
def __init__(self):
    """Store an OpenCL device, context and command queue on the instance.

    The last device returned by ``cl.clGetDeviceIDs()`` is used. The
    context and queue are kept as attributes so later calls can reuse them.
    """
    self.device = cl.clGetDeviceIDs()[-1]
    ctx = cl.clCreateContext([self.device])
    self.context = ctx
    self.queue = cl.clCreateCommandQueue(ctx)
def setup_opencl(data, cube_size):
    """Run the ``score_matrix_to_rms`` kernel on ``data`` and return the scores.

    The kernel is built from the module-level ``SOURCE``, the input arrays
    are uploaded to the device, the kernel is launched with
    ``.on(queue, cube_size ** 3)`` and the score buffer is read back.
    Each phase is wrapped in a ``timeify`` block.

    Args:
        data: mapping providing the ``'in_r'``, ``'in_g'``, ``'in_b'`` and
            ``'out_r'`` arrays.
        cube_size: passed to the kernel as its last argument; the score
            array holds ``cube_size ** 3`` floats.

    Returns:
        The array produced by ``pycl.buffer_to_pyarray`` for the score
        buffer.
    """
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        ctx = pycl.clCreateContext(devices=pycl.clGetDeviceIDs())
        queue = pycl.clCreateCommandQueue(ctx)
        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()
        kernel = program['score_matrix_to_rms']
        # Five buffer arguments followed by two integer arguments.
        kernel.argtypes = (pycl.cl_mem,) * 5 + (pycl.cl_int,) * 2

    cell_count = cube_size ** 3

    def upload(values):
        # Copy a Python array to the device; returns (buffer, event).
        return pycl.buffer_from_pyarray(queue, values, blocking=blocking)

    with timeify("Creating buffers"):
        red_buf, red_ready = upload(data['in_r'])
        green_buf, green_ready = upload(data['in_g'])
        blue_buf, blue_ready = upload(data['in_b'])
        out_r_buf, out_r_ready = upload(data['out_r'])
        score = array.array('f', [0] * cell_count)
        score_buf, score_ready = upload(score)

    with timeify("Run kernel r"):
        pending = [red_ready, green_ready, blue_ready, out_r_ready, score_ready]
        # NOTE(review): the fourth kernel argument is the red input buffer,
        # not out_r_buf; an earlier (commented-out) call passed out_r_buf
        # here -- confirm which one is intended.
        launch = kernel(
            red_buf, green_buf, blue_buf, red_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for=pending,
        )
        run_evt = launch.on(queue, cell_count)

    with timeify("Retrive data"):
        score_from_gpu, _ = pycl.buffer_to_pyarray(
            queue, score_buf, wait_for=run_evt, like=score)
    return score_from_gpu
def ocl_init(ocl_src):
    """Pick a GPU OpenCL device, build *ocl_src* for it and run ``run_conv``.

    Platforms are scanned in the order returned by ``cl.clGetPlatformIDs()``.
    The first device of the first platform that reports a GPU is used.

    Args:
        ocl_src: OpenCL program source handed to
            ``cl.clCreateProgramWithSource``.

    Raises:
        ValueError: if no platform exposes a GPU device.
    """
    for platform in cl.clGetPlatformIDs():
        try:
            devices = cl.clGetDeviceIDs(
                platform, device_type=cl.CL_DEVICE_TYPE_GPU)
        except cl.DeviceNotFoundError:
            # This platform has no GPU; try the next one.
            continue
        use_devices = devices[0:1]  # arbitrarily choose the first device
        break
    else:
        raise ValueError("no GPU openCL device found")

    print("OpenCL use_devices: " + str(use_devices))
    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)
    prog = cl.clCreateProgramWithSource(context, ocl_src).build()
    # Call form with a single argument: prints the same text under the
    # Python 2 print statement and is valid syntax on Python 3.
    print(prog)
    # run_mxplusb(prog, queue) is an alternative driver, currently disabled.
    run_conv(prog, queue)
def ocl_init(ocl_src):
    """Pick a GPU OpenCL device, build *ocl_src* for it and run ``run_conv``.

    Platforms are scanned in the order returned by ``cl.clGetPlatformIDs()``.
    The first device of the first platform that reports a GPU is used.

    Args:
        ocl_src: OpenCL program source handed to
            ``cl.clCreateProgramWithSource``.

    Raises:
        ValueError: if no platform exposes a GPU device.
    """
    for platform in cl.clGetPlatformIDs():
        try:
            devices = cl.clGetDeviceIDs(
                platform, device_type=cl.CL_DEVICE_TYPE_GPU)
        except cl.DeviceNotFoundError:
            # This platform has no GPU; try the next one.
            continue
        use_devices = devices[0:1]  # arbitrarily choose the first device
        break
    else:
        raise ValueError("no GPU openCL device found")

    print("OpenCL use_devices: " + str(use_devices))
    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)
    prog = cl.clCreateProgramWithSource(context, ocl_src).build()
    # Call form with a single argument: prints the same text under the
    # Python 2 print statement and is valid syntax on Python 3.
    print(prog)
    # run_mxplusb(prog, queue) is an alternative driver, currently disabled.
    run_conv(prog, queue)
def __init__(self, array, output):
    """Create the OpenCL device/context/queue and remember the two arrays.

    The last device returned by ``clGetDeviceIDs()`` is used for the
    context, and the command queue is created on that context.

    Args:
        array: stored unchanged as ``self.array``.
        output: stored unchanged as ``self.output``.
    """
    device = clGetDeviceIDs()[-1]
    context = clCreateContext([device])
    queue = clCreateCommandQueue(context)
    self.device, self.context, self.queue = device, context, queue
    self.array, self.output = array, output
# Kernel naming and OpenCL backend setup (statements collapsed onto one line).
#
# get_unique_kernel_name() increments the module-level `count` and returns
# "fn<count>".
#
# When `backend` is one of "ocl", "opencl" or "OCL":
#   * GPU devices are requested; on cl.DeviceNotFoundError the unfiltered
#     device list is used instead.
#   * a context is created from the last device only (devices[-1:]).
#   * one command queue is created when the TRAVIS environment variable is
#     set, otherwise eight; `queue` is the first of them.
#
# `hm_dir` is "<system temp dir>/hindemith".
# NOTE(review): the trailing `if not os.path.exists(hm_dir):` has no body in
# this view -- the statement continues beyond what is visible here.
def get_unique_kernel_name(): global count count += 1 return "fn{}".format(count) if backend in {"ocl", "opencl", "OCL"}: try: # platforms = cl.clGetPlatformIDs() # devices = cl.clGetDeviceIDs(platforms[1]) devices = cl.clGetDeviceIDs(device_type=cl.CL_DEVICE_TYPE_GPU) except cl.DeviceNotFoundError: devices = cl.clGetDeviceIDs() context = cl.clCreateContext(devices[-1:]) if os.environ.get("TRAVIS"): queues = [cl.clCreateCommandQueue(context)] else: queues = [cl.clCreateCommandQueue(context) for _ in range(8)] # queues = [ # cl.clCreateCommandQueue( # context, # properties=cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE # ) for _ in range(10) # ] queue = queues[0] hm_dir = os.path.join(tempfile.gettempdir(), "hindemith") if not os.path.exists(hm_dir):
count = 0


def get_unique_kernel_name():
    """Return a fresh kernel name of the form ``fn<N>``.

    Every call bumps the module-level ``count`` by one, so the first name
    handed out is ``fn1``.
    """
    global count
    count = count + 1
    return "fn" + str(count)


if backend in {"ocl", "opencl", "OCL"}:
    # Prefer GPU devices; fall back to the unfiltered device list when the
    # GPU query reports none.
    try:
        devices = cl.clGetDeviceIDs(device_type=cl.CL_DEVICE_TYPE_GPU)
    except cl.DeviceNotFoundError:
        devices = cl.clGetDeviceIDs()

    # Only the last enumerated device goes into the context.
    context = cl.clCreateContext(devices[-1:])

    # One queue when the TRAVIS environment variable is set, eight otherwise.
    # An out-of-order variant (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, ten
    # queues) was tried here previously and is not in use.
    queues = [cl.clCreateCommandQueue(context)]
    if not os.environ.get("TRAVIS"):
        queues.extend(cl.clCreateCommandQueue(context) for _ in range(7))

    queue = queues[0]
def __init__(self):
    """Create and store the OpenCL device, context and command queue.

    The last device returned by ``clGetDeviceIDs()`` is used for the
    context, and the command queue is created on that context.
    """
    device = clGetDeviceIDs()[-1]
    context = clCreateContext([device])
    queue = clCreateCommandQueue(context)
    self.device, self.context, self.queue = device, context, queue
# ### Specialist-Written Code ### # # The code below is written by an industry SPECIALIST. This code is meant # to be more complicated and requires specialized knowledge to write. # # # Global Constants # WORK_GROUP_SIZE = 1024 devices = cl.clCreateContextFromType().devices + cl.clCreateContext().devices TARGET_GPU = devices[1] ITERATIONS = 0 class ConcreteReduction(ConcreteSpecializedFunction): def __init__(self): self.context = cl.clCreateContextFromType() self.queue = cl.clCreateCommandQueue(self.context, device=TARGET_GPU) def finalize(self, kernel, tree, entry_name, entry_type): self.kernel = kernel self._c_function = self._compile(entry_name, tree, entry_type) return self def __call__(self, A):
# Runtime configuration read from environment variables (statements collapsed
# onto one line; this view starts and ends mid-script).
#
#   LATTE_PREFETCH_MODE -> prefetch_option, default "ON"
#   LATTE_UNROLL        -> unroll_option, default "ON"
#   LATTE_NUM_THREADS   -> nthreads, None when unset
#
# An unrecognised `parallel_strategy` is logged and replaced by "OPENMP".
#
# "OPENCL_SIMPLE_LOOP": imports pycl and creates a default context and a
# command queue (cl_ctx, cl_queue).
# "SIMPLE_LOOP" / "FLOWGRAPH_LOOP": compiles runtime/runtime.cpp (located next
# to this file) through util.mpi_compile and fetches two callables from it,
# init_nthreads(int) and init_default(); init_nthreads is called with
# int(nthreads) when LATTE_NUM_THREADS is set.
#
# NOTE(review): `init_default` is fetched but not called in the visible part;
# presumably an `else` branch follows -- confirm against the full file.
# NOTE(review): prefetch_options / unroll_options are defined here but the
# chosen values are not validated against them in the visible part.
# NOTE(review): assuming `logger` is a stdlib logging.Logger, `logger.warn` is
# a deprecated alias of `logger.warning` -- consider switching.
prefetch_options = ["ON", "OFF"] prefetch_option = os.getenv("LATTE_PREFETCH_MODE", "ON") unroll_options = ["ON", "OFF"] unroll_option = os.getenv("LATTE_UNROLL", "ON") if parallel_strategy not in parallel_strategies: logger.warn("Invalid parallel strategy [%s], defaulting to OPENMP", parallel_strategy) parallel_strategy = "OPENMP" nthreads = os.getenv("LATTE_NUM_THREADS", None) if parallel_strategy == "OPENCL_SIMPLE_LOOP": import pycl as cl cl_ctx = cl.clCreateContext() cl_queue = cl.clCreateCommandQueue(cl_ctx) elif parallel_strategy in ["SIMPLE_LOOP" ] or parallel_strategy in ["FLOWGRAPH_LOOP"]: package_path = os.path.dirname(os.path.abspath(__file__)) _file = FileTemplate( os.path.dirname(os.path.abspath(__file__)) + "/runtime/runtime.cpp", {"LATTE_PACKAGE_PATH": StringTemplate(package_path)}) c_file = C.CFile("runtime", [_file]) module = util.mpi_compile(ctree.nodes.Project([c_file])) init_nthreads = module.get_callable("init_nthreads", ctypes.CFUNCTYPE(None, ctypes.c_int)) init_default = module.get_callable("init_default", ctypes.CFUNCTYPE(None)) if nthreads is not None: init_nthreads(int(nthreads))