uint bot = clamp(yg+1, (uint)0, h); if(yl==hl) c_loc[xl+wm*(hl+1)] = c[xg+w*bot]; barrier(CLK_LOCAL_MEM_FENCE); uchar4 blr = c_loc[xl+wm*(yl-1)]/(uchar)5 + c_loc[xl-1+wm*yl]/(uchar)5 + c_loc[xl+wm*yl]/(uchar)5 + c_loc[xl+1+wm*yl]/(uchar)5 + c_loc[xl+wm*(yl+1)]/(uchar)5; res[xg+w*yg] = blr; } """).build() n_pix = cat.size[0] * cat.size[1] result = np.empty_like(pix) mf = cl.mem_flags pix_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pix) pixb_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=result) wgs = cl.Kernel(prg, 'blur').get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, ctx.get_info(cl.context_info.DEVICES)[0]) n_local = (16, 12) if n_local[0] * n_local[1] > wgs: print "Reduce the n_local variable size please!" nn_buf = cl.LocalMemory(4 * (n_local[0] + 2) * (n_local[1] + 2)) n_workers = (cat.size[0], cat.size[1]) prg.blur(queue, n_workers, n_local, pix_buf, pixb_buf, nn_buf, np.uint32(cat.size[0]), np.uint32(cat.size[1]))
# Host-side timing checkpoint taken before any input data is generated.
start_time = time.time()

# Random city coordinates: CITIES uniform samples in [0, 1), scaled by 100 and
# stored as float32.
city_x = numpy.random.random(CITIES).astype(numpy.float32) * 100
city_y = numpy.random.random(CITIES).astype(numpy.float32) * 100

# prepare memory for final answer from OpenCL
final = numpy.zeros(MAP_SIZE, dtype=numpy.float32)
time_hostdata_loaded = time.time()

print('create context')
ctx = cl.create_some_context()
print('create command queue')
# Profiling is enabled on the queue because evt.profile is read further down.
queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
time_ctx_queue_creation = time.time()

# prepare device memory for OpenCL
print('prepare device memory for input / output')
# Inputs are copied from the host arrays at buffer creation (COPY_HOST_PTR).
dev_x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=city_x)
dev_y = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=city_y)
# Output buffer sized like `final`; nothing is copied into it here.
# NOTE(review): "dev_fianl" looks like a typo for "dev_final". The spelling is
# kept because code outside this fragment may refer to it.
dev_fianl = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, final.nbytes)
time_devicedata_loaded = time.time()

print('compile kernel code')
prg = cl.Program(ctx, kernels).build()
time_kernel_compilation = time.time()

print('execute kernel programs')
# One work-item per map entry (global size MAP_SIZE) with a local size of 1.
evt = prg.calc_distance(queue, (MAP_SIZE, ), (1, ), numpy.int32(CITIES), dev_x, dev_y, dev_fianl)
print('wait for kernel executions')
evt.wait()
# Device-side execution time from the event's profiling timestamps.
# presumably these are nanoseconds (hence the 1e-9 factor) - TODO confirm.
elapsed = 1e-9 * (evt.profile.end - evt.profile.start)
print('elapsed time: {}'.format(elapsed))
#mb_wg_markup = numpy.array(wg_markup, dtype=numpy.float32) #mb_wg_stop_loss = numpy.array(wg_stop_loss, dtype=numpy.float32) #mb_wg_stop_age = numpy.array(wg_stop_age, dtype=numpy.float32) #mb_wg_macd_buy_trip = numpy.array(wg_macd_buy_trip, dtype=numpy.float32) #mb_wg_buy_wait_after_stop_loss = numpy.array(wg_buy_wait_after_stop_loss, dtype=numpy.uint32) #mb_wg_quartile = numpy.array(wg_quartile, dtype=numpy.uint32) #mb_wg_market_classification = numpy.array(wg_market_classification, dtype=numpy.uint32) mb_wg_input = numpy.array(wg_input, dtype=numpy.float32) #mb_wg_score = numpy.array(range(work_group_size), dtype=numpy.float32) #mb_wg_orders = numpy.array(range(work_group_size * max_open_orders * order_array_size), dtype=numpy.float32) #create OpenCL buffers #mapped - makes sure the data is completly loaded before processing begins #ocl_mb_wg_market_classification = cl.Buffer(ctx, mf.READ_ONLY | mf.ALLOC_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=mb_wg_market_classification) ocl_mb_wg_input = cl.Buffer(ctx, mf.READ_ONLY | mf.ALLOC_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=mb_wg_input) #ocl_mb_wg_orders = cl.Buffer(ctx, mf.READ_WRITE | mf.ALLOC_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=mb_wg_orders)#mb_wg_orders.nbytes #unmapped - can be transferred on demand #ocl_mb_wg_quartile = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_quartile) #ocl_mb_wg_score = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=mb_wg_score) #ocl_mb_wg_shares = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_shares) ocl_mb_wg_wll = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_wll) ocl_mb_wg_wls = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_wls) #ocl_mb_wg_buy_wait = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_buy_wait) #ocl_mb_wg_markup = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_markup)
def testConvolution():
    """Run the ``conv2d2`` OpenCL kernel on a fixed example and print the result.

    The kernel source is read from ``PATH_TO_KERNEL``. The input is a 10x10
    tensor of ones, the convolution kernel comes from ``getKernel(4)`` and the
    output array is 8x8. One work-item is launched per output element. All
    intermediate arrays, dtypes and shapes are printed for inspection; nothing
    is returned.
    """
    # read kernel file; the context manager closes the handle even if the
    # read raises
    with open(PATH_TO_KERNEL, 'r', encoding='utf-8') as f:
        kernels = ' '.join(f.readlines())

    # create context, queue, buffers and compile kernels
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    prg = cl.Program(ctx, kernels).build()

    # init parameters for kernel-call
    cX = torch.ones(10, 10)
    print("Input x shape - " + str(cX.shape))
    print(cX)

    cKernel = getKernel(4)
    print("Input kernel shape - " + str(cKernel.shape))
    print(cKernel)

    cOutput = torch.zeros(8, 8)
    print("Output shape - " + str(cOutput.shape))

    # convert Tensors into usable np_arrays
    np_cX = cX.numpy()
    np_cKernel = cKernel.numpy()
    np_cOutput = cOutput.numpy()

    print("Numpy cX - ")
    print(str(np_cX))
    print("Numpy cX dType - " + str(np_cX.dtype))
    print("Numpy cKernel - ")
    print(str(np_cKernel))
    print("Numpy cKernel dType - " + str(np_cKernel.dtype))

    # The shapes are passed to the device as int32 arrays.
    np_dim_cX = np.array(np_cX.shape, dtype=np.int32)  # fits device integer bit-length
    print("Dimensions of np_cX - " + str(np_dim_cX))
    print("Dtype of np_cX - " + str(type(np_dim_cX[0])))

    np_dim_cKernel = np.array(np_cKernel.shape, dtype=np.int32)  # fits device integer bit-length
    print("Dimensions of np_cKernel - " + str(np_dim_cKernel))
    print(np_cKernel.shape)
    print(type(np_dim_cKernel[0]))

    # copy np_arrays into buffers of device
    # input
    buf_cX = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_cX)
    # input dimension
    buf_dim_cX = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_dim_cX)
    # kernel
    buf_cKernel = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_cKernel)
    # kernel dimension
    buf_dim_cKernel = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_dim_cKernel)
    # output
    buf_cOutput = cl.Buffer(ctx, mf.WRITE_ONLY, np_cOutput.nbytes)

    ######
    # Options
    ######
    # stride
    np_stride = np.array([1, 1], dtype=np.int32)
    buf_stride = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_stride)

    print("Calling Kernel with global_work_size: " + str(np_cOutput.shape[0] * np_cOutput.shape[1]))

    # Argument order as used by this call: kernel, kernel dims, input,
    # input dims, output, stride.
    convKernel = prg.conv2d2
    convKernel.set_args(buf_cKernel, buf_dim_cKernel, buf_cX, buf_dim_cX, buf_cOutput, buf_stride)
    # The returned event is not needed: the copy below is enqueued on the same
    # queue after the kernel.
    cl.enqueue_nd_range_kernel(queue, convKernel, np_cOutput.shape, None)
    #prg.conv2d2(queue, np_cOutput.shape, None,
    #            buf_cX, buf_dim_cX,
    #            buf_cKernel, buf_dim_cKernel,
    #            buf_cOutput)

    cl.enqueue_copy(queue, np_cOutput, buf_cOutput)

    print(type(np_cOutput))
    print(np_cOutput.dtype)
    print(np_cOutput)
def clFindKnn(h_bf_indexes, h_bf_distances, h_pointset, h_query, kth, thelier,
              nchunks, pointdim, signallength, gpuid):
    """Run the ``kernelKNNshared`` OpenCL kernel and download its results.

    Args:
        h_bf_indexes: host array that receives the neighbour indexes
            (overwritten in place).
        h_bf_distances: host array that receives the neighbour distances
            (overwritten in place).
        h_pointset: host array uploaded as the kernel's point set.
        h_query: host array uploaded as the kernel's query points.
        kth: passed to the kernel as int32; also sizes the per-work-item
            local memory (``kth`` floats plus ``kth`` ints per work-item).
        thelier: passed to the kernel as int32
            (presumably an exclusion window - confirm against the kernel).
        nchunks: number of chunks; ``signallength / nchunks`` is passed to
            the kernel as the trial length.
        pointdim: passed to the kernel as int32.
        signallength: total number of points; the global work size is this
            value rounded up to a multiple of the work-group size.
        gpuid: device selector handed to ``_get_device`` and used to index
            the returned device list.

    Returns:
        Always 1.
    """
    triallength = int(signallength / nchunks)

    # Set up OpenCL
    my_gpu_devices, context, queue = _get_device(gpuid)
    device = my_gpu_devices[gpuid]

    # Check memory resources.
    usedmem = int((h_query.nbytes + h_pointset.nbytes +
                   h_bf_distances.nbytes + h_bf_indexes.nbytes) // 1024 // 1024)
    totalmem = int(device.global_mem_size // 1024 // 1024)

    if (totalmem * 0.90) < usedmem:
        # Formatted as one message; the previous code printed a tuple repr.
        print("WARNING: {0} Mb used out of {1} Mb. "
              "The GPU could run out of memory.".format(usedmem, totalmem))

    # Create OpenCL buffers
    d_bf_query = cl.Buffer(context,
                           cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=h_query)
    d_bf_pointset = cl.Buffer(context,
                              cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                              hostbuf=h_pointset)
    d_bf_distances = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                               h_bf_distances.nbytes)
    d_bf_indexes = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                             h_bf_indexes.nbytes)

    try:
        # Kernel Launch
        kernelLocation = resource_filename(__name__, 'gpuKnnBF_kernel.cl')
        with open(kernelLocation) as kernel_file:
            kernelsource = kernel_file.read()
        program = cl.Program(context, kernelsource).build()
        kernelKNNshared = program.kernelKNNshared
        kernelKNNshared.set_scalar_arg_dtypes([
            None, None, None, None,
            np.int32, np.int32, np.int32, np.int32, np.int32,
            None, None
        ])

        # Size of workitems and NDRange
        if signallength / nchunks < device.max_work_group_size:
            workitems_x = 8
        elif device.max_work_group_size < 256:
            workitems_x = device.max_work_group_size
        else:
            workitems_x = 256

        # Round the global size up to the next multiple of the work-group
        # size. Integer ceiling division avoids the extra work-group that
        # round(x) + 1 produced whenever the fractional part was >= 0.5.
        n_groups = int((signallength + workitems_x - 1) // workitems_x)
        NDRange_x = workitems_x * n_groups

        # Local memory for distances and indexes
        float_bytes = np.dtype(np.float32).itemsize * kth * workitems_x
        int_bytes = np.dtype(np.int32).itemsize * kth * workitems_x
        localmem = (float_bytes + int_bytes) / 1024
        if localmem > device.local_mem_size / 1024:
            # Only a warning: the launch is still attempted, as before.
            print('Localmem alocation will fail. {0} kb available, and it needs '
                  '{1} kb.'.format(device.local_mem_size / 1024, localmem))
        localmem1 = cl.LocalMemory(float_bytes)
        localmem2 = cl.LocalMemory(int_bytes)

        kernelKNNshared(queue, (NDRange_x, ), (workitems_x, ), d_bf_query,
                        d_bf_pointset, d_bf_indexes, d_bf_distances, pointdim,
                        triallength, signallength, kth, thelier,
                        localmem1, localmem2)
        queue.finish()

        # Download results
        cl.enqueue_copy(queue, h_bf_distances, d_bf_distances)
        cl.enqueue_copy(queue, h_bf_indexes, d_bf_indexes)
    finally:
        # Free buffers, also when the build, launch or download failed.
        d_bf_distances.release()
        d_bf_indexes.release()
        d_bf_query.release()
        d_bf_pointset.release()

    return 1
def test_image_2d(ctx_factory):
    """Copy a 2D float image into a buffer via a kernel and compare with the source.

    The test is skipped when the device has no image support, when it matches
    one of the known-crashing driver versions, or when the required image
    format is not supported.
    """

    def bail_out(reason):
        # pytest is imported lazily so it is only needed when a skip happens.
        from pytest import skip
        skip(reason)

    context = ctx_factory()
    device, = context.devices

    if not device.image_support:
        bail_out("images not supported on %s" % device)

    # Known-bad drivers: one Intel build and pocl 0.8 / 0.9.
    if "Intel" in device.vendor and "31360.31426" in device.version:
        bail_out("images crashy on %s" % device)

    if "pocl" in device.platform.vendor:
        pocl_version = device.platform.version
        if "0.8" in pocl_version or "0.9" in pocl_version:
            bail_out("images crashy on %s" % device)

    kernel_source = """
        __kernel void copy_image(
          __global float *dest,
          __read_only image2d_t src,
          sampler_t samp,
          int stride0)
        {
          int d0 = get_global_id(0);
          int d1 = get_global_id(1);
          /*
          const sampler_t samp =
            CLK_NORMALIZED_COORDS_FALSE
            | CLK_ADDRESS_CLAMP
            | CLK_FILTER_NEAREST;
            */
          dest[d0*stride0 + d1] = read_imagef(src, samp, (float2)(d1, d0)).x;
        }
        """
    prg = cl.Program(context, kernel_source).build()

    num_channels = 1
    a = np.random.rand(1024, 512, num_channels).astype(np.float32)
    if num_channels == 1:
        # Single-channel data is handed over as a plain 2D array.
        a = a[:, :, 0]

    queue = cl.CommandQueue(context)
    try:
        a_img = cl.image_from_array(context, a, num_channels)
    except cl.RuntimeError as exc:
        if exc.code != cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
            raise
        bail_out("required image format not supported on %s" % device.name)

    a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes)

    samp = cl.Sampler(context, False,
                      cl.addressing_mode.CLAMP,
                      cl.filter_mode.NEAREST)
    # Row stride of the destination, expressed in elements rather than bytes.
    row_stride = np.int32(a.strides[0]/a.dtype.itemsize)
    prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, row_stride)

    a_result = np.empty_like(a)
    cl.enqueue_copy(queue, a_result, a_dest)

    good = la.norm(a_result - a) == 0
    if good:
        return

    # CPU platforms get a more explanatory failure message.
    if queue.device.type & cl.device_type.CPU:
        assert good, ("The image implementation on your CPU CL platform '%s' "
                      "returned bad values. This is bad, but common."
                      % queue.device.platform)
    else:
        assert good
def test_get_info(ctx_factory):
    """Query every ``*_info`` constant on each kind of OpenCL object.

    For each wrapper object (platform, device, context, queue, program,
    buffer, kernel, event, sampler, image) every public attribute of the
    matching info class is passed to ``get_info`` (or a custom query
    function), and optionally looked up as a lower-case attribute. Queries
    listed in ``CRASH_QUIRKS`` for the current platform are not executed.
    Failures are printed and counted in ``failure_count``; the count is not
    asserted on in this function.
    """
    ctx = ctx_factory()
    device, = ctx.devices
    platform = device.platform

    # One-element list so the nested helper can mutate the counter.
    failure_count = [0]

    pocl_quirks = [
        (cl.Buffer, cl.mem_info.OFFSET),
        (cl.Program, cl.program_info.KERNEL_NAMES),
        (cl.Program, cl.program_info.NUM_KERNELS),
    ]
    # Queries that must not even be attempted on the given platforms.
    CRASH_QUIRKS = [
        (("NVIDIA Corporation", "NVIDIA CUDA",
            "OpenCL 1.0 CUDA 3.0.1"),
            [
                (cl.Event, cl.event_info.COMMAND_QUEUE),
            ]),
        (("The pocl project", "Portable Computing Language",
            "OpenCL 1.2 pocl 0.8-pre"),
            pocl_quirks),
        (("The pocl project", "Portable Computing Language",
            "OpenCL 1.2 pocl 0.8"),
            pocl_quirks),
        (("The pocl project", "Portable Computing Language",
            "OpenCL 1.2 pocl 0.9-pre"),
            pocl_quirks),
        (("Apple", "Apple",
            "OpenCL 1.2 (Apr 25 2013 18:32:06)"),
            [
                (cl.Program, cl.program_info.SOURCE),
            ]),
    ]
    # Queries that are attempted but whose failure is tolerated.
    QUIRKS = []

    plat_quirk_key = (
        platform.vendor,
        platform.name,
        platform.version)

    def find_quirk(quirk_list, cl_obj, info):
        """Return True if (type of cl_obj, info) is listed for this platform."""
        for entry_plat_key, quirks in quirk_list:
            if entry_plat_key == plat_quirk_key:
                for quirk_cls, quirk_info in quirks:
                    if (isinstance(cl_obj, quirk_cls)
                            and quirk_info == info):
                        return True

        return False

    def do_test(cl_obj, info_cls, func=None, try_attr_form=True):
        """Run every query of *info_cls* against *cl_obj* and record failures."""
        if func is None:
            def func(info):
                cl_obj.get_info(info)

        for info_name in dir(info_cls):
            if not info_name.startswith("_") and info_name != "to_string":
                print(info_cls, info_name)
                info = getattr(info_cls, info_name)

                if find_quirk(CRASH_QUIRKS, cl_obj, info):
                    print("not executing get_info", type(cl_obj), info_name)
                    print("(known crash quirk for %s)" % platform.name)
                    continue

                try:
                    func(info)
                except Exception:
                    # Report the failure. The message used to be built as a
                    # tuple that was never printed, and appending the quirk
                    # note to it would have raised TypeError.
                    print("failed get_info", type(cl_obj), info_name)

                    if find_quirk(QUIRKS, cl_obj, info):
                        print("(known quirk for %s)" % platform.name)
                    else:
                        failure_count[0] += 1

                if try_attr_form:
                    try:
                        getattr(cl_obj, info_name.lower())
                    except Exception:
                        print("failed attr-based get_info", type(cl_obj), info_name)

                        if find_quirk(QUIRKS, cl_obj, info):
                            print("(known quirk for %s)" % platform.name)
                        else:
                            failure_count[0] += 1

    do_test(platform, cl.platform_info)
    do_test(device, cl.device_info)
    do_test(ctx, cl.context_info)

    # `profiling` must be bound even when the device lacks PROFILING_ENABLE;
    # otherwise the `if profiling:` check below raises NameError.
    profiling = False
    props = 0
    if (device.queue_properties
            & cl.command_queue_properties.PROFILING_ENABLE):
        profiling = True
        props = cl.command_queue_properties.PROFILING_ENABLE
    queue = cl.CommandQueue(ctx,
            properties=props)
    do_test(queue, cl.command_queue_info)

    prg = cl.Program(ctx, """
        __kernel void sum(__global float *a)
        { a[get_global_id(0)] *= 2; }
        """).build()
    do_test(prg, cl.program_info)
    do_test(prg, cl.program_build_info,
            lambda info: prg.get_build_info(device, info),
            try_attr_form=False)

    n = 2000
    a_buf = cl.Buffer(ctx, 0, n*4)

    do_test(a_buf, cl.mem_info)

    kernel = prg.sum
    do_test(kernel, cl.kernel_info)

    evt = kernel(queue, (n,), None, a_buf)
    do_test(evt, cl.event_info)

    if profiling:
        # The event is waited on before its profiling info is queried.
        evt.wait()
        do_test(evt, cl.profiling_info,
                lambda info: evt.get_profiling_info(info),
                try_attr_form=False)

    # crashes on intel...
    if device.image_support and platform.vendor not in [
            "Intel(R) Corporation",
            "The pocl project",
            ]:
        smp = cl.Sampler(ctx, False,
                cl.addressing_mode.CLAMP,
                cl.filter_mode.NEAREST)
        do_test(smp, cl.sampler_info)

        img_format = cl.get_supported_image_formats(
                ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0]

        img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256))
        assert img.shape == (128, 256)

        # Plain attribute accesses: they only need to not raise.
        img.depth
        img.image.depth
        do_test(img, cl.image_info,
                lambda info: img.get_image_info(info))
def __init__(self, queue, block_size):
    """Keep the queue and allocate one host and one device block.

    ``host_buf`` is an uninitialised uint8 array of ``block_size`` elements;
    ``dev_buf`` is a read/write OpenCL buffer of ``block_size`` bytes created
    in the queue's context.
    """
    self.queue = queue

    # Host-side block.
    self.host_buf = np.empty(block_size, dtype=np.uint8)

    # Device-side block of the same size, in the queue's context.
    context = queue.context
    access = cl.mem_flags.READ_WRITE
    self.dev_buf = cl.Buffer(context, access, block_size)
# Build program in the specified context using the kernel source code prog = cl.Program(context, kernel_src) try: prog.build(options=['-Werror'], devices=[dev]) except: print('Build log:') print(prog.get_build_info(dev, cl.program_build_info.LOG)) raise # Data v = np.arange(4, dtype=np.float32) print('Input: ' + str(v)) # Create output buffer v_buff = cl.Buffer(context, flags=cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=v) # Create user event user_event = cl.UserEvent(context) def read_complete(status, data): print('Output: ' + str(data)) # Enqueue kernel that waits for user event before executing global_size = (1,) local_size = None # __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) kernel_event = prog.user_event(queue, global_size, local_size, v_buff, wait_for=[user_event])
def eikonal(graph, signal, hp):
    """Evolve a graph signal by repeatedly running the ``laplacian_filter`` kernel.

    The kernel source is read from the module-level path ``mywf`` and built
    for the first device of the first OpenCL platform. Two device buffers are
    used in ping-pong fashion: each iteration reads one and writes the other,
    then the roles are swapped.

    params:
    ------
    graph: object providing ``ngbrs`` (cast to int32), ``wgts`` (cast to
        float32) and ``k`` (passed to the kernel as int32).
    signal: A initial distance field, for m number of seeds it is of size
        (n X m=chnls). It is flattened in column-major order before upload.
    hp: hyperparameters; only ``hp.it`` (the number of iterations) is read.

    return:
    ------
    new_signal: float32 array of shape (n, chnls) holding the signal after
        ``hp.it`` kernel applications (the input itself when ``hp.it`` is 0).
    """

    def _trace(*args):
        # Prints the arguments when the module-level flag `bool_1` is set and
        # an empty line otherwise, as the inline conditional prints did.
        if bool_1:
            print(*args)
        else:
            print()

    ngbrs = graph.ngbrs.astype('int32')
    wgts = graph.wgts.astype('float32')
    k = graph.k
    n, chnl = signal.shape

    # old notes, need to include the A set here.
    # red = gray[:,0]  # get the ids of the seed

    signal = np.reshape(signal, (n * chnl), order='F')
    signal = signal.astype('float32')
    _trace("signal", signal.shape)
    _trace("n", n)
    it = hp.it
    _trace("sucess till loading")

    # create the opencl context
    platform = cl.get_platforms()[0]
    print(platform)
    device = platform.get_devices()[0]
    print(device)
    context = cl.Context([device])
    print(context)
    # Close the kernel file deterministically instead of leaking the handle.
    with open(mywf) as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build()
    queue = cl.CommandQueue(context)
    print(queue)

    # create the buffers now.
    mem_flags = cl.mem_flags
    ngbrs_buf = cl.Buffer(context,
                          mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                          hostbuf=ngbrs)
    weight_buf = cl.Buffer(context,
                           mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                           hostbuf=wgts)
    # Both signal buffers are read AND written because they swap roles after
    # every iteration, so neither may be READ_ONLY or WRITE_ONLY.
    signal_buf = cl.Buffer(context,
                           mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR,
                           hostbuf=signal)

    # need to create new signal
    new_signal = np.ndarray(shape=(n * chnl, ), dtype=np.float32)
    new_signal_buf = cl.Buffer(context, mem_flags.READ_WRITE,
                               new_signal.nbytes)

    # run the kernel here in a loop
    for _ in range(it):
        program.laplacian_filter(queue, (n * chnl, ), None, signal_buf,
                                 new_signal_buf, ngbrs_buf, weight_buf,
                                 np.int32(k), np.int32(chnl))
        # The buffer just written becomes the input of the next iteration.
        signal_buf, new_signal_buf = new_signal_buf, signal_buf

    # copy the new intensity vec
    # After the swap the most recent result is in `signal_buf`. Downloading
    # `new_signal_buf` would return the previous iterate, or uninitialised
    # memory when no iteration ran.
    cl.enqueue_copy(queue, new_signal, signal_buf)

    _trace("finish")
    return np.reshape(new_signal, (n, chnl), order="F")
def __init__(self, queue, block_size):
    """Keep the queue and allocate two read/write device blocks.

    Both ``dev_buf_1`` and ``dev_buf_2`` are OpenCL buffers of ``block_size``
    bytes created in the queue's context.
    """

    def new_block():
        # Both buffers share context, access flags and size.
        return cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)

    self.queue = queue
    self.dev_buf_1 = new_block()
    self.dev_buf_2 = new_block()
def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500,
           height=500, use_double=False):
    """Render an escape-time Mandelbrot image with OpenCL.

    The iteration budget is processed in ``iter_steps`` slices. After each
    slice the pixels that escaped are written to the result, and only the
    pixels that have not escaped yet are packed to the front of the work
    arrays and sent to the device again.

    Args:
        ctx: OpenCL context used to create the queue, program and buffers.
        x, y: centre of the viewport. They are combined with
            ``decimal.Decimal`` values, so they must be ``Decimal`` or ``int``.
        zoom: viewport scale; the viewport spans ``3 * zoom`` horizontally and
            ``2 * zoom`` vertically. Same type requirement as ``x`` and ``y``.
        max_iter: maximum number of iterations per pixel.
        iter_steps: number of slices the iteration budget is split into.
        width, height: image size in pixels.
        use_double: compute in float64 on the device instead of float32.

    Returns:
        A ``(height, width)`` uint8 array holding the escape iteration of each
        pixel modulo 255. Pixels that never escaped stay 0.
    """
    mf = cl.mem_flags
    cl_queue = cl.CommandQueue(ctx)

    # build program
    # The fp64 extension is enabled through a dedicated macro. A test such as
    # `#if real_t == double` is always true in the preprocessor (both names
    # evaluate to 0) and would enable the pragma for float builds too.
    code = """
    #ifdef USE_DOUBLE
    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #endif
    kernel void mandel(
        __global real_t *coords,
        __global uint *output,
        __global real_t *output_coord,
        const uint max_iter,
        const uint start_iter
    ){
        uint id = get_global_id(0);
        real_t2 my_coords = vload2(id, coords);
        real_t2 my_value_coords = vload2(id, output_coord);
        real_t x = my_value_coords.x;
        real_t y = my_value_coords.y;
        uint iter = 0;
        for(iter=start_iter; iter<max_iter; ++iter){
            if(x*x + y*y > 4.0f){
                break;
            }
            real_t xtemp = x*x - y*y + my_coords.x;
            y = 2*x*y + my_coords.y;
            x = xtemp;
        }
        // copy the current x,y pair back
        real_t2 val = (real_t2){x, y};
        vstore2(val, id, output_coord);
        output[id] = iter;
    }
    """
    _cltype, _nptype = ("double", np.float64) if use_double else ("float", np.float32)
    build_options = "-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype)
    if use_double:
        build_options += " -D USE_DOUBLE"
    prg = cl.Program(ctx, code).build(build_options)

    # Calculate the "viewport".
    x0 = x - ((Decimal(3) * zoom) / Decimal(2.))
    y0 = y - ((Decimal(2) * zoom) / Decimal(2.))
    x1 = x + ((Decimal(3) * zoom) / Decimal(2.))
    y1 = y + ((Decimal(2) * zoom) / Decimal(2.))

    # Create index map in x,y pairs
    xx = np.arange(0, width, 1, dtype=np.uint32)
    yy = np.arange(0, height, 1, dtype=np.uint32)
    index_map = np.dstack(np.meshgrid(xx, yy))

    # and local "coordinates" (real, imaginary parts)
    coord_map = np.ndarray(index_map.shape, dtype=_nptype)
    coord_map[:] = index_map
    coord_map[:] *= (_nptype((x1 - x0) / Decimal(width)),
                     _nptype((y1 - y0) / Decimal(height)))
    coord_map[:] += (_nptype(x0), _nptype(y0))
    coord_map = coord_map.flatten()
    index_map = index_map.flatten().astype(dtype=np.uint32)

    # Create input and output buffer
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
    # This will contain the iteration values of that run
    buffer_out = np.zeros(width * height, dtype=np.uint32)
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
    # This the last x,y values
    buffer_out_coords = np.zeros(width * height * 2, dtype=_nptype)
    buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE,
                                     size=buffer_out_coords.nbytes)

    # 2D Buffer to collect the iterations needed per pixel (row = y, column = x)
    iter_map = np.zeros(width * height, dtype=np.uint32).reshape((height, width))

    start_max_iter = 0
    to_do = int(coord_map.size / 2)
    # At least one iteration per slice. With iter_steps > max_iter the slice
    # size used to truncate to 0, so the loop below never advanced.
    steps_size = max(1, int(max_iter / float(iter_steps)))
    while to_do > 0 and start_max_iter < max_iter:
        end_max_iter = min(max_iter, start_max_iter + steps_size)
        print(("Iterations from iteration %i to %i for %i numbers" %
               (start_max_iter, end_max_iter, to_do)))

        # copy x/y pairs to device
        cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do * 2]).wait()
        cl.enqueue_copy(cl_queue, buffer_out_coords_cl,
                        buffer_out_coords[:to_do * 2]).wait()

        # and finally call the ocl function
        prg.mandel(cl_queue, (to_do, ), None, buffer_in_cl, buffer_out_cl,
                   buffer_out_coords_cl, np.uint32(end_max_iter),
                   np.uint32(start_max_iter)).wait()

        # Copy the output back
        cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
        cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()

        # Get indices of "found" escapes
        done = np.where(buffer_out[:to_do] < end_max_iter)[0]

        # and write the iterations to the corresponding cell
        index_reshaped = index_map[:to_do * 2].reshape((to_do, 2))
        tmp = index_reshaped[done]
        iter_map[tmp[:, 1], tmp[:, 0]] = buffer_out[done]

        # Get the indices of non escapes
        undone = np.where(buffer_out[:to_do] == end_max_iter)[0]

        # and write them back to our "job" maps for the next loop.
        # The fancy-indexed selections are copies, so packing them to the
        # front of the same arrays is safe.
        tmp = buffer_out_coords[:to_do * 2].reshape((to_do, 2))
        buffer_out_coords[:undone.size * 2] = tmp[undone].flatten()
        tmp = coord_map[:to_do * 2].reshape((to_do, 2))
        coord_map[:undone.size * 2] = tmp[undone].flatten()
        index_map[:undone.size * 2] = index_reshaped[undone].flatten()

        to_do = undone.size
        start_max_iter = end_max_iter
        print(("%i done. %i unknown" % (done.size, undone.size)))

    # simple coloring by modulo 255 on the iter_map
    return (iter_map % 255).astype(np.uint8).reshape((height, width))
# simulation parametars deltatime = 0.001 eps = 0.001 # init particle's position and velocity particles = np.random.rand(size, 4).astype(np.float32) #* (size / 1024) velocity = np.zeros((size, 4), dtype=np.float32) # create opencl context and put it in program queue ctx = cl.Context(cl.get_platforms()[0].get_devices()) # quick fix queue = cl.CommandQueue( ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # make buffers mf = cl.mem_flags particles_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=particles) velocity_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=velocity) # --- just for checking #print particles #print " --- " # define OpenCL local memory size (blocksize * vectorsize * itemsize) local_buf = cl.LocalMemory(block_size * 4 * particles.itemsize) # build program prg = cl.Program(ctx, kernel).build() # execute kernel exec_evt = prg.nbody_simulation( queue,
} // execute over n "work items" """ ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) # create some data array to give as input to Kernel and get output SIZE = 4 a_np = np.arange(SIZE * 3).reshape(SIZE, 3).astype(np.float32) b_np = np.arange(SIZE * 3, SIZE * 3 + SIZE * 3).reshape(SIZE, 3).astype(np.float32) c_np = np.zeros((SIZE * SIZE, 3)).astype(np.float32) # create the buffers to hold the values of the input a_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a_np) b_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=b_np) # create output buffer c_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, c_np.nbytes) #Compilation prg = cl.Program(ctx, source).build() # Kernel is now launched launch = prg.gpu_mul(queue, (3, SIZE, SIZE), None, a_buf, b_buf, c_buf) # wait till the process completes launch.wait()
# Build the kernel for the single device selected earlier.
context = cl.Context([device])
# Each work-item multiplies one matrix entry by the first vector entry.
# A plain product is used because OpenCL C defines dot() for floating-point
# types only; for scalars dot(a, b) is a * b anyway.
program = cl.Program(
    context, """
    __kernel void matrix_dot_vector(__global const int *matrix,
    __global const int *vector, __global int *result)
    {
      int gid = get_global_id(0);
      result[gid] = matrix[gid] * vector[0];
    }
    """).build()
queue = cl.CommandQueue(context)

mem_flags = cl.mem_flags
matrix_buf = cl.Buffer(context,
                       mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                       hostbuf=matrix)
vector_buf = cl.Buffer(context,
                       mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                       hostbuf=vector)
# The kernel stores `int` results, so the host array must be int32.
# A float32 array would reinterpret the integer bit patterns as floats.
matrix_dot_vector = numpy.zeros(10, numpy.int32)
destination_buf = cl.Buffer(context, mem_flags.WRITE_ONLY,
                            matrix_dot_vector.nbytes)

# One work-item per result element.
program.matrix_dot_vector(queue, matrix_dot_vector.shape, None, matrix_buf,
                          vector_buf, destination_buf)

cl.enqueue_copy(queue, matrix_dot_vector, destination_buf)

# Single-argument print calls behave the same on Python 2 and Python 3.
print(vector)
print(matrix)
def InitializeSolver(self):
    """ Calculate u_{-1} to start of the time looping.
        u_-1 = u_0 - dt*du_0 + 0.5*dt**2*ddu_0

        Steps performed, in order:
          1. Allocate host arrays and device buffers for the mesh data
             (nodes, per-colour-group element data, material values).
          2. Launch ``assemble_K_M_P`` once per colour group, each launch
             waiting on the previous one, and copy the result into ``self.LM``.
          3. Copy LM to LHS, synchronise it via ``SyncCommNodes`` and upload it.
          4. Launch ``calc_ddu``, copy the acceleration back and synchronise it.
          5. Allocate the device and pinned host buffers for the time loop,
             map the pinned buffers and fill ``srcU`` / ``srcUP`` (u_0 and
             u_{-1}).
          6. Enqueue the upload of ``srcUP`` and ``srcU`` to the device.

        Reads ``self.u``, ``self.du``, ``self.dt``, ``self.pressure`` and the
        mesh; returns nothing.
    """

    # Allocate the np.array object in CPU.
    self.LM = np.zeros((self.lclNDof, self.nSmp))  # no synchronized
    self.LHS = np.zeros((self.lclNDof, self.nSmp))  # synchronized

    # Allocate the OpenCL source and result buffer memory objects on GPU device GMEM.
    mem_flags = cl.mem_flags
    self.nodes_buf = cl.Buffer(self.context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                               hostbuf=self.mesh.nodes[self.lclNodeIds])
    # self.elmNodeIds_buf = cl.Buffer(self.context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf = self.mesh.elementNodeIds)
    # mesh coloring's color tags
    # One buffer per colour group with the local node ids of its elements.
    self.colorGps_buf = [
        cl.Buffer(
            self.context,
            mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
            hostbuf=self.mesh.lclElmNodeIds[self.mesh.colorGroups[i]])
        for i in range(len(self.mesh.colorGroups))
    ]
    # One buffer per colour group with the element ids themselves.
    self.colorGps_elmIds_buf = [
        cl.Buffer(self.context,
                  mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                  hostbuf=self.mesh.colorGroups[i])
        for i in range(len(self.mesh.colorGroups))
    ]

    # for calculating M (mass) matrix, do not need to always exist in GPU memory
    thickness_buf = cl.Buffer(
        self.context,
        mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
        hostbuf=self.mesh.vthickness[self.lclNodeIds])

    # for calculating K (stiffness) matrix, thicknessE (nElms, nSmp)
    # -- Young's Modulus
    elmVerE = self.mesh.vE[self.mesh.elementNodeIds, :]
    elmVerE = elmVerE.swapaxes(1, 2)
    # Mean over the last axis after the swap
    # (presumably the element's vertices - confirm against the mesh layout).
    elmAveE = np.mean(elmVerE, axis=2)
    # -- thickness
    elmVerThick = self.mesh.vthickness[self.mesh.elementNodeIds, :]
    elmVerThick = elmVerThick.swapaxes(1, 2)
    # elmAveThick = np.mean(elmVerThick, axis=2)
    # - thickness x E
    # Mean of the product, not the product of the two means.
    elmTE = np.mean(elmVerE * elmVerThick, axis=2)
    self.elmTE_buf = [
        cl.Buffer(self.context,
                  mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                  hostbuf=elmTE[self.mesh.colorGroups[i]])
        for i in range(len(self.mesh.colorGroups))
    ]
    self.elmE_buf = [
        cl.Buffer(self.context,
                  mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                  hostbuf=elmAveE[self.mesh.colorGroups[i]])
        for i in range(len(self.mesh.colorGroups))
    ]

    # for calculating K (stiffness) matrix, D needs
    k = 5.0 / 6.0
    v = self.mesh.v
    # Constants handed to the kernels in one array: density, v, 0.5*(1-v),
    # 0.5*k*(1-v) and (1-v*v).
    pVals = np.array([
        self.mesh.density, v, 0.5 * (1.0 - v), 0.5 * k * (1.0 - v),
        (1.0 - v * v)
    ])
    self.pVals_buf = cl.Buffer(self.context,
                               mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                               hostbuf=pVals)

    # The initial displacement b.c. (nNodes*3,)
    u_buf = cl.Buffer(self.context,
                      mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                      hostbuf=self.u)

    # Result buffers of the assembly kernel, each sized like self.LM.
    self.LM_buf = cl.Buffer(self.context, mem_flags.READ_WRITE, self.LM.nbytes)
    self.Ku_buf = cl.Buffer(self.context, mem_flags.READ_WRITE, self.LM.nbytes)
    self.P_buf = cl.Buffer(self.context, mem_flags.READ_WRITE, self.LM.nbytes)
    # cl.enqueue_fill_buffer(self.queue, self.LM_buf, np.float64(0.0), 0, self.LM.nbytes)
    # cl.enqueue_fill_buffer(self.queue, self.Ku_buf, np.float64(0.0), 0, self.LM.nbytes)
    # cl.enqueue_fill_buffer(self.queue, self.P_buf, np.float64(0.0), 0, self.LM.nbytes)

    map_flags = cl.map_flags
    # 24 bytes per node matches the (lclNNodes, 3) mapping below when
    # self.LM.dtype is 8 bytes wide (np.zeros defaults to float64).
    self.appTrac_buf = cl.Buffer(self.context, mem_flags.READ_ONLY, int(self.lclNNodes * 24))
    self.pinned_appTrac = cl.Buffer(
        self.context,
        mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
        int(self.lclNNodes * 24))
    # Map the ALLOC_HOST_PTR buffer so it can be written as a NumPy array.
    self.appTrac, _eventAppTrac = cl.enqueue_map_buffer(
        self.queue, self.pinned_appTrac, map_flags.WRITE, 0,
        (self.lclNNodes, 3), self.LM.dtype)
    self.appTrac[:, :] = 0.0
    # prep_appTrac_event = cl.enqueue_copy(self.queue, self.appTrac_buf, self.appTrac)

    # 'Assemble' the inital M (mass) and Ku (stiffness) 'matrices'.
    # Kernel.
    # One launch per colour group. Each launch waits for the previous one via
    # `wait_for`, so the groups run one after another. One work-item per
    # element of the group, local size 1.
    initial_assemble_events = []
    for iColorGroup in range(len(self.colorGps_buf)):
        initial_assemble_event = \
            self.program.assemble_K_M_P(self.queue, (len(self.mesh.colorGroups[iColorGroup]),), (1,),
                                        np.int64(self.nSmp), np.float64(self.pressure), self.pVals_buf, self.nodes_buf,
                                        self.colorGps_buf[iColorGroup], thickness_buf, self.elmTE_buf[iColorGroup],
                                        u_buf, self.Ku_buf, self.LM_buf, self.P_buf, wait_for=initial_assemble_events)
        initial_assemble_events = [initial_assemble_event]

    # Download LM once the last assembly launch has finished.
    initial_assemble_copy_event = \
        cl.enqueue_copy(self.queue, self.LM, self.LM_buf, wait_for=initial_assemble_events)
    initial_assemble_copy_event.wait()

    # Synchronize the left-hand-side of each equition which is LM.
    # Copy the LM first to LHS.
    self.LHS[:, :] = self.LM
    # Synchronize.
    self.SyncCommNodes(self.LHS)
    # Copy into GPU device and prepared.
    self.LHS_buf = cl.Buffer(self.context,
                             mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                             hostbuf=self.LHS)

    # Calculate accelaration u''.
    # ddu = (F0 - C*du - Ku)/M
    self.ddu = np.zeros((self.lclNDof, self.nSmp))
    self.ddu_buf = cl.Buffer(self.context, mem_flags.READ_WRITE, self.LM.nbytes)
    initial_calc_ddu_event = \
        self.program.calc_ddu(self.queue, (self.globalWorkSize,), (self.localWorkSize,),
                              np.int64(self.nSmp), np.int64(self.lclNDof),
                              self.P_buf, self.Ku_buf, self.LHS_buf, self.ddu_buf)
    initial_ddu_copy_event = \
        cl.enqueue_copy(self.queue, self.ddu, self.ddu_buf, wait_for=[initial_calc_ddu_event])
    initial_ddu_copy_event.wait()
    # Synchronize the acceleration on common nodes.
    self.SyncCommNodes(self.ddu)
    # Add on the global force.
    # appTrac is reshaped from (lclNNodes, 3) to (lclNDof, 1) and broadcast
    # over the samples. It was zero-filled above.
    self.ddu += self.appTrac.reshape(self.lclNDof, 1) / self.LHS

    # Prepare the memories.
    # Memory on GPU devices.
    map_flags = cl.map_flags
    self.ures_buf = cl.Buffer(self.context, mem_flags.READ_WRITE, self.LM.nbytes)
    self.u_buf = cl.Buffer(self.context, mem_flags.READ_WRITE, self.LM.nbytes)
    self.up_buf = cl.Buffer(self.context, mem_flags.READ_WRITE, self.LM.nbytes)
    # 40 bytes per (element, sample) matches the (nElms, nSmp, 5) mapping
    # below when self.LM.dtype is 8 bytes wide.
    self.stress_buf = cl.Buffer(self.context, mem_flags.WRITE_ONLY,
                                int(self.nElms * self.nSmp * 40))
    # Pinned memory on CPU.
    self.pinned_ures = cl.Buffer(
        self.context,
        mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
        self.LM.nbytes)
    self.pinned_u = cl.Buffer(
        self.context,
        mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
        self.LM.nbytes)
    self.pinned_up = cl.Buffer(
        self.context,
        mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
        self.LM.nbytes)
    self.pinned_stress = cl.Buffer(
        self.context,
        mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
        int(self.nElms * self.nSmp * 40))
    # Map to CPU.
    self.srcURes, _eventSrcURes = cl.enqueue_map_buffer(
        self.queue, self.pinned_ures,
        map_flags.WRITE | map_flags.READ, 0,
        self.LM.shape, self.LM.dtype)
    self.srcU, _eventSrcU = cl.enqueue_map_buffer(
        self.queue, self.pinned_u,
        map_flags.WRITE | map_flags.READ, 0,
        self.LM.shape, self.LM.dtype)
    self.srcUP, _eventSrcUP = cl.enqueue_map_buffer(
        self.queue, self.pinned_up,
        map_flags.WRITE | map_flags.READ, 0,
        self.LM.shape, self.LM.dtype)
    self.stress, _eventStress = cl.enqueue_map_buffer(
        self.queue, self.pinned_stress, map_flags.READ, 0,
        (self.nElms, self.nSmp, 5), self.LM.dtype)

    # Use Taylor Expansion to get u_-1.
    # u and du get a leading axis and are transposed, so they broadcast
    # against the (lclNDof, nSmp) arrays.
    self.srcU[:, :] = self.u[np.newaxis].transpose()
    self.srcUP[:, :] = self.srcU - self.dt * self.du[
        np.newaxis].transpose() + self.dt**2 * self.ddu / 2.0

    # copy up first to device
    prep_up_event = cl.enqueue_copy(self.queue, self.up_buf, self.srcUP)
    prep_u_event = cl.enqueue_copy(self.queue, self.u_buf, self.srcU)
} ''' # Get device and context, create command queue and program dev = utility.get_default_device() context = cl.Context(devices=[dev]) queue = cl.CommandQueue(context, dev) # Build program in the specified context using the kernel source code prog = cl.Program(context, kernel_src) try: prog.build(options=['-Werror'], devices=[dev]) except: print('Build log:') print(prog.get_build_info(dev, cl.program_build_info.LOG)) raise # Create output buffer out = cl.array.vec.zeros_int4() buffer_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=out.itemsize) # Enqueue kernel (with argument specified directly) n_globals = (1, ) n_locals = None prog.op_test(queue, n_globals, n_locals, buffer_out) # Enqueue command to copy from buffer_out to host memory cl.enqueue_copy(queue, dest=out, src=buffer_out, is_blocking=True) print('Output: ' + str(out))
def __preexecute_kernels(self):
    """Allocate the host arrays and device buffers the kernels work on.

    Creates zeroed host arrays for the fitnesses and chromosomes, draws one
    random 32-bit seed per individual on the host, and builds the matching
    device buffers. In elitism mode the elite buffers are created as well.
    Afterwards the statistics buffers are created, the fitness array is
    copied to the device, the sample chromosome prepares its own data and
    the kernel information is dumped.
    """
    flags = cl.mem_flags
    rw_copy = flags.READ_WRITE | flags.COPY_HOST_PTR
    population = self.__population

    self.__fitnesses = numpy.zeros(population, dtype=numpy.float32)
    self.__np_chromosomes = numpy.zeros(
        population * self.__sample_chromosome.dna_total_length,
        dtype=numpy.int32)

    # The seeds are generated on the host with Python's `random` module and
    # handed to the device as uint32 values.
    seeds = numpy.array(
        [random.randint(0, 4294967295) for _ in range(population)],
        dtype=numpy.uint32)
    self.__dev_rnum = cl.Buffer(self.__ctx, rw_copy, hostbuf=seeds)
    self.__dev_chromosomes = cl.Buffer(self.__ctx, rw_copy,
                                       hostbuf=self.__np_chromosomes)
    self.__dev_fitnesses = cl.Buffer(self.__ctx, flags.WRITE_ONLY,
                                     self.__fitnesses.nbytes)

    self.__prepare_fitness_args()

    if self.__is_elitism_mode:
        self.__elites_updated = False

        # Current and updated elites each hold `elitism_top` chromosomes.
        self.__current_elites = numpy.zeros(
            self.__sample_chromosome.dna_total_length * self.__elitism_top,
            dtype=numpy.int32)
        self.__dev_current_elites = cl.Buffer(
            self.__ctx, rw_copy, hostbuf=self.__current_elites)

        self.__updated_elites = numpy.zeros(
            self.__sample_chromosome.dna_total_length * self.__elitism_top,
            dtype=numpy.int32)
        self.__dev_updated_elites = cl.Buffer(
            self.__ctx, rw_copy, hostbuf=self.__updated_elites)

        self.__updated_elite_fitnesses = numpy.zeros(self.__elitism_top,
                                                     dtype=numpy.float32)
        self.__dev_updated_elite_fitnesses = cl.Buffer(
            self.__ctx, rw_copy, hostbuf=self.__updated_elite_fitnesses)

    # Buffers backing the best / worst index statistics.
    self.__dev_best_indices = cl.Buffer(self.__ctx, rw_copy,
                                        hostbuf=self.__best_indices)
    self.__dev_worst_indices = cl.Buffer(self.__ctx, rw_copy,
                                         hostbuf=self.__worst_indices)

    # Upload the (zeroed) fitness array.
    cl.enqueue_copy(self.__queue, self.__dev_fitnesses, self.__fitnesses)

    # Let the chromosome prepare its internal data structures.
    self.__sample_chromosome.preexecute_kernels(self.__ctx, self.__queue,
                                                self.__population)

    # Dump information on kernel resource usage.
    self.__dump_kernel_info(self.__prg, self.__ctx, self.__sample_chromosome)
def test_image_3d(ctx_factory):
    """Round-trip a 3D float2 image built with ``image_from_array``.

    A random array is turned into an image, copied into a plain buffer by a
    kernel that reads it through a sampler, and the buffer contents are
    compared with the source array. The test is skipped when the device has
    no image support, is an Intel device, or does not support the required
    image format.
    """
    context = ctx_factory()
    device, = context.devices

    if not device.image_support:
        import pytest
        pytest.skip("images not supported on %s" % device)

    if device.platform.vendor == "Intel(R) Corporation":
        import pytest
        pytest.skip("images crashy on %s" % device)

    kernel_src = """ __kernel void copy_image_plane( __global float2 *dest, __read_only image3d_t src, sampler_t samp, int stride0, int stride1) { int d0 = get_global_id(0); int d1 = get_global_id(1); int d2 = get_global_id(2); /* const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; */ dest[d0*stride0 + d1*stride1 + d2] = read_imagef( src, samp, (float4)(d2, d1, d0, 0)).xy; } """
    prg = cl.Program(context, kernel_src).build()

    num_channels = 2
    shape = (3, 4, 2)
    a = np.random.random(shape + (num_channels,)).astype(np.float32)

    queue = cl.CommandQueue(context)
    try:
        a_img = cl.image_from_array(context, a, num_channels)
    except cl.RuntimeError as exc:
        # Only an unsupported image format is a reason to skip.
        if exc.code != cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
            raise
        import pytest
        pytest.skip("required image format not supported on %s" % device.name)

    a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes)
    samp = cl.Sampler(context, False,
                      cl.addressing_mode.CLAMP,
                      cl.filter_mode.NEAREST)

    # Strides of the two outer axes, expressed in float2 elements.
    stride0 = np.int32(a.strides[0]/a.itemsize/num_channels)
    stride1 = np.int32(a.strides[1]/a.itemsize/num_channels)
    prg.copy_image_plane(queue, shape, None, a_dest, a_img, samp,
                         stride0, stride1)

    a_result = np.empty_like(a)
    cl.enqueue_copy(queue, a_result, a_dest)

    good = la.norm(a_result - a) == 0
    if good:
        return

    # Failure path: CPU devices get the more explanatory message.
    if queue.device.type & cl.device_type.CPU:
        assert good, ("The image implementation on your CPU CL platform '%s' "
                      "returned bad values. This is bad, but common."
                      % queue.device.platform)
    else:
        assert good
import numpy as np
import sys

# Pick the first GPU device of the first platform and create a context and
# command queue bound to it.
platforms = cl.get_platforms()
platform = platforms[0]
devs = platform.get_devices(cl.device_type.GPU)
dev = devs[0]
mf = cl.mem_flags
ctx = cl.Context([dev])
queue = cl.CommandQueue(ctx, dev)

# Host data: two int32 inputs holding 0..9 and a zeroed int32 output.
n1 = np.arange(10).astype(np.int32)
n2 = np.arange(10).astype(np.int32)
out = np.zeros(10).astype(np.int32)

# Device buffers: the inputs are copied from the host arrays, the output is
# only allocated (sized like `out`).
b_n1 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=n1)
b_n2 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=n2)
b_out = cl.Buffer(ctx, mf.WRITE_ONLY, size=out.nbytes)

# Kernel source follows; the string literal continues past this fragment.
prog = cl.Program(
    ctx, """ __kernel void prog( __global int *n1, __global int *n2, __global int *out) { int i = get_local_id(0); __local int a; a = i; barrier(CLK_LOCAL_MEM_FENCE); printf("%d:%d\\n", get_global_id(0), get_group_id(1));
for name in img_names[1:]: img1 = Image.open(name) # img1 = img.convert("YCbCr") img_arr = numpy.asarray(img1).astype(numpy.uint8) host_arr = numpy.concatenate((host_arr, img_arr.reshape(-1))) host_arr = host_arr.astype(numpy.uint8) print dim new_dim = (len(img_names), dim[0], dim[1], dim[2]) print "new dimensions are", new_dim ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) mf = cl.mem_flags a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_arr) dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, host_arr.nbytes) print "[%d] Takes " % len(img_names), naturalsize(host_arr.nbytes) kernel_code = open("embed_1.cl").read() % (new_dim[1], new_dim[2], new_dim[3]) prg1 = cl.Program(ctx, kernel_code).build() stime = time.time() prg1.embed_one(queue, (new_dim[0], new_dim[1], new_dim[2]), None, a_buf, dest_buf) etime = time.time() print "[%d] GPU takes " % len(img_names), naturaltime(etime - stime)
#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function import numpy as np import pyopencl as cl a_np = np.random.rand(50000).astype(np.float32) b_np = np.random.rand(50000).astype(np.float32) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) mf = cl.mem_flags a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np) b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np) prg = cl.Program( ctx, """ __kernel void sum(__global const float *a_g, __global const float *b_g, __global float *res_g) { int gid = get_global_id(0); res_g[gid] = a_g[gid] + b_g[gid]; } """).build() res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) res_np = np.empty_like(a_np) cl.enqueue_copy(queue, res_np, res_g)
def miningThread(self):
    """Run the mining loop until ``self.stop`` is set.

    Each pass fetches work from ``self.workQueue`` when needed, launches the
    search kernel over ``globalThreads`` work-items, reads the output buffer
    back, adapts ``globalThreads`` to the measured rate, and puts a result
    on ``self.resultQueue`` when the last output slot is non-zero.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed listing - confirm against the original file.
    self.loadKernel()
    frame = 1.0 / self.frames
    unit = self.worksize * 256
    globalThreads = unit * 10
    queue = cl.CommandQueue(self.context)
    lastRatedPace = lastRated = lastNTime = time()
    base = lastHashRate = threadsRunPace = threadsRun = 0
    f = np.zeros(8, np.uint32)
    # One extra slot after the OUTPUT_SIZE entries; that last slot is the
    # one tested below to decide whether a result is reported.
    output = np.zeros(OUTPUT_SIZE + 1, np.uint32)
    output_buf = cl.Buffer(self.context,
                           cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR,
                           hostbuf=output)
    work = None
    while True:
        if self.stop:
            return
        # Fetch new work when there is none yet or when more is queued.
        if (not work) or (not self.workQueue.empty()):
            try:
                work = self.workQueue.get(True, 1)
            except Empty:
                continue
            else:
                if not work:
                    continue
                noncesLeft = self.hashspace
                # Hex strings from the work item decoded into uint32 arrays.
                data = np.array(unpack('IIIIIIIIIIIIIIII',
                                       work['data'][128:].decode('hex')),
                                dtype=np.uint32)
                state = np.array(unpack('IIIIIIII',
                                        work['midstate'].decode('hex')),
                                 dtype=np.uint32)
                target = np.array(unpack('IIIIIIII',
                                         work['target'].decode('hex')),
                                  dtype=np.uint32)
                state2 = partial(state, data, f)
        self.miner.search(queue, (globalThreads, ), (self.worksize, ),
                          state[0], state[1], state[2], state[3], state[4],
                          state[5], state[6], state[7], state2[1], state2[2],
                          state2[3], state2[5], state2[6], state2[7],
                          pack('I', base), f[0], f[1], f[2], f[3], f[4], f[5],
                          f[6], f[7], output_buf)
        cl.enqueue_read_buffer(queue, output_buf, output)
        noncesLeft -= globalThreads
        threadsRunPace += globalThreads
        threadsRun += globalThreads
        base = uint32(base + globalThreads)
        now = time()
        # Re-tune globalThreads at most once per second, and only when the
        # rate moved by more than 10% relative to the last recorded one.
        t = now - lastRatedPace
        if (t > 1):
            rate = (threadsRunPace / t) / self.rateDivisor
            lastRatedPace = now
            threadsRunPace = 0
            r = lastHashRate / rate
            if r < 0.9 or r > 1.1:
                globalThreads = max(
                    unit * int((rate * frame * self.rateDivisor) / unit), unit)
                lastHashRate = rate
        # Report the hash rate every self.rate seconds.
        t = now - lastRated
        if (t > self.rate):
            self.hashrate(int((threadsRun / t) / self.rateDivisor))
            lastRated = now
            threadsRun = 0
        if self.updateTime == '':
            if noncesLeft < TIMEOUT * globalThreads * self.frames:
                self.update = True
                # Large offset so this branch is not taken again for the
                # same work; the range test below detects the offset value.
                noncesLeft += 0xFFFFFFFFFFFF
            elif 0xFFFFFFFFFFF < noncesLeft < 0xFFFFFFFFFFFF:
                self.sayLine('warning: job finished, miner is idle')
                work = None
        queue.finish()
        if output[OUTPUT_SIZE]:
            result = {}
            result['work'] = work
            result['data'] = np.array(data)
            result['state'] = np.array(state)
            result['target'] = target
            result['output'] = np.array(output)
            self.resultQueue.put(result)
            # Clear the output on the host and write it back to the device.
            output.fill(0)
            cl.enqueue_write_buffer(queue, output_buf, output)
        # With updateTime set, bump data[1] at most once per second and
        # recompute state2 from it.
        if self.updateTime != '' and now - lastNTime > 1:
            data[1] = bytereverse(bytereverse(data[1]) + 1)
            state2 = partial(state, data, f)
            lastNTime = now
def __call__(self, size):
    """Return a new ``cl.Buffer`` of the requested size.

    The buffer is created in this object's context with this object's flags.
    """
    ctx, flags = self.context, self.flags
    return cl.Buffer(ctx, flags, size)
def clFindRSAll(h_bf_npointsrange, h_pointset, h_query, h_vecradius, thelier,
                nchunks, pointdim, signallength, gpuid):
    """Run the brute-force range-search kernel and download its output.

    Args:
        h_bf_npointsrange: host array that receives the kernel output; it is
            overwritten in place.
        h_pointset: host array uploaded read-only as the point set.
        h_query: host array uploaded read-only as the query points.
        h_vecradius: host array uploaded read-only as the search radii.
        thelier: int32 scalar forwarded to the kernel.
        nchunks: number of chunks the signal is divided into; used to derive
            the trial length.
        pointdim: int32 scalar forwarded to the kernel.
        signallength: total number of points; the launch size is this value
            rounded up to a multiple of the work-group size.
        gpuid: index of the device to use.

    Returns:
        1 after the results have been copied into ``h_bf_npointsrange``.
    """
    triallength = int(signallength / nchunks)

    # Set up OpenCL
    my_gpu_devices, context, queue = _get_device(gpuid)
    device = my_gpu_devices[gpuid]

    # Check memory resources (both figures in whole megabytes).
    usedmem = int((h_query.nbytes + h_pointset.nbytes + h_vecradius.nbytes +
                   h_bf_npointsrange.nbytes) // 1024 // 1024)
    totalmem = int(device.global_mem_size // 1024 // 1024)
    if (totalmem * 0.90) < usedmem:
        print('WARNING: {0} Mb used from a total of {1} Mb. GPU could get '
              'without memory.'.format(usedmem, totalmem))

    # Create OpenCL buffers
    input_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    d_bf_query = cl.Buffer(context, input_flags, hostbuf=h_query)
    d_bf_pointset = cl.Buffer(context, input_flags, hostbuf=h_pointset)
    d_bf_vecradius = cl.Buffer(context, input_flags, hostbuf=h_vecradius)
    d_bf_npointsrange = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                  h_bf_npointsrange.nbytes)

    try:
        # Kernel launch; the source file is closed as soon as it is read.
        kernelLocation = resource_filename(__name__, 'gpuKnnBF_kernel.cl')
        with open(kernelLocation) as kernel_file:
            kernelsource = kernel_file.read()
        program = cl.Program(context, kernelsource).build()
        kernelBFRSAllshared = program.kernelBFRSAllshared
        kernelBFRSAllshared.set_scalar_arg_dtypes(
            [None, None, None, None, np.int32, np.int32, np.int32, np.int32,
             None])

        # Size of workitems and NDRange
        max_group_size = device.max_work_group_size
        if signallength / nchunks < max_group_size:
            workitems_x = 8
        elif max_group_size < 256:
            workitems_x = max_group_size
        else:
            workitems_x = 256

        # Ceiling division: the smallest number of work-groups that covers
        # signallength. The previous round(...) + 1 form launched one group
        # too many under true division whenever the fraction was >= 0.5.
        n_groups = int(-(-signallength // workitems_x))
        NDRange_x = workitems_x * n_groups

        # Local memory for rangesearch. Actually not used, better results
        # with private memory
        localmem = cl.LocalMemory(np.dtype(np.int32).itemsize * workitems_x)

        kernelBFRSAllshared(queue, (NDRange_x, ), (workitems_x, ), d_bf_query,
                            d_bf_pointset, d_bf_vecradius, d_bf_npointsrange,
                            pointdim, triallength, signallength, thelier,
                            localmem)
        queue.finish()

        # Download results
        cl.enqueue_copy(queue, h_bf_npointsrange, d_bf_npointsrange)
    finally:
        # Free buffers even when the build, launch or download fails.
        d_bf_npointsrange.release()
        d_bf_vecradius.release()
        d_bf_query.release()
        d_bf_pointset.release()

    return 1
def compress_image(img, num_centroids, iters):
    """Compress an image with K-Means clustering run through OpenCL.

    Each iteration runs the ``FindClosestCentroid`` kernel to assign every
    pixel to a centroid, recomputes each non-empty centroid on the host as
    the mean of its pixels, and uploads the centroids again. The result
    image replaces every pixel by its centroid colour.

    Args:
        img: image.Image to be compressed.
        num_centroids: number of clusters (colours) to use.
        iters: number of K-Means iterations to run.

    Returns:
        image.Image with image data which has been compressed.
    """
    # Get OpenCL context and queue
    context, queue = setup_opencl()
    mf = cl.mem_flags

    # Gather image data
    img_data = img.raw_data(image.ImageDataFormat.FLATTENED_NORMALIZED)
    img_dims = img.shape()
    height, width = img_dims[0], img_dims[1]
    num_pixels = height * width

    # Create buffers
    imgBuffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=img_data)
    # NOTE(review): the array is allocated with 4 floats per centroid but the
    # host code below addresses it with a stride of 3 - confirm which layout
    # the kernel expects.
    centroids = np.random.random_sample((num_centroids * 4)).astype(np.float32)
    centroidsBuffer = cl.Buffer(context, mf.READ_WRITE | mf.USE_HOST_PTR,
                                hostbuf=centroids)
    indices = np.zeros((num_pixels, )).astype(np.int32)
    indicesBuffer = cl.Buffer(context, mf.READ_WRITE,
                              size=indices.itemsize * num_pixels)

    # Load and compile the kernel; the source file is closed after reading.
    build_ops = [
        "-D NUM_CENTROIDS={0}".format(num_centroids),
        "-D IMG_WIDTH={0}".format(img_dims[1])
    ]
    with open('kernels/image_kmeans.cl') as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build(options=build_ops)

    # Get the kernel and set the arguments
    kernel = cl.Kernel(program, 'FindClosestCentroid')
    kernel.set_arg(0, imgBuffer)
    kernel.set_arg(1, centroidsBuffer)
    kernel.set_arg(2, indicesBuffer)

    # Per-channel views of the interleaved pixel data (3 values per pixel).
    # assumes img_data is a flat array of at least 3 * num_pixels values
    # - TODO confirm
    pixels = np.asarray(img_data)
    channels = [pixels[c:3 * num_pixels:3] for c in range(3)]

    for _ in range(iters):
        cl.enqueue_nd_range_kernel(queue, kernel, (height, width), None)
        cl.enqueue_copy(queue, indices, indicesBuffer, is_blocking=True)

        # Number of pixels assigned to each centroid.
        counts = np.bincount(indices, minlength=num_centroids)
        if counts.size > num_centroids:
            raise IndexError("kernel returned a centroid index >= {0}".format(
                num_centroids))
        populated = np.flatnonzero(counts)

        # New centroid = mean colour of its pixels; centroids without any
        # pixel keep their previous value.
        for c in range(3):
            sums = np.bincount(indices, weights=channels[c],
                               minlength=num_centroids)
            centroids[3 * populated + c] = sums[populated] / counts[populated]

        cl.enqueue_copy(queue, centroidsBuffer, centroids, is_blocking=True)

    # Map every pixel to its centroid colour, scaled by 256 and truncated.
    palette = np.trunc(
        centroids[:3 * num_centroids].reshape(num_centroids, 3) * 256)
    compressed_img = np.zeros(img_dims)
    compressed_img[:, :, :3] = palette[indices.reshape(height, width)]

    return image.Image(image_data=compressed_img)
def mining_thread(self):
    """Run the OpenCL mining loop until ``self.should_stop`` is set.

    Each pass optionally fetches new work and sets the kernel arguments
    derived from it, launches the kernel unless the temperature has reached
    the cutoff, adapts the launch size to the measured rate, reads the
    output buffer back and puts a result on ``self.switch`` when the last
    output byte is non-zero.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed listing - confirm against the original file.
    say_line('started OpenCL miner on platform %d, device %d (%s)',
             (self.options.platform, self.device_index, self.device_name))

    # Build defines, rate divisor and hash space depend on the vectors flag.
    (self.defines, rate_divisor,
     hashspace) = (vectors_definition(), 500,
                   0x7FFFFFFF) if self.vectors else ('', 1000, 0xFFFFFFFF)
    self.defines += (' -DOUTPUT_SIZE=' + str(self.output_size))
    self.defines += (' -DOUTPUT_MASK=' + str(self.output_size - 1))

    self.load_kernel()
    frame = 1.0 / max(self.frames, 3)
    unit = self.worksize * 256
    global_threads = unit * 10

    queue = cl.CommandQueue(self.context)

    last_rated_pace = last_rated = last_n_time = last_temperature = time()
    base = last_hash_rate = threads_run_pace = threads_run = 0
    # Byte buffer of (output_size + 1) * 4 bytes; its last byte is the one
    # tested below to decide whether a result is reported.
    output = bytearray((self.output_size + 1) * 4)
    output_buffer = cl.Buffer(
        self.context, cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR,
        hostbuf=output)
    self.kernel.set_arg(20, output_buffer)

    work = None
    temperature = 0
    while True:
        if self.should_stop:
            return

        sleep(self.frameSleep)

        # Fetch new work when there is none yet or when more is queued.
        if (not work) or (not self.work_queue.empty()):
            try:
                work = self.work_queue.get(True, 1)
            except Empty:
                continue
            else:
                if not work:
                    continue
                self.nonces_left = hashspace
                state = work.state
                f = [0] * 8
                state2 = partial(state, work.merkle_end, work.time,
                                 work.difficulty, f)
                calculateF(state, work.merkle_end, work.time, work.difficulty,
                           f, state2)

                # Arguments 0-13 and 15-19 come from the work item;
                # argument 14 (base) is set just before each launch.
                self.kernel.set_arg(0, pack('<I', state[0]))
                self.kernel.set_arg(1, pack('<I', state[1]))
                self.kernel.set_arg(2, pack('<I', state[2]))
                self.kernel.set_arg(3, pack('<I', state[3]))
                self.kernel.set_arg(4, pack('<I', state[4]))
                self.kernel.set_arg(5, pack('<I', state[5]))
                self.kernel.set_arg(6, pack('<I', state[6]))
                self.kernel.set_arg(7, pack('<I', state[7]))

                self.kernel.set_arg(8, pack('<I', state2[1]))
                self.kernel.set_arg(9, pack('<I', state2[2]))
                self.kernel.set_arg(10, pack('<I', state2[3]))
                self.kernel.set_arg(11, pack('<I', state2[5]))
                self.kernel.set_arg(12, pack('<I', state2[6]))
                self.kernel.set_arg(13, pack('<I', state2[7]))

                self.kernel.set_arg(15, pack('<I', f[0]))
                self.kernel.set_arg(16, pack('<I', f[1]))
                self.kernel.set_arg(17, pack('<I', f[2]))
                self.kernel.set_arg(18, pack('<I', f[3]))
                self.kernel.set_arg(19, pack('<I', f[4]))

        if temperature < self.cutoff_temp:
            self.kernel.set_arg(14, pack('<I', base))
            cl.enqueue_nd_range_kernel(queue, self.kernel, (global_threads, ),
                                       (self.worksize, ))

            self.nonces_left -= global_threads
            threads_run_pace += global_threads
            threads_run += global_threads
            base = uint32(base + global_threads)
        else:
            # At or above the cutoff temperature: skip the launch, reset the
            # pace measurement and pause.
            threads_run_pace = 0
            last_rated_pace = time()
            sleep(self.cutoff_interval)

        now = time()
        # Refresh the temperature at most once per second, or on every pass
        # while at or above the cutoff.
        if self.adapterIndex != None:
            t = now - last_temperature
            if temperature >= self.cutoff_temp or t > 1:
                last_temperature = now
                with adl_lock:
                    temperature = self.get_temperature()

        # Re-tune global_threads at most once per second, and only when the
        # rate moved by more than 10% relative to the last recorded one.
        t = now - last_rated_pace
        if t > 1:
            rate = (threads_run_pace / t) / rate_divisor
            last_rated_pace = now
            threads_run_pace = 0
            r = last_hash_rate / rate
            if r < 0.9 or r > 1.1:
                global_threads = max(
                    unit * int((rate * frame * rate_divisor) / unit), unit)
                last_hash_rate = rate

        # Report the rate every self.options.rate seconds.
        t = now - last_rated
        if t > self.options.rate:
            self.update_rate(now, threads_run, t, work.targetQ, rate_divisor)
            last_rated = now
            threads_run = 0

        queue.finish()
        cl.enqueue_read_buffer(queue, output_buffer, output)
        queue.finish()

        if output[-1]:
            result = Object()
            result.header = work.header
            result.merkle_end = work.merkle_end
            result.time = work.time
            result.difficulty = work.difficulty
            result.target = work.target
            result.state = list(state)
            result.nonces = output[:]
            result.job_id = work.job_id
            result.extranonce2 = work.extranonce2
            result.server = work.server
            result.miner = self
            self.switch.put(result)
            # Clear the output on the host and write it back to the device.
            output[:] = b'\x00' * len(output)
            cl.enqueue_write_buffer(queue, output_buffer, output)

        if self.switch.should_stop:
            self.stop()

        if not self.switch.update_time:
            if self.nonces_left < 3 * global_threads * self.frames:
                self.update = True
                # Large offset so this branch is not taken again for the
                # same work; the range test below detects the offset value.
                self.nonces_left += 0xFFFFFFFFFFFF
            elif 0xFFFFFFFFFFF < self.nonces_left < 0xFFFFFFFFFFFF:
                say_line('warning: job finished, %s is idle', self.id())
                work = None
        elif now - last_n_time > 1:
            # Bump work.time at most once per second and refresh the kernel
            # arguments derived from it.
            work.time = bytereverse(bytereverse(work.time) + 1)
            state2 = partial(state, work.merkle_end, work.time,
                             work.difficulty, f)
            calculateF(state, work.merkle_end, work.time, work.difficulty, f,
                       state2)
            self.kernel.set_arg(8, pack('<I', state2[1]))
            self.kernel.set_arg(9, pack('<I', state2[2]))
            self.kernel.set_arg(10, pack('<I', state2[3]))
            self.kernel.set_arg(11, pack('<I', state2[5]))
            self.kernel.set_arg(12, pack('<I', state2[6]))
            self.kernel.set_arg(13, pack('<I', state2[7]))
            self.kernel.set_arg(15, pack('<I', f[0]))
            self.kernel.set_arg(16, pack('<I', f[1]))
            self.kernel.set_arg(17, pack('<I', f[2]))
            self.kernel.set_arg(18, pack('<I', f[3]))
            self.kernel.set_arg(19, pack('<I', f[4]))
            last_n_time = now
            self.update_time_counter += 1
            if self.update_time_counter >= self.switch.max_update_time:
                self.update = True
                self.update_time_counter = 1
class Runner:
    """Voronoi-diagram renderer with an optional OpenCL back end.

    On construction, ``REGIONS`` random seed points and one random RGB
    colour per region are generated, then the OpenCL set-up is attempted.
    ``use_cl`` stays False when ``pyopencl`` cannot be imported.
    """

    def __init__(self, dims):
        """Create the random regions for an image of size ``dims``.

        Args:
            dims: sequence whose first two items are the width and height.
        """
        import numpy as np
        self.np = np
        self.dims = dims
        self.width = dims[0]
        self.height = dims[1]
        self.regions = REGIONS

        # One (x, y) seed point per region, paired up row by row.
        nx = np.random.randint(0, self.width, size=self.regions,
                               dtype=np.int16)
        ny = np.random.randint(0, self.height, size=self.regions,
                               dtype=np.int16)
        self.points = np.dstack((nx, ny))[0]
        self.cols = np.random.randint(0, 256, size=(self.regions, 3),
                                      dtype=np.uint8)

        self.use_cl = False
        self.init_gpu()

    def init_gpu(self):
        """Set up the OpenCL context, colour table and kernel.

        If ``pyopencl`` cannot be imported, the library path and the error
        are printed and the method returns with ``use_cl`` left False.
        """
        try:
            import pyopencl as cl
            # Imported for its side effect: makes `cl.array` available.
            from pyopencl import array
        except Exception as e:
            import os
            print(os.getenv('LD_LIBRARY_PATH'))
            print(e)
            return

        self.use_cl = True
        self.cl = cl
        device = cl.get_platforms()[0].get_devices()[0]
        self.ctx = cl.Context([device])
        self.queue = cl.CommandQueue(self.ctx)
        print(device)

        # Colour look-up table with one spare entry after the last region;
        # the spare entry keeps its zero initialisation.
        self.lut = self.np.zeros(self.regions + 1, cl.array.vec.char3)
        for idx, i in enumerate(self.cols):
            self.lut[idx][0] = i[0]
            self.lut[idx][1] = i[1]
            self.lut[idx][2] = i[2]
        self.lut_opencl = cl.Buffer(
            self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
            hostbuf=self.lut)

        # The kernel source must keep its line breaks: it contains a `//`
        # comment, which would swallow the rest of the kernel on one line.
        self.prg = cl.Program(
            self.ctx, """
            __kernel void voronoi(__global uchar4 *img,
                                  const __global ushort2 *points,
                                  __constant uchar4 *lut,
                                  ushort const height,
                                  ushort const width,
                                  ushort const regions)
            {
                int x = get_global_id(0);
                int y = get_global_id(1);
                // int grid_width = get_num_groups(0) * get_local_size(0);
                int index = y * height + x;
                int h = -1;
                float dmin = hypot((float)width -1, (float)height -1);
                for(int i = 0; i < regions; i++)
                {
                    float d = hypot((float)points[i].x - y, (float)points[i].y - x);
                    if (d < dmin)
                    {
                        dmin = d;
                        h = i;
                    }
                }
                img[index] = lut[h];
            }
            """).build()
# NOTE(review): this fragment starts inside a block that is not visible here
# and its nesting was reconstructed from a whitespace-collapsed listing -
# confirm against the original file.
kernel = content_file.read()
prg = cl.Program(ctx, kernel).build()

# One flat float32 array of 3 * nmixtures * resolution values, zero-filled
# and then initialised in two slices.
mixture_data_buff = np.zeros(3 * nmixtures * resolution, dtype=np.float32)
mixture_data_buff[0:resolution * nmixtures] = 1.0 / nmixtures / 10
# NOTE(review): this slice starts at resolution * nmixtures + 1, so the
# element at index resolution * nmixtures keeps its initial 0 - confirm that
# the offset is intended.
mixture_data_buff[resolution * nmixtures + 1:2 * resolution *
                  nmixtures] = init_var

# Scalar parameters packed into one float32 array.
params_list = [k, T, init_var, min_var]
mog_params = np.array(params_list, dtype=np.float32)
f = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNORM_INT8)

# Allocate memory for variables on the device (copied from the host arrays)
mixture_data_g = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=mixture_data_buff)
mog_params_g = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=mog_params)

cap = cv2.VideoCapture(camera)
time_begin = time.time()
cnt = 0
while (True):
    # Read in image
    ret, frame = cap.read()
    if ret:
        # Convert the frame from BGR to RGBA and wrap it as a read-only
        # 4-channel OpenCL image.
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA)
        img_g = cl.image_from_array(ctx, img, 4, mode='r', norm_int=True)
        # Second array axis first, then the first axis.
        img_shape = (img.shape[1], img.shape[0])
dlst = np.array([d1, d2, d2, d1, 0], dtype=np.float32) print 'dim (%d, %d, %d)' % (nx, ny, nz) total_bytes = nx * ny * nz * 4 * 12 if total_bytes / (1024**3) == 0: print 'mem %d MB' % (total_bytes / (1024**2)) else: print 'mem %1.2f GB' % (float(total_bytes) / (1024**3)) # memory allocate f = np.zeros((nx, ny, nz), 'f', order='F') #f = np.random.randn(nx*ny*nz).astype(np.float32).reshape((nx,ny,nz),order='F') cf = np.ones_like(f) * (S / 24) mf = cl.mem_flags ex_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f) ey_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f) ez_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f) hx_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f) hy_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f) hz_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f) cex_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf) cey_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf) cez_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf) chx_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf) chy_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf) chz_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf) # prepare kernels prg = cl.Program(ctx, kernels).build()