for (int j=0; j<4; j++){ g_grad[i+j] = grad[i+j]; } } """ # # OpenCL setup. # kernel_code = pyOpenCLNCS.loadNCSKernel() + kernel_code # Create context and command queue platform = cl.get_platforms()[0] devices = platform.get_devices() context = cl.Context(devices) queue = cl.CommandQueue( context, properties=cl.command_queue_properties.PROFILING_ENABLE) # Open program file and build program = cl.Program(context, kernel_code) try: program.build() except: print("Build log:") print(program.get_build_info(devices[0], cl.program_build_info.LOG)) raise def test_calc_ll(): n_pts = 16
def testOpenCLFunction(file_kernel, T, time_step, params, data, platform_id=0): file_kernel = file_kernel abs_path = os.path.dirname(os.path.realpath(__file__)) file_test = abs_path + "/prob_test.cl" #file_test = "../../HMMLikelihood/prob_test.cl" n_kernel_param = np.array(params).shape[0] n_data_dim = np.array(data).shape[0] defines = "-D T={} -D n_kernel_param={} -D n_data_dim={} ".format( T, n_kernel_param, n_data_dim) + "-cl-std=CL1.2" # Generate Code code_kernel = open(file_kernel, "r").read() code_test = open(file_test, "r").read() code = code_kernel + "\n" + code_test # OpenCL Initialisation platform = cl.get_platforms()[platform_id] device = platform.get_devices()[0] context = cl.Context([device]) program = cl.Program(context, code).build(defines) queue = cl.CommandQueue(context) timestep_in = np.array([time_step]).astype(cl.cltypes.int) params_in = np.array(params).astype(cl.cltypes.float) data_in = np.array(data).astype(cl.cltypes.float) res_out = np.zeros(1, np.float32) # Buffer creation mem_flags = cl.mem_flags timestep_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=timestep_in) param_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=params_in) data_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=data_in) res_buf = cl.Buffer(context, mem_flags.READ_WRITE, res_out.nbytes) # Assign function execution kernel = program.prob_test # Set program arguments globalItems = (1, ) localItems = None kernel.set_arg(0, timestep_buf) kernel.set_arg(1, param_buf) kernel.set_arg(2, data_buf) kernel.set_arg(3, res_buf) completeEvent = cl.enqueue_nd_range_kernel(queue, kernel, globalItems, localItems) completeEvent.wait() cl.enqueue_copy(queue, res_out, res_buf, is_blocking=True) return res_out
############################ ############################ ## Import packages import carbontax.optimalcontrol import carbontax.scenarios import carbontax.plot import pyopencl as cl import numpy as np ## Create OpenCL kernel platform = cl.get_platforms()[0] device = platform.get_devices()[0] context = cl.Context([device]) # Initialize the Context queue = cl.CommandQueue(context) # Instantiate a Queue ## Results scenario = carbontax.scenarios.createScenarioFunctions( context, queue, inertia='no', evolution='default', #extraparams={'maxReduct': 0.05, 'progressRatio':0.7}, minEmissions=-10) output = carbontax.optimalcontrol.findOptimalCarbonPath( scenarioFunctions=scenario, T=86,
def main(): platform_ID = None xclbin = None globalbuffersize = 1024 * 1024 * 16 #16 MB typesize = 512 threshold = 40000 expected = np.array([ [300, 240, 450, 250, 250, 250], # 32 bits [600, 500, 1000, 500, 500, 500], # 64 bits [1100, 900, 1500, 1100, 1100, 1100], #128 bits [1500, 1500, 1900, 2200, 2200, 2200], #256 bits [1900, 2000, 2300, 3800, 3800, 3800] #512 bits ]) # Process cmd line args parser = OptionParser() parser.add_option("-k", "--kernel", help="xclbin path") parser.add_option("-d", "--device", help="device index") (options, args) = parser.parse_args() xclbin = options.kernel index = options.device if xclbin is None: print("No xclbin specified\nUsage: -k <path to xclbin>") sys.exit(1) if index is None: index = 0 #get default device platforms = cl.get_platforms() # get Xilinx platform for i in platforms: if i.name == "Xilinx": platform_ID = platforms.index(i) print("\nPlatform Information:") print("Platform name: %s" % platforms[platform_ID].name) print("Platform version: %s" % platforms[platform_ID].version) print("Platform profile: %s" % platforms[platform_ID].profile) print("Platform extensions: %s" % platforms[platform_ID].extensions) break if platform_ID is None: #make sure xrt is sourced #run clinfo to make sure Xilinx platform is discoverable print("ERROR: Plaform not found") sys.exit(1) # choose device devices = platforms[platform_ID].get_devices() if int(index) > len(devices) - 1: print("\nERROR: Index out of range. %d devices were found" % len(devices)) sys.exit(1) else: dev = devices[int(index)] if "qdma" in str(dev) or "qep" in str(dev): threshold = 30000 if "u2x4" in str(dev) or "U2x4" in str(dev): threshold = 10000 if "gen3x4" in str(dev): threshold = 20000 if "_u25_" in str(dev): # so that it doesn't set theshold for u250 threshold = 9000 ctx = cl.Context(devices=[dev]) if not ctx: print("ERROR: Failed to create context") sys.exit(1) commands = cl.CommandQueue( ctx, dev, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) if not commands: print("ERROR: Failed to create command queue") sys.exit(1) print("Loading xclbin") with open(xclbin, "rb") as f: src = f.read() prg = cl.Program(ctx, [dev], [src]) try: prg.build() except: print("ERROR:") print(prg.get_build_info(ctx, cl.program_build_info.LOG)) raise knl1 = prg.bandwidth1 knl2 = prg.bandwidth2 #input host and buffer lst = [i % 256 for i in range(globalbuffersize)] input_host1 = np.array(lst).astype(np.uint8) input_host2 = np.array(lst).astype(np.uint8) input_buf1 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=input_host1) input_buf2 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=input_host2) if input_buf1.int_ptr is None or input_buf2.int_ptr is None: print("ERROR: Failed to allocate source buffer") sys.exit(1) #output host and buffer output_host1 = np.empty_like(input_host1, dtype=np.uint8) output_host2 = np.empty_like(input_host2, dtype=np.uint8) output_buf1 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, output_host1.nbytes) output_buf2 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, output_host2.nbytes) if output_buf1.int_ptr is None or output_buf2.int_ptr is None: print("ERROR: Failed to allocate destination buffer") sys.exit(1) #copy dataset to OpenCL buffer globalbuffersizeinbeats = globalbuffersize / (typesize / 8) tests = int(math.log(globalbuffersizeinbeats, 2.0)) + 1 #lists dnsduration = [] dsduration = [] dbytes = [] dmbytes = [] bpersec = [] mbpersec = [] #run tests with burst length 1 beat to globalbuffersize #double burst length each test test = 0 beats = 16 throughput = [] while beats <= 1024: print("LOOP PIPELINE %d beats" % beats) usduration = 0 fiveseconds = 5 * 1000000 reps = 64 while usduration < fiveseconds: start = current_micro_time() knl1(commands, (1, ), (1, ), output_buf1, input_buf1, np.uint32(beats), np.uint32(reps)) knl2(commands, (1, ), (1, ), output_buf2, input_buf2, np.uint32(beats), np.uint32(reps)) commands.finish() end = current_micro_time() usduration = end - start cl.enqueue_copy(commands, output_host1, output_buf1).wait() cl.enqueue_copy(commands, output_host2, output_buf2).wait() # need to check, currently fails limit = int(beats * (typesize / 8)) if not np.array_equal(output_host1[:limit], input_host1[:limit]): print("ERROR: Failed to copy entries") input_buf1.release() input_buf2.release() output_buf1.release() output_buf2.release() sys.exit(1) if not np.array_equal(output_host2[:limit], input_host2[:limit]): print("ERROR: Failed to copy entries") input_buf1.release() input_buf2.release() output_buf1.release() output_buf2.release() sys.exit(1) # print("Reps = %d, Beats = %d, Duration = %lf us" %(reps, beats, usduration)) # for debug if usduration < fiveseconds: reps = reps * 2 dnsduration.append(usduration) dsduration.append(dnsduration[test] / 1000000) dbytes.append(reps * beats * (typesize / 8)) dmbytes.append(dbytes[test] / (1024 * 1024)) bpersec.append(2 * dbytes[test] / dsduration[test]) mbpersec.append(2 * bpersec[test] / (1024 * 1024)) throughput.append(mbpersec[test]) print("Test %d, Throughput: %d MB/s" % (test, throughput[test])) beats = beats * 4 test += 1 #cleanup input_buf1.release() input_buf2.release() output_buf1.release() output_buf2.release() del ctx print("TTTT: %d" % throughput[0]) print("Maximum throughput: %d MB/s" % max(throughput)) if max(throughput) < threshold: print("ERROR: Throughput is less than expected value of %d GB/sec" % (threshold / 1000)) sys.exit(1) print("PASSED")
#!/usr/bin/env python import sys import pyopencl as cl import numpy as np from my_cl_utils import print_device_info, get_optimal_global_work_size # Platform, Device, Context and Queue devices = [] platforms = cl.get_platforms() for platform in platforms: devices.extend(platform.get_devices()) #print_device_info(platforms, devices) device = devices[0] context = cl.Context((device, )) queue = cl.CommandQueue(context, device) # Parameter setup nx, ny, nz = 240, 256, 256 # 540 MB #nx, ny, nz = 512, 480, 480 # 3.96 GB #nx, ny, nz = 256, 480, 960 tmax, tgap = 200, 10 print '(%d, %d, %d)' % (nx, ny, nz), total_bytes = nx * ny * nz * 4 * 9 if total_bytes / (1024**3) == 0: print '%d MB' % (total_bytes / (1024**2)) else: print '%1.2f GB' % (float(total_bytes) / (1024**3))
sequence_name = seq['name'] frame_num = seq['frame_num'] frame_resolution = seq['resolution'] in_path = './tests/' + sequence_name + '/input/in00%04d.jpg' gt_path = './tests/' + sequence_name + '/groundtruth/gt00%04d.png' out_path = './tests/' + sequence_name + '/output/out00%04d.jpg' # Kernel function kernel_src = 'mixture-of-gaussian.cl' if __name__ == "__main__": logging.basicConfig(level=logging.INFO) # Choose graphic device and create context for it ctx = cl.Context([device_choose(pref_platform, pref_device)]) mf = cl.mem_flags # Create queue for each kernel execution queue = cl.CommandQueue(ctx) #Kernel function instantiation kernel = str() with open(kernel_src, 'r') as content_file: kernel = content_file.read() prg = cl.Program(ctx, kernel).build() mixture_data_buff = np.zeros(3 * nmixtures * frame_resolution, dtype=np.float32) mixture_data_buff[0:frame_resolution * nmixtures] = 1.0 / nmixtures mixture_data_buff[frame_resolution * nmixtures + 1:2 * frame_resolution *
def calc_fractal_opencl(chunks, maxiter): # List all the stuff in this computer platforms = cl.get_platforms() for platform in platforms: print("Found a device: {}".format(str(platform))) # Let's just go with platform zero ctx = cl.Context(dev_type=cl.device_type.ALL, properties=[(cl.context_properties.PLATFORM, platforms[0])]) # Create a command queue on the platform (device = None means OpenCL picks a device for us) queue = cl.CommandQueue(ctx, device = None) mf = cl.mem_flags # This is our OpenCL kernel. It does a single point (OpenCL is responsible for mapping it across the points in a chunk) # __kernel decorator just specifies its an opencl kernel # Can define multiple kernels in single cl.Program and call them with cl.methodName # float2 is a struct containing 2 floats (works for 2,3,4 dims (can use .x, .y, .w, .\)) # floatN can also be used prg = cl.Program(ctx, """ #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable __kernel void mandelbrot(__global float2 *q, __global ushort *output, ushort const maxiter) { int gid = get_global_id(0); float cx = q->x; float cy = q->y; float x = 0.0f; float y = 0.0f; int its = 0; while (((x*x + y*y) < 4.0f) && (its < maxiter)) { float xtemp = x*x - y*y + cx; y = 2*x*y + cy; x = xtemp; its++; } // Assume point is not in set if reach maxiter if (its == maxiter) { output[gid] = 0; } else { output[gid] = its; } } """).build() output_chunks = [] output_chunks_on_device = [] chunk_shape = None for chunk_input in chunks: # Record the shape of input chunks chunk_shape = chunk_input.shape # These are our buffers to hold data on the device (on the device specified in ctx) chunk_input_on_device = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=chunk_input) chunk_output_on_device = cl.Buffer(ctx, mf.WRITE_ONLY, int(chunk_input.nbytes / 4)) # divided by 4 because our inputs are 64 bits but outputs are 16 bits # Call the kernel on this chunk # Notice we defined our function for a single point, but are passing a chunk # OpenCL handles parallelizing (partitioning) depending on context device # After none, we're passing params to our kernel prg.mandelbrot(queue, chunk_shape, None, chunk_input_on_device, chunk_output_on_device, np.uint16(maxiter)) # Add the output chunk to our list to keep track of it output_chunks_on_device.append(chunk_output_on_device) # Wait for all the chunks to be computed # In default single treaded mode for queue: chunks run serially (but work inside queue is parallelized) # Can use unordered queue to dump as much of the work onto the devices for even more parallelization queue.finish() for chunk_output_on_device in output_chunks_on_device: chunk_output = np.zeros(chunk_shape, dtype=np.uint16) # Wait until it is done and pull the data back cl.enqueue_copy(queue, chunk_output, chunk_output_on_device).wait() # Insert the chunk in our overall output output_chunks.append(chunk_output) return np.concatenate(output_chunks)
def __init__(self, kernel_file): if self.layers + 1 != len(self.layer_height): print("Bad network config.") exit() print("Running with {} hidden layers and {} layers total.".format( self.hidden_layers, self.layers)) print("OpenCL Version v{}".format(".".join( [str(i) for i in cl.get_cl_header_version()]))) print("Finding platform....") platform = self.findPlatform(VENDOR_NAME) if not platform: print("ERROR: Platform not found for name {0}".format(VENDOR_NAME)) exit(1) print("Getting devices...") devices = platform.get_devices(device_type=DEVICE_TYPE) if len(devices) < 1: print("ERROR: No device found for type {0}.".format(DEVICE_TYPE)) exit(1) devices = [devices[1]] self.ctx = cl.Context(devices=devices) if DEVICE_TYPE == cl.device_type.ACCELERATOR: print("Reading binary...") binary = kernel_file.read() binaries = [binary] * len(devices) print("Building...") program = cl.Program(self.ctx, devices, binaries) else: print("Reading program...") binary = kernel_file.read() program = cl.Program(self.ctx, binary.decode('utf-8')).build() self.kForward = program.forward self.kForwardSoftMax = program.forward_softmax # self.kBackwardFirstDelta = program.backward_first_delta # self.kBackward = program.backward self.kForward.set_scalar_arg_dtypes( [None, None, None, None, np.int32, np.int32, np.int32, np.int32]) self.kForwardSoftMax.set_scalar_arg_dtypes([None, np.int32, np.int32]) # self.kBackwardFirstDelta.set_scalar_arg_dtypes([None, None, None, np.int32, np.int32]) # self.kBackward.set_scalar_arg_dtypes( # [None, None, None, None, NN_T, NN_T, np.int32, np.int32, np.int32]) self.queue = cl.CommandQueue(self.ctx) print("Loading data...") _, (self.x_test, self.y_test) = input_data.load_data() self.y_test = self.y_test.reshape((10000, )) self.x_test = self.x_test.reshape(10000, self.layer_height[0]) self.x_test = self.x_test.astype('float32') self.x_test /= 255 self.correct_pred_fpga = 0 self.wrong_pred_fpga = 0 self.correct_pred_cpu = 0 self.wrong_pred_cpu = 0
def update_pixel_map_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n, subpixel, subsample, search_window, ss, fs): # demand that the data is float32 to avoid excess mem. usage assert(data.dtype == np.float32) ################################################################## # OpenCL crap ################################################################## import os import pyopencl as cl ## Step #1. Obtain an OpenCL platform. # with a cpu device for p in cl.get_platforms(): devices = p.get_devices(cl.device_type.CPU) if len(devices) > 0: platform = p device = devices[0] break ## Step #3. Create a context for the selected device. context = cl.Context([device]) queue = cl.CommandQueue(context) # load and compile the update_pixel_map opencl code here = os.path.split(os.path.abspath(__file__))[0] kernelsource = os.path.join(here, 'update_pixel_map.cl') kernelsource = open(kernelsource).read() program = cl.Program(context, kernelsource).build() if subpixel: update_pixel_map_cl = program.update_pixel_map_subpixel else : update_pixel_map_cl = program.update_pixel_map update_pixel_map_cl.set_scalar_arg_dtypes( [None, None, None, None, None, None, None, None, None, None, np.float32, np.float32, np.float32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32]) # Get the max work group size for the kernel test on our device max_comp = device.max_compute_units max_size = update_pixel_map_cl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, device) #print('maximum workgroup size:', max_size) #print('maximum compute units :', max_comp) # allocate local memory and dtype conversion ############################################ localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0]) # inputs: Win = W.astype(np.float32) pixel_mapin = pixel_map.astype(np.float32) Oin = O.astype(np.float32) dij_nin = dij_n.astype(np.float32) maskin = mask.astype(np.int32) ss = ss.ravel().astype(np.int32) fs = fs.ravel().astype(np.int32) ss_min, ss_max = (-(search_window[0]-1)//2, (search_window[0]+1)//2) fs_min, fs_max = (-(search_window[1]-1)//2, (search_window[1]+1)//2) print(ss_min, ss_max) print(fs_min, fs_max) # outputs: err_map = np.zeros(W.shape, dtype=np.float32) pixel_mapout = pixel_map.astype(np.float32) ################################################################## # End crap ################################################################## # evaluate err_map0 ssi = ss fsi = fs update_pixel_map_cl(queue, (1, fsi.shape[0]), (1, 1), cl.SVM(Win), cl.SVM(data), localmem, cl.SVM(err_map), cl.SVM(Oin), cl.SVM(pixel_mapout), cl.SVM(dij_nin), cl.SVM(maskin), cl.SVM(ssi), cl.SVM(fsi), n0, m0, subsample, data.shape[0], data.shape[1], data.shape[2], O.shape[0], O.shape[1], 0, 1, 0, 1) queue.finish() pixel_mapout = pixel_map.astype(np.float32) err_map0 = err_map.copy() step = min(100, ss.shape[0]) it = tqdm.tqdm(np.arange(ss.shape[0])[::step], desc='updating pixel map') for i in it: ssi = ss[i:i+step:] fsi = fs[i:i+step:] update_pixel_map_cl(queue, (1, fsi.shape[0]), (1, 1), cl.SVM(Win), cl.SVM(data), localmem, cl.SVM(err_map), cl.SVM(Oin), cl.SVM(pixel_mapout), cl.SVM(dij_nin), cl.SVM(maskin), cl.SVM(ssi), cl.SVM(fsi), n0, m0, subsample, data.shape[0], data.shape[1], data.shape[2], O.shape[0], O.shape[1], ss_min, ss_max, fs_min, fs_max) queue.finish() er = np.mean(err_map[err_map>0]) it.set_description("updating pixel map: {:.2e}".format(er)) #it.set_description("updating pixel map: {:.2e}".format(np.sum(err_map) \ # / np.sum(err_map>0))) # only return filled values out = np.zeros((2,) + ss.shape, dtype=pixel_map.dtype) out[0] = pixel_mapout[0][ss, fs] out[1] = pixel_mapout[1][ss, fs] return out, {'error_map': err_map, 'error': np.sum(err_map)}
import numpy, sys, os, pyopencl, random platform = pyopencl.get_platforms()[0] mygpu = platform.get_devices(pyopencl.device_type.GPU)[0] context = pyopencl.Context(devices=[mygpu]) queue = pyopencl.CommandQueue(context) prog = pyopencl.Program(context, open("func.cl").read()).build(devices=[mygpu])
def try_kernel(platform_idx, device_idx): platforms = cl.get_platforms() device = platforms[platform_idx].get_devices()[device_idx] worksize = device.get_info(cl.device_info.MAX_WORK_GROUP_SIZE) context = cl.Context([device], None, None) output_size = 256 host_out_buffer = bytearray((output_size + 1) * 4) cl_out_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=len(host_out_buffer) # hostbuf=host_out_buffer ) defines = ( f'-D OUTPUT_SIZE={output_size} -D OUTPUT_MASK={output_size - 1} ' f'-D WORK_GROUP_SIZE={worksize}') if device.extensions.find('cl_amd_media_ops') != -1: print('AMD bitalign defined!') defines += ' -DBITALIGN' kernel_code = pkgutil.get_data('apoclypsebm', 'apoclypse-0.cl').decode('ascii') print(f'Building with {defines}') program = cl.Program(context, kernel_code).build(defines) kernel_bin_file_name = file_safe_sanitize( f'{device.platform.name}-{device.platform.version}-{device.name}.elf') with open(kernel_bin_file_name, 'wb') as binary: binary.write(program.binaries[0]) print(f'Wrote kernel to {kernel_bin_file_name}') kernel = program.search frame = 1.0 / DEFAULT_FRAMES unit = worksize * 256 global_threads = unit * 10 print( f'worksize: {worksize}, unit: {unit}, global_threads: {global_threads}' ) kernel.set_arg(20, cl_out_buffer) base = 0 work = None # TODO rate_divisor, hashspace = 1000, 0xFFFFFFFF # assumes not vectorized nonces_left = hashspace with open('../example_blochheader_block.txt') as block_file: binary_data = unhexlify(block_file.read())[:76] # Swap every uint32 for sha256… swapped_bin = bytearray() for i in range(-1, len(binary_data) - 1, 4): new_word = binary_data[i + 4:None if i == -1 else i:-1] swapped_bin += new_word # print(f'slicing {i + 3}:{i or None}:{-1} gives {new_word.hex()}') print(f'Initial: {binary_data.hex()}') print(f'SHA2 chunked: {swapped_bin.hex()}') midstate_input = list(unpack('<16I', swapped_bin[:64])) + ([0] * 48) midstate = sha256(STATE, midstate_input) merkle_end = uint32(unpack('<I', swapped_bin[64:68])[0]) time = uint32(unpack('<I', swapped_bin[68:72])[0]) difficulty = uint32(unpack('<I', swapped_bin[72:76])[0]) base = 316141196 - 0 # -10 from the winner print(f'') state = list(midstate) f = [0] * 8 state2 = partial(state, merkle_end, time, difficulty, f) print(f'state2 and f: {state2}, {f}') calculateF(state, merkle_end, time, difficulty, f, state2) print(f'state and f after fcalc: {state}, {f}') kernel.set_arg(0, uint32_as_bytes(state[0])) kernel.set_arg(1, uint32_as_bytes(state[1])) kernel.set_arg(2, uint32_as_bytes(state[2])) kernel.set_arg(3, uint32_as_bytes(state[3])) kernel.set_arg(4, uint32_as_bytes(state[4])) kernel.set_arg(5, uint32_as_bytes(state[5])) kernel.set_arg(6, uint32_as_bytes(state[6])) kernel.set_arg(7, uint32_as_bytes(state[7])) kernel.set_arg(8, uint32_as_bytes(state2[1])) kernel.set_arg(9, uint32_as_bytes(state2[2])) kernel.set_arg(10, uint32_as_bytes(state2[3])) kernel.set_arg(11, uint32_as_bytes(state2[5])) kernel.set_arg(12, uint32_as_bytes(state2[6])) kernel.set_arg(13, uint32_as_bytes(state2[7])) kernel.set_arg(15, uint32_as_bytes(f[0])) kernel.set_arg(16, uint32_as_bytes(f[1])) kernel.set_arg(17, uint32_as_bytes(f[2])) kernel.set_arg(18, uint32_as_bytes(f[3])) kernel.set_arg(19, uint32_as_bytes(f[4])) # This part usually done after temperature check: print(f'Starting with base {base}') kernel.set_arg(14, uint32_as_bytes(base)[::-1]) cmd_queue = cl.CommandQueue(context) cl.enqueue_copy(cmd_queue, cl_out_buffer, host_out_buffer) cl.enqueue_nd_range_kernel(cmd_queue, kernel, (global_threads, ), (worksize, )) cl.enqueue_copy(cmd_queue, host_out_buffer, cl_out_buffer) cmd_queue.finish() print(f'Got {len(host_out_buffer)} outputs:') print(' '.join([f'{nonce:02x}' for nonce in host_out_buffer])) nonces_left -= global_threads # threads_run_pace += global_threads # threads_run += global_threads base = uint32(base + global_threads)
def test_y_pbc_x_exchange(self): # instance nx, ny, nz = 40, 50, 60 #nx, ny, nz = 3, 4, 5 gpu_devices = common_gpu.gpu_device_list(print_info=False) context = cl.Context(gpu_devices) gpuf = gpu.Fields(context, gpu_devices[0], nx, ny, nz) cpuf = cpu.Fields(nx, ny, nz) mainf_list = [gpuf, cpuf] nodef = NodeFields(mainf_list) core = NodeCore(nodef) pbc = NodePbc(nodef, 'y') exchange = NodeExchange(nodef) # generate random source ehs_gpu = common_update.generate_random_ehs(nx, ny, nz, nodef.dtype) gpuf.set_eh_bufs(*ehs_gpu) ehs_gpu_dict = dict(zip(['ex', 'ey', 'ez', 'hx', 'hy', 'hz'], ehs_gpu)) ehs_cpu = common_update.generate_random_ehs(nx, ny, nz, nodef.dtype) cpuf.set_ehs(*ehs_cpu) ehs_cpu_dict = dict(zip(['ex', 'ey', 'ez', 'hx', 'hy', 'hz'], ehs_cpu)) # verify for mainf in mainf_list: mainf.update_e() pbc.update_e() exchange.update_e() for mainf in mainf_list: mainf.update_h() pbc.update_h() exchange.update_h() mainf_list[-1].enqueue_barrier() getf0, getf1 = {}, {} # x-axis exchange getf0['e'] = gpu.GetFields(gpuf, ['ey', 'ez'], (nx - 1, 0, 0), (nx - 1, ny - 2, nz - 2)) getf1['e'] = cpu.GetFields(cpuf, ['ey', 'ez'], (0, 0, 0), (0, ny - 2, nz - 2)) getf0['h'] = gpu.GetFields(gpuf, ['hy', 'hz'], (nx - 1, 1, 1), (nx - 1, ny - 1, nz - 1)) getf1['h'] = cpu.GetFields(cpuf, ['hy', 'hz'], (0, 1, 1), (0, ny - 1, nz - 1)) for getf in getf0.values() + getf1.values(): getf.get_event().wait() for eh in ['e', 'h']: g0 = getf0[eh].get_fields() g1 = getf1[eh].get_fields() norm = np.linalg.norm(g0 - g1) self.assertEqual(norm, 0, '%g, %s, %s' % (norm, 'x-axis exchange', eh)) # y-axis pbc gpu getf0['e'] = gpu.GetFields(gpuf, ['ex', 'ez'], (0, ny - 1, 0), (nx - 2, ny - 1, nz - 2)) getf1['e'] = gpu.GetFields(gpuf, ['ex', 'ez'], (0, 0, 0), (nx - 2, 0, nz - 2)) getf0['h'] = gpu.GetFields(gpuf, ['hx', 'hz'], (1, ny - 1, 1), (nx - 1, ny - 1, nz - 1)) getf1['h'] = gpu.GetFields(gpuf, ['hx', 'hz'], (1, 0, 1), (nx - 1, 0, nz - 1)) for getf in getf0.values() + getf1.values(): getf.get_event().wait() for eh in ['e', 'h']: g0 = getf0[eh].get_fields() g1 = getf1[eh].get_fields() norm = np.linalg.norm(g0 - g1) self.assertEqual(norm, 0, '%g, %s, %s' % (norm, 'y-axis pbc gpu', eh)) # y-axis pbc cpu getf0['e'] = cpu.GetFields(cpuf, ['ex', 'ez'], (0, ny - 1, 0), (nx - 2, ny - 1, nz - 2)) getf1['e'] = cpu.GetFields(cpuf, ['ex', 'ez'], (0, 0, 0), (nx - 2, 0, nz - 2)) getf0['h'] = cpu.GetFields(cpuf, ['hx', 'hz'], (1, ny - 1, 1), (nx - 1, ny - 1, nz - 1)) getf1['h'] = cpu.GetFields(cpuf, ['hx', 'hz'], (1, 0, 1), (nx - 1, 0, nz - 1)) for getf in getf0.values() + getf1.values(): getf.get_event().wait() for eh in ['e', 'h']: g0 = getf0[eh].get_fields() g1 = getf1[eh].get_fields() norm = np.linalg.norm(g0 - g1) self.assertEqual(norm, 0, '%g, %s, %s' % (norm, 'y-axis pbc cpu', eh))
import numpy as np import pyopencl as cl import numpy.linalg as la vector_dimension = 100 #vector_a = np.random.rand(vector_dimension).astype(np.float32) #vector_b = np.random.rand(vector_dimension).astype(np.float32) vector_a = np.random.randint(100, size=100) vector_b = np.random.randint(100, size=100) platform = cl.get_platforms()[0] device = platform.get_devices()[0] context = cl.Context([device]) queue = cl.CommandQueue(context) mf = cl.mem_flags a_g = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=vector_a) b_g = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=vector_b) program = cl.Program(context, """ __kernel void vectorSum(__global const int *a_g, __global const int *b_g, __global int *res_g) { int gid = get_global_id(0); res_g[gid] = a_g[gid] + b_g[gid]; } """).build() res_g = cl.Buffer(context, mf.WRITE_ONLY, vector_a.nbytes) program.vectorSum(queue, vector_a.shape, None, a_g, b_g, res_g)
def runTest(self): nx, ny, nz = self.args # instances buffer_dict = {} buffer_dict['x+'] = cpu.BufferFields('x+', ny, nz, '', 'single') buffer_dict['x-'] = cpu.BufferFields('x-', ny, nz, '', 'single') import pyopencl as cl from kemp.fdtd3d.util import common_gpu from kemp.fdtd3d import gpu gpu_devices = common_gpu.gpu_device_list(print_info=False) context = cl.Context(gpu_devices) mainf_list = [ gpu.Fields(context, gpu_devices[0], nx, ny, nz) ] #mainf_list = [ cpu.Fields(nx, ny, nz) ] nodef = node.Fields(mainf_list, buffer_dict) # generate random source dtype = nodef.dtype ehs = common_random.generate_ehs(nx, ny, nz, dtype) buf_ehs_p = common_random.generate_ehs(3, ny, nz, dtype) buf_ehs_m = common_random.generate_ehs(3, ny, nz, dtype) nodef.mainf_list[0].set_eh_bufs(*ehs) #nodef.mainf_list[0].set_ehs(*ehs) nodef.buffer_dict['x+'].set_ehs(*buf_ehs_p) nodef.buffer_dict['x-'].set_ehs(*buf_ehs_m) node.Core(nodef) # allocations for verify getf_dict = {'x+': {}, 'x-': {}} getf_buf_dict = {'x+': {}, 'x-': {}} getf_dict['x+']['e'] = gpu.GetFields(nodef.mainf_list[0], ['ey', 'ez'], (nx-1, 0, 0), (nx-1, ny-1, nz-1)) getf_dict['x+']['h'] = gpu.GetFields(nodef.mainf_list[0], ['hy', 'hz'], (nx-2, 0, 0), (nx-2, ny-1, nz-1)) getf_buf_dict['x+']['e'] = cpu.GetFields(nodef.buffer_dict['x+'], ['ey', 'ez'], (1, 0, 0), (1, ny-1, nz-1)) getf_buf_dict['x+']['h'] = cpu.GetFields(nodef.buffer_dict['x+'], ['hy', 'hz'], (0, 0, 0), (0, ny-1, nz-1)) getf_dict['x-']['e'] = gpu.GetFields(nodef.mainf_list[0], ['ey', 'ez'], (1, 0, 0), (1, ny-1, nz-1)) getf_dict['x-']['h'] = gpu.GetFields(nodef.mainf_list[0], ['hy', 'hz'], (0, 0, 0), (0, ny-1, nz-1)) getf_buf_dict['x-']['e'] = cpu.GetFields(nodef.buffer_dict['x-'], ['ey', 'ez'], (2, 0, 0), (2, ny-1, nz-1)) getf_buf_dict['x-']['h'] = cpu.GetFields(nodef.buffer_dict['x-'], ['hy', 'hz'], (1, 0, 0), (1, ny-1, nz-1)) # verify nodef.update_e() nodef.update_h() print 'nodef, instance_list', nodef.instance_list print 'mainf_list[0], instance_list', nodef.mainf_list[0].instance_list for direction in ['x+', 'x-']: for e_or_h in ['e', 'h']: getf = getf_dict[direction][e_or_h] getf_buf = getf_buf_dict[direction][e_or_h] getf.get_event().wait() getf_buf.get_event().wait() original = getf.get_fields() copy = getf_buf.get_fields() norm = np.linalg.norm(original - copy) self.assertEqual(norm, 0, '%s, %g, %s, %s' % (self.args, norm, direction, e_or_h))
def getParticleData(data, p): h = p["particles"] w = p["points_per_particle"] dim = 4 # four points: x, y, alpha, width offset = 1.0 - p["animationProgress"] tw = p["width"] th = p["height"] dh = len(data) dw = len(data[0]) result = np.zeros(tw * th, dtype=np.float32) # print "%s x %s x %s = %s" % (w, h, dim, len(result)) fData = np.array(data) fData = fData.astype(np.float32) fData = fData.reshape(-1) # print "%s x %s x 3 = %s" % (dw, dh, len(fData)) pData = np.array(p["particleProperties"]) pData = pData.astype(np.float32) pData = pData.reshape(-1) # print "%s x 3 = %s" % (h, len(pData)) # the kernel function src = """ static float lerp(float a, float b, float mu) { return (b - a) * mu + a; } static float det(float a0, float a1, float b0, float b1) { return a0 * b1 - a1 * b0; } static float2 lineIntersection(float x0, float y0, float x1, float y1, float x2, float y2, float x3, float y3) { float xd0 = x0 - x1; float xd1 = x2 - x3; float yd0 = y0 - y1; float yd1 = y2 - y3; float div = det(xd0, xd1, yd0, yd1); float2 intersection; intersection.x = -1.0; intersection.y = -1.0; if (div != 0.0) { float d1 = det(x0, y0, x1, y1); float d2 = det(x2, y2, x3, y3); intersection.x = det(d1, d2, xd0, xd1) / div; intersection.y = det(d1, d2, yd0, yd1) / div; } return intersection; } static float norm(float value, float a, float b) { float n = (value - a) / (b - a); if (n > 1.0) { n = 1.0; } if (n < 0.0) { n = 0.0; } return n; } static float wrap(float value, float a, float b) { if (value < a) { value = b - (a - value); } else if (value > b) { value = a + (value - b); } return value; } void drawLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha, int thickness); void drawSingleLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha); void drawLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha, int thickness) { int dx = abs(x1-x0); int dy = abs(y1-y0); if (dx==0 && dy==0) { return; } // draw the first line drawSingleLine(p, x0, y0, x1, y1, w, h, alpha); thickness--; if (thickness < 1) return; int stepX = 0; int stepY = 0; if (dx > dy) stepY = 1; else stepX = 1; // loop through thickness int offset = 1; for (int i=0; i<thickness; i++) { int xd = stepX * offset; int yd = stepY * offset; drawSingleLine(p, x0+xd, y0+yd, x1+xd, y1+yd, w, h, alpha); // alternate above and below offset *= -1; if (offset > 0) { offset++; } } } void drawSingleLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha) { // clamp x0 = clamp(x0, 0, w-1); x1 = clamp(x1, 0, w-1); y0 = clamp(y0, 0, h-1); y1 = clamp(y1, 0, h-1); int dx = abs(x1-x0); int dy = abs(y1-y0); if (dx==0 && dy==0) { return; } int sy = 1; int sx = 1; if (y0>=y1) { sy = -1; } if (x0>=x1) { sx = -1; } int err = dx/2; if (dx<=dy) { err = -dy/2; } int e2 = err; int x = x0; int y = y0; for(int i=0; i<w; i++){ p[y*w+x] = alpha; if (x==x1 && y==y1) { break; } e2 = err; if (e2 >-dx) { err -= dy; x += sx; } if (e2 < dy) { err += dx; y += sy; } } } __kernel void getParticles(__global float *data, __global float *pData, __global float *result){ int points = %d; int dw = %d; int dh = %d; float tw = %f; float th = %f; float offset = %f; float magMin = %f; float magMax = %f; float alphaMin = %f; float alphaMax = %f; float velocityMult = %f; float lineWidthMin = %f; float lineWidthMax = %f; float lineWidthLatMin = %f; float lineWidthLatMax = %f; // get current position int i = get_global_id(0); float dx = pData[i*3]; float dy = pData[i*3+1]; float doffset = pData[i*3+2]; // set starting position float x = dx * (tw-1); float y = dy * (th-1); for(int j=0; j<points; j++) { // get UV value int lon = (int) round(dx * (dw-1)); int lat = (int) round(dy * (dh-1)); int dindex = lat * dw * 3 + lon * 3; float u = data[dindex+1]; float v = data[dindex+2]; // check for invalid values if (u >= 999.0 || u <= -999.0) { u = 0.0; } if (v >= 999.0 || v <= -999.0) { v = 0.0; } // calc magnitude float mag = sqrt(u * u + v * v); mag = norm(mag, magMin, magMax); // determine alpha transparency/thickness based on magnitude and offset float jp = (float) j / (float) (points-1); float progressMultiplier = (jp + offset + doffset) - floor(jp + offset + doffset); float alpha = lerp(alphaMin, alphaMax, mag * progressMultiplier); float thickness = lerp(lineWidthMin, lineWidthMax, mag * progressMultiplier); // adjust thickness based on latitude float latMultiplier = (float) abs(lat - (dh/2)) / (float) (dh/2); float thicknessMultiplier = lerp(lineWidthLatMin, lineWidthLatMax, latMultiplier); thickness *= thicknessMultiplier; if (thickness < 1.0) thickness = 1.0; float x1 = x + u * velocityMult; float y1 = y + (-v) * velocityMult; // clamp y if (y1 < 0.0) { y1 = 0.0; } if (y1 > (th-1.0)) { y1 = th-1.0; } // check for no movement if (x==x1 && y==y1) { break; // check for invisible line } else if (alpha < 1.0) { // continue // wrap from left to right } else if (x1 < 0) { float2 intersection = lineIntersection(x, y, x1, y1, (float) 0.0, (float) 0.0, (float) 0.0, th); if (intersection.y > 0.0) { drawLine(result, (int) round(x), (int) round(y), 0, (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness); drawLine(result, (int) round((float) (tw-1.0) + x1), (int) round(y), (int) (tw-1.0), (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness); } // wrap from right to left } else if (x1 > tw-1.0) { float2 intersection = lineIntersection(x, y, x1, y1, (float) (tw-1.0), (float) 0.0, (float) (tw-1.0), th); if (intersection.y > 0.0) { drawLine(result, (int) round(x), (int) round(y), (int) (tw-1.0), (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness); drawLine(result, (int) round((float) x1 - (float)(tw-1.0)), (int) round(y), 0, (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness); } // draw it normally } else { drawLine(result, (int) round(x), (int) round(y), (int) round(x1), (int) round(y1), (int) tw, (int) th, round(alpha), (int) thickness); } // wrap x x1 = wrap(x1, 0.0, tw-1); dx = x1 / tw; dy = y1 / th; x = x1; y = y1; } } """ % (w, dw, dh, tw, th, offset, p["mag_range"][0], p["mag_range"][1], p["alpha_range"][0], p["alpha_range"][1], p["velocity_multiplier"], p["linewidth_range"][0], p["linewidth_range"][1], p["linewidth_lat_range"][0], p["linewidth_lat_range"][1]) # Get platforms, both CPU and GPU plat = cl.get_platforms() GPUs = plat[0].get_devices(device_type=cl.device_type.GPU) CPU = plat[0].get_devices() # prefer GPUs if GPUs and len(GPUs) > 0: # print "Using GPU" ctx = cl.Context(devices=GPUs) else: print("Warning: using CPU") ctx = cl.Context(CPU) # Create queue for each kernel execution queue = cl.CommandQueue(ctx) mf = cl.mem_flags # Kernel function instantiation prg = cl.Program(ctx, src).build() inData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=fData) inPData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pData) outResult = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes) prg.getParticles(queue, (h, ), None, inData, inPData, outResult) # Copy result cl.enqueue_copy(queue, result, outResult) result = result.reshape((th, tw)) result = result.astype(np.uint8) return result
def quadratic_refinement_1d_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n): # demand that the data is float32 to avoid excess mem. usage assert(data.dtype == np.float32) import os import pyopencl as cl ## Step #1. Obtain an OpenCL platform. # with a cpu device for p in cl.get_platforms(): devices = p.get_devices(cl.device_type.CPU) if len(devices) > 0: platform = p device = devices[0] break ## Step #3. Create a context for the selected device. context = cl.Context([device]) queue = cl.CommandQueue(context) # load and compile the update_pixel_map opencl code here = os.path.split(os.path.abspath(__file__))[0] kernelsource = os.path.join(here, 'update_pixel_map.cl') kernelsource = open(kernelsource).read() program = cl.Program(context, kernelsource).build() update_pixel_map_cl = program.pixel_map_err update_pixel_map_cl.set_scalar_arg_dtypes( 8*[None] + 2*[np.float32] + 7*[np.int32]) # Get the max work group size for the kernel test on our device max_comp = device.max_compute_units max_size = update_pixel_map_cl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, device) #print('maximum workgroup size:', max_size) #print('maximum compute units :', max_comp) # allocate local memory and dtype conversion ############################################ localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0]) # inputs: Win = W.astype(np.float32) pixel_mapin = pixel_map.astype(np.float32) Oin = O.astype(np.float32) dij_nin = dij_n.astype(np.float32) maskin = mask.astype(np.int32) # outputs: err_map = np.empty(W.shape, dtype=np.float32) pixel_shift = np.zeros(pixel_map.shape, dtype=np.float32) err_quad = np.empty((3,) + W.shape, dtype=np.float32) out = pixel_map.copy() import time d0 = time.time() # qudratic fit refinement pixel_shift.fill(0.) A = [] if data.shape[1] == 1: ss_shifts = [0] else : ss_shifts = [-1, 0, 1] if data.shape[2] == 1: fs_shifts = [0] else : fs_shifts = [-1, 0, 1] for ss_shift in ss_shifts: for fs_shift in fs_shifts: err_map.fill(9999) update_pixel_map_cl( queue, W.shape, (1, 1), cl.SVM(Win), cl.SVM(data), localmem, cl.SVM(err_map), cl.SVM(Oin), cl.SVM(pixel_mapin), cl.SVM(dij_nin), cl.SVM(maskin), n0, m0, data.shape[0], data.shape[1], data.shape[2], O.shape[0], O.shape[1], ss_shift, fs_shift) queue.finish() if data.shape[1] == 1 : err_quad[fs_shift+1, :, :] = err_map A.append([fs_shift**2, fs_shift, 1]) else : err_quad[ss_shift+1, :, :] = err_map A.append([ss_shift**2, ss_shift, 1]) # now we have 3 equations and 3 unknowns # a x^2 + b x + c = err_i B = np.linalg.pinv(A) C = np.dot(B, np.transpose(err_quad, (1, 0, 2))) # minima is defined by # 2 a x + b = 0 # x = -b / 2a # where C = [a, b, c] # [0, 1, 2] det = 2*C[0] # make sure all sampled shifts have a valid error m = np.all(err_quad!=9999, axis=0) # make sure the determinant is non zero m = m * (det != 0) if data.shape[1] == 1 : pixel_shift[1][m] = (-C[1])[m] / det[m] #print(pixel_shift[1][m]) elif data.shape[2] == 1 : pixel_shift[0][m] = (-C[1])[m] / det[m] #print(pixel_shift[0][m]) # now only update pixels for which x**2 < 3**2 m = m * (np.sum(pixel_shift**2, axis=0) < 9) out[0][m] = out[0][m] + pixel_shift[0][m] out[1][m] = out[1][m] + pixel_shift[1][m] error = np.sum(np.min(err_quad, axis=0)) return out, {'pixel_shift': pixel_shift, 'error': error, 'err_quad': err_quad}
def getTemperatureImage(data, p): tRange = p["temperature_range"] gradient = p["gradient"] dataG = np.array(gradient) dataG = dataG.astype(np.float32) shape = data.shape h, w, dim = shape data = data.reshape(-1) dataG = dataG.reshape(-1) # the kernel function src = """ __kernel void lerpImage(__global float *d, __global float *grad, __global uchar *result){ int w = %d; int dim = %d; int gradLen = %d; float minValue = %f; float maxValue = %f; // get current position int posx = get_global_id(1); int posy = get_global_id(0); // get index int i = posy * w * dim + posx * dim; float temperature = d[i]; int r = 45; int g = 50; int b = 55; // assume large values are invalid if (temperature > -99.0 && temperature < 99.0) { // normalize the temperature float norm = (temperature - minValue) / (maxValue - minValue); // clamp if (norm > 1.0) { norm = 1.0; } if (norm < 0.0) { norm = 0.0; } // get color from gradient int gradientIndex = (int) round(norm * (gradLen-1)); gradientIndex = gradientIndex * 3; r = (int) round(grad[gradientIndex] * 255); g = (int) round(grad[gradientIndex+1] * 255); b = (int) round(grad[gradientIndex+2] * 255); } // set the color result[i] = r; result[i+1] = g; result[i+2] = b; } """ % (w, dim, len(gradient), tRange[0], tRange[1]) # Get platforms, both CPU and GPU plat = cl.get_platforms() GPUs = plat[0].get_devices(device_type=cl.device_type.GPU) CPU = plat[0].get_devices() # prefer GPUs if GPUs and len(GPUs) > 0: # print "Using GPU" ctx = cl.Context(devices=GPUs) else: print("Warning: using CPU") ctx = cl.Context(CPU) # Create queue for each kernel execution queue = cl.CommandQueue(ctx) mf = cl.mem_flags # Kernel function instantiation prg = cl.Program(ctx, src).build() inData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data) inG = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=dataG) outResult = cl.Buffer(ctx, mf.WRITE_ONLY, (data.astype(np.uint8)).nbytes) prg.lerpImage(queue, [h, w], None, inData, inG, outResult) # Copy result result = np.empty_like(data) result = result.astype(np.uint8) cl.enqueue_copy(queue, result, outResult) result = result.reshape(shape) imOut = Image.fromarray(result, mode="RGB") return imOut
def __init__(self, features, reference, nr_features, nr_nodes): self.nr_points = len(features) self.features_np = np.asarray(features, dtype=np.float32) self.reference_np = np.asarray(reference, dtype=np.float32) self.result_np = np.empty(self.nr_points, dtype=np.float32) #self.ctx = cl.create_some_context(interactive=True, answers=None, cache_dir=None) platform = cl.get_platforms()[0] # Select the first platform [0] device = platform.get_devices()[0] # Select the first device on this platform [0] self.ctx = cl.Context([device]) # Create a context with your device self.queue = cl.CommandQueue(self.ctx) self.mf = cl.mem_flags self.features_g = cl.Buffer(self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=self.features_np) self.reference_g = cl.Buffer(self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=self.reference_np) self.result_g = cl.Buffer(self.ctx, self.mf.WRITE_ONLY, self.result_np.nbytes) self.program_fitness = cl.Program(self.ctx, """ __kernel void calculate_fitness( __global const float *r_g, __global const float *f_g, __global float *res_g, __global int *program) { int nr_features = """ + str(nr_features) + """; int nr_nodes = """ + str(nr_nodes) + """; int gid = get_global_id(0); int offset = gid * """ + str(nr_features) + """; float inputs[""" + str(nr_features + nr_nodes) + """] ; for (int i = 0; i < nr_features; i++) { inputs[i] = f_g[offset + i]; } for (int i = 0; i < nr_nodes; i++) { int id1 = program[i * 3]; int id2 = program[i * 3 + 1]; int op = program[i * 3 + 2]; float i1 = inputs[id1]; float i2 = inputs[id2]; if(op == 0) { inputs[i + nr_features] = i1 + i2; } else if(op == 1) { inputs[i + nr_features] = i1 - i2; } else if(op == 2) { inputs[i + nr_features] = i1 * i2; } else if(op == 3) { float safe_offset = 0;//(i2 > 0) ? FLT_EPSILON : -FLT_EPSILON; inputs[i + nr_features] = i1 / (i2 + safe_offset); } else if (op == 4) { inputs[i + nr_features] = tanh(i1); } else if(op == 5) { inputs[i + nr_features] = cos(i1); } else if (op == 6) { inputs[i + nr_features] = tan(i1); } else if(op == 7) { inputs[i + nr_features] = cosh(i1); } else if(op == 8) { inputs[i + nr_features] = M_PI; } else if(op == 9) { inputs[i + nr_features] = M_E; } else if(op == 10) { inputs[i + nr_features] = pow(i1,i2); } else if (op == 11) { inputs[i + nr_features] = acos(i1); } else if(op == 12) { inputs[i + nr_features] = atan(i1); } else if (op == 13) { inputs[i + nr_features] = acosh(i1); } else if(op == 14) { inputs[i + nr_features] = atanh(i1); } else if(op == 15) { inputs[i + nr_features] = sqrt(i1); } else if(op == 16) { inputs[i + nr_features] = 1; } else if(op == 17) { inputs[i + nr_features] = log(i1); } } float result1= inputs[nr_features + nr_nodes -1] - r_g[2 * gid]; float result2= inputs[nr_features + nr_nodes -2] - r_g[2 * gid + 1]; // result1 /= sqrt(r_g[2 * gid] * r_g[2 * gid] + 0.00001); float result = sqrt(result1 * result1 + result2 * result2); // result2 /= sqrt(r_g[2 * gid + 1] * r_g[2 * gid + 1] + 0.00001); // res_g[gid] = 1 / (result + 0.2) - result; res_g[gid] = -result; } """).build() self.program_predict = cl.Program(self.ctx, """ __kernel void predict( __global const float *f_g, __global float *result_predict_g, __global int *program) { int nr_features = """ + str(nr_features) + """; int nr_nodes = """ + str(nr_nodes) + """; int gid = get_global_id(0); int offset = gid * """ + str(nr_features) + """; float inputs[""" + str(nr_features + nr_nodes) + """] ; for (int i = 0; i < nr_features; i++) { inputs[i] = f_g[offset + i]; } // if (gid == 0) { for (int i = 0; i < nr_nodes; i++) { // if (i == 0) { int id1 = program[i * 3]; int id2 = program[i * 3 + 1]; int op = program[i * 3 + 2]; // result_predict_g[0] = id1; // result_predict_g[1] = id2; // result_predict_g[2] = op; float i1 = inputs[id1]; float i2 = inputs[id2]; // result_predict_g[3] = i1; // result_predict_g[4] = i2; if(op == 0) { inputs[i + nr_features] = i1 + i2; } else if(op == 1) { inputs[i + nr_features] = i1 - i2; } else if(op == 2) { inputs[i + nr_features] = i1 * i2; } else if(op == 3) { float safe_offset = 0;//(i2 > 0) ? FLT_EPSILON : -FLT_EPSILON; inputs[i + nr_features] = i1 / (i2 + safe_offset); } else if (op == 4) { inputs[i + nr_features] = tanh(i1); } else if(op == 5) { inputs[i + nr_features] = cos(i1); } else if (op == 6) { inputs[i + nr_features] = tan(i1); } else if(op == 7) { inputs[i + nr_features] = cosh(i1); } else if(op == 8) { inputs[i + nr_features] = M_PI; } else if(op == 9) { inputs[i + nr_features] = M_E; } else if(op == 10) { inputs[i + nr_features] = pow(i1,i2); } else if (op == 11) { inputs[i + nr_features] = acos(i1); } else if(op == 12) { inputs[i + nr_features] = atan(i1); } else if (op == 13) { inputs[i + nr_features] = acosh(i1); } else if(op == 14) { inputs[i + nr_features] = atanh(i1); } else if(op == 15) { inputs[i + nr_features] = sqrt(i1); } else if(op == 16) { inputs[i + nr_features] = 1; } else if(op == 17) { inputs[i + nr_features] = log(i1); } // } } result_predict_g[gid * 2] = inputs[nr_features + nr_nodes -1]; result_predict_g[gid * 2 + 1] = inputs[nr_features + nr_nodes -2]; // } } """).build()
def select_device(): global context, selected_device,selected_platform,selected_device_max_size,AMD_WARNING_SHOWN if context is not None: return log_version_info() log_platforms_info() #try to choose a platform and device using *our* heuristics / env options: options = {} log("select_device() environment preferred DEVICE_NAME=%s, DEVICE_TYPE=%s, DEVICE_PLATFORM=%s", PREFERRED_DEVICE_NAME, PREFERRED_DEVICE_TYPE, PREFERRED_DEVICE_PLATFORM) for platform in opencl_platforms: log("evaluating platform=%s", platform.name) if platform.name.startswith("AMD") and not AMD_WARNING_SHOWN: log.warn("Warning: the AMD OpenCL is loaded, it is known to interfere with signal delivery!") log.warn(" please consider disabling OpenCL or removing the AMD icd") AMD_WARNING_SHOWN = True devices = platform.get_devices() is_cuda = platform.name.find("CUDA")>=0 for d in devices: if d.available and d.compiler_available and d.get_info(pyopencl.device_info.IMAGE_SUPPORT): if not is_supported(platform.name) and (len(PREFERRED_DEVICE_PLATFORM)==0 or str(platform.name).find(PREFERRED_DEVICE_PLATFORM)<0): log("ignoring unsupported platform/device: %s / %s", platform.name, d.name) continue dtype = device_type(d) log("evaluating device type=%s, name=%s", dtype, d.name) if is_cuda: score = 0 elif dtype==PREFERRED_DEVICE_TYPE: score = 40 else: score = 10 if len(PREFERRED_DEVICE_NAME)>0 and d.name.find(PREFERRED_DEVICE_NAME)>=0: score += 50 if len(PREFERRED_DEVICE_PLATFORM)>0 and str(platform.name).find(PREFERRED_DEVICE_PLATFORM)>=0: score += 50 #Intel SDK does not work (well?) on AMD CPUs #and CUDA has problems doing YUV to RGB.. if platform.name.startswith("Intel"): if d.name.find("AMD")>=0 or is_cuda: score = max(0, score - 20) elif d.name.find("Intel")>=0: score += 10 options.setdefault(score, []).append((d, platform)) log("best device/platform option%s: %s", engs(options), options) for score in reversed(sorted(options.keys())): for d, p in options.get(score): try: log("trying platform: %s", platform_info(p)) log("with %s device: %s", device_type(d), device_info(d)) context = pyopencl.Context([d]) selected_device_max_size = d.get_info(pyopencl.device_info.IMAGE2D_MAX_WIDTH), d.get_info(pyopencl.device_info.IMAGE2D_MAX_HEIGHT) selected_platform = p selected_device = d log.info(" using platform: %s", platform_info(selected_platform)) log_device_info(selected_device) #save device costs: global selected_device_cpu_cost, selected_device_gpu_cost, selected_device_setup_cost if device_type(d)=="GPU": selected_device_cpu_cost = 0 selected_device_gpu_cost = 50 selected_device_setup_cost = 40 else: selected_device_cpu_cost = 100 selected_device_gpu_cost = 0 selected_device_setup_cost = 20 log("device is a %s, using CPU cost=%s, GPU cost=%s", device_type(d), selected_device_cpu_cost, selected_device_gpu_cost) log(" max image 2d size: %s", selected_device_max_size) return except Exception as e: log.warn(" failed to use %s", platform_info(p)) log.warn(" with %s device %s", device_type(d), device_info(d)) log.warn(" Error: %s", e) #fallback to pyopencl auto mode: log.warn("OpenCL Error: failed to find a working platform and device combination... trying with pyopencl's 'create_some_context'") context = pyopencl.create_some_context(interactive=False) devices = context.get_info(pyopencl.context_info.DEVICES) log.info("chosen context has %s device%s:", len(devices), engs(devices)) for d in devices: log_device_info(d) assert len(devices)==1, "we only handle a single device at a time, sorry!" selected_device = devices[0] assert context is not None and selected_device is not None
def match_dtype_to_c_struct(device, name, dtype, context=None): """Return a tuple `(dtype, c_decl)` such that the C struct declaration in `c_decl` and the structure :class:`numpy.dtype` instance `dtype` have the same memory layout. Note that *dtype* may be modified from the value that was passed in, for example to insert padding. (As a remark on implementation, this routine runs a small kernel on the given *device* to ensure that :mod:`numpy` and C offsets and sizes match.) .. versionadded: 2013.1 This example explains the use of this function:: >>> import numpy as np >>> import pyopencl as cl >>> import pyopencl.tools >>> ctx = cl.create_some_context() >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)]) >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct( ... ctx.devices[0], 'id_val', dtype) >>> print c_decl typedef struct { unsigned id; float value; } id_val; >>> print dtype [('id', '<u4'), ('value', '<f4')] >>> cl.tools.get_or_register_dtype('id_val', dtype) As this example shows, it is important to call :func:`get_or_register_dtype` on the modified `dtype` returned by this function, not the original one. """ fields = sorted(six.iteritems(dtype.fields), key=lambda name_dtype_offset: name_dtype_offset[1][1]) c_fields = [] for field_name, (field_dtype, offset) in fields: c_fields.append(" %s %s;" % (dtype_to_ctype(field_dtype), field_name)) c_decl = "typedef struct {\n%s\n} %s;\n\n" % ("\n".join(c_fields), name) cdl = _CDeclList(device) for field_name, (field_dtype, offset) in fields: cdl.add_dtype(field_dtype) pre_decls = cdl.get_declarations() offset_code = "\n".join( "result[%d] = pycl_offsetof(%s, %s);" % (i + 1, name, field_name) for i, (field_name, (field_dtype, offset)) in enumerate(fields)) src = r""" #define pycl_offsetof(st, m) \ ((size_t) ((__local char *) &(dummy.m) \ - (__local char *)&dummy )) %(pre_decls)s %(my_decl)s __kernel void get_size_and_offsets(__global size_t *result) { result[0] = sizeof(%(my_type)s); __local %(my_type)s dummy; %(offset_code)s } """ % dict(pre_decls=pre_decls, my_decl=c_decl, my_type=name, offset_code=offset_code) if context is None: context = cl.Context([device]) queue = cl.CommandQueue(context) prg = cl.Program(context, src) knl = prg.build(devices=[device]).get_size_and_offsets import pyopencl.array # noqa result_buf = cl.array.empty(queue, 1 + len(fields), np.uintp) knl(queue, (1, ), (1, ), result_buf.data) queue.finish() size_and_offsets = result_buf.get() size = int(size_and_offsets[0]) from pytools import any offsets = size_and_offsets[1:] if any(ofs >= size for ofs in offsets): # offsets not plausible if dtype.itemsize == size: # If sizes match, use numpy's idea of the offsets. offsets = [offset for field_name, (field_dtype, offset) in fields] else: raise RuntimeError("cannot discover struct layout on '%s'" % device) result_buf.data.release() del knl del prg del queue del context try: dtype_arg_dict = { 'names': [field_name for field_name, (field_dtype, offset) in fields], 'formats': [field_dtype for field_name, (field_dtype, offset) in fields], 'offsets': [int(x) for x in offsets], 'itemsize': int(size_and_offsets[0]), } dtype = np.dtype(dtype_arg_dict) if dtype.itemsize != size_and_offsets[0]: # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo. dtype_arg_dict["names"].append("_pycl_size_fixer") dtype_arg_dict["formats"].append(np.uint8) dtype_arg_dict["offsets"].append(int(size_and_offsets[0]) - 1) dtype = np.dtype(dtype_arg_dict) except NotImplementedError: def calc_field_type(): total_size = 0 padding_count = 0 for offset, (field_name, (field_dtype, _)) in zip(offsets, fields): if offset > total_size: padding_count += 1 yield ('__pycl_padding%d' % padding_count, 'V%d' % offset - total_size) yield field_name, field_dtype total_size = field_dtype.itemsize + offset dtype = np.dtype(list(calc_field_type())) assert dtype.itemsize == size_and_offsets[0] return dtype, c_decl
import pyopencl as cl import numpy as np import logging from scipy.misc import * # Get platforms, both CPU and GPU plat = cl.get_platforms() CPU = plat[0].get_devices() try: GPU = plat[1].get_devices() except IndexError: GPU = "none" #Create context for GPU/CPU if GPU != "none": ctx = cl.Context(GPU) else: ctx = cl.Context(CPU) # Create queue for each kernel execution queue = cl.CommandQueue(ctx) mf = cl.mem_flags #Test sequence constants frame_num = 1700 rel_path = './input/in00%04d.jpg' dest_path = './output/out00%04d.jpg' # Kernel function kernel_src = 'median.cl'
def make_pixel_map_err(data, mask, W, O, pixel_map, n0, m0, dij_n, roi, search_window=20, grid=[20, 20]): # demand that the data is float32 to avoid excess mem. usage assert(data.dtype == np.float32) import time t0 = time.time() ################################################################## # OpenCL crap ################################################################## import os import pyopencl as cl ## Step #1. Obtain an OpenCL platform. # with a cpu device for p in cl.get_platforms(): devices = p.get_devices(cl.device_type.CPU) if len(devices) > 0: platform = p device = devices[0] break ## Step #3. Create a context for the selected device. context = cl.Context([device]) queue = cl.CommandQueue(context) # load and compile the update_pixel_map opencl code here = os.path.split(os.path.abspath(__file__))[0] kernelsource = os.path.join(here, 'update_pixel_map.cl') kernelsource = open(kernelsource).read() program = cl.Program(context, kernelsource).build() make_error_map_subpixel = program.make_error_map_subpixel make_error_map_subpixel.set_scalar_arg_dtypes( [None, None, None, None, None, None, None, None, None, np.float32, np.float32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32]) # Get the max work group size for the kernel test on our device max_comp = device.max_compute_units max_size = make_error_map_subpixel.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, device) #print('maximum workgroup size:', max_size) #print('maximum compute units :', max_comp) # allocate local memory and dtype conversion localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0]) # inputs: Win = W.astype(np.float32) pixel_mapin = pixel_map.astype(np.float32) Oin = O.astype(np.float32) dij_nin = dij_n.astype(np.float32) maskin = mask.astype(np.int32) # outputs: err_map = np.zeros((grid[0]*grid[1], search_window**2), dtype=np.float32) pixel_mapout = pixel_map.astype(np.float32) ################################################################## if type(search_window) is int : s_ss = search_window s_fs = search_window else : s_ss, s_fs = search_window ss_min, ss_max = (-(s_ss-1)//2, (s_ss+1)//2) fs_min, fs_max = (-(s_fs-1)//2, (s_fs+1)//2) # list the pixels for which to calculate the error grid ijs = [] for i in np.linspace(roi[0], roi[1]-1, grid[0]): for j in np.linspace(roi[2], roi[3]-1, grid[1]): ijs.append([round(i), round(j)]) ijs = np.array(ijs).astype(np.int32) for i in tqdm.trange(1, desc='calculating pixel map shift errors'): make_error_map_subpixel(queue, (1, ijs.shape[0]), (1, 1), cl.SVM(Win), cl.SVM(data), localmem, cl.SVM(err_map), cl.SVM(Oin), cl.SVM(pixel_mapout), cl.SVM(dij_nin), cl.SVM(maskin), cl.SVM(ijs), n0, m0, ijs.shape[0], data.shape[0], data.shape[1], data.shape[2], O.shape[0], O.shape[1], ss_min, ss_max, fs_min, fs_max) queue.finish() t1 = time.time() t = t1-t0 res = make_pixel_map_err_report(ijs, err_map, mask, search_window, roi, t) return ijs, err_map, res
import pyopencl as cl import numpy as np import sys platforms = cl.get_platforms() platform = platforms[0] devs = platform.get_devices(cl.device_type.GPU) dev = devs[0] mf = cl.mem_flags ctx = cl.Context([dev]) queue = cl.CommandQueue(ctx, dev) a = np.arange(24).astype(np.int32).reshape(3, 4, 2) b = np.zeros(2).astype(np.int32) b1 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a) b2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b) prog = cl.Program( ctx, """ #define GL_ID (int4)(get_global_id(1), get_global_id(0), get_global_id(2), 0) __kernel void Buffer( __global int *ary) { int4 id = GL_ID; int i = id.z * 12 + id.y * 4 + id.x; ary[i] = ary[i] * 10; printf("%d %d %d: %d\\n", id.x, id.y, id.z, ary[i]); }
def addParticlesToImage(base, temperature, particles, p): basePx = np.array(base) basePx = basePx.astype(np.uint8) tempPx = np.array(temperature) tempPx = tempPx.astype(np.uint8) shape = basePx.shape h, w, dim = shape basePx = basePx.reshape(-1) tempPx = tempPx.reshape(-1) particles = particles.reshape(-1) # the kernel function src = """ __kernel void addParticles(__global uchar *base, __global uchar *colors, __global uchar *particles, __global uchar *result){ int w = %d; int dim = %d; float power = 1.0 - %f; // lower number = more visible lines int posx = get_global_id(1); int posy = get_global_id(0); int i = posy * w * dim + posx * dim; int j = posy * w + posx; float alpha = (float) particles[j] / 255.0; int r = colors[i]; int g = colors[i+1]; int b = colors[i+2]; int baseR = base[i]; int baseG = base[i+1]; int baseB = base[i+2]; // temp hack, convert to grayscale //float count = 3.0; //int baseAvg = (int) round((float) ((float) baseR + (float) baseG + (float) baseB) / count); //baseR = baseAvg; //baseG = baseAvg; //baseB = baseAvg; if (alpha > 0) { alpha = pow(alpha*alpha + alpha*alpha, power); if (alpha > 1.0) { alpha = 1.0; } //r = (int) round((r * alpha)); //g = (int) round((g * alpha)); //b = (int) round((b * alpha)); float inv = 1.0 - alpha; r = (int) round(((float) r * alpha) + ((float) baseR * inv)); g = (int) round(((float) g * alpha) + ((float) baseG * inv)); b = (int) round(((float) b * alpha) + ((float) baseB * inv)); } else { r = baseR; g = baseG; b = baseB; } result[i] = r; result[i+1] = g; result[i+2] = b; } """ % (w, dim, p["line_visibility"]) # Get platforms, both CPU and GPU plat = cl.get_platforms() GPUs = plat[0].get_devices(device_type=cl.device_type.GPU) CPU = plat[0].get_devices() # prefer GPUs if GPUs and len(GPUs) > 0: ctx = cl.Context(devices=GPUs) else: print("Warning: using CPU") ctx = cl.Context(CPU) # Create queue for each kernel execution queue = cl.CommandQueue(ctx) mf = cl.mem_flags # Kernel function instantiation prg = cl.Program(ctx, src).build() inA = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=basePx) inB = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=tempPx) inC = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=particles) outResult = cl.Buffer(ctx, mf.WRITE_ONLY, basePx.nbytes) prg.addParticles(queue, [h, w], None, inA, inB, inC, outResult) # Copy result result = np.empty_like(basePx) cl.enqueue_copy(queue, result, outResult) result = result.reshape(shape) return result
''' Listing 3.3: Copying and mapping buffer objects ''' import pyopencl as cl import numpy as np import utility kernel_src = ''' __kernel void blank(__global float *a, __global float *b) { } ''' # Get device and context, create command queue and program dev = utility.get_default_device() context = cl.Context(devices=[dev]) queue = cl.CommandQueue(context, dev) prog = cl.Program(context, kernel_src) try: prog.build(options=['-Werror'], devices=[dev]) except: print('Build log:') print(prog.get_build_info(dev, cl.program_build_info.LOG)) data_one = np.arange(start=0, stop=100, step=1, dtype=np.float32) data_two = -np.arange(start=0, stop=100, step=1, dtype=np.float32) # Create buffers flags = cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR buffer_one = cl.Buffer(context, flags, hostbuf=data_one)
def lerpData(dataA, dataB, mu, offset=0): dataLen = len(dataA) if dataLen != len(dataB): print("Warning: data length mismatch") shape = (len(dataA[0]), len(dataA[0][0]), 3) h, w, dim = shape result = np.empty(h * w * dim, dtype=np.float32) # read data as floats dataA = np.array(dataA) dataA = dataA.astype(np.float32) dataB = np.array(dataB) dataB = dataB.astype(np.float32) # convert to 1-dimension dataA = dataA.reshape(-1) dataB = dataB.reshape(-1) # the kernel function src = """ static bool isValid(float value) { return (value > -999.0 && value < 999.0); } __kernel void lerpData(__global float *a, __global float *b, __global float *result){ int dlen = %d; int h = %d; int w = %d; int dim = %d; float mu = %f; int offsetX = %d; // get current position int posx = get_global_id(1); int posy = get_global_id(0); // convert position from 0,360 to -180,180 int posxOffset = posx; if (offsetX > 0 || offsetX < 0) { if (posx < offsetX) { posxOffset = posxOffset + offsetX; } else { posxOffset = posxOffset - offsetX; } } // get indices int j = posy * w * dim + posx * dim; // get the mean values for a and b datasets float a1 = 0; float a2 = 0; float a3 = 0; float b1 = 0; float b2 = 0; float b3 = 0; float a1count = 0; float a2count = 0; float a3count = 0; float b1count = 0; float b2count = 0; float b3count = 0; for(int k=0; k<dlen; k++) { int i = k * h * w * dim + posy * w * dim + posxOffset * dim; if (isValid(a[i])) { a1 = a1 + a[i]; a1count = a1count + 1.0; } if (isValid(a[i+1])) { a2 = a2 + a[i+1]; a2count = a2count + 1.0; } if (isValid(a[i+2])) { a3 = a3 + a[i+2]; a3count = a3count + 1.0; } if (isValid(b[i])) { b1 = b1 + b[i]; b1count = b1count + 1.0; } if (isValid(b[i+1])) { b2 = b2 + b[i+1]; b2count = b2count + 1.0; } if (isValid(b[i+2])) { b3 = b3 + b[i+2]; b3count = b3count + 1.0; } } if (a1count > 0) { a1 = a1 / a1count; } // else { a1 = -9999.0; } if (a2count > 0) { a2 = a2 / a2count; } if (a3count > 0) { a3 = a3 / a3count; } if (b1count > 0) { b1 = b1 / b1count; } // else { b1 = -9999.0; } if (b2count > 0) { b2 = b2 / b2count; } if (b3count > 0) { b3 = b3 / b3count; } // set result float t = a1 + mu * (b1-a1); float u = a2 + mu * (b2-a2); float v = a3 + mu * (b3-a3); // if (a1 <= -9999.0 || b1 <= -9999.0) t = -9999.0; result[j] = t; result[j+1] = u; result[j+2] = v; } """ % (dataLen, h, w, dim, mu, offset) # Get platforms, both CPU and GPU plat = cl.get_platforms() GPUs = plat[0].get_devices(device_type=cl.device_type.GPU) CPU = plat[0].get_devices() # prefer GPUs if GPUs and len(GPUs) > 0: ctx = cl.Context(devices=GPUs) else: print("Warning: using CPU") ctx = cl.Context(CPU) # Create queue for each kernel execution queue = cl.CommandQueue(ctx) mf = cl.mem_flags # Kernel function instantiation prg = cl.Program(ctx, src).build() inA = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=dataA) inB = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=dataB) outResult = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes) prg.lerpData(queue, [h, w], None, inA, inB, outResult) # Copy result cl.enqueue_copy(queue, result, outResult) result = result.reshape(shape) return result
def gpu_chain(self, silent=False): # Calculate binomarl distribution self.defines_dic0 = { "T": str(self.T), "dim": str(self.dim), "computeUnits": str(self.computeUnits), "wrkUnit": str(self.wrkUnit), "n_elem": "np.ceil((dim*dim+wrkUnit)/wrkUnit).astype(np.int)", # For temporary result matrix "n_wrkGroups": "computeUnits", "matSize": "dim*dim", "n_kernel_param": str(self.n_kernel_param), "n_data_dim": str(self.n_data_dim), "minVal": str(0.000001) } definesToLocals(self.defines_dic0) self.defines0 = definesFromDict(self.defines_dic0) + " -cl-std=CL1.2" diag0 = np.zeros((dim, n_kernel_param)).astype(cl.cltypes.float) diag0 = self.kernels.astype(cl.cltypes.float) #diag0[:,4] = 0 #data points mat0 = np.zeros((T, n_data_dim)).astype(cl.cltypes.float) #mat0 = self.data.astype(cl.cltypes.float) self.res0 = np.zeros((T, dim), np.float32) self.code_kernel = open(self.file_diag_kernel, "r").read() self.code_diag = open(self.file_diag_normal, "r").read() self.code0 = self.code_kernel + "\n" + self.code_diag self.platform = cl.get_platforms()[self.platform_id] self.device = self.platform.get_devices()[0] if (not silent): print("Using Device : ", self.device.name) print("Compute Units: ", self.computeUnits) self.context = cl.Context([self.device]) self.program0 = cl.Program(self.context, self.code0).build(self.defines0) self.queue = cl.CommandQueue(self.context) # Buffer creation mem_flags = cl.mem_flags self.mat_buf0 = cl.Buffer(self.context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=mat0) self.diag_buf0 = cl.Buffer(self.context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=diag0) self.res_buf0 = cl.Buffer(self.context, mem_flags.READ_WRITE, self.res0.nbytes) self.kernel0 = self.program0.diag_normal # Set program arguments self.globalItems0 = (T, ) self.localItems0 = None # (32, ) self.kernel0.set_arg(0, self.diag_buf0) self.kernel0.set_arg(1, self.mat_buf0) self.kernel0.set_arg(2, self.res_buf0) # Diagonal Matrix multiplying defines_dic1 = { "T": str(self.T), "dim": str(self.dim), "computeUnits": str(self.computeUnits), "wrkUnit": str(self.wrkUnit), "n_elem": "np.ceil((dim*dim+wrkUnit)/wrkUnit).astype(np.int)", # For temporary result matrix "n_wrkGroups": "computeUnits", "matSize": "dim*dim", "n_kernel_param": str(self.n_kernel_param), "n_data_dim": str(self.n_data_dim), "minVal": str(0.000001) } definesToLocals(defines_dic1) self.defines1 = definesFromDict(defines_dic1) + " -cl-std=CL1.2" # For use with diag_mat_mulB #n_dat_mat1 = np.zeros(wrkUnit*computeUnits + 1).astype(np.int) #n_dat_mat1[1:] = int(T / (wrkUnit*computeUnits)) #n_dat_mat1[1:T%(wrkUnit*computeUnits) + 1] += 1 #n_dat_mat1 = np.cumsum(n_dat_mat1) #n_dat_mat1 = n_dat_mat1.astype(cl.cltypes.int) #self.n_dat_mat1 = n_dat_mat1 n_dat_mat1 = np.zeros(n_wrkGroups + 1).astype(np.int) n_dat_mat1[1:] = int((T) / n_wrkGroups) n_dat_mat1[1:(T) % n_wrkGroups + 1] += 1 n_dat_mat1 = np.cumsum(n_dat_mat1) n_dat_mat1 = n_dat_mat1.astype(cl.cltypes.int) self.n_dat_mat1 = n_dat_mat1 # transistion matrix mat1 = np.zeros((dim, dim)).astype(cl.cltypes.float) mat1 = self.transistion_matrix.astype(cl.cltypes.float) # Calculated from data and kernels - taken from previous kernel #diag1 = np.random.random((T, dim)).astype(cl.cltypes.float) # Result self.res1 = np.zeros((T, dim, dim), np.float32) self.code1 = open(self.file_diag_mat_mul, "r").read() self.program1 = cl.Program(self.context, self.code1).build(self.defines1) # Buffer creation mem_flags = cl.mem_flags self.n_dat_mat_buf1 = cl.Buffer(self.context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=n_dat_mat1) self.mat_buf1 = cl.Buffer(self.context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=mat1) #self.diag_buf1 = cl.Buffer(self.context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=diag1) self.res_buf1 = cl.Buffer(self.context, mem_flags.READ_WRITE, self.res1.nbytes) self.kernel1 = self.program1.diag_mat_mul # Set program arguments # For use with diag_mat_mulB #self.globalItems1 = ( computeUnits*wrkUnit, ) #self.localItems1 = None # (32, ) self.globalItems1 = (computeUnits * dim, ) self.localItems1 = (dim, ) #print(self.globalItems1) self.kernel1.set_arg(0, self.n_dat_mat_buf1) self.kernel1.set_arg(1, self.mat_buf1) # Transistion matrix self.kernel1.set_arg( 2, self.res_buf0) # Diagonal matrix values from previous step self.kernel1.set_arg(3, self.res_buf1) # MatrixMatrix Multiplying self.defines_dic2 = { "T": str(self.T), "dim": str(self.dim), "computeUnits": str(self.computeUnits), "wrkUnit": str(self.wrkUnit), "n_elem": "np.ceil((dim*dim+wrkUnit)/wrkUnit).astype(np.int)", # For temporary result matrix "n_wrkGroups": "computeUnits", "matSize": "dim*dim", "n_kernel_param": str(self.n_kernel_param), "n_data_dim": str(self.n_data_dim), "minVal": str(0.000001) } definesToLocals(self.defines_dic2) self.defines2 = definesFromDict(self.defines_dic2) + " -cl-std=CL1.2" n_element_mat2 = np.zeros(wrkUnit + 1).astype(np.int) n_element_mat2[1:] = int(dim * dim / wrkUnit) n_element_mat2[1:(dim * dim + 1) % wrkUnit] += 1 n_element_mat2 = np.cumsum(n_element_mat2) n_element_mat2 = n_element_mat2.astype(cl.cltypes.int) self.n_element_mat2 = n_element_mat2 n_mat_mat2 = np.zeros(n_wrkGroups + 1).astype(np.int) n_mat_mat2[1:] = int((T) / n_wrkGroups) n_mat_mat2[1:(T) % n_wrkGroups + 1] += 1 n_mat_mat2 = np.cumsum(n_mat_mat2) n_mat_mat2 = n_mat_mat2.astype(cl.cltypes.int) self.n_mat_mat2 = n_mat_mat2 # Matrix input - already made as output from previous step #mat2 = np.random.random((T, dim, dim)).astype(cl.cltypes.float) / T # Matrix outputs self.res2 = np.zeros((computeUnits, dim, dim), np.float32) # Scaling coefficients used in matrix multiplication self.resCoef2 = np.zeros((computeUnits), np.int32) self.code2 = open(self.file_matrixmul, "r").read() self.program2 = cl.Program(self.context, self.code2).build(self.defines2) # Buffer creation self.n_element_mat_buf2 = cl.Buffer(self.context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=self.n_element_mat2) self.n_mat_mat_buf2 = cl.Buffer(self.context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=self.n_mat_mat2) #mat_buf2 = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=mat2) self.res_buf2 = cl.Buffer(self.context, mem_flags.READ_WRITE, self.res2.nbytes) self.resCoef_buf2 = cl.Buffer(self.context, mem_flags.READ_WRITE, self.resCoef2.nbytes) self.kernel2 = self.program2.matrixmul # Set program arguments self.globalItems2 = (computeUnits * wrkUnit, ) self.localItems2 = (wrkUnit, ) self.kernel2.set_arg(0, self.n_element_mat_buf2) self.kernel2.set_arg(1, self.n_mat_mat_buf2) #self.kernel2.set_arg(2, self.mat_buf2 ) self.kernel2.set_arg(2, self.res_buf1) # input argument for MM self.kernel2.set_arg(3, self.res_buf2) self.kernel2.set_arg(4, self.resCoef_buf2) cl.enqueue_copy(self.queue, self.mat_buf0, self.data.astype(cl.cltypes.float), is_blocking=True) cl.enqueue_copy(self.queue, self.mat_buf1, self.transistion_matrix.astype(cl.cltypes.float), is_blocking=True) cl.enqueue_copy(self.queue, self.diag_buf0, self.kernels.astype(cl.cltypes.float), is_blocking=True)
def lerpImage(imA, imB, mu): # read pixels and floats pxA = np.array(imA) pxA = pxA.astype(np.uint8) pxB = np.array(imB) pxB = pxB.astype(np.uint8) shape = pxA.shape h, w, dim = shape pxA = pxA.reshape(-1) pxB = pxB.reshape(-1) # the kernel function src = """ __kernel void lerpImage(__global uchar *a, __global uchar *b, __global float *mu, __global uchar *result){ int w = %d; int dim = %d; float m = *mu; int posx = get_global_id(1); int posy = get_global_id(0); int i = posy * w * dim + posx * dim; result[i] = a[i] + m * (b[i]-a[i]); result[i+1] = a[i+1] + m * (b[i+1]-a[i+1]); result[i+2] = a[i+2] + m * (b[i+2]-a[i+2]); } """ % (w, dim) # Get platforms, both CPU and GPU plat = cl.get_platforms() GPUs = plat[0].get_devices(device_type=cl.device_type.GPU) CPU = plat[0].get_devices() # prefer GPUs if GPUs and len(GPUs) > 0: ctx = cl.Context(devices=GPUs) else: print("Warning: using CPU") ctx = cl.Context(CPU) # Create queue for each kernel execution queue = cl.CommandQueue(ctx) mf = cl.mem_flags # Kernel function instantiation prg = cl.Program(ctx, src).build() inA = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pxA) inB = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pxB) inMu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.float32(mu)) outResult = cl.Buffer(ctx, mf.WRITE_ONLY, pxA.nbytes) prg.lerpImage(queue, shape, None, inA, inB, inMu, outResult) # Copy result result = np.empty_like(pxA) cl.enqueue_copy(queue, result, outResult) result = result.reshape(shape) imOut = Image.fromarray(result, mode="RGB") return imOut
VECTOR_SIZE = 50000 # Elements of vector # Create two random vectors a & b a_host = np.random.rand(VECTOR_SIZE).astype(np.float32) b_host = np.random.rand(VECTOR_SIZE).astype(np.float32) # Create a empty vector for the result res_host = np.zeros(VECTOR_SIZE).astype(np.float32) # Create CL context platform = cl.get_platforms()[0] device = platform.get_devices()[0] #get first gpu available print "Running: ", platform print "On GPU: ", device ctx = cl.Context([device]) queue = cl.CommandQueue(ctx) # Transfer host (CPU) memory to device (GPU) memory mf = cl.mem_flags a_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_host) b_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_host) # Kernel code prg = cl.Program( ctx, """ __kernel void sum(__global const float *a_gpu, __global const float *b_gpu, __global float *res_gpu) { int gid = get_global_id(0); res_gpu[gid] = a_gpu[gid] + b_gpu[gid]; } """).build()
import pyopencl as CL from pyopencl import array import numpy CL.tools.clear_first_arg_caches() c = CL.Context([CL.get_platforms()[0].get_devices()[0]]) k = CL.Program( c, """ #include \"lambda.cl\" kernel void test(global const ulong *in, global ulong *out) { uint i = get_global_id(0); out[i] = lex(in[i], 0); }""").build("-I./src/cl") q = CL.CommandQueue(c) flags = CL.mem_flags # 290 = i i # 1323270 = (k i) k # 659718 = (k* i) k # 72218 = Ω b_in = numpy.zeros((1, 1), CL.array.vec.ulong8) #b_in[0, 0] = (290, 1323270, 659718, 72218) b_in[0, 0] = (4, 48, 16, 88, 90466, 0, 0, 0) b_out = numpy.zeros(5, numpy.uint64) mem_in = CL.Buffer(c, flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=b_in)