def setup_opencl(data, cube_size): import pycl blocking = True with timeify("Making context, loading kernel"): devices = pycl.clGetDeviceIDs() ctx = pycl.clCreateContext(devices=devices) queue = pycl.clCreateCommandQueue(ctx) program = pycl.clCreateProgramWithSource(ctx, SOURCE).build() score_matrix = program['score_matrix_to_rms'] score_matrix.argtypes = (pycl.cl_mem, pycl.cl_mem, pycl.cl_mem, pycl.cl_mem, pycl.cl_mem, pycl.cl_int, pycl.cl_int) sub_divisions = cube_size**3 with timeify("Creating buffers"): in_r_buf, in_evt1 = pycl.buffer_from_pyarray(queue, data['in_r'], blocking=blocking) in_g_buf, in_evt2 = pycl.buffer_from_pyarray(queue, data['in_g'], blocking=blocking) in_b_buf, in_evt3 = pycl.buffer_from_pyarray(queue, data['in_b'], blocking=blocking) out_r = data['out_r'] out_r_buf, in_evt4 = pycl.buffer_from_pyarray(queue, out_r, blocking=blocking) score = array.array('f', [0 for x in range(sub_divisions)]) score_buf, in_evt5 = pycl.buffer_from_pyarray(queue, score, blocking=blocking) with timeify("Run kernel r"): run_evt = score_matrix( #in_r_buf, in_g_buf, in_b_buf, out_r_buf, score_buf, in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf, len(data['in_r']), cube_size, wait_for=[in_evt1, in_evt2, in_evt3, in_evt4, in_evt5]).on(queue, sub_divisions) with timeify("Retrive data"): score_from_gpu, evt = pycl.buffer_to_pyarray(queue, score_buf, wait_for=run_evt, like=score) return score_from_gpu
def finalize(self, tree, program_config): arg_cfg = program_config[0] fn = WarpImg2DConcreteOcl() program = clCreateProgramWithSource(fn.context, tree.codegen()).build() ptr = program[self.entry_point] return fn.finalize(ptr, arg_cfg[0][1])
def _gen_reduce_for_loop(self, loop, var, size): looplen1 = loop.test.right loopincr = loop.incr.value.value kernel_name = self._gen_unique_kernel_name() kernel_src = StringTemplate(""" __kernel void $kernel_name(__global float * $arr) { int x = get_global_id(0); float sum = $arr[x]; #pragma unroll for (int i = 1; i < $batch_size; ++ i) { sum += $arr[i * $size + x]; } $arr[x] = sum; } """, {'batch_size': C.Constant(self.batch_size), 'arr': C.SymbolRef(var), 'size': C.Constant(size), 'kernel_name': C.SymbolRef(kernel_name)}) program = cl.clCreateProgramWithSource( latte.config.cl_ctx, kernel_src.codegen()).build() kernel = program[kernel_name] self.kernels[kernel_name] = kernel kernel.setarg(0, self.cl_buffers[var], ctypes.sizeof(cl.cl_mem)) return StringTemplate( """ size_t global_size_{kernel_name}[1] = {{{looplen1}}}; clEnqueueNDRangeKernel(queue, {kernel_name}, 1, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL); clFinish(queue); """.format( kernel_name=kernel_name, looplen1=size) )
def finalize(self, tree, program_config): arg_cfg, tune_cfg = program_config param_types = [ np.ctypeslib.ndpointer(arg.dtype, arg.ndim, arg.shape) for arg in arg_cfg + (self.output, ) ] if self.backend == StencilOclTransformer: param_types.append(param_types[0]) fn = OclStencilFunction() program = clCreateProgramWithSource(fn.context, tree.codegen()).build() stencil_kernel_ptr = program['stencil_kernel'] global_size = tuple(dim - 2 * self.kernel.ghost_depth for dim in arg_cfg[0].shape) finalized = fn.finalize(stencil_kernel_ptr, global_size, self.kernel.ghost_depth, self.output) else: param_types.append(POINTER(c_float)) kernel_sig = CFUNCTYPE(c_void_p, *param_types) fn = StencilFunction() finalized = fn.finalize(tree, "stencil_kernel", kernel_sig, self.output) self.output = None return finalized
def finalize(self, tree, program_config): arg_cfg, tune_cfg = program_config len_x = arg_cfg[0][1][0] len_y = arg_cfg[0][1][1] fn = OclFunc2() program = clCreateProgramWithSource(fn.context, tree.codegen()).build() ptr = program[self.entry_point] return fn.finalize(ptr, (len_x, len_y))
def get_launcher(cls, sources, sinks, keywords, symbol_table): bottom_diff = symbol_table[sinks[0]] count = np.prod(bottom_diff.shape) num = bottom_diff.shape[0] spatial_dim = int(np.prod(bottom_diff.shape[2:])) kernels = Template(""" __kernel void kernel_copy(global const float* data, global float* out) { if (get_global_id(0) < $count) { int index = get_global_id(0); out[index] = data[index]; } } __kernel void kernel_scale(float scale, global float* out) { if (get_global_id(0) < $count) { int index = get_global_id(0); out[index] *= scale; } } __kernel void SoftmaxLossBackwardGPU(global const float* label, global float* bottom_diff) { int index = get_global_id(0); if (index < $global_size) { const int n = index / $spatial_dim; const int s = index % $spatial_dim; const int label_value = (int)label[n * $spatial_dim + s]; bottom_diff[n * $dim + label_value * $spatial_dim + s] -= 1; } }""").substitute(count=count, spatial_dim=spatial_dim, dim=np.prod(bottom_diff.shape[1:]), global_size=num*spatial_dim) program = cl.clCreateProgramWithSource(context, kernels).build() copy = program['kernel_copy'] copy.argtypes = (cl.cl_mem, cl.cl_mem) scale = program['kernel_scale'] scale.argtypes = (cl.cl_float, cl.cl_mem) backward = program['SoftmaxLossBackwardGPU'] backward.argtypes = (cl.cl_mem, cl.cl_mem) class Launcher(object): def compile(self): pass def launch(self, symbol_table): bottom_diff = symbol_table[sinks[0]] # top_diff = symbol_table[sources[0]] label = symbol_table[sources[1]] prob = symbol_table[sources[2]] copy(prob.ocl_buf, bottom_diff.ocl_buf).on( queue, (np.prod(prob.shape),)) backward(label.ocl_buf, bottom_diff.ocl_buf).on( queue, (num * spatial_dim), ) loss_weight = 1.0 scale(np.float32(loss_weight / float(num)), bottom_diff.ocl_buf).on(queue, (np.prod(prob.shape), )) return Launcher()
def finalize(self, tree, program_config): arg_cfg, tune_cfg = program_config fn = OclFunc() program = clCreateProgramWithSource( fn.context, tree.codegen()).build() ptr = program[self.entry_point] return fn.finalize( ptr, (int(arg_cfg[0][2][1] / 2), int(arg_cfg[0][2][0] / 2)), self.output_name )
def transform(self, py_ast, program_config): """ Convert the Python AST to a C AST according to the directions given in program_config. """ A = program_config[0] len_A = np.prod(A._shape_) inner_type = A._dtype_.type() # browser_show_ast(py_ast,'tmp.png') apply_one = PyBasicConversions().visit(py_ast.body[0]) apply_one.return_type = inner_type apply_one.params[0].type = inner_type apply_kernel = FunctionDecl( None, "apply_kernel", params=[SymbolRef("A", A()).set_global()], defn=[ Assign(SymbolRef("i", ct.c_int()), get_global_id(0)), If(Lt(SymbolRef("i"), Constant(len_A)), [ Assign( ArrayRef(SymbolRef("A"), SymbolRef("i")), FunctionCall( SymbolRef("apply"), [ArrayRef(SymbolRef("A"), SymbolRef("i"))])), ], []), ]).set_kernel() kernel = OclFile("kernel", [apply_one, apply_kernel]) control = StringTemplate( r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf) { size_t global = $n; size_t local = 32; clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); } """, {'n': Constant(len_A + 32 - (len_A % 32))}) proj = Project([kernel, CFile("generated", [control])]) fn = OpFunction() program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build() apply_kernel_ptr = program['apply_kernel'] entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem) return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def finalize(self, transform_result, program_config): ocl_kernel = transform_result[0] c_controller = transform_result[1] proj = Project([ocl_kernel, c_controller]) fn = ConcreteReduction() # define the ConcreteSpecializeFunction subclass to use program = cl.clCreateProgramWithSource(fn.context, ocl_kernel.codegen()).build() apply_kernel_ptr = program['apply_kernel'] entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem) return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def finalize(self, transform_result, program_config): kernel, cfile = transform_result proj = Project([kernel, cfile]) fn = OpFunction() program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build() apply_kernel_ptr = program['apply_kernel'] entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem) return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def finalize(self, tree, entry_type, entry_point): fn = DLAConcreteOCL(self.output) self.output = None self.fusable_nodes = [] kernel = tree.files[-1] program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build() kernel_ptr = program[kernel.body[0].name] return fn.finalize(tree, ct.CFUNCTYPE(*entry_type), entry_point, kernel_ptr)
def compile(self): if self.kernel is None: params = set(self.sources) | set(self.sinks) seen_decls = set() seen_params = set() decls = [] filtered = set() for param in params: if param.level == 'register': if param.name not in seen_decls: seen_decls.add(param.name) decls.append('float {}'.format(param.name)) else: if param.name not in seen_params: seen_params.add(param.name) filtered.add(param) self.params = list(filtered) self.params.sort(key=lambda x: x.name) params = [] sinks = set(sink.name for sink in self.sinks) for param in self.params: if param.name in sinks: str = "global float* {}".format(param.name) else: str = "global const float* {}".format(param.name) params.append(str) params_str = ", ".join(params) decls = ";\n\t\t\t".join(decls) + ";\n" kernel_name = get_unique_kernel_name() kernel = Template(""" __kernel void $name($params) { int index = get_global_id(0); if (index < $num_work_items) { $decls $body } } """).substitute(name=kernel_name, params=params_str, body=self.body, decls=decls, num_work_items=self.launch_parameters[0]) # print([p.name for p in self.params]) print(kernel) self.kernel_str = kernel kernel = cl.clCreateProgramWithSource( context, kernel).build()[kernel_name] kernel.argtypes = tuple(cl.cl_mem for _ in self.params) self.kernel = kernel
def old_transform(self, tree, program_config): call_args = program_config[0] base_size = call_args.base_shape[0] * call_args.base_shape[1] border = call_args.border body = StringTemplate(""" void __kernel matrix_powers_copy_base_layer(__global const $type* input, __global $type* output) { int x = get_global_id(0); int y = get_global_id(1); output[y * $len_x + x] = input[y * $len_x + x]; } void __kernel matrix_powers_compute_next_step(__global $type* matrix, const int power) { int x = get_global_id(0); int y = get_global_id(1); matrix[(power+1) * $base_size + y * $len_x + x] = 0.1f * matrix[ power * $base_size + clamp(y-1, $border, $len_y-$border-1) * $len_x + clamp(x, $border, $len_x-$border-1) ] + 0.1f * matrix[ power * $base_size + clamp(y+1, $border, $len_y-$border-1) * $len_x + clamp(x, $border, $len_x-$border-1) ] + 0.4f * matrix[ power * $base_size + clamp(y, $border, $len_y-$border-1) * $len_x + clamp(x-1, $border, $len_x-$border-1) ] + 0.4f * matrix[ power * $base_size + clamp(y, $border, $len_y-$border-1) * $len_x + clamp(x+1, $border, $len_x-$border-1) ] + 1.0f * matrix[ power * $base_size + clamp(y, $border, $len_y-$border-1) * $len_x + clamp(x, $border, $len_x-$border-1) ]; } """, { 'type': SymbolRef('float'), 'len_x': Constant(call_args.base_shape[1]), 'len_y': Constant(call_args.base_shape[0]), 'base_size': Constant(base_size), 'border': Constant(border), }) fn = OclMatrixPowers() kernel = OclFile("kernel", [body]) # print(kernel.codegen()) program = clCreateProgramWithSource(fn.context, kernel.codegen()).build() ptr = program['matrix_powers_copy_base_layer'] ptr2 = program['matrix_powers_compute_next_step'] return fn.finalize(ptr, ptr2, (call_args.base_shape[1], call_args.base_shape[0]))
def build_kernel(self, kernel_src, kernel_name, kernel_args): kernel_src = C.CFile('generated', [StringTemplate( """ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) """ ), kernel_src]) try: program = cl.clCreateProgramWithSource( latte.config.cl_ctx, kernel_src.codegen()).build() kernel = program[kernel_name] except cl.BuildProgramFailureError as e: logger.error("Failed build program:\n %s", kernel_src.codegen()) raise e self.kernels[kernel_name] = kernel for index, arg in enumerate(kernel_args): kernel.setarg(index, self.cl_buffers[arg], ctypes.sizeof(cl.cl_mem)) logger.debug(kernel_src)
def setup_opencl(data, cube_size): import pycl blocking = True with timeify("Making context, loading kernel"): devices = pycl.clGetDeviceIDs() ctx = pycl.clCreateContext(devices = devices) queue = pycl.clCreateCommandQueue(ctx) program = pycl.clCreateProgramWithSource(ctx, SOURCE).build() score_matrix = program['score_matrix_to_rms'] score_matrix.argtypes = (pycl.cl_mem, pycl.cl_mem, pycl.cl_mem, pycl.cl_mem, pycl.cl_mem, pycl.cl_int, pycl.cl_int) sub_divisions = cube_size**3 with timeify("Creating buffers"): in_r_buf, in_evt1 = pycl.buffer_from_pyarray(queue, data['in_r'], blocking = blocking) in_g_buf, in_evt2 = pycl.buffer_from_pyarray(queue, data['in_g'], blocking = blocking) in_b_buf, in_evt3 = pycl.buffer_from_pyarray(queue, data['in_b'], blocking = blocking) out_r = data['out_r'] out_r_buf, in_evt4 = pycl.buffer_from_pyarray(queue, out_r, blocking = blocking) score = array.array('f', [0 for x in range(sub_divisions)]) score_buf, in_evt5 = pycl.buffer_from_pyarray(queue, score, blocking = blocking) with timeify("Run kernel r"): run_evt = score_matrix( #in_r_buf, in_g_buf, in_b_buf, out_r_buf, score_buf, in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf, len(data['in_r']), cube_size, wait_for = [in_evt1, in_evt2, in_evt3, in_evt4, in_evt5]).on(queue, sub_divisions) with timeify("Retrive data"): score_from_gpu, evt = pycl.buffer_to_pyarray(queue, score_buf, wait_for=run_evt, like=score) return score_from_gpu
def get_launcher(cls, sources, sinks, keyword, symbol_table): bottoms = sources concat_kern = Template(""" __kernel void concat(global const float* bottom, global float* top, int top_offset, int bot_offset) { int index = get_global_id(0); top[index + top_offset] = bottom[index + bot_offset]; } """).substitute() program = cl.clCreateProgramWithSource(context, concat_kern).build() kernel = program['concat'] kernel.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int, cl.cl_int) class Launcher(): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): top = symbol_table[sinks[0].name] bots = [symbol_table[b.name] for b in bottoms] evts = [] concat_off = 0 for i in range(len(bottoms)): count = np.prod(bots[i].shape[1:]) for n in range(bots[i].shape[0]): top_offset = n * np.prod( top.shape[1:]) + concat_off * np.prod( top.shape[2:]) evt = kernel(bots[i].ocl_buf, top.ocl_buf, top_offset, n * count).on(queues[n % len(queues)], (count, ), wait_for=wait_for) evts.append(evt) concat_off += bots[i].shape[1] return evts return Launcher(sources, sinks)
def ocl_init( ocl_src ): platforms = cl.clGetPlatformIDs() use_devices = None for platform in platforms: try: devices = cl.clGetDeviceIDs(platform,device_type=cl.CL_DEVICE_TYPE_GPU) use_devices = devices[0:1] # arbitraily choose first device except cl.DeviceNotFoundError: pass if use_devices is not None: break if use_devices is None: raise ValueError( "no GPU openCL device found" ) assert use_devices is not None print( "OpenCL use_devices: " + str(use_devices) ) context = cl.clCreateContext(use_devices) queue = cl.clCreateCommandQueue(context) prog = cl.clCreateProgramWithSource( context, ocl_src ).build() print prog #run_mxplusb( prog, queue ) run_conv( prog, queue )
def transform(self, tree, program_config): kernelFunc = program_config[0] kernelPath = os.path.join(os.getcwd(), "..", "templates", "trainingkernels.tmpl.c") kernelInserts = { "kernelFunc": SymbolRef(kernelFunc), } kernel = OclFile("training_kernel", [FileTemplate(kernelPath, kernelInserts)]) wrapperPath = os.path.join(os.getcwd(), "..", "templates", "ocltrain.tmpl.c") wrapperInserts = { "kernel_path": kernel.get_generated_path_ref(), "kernelFunc": SymbolRef(kernelFunc) } wrapper = CFile("train", [FileTemplate(wrapperPath, wrapperInserts)]) fn = OclTrainFunction() program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build() return fn.finalize(program, Project([kernel, wrapper]), "train")
def get_launcher(cls, sources, sinks, keyword, symbol_table): bottoms = sources concat_kern = Template(""" __kernel void concat(global const float* bottom, global float* top, int top_offset, int bot_offset) { int index = get_global_id(0); top[index + top_offset] = bottom[index + bot_offset]; } """).substitute() program = cl.clCreateProgramWithSource(context, concat_kern).build() kernel = program['concat'] kernel.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int, cl.cl_int) class Launcher(): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): top = symbol_table[sinks[0].name] bots = [symbol_table[b.name] for b in bottoms] evts = [] concat_off = 0 for i in range(len(bottoms)): count = np.prod(bots[i].shape[1:]) for n in range(bots[i].shape[0]): top_offset = n * np.prod(top.shape[1:]) + concat_off * np.prod(top.shape[2:]) evt = kernel( bots[i].ocl_buf, top.ocl_buf, top_offset, n * count).on( queues[n % len(queues)], (count, ), wait_for=wait_for) evts.append(evt) concat_off += bots[i].shape[1] return evts return Launcher(sources, sinks)
def ocl_init(ocl_src): platforms = cl.clGetPlatformIDs() use_devices = None for platform in platforms: try: devices = cl.clGetDeviceIDs(platform, device_type=cl.CL_DEVICE_TYPE_GPU) use_devices = devices[0:1] # arbitraily choose first device except cl.DeviceNotFoundError: pass if use_devices is not None: break if use_devices is None: raise ValueError("no GPU openCL device found") assert use_devices is not None print ("OpenCL use_devices: " + str(use_devices)) context = cl.clCreateContext(use_devices) queue = cl.clCreateCommandQueue(context) prog = cl.clCreateProgramWithSource(context, ocl_src).build() print prog # run_mxplusb( prog, queue ) run_conv(prog, queue)
def transform(self, tree, program_config): # TODO: Have to flip indices, figure out why arg_cfg, tune_cfg = program_config output_name = unique_name() params = [ SymbolRef(self.array_name, POINTER(c_float)(), _global=True, _const=True), SymbolRef(arg_cfg[0][0], POINTER(c_float)(), _global=True, _const=True), SymbolRef(output_name, POINTER(c_float)(), _global=True) ] defn = [] defn.extend([ Assign(SymbolRef('element_id%d' % d, c_int()), get_global_id(d)) for d in range(len(arg_cfg[0][2])) ]) index = StringTemplate('element_id1 * $len_x + element_id0', {'len_x': Constant(arg_cfg[0][2][1])}) defn.append( Assign( ArrayRef(SymbolRef(params[-1].name), index), tree( ArrayRef(SymbolRef(params[0].name), index), ArrayRef(SymbolRef(params[1].name), index), ) ) ) entry_point = unique_kernel_name() tree = FunctionDecl(None, entry_point, params, defn) tree.set_kernel() fn = ArrayOpConcrete(self.array, self.generate_output(output_name)) kernel = OclFile("kernel", [tree]) program = clCreateProgramWithSource( fn.context, kernel.codegen() ).build() ptr = program[entry_point] return fn.finalize(ptr, (arg_cfg[0][2][1], arg_cfg[0][2][0]))
def visit_KernelCall(self, node): args = [] for i, arg in enumerate(node.args): size = SizeOf(SymbolRef(arg.name)) setter = clSetKernelArg(node.name, i, size, Ref(SymbolRef(arg.name))) setter.lift(params=arg._lift_params) args.append(setter) kernel_decl = SymbolRef(node.name, cl.cl_kernel()) kernel_symbol = kernel_decl.copy() call = clEnqueueNDRangeKernel(node.location.symbol.copy(), kernel_symbol, work_dim=Constant(1), global_size=node.global_size, local_size=node.local_size) kernel = RefConverter().visit(node.kernel) for param in kernel.params: param.type = param.type.ptr_type kernel.defn.insert(0, Assign(SymbolRef("i", c_int()), get_global_id(0))) kernel_src = kernel.codegen() call.body.append(CppComment(kernel_src)) context = node.location.queue.context kernel_ptr = cl.clCreateProgramWithSource(context, kernel_src).build()[node.name] call.lift(params=[(kernel_decl, kernel_ptr)]) return args + [call]
def get_launcher(cls, sources, sinks, keywords, symbol_table): kernel_h, kernel_w = keywords['kernel_size'] pad_h, pad_w = keywords['padding'] stride_h, stride_w = keywords['stride'] num, channels, height, width = symbol_table[sources[0]].shape channels_col = channels * kernel_h * kernel_w height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1 width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1 col_data = hmarray((channels_col, height_col * width_col)) bias_multiplier = hmarray( (1, np.prod(symbol_table[sinks[0]].shape[2:]))) bias_multiplier.fill(1.0) bias_multiplier.sync_ocl() im2col_global_size = channels * height_col * width_col col2im_global_size = channels * height * width kernels = Template(""" __kernel void col2im(global float* data_col, global float* data_im, int im_offset) { if (get_global_id(0) < $col2im_global_size) { int index = get_global_id(0); float val = 0; int w = index % $width + $pad_w; int h = (index / $width) % $height + $pad_h; int c = index / ($width * $height); // compute the start and end of the output int w_col_start = (w < $kernel_w) ? 0 : (w - $kernel_w) / $stride_w + 1; int w_col_end = min(w / $stride_w + 1, $width_col); int h_col_start = (h < $kernel_h) ? 0 : (h - $kernel_h) / $stride_h + 1; int h_col_end = min(h / $stride_h + 1, $height_col); // equivalent implementation int offset = (c * $kernel_h * $kernel_w + h * $kernel_w + w) * \ $height_col * $width_col; int coeff_h_col = (1 - $stride_h * $kernel_w * $height_col) * \ $width_col; int coeff_w_col = (1 - $stride_w * $height_col * $width_col); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; } } data_im[im_offset + index] = val; } } __kernel void im2col(global const float* data_im, global float* data_col, int bot_offset) { if (get_global_id(0) < $global_size) { int index = get_global_id(0); int w_out = index % $width_col; int h_index = index / $width_col; int h_out = h_index % $height_col; int channel_in = h_index / $height_col; int channel_out = channel_in * $kernel_h * $kernel_w; int h_in = h_out * $stride_h - $pad_h; int w_in = w_out * $stride_w - $pad_w; global float* data_col_ptr = data_col; data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out; global const float* data_im_ptr = data_im + bot_offset; data_im_ptr += (channel_in * $height + h_in) * $width + w_in; for (int i = 0; i < $kernel_h; ++i) { for (int j = 0; j < $kernel_w; ++j) { int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ? data_im_ptr[i * $width + j] : 0; data_col_ptr += $height_col * $width_col; } } } } """).substitute(global_size=im2col_global_size, stride_h=stride_h, stride_w=stride_w, pad_h=pad_h, pad_w=pad_w, kernel_h=kernel_h, kernel_w=kernel_w, width=width, height=height, height_col=height_col, width_col=width_col, col2im_global_size=col2im_global_size) program = cl.clCreateProgramWithSource(context, kernels).build() im2col = program['im2col'] im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int) col2im = program['col2im'] col2im.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int) class ConvLauncher(object): def __init__(self, sources, sinks): self.sources = [ast.Name(s, ast.Load()) for s in sources] self.sinks = [ast.Name(s, ast.Load()) for s in sinks] def compile(self): pass def launch(self, symbol_table, wait_for): queue = queues[0] bottom = symbol_table[sources[0]] bot_offset = np.prod(bottom.shape[1:]) top_diff = symbol_table[sources[1]] top_offset = np.prod(top_diff.shape[1:]) weights = symbol_table[sources[2]] bottom_diff = symbol_table[sinks[0]] bottom_diff.fill(0) bottom_diff.sync_ocl() weights_diff = symbol_table[sinks[1]] weights_diff.fill(0) weights_diff.sync_ocl() bias_diff = symbol_table[sinks[2]] bias_diff.fill(0) bias_diff.sync_ocl() for i in range(bottom.shape[0]): n = np.prod(top_diff.shape[2:]) sgemv(False, top_diff.shape[1], n, 1.0, top_diff, i * top_offset, n, bias_multiplier, 0, 1, 1.0, bias_diff, 0, 1) im2col(bottom.ocl_buf, col_data.ocl_buf, i * bot_offset).on(queue, im2col_global_size) m = top_diff.shape[1] n = col_data.shape[0] k = col_data.shape[1] sgemm(False, True, 1.0, top_diff, i * top_offset, k, col_data, 0, k, 1.0, weights_diff, 0, n, m, n, k) m = weights.shape[1] n = col_data.shape[1] k = weights.shape[0] sgemm(True, False, 1.0, weights, 0, m, top_diff, i * top_offset, n, 0.0, col_data, 0, n, m, n, k) col2im(col_data.ocl_buf, bottom_diff.ocl_buf, i * bot_offset).on(queue, col2im_global_size) return ConvLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): bottom = symbol_table[sources[0]] num = bottom.shape[0] channels = bottom.shape[1] scale_shape = list(bottom.shape) scale_shape[1] = 1 scale = hmarray(tuple(scale_shape)) loss = hmarray(scale_shape) spatial_dim = int(np.prod(bottom.shape[2:])) count = np.prod(bottom.shape) kernels = Template(""" // @begin=cl@ __kernel void kernel_copy(global const float* data, global float* out) { if (get_global_id(0) < $count) { int index = get_global_id(0); out[index] = data[index]; } } __kernel void kernel_channel_max(global const float* data, global float* out) { if (get_global_id(0) < $num_times_spatial) { int index = get_global_id(0); int n = index / $spatial_dim; int s = index % $spatial_dim; float maxval = -FLT_MAX; for (int c = 0; c < $channels; ++c) { maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval); } out[index] = maxval; } } __kernel void kernel_channel_subtract(global const float* channel_max, global float* data) { if (get_global_id(0) < $count) { int index = get_global_id(0); int n = index / $channels / $spatial_dim; int s = index % $spatial_dim; data[index] -= channel_max[n * $spatial_dim + s]; } } __kernel void kernel_exp(global const float* data, global float* out) { if (get_global_id(0) < $count) { int index = get_global_id(0); out[index] = exp(data[index]); } } __kernel void kernel_channel_sum(global const float* data, global float* channel_sum) { if (get_global_id(0) < $num_times_spatial) { int index = get_global_id(0); int n = index / $spatial_dim; int s = index % $spatial_dim; float sum = 0; for (int c = 0; c < $channels; ++c) { sum += data[(n * $channels + c) * $spatial_dim + s]; } channel_sum[index] = sum; } } __kernel void kernel_channel_div(global const float* channel_sum, global float* data) { if (get_global_id(0) < $count) { int index = get_global_id(0); int n = index / $channels / $spatial_dim; int s = index % $spatial_dim; data[index] /= channel_sum[n * $spatial_dim + s]; } } __kernel void SoftmaxLossForward(global const float* prob_data, global const float* label, global float* loss) { if (get_global_id(0) < $num_times_spatial) { int index = get_global_id(0); const int n = index / $spatial_dim; const int s = index % $spatial_dim; const int label_value = (int) label[n * $spatial_dim + s]; loss[index] = -log( max(prob_data[n * $dim + label_value * $spatial_dim + s], FLT_MIN)); } } // @end=cl@ """).substitute(count=count, num_times_spatial=num * spatial_dim, channels=channels, spatial_dim=spatial_dim, dim=np.prod(bottom.shape[1:])) program = cl.clCreateProgramWithSource(context, kernels).build() copy_kern = program['kernel_copy'] copy_kern.argtypes = (cl.cl_mem, cl.cl_mem) max_kern = program['kernel_channel_max'] max_kern.argtypes = (cl.cl_mem, cl.cl_mem) sub_kern = program['kernel_channel_subtract'] sub_kern.argtypes = (cl.cl_mem, cl.cl_mem) exp_kern = program['kernel_exp'] exp_kern.argtypes = (cl.cl_mem, cl.cl_mem) sum_kern = program['kernel_channel_sum'] sum_kern.argtypes = (cl.cl_mem, cl.cl_mem) div_kern = program['kernel_channel_div'] div_kern.argtypes = (cl.cl_mem, cl.cl_mem) loss_forward = program['SoftmaxLossForward'] loss_forward.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_mem) class SoftmaxLauncher(object): def compile(self): pass def launch(self, symbol_table): bottom = symbol_table[sources[0]] label = symbol_table[sources[1]] prob = symbol_table[sources[2]] top = symbol_table[sinks[0]] copy_kern(bottom.ocl_buf, prob.ocl_buf).on(queue, (count, )) max_kern(prob.ocl_buf, scale.ocl_buf).on(queue, (num * spatial_dim, )) sub_kern(scale.ocl_buf, prob.ocl_buf).on(queue, (count, )) exp_kern(prob.ocl_buf, prob.ocl_buf).on(queue, (count, )) sum_kern(prob.ocl_buf, scale.ocl_buf).on(queue, (num * spatial_dim,)) div_kern(scale.ocl_buf, prob.ocl_buf).on(queue, (count, )) loss_forward(prob.ocl_buf, label.ocl_buf, loss.ocl_buf).on(queue, (num * spatial_dim, )) loss.sync_host() top[0] = np.sum(loss) / np.float32(num) top.sync_ocl() return SoftmaxLauncher()
def get_launcher(cls, sources, sinks, keywords, symbol_table): kernel_h, kernel_w = keywords['kernel_size'] pad_h, pad_w = keywords['padding'] stride_h, stride_w = keywords['stride'] num, channels, height, width = symbol_table[sources[0].name].shape channels_col = channels * kernel_h * kernel_w # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1 # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1 out_channels, height_col, width_col = symbol_table[ sinks[0].name].shape[1:] is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \ stride_w == 1 and pad_h == 0 and pad_w == 0 if not is_1x1: col_datas = [ hmarray((channels_col, height_col * width_col)) for _ in range(len(queues)) ] bias_multiplier = hmarray( (1, np.prod(symbol_table[sinks[0].name].shape[2:]))) bias_multiplier.fill(1.0) bias_multiplier.sync_ocl() im2col_global_size = channels * height_col * width_col im2col = Template(""" __kernel void im2col(global const float* data_im, global float* data_col, int bot_offset) { if (get_global_id(0) < $global_size) { int index = get_global_id(0); int h_index = index / $width_col; int w_out = index - h_index * $width_col; int channel_in = h_index / $height_col; int h_out = h_index - channel_in * $height_col; int channel_out = channel_in * $kernel_h * $kernel_w; int h_in = h_out * $stride_h - $pad_h; int w_in = w_out * $stride_w - $pad_w; global float* data_col_ptr = data_col; data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out; global const float* data_im_ptr = data_im + bot_offset; data_im_ptr += (channel_in * $height + h_in) * $width + w_in; #pragma unroll for (int i = 0; i < $kernel_h; ++i) { #pragma unroll for (int j = 0; j < $kernel_w; ++j) { int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ? data_im_ptr[i * $width + j] : 0; data_col_ptr += $height_col * $width_col; } } } } """).substitute(global_size=im2col_global_size, stride_h=stride_h, stride_w=stride_w, pad_h=pad_h, pad_w=pad_w, kernel_h=kernel_h, kernel_w=kernel_w, width=width, height=height, height_col=height_col, width_col=width_col) im2col = cl.clCreateProgramWithSource(context, im2col).build()['im2col'] im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int) if im2col_global_size % 64: padded = (im2col_global_size + 63) & (~63) else: padded = im2col_global_size class ConvLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] bot_offset = np.prod(bottom.shape[1:]) weights = symbol_table[sources[1].name] bias = symbol_table[sources[2].name] top = symbol_table[sinks[0].name] top_offset = np.prod(top.shape[1:]) m = weights.shape[0] n = np.prod(top.shape[2:]) k = np.prod(weights.shape[1:]) # cl.clFinish(queues[0]) evts = [] if is_1x1: for i in range(bottom.shape[0]): evt = sgemm(False, False, 1.0, weights, 0, k, bottom, i * bot_offset, n, 0.0, top, i * top_offset, n, m, n, k, queues[i % len(queues)], wait_for=wait_for) evt = sgemm(False, False, 1.0, bias, 0, 1, bias_multiplier, 0, n, 1.0, top, i * top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt) evts.append(evt) else: for i in range(bottom.shape[0]): evt = im2col(bottom.ocl_buf, col_datas[i % len(queues)].ocl_buf, i * bot_offset).on( queues[i % len(queues)], (padded, ), wait_for=wait_for) evt = sgemm(False, False, 1.0, weights, 0, k, col_datas[i % len(queues)], 0, n, 0.0, top, i * top_offset, n, m, n, k, queues[i % len(queues)], wait_for=evt) evt = sgemm(False, False, 1.0, bias, 0, 1, bias_multiplier, 0, n, 1.0, top, i * top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt) evts.append(evt) return evts # for q in queues: # cl.clFinish(q) return ConvLauncher(sources, sinks)
def finalize(self, transform_result, program_config): project = Project(transform_result) arg_config, tuner_config = program_config self.output = self.generate_output(program_config) param_types = [ np.ctypeslib.ndpointer(arg.dtype, arg.ndim, arg.shape) for arg in arg_config + (self.output, ) ] if self.backend == StencilOclTransformer: entry_point = "stencil_control" param_types.append(param_types[0]) entry_type = [c_int32, cl.cl_command_queue, cl.cl_kernel] if self.kernel.is_copied: for _ in range(self.kernel.dim): entry_type.append(cl.cl_kernel) entry_type.extend(cl_mem for _ in range(len(arg_config) + 1)) entry_type = CFUNCTYPE(*entry_type) else: entry_point = "stencil_kernel" param_types.append(POINTER(c_float)) entry_type = CFUNCTYPE(c_int32, *param_types) if self.backend == StencilOclTransformer: concrete_function = OclStencilFunction() if self.kernel.is_copied: args = [ project, entry_type, entry_point, ] kernels = [] for index, kernel in enumerate(project.find_all(OclFile)): # print("XXX index {} kernel {}".format(index, kernel.name)) print("Kernel Codegen\n".format(kernel.codegen())) program = clCreateProgramWithSource( concrete_function.context, kernel.codegen()).build() if index == 0: ocl_kernel_name = 'stencil_kernel' else: ocl_kernel_name = kernel.name kernel_ptr = program[ocl_kernel_name] kernels.append(kernel_ptr) args.append(kernels) args.append(self.output) finalized = concrete_function.finalize(*args) else: kernel = project.find(OclFile) program = clCreateProgramWithSource(concrete_function.context, kernel.codegen()).build() stencil_kernel_ptr = program['stencil_kernel'] finalized = concrete_function.finalize( project, entry_type, entry_point, stencil_kernel_ptr, self.output ) else: concrete_function = ConcreteStencil() finalized = concrete_function.finalize(project, entry_point, entry_type, self.output) self.output = None self.fusable_nodes = [] return finalized concrete_function = ConcreteStencil() return concrete_function.finalize(entry_point, project, entry_type)
def get_launcher(cls, sources, sinks, keywords, symbol_table): bottom = symbol_table[sources[0].name] num = bottom.shape[0] channels = bottom.shape[1] scale_shape = list(bottom.shape) scale_shape[1] = 1 scale = hmarray(tuple(scale_shape)) spatial_dim = int(np.prod(bottom.shape[2:])) count = np.prod(bottom.shape) kernels = Template(""" __kernel void kernel_copy(global const float* data, global float* out) { if (get_global_id(0) < $count) { int index = get_global_id(0); out[index] = data[index]; } } __kernel void kernel_channel_max(global const float* data, global float* out) { if (get_global_id(0) < $num_times_spatial) { int index = get_global_id(0); int n = index / $spatial_dim; int s = index % $spatial_dim; float maxval = -FLT_MAX; for (int c = 0; c < $channels; ++c) { maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval); } out[index] = maxval; } } __kernel void kernel_channel_subtract(global const float* channel_max, global float* data) { if (get_global_id(0) < $count) { int index = get_global_id(0); int n = index / $channels / $spatial_dim; int s = index % $spatial_dim; data[index] -= channel_max[n * $spatial_dim + s]; } } __kernel void kernel_exp(global const float* data, global float* out) { if (get_global_id(0) < $count) { int index = get_global_id(0); out[index] = exp(data[index]); } } __kernel void kernel_channel_sum(global const float* data, global float* channel_sum) { if (get_global_id(0) < $num_times_spatial) { int index = get_global_id(0); int n = index / $spatial_dim; int s = index % $spatial_dim; float sum = 0; for (int c = 0; c < $channels; ++c) { sum += data[(n * $channels + c) * $spatial_dim + s]; } channel_sum[index] = sum; } } __kernel void kernel_channel_div(global const float* channel_sum, global float* data) { if (get_global_id(0) < $count) { int index = get_global_id(0); int n = index / $channels / $spatial_dim; int s = index % $spatial_dim; data[index] /= channel_sum[n * $spatial_dim + s]; } } """).substitute(count=count, num_times_spatial=num * spatial_dim, channels=channels, spatial_dim=spatial_dim, dim=np.prod(bottom.shape[1:])) program = cl.clCreateProgramWithSource(context, kernels).build() copy_kern = program['kernel_copy'] copy_kern.argtypes = (cl.cl_mem, cl.cl_mem) max_kern = program['kernel_channel_max'] max_kern.argtypes = (cl.cl_mem, cl.cl_mem) sub_kern = program['kernel_channel_subtract'] sub_kern.argtypes = (cl.cl_mem, cl.cl_mem) exp_kern = program['kernel_exp'] exp_kern.argtypes = (cl.cl_mem, cl.cl_mem) sum_kern = program['kernel_channel_sum'] sum_kern.argtypes = (cl.cl_mem, cl.cl_mem) div_kern = program['kernel_channel_div'] div_kern.argtypes = (cl.cl_mem, cl.cl_mem) class SoftmaxLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sources def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] top = symbol_table[sinks[0].name] if count % 16: padded_count = (count + 15) & (~15) else: padded_count = count num_times_spatial = num * spatial_dim if num_times_spatial % 16: padded_num_times_spatial = (num_times_spatial + 15) & (~15) else: padded_num_times_spatial = num_times_spatial evt = copy_kern(bottom.ocl_buf, top.ocl_buf).on( queue, (padded_count,), wait_for=wait_for) evt = max_kern(top.ocl_buf, scale.ocl_buf).on( queue, (padded_num_times_spatial, ), wait_for=evt) evt = sub_kern(scale.ocl_buf, top.ocl_buf).on( queue, (padded_count, ), wait_for=evt) evt = exp_kern(top.ocl_buf, top.ocl_buf).on( queue, (padded_count, ), wait_for=evt) evt = sum_kern(top.ocl_buf, scale.ocl_buf).on( queue, (padded_num_times_spatial, ), wait_for=evt) evt = div_kern(scale.ocl_buf, top.ocl_buf).on( queue, (padded_count, ), wait_for=evt) return [evt] return SoftmaxLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): kernel_h, kernel_w = keywords['kernel_size'] pad_h, pad_w = keywords['padding'] stride_h, stride_w = keywords['stride'] num, channels, height, width = symbol_table[sources[0].name].shape channels_col = channels * kernel_h * kernel_w # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1 # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1 out_channels, height_col, width_col = symbol_table[sinks[0].name].shape[1:] is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \ stride_w == 1 and pad_h == 0 and pad_w == 0 if not is_1x1: col_datas = [hmarray((channels_col, height_col * width_col)) for _ in range(len(queues))] bias_multiplier = hmarray( (1, np.prod(symbol_table[sinks[0].name].shape[2:]))) bias_multiplier.fill(1.0) bias_multiplier.sync_ocl() im2col_global_size = channels * height_col * width_col im2col = Template(""" __kernel void im2col(global const float* data_im, global float* data_col, int bot_offset) { if (get_global_id(0) < $global_size) { int index = get_global_id(0); int h_index = index / $width_col; int w_out = index - h_index * $width_col; int channel_in = h_index / $height_col; int h_out = h_index - channel_in * $height_col; int channel_out = channel_in * $kernel_h * $kernel_w; int h_in = h_out * $stride_h - $pad_h; int w_in = w_out * $stride_w - $pad_w; global float* data_col_ptr = data_col; data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out; global const float* data_im_ptr = data_im + bot_offset; data_im_ptr += (channel_in * $height + h_in) * $width + w_in; #pragma unroll for (int i = 0; i < $kernel_h; ++i) { #pragma unroll for (int j = 0; j < $kernel_w; ++j) { int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ? data_im_ptr[i * $width + j] : 0; data_col_ptr += $height_col * $width_col; } } } } """).substitute(global_size=im2col_global_size, stride_h=stride_h, stride_w=stride_w, pad_h=pad_h, pad_w=pad_w, kernel_h=kernel_h, kernel_w=kernel_w, width=width, height=height, height_col=height_col, width_col=width_col) im2col = cl.clCreateProgramWithSource( context, im2col ).build()['im2col'] im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int) if im2col_global_size % 64: padded = (im2col_global_size + 63) & (~63) else: padded = im2col_global_size class ConvLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] bot_offset = np.prod(bottom.shape[1:]) weights = symbol_table[sources[1].name] bias = symbol_table[sources[2].name] top = symbol_table[sinks[0].name] top_offset = np.prod(top.shape[1:]) m = weights.shape[0] n = np.prod(top.shape[2:]) k = np.prod(weights.shape[1:]) # cl.clFinish(queues[0]) evts = [] if is_1x1: for i in range(bottom.shape[0]): evt = sgemm(False, False, 1.0, weights, 0, k, bottom, i * bot_offset, n, 0.0, top, i * top_offset, n, m, n, k, queues[i % len(queues)], wait_for=wait_for) evt = sgemm(False, False, 1.0, bias, 0, 1, bias_multiplier, 0, n, 1.0, top, i * top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt) evts.append(evt) else: for i in range(bottom.shape[0]): evt = im2col(bottom.ocl_buf, col_datas[i % len(queues)].ocl_buf, i * bot_offset ).on(queues[i % len(queues)], (padded, ), wait_for=wait_for) evt = sgemm(False, False, 1.0, weights, 0, k, col_datas[i % len(queues)], 0, n, 0.0, top, i * top_offset, n, m, n, k, queues[i % len(queues)], wait_for=evt) evt = sgemm(False, False, 1.0, bias, 0, 1, bias_multiplier, 0, n, 1.0, top, i * top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt) evts.append(evt) return evts # for q in queues: # cl.clFinish(q) return ConvLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): bottom = symbol_table[sources[0].name] num = bottom.shape[0] channels = bottom.shape[1] scale_shape = list(bottom.shape) scale_shape[1] = 1 scale = hmarray(tuple(scale_shape)) spatial_dim = int(np.prod(bottom.shape[2:])) count = np.prod(bottom.shape) kernels = Template(""" __kernel void kernel_copy(global const float* data, global float* out) { if (get_global_id(0) < $count) { int index = get_global_id(0); out[index] = data[index]; } } __kernel void kernel_channel_max(global const float* data, global float* out) { if (get_global_id(0) < $num_times_spatial) { int index = get_global_id(0); int n = index / $spatial_dim; int s = index % $spatial_dim; float maxval = -FLT_MAX; for (int c = 0; c < $channels; ++c) { maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval); } out[index] = maxval; } } __kernel void kernel_channel_subtract(global const float* channel_max, global float* data) { if (get_global_id(0) < $count) { int index = get_global_id(0); int n = index / $channels / $spatial_dim; int s = index % $spatial_dim; data[index] -= channel_max[n * $spatial_dim + s]; } } __kernel void kernel_exp(global const float* data, global float* out) { if (get_global_id(0) < $count) { int index = get_global_id(0); out[index] = exp(data[index]); } } __kernel void kernel_channel_sum(global const float* data, global float* channel_sum) { if (get_global_id(0) < $num_times_spatial) { int index = get_global_id(0); int n = index / $spatial_dim; int s = index % $spatial_dim; float sum = 0; for (int c = 0; c < $channels; ++c) { sum += data[(n * $channels + c) * $spatial_dim + s]; } channel_sum[index] = sum; } } __kernel void kernel_channel_div(global const float* channel_sum, global float* data) { if (get_global_id(0) < $count) { int index = get_global_id(0); int n = index / $channels / $spatial_dim; int s = index % $spatial_dim; data[index] /= channel_sum[n * $spatial_dim + s]; } } """).substitute(count=count, num_times_spatial=num * spatial_dim, channels=channels, spatial_dim=spatial_dim, dim=np.prod(bottom.shape[1:])) program = cl.clCreateProgramWithSource(context, kernels).build() copy_kern = program['kernel_copy'] copy_kern.argtypes = (cl.cl_mem, cl.cl_mem) max_kern = program['kernel_channel_max'] max_kern.argtypes = (cl.cl_mem, cl.cl_mem) sub_kern = program['kernel_channel_subtract'] sub_kern.argtypes = (cl.cl_mem, cl.cl_mem) exp_kern = program['kernel_exp'] exp_kern.argtypes = (cl.cl_mem, cl.cl_mem) sum_kern = program['kernel_channel_sum'] sum_kern.argtypes = (cl.cl_mem, cl.cl_mem) div_kern = program['kernel_channel_div'] div_kern.argtypes = (cl.cl_mem, cl.cl_mem) class SoftmaxLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sources def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] top = symbol_table[sinks[0].name] if count % 16: padded_count = (count + 15) & (~15) else: padded_count = count num_times_spatial = num * spatial_dim if num_times_spatial % 16: padded_num_times_spatial = (num_times_spatial + 15) & (~15) else: padded_num_times_spatial = num_times_spatial evt = copy_kern(bottom.ocl_buf, top.ocl_buf).on(queue, (padded_count, ), wait_for=wait_for) evt = max_kern(top.ocl_buf, scale.ocl_buf).on( queue, (padded_num_times_spatial, ), wait_for=evt) evt = sub_kern(scale.ocl_buf, top.ocl_buf).on(queue, (padded_count, ), wait_for=evt) evt = exp_kern(top.ocl_buf, top.ocl_buf).on(queue, (padded_count, ), wait_for=evt) evt = sum_kern(top.ocl_buf, scale.ocl_buf).on( queue, (padded_num_times_spatial, ), wait_for=evt) evt = div_kern(scale.ocl_buf, top.ocl_buf).on(queue, (padded_count, ), wait_for=evt) return [evt] return SoftmaxLauncher(sources, sinks)
def transform(self, tree, program_config): call_args = program_config[0] base_size = call_args.base_shape[0] * call_args.base_shape[1] border = call_args.border c_float_type = c_float c_int_type = c_int transformer = PyBasicConversions() output = unique_name() init_entry_point = unique_kernel_name() init_params = [ SymbolRef('input', POINTER(c_float_type)(), _global=True, _const=True), SymbolRef(output, POINTER(c_float_type)(), _global=True), ] init_defn = [] init_defn.extend([ Assign(SymbolRef('x', c_int()), get_global_id(0)), Assign(SymbolRef('y', c_int()), get_global_id(1)), ]) body = """{output}[y * {len_x} + x] = input[y * {len_x} + x]""".format( output=output, len_x=call_args.base_shape[0] ) print(body) tree_body = ast.parse(body).body init_defn.extend(tree_body) init_tree = FunctionDecl(None, init_entry_point, init_params, init_defn) init_tree.set_kernel() init_kernel = OclFile('kernel', [init_tree]) init_kernel = transformer.visit(init_kernel) print("init kernel codegen") print(init_kernel.codegen()) compute_entry_point = unique_kernel_name() compute_params = [ SymbolRef(output, POINTER(c_float_type)(), _global=True), SymbolRef('power', c_int(), _const=True), ] compute_defn = [] compute_defn.extend([ Assign(SymbolRef('x', c_int()), get_global_id(0)), Assign(SymbolRef('y', c_int()), get_global_id(1)), ]) body = """{matrix}[(power+1) * {base_size} + y * {len_x} + x] = 0.1 * {matrix}[ power * {base_size} + clamp(y-1, {border}, {len_y}-{border}-1) * {len_x} + clamp(x, {border}, {len_x}-{border}-1) ] + 0.1 * {matrix}[ power * {base_size} + clamp(y+1, {border}, {len_y}-{border}-1) * {len_x} + clamp(x, {border}, {len_x}-{border}-1) ] + 0.4 * {matrix}[ power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} + clamp(x-1, {border}, {len_x}-{border}-1) ] + 0.4 * {matrix}[ power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} + clamp(x+1, {border}, {len_x}-{border}-1) ] + 1.0 * {matrix}[ power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} + clamp(x, {border}, {len_x}-{border}-1) ] """.format( matrix=output, base_size=base_size, len_y=call_args.base_shape[0], len_x=call_args.base_shape[1], border=border, ) body = re.sub("""\s\s*""", " ", body) print(body) tree_body = ast.parse(body).body compute_defn.extend(tree_body) compute_tree = FunctionDecl(None, compute_entry_point, compute_params, compute_defn) compute_tree.set_kernel() compute_kernel = OclFile('kernel', [compute_tree]) compute_kernel = transformer.visit(compute_kernel) print("compute kernel codegen") print(compute_kernel.codegen()) fn = OclMatrixPowers() init_program = clCreateProgramWithSource(fn.context, init_kernel.codegen()).build() init_ptr = init_program[init_entry_point] compute_program = clCreateProgramWithSource(fn.context, compute_kernel.codegen()).build() compute_ptr = compute_program[compute_entry_point] return fn.finalize(init_ptr, compute_ptr, (call_args.base_shape[1], call_args.base_shape[0]))
def transform(self, tree, program_config): A = program_config[0] len_A = np.prod(A.shape) inner_type = A.dtype.type() pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape) apply_one = PyBasicConversions().visit(tree.body[0]) apply_one.return_type = inner_type apply_one.params[0].type = inner_type apply_one.params[1].type = inner_type apply_kernel = FunctionDecl(None, "apply_kernel", params=[SymbolRef("A", pointer()).set_global(), SymbolRef("output_buf", pointer()).set_global(), SymbolRef("len", ct.c_int()) ], defn=[ Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)), # getting the group id for this work group Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)), # getting the global id for this work item Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)), # getting the local id for this work item For(Assign(SymbolRef('i', ct.c_int()), Constant(1)), # for(int i=1; i<WORK_GROUP_SIZE; i *= 2) Lt(SymbolRef('i'), Constant(WORK_GROUP_SIZE)), MulAssign(SymbolRef('i'), Constant(2)), [ If(And(Eq(Mod(SymbolRef('globalId'), Mul(SymbolRef('i'), Constant(2))), # if statement checks Constant(0)), Lt(Add(SymbolRef('globalId'), SymbolRef('i')), SymbolRef("len"))), [ Assign(ArrayRef(SymbolRef('A'), SymbolRef('globalId')), FunctionCall(SymbolRef('apply'), [ ArrayRef(SymbolRef('A'), SymbolRef('globalId')), ArrayRef(SymbolRef('A'), Add(SymbolRef('globalId'), SymbolRef('i'))) ])), ] ), FunctionCall(SymbolRef('barrier'), [SymbolRef('CLK_LOCAL_MEM_FENCE')]) ] ), If(Eq(SymbolRef('localId'), Constant(0)), [ Assign(ArrayRef(SymbolRef('output_buf'), SymbolRef('groupId')), ArrayRef(SymbolRef('A'), SymbolRef('globalId'))) ] ) ] ).set_kernel() kernel = OclFile("kernel", [apply_one, apply_kernel]) control = StringTemplate(r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <stdio.h> void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) { size_t global = $n; size_t local = $local; intptr_t len = $length; cl_mem swap; for (int runs = 0; runs < $run_limit ; runs++){ clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf); clSetKernelArg(kernel, 2, sizeof(intptr_t), &len); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); swap = buf; buf = out_buf; out_buf = swap; len = len/local + (len % local != 0); } } """, {'local': Constant(WORK_GROUP_SIZE), 'n': Constant(len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE)), 'length': Constant(len_A), 'run_limit': Constant(ceil(log(len_A, WORK_GROUP_SIZE))) }) proj = Project([kernel, CFile("generated", [control])]) fn = ConcreteXorReduction() program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build() apply_kernel_ptr = program['apply_kernel'] entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem) return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def get_launcher(cls, sources, sinks, keywords, symbol_table): num, channels, height, width = symbol_table[sources[0].name].shape local_size = keywords['local_size'] alpha = keywords['alpha'] k = keywords['k'] beta = keywords['beta'] compute_global = (num * channels * height * width, ) fill_global = (num * height * width, ) kernel = Template(""" // @begin=cl@ __kernel void LRNFillScale(global const float* in, global float* scale) { if (get_global_id(0) < $fill_global) { int index = get_global_id(0); int w = index % $width; int h = (index / $width) % $height; int n = index / $width / $height; int offset = (n * $channels * $height + h) * $width + w; int step = $height * $width; in += offset; scale += offset; int head = 0; int pre_pad = ($local_size - 1) / 2; int post_pad = $local_size - pre_pad - 1; float accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < $channels) { accum_scale += in[head * step] * in[head * step]; ++head; } // both add and subtract while (head < $channels) { accum_scale += in[head * step] * in[head * step]; if (head - $local_size >= 0) { accum_scale -= in[(head - $local_size) * step] * \ in[(head - $local_size) * step]; } scale[(head - post_pad) * step] = $k + accum_scale * $alpha_over_size; ++head; } // subtract only while (head < $channels + post_pad) { if (head - $local_size >= 0) { accum_scale -= in[(head - $local_size) * step] * \ in[(head - $local_size) * step]; } scale[(head - post_pad) * step] = $k + accum_scale * $alpha_over_size; ++head; } } } __kernel void LRNComputeOutput(global const float* in, global const float* scale, global float* out) { if (get_global_id(0) < $compute_global) { int index = get_global_id(0); out[index] = in[index] * pow(scale[index], (float)$negative_beta); } } // @end=cl@ """).substitute(width=width, height=height, channels=channels, local_size=local_size, alpha_over_size=float(alpha) / local_size, k=k, negative_beta=-beta, fill_global=fill_global[0], compute_global=compute_global[0]) program = cl.clCreateProgramWithSource( context, kernel ).build() fill_kern = program['LRNFillScale'] fill_kern.argtypes = (cl.cl_mem, cl.cl_mem) compute_kern = program['LRNComputeOutput'] compute_kern.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_mem) class LrnLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] top = symbol_table[sinks[0].name] scale = symbol_table[sinks[1].name] if fill_global[0] % 16: padded = (fill_global[0] + 15) & (~15) else: padded = fill_global[0] evt = fill_kern(bottom.ocl_buf, scale.ocl_buf).on(queue, (padded,), wait_for=wait_for) if compute_global[0] % 16: padded = (compute_global[0] + 15) & (~15) else: padded = compute_global[0] evt = compute_kern(bottom.ocl_buf, scale.ocl_buf, top.ocl_buf).on(queue, (padded,), wait_for=evt) return [evt] return LrnLauncher(sources, sinks)