def transform(self, tree, program_cfg):
    """Run the conversion passes over ``tree`` and return the renamed result.

    :param tree: the AST to transform.
    :param program_cfg: a two-element sequence ``(arg_cfg, tune_cfg)``; only
        ``arg_cfg`` is used here (it is handed to ``Backend``).
    :return: the tree after ``PyBasicConversions``, ``Backend`` and
        ``ConstantFold``, with its ``name`` taken from the first body element
        of ``self.original_tree``.
    """
    arg_cfg, _tune_cfg = program_cfg
    converted = PyBasicConversions().visit(tree)
    lowered = Backend(arg_cfg, self.symbol_table).visit(converted)
    folded = ConstantFold().visit(lowered)
    folded.name = self.original_tree.body[0].name
    return folded
def transform(self, tree, program_config):
    """Convert the first statement of ``tree`` and annotate its types.

    :param tree: AST whose ``body[0]`` is the function to convert.
    :param program_config: indexable; ``program_config[0]['arg_typesig']``
        supplies a sequence of type factories.
    :return: a one-element list holding a ``CFile`` named after the function.
    """
    type_factories = program_config[0]['arg_typesig']
    func = PyBasicConversions().visit(tree.body[0])
    # The first entry of the signature is instantiated for the return type
    # and is also the first entry zipped against the parameters below.
    func.return_type = type_factories[0]()
    for parameter, factory in zip(func.params, type_factories):
        parameter.type = factory()
    c_file = CFile(func.name, [func])
    return [c_file]
def transform(self, tree, program_config):
    """Convert ``tree`` and type the function named ``apply``.

    :param tree: the AST to convert.
    :param program_config: object whose ``args_subconfig['arg_type']`` is a
        type factory, instantiated once for the return type and once for the
        first parameter.
    :return: a one-element list holding a ``CFile`` called ``"generated"``.
    """
    converted = PyBasicConversions().visit(tree)
    apply_fn = converted.find(FunctionDecl, name="apply")
    make_type = program_config.args_subconfig['arg_type']
    apply_fn.return_type = make_type()
    apply_fn.params[0].type = make_type()
    return [CFile("generated", [converted])]
def visit_Lambda(self, node):
    """Lift a lambda into a ``CppDefine`` macro and return a reference to it.

    The macro is named ``LAMBDA_<n>`` where ``n`` is the counter value read
    through ``self``; the counter is then incremented on ``LambdaLifter``
    itself. The macro is appended to ``self.lifted_functions``.

    :param node: the lambda node being visited.
    :return: a ``SymbolRef`` naming the generated macro.
    """
    self.generic_visit(node)
    macro_name = "LAMBDA_{}".format(self.lambda_counter)
    LambdaLifter.lambda_counter += 1

    converted = PyBasicConversions().visit(node)
    converted.name = macro_name
    # The macro body is the value of the first statement of the converted
    # definition.
    macro_body = converted.defn[0].value
    self.lifted_functions.append(
        CppDefine(macro_name, converted.params, macro_body)
    )
    return SymbolRef(macro_name)
def transform(self, tree, program_cfg):
    """Inline the environment into ``tree`` and convert it.

    :param tree: the AST to transform.
    :param program_cfg: a two-element sequence ``(arg_cfg, tune_cfg)``;
        neither element is used beyond the unpacking.
    :return: a one-element list holding the converted tree. Its body starts
        with an ``#include <math.h>`` template, followed by the files
        collected by the inliner, then the converted body.
    """
    _arg_cfg, _tune_cfg = program_cfg
    env_inliner = InlineEnvironment(self.symbol_table)
    inlined = env_inliner.visit(tree)
    converted = PyBasicConversions().visit(inlined)
    # Files collected by the inliner go ahead of the converted body.
    converted.body = env_inliner.files + converted.body
    converted.name = self.original_tree.body[0].name
    converted.body.insert(0, StringTemplate("#include <math.h>"))
    return [converted]
def transform(self, tree, program_config):
    """Convert the first statement of ``tree`` into a typed C function.

    :param tree: AST whose ``body[0]`` is the function to convert.
    :param program_config: a two-element sequence
        ``(args_subconfig, tuning_config)``; only ``args_subconfig`` is used.
    :return: a ``CFile`` whose body is the converted function (note: a bare
        ``CFile``, not a list).
    """
    args_subconfig, _tuning_config = program_config
    kernel = PyBasicConversions().visit(tree.body[0])
    # Drop the first statement of the converted body.
    # NOTE(review): presumably a docstring or header statement -- confirm.
    kernel.defn = kernel.defn[1:]
    # Both leading parameters share the type derived from the args subconfig.
    kernel.params[0].type = ctree.types.get_ctype(args_subconfig)
    kernel.params[1].type = kernel.params[0].type
    # Extra pointer-to-int32 parameter named 'arr'.
    int32_ptr = ctypes.POINTER(ctypes.c_int32)()
    kernel.params.append(SymbolRef('arr', int32_ptr))
    return CFile(body=[kernel])
def transform(self, tree, program_config):
    """Turn ``tree.body[0]`` into a C function wrapped in a ``CFile``.

    :param tree: AST whose first body element is the function to convert.
    :param program_config: ``(args_subconfig, tuning_config)``; the tuning
        part is unpacked but unused.
    :return: ``CFile(body=[converted_function])``.
    """
    args_subconfig, _unused_tuning = program_config
    py_function = tree.body[0]
    c_function = PyBasicConversions().visit(py_function)

    # Everything after the first converted statement is kept.
    # NOTE(review): the reason the first statement is skipped is not visible
    # here -- confirm against the kernels this is used with.
    c_function.defn = c_function.defn[1:]

    first_param = c_function.params[0]
    first_param.type = ctree.types.get_ctype(args_subconfig)
    # The second parameter reuses the first parameter's type.
    c_function.params[1].type = first_param.type

    arr_param = SymbolRef('arr', ctypes.POINTER(ctypes.c_int32)())
    c_function.params.append(arr_param)
    return CFile(body=[c_function])
def transform(self, tree, program_config):
    """Build an OpenCL kernel file from an embedded Python snippet.

    The incoming ``tree`` is not used; the kernel body is parsed from the
    snippet below. The kernel reads the ``input`` buffer and writes a buffer
    with a freshly generated unique name. A new kernel name is stored in
    ``self.entry_point``.

    :param tree: ignored.
    :param program_config: indexable; ``program_config[0][0][1]`` supplies
        ``(len_x, len_y)``.
    :return: the ``OclFile`` after ``PyBasicConversions`` has visited it.
    """
    arg_cfg = program_config[0]
    self.entry_point = unique_kernel_name()
    ctype = c_float
    len_x = arg_cfg[0][1][0]
    len_y = arg_cfg[0][1][1]
    output = unique_name()

    params = [
        SymbolRef("input", POINTER(ctype)(), _global=True, _const=True),
        SymbolRef(output, POINTER(ctype)(), _global=True),
    ]
    defn = [
        Assign(SymbolRef('x', c_int()), get_global_id(0)),
        Assign(SymbolRef('y', c_int()), get_global_id(1)),
        Assign(SymbolRef('temp', ctype()), Constant(0)),
    ]

    # The snippet must be real multi-line Python: ``ast.parse`` rejects a
    # leading indent and cannot take ``if ...: ... else: ...`` on one line.
    # Each statement therefore ends with an explicit newline.
    kernel_source = (
        "temp = .5 * input[clamp(x/2, 0, (len_x / 2) - 1) * len_y + "
        "clamp(y/2, 0, (len_y / 2) - 1)]\n"
        "if (x & 0x1):\n"
        "    temp += .25 * input[clamp(x/2 + 1, 0, (len_x / 2) - 1) * len_y + "
        "clamp(y/2, 0, (len_y / 2) - 1)]\n"
        "else:\n"
        "    temp += .25 * input[clamp(x/2 - 1, 0, (len_x / 2) - 1) * len_y + "
        "clamp(y/2, 0, (len_y / 2) - 1)]\n"
        "if (y & 0x1):\n"
        "    temp += .25 * input[clamp(x/2, 0, (len_x / 2) - 1) * len_y + "
        "clamp(y/2 + 1, 0, (len_y / 2) - 1)]\n"
        "else:\n"
        "    temp += .25 * input[clamp(x/2, 0, (len_x / 2) - 1) *len_y + "
        "clamp(y/2 - 1, 0, (len_y / 2) - 1)]\n"
        "output[x * len_y + y] = temp\n"
    )
    defn.extend(ast.parse(kernel_source).body)

    # NOTE(review): presumably the first dict renames identifiers and the
    # second substitutes constants during conversion -- confirm against
    # PyBasicConversions.
    name_dict = {'output': output}
    const_dict = {'len_x': len_x, 'len_y': len_y}
    transformation = PyBasicConversions(name_dict, const_dict)

    function = FunctionDecl(None, self.entry_point, params, defn)
    function.set_kernel()
    kernel = OclFile("kernel", [function])
    return transformation.visit(kernel)
def test_fib(self):
    # Converting ``fib`` and filling declarations must yield the C text
    # below; the comparison ignores every space and newline.
    fib_def = get_ast(fib).body[0]
    converted = PyBasicConversions().visit(fib_def)
    filled_ast = DeclarationFiller().visit(converted)
    print(filled_ast)

    expected = (
        ' void fib(n) {'
        ' double a = 0.0;'
        ' double b = 1.0;'
        ' char* k = "hello";'
        ' while (n > 0) {'
        ' double ____temp__a = b;'
        ' double ____temp__b = a + b;'
        ' a = ____temp__a;'
        ' b = ____temp__b;'
        ' n -= 1;'
        ' }'
        ' return a;'
        ' }'
    )

    def squeeze(text):
        # Remove spaces first, then newlines, as the comparison requires.
        return text.replace(" ", "").replace("\n", "")

    self.assertEqual(squeeze(str(filled_ast)), squeeze(expected))
def transform(self, tree, program_config):
    """Convert the Python AST to a C AST.

    :param tree: the Python AST of the stencil kernel.
    :param program_config: ``program_config[0]`` is iterated to build one
        ``NdPointer`` per argument (from elements 1..3 of each entry);
        ``program_config[1]`` is the tuning config, read for
        ``'unroll_factor'``.
    :return: ``(tree, ctype)`` where ``ctype`` is the C function type of the
        ``kernel`` entry point.
    """
    param_types = [
        NdPointer(arg[1], arg[2], arg[3]) for arg in program_config[0]
    ]
    kernel_sig = FuncType(Void(), param_types)

    tune_cfg = program_config[1]
    # The tuning value is an exponent: the unroll factor is 2 ** value.
    unroll_factor = 2 ** tune_cfg['unroll_factor']

    passes = [
        StencilTransformer(self.input_grids, self.output_grid, self.constants),
        PyBasicConversions(),
    ]
    for conversion in passes:
        tree = conversion.visit(tree)

    outer_loop = tree.find(For)
    innermost_loop = FindInnerMostLoop().find(outer_loop)
    self.unroll(innermost_loop, unroll_factor)

    # Remove the leading ``self`` parameter.
    # TODO: Better way to do this?
    kernel_params = tree.find(FunctionDecl, name="kernel").params
    kernel_params.pop(0)
    self.gen_array_macro_definition(tree, kernel_params)

    entry_point = tree.find(FunctionDecl, name="kernel")
    entry_point.set_typesig(kernel_sig)
    return tree, entry_point.get_type().as_ctype()
def transform(self, py_ast, program_cfg):
    """Wrap the converted function body in an element-wise ``for`` loop.

    :param py_ast: the Python AST to convert.
    :param program_cfg: ``(arg_cfg, tune_cfg)``; ``arg_cfg`` is a sequence of
        type factories exposing ``_dtype_`` and ``_shape_``.
    :return: a one-element list holding the generated ``CFile``.
    """
    arg_cfg, _tune_cfg = program_cfg
    tree = PyBasicConversions().visit(py_ast)
    kernel = tree.body[0]

    # The output is passed as an extra trailing parameter named ``retval``.
    kernel.params.append(C.SymbolRef("retval", arg_cfg[0]()))

    # Annotate arguments; ``zip`` stops at the shorter of the two sequences.
    param_dict = {}
    for param, make_type in zip(kernel.params, arg_cfg):
        param.type = make_type()
        param_dict[param.name] = make_type._dtype_

    length = np.prod(arg_cfg[0]._shape_)
    transformer = MapTransformer("i", param_dict, "retval")
    loop_body = [transformer.visit(stmt) for stmt in kernel.defn]

    init = C.Assign(C.SymbolRef("i", ct.c_int()), C.Constant(0))
    test = C.Lt(C.SymbolRef("i"), C.Constant(length))
    step = C.PostInc(C.SymbolRef("i"))
    kernel.defn = [C.For(init, test, step, body=loop_body, pragma="ivdep")]

    tree = DeclarationFiller().visit(tree)
    # ``defns`` is handed to the vectorizer and spliced into the file below.
    defns = []
    tree = HwachaVectorize(param_dict, defns).visit(tree)

    headers = [
        StringTemplate("#include <stdlib.h>"),
        StringTemplate("#include <stdint.h>"),
        StringTemplate("#include <assert.h>"),
        StringTemplate('extern "C" void __hwacha_body(void);'),
    ]
    return [CFile("generated", headers + defns + [tree])]
def test_multiple_assign_constant(self):
    # ``x, y = 1, 2`` must be rendered as two separate C assignments.
    targets = ast.Tuple(
        elts=(ast.Name(id="x", ctx=None), ast.Name(id="y", ctx=None))
    )
    values = ast.Tuple(elts=(Constant(1), Constant(2)))
    assignment = ast.Assign([targets], values)
    result = PyBasicConversions().visit(assignment)
    self.assertEqual(str(result), "\nx = 1;\ny = 2;\n")
def mini_transform(self, node):
    """
    Stand-in for a specializer's transform() method: the bare minimum a
    specializer writer has to do.

    The node is converted with PyBasicConversions, renamed to ``apply``, and
    its return type and every parameter are typed as ``c_float``.

    :param node: the node to transform
    :return: the converted node, a rough C-AST.
    """
    c_node = PyBasicConversions().visit(node)
    c_node.name = "apply"
    c_node.return_type = ct.c_float()
    for parameter in c_node.params:
        # Each parameter gets its own c_float instance.
        parameter.type = ct.c_float()
    return c_node
def eval_with_loop(self, elts):
    """Evaluate each element to a constant and return the values as a tuple.

    Each element is deep-copied, has its loop variables replaced by
    constants, is converted and constant-folded; the ``value`` attribute of
    the folded node is collected.

    :param elts: iterable of AST nodes; the originals are not modified here
        because a deep copy is processed.
    :return: tuple of the folded values, in input order.
    """
    def fold(element):
        substituted = self.replace_loopvars_as_constants(
            copy.deepcopy(element)
        )
        converted = PyBasicConversions().visit(substituted)
        return ConstantFold().visit(converted).value

    return tuple(fold(element) for element in elts)
def emit(cls, sources, sinks, keywords, symbol_table):
    """Render the body of ``cls.fn`` as semicolon-terminated C statements.

    Parameters of the converted function are mapped, by position, onto
    ``sources``; the first sink is passed to ``MapTransformer`` alongside
    that mapping. ``keywords`` and ``symbol_table`` are not used.

    :return: the transformed statements, one per line, each followed by ";".
    """
    func = PyBasicConversions().visit(get_ast(cls.fn)).body[0]
    statements = func.defn
    mapping = {}
    for arg, source in zip(func.params, sources):
        mapping[arg.name] = source
    visitor = MapTransformer(mapping, sinks[0])
    rewritten = [visitor.visit(stmt) for stmt in statements]
    return "\n".join(str(stmt) + ";" for stmt in rewritten)
def visit_AugAssign(self, node):
    """Translate ``+=`` / ``-=`` into ``AddAssign`` / ``SubAssign`` nodes.

    :param node: the augmented-assignment node.
    :return: the matching C assignment node, or ``None`` for any other
        operator.
    """
    # TODO: Handle all types?
    visited_value = self.visit(node.value)
    # HACK to get this to work, PyBasicConversions will skip this AugAssign
    # node.
    # TODO Figure out why
    value = PyBasicConversions().visit(visited_value)

    op_kind = type(node.op)
    if op_kind is ast.Add:
        make_assign = AddAssign
    elif op_kind is ast.Sub:
        make_assign = SubAssign
    else:
        # NOTE(review): unsupported operators fall through and yield None;
        # confirm that silently producing None is intended.
        return None
    return make_assign(self.visit(node.target), value)
def transform(self, py_ast, program_config):
    """Generate an OpenMP C file applying ``apply`` to an array and a scalar.

    The first statement of ``py_ast`` is converted into a C function named
    ``apply``; a driver function named ``FUNC_NAME`` loops over the array.

    :param py_ast: AST whose ``body[0]`` is the element-wise function.
    :param program_config: ``program_config[0]`` describes the input; its
        ``size``, ``dtype``, ``ndim``, ``shape`` and ``scalar_type`` are read.
    :return: a one-element list holding the generated ``CFile``.
    """
    # Get the initial data
    input_data = program_config[0]
    length = np.prod(input_data.size)
    pointer = np.ctypeslib.ndpointer(
        input_data.dtype, input_data.ndim, input_data.shape)
    data_type = get_c_type_from_numpy_dtype(input_data.dtype)()
    scalar_data_type = get_c_type_from_numpy_dtype(
        np.dtype(input_data.scalar_type))()

    apply_one = PyBasicConversions().visit(py_ast.body[0])
    apply_one.name = 'apply'
    apply_one.params[0].type = data_type
    apply_one.params[1].type = scalar_data_type
    apply_one.return_type = data_type
    # TODO: figure out which data type to actually preserve
    # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)

    # The pragma has to sit on its own line: a preprocessing directive runs
    # to the end of the line and would otherwise swallow the ``for``.
    array_add_template = StringTemplate(r"""
#pragma omp parallel for
for (int i = 0; i < $length; i++) {
    output[i] = apply(arr[i], scalar);
}
""", {
        'length': Constant(length)
    })

    driver = FunctionDecl(
        None, FUNC_NAME,
        params=[
            SymbolRef("arr", pointer()),
            SymbolRef("scalar", scalar_data_type),
            SymbolRef("output", pointer())
        ],
        defn=[array_add_template])
    array_op = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        apply_one,
        driver,
    ], 'omp')
    return [array_op]
def transform(self, py_ast, program_config):
    """Build the OpenMP C file for an array-with-scalar element-wise op.

    ``py_ast.body[0]`` becomes the C function ``apply``; a second function,
    named ``FUNC_NAME``, calls it for every array element.

    :param py_ast: AST whose first body element is the element-wise function.
    :param program_config: ``program_config[0]`` provides ``size``,
        ``dtype``, ``ndim``, ``shape`` and ``scalar_type``.
    :return: ``[CFile]`` containing the includes, ``apply`` and the driver.
    """
    # Get the initial data
    input_data = program_config[0]
    length = np.prod(input_data.size)
    pointer = np.ctypeslib.ndpointer(input_data.dtype, input_data.ndim,
                                     input_data.shape)
    data_type = get_c_type_from_numpy_dtype(input_data.dtype)()
    scalar_data_type = get_c_type_from_numpy_dtype(
        np.dtype(input_data.scalar_type))()

    apply_one = PyBasicConversions().visit(py_ast.body[0])
    apply_one.name = 'apply'
    apply_one.params[0].type = data_type
    apply_one.params[1].type = scalar_data_type
    apply_one.return_type = data_type
    # TODO: figure out which data type to actually preserve
    # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)

    # Keep ``#pragma`` on a line of its own; a directive extends to the end
    # of its line, so sharing a line with the loop would hide the loop.
    array_add_template = StringTemplate(
        r"""
#pragma omp parallel for
for (int i = 0; i < $length; i++) {
    output[i] = apply(arr[i], scalar);
}
""", {'length': Constant(length)})

    array_op = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        apply_one,
        FunctionDecl(None, FUNC_NAME,
                     params=[
                         SymbolRef("arr", pointer()),
                         SymbolRef("scalar", scalar_data_type),
                         SymbolRef("output", pointer())
                     ],
                     defn=[array_add_template])
    ], 'omp')
    return [array_op]
def test_multiple_assign_dependent(self):
    # ``x, y = y, x``: the conversion must go through temporaries.
    def name(identifier):
        return ast.Name(id=identifier, ctx=None)

    targets = ast.Tuple(elts=(name("x"), name("y")))
    values = ast.Tuple(elts=(name("y"), name("x")))
    swap = ast.Assign([targets], values)
    result = PyBasicConversions().visit(swap)
    self.assertEqual(
        str(result),
        "\n____temp__x = x;\n____temp__y = y;\ny = ____temp__y;\nx = ____temp__x;\n"
    )
def transform(self, py_ast, program_config):
    """Generate an OpenMP C file applying ``apply`` pairwise to two arrays.

    ``py_ast.body[0]`` is converted into the C function ``apply``; a driver
    named ``FUNC_NAME`` writes ``apply(input1[i], input2[i])`` into
    ``output[i]``.

    :param py_ast: AST whose first body element is the element-wise function.
    :param program_config: ``program_config[0]`` provides ``size``,
        ``dtype``, ``ndim`` and ``shape``.
    :return: a one-element list holding the generated ``CFile``.
    """
    # Get the initial data
    input_data = program_config[0]
    length = np.prod(input_data.size)
    pointer = np.ctypeslib.ndpointer(
        input_data.dtype, input_data.ndim, input_data.shape)
    data_type = get_c_type_from_numpy_dtype(input_data.dtype)()

    apply_one = PyBasicConversions().visit(py_ast.body[0])
    apply_one.name = 'apply'
    # One type instance is shared by both parameters and the return type.
    apply_one.params[0].type = data_type
    apply_one.params[1].type = data_type
    apply_one.return_type = data_type

    # ``#pragma`` needs its own line: the directive runs to end of line and
    # would otherwise consume the loop it is meant to annotate.
    array_add_template = StringTemplate(r"""
#pragma omp parallel for
for (int i = 0; i < $length; i++) {
    output[i] = apply(input1[i], input2[i]);
}
""", {
        'length': Constant(length)
    })

    driver = FunctionDecl(
        None, FUNC_NAME,
        params=[
            SymbolRef("input1", pointer()),
            SymbolRef("input2", pointer()),
            SymbolRef("output", pointer())
        ],
        defn=[array_add_template])
    array_op = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        apply_one,
        driver,
    ], 'omp')
    return [array_op]
def test_multiple_assign_dependent(self):
    # Tuple assignment from two function calls: each call is evaluated into
    # a temporary before the targets are assigned.
    targets = ast.Tuple(
        elts=(ast.Name(id="x", ctx=None), ast.Name(id="y", ctx=None))
    )
    first_call = FunctionCall(func='square', args=[Constant(5), Constant(5)])
    second_call = FunctionCall(func='square', args=[Constant(5), Constant(5)])
    assignment = ast.Assign(
        [targets], ast.Tuple(elts=(first_call, second_call))
    )
    result = PyBasicConversions().visit(assignment)
    self.assertEqual(
        str(result),
        "\n____temp__x = square(5, 5);\n____temp__y = square(5, 5);\nx = ____temp__x;\ny = ____temp__y;\n"
    )
def visit_FunctionCall(self, node):
    """Rewrite calls to ``min``/``max`` and inline symbol-table callees.

    ``min``/``max`` are renamed to ``fmin``/``fmax`` and ``math.h`` is
    recorded in ``self.includes``. For any other call, the callee is looked
    up in ``self.symbol_table``, converted to a static inline C function
    typed as ``c_float`` throughout, and appended to ``self.defns``.

    :param node: the call node.
    :return: the generically visited node for ``min``/``max``; otherwise the
        same call node with its arguments visited.
    """
    if node.func.name in {'min', 'max'}:
        node.func.name = "f" + node.func.name
        # TODO: Add support for all math funcs
        self.includes.add("math.h")
        return super(Backend, self).generic_visit(node)

    # FIXME: This is specific for handling a map function
    # do we have to generalize?
    node.args = [self.visit(argument) for argument in node.args]

    callee_ast = get_ast(self.symbol_table[node.func.name])
    callee = PyBasicConversions().visit(callee_ast).body[0]
    callee = self.visit(callee)
    callee.name = C.SymbolRef(node.func.name)
    callee.set_static()
    callee.set_inline()
    self.defns.append(callee)

    # FIXME: Infer type
    for parameter in callee.params:
        parameter.type = ct.c_float()
    callee.return_type = ct.c_float()
    return node
def test_fmin(self):
    # Only the source of ``func`` is inspected; it is never called, so the
    # free name ``fmax`` does not need to exist in Python.
    def func():
        a = 3.0
        b = 4.0
        c = fmax(a + b, 0.0)
        return c

    func_def = get_ast(func).body[0]
    converted = PyBasicConversions().visit(func_def)
    filled_ast = DeclarationFiller().visit(converted)

    expected = (
        ' void func() {'
        ' double a = 3.0;'
        ' double b = 4.0;'
        ' double c = fmax(a + b, 0.0);'
        ' return c;'
        ' }'
    )

    def squeeze(text):
        # The comparison ignores every space and newline.
        return text.replace(" ", "").replace("\n", "")

    self.assertEqual(squeeze(str(filled_ast)), squeeze(expected))
def test_simple_transform(self):
    # Run a one-dimensional stencil kernel through PyBasicConversions and
    # StencilOmpTransformer and compare against ``second``.
    # ``radius`` and ``second`` come from the enclosing scope.
    class Kernel(StencilKernel):
        def kernel(self, in_img, out_img):
            for x in out_img.interior_points():
                for y in in_img.neighbors(x, 1):
                    out_img[x] += in_img[y]

    kernel = Kernel()
    kernel.should_unroll = False

    out_grid = StencilGrid([5])
    out_grid.ghost_depth = radius
    in_grid = StencilGrid([5])
    in_grid.ghost_depth = radius
    for index in range(5):
        in_grid.data[index] = 1

    python_tree = ctree.get_ast(Kernel.kernel)
    converted_tree = PyBasicConversions().visit(python_tree)
    transformer = StencilOmpTransformer([in_grid], out_grid, kernel)
    actual = transformer.visit(converted_tree)
    self.assertEqual(actual, second)
def transform(self, py_ast, program_config):
    """Generate the OpenMP C file for the segment-mean subtraction.

    For every layer and every segment ("pfov") the generated C code averages
    the segment's input values, then writes each input value minus that
    average to the output array.

    :param py_ast: Python AST containing the kernel function; the first
        ``FunctionDecl`` found after conversion is renamed ``apply``.
    :param program_config: ``program_config[0]`` provides ``num_frames``,
        ``data_height``, ``size``, ``segment_length``, ``dtype``, ``ndim``
        and ``shape``.
    :return: a one-element list holding the generated ``CFile``.
    """
    # Get the initial data
    input_data = program_config[0]
    num_2d_layers = np.prod(input_data.num_frames)
    data_height = np.prod(input_data.data_height)
    layer_length = np.prod(input_data.size // num_2d_layers)
    segment_length = np.prod(input_data.segment_length)

    inp_type = get_c_type_from_numpy_dtype(input_data.dtype)()
    input_pointer = np.ctypeslib.ndpointer(
        input_data.dtype, input_data.ndim, input_data.shape)
    # NOTE(review): ndim is 1 but the shape has two entries -- confirm this
    # pointer type is never used to validate an actual array.
    output_pointer = np.ctypeslib.ndpointer(
        input_data.dtype, 1, (input_data.size, 1))

    # Get the kernel function, apply_one
    apply_one = PyBasicConversions().visit(py_ast).find(FunctionDecl)
    apply_one.return_type = inp_type
    apply_one.params[0].type = inp_type
    apply_one.params[1].type = inp_type
    # Naming our kernel method
    apply_one.name = 'apply'

    num_pfovs = int(layer_length / segment_length)
    # TODO: TIME TO START CPROFILING THINGS!

    # Line breaks in this template are significant: ``#pragma`` and ``//``
    # both extend to the end of their line, so each sits on a line of its own.
    reduction_template = StringTemplate(r"""
#pragma omp parallel for collapse(2)
for (int level = 0; level < $num_2d_layers; level++) {
    for (int i=0; i<$num_pfovs ; i++) {
        int level_offset = level * $layer_length;
        double avg = 0.0;

        // #pragma omp parallel for reduction (+:avg)
        for (int j=0; j<$pfov_length; j++) {
            int in_layer_offset = ($pfov_length * i + j) / ($layer_length / $data_height);
            int index = (in_layer_offset + ($pfov_length * i + j) * $data_height) % $layer_length;
            // printf ("Index: %i, I: %i, J: %i\n", index, i, j);
            avg += input_arr[level_offset + index];
        }
        avg = avg / $pfov_length;

        // #pragma omp parallel for
        for (int j=0; j<$pfov_length; j++) {
            int in_layer_offset = ($pfov_length * i + j) / ($layer_length / $data_height);
            int index = (in_layer_offset + ($pfov_length * i + j) * $data_height) % $layer_length;
            output_arr[level_offset + index] = input_arr[level_offset + index] - avg;
        }
    }
}
""", {
        'num_2d_layers': Constant(num_2d_layers),
        'layer_length': Constant(layer_length),
        'num_pfovs': Constant(num_pfovs),
        'pfov_length': Constant(segment_length),
        'data_height': Constant(data_height),
    })

    driver = FunctionDecl(
        None, REDUCTION_FUNC_NAME,
        params=[
            SymbolRef("input_arr", input_pointer()),
            SymbolRef("output_arr", output_pointer())
        ],
        defn=[reduction_template])
    reducer = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        apply_one,
        driver,
    ], 'omp')
    return [reducer]
def test_List(self):
    # A Python list literal must be rendered as a C brace initializer.
    module = ast.parse("[1, 5, 7, 3]")
    converted = PyBasicConversions().visit(module)
    c_array = converted.find(Array)
    self.assertEqual(str(c_array), "{1, 5, 7, 3}")
def visit_Assign(self, node):
    """Build an ``Assign`` from the first target and the value of ``node``.

    Both sides are visited by this visitor first and then by a fresh
    ``PyBasicConversions``. Only ``node.targets[0]`` is considered.

    :param node: the assignment node.
    :return: ``Assign(target, value)`` built from the converted sides.
    """
    def convert(child):
        # Same construction order as a direct chained call: the converter
        # is created before this visitor processes the child.
        converter = PyBasicConversions()
        return converter.visit(self.visit(child))

    target = convert(node.targets[0])
    value = convert(node.value)
    return Assign(target, value)
def test_Equals(self):
    # A chained ``==`` comparison is split into two comparisons joined
    # by ``&&``.
    module = ast.parse("5 == foo == 6")
    converted = PyBasicConversions().visit(module)
    comparison = converted.find(BinaryOp)
    self.assertEqual(str(comparison), "5 == foo && foo == 6")
def test_minus(self):
    # Unary minus is converted to a C unary operation.
    module = ast.parse("- foo")
    converted = PyBasicConversions().visit(module)
    negation = converted.find(UnaryOp)
    self._check(negation, "- foo")
def test_not(self):
    # Python ``not`` is converted to the C ``!`` operator.
    module = ast.parse("not foo")
    converted = PyBasicConversions().visit(module)
    logical_not = converted.find(UnaryOp)
    self._check(logical_not, "! foo")
def test_LessThan(self):
    # A chained ``<`` comparison is split into two comparisons joined
    # by ``&&``.
    module = ast.parse("5 < foo < 6")
    converted = PyBasicConversions().visit(module)
    comparison = converted.find(BinaryOp)
    self.assertEqual(str(comparison), "5 < foo && foo < 6")
def test_CUnaryOp(self):
    # A node that is already a C unary operation still renders as "! foo"
    # after conversion.
    c_not = Not(SymbolRef("foo"))
    converted = PyBasicConversions().visit(c_not)
    found = converted.find(UnaryOp)
    self._check(str(found), "! foo")
def transform(self, tree, program_config):
    """Generate the OpenCL reduction kernel and its C controller.

    The first ``FunctionDecl`` found in the converted ``tree`` becomes the
    binary function ``apply``. The generated ``apply_kernel`` folds the
    elements ``A[globalId + offset * WORK_GROUP_SIZE]`` into a per-work-item
    result, stores it in ``localData``, and work item 0 folds those results
    into ``output_buf[0]``.

    :param tree: Python AST containing the binary reduction function.
    :param program_config: ``program_config[0]`` is the input array ``A``;
        its ``shape``, ``dtype`` and ``ndim`` are read.
    :return: ``[OclFile, CFile]`` -- the kernel file and the host controller.
    """
    dirname = self.config_to_dirname(program_config)
    A = program_config[0]
    len_A = np.prod(A.shape)
    # Get the ctype class for the data type for the parameters
    data_type = get_c_type_from_numpy_dtype(A.dtype)
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)

    apply_one = PyBasicConversions().visit(tree).find(FunctionDecl)
    apply_one.name = 'apply'  # Naming our kernel method
    # A fresh data_type instance for the return type and each parameter.
    apply_one.return_type = data_type()
    apply_one.params[0].type = data_type()
    apply_one.params[1].type = data_type()

    # Number of strided elements each work item folds.
    responsible_size = int(len_A / WORK_GROUP_SIZE)

    # ``A.dtype`` is a dtype instance, so an identity test against the
    # scalar class ``np.int32`` is never true; equality is the comparison
    # NumPy defines between the two.
    if A.dtype == np.int32:
        local_result_type = ct.c_int()
    else:
        local_result_type = ct.c_float()

    # Fold one strided element into the running result.
    strided_index = Add(SymbolRef('globalId'),
                        Mul(SymbolRef('offset'), Constant(WORK_GROUP_SIZE)))
    accumulate_strided = Assign(
        SymbolRef('localResult'),
        FunctionCall(apply_one.name,
                     [SymbolRef('localResult'),
                      ArrayRef(SymbolRef('A'), strided_index)]))

    # Work item 0 folds every other work item's partial result (unrolled).
    combine_partials = [
        Assign(SymbolRef('localResult'),
               FunctionCall(SymbolRef(apply_one.name),
                            [SymbolRef('localResult'),
                             ArrayRef(SymbolRef('localData'), Constant(x))]))
        for x in range(1, WORK_GROUP_SIZE)
    ]
    store_result = Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)),
                          SymbolRef('localResult'))

    # Controller kernel (called "apply_kernel") that parallelizes the
    # computation, built with ctree nodes.
    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("localData", pointer()).set_local()],
        defn=[
            Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
            Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
            Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
            Assign(SymbolRef('localResult', local_result_type),
                   ArrayRef(SymbolRef('A'), SymbolRef('globalId'))),
            For(Assign(SymbolRef('offset', ct.c_int()), Constant(1)),
                Lt(SymbolRef('offset'), Constant(responsible_size)),
                PostInc(SymbolRef('offset')),
                [accumulate_strided]),
            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                   SymbolRef('localResult')),
            barrier(CLK_LOCAL_MEM_FENCE()),
            If(Eq(SymbolRef('globalId'), Constant(0)),
               combine_partials + [store_result]),
        ]
    ).set_kernel()

    # Hardcoded OpenCL host code that launches the kernel. Preprocessor
    # directives must each sit on their own line.
    control = StringTemplate(r"""
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include <stdio.h>

void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
    size_t global = $local;
    size_t local = $local;
    intptr_t len = $length;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
    clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
}
""", {
        'local': Constant(WORK_GROUP_SIZE),
        # NOTE(review): 'n' is supplied but the template never references it.
        'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2),
        'length': Constant(len_A),
    })

    ocl_kernel = OclFile("kernel", [apply_one, apply_kernel])
    c_controller = CFile("generated", [control])
    return [ocl_kernel, c_controller]
def transform(self, tree, program_config):
    """Build and compile the OpenCL init and compute kernels.

    The incoming ``tree`` is not used; both kernel bodies are parsed from
    formatted Python snippets. The init kernel copies ``input`` into the
    output buffer; the compute kernel writes level ``power + 1`` of the
    output buffer as a weighted sum of five clamped reads from level
    ``power``.

    :param tree: ignored.
    :param program_config: ``program_config[0]`` provides ``base_shape``
        (two entries) and ``border``.
    :return: the result of ``OclMatrixPowers().finalize`` on the two kernel
        handles and ``(base_shape[1], base_shape[0])``.
    """
    call_args = program_config[0]
    base_size = call_args.base_shape[0] * call_args.base_shape[1]
    border = call_args.border
    c_float_type = c_float
    transformer = PyBasicConversions()
    output = unique_name()

    # --- init kernel: copy input into the output buffer -----------------
    init_entry_point = unique_kernel_name()
    init_params = [
        SymbolRef('input', POINTER(c_float_type)(), _global=True, _const=True),
        SymbolRef(output, POINTER(c_float_type)(), _global=True),
    ]
    init_defn = [
        Assign(SymbolRef('x', c_int()), get_global_id(0)),
        Assign(SymbolRef('y', c_int()), get_global_id(1)),
    ]
    # NOTE(review): the row stride here is base_shape[0], whereas the compute
    # kernel below uses base_shape[1] as len_x -- the two agree only for
    # square shapes; confirm which is intended.
    body = """{output}[y * {len_x} + x] = input[y * {len_x} + x]""".format(
        output=output,
        len_x=call_args.base_shape[0]
    )
    print(body)
    init_defn.extend(ast.parse(body).body)
    init_tree = FunctionDecl(None, init_entry_point, init_params, init_defn)
    init_tree.set_kernel()
    init_kernel = OclFile('kernel', [init_tree])
    init_kernel = transformer.visit(init_kernel)
    print("init kernel codegen")
    print(init_kernel.codegen())

    # --- compute kernel: one stencil step from level `power` ------------
    compute_entry_point = unique_kernel_name()
    compute_params = [
        SymbolRef(output, POINTER(c_float_type)(), _global=True),
        SymbolRef('power', c_int(), _const=True),
    ]
    compute_defn = [
        Assign(SymbolRef('x', c_int()), get_global_id(0)),
        Assign(SymbolRef('y', c_int()), get_global_id(1)),
    ]
    # Written over several lines for readability; every whitespace run is
    # collapsed to one space below so the snippet parses as one statement.
    body = """{matrix}[(power+1) * {base_size} + y * {len_x} + x] =
        0.1 * {matrix}[
            power * {base_size} +
            clamp(y-1, {border}, {len_y}-{border}-1) * {len_x} +
            clamp(x, {border}, {len_x}-{border}-1)
        ] +
        0.1 * {matrix}[
            power * {base_size} +
            clamp(y+1, {border}, {len_y}-{border}-1) * {len_x} +
            clamp(x, {border}, {len_x}-{border}-1)
        ] +
        0.4 * {matrix}[
            power * {base_size} +
            clamp(y, {border}, {len_y}-{border}-1) * {len_x} +
            clamp(x-1, {border}, {len_x}-{border}-1)
        ] +
        0.4 * {matrix}[
            power * {base_size} +
            clamp(y, {border}, {len_y}-{border}-1) * {len_x} +
            clamp(x+1, {border}, {len_x}-{border}-1)
        ] +
        1.0 * {matrix}[
            power * {base_size} +
            clamp(y, {border}, {len_y}-{border}-1) * {len_x} +
            clamp(x, {border}, {len_x}-{border}-1)
        ]
    """.format(
        matrix=output,
        base_size=base_size,
        len_y=call_args.base_shape[0],
        len_x=call_args.base_shape[1],
        border=border,
    )
    # Raw string: ``\s`` is a regex escape, not a Python string escape.
    body = re.sub(r"\s+", " ", body)
    print(body)
    compute_defn.extend(ast.parse(body).body)
    compute_tree = FunctionDecl(None, compute_entry_point, compute_params,
                                compute_defn)
    compute_tree.set_kernel()
    compute_kernel = OclFile('kernel', [compute_tree])
    compute_kernel = transformer.visit(compute_kernel)
    print("compute kernel codegen")
    print(compute_kernel.codegen())

    # --- compile both kernels and hand them to the concrete function ----
    fn = OclMatrixPowers()
    init_program = clCreateProgramWithSource(
        fn.context, init_kernel.codegen()).build()
    init_ptr = init_program[init_entry_point]
    compute_program = clCreateProgramWithSource(
        fn.context, compute_kernel.codegen()).build()
    compute_ptr = compute_program[compute_entry_point]
    return fn.finalize(init_ptr, compute_ptr,
                       (call_args.base_shape[1], call_args.base_shape[0]))
def transform(self, tree, program_config):
    """Generate the unrolled OpenCL reduction kernel and its C controller.

    ``tree.body[0]`` is converted into the binary function called by the
    kernel. Each work item folds its strided elements of ``A`` (unrolled at
    generation time), stores the partial result in ``localData``, and work
    item 0 folds all partial results into ``output_buf[0]``.

    :param tree: AST whose first body element is the binary function.
    :param program_config: ``program_config[0]`` is the input array ``A``;
        its ``shape``, ``dtype`` and ``ndim`` are read.
    :return: ``[OclFile, CFile]`` -- the kernel file and the host controller.
    """
    A = program_config[0]
    len_A = np.prod(A.shape)
    inner_type = get_c_type_from_numpy_dtype(A.dtype)()
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)

    # NOTE(review): the kernel calls a function named 'apply', but the
    # converted function is not renamed here -- presumably the Python
    # function is already called ``apply``; confirm.
    apply_one = PyBasicConversions().visit(tree.body[0])
    apply_one.return_type = inner_type
    apply_one.params[0].type = inner_type
    apply_one.params[1].type = inner_type

    responsible_size = int(len_A / WORK_GROUP_SIZE)

    setup = [
        Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
        Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
        Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
        Assign(SymbolRef('localResult', ct.c_int()),
               ArrayRef(SymbolRef('A'), SymbolRef('globalId'))),
    ]
    # One unrolled fold per strided element handled by this work item.
    strided_folds = [
        Assign(SymbolRef('localResult'),
               FunctionCall(SymbolRef('apply'),
                            [SymbolRef('localResult'),
                             ArrayRef(SymbolRef('A'),
                                      Add(SymbolRef('globalId'),
                                          Constant(i * WORK_GROUP_SIZE)))]))
        for i in range(1, responsible_size)
    ]
    # Work item 0 folds every other work item's partial result.
    combine_partials = [
        Assign(SymbolRef('localResult'),
               FunctionCall(SymbolRef('apply'),
                            [SymbolRef('localResult'),
                             ArrayRef(SymbolRef('localData'), Constant(x))]))
        for x in range(1, WORK_GROUP_SIZE)
    ]
    store_result = Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)),
                          SymbolRef('localResult'))
    finish = [
        Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
               SymbolRef('localResult')),
        barrier(CLK_LOCAL_MEM_FENCE()),
        If(Eq(SymbolRef('globalId'), Constant(0)),
           combine_partials + [store_result]),
    ]

    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("localData", pointer()).set_local()],
        defn=setup + strided_folds + finish
    ).set_kernel()

    kernel = OclFile("kernel", [apply_one, apply_kernel])

    # Host-side launcher. Preprocessor directives must each sit on their
    # own line.
    control = StringTemplate(r"""
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include <stdio.h>

void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
    size_t global = $local;
    size_t local = $local;
    intptr_t len = $length;
    cl_mem swap;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
    clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
}
""", {
        'local': Constant(WORK_GROUP_SIZE),
        # NOTE(review): 'n' is supplied but the template never references it.
        'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2),
        'length': Constant(len_A)
    })

    c_controller = CFile("generated", [control])
    return [kernel, c_controller]
def transform(self, tree, program_config):
    """Generate, build and finalize the multi-pass OpenCL reduction.

    ``tree.body[0]`` is converted into the binary function called by the
    kernel. ``apply_kernel`` combines ``A[globalId]`` with
    ``A[globalId + i]`` for doubling ``i`` and the work item with local id 0
    writes ``A[globalId]`` to ``output_buf[groupId]``. The host function
    ``apply_all`` launches the kernel ``run_limit`` times, swapping the
    buffers and shrinking ``len`` between launches.

    :param tree: AST whose first body element is the binary function.
    :param program_config: ``program_config[0]`` is the input array ``A``;
        its ``shape``, ``dtype`` and ``ndim`` are read.
    :return: the result of ``ConcreteXorReduction().finalize``.
    """
    A = program_config[0]
    len_A = np.prod(A.shape)
    # NOTE(review): this is a NumPy scalar instance, whereas sibling
    # specializers pass ctypes instances as types -- confirm it is accepted.
    inner_type = A.dtype.type()
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)

    apply_one = PyBasicConversions().visit(tree.body[0])
    apply_one.return_type = inner_type
    apply_one.params[0].type = inner_type
    apply_one.params[1].type = inner_type

    # if (globalId % (i * 2) == 0 && globalId + i < len)
    pair_in_range = And(
        Eq(Mod(SymbolRef('globalId'), Mul(SymbolRef('i'), Constant(2))),
           Constant(0)),
        Lt(Add(SymbolRef('globalId'), SymbolRef('i')), SymbolRef("len")))
    # A[globalId] = apply(A[globalId], A[globalId + i])
    combine_pair = Assign(
        ArrayRef(SymbolRef('A'), SymbolRef('globalId')),
        FunctionCall(SymbolRef('apply'), [
            ArrayRef(SymbolRef('A'), SymbolRef('globalId')),
            ArrayRef(SymbolRef('A'),
                     Add(SymbolRef('globalId'), SymbolRef('i')))
        ]))

    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("len", ct.c_int())],
        defn=[
            # group id of this work group
            Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
            # global id of this work item
            Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
            # local id of this work item
            Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
            # for (int i = 1; i < WORK_GROUP_SIZE; i *= 2)
            For(Assign(SymbolRef('i', ct.c_int()), Constant(1)),
                Lt(SymbolRef('i'), Constant(WORK_GROUP_SIZE)),
                MulAssign(SymbolRef('i'), Constant(2)),
                [
                    If(pair_in_range, [combine_pair]),
                    FunctionCall(SymbolRef('barrier'),
                                 [SymbolRef('CLK_LOCAL_MEM_FENCE')]),
                ]),
            If(Eq(SymbolRef('localId'), Constant(0)),
               [
                   Assign(ArrayRef(SymbolRef('output_buf'),
                                   SymbolRef('groupId')),
                          ArrayRef(SymbolRef('A'), SymbolRef('globalId')))
               ]),
        ]
    ).set_kernel()

    kernel = OclFile("kernel", [apply_one, apply_kernel])

    # Host-side launcher. Preprocessor directives must each sit on their
    # own line.
    # NOTE(review): argument 2 is set with sizeof(intptr_t) while the kernel
    # declares ``len`` as int -- confirm the sizes agree on the target.
    control = StringTemplate(r"""
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include <stdio.h>

void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
    size_t global = $n;
    size_t local = $local;
    intptr_t len = $length;
    cl_mem swap;
    for (int runs = 0; runs < $run_limit ; runs++){
        clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
        clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
        clSetKernelArg(kernel, 2, sizeof(intptr_t), &len);
        clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        swap = buf;
        buf = out_buf;
        out_buf = swap;
        len = len/local + (len % local != 0);
    }
}
""", {
        'local': Constant(WORK_GROUP_SIZE),
        # Global size: len_A rounded up past the next multiple of the
        # work-group size.
        'n': Constant(len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE)),
        'length': Constant(len_A),
        'run_limit': Constant(ceil(log(len_A, WORK_GROUP_SIZE)))
    })

    proj = Project([kernel, CFile("generated", [control])])
    fn = ConcreteXorReduction()

    program = cl.clCreateProgramWithSource(
        fn.context, kernel.codegen()).build()
    apply_kernel_ptr = program['apply_kernel']

    # NOTE(review): ``apply_all`` takes four arguments in the C source above,
    # but only three argument types are declared here -- confirm.
    entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel,
                              cl.cl_mem)
    return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def test_GreaterThan(self):
    # A chained ``>=`` comparison is split into two comparisons joined
    # by ``&&``.
    module = ast.parse("5 >= foo >= 6")
    converted = PyBasicConversions().visit(module)
    comparison = converted.find(BinaryOp)
    self.assertEqual(str(comparison), "5 >= foo && foo >= 6")