def transform(self, tree, program_cfg):
    """Lower the Python AST to a C AST.

    Runs three passes in order: generic Python-to-C conversion, the
    backend lowering pass (which consults the captured symbol table),
    and constant folding.

    :param tree: the Python AST to lower.
    :param program_cfg: (argument config, tuner config) pair; only the
        argument config is consumed here.
    :return: the lowered tree, renamed after the original function.
    """
    arg_cfg, tune_cfg = program_cfg
    lowered = PyBasicConversions().visit(tree)
    lowered = Backend(arg_cfg, self.symbol_table).visit(lowered)
    lowered = ConstantFold().visit(lowered)
    # Keep the user's original function name on the generated code.
    lowered.name = self.original_tree.body[0].name
    return lowered
def visit_Lambda(self, node):
    """Lift a Python lambda into a C preprocessor macro.

    The lambda body is lowered to C and wrapped in a ``CppDefine``
    named ``LAMBDA_<n>``; the define is collected on
    ``self.lifted_functions`` and the lambda expression is replaced by
    a ``SymbolRef`` to the macro name.

    :param node: the ``ast.Lambda`` node being visited.
    :return: a ``SymbolRef`` to the generated macro.
    """
    self.generic_visit(node)
    # Read AND increment the counter on the class itself so names stay
    # unique across all LambdaLifter instances.  The original read via
    # `self.lambda_counter`, which an instance attribute could shadow,
    # silently reusing macro names.
    macro_name = "LAMBDA_" + str(LambdaLifter.lambda_counter)
    LambdaLifter.lambda_counter += 1
    node = PyBasicConversions().visit(node)
    node.name = macro_name
    # The lowered lambda's body is a single return; its value becomes
    # the macro expansion.
    macro = CppDefine(macro_name, node.params, node.defn[0].value)
    self.lifted_functions.append(macro)
    return SymbolRef(macro_name)
def transform(self, tree, program_cfg):
    """Inline the captured environment into *tree* and lower it to C.

    :param tree: the Python AST to lower.
    :param program_cfg: (argument config, tuner config) pair.
    :return: a single-element list holding the lowered tree.
    """
    arg_cfg, tune_cfg = program_cfg
    # tree = Desugar().visit(tree)
    inliner = InlineEnvironment(self.symbol_table)
    c_tree = inliner.visit(tree)
    c_tree = PyBasicConversions().visit(c_tree)
    # Any auxiliary files produced while inlining go ahead of the body.
    c_tree.body = inliner.files + c_tree.body
    # tree.find(C.For).pragma = 'omp parallel for'
    c_tree.name = self.original_tree.body[0].name
    c_tree.body.insert(0, StringTemplate("#include <math.h>"))
    return [c_tree]
def transform(self, tree, program_cfg):
    """Lower *tree* to C after inlining the captured environment.

    :param tree: the Python AST to lower.
    :param program_cfg: (argument config, tuner config) pair.
    :return: [lowered tree].
    """
    arg_cfg, tune_cfg = program_cfg
    # tree = Desugar().visit(tree)
    env_inliner = InlineEnvironment(self.symbol_table)
    result = env_inliner.visit(tree)
    result = PyBasicConversions().visit(result)
    # Files emitted by the inliner must precede the translated body.
    result.body = env_inliner.files + result.body
    # tree.find(C.For).pragma = 'omp parallel for'
    result.name = self.original_tree.body[0].name
    result.body.insert(0, StringTemplate("#include <math.h>"))
    return [result]
def mini_transform(self, node):
    """
    This method acts as a simulation of a specializer's transform()
    method — the bare minimum required of a transform() method by the
    specializer writer.

    :param node: the node to transform
    :return: the node transformed through PyBasicConversions into a
        rough C-AST, named 'apply' with float parameter/return types.
    """
    c_node = PyBasicConversions().visit(node)
    c_node.name = "apply"
    c_node.return_type = ct.c_float()
    for p in c_node.params:
        p.type = ct.c_float()
    return c_node
def mini_transform(self, node):
    """
    Simulate a specializer's transform() method: the bare minimum a
    specializer writer must supply.

    :param node: the node to transform
    :return: the node lowered by PyBasicConversions into a rough C-AST,
        renamed 'apply', every parameter and the return typed c_float.
    """
    lowered = PyBasicConversions().visit(node)
    lowered.name = "apply"
    lowered.return_type = ct.c_float()
    for param in lowered.params:
        param.type = ct.c_float()
    return lowered
def transform(self, py_ast, program_config):
    """Generate a C file applying ``apply(elt, scalar)`` element-wise.

    Lowers the user's kernel (first statement of *py_ast*) to C, pins
    its signature from the input dtype, and wraps it in an
    OpenMP-parallel loop over the flattened array.

    :param py_ast: Python AST whose first body statement is the kernel.
    :param program_config: (input data description, tuner config).
    :return: [CFile] — the generated translation unit.
    """
    # Get the initial data
    input_data = program_config[0]
    length = np.prod(input_data.size)
    pointer = np.ctypeslib.ndpointer(input_data.dtype, input_data.ndim,
                                     input_data.shape)
    data_type = get_c_type_from_numpy_dtype(input_data.dtype)()
    scalar_data_type = get_c_type_from_numpy_dtype(
        np.dtype(input_data.scalar_type))()

    # Lower the kernel and fix its C signature.
    apply_one = PyBasicConversions().visit(py_ast.body[0])
    apply_one.name = 'apply'
    apply_one.params[0].type = data_type
    apply_one.params[1].type = scalar_data_type
    apply_one.return_type = data_type
    # TODO: figure out which data type to actually preserve
    # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)

    parallel_loop = StringTemplate(r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(arr[i], scalar);
        }
        """, {'length': Constant(length)})

    generated = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        apply_one,
        FunctionDecl(None, FUNC_NAME,
                     params=[SymbolRef("arr", pointer()),
                             SymbolRef("scalar", scalar_data_type),
                             SymbolRef("output", pointer())],
                     defn=[parallel_loop]),
    ], 'omp')
    return [generated]
def transform(self, py_ast, program_config):
    """Emit a C file that maps ``apply(elt, scalar)`` over the array.

    The kernel is the first statement of *py_ast*; it is lowered to C,
    typed from the input's dtype, and driven by an OpenMP-parallel
    for loop over the flattened input.

    :param py_ast: Python AST containing the kernel function.
    :param program_config: (input data description, tuner config).
    :return: a one-element list with the generated CFile.
    """
    # Get the initial data
    data_desc = program_config[0]
    n = np.prod(data_desc.size)
    ptr = np.ctypeslib.ndpointer(data_desc.dtype, data_desc.ndim,
                                 data_desc.shape)
    elem_type = get_c_type_from_numpy_dtype(data_desc.dtype)()
    scalar_type = get_c_type_from_numpy_dtype(
        np.dtype(data_desc.scalar_type))()

    # Lower the Python kernel and pin down its C signature.
    kernel = PyBasicConversions().visit(py_ast.body[0])
    kernel.name = 'apply'
    kernel.params[0].type = elem_type
    kernel.params[1].type = scalar_type
    kernel.return_type = elem_type
    # TODO: figure out which data type to actually preserve
    # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)

    body = StringTemplate(
        r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(arr[i], scalar);
        }
        """,
        {'length': Constant(n)})

    unit = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        kernel,
        FunctionDecl(None, FUNC_NAME,
                     params=[SymbolRef("arr", ptr()),
                             SymbolRef("scalar", scalar_type),
                             SymbolRef("output", ptr())],
                     defn=[body]),
    ], 'omp')
    return [unit]
def visit_FunctionCall(self, node):
    """Lower a function-call node.

    Calls to ``min``/``max`` are renamed to C's ``fmin``/``fmax`` and
    ``math.h`` is recorded for inclusion.  Any other call is assumed to
    be a user-defined helper: its definition is pulled from the symbol
    table, lowered, marked ``static inline``, and collected on
    ``self.defns`` so the generated file can emit it.
    """
    if node.func.name in {'min', 'max'}:
        # "f" + "min"/"max" -> C99 fmin/fmax from math.h.
        node.func.name = "f" + node.func.name
        # TODO: Add support for all math funcs
        self.includes.add("math.h")
        return super(Backend, self).generic_visit(node)
    # FIXME: This is specific for handling a map function
    # do we have to generalize?
    node.args = [self.visit(arg) for arg in node.args]
    # Recover the callee's Python source from the symbol table and
    # lower it through the same pipeline as the main tree.
    func_tree = get_ast(self.symbol_table[node.func.name])
    func_tree = PyBasicConversions().visit(func_tree).body[0]
    func_tree = self.visit(func_tree)
    func_tree.name = C.SymbolRef(node.func.name)
    func_tree.set_static()
    func_tree.set_inline()
    self.defns.append(func_tree)
    # FIXME: Infer type — everything is hard-coded to float for now.
    for p in func_tree.params:
        p.type = ct.c_float()
    func_tree.return_type = ct.c_float()
    return node
def transform(self, py_ast, program_config):
    """Generate a C file applying ``apply(a[i], b[i])`` element-wise.

    The binary kernel (first statement of *py_ast*) is lowered to C,
    typed from the input dtype, and invoked from an OpenMP-parallel
    loop over two input arrays into an output array.

    :param py_ast: Python AST whose first body statement is the kernel.
    :param program_config: (input data description, tuner config).
    :return: [CFile] — the generated translation unit.
    """
    # Get the initial data
    data_desc = program_config[0]
    n = np.prod(data_desc.size)
    ptr = np.ctypeslib.ndpointer(data_desc.dtype, data_desc.ndim,
                                 data_desc.shape)
    elem_type = get_c_type_from_numpy_dtype(data_desc.dtype)()

    # Lower the kernel; both operands and the result share one type.
    kernel = PyBasicConversions().visit(py_ast.body[0])
    kernel.name = 'apply'
    kernel.params[0].type = elem_type
    kernel.params[1].type = elem_type
    kernel.return_type = elem_type

    parallel_loop = StringTemplate(r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(input1[i], input2[i]);
        }
        """, {'length': Constant(n)})

    unit = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        kernel,
        FunctionDecl(None, FUNC_NAME,
                     params=[SymbolRef("input1", ptr()),
                             SymbolRef("input2", ptr()),
                             SymbolRef("output", ptr())],
                     defn=[parallel_loop]),
    ], 'omp')
    return [unit]
def transform(self, py_ast, program_config):
    """Generate an OpenMP C file that mean-subtracts each pFOV segment.

    For every 2-D layer the template walks the layer in pFOV
    (partial-field-of-view) segments: it averages each segment, then
    subtracts that average from each of the segment's elements, writing
    into ``output_arr``.  The index arithmetic re-linearizes a
    column-major-style walk within each layer.

    :param py_ast: Python AST containing the kernel function.
    :param program_config: (input data description, tuner config);
        only the data description is used.
    :return: [CFile] — the generated translation unit.
    """
    # Get the initial data
    input_data = program_config[0]
    num_2d_layers = np.prod(input_data.num_frames)
    data_height = np.prod(input_data.data_height)
    # Elements per 2-D layer (total size split across the frames).
    layer_length = np.prod(input_data.size // num_2d_layers)
    segment_length = np.prod(input_data.segment_length)
    inp_type = get_c_type_from_numpy_dtype(input_data.dtype)()
    input_pointer = np.ctypeslib.ndpointer(input_data.dtype,
                                           input_data.ndim,
                                           input_data.shape)
    # Output is flattened to a (size, 1) 1-D pointer type.
    output_pointer = np.ctypeslib.ndpointer(input_data.dtype, 1,
                                            (input_data.size, 1))
    # Get the kernel function, apply_one
    apply_one = PyBasicConversions().visit(py_ast).find(FunctionDecl)
    apply_one.return_type = inp_type
    apply_one.params[0].type = inp_type
    apply_one.params[1].type = inp_type
    # Naming our kernel method
    apply_one.name = 'apply'
    num_pfovs = int(layer_length / segment_length)
    # print ("num layers: ", num_2d_layers)
    # print ("input size: ", input_data.size)
    # print ("layer length: ", layer_length)
    # TODO: TIME TO START CPROFILING THINGS!
    # NOTE(review): apply_one is typed/renamed and emitted below, but
    # the template never calls it — the reduction is hard-coded to an
    # average.  Confirm whether the kernel is meant to be used here.
    reduction_template = StringTemplate(r"""
        #pragma omp parallel for collapse(2)
        for (int level = 0; level < $num_2d_layers; level++) {
            for (int i=0; i<$num_pfovs ; i++) {
                int level_offset = level * $layer_length;
                double avg = 0.0;
                // #pragma omp parallel for reduction (+:avg)
                for (int j=0; j<$pfov_length; j++) {
                    int in_layer_offset = ($pfov_length * i + j) / ($layer_length / $data_height);
                    int index = (in_layer_offset + ($pfov_length * i + j) * $data_height) % $layer_length;
                    // printf ("Index: %i, I: %i, J: %i\n", index, i, j);
                    avg += input_arr[level_offset + index];
                }
                avg = avg / $pfov_length;
                // #pragma omp parallel for
                for (int j=0; j<$pfov_length; j++) {
                    int in_layer_offset = ($pfov_length * i + j) / ($layer_length / $data_height);
                    int index = (in_layer_offset + ($pfov_length * i + j) * $data_height) % $layer_length;
                    output_arr[level_offset + index] = input_arr[level_offset + index] - avg;
                }
            }
        }
        """, {
        'num_2d_layers': Constant(num_2d_layers),
        'layer_length': Constant(layer_length),
        'num_pfovs': Constant(num_pfovs),
        'pfov_length': Constant(segment_length),
        'data_height': Constant(data_height),
    })
    reducer = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        apply_one,
        FunctionDecl(None, REDUCTION_FUNC_NAME,
                     params=[
                         SymbolRef("input_arr", input_pointer()),
                         SymbolRef("output_arr", output_pointer())
                     ],
                     defn=[
                         reduction_template
                     ])
    ], 'omp')
    return [reducer]
def transform(self, tree, program_config):
    """Generate an OpenCL kernel + C controller for a reduction over A.

    Each work item folds ``responsible_size`` strided elements of ``A``
    with the user's binary kernel into a private accumulator, stores it
    in local memory, and after a barrier work item 0 folds the
    work-group's partial results into ``output_buf[0]``.

    :param tree: Python AST containing the binary kernel function.
    :param program_config: (array description ``A``, tuner config).
    :return: [OclFile, CFile] — kernel source and host-side controller.
    """
    # NOTE(review): dirname is computed but never used in this method.
    dirname = self.config_to_dirname(program_config)
    A = program_config[0]
    len_A = np.prod(A.shape)
    data_type = get_c_type_from_numpy_dtype(A.dtype)
    # Get the ctype class for the data type for the parameters
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
    apply_one = PyBasicConversions().visit(tree).find(FunctionDecl)
    apply_one.name = 'apply'  # Naming our kernel method
    # Assigning a data_type instance for the
    # return type, and the parameter types...
    apply_one.return_type = data_type()
    apply_one.params[0].type = data_type()
    apply_one.params[1].type = data_type()
    # Get the appropriate number of threads for parallelizing:
    # each work item reduces this many strided elements.
    responsible_size = int(len_A / WORK_GROUP_SIZE)
    # Creating our controller function (called "apply_kernel") to control
    # the parallelizing of our computation, using ctree syntax...
    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("localData", pointer()).set_local()
                ],
        defn=[
            Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
            Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
            Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
            # Seed the private accumulator with this item's first element.
            # NOTE(review): `A.dtype is np.int32` compares a dtype object
            # to a type with `is` — verify it ever selects c_int.
            Assign(SymbolRef('localResult',
                             (ct.c_int() if A.dtype is np.int32
                              else ct.c_float())),
                   ArrayRef(SymbolRef('A'), SymbolRef('globalId'))
                   ),
            # Fold the strided elements this work item is responsible for.
            For(Assign(SymbolRef('offset', ct.c_int()), Constant(1)),
                Lt(SymbolRef('offset'), Constant(responsible_size)),
                PostInc(SymbolRef('offset')),
                [
                    Assign(SymbolRef('localResult'),
                           FunctionCall(apply_one.name,
                                        [SymbolRef('localResult'),
                                         ArrayRef(SymbolRef('A'),
                                                  Add(SymbolRef('globalId'),
                                                      Mul(SymbolRef('offset'),
                                                          Constant(WORK_GROUP_SIZE))))])
                           ),
                ]
                ),
            # Publish the partial result to local memory, then wait for
            # every work item in the group before the final fold.
            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                   SymbolRef('localResult')
                   ),
            barrier(CLK_LOCAL_MEM_FENCE()),
            # Work item 0 serially folds all partial results; the list
            # comprehension unrolls one call per slot at codegen time.
            If(Eq(SymbolRef('globalId'), Constant(0)),
               [
                   Assign(SymbolRef('localResult'),
                          FunctionCall(SymbolRef(apply_one.name),
                                       [SymbolRef('localResult'),
                                        ArrayRef(SymbolRef('localData'),
                                                 Constant(x))]))
                   for x in range(1, WORK_GROUP_SIZE)
               ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)),
                           SymbolRef('localResult'))]
               )
        ]
    ).set_kernel()
    # Hardcoded OpenCL code to compensate to begin execution of
    # parallelized computation.
    # NOTE(review): `len` is declared but unused in the C body, and the
    # '$n' template key is never referenced by the template text.
    control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif
        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant((len_A + WORK_GROUP_SIZE -
                             (len_A % WORK_GROUP_SIZE)) / 2),
              'length': Constant(len_A),
              })
    ocl_kernel = OclFile("kernel", [apply_one, apply_kernel])
    c_controller = CFile("generated", [control])
    return [ocl_kernel, c_controller]