def transform(self, tree, program_config):
    """Lower the cached Python AST to a one-function CFile.

    The argument type signature from the program config supplies the
    ctypes classes; the first entry doubles as the return type.
    """
    arg_types = program_config[0]['arg_typesig']
    c_func = PyBasicConversions().visit(tree.body[0])
    # First signature entry is used as the return type as well.
    c_func.return_type = arg_types[0]()
    for formal, ctype in zip(c_func.params, arg_types):
        formal.type = ctype()
    return [CFile(c_func.name, [c_func])]
def mini_transform(self, node):
    """Simulate a specializer's transform() method.

    This is the bare minimum a specializer writer must supply in a
    transform() method: convert to a rough C-AST, name the kernel,
    and type the signature.

    :param node: the node to transform
    :return: the node transformed through PyBasicConversions into a
        rough C-AST.
    """
    c_node = PyBasicConversions().visit(node)
    c_node.name = "apply"
    # Everything is typed float in this minimal simulation.
    c_node.return_type = ct.c_float()
    for formal in c_node.params:
        formal.type = ct.c_float()
    return c_node
def mini_transform(self, node):
    """Simulate a specializer's transform() method.

    Bare-minimum transform(): lower the node through
    PyBasicConversions, rename it to "apply", and give every parameter
    and the return value a float type.

    :param node: the node to transform
    :return: the node transformed through PyBasicConversions into a
        rough C-AST.
    """
    lowered = PyBasicConversions().visit(node)
    lowered.name = "apply"
    lowered.return_type = ct.c_float()
    for p in lowered.params:
        p.type = ct.c_float()
    return lowered
def transform(self, py_ast, program_config):
    """Build an OpenMP C file applying a scalar kernel element-wise.

    Lowers the user's two-argument Python kernel to C as ``apply`` and
    wraps it in an OpenMP-parallel loop over the flattened array.
    """
    # Initial data description from the program config.
    input_data = program_config[0]
    n_elems = np.prod(input_data.size)
    arr_ptr = np.ctypeslib.ndpointer(input_data.dtype,
                                     input_data.ndim,
                                     input_data.shape)
    elem_t = get_c_type_from_numpy_dtype(input_data.dtype)()
    scalar_t = get_c_type_from_numpy_dtype(
        np.dtype(input_data.scalar_type))()

    # Lower the Python kernel and type its signature.
    kernel_fn = PyBasicConversions().visit(py_ast.body[0])
    kernel_fn.name = 'apply'
    kernel_fn.params[0].type = elem_t
    kernel_fn.params[1].type = scalar_t
    # TODO: figure out which data type to actually preserve
    kernel_fn.return_type = elem_t

    # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)
    loop_body = StringTemplate(r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(arr[i], scalar);
        }
        """, {'length': Constant(n_elems)})

    entry = FunctionDecl(None, FUNC_NAME,
                         params=[SymbolRef("arr", arr_ptr()),
                                 SymbolRef("scalar", scalar_t),
                                 SymbolRef("output", arr_ptr())],
                         defn=[loop_body])
    return [CFile("generated",
                  [CppInclude("omp.h"),
                   CppInclude("stdio.h"),
                   kernel_fn,
                   entry],
                  'omp')]
def transform(self, py_ast, program_config):
    """Generate the OpenMP array-op C file for an (array, scalar) kernel.

    The Python kernel becomes the C function ``apply``; an
    OpenMP-parallel driver loop calls it for each element.
    """
    # Describe the input from the cached program config.
    input_data = program_config[0]
    length = np.prod(input_data.size)
    pointer = np.ctypeslib.ndpointer(
        input_data.dtype, input_data.ndim, input_data.shape)

    # ctype instances for the element and scalar types.
    data_type = get_c_type_from_numpy_dtype(input_data.dtype)()
    scalar_data_type = get_c_type_from_numpy_dtype(
        np.dtype(input_data.scalar_type))()

    apply_one = PyBasicConversions().visit(py_ast.body[0])
    apply_one.name = 'apply'
    apply_one.params[0].type = data_type
    apply_one.params[1].type = scalar_data_type
    # TODO: figure out which data type to actually preserve
    apply_one.return_type = data_type

    # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)
    template_args = {'length': Constant(length)}
    array_add_template = StringTemplate(
        r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(arr[i], scalar);
        }
        """,
        template_args)

    file_contents = [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        apply_one,
        FunctionDecl(None, FUNC_NAME,
                     params=[SymbolRef("arr", pointer()),
                             SymbolRef("scalar", scalar_data_type),
                             SymbolRef("output", pointer())],
                     defn=[array_add_template]),
    ]
    return [CFile("generated", file_contents, 'omp')]
def visit_FunctionCall(self, node):
    """Rewrite function calls during C lowering.

    min/max calls are renamed to their C "f"-prefixed math variants;
    any other call has its callee's Python source pulled from the
    symbol table, lowered to a static inline C function, and appended
    to the generated definitions.
    """
    if node.func.name in {'min', 'max'}:
        # Map to the C float variants (fmin/fmax) and pull in math.h.
        node.func.name = "f" + node.func.name
        # TODO: Add support for all math funcs
        self.includes.add("math.h")
        return super(Backend, self).generic_visit(node)
    # FIXME: This is specific for handling a map function
    # do we have to generalize?
    node.args = [self.visit(arg) for arg in node.args]
    # Lower the callee: fetch its Python AST from the symbol table,
    # run the basic conversions, then this visitor itself.
    func_tree = get_ast(self.symbol_table[node.func.name])
    func_tree = PyBasicConversions().visit(func_tree).body[0]
    func_tree = self.visit(func_tree)
    func_tree.name = C.SymbolRef(node.func.name)
    # Emit as a static inline helper alongside the main function.
    func_tree.set_static()
    func_tree.set_inline()
    self.defns.append(func_tree)
    # FIXME: Infer type
    for p in func_tree.params:
        p.type = ct.c_float()
    func_tree.return_type = ct.c_float()
    return node
def transform(self, py_ast, program_config):
    """Emit an OpenMP C file applying a binary kernel to two arrays.

    The user's Python kernel is lowered to C as ``apply`` and invoked
    pairwise on ``input1``/``input2`` in an OpenMP-parallel loop.
    """
    # Shape/type information comes from the program config.
    input_data = program_config[0]
    total = np.prod(input_data.size)
    nd_ptr = np.ctypeslib.ndpointer(input_data.dtype,
                                    input_data.ndim,
                                    input_data.shape)
    elem_type = get_c_type_from_numpy_dtype(input_data.dtype)()

    # Lower and type the element-wise kernel.
    kernel_fn = PyBasicConversions().visit(py_ast.body[0])
    kernel_fn.name = 'apply'
    kernel_fn.params[0].type = elem_type
    kernel_fn.params[1].type = elem_type
    kernel_fn.return_type = elem_type

    driver_loop = StringTemplate(r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(input1[i], input2[i]);
        }
        """, {'length': Constant(total)})

    generated = CFile("generated",
                      [CppInclude("omp.h"),
                       CppInclude("stdio.h"),
                       kernel_fn,
                       FunctionDecl(None, FUNC_NAME,
                                    params=[SymbolRef("input1", nd_ptr()),
                                            SymbolRef("input2", nd_ptr()),
                                            SymbolRef("output", nd_ptr())],
                                    defn=[driver_loop])],
                      'omp')
    return [generated]
def transform(self, tree, program_config):
    """Build, compile and finalize an OpenCL tree-reduction.

    The Python kernel is lowered to an OpenCL ``apply`` function; the
    generated ``apply_kernel`` performs an in-place pairwise reduction
    within each work group, and the C ``apply_all`` controller repeatedly
    enqueues the kernel, ping-ponging the in/out buffers until one value
    per group remains.
    """
    A = program_config[0]
    len_A = np.prod(A.shape)
    # NOTE(review): numpy scalar instance used as the ctree type object
    # here, unlike the sibling transforms that use
    # get_c_type_from_numpy_dtype — confirm ctree accepts both.
    inner_type = A.dtype.type()
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
    apply_one = PyBasicConversions().visit(tree.body[0])
    apply_one.return_type = inner_type
    apply_one.params[0].type = inner_type
    apply_one.params[1].type = inner_type
    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("len", ct.c_int())],
        defn=[
            # Work-item identity within the NDRange.
            Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
            Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
            Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
            # for (int i = 1; i < WORK_GROUP_SIZE; i *= 2):
            # classic pairwise tree reduction with doubling stride.
            For(Assign(SymbolRef('i', ct.c_int()), Constant(1)),
                Lt(SymbolRef('i'), Constant(WORK_GROUP_SIZE)),
                MulAssign(SymbolRef('i'), Constant(2)),
                [
                    # Only items at even multiples of the stride combine,
                    # and only when the partner index is in range.
                    If(And(Eq(Mod(SymbolRef('globalId'),
                                  Mul(SymbolRef('i'), Constant(2))),
                              Constant(0)),
                           Lt(Add(SymbolRef('globalId'), SymbolRef('i')),
                              SymbolRef("len"))),
                       [
                           Assign(ArrayRef(SymbolRef('A'),
                                           SymbolRef('globalId')),
                                  FunctionCall(SymbolRef('apply'), [
                                      ArrayRef(SymbolRef('A'),
                                               SymbolRef('globalId')),
                                      ArrayRef(SymbolRef('A'),
                                               Add(SymbolRef('globalId'),
                                                   SymbolRef('i')))
                                  ])),
                       ]),
                    # All items must sync before the next stride.
                    FunctionCall(SymbolRef('barrier'),
                                 [SymbolRef('CLK_LOCAL_MEM_FENCE')])
                ]),
            # First item of each group publishes the group's result.
            If(Eq(SymbolRef('localId'), Constant(0)),
               [Assign(ArrayRef(SymbolRef('output_buf'),
                                SymbolRef('groupId')),
                       ArrayRef(SymbolRef('A'), SymbolRef('globalId')))])
        ]).set_kernel()
    kernel = OclFile("kernel", [apply_one, apply_kernel])
    # Host-side controller: re-enqueue the kernel, swapping buffers,
    # until the remaining length collapses to a single value.
    control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $n;
            size_t local = $local;
            intptr_t len = $length;
            cl_mem swap;
            for (int runs = 0; runs < $run_limit ; runs++){
                clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
                clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
                clSetKernelArg(kernel, 2, sizeof(intptr_t), &len);
                clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
                swap = buf;
                buf = out_buf;
                out_buf = swap;
                len = len/local + (len % local != 0);
            }
        }
        """,
        {'local': Constant(WORK_GROUP_SIZE),
         # Round the global size up to a whole number of work groups.
         'n': Constant(len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE)),
         'length': Constant(len_A),
         'run_limit': Constant(ceil(log(len_A, WORK_GROUP_SIZE)))})
    proj = Project([kernel, CFile("generated", [control])])
    fn = ConcreteXorReduction()
    program = cl.clCreateProgramWithSource(
        fn.context, kernel.codegen()).build()
    apply_kernel_ptr = program['apply_kernel']
    # FIX: apply_all takes four arguments (queue, kernel, buf, out_buf);
    # the previous signature declared only one cl_mem, dropping out_buf.
    entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel,
                              cl.cl_mem, cl.cl_mem)
    return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def transform(self, py_ast, program_config):
    """Generate an OpenMP C file that mean-subtracts each pFOV segment.

    For every 2-D layer and every segment (pFOV) within it, the
    generated code averages the segment's samples and writes
    ``input - avg`` to the output buffer. Index arithmetic walks the
    layer in a strided (column-major-like) order — see the template.
    """
    # Get the initial data
    input_data = program_config[0]
    # Layer/segment geometry; np.prod coerces config scalars to ints.
    num_2d_layers = np.prod(input_data.num_frames)
    data_height = np.prod(input_data.data_height)
    layer_length = np.prod(input_data.size // num_2d_layers)
    segment_length = np.prod(input_data.segment_length)
    inp_type = get_c_type_from_numpy_dtype(input_data.dtype)()
    input_pointer = np.ctypeslib.ndpointer(input_data.dtype,
                                           input_data.ndim,
                                           input_data.shape)
    # Output is viewed as a flat (size, 1) array of the same dtype.
    output_pointer = np.ctypeslib.ndpointer(input_data.dtype, 1,
                                            (input_data.size, 1))
    # Get the kernel function, apply_one
    apply_one = PyBasicConversions().visit(py_ast).find(FunctionDecl)
    apply_one.return_type = inp_type
    apply_one.params[0].type = inp_type
    apply_one.params[1].type = inp_type
    # Naming our kernel method
    apply_one.name = 'apply'
    # Number of segments (pFOVs) per layer.
    num_pfovs = int(layer_length / segment_length)
    # print ("num layers: ", num_2d_layers)
    # print ("input size: ", input_data.size)
    # print ("layer length: ", layer_length)
    # TODO: TIME TO START CPROFILING THINGS!
    # Two passes per segment: accumulate the mean, then subtract it.
    # The index expression maps the j-th sample of segment i to its
    # strided position within the layer — assumes layer_length is a
    # multiple of data_height; TODO confirm against callers.
    reduction_template = StringTemplate(r"""
        #pragma omp parallel for collapse(2)
        for (int level = 0; level < $num_2d_layers; level++) {
            for (int i=0; i<$num_pfovs ; i++) {
                int level_offset = level * $layer_length;
                double avg = 0.0;
                // #pragma omp parallel for reduction (+:avg)
                for (int j=0; j<$pfov_length; j++) {
                    int in_layer_offset = ($pfov_length * i + j) / ($layer_length / $data_height);
                    int index = (in_layer_offset + ($pfov_length * i + j) * $data_height) % $layer_length;
                    // printf ("Index: %i, I: %i, J: %i\n", index, i, j);
                    avg += input_arr[level_offset + index];
                }
                avg = avg / $pfov_length;
                // #pragma omp parallel for
                for (int j=0; j<$pfov_length; j++) {
                    int in_layer_offset = ($pfov_length * i + j) / ($layer_length / $data_height);
                    int index = (in_layer_offset + ($pfov_length * i + j) * $data_height) % $layer_length;
                    output_arr[level_offset + index] = input_arr[level_offset + index] - avg;
                }
            }
        }
        """, {
        'num_2d_layers': Constant(num_2d_layers),
        'layer_length': Constant(layer_length),
        'num_pfovs': Constant(num_pfovs),
        'pfov_length': Constant(segment_length),
        'data_height': Constant(data_height),
    })
    # NOTE(review): apply_one is emitted into the file but the template
    # never calls apply() — presumably kept for linkage/debugging.
    reducer = CFile("generated", [
        CppInclude("omp.h"),
        CppInclude("stdio.h"),
        apply_one,
        FunctionDecl(None, REDUCTION_FUNC_NAME,
                     params=[
                         SymbolRef("input_arr", input_pointer()),
                         SymbolRef("output_arr", output_pointer())
                     ],
                     defn=[reduction_template])
    ], 'omp')
    return [reducer]
def transform(self, tree, program_config):
    """Build the OpenCL kernel + C controller for a one-group reduction.

    Each work item serially folds its strided slice of ``A`` with the
    lowered ``apply`` kernel, parks the partial in local memory, and
    item 0 folds the partials into ``output_buf[0]``.
    """
    # NOTE(review): dirname is currently unused here; the call is kept
    # in case config_to_dirname caches state — confirm and drop if pure.
    dirname = self.config_to_dirname(program_config)
    A = program_config[0]
    len_A = np.prod(A.shape)
    # ctype *class* for the element type; instantiated per use below.
    data_type = get_c_type_from_numpy_dtype(A.dtype)
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
    apply_one = PyBasicConversions().visit(tree).find(FunctionDecl)
    apply_one.name = 'apply'  # Naming our kernel method
    apply_one.return_type = data_type()
    apply_one.params[0].type = data_type()
    apply_one.params[1].type = data_type()
    # Elements each work item folds serially before the group combine.
    responsible_size = int(len_A / WORK_GROUP_SIZE)
    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("localData", pointer()).set_local()],
        defn=[
            Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
            Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
            Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
            # FIX: `A.dtype is np.int32` compared a dtype instance to a
            # type by identity and was always False; use equality.
            Assign(SymbolRef('localResult',
                             (ct.c_int() if A.dtype == np.int32
                              else ct.c_float())),
                   ArrayRef(SymbolRef('A'), SymbolRef('globalId'))),
            # Serial fold over this item's strided slice of A.
            For(Assign(SymbolRef('offset', ct.c_int()), Constant(1)),
                Lt(SymbolRef('offset'), Constant(responsible_size)),
                PostInc(SymbolRef('offset')),
                [
                    # FIX: wrap the callee name in SymbolRef, matching
                    # the combine call below (FunctionCall expects a
                    # node, not a bare string).
                    Assign(SymbolRef('localResult'),
                           FunctionCall(
                               SymbolRef(apply_one.name),
                               [SymbolRef('localResult'),
                                ArrayRef(SymbolRef('A'),
                                         Add(SymbolRef('globalId'),
                                             Mul(SymbolRef('offset'),
                                                 Constant(WORK_GROUP_SIZE))))])),
                ]),
            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                   SymbolRef('localResult')),
            # All partials must land in local memory before the combine.
            barrier(CLK_LOCAL_MEM_FENCE()),
            # Item 0 folds every other item's partial, then publishes.
            If(Eq(SymbolRef('globalId'), Constant(0)),
               [Assign(SymbolRef('localResult'),
                       FunctionCall(SymbolRef(apply_one.name),
                                    [SymbolRef('localResult'),
                                     ArrayRef(SymbolRef('localData'),
                                              Constant(x))]))
                for x in range(1, WORK_GROUP_SIZE)
                ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)),
                            SymbolRef('localResult'))])
        ]).set_kernel()
    # Hardcoded OpenCL host code that launches one work group.
    control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        }
        """,
        {'local': Constant(WORK_GROUP_SIZE),
         'n': Constant((len_A + WORK_GROUP_SIZE -
                        (len_A % WORK_GROUP_SIZE)) / 2),
         'length': Constant(len_A)})
    ocl_kernel = OclFile("kernel", [apply_one, apply_kernel])
    c_controller = CFile("generated", [control])
    return [ocl_kernel, c_controller]
def transform(self, tree, program_config):
    """Build the OpenCL kernel + C controller for a one-group reduction.

    Each work item serially reduces its strided slice of ``A`` with the
    lowered ``apply`` kernel (unrolled at codegen time), writes the
    partial result to local memory, and work item 0 combines all
    partials into ``output_buf[0]``.
    """
    A = program_config[0]
    len_A = np.prod(A.shape)
    inner_type = get_c_type_from_numpy_dtype(A.dtype)()
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
    apply_one = PyBasicConversions().visit(tree.body[0])
    apply_one.return_type = inner_type
    apply_one.params[0].type = inner_type
    apply_one.params[1].type = inner_type
    # How many elements each work item folds serially.
    responsible_size = int(len_A / WORK_GROUP_SIZE)
    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("localData", pointer()).set_local()],
        defn=[
            Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
            Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
            Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
            # NOTE(review): localResult is hardcoded c_int here even
            # though A's dtype may be float (the sibling transform
            # switches on dtype) — confirm int-only inputs are intended.
            Assign(SymbolRef('localResult', ct.c_int()),
                   ArrayRef(SymbolRef('A'), SymbolRef('globalId')))
        ] + [
            # Unrolled serial fold over this item's strided slice.
            Assign(SymbolRef('localResult'),
                   FunctionCall(SymbolRef('apply'),
                                [SymbolRef('localResult'),
                                 ArrayRef(SymbolRef('A'),
                                          Add(SymbolRef('globalId'),
                                              Constant(i * WORK_GROUP_SIZE)))]))
            for i in range(1, responsible_size)
        ] + [
            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                   SymbolRef('localResult')),
            # All partials must be in local memory before the combine.
            barrier(CLK_LOCAL_MEM_FENCE()),
            # Work item 0 folds every partial, then publishes the result.
            If(Eq(SymbolRef('globalId'), Constant(0)),
               [Assign(SymbolRef('localResult'),
                       FunctionCall(SymbolRef('apply'),
                                    [SymbolRef('localResult'),
                                     ArrayRef(SymbolRef('localData'),
                                              Constant(x))]))
                for x in range(1, WORK_GROUP_SIZE)
                ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)),
                            SymbolRef('localResult'))])
        ]).set_kernel()
    kernel = OclFile("kernel", [apply_one, apply_kernel])
    # Host-side controller: single enqueue of one work group; arg 2 is
    # the local scratch buffer (size in ints — see NOTE above).
    control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            cl_mem swap;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        }
        """,
        {'local': Constant(WORK_GROUP_SIZE),
         'n': Constant((len_A + WORK_GROUP_SIZE -
                        (len_A % WORK_GROUP_SIZE)) / 2),
         'length': Constant(len_A)})
    c_controller = CFile("generated", [control])
    return [kernel, c_controller]