def get_type(self, env=None):
    """Infer the ctypes result type of this binary operation.

    Args:
        env: optional symbol-table-like object (must expose ``_has_key`` and
            ``_lookup``) used to resolve ``SymbolRef`` operands; may be None.

    Returns:
        A ctypes instance describing the result type, or whatever
        ``get_common_ctype`` produces for the operand types.
    """
    if isinstance(self.op, Op.ArrayRef):
        # Indexing an array bound in the environment: the element type is
        # recorded on the bound value's ``_dtype_`` attribute.
        if isinstance(self.left, SymbolRef) and env is not None \
                and env._has_key(self.left.name):
            # renamed from ``type`` to avoid shadowing the builtin
            dtype = env._lookup(self.left.name)._dtype_
            return get_c_type_from_numpy_dtype(dtype)()

    # FIXME: integer promotions and stuff like that
    def resolve(node):
        # Resolve one operand's type: prefer the node's own get_type(),
        # then an environment binding, then a static ``type`` attribute.
        # (Was duplicated verbatim for left and right.)
        if hasattr(node, 'get_type'):
            return node.get_type()
        if isinstance(node, SymbolRef) and env is not None \
                and env._has_key(node.name):
            return env._lookup(node.name)
        if hasattr(node, 'type'):
            return node.type
        return None

    left_type = resolve(self.left)
    right_type = resolve(self.right)

    if isinstance(self.op, Op.ArrayRef):
        # Dereference a pointer-like type; fall back to the raw left type
        # when the element type is not callable (not a ctype class).
        ptr_type = left_type._type_
        return ptr_type() if callable(ptr_type) else left_type

    # Materialize the list: Py3 ``filter`` returns a lazy iterator, which
    # breaks any len()/indexing inside get_common_ctype.
    return get_common_ctype(
        [t for t in (right_type, left_type) if t is not None])
def get_type(self, env=None):
    """Infer the ctypes result type of this binary operation.

    Args:
        env: optional symbol-table-like object (must expose ``_has_key`` and
            ``_lookup``) used to resolve ``SymbolRef`` operands; may be None.

    Returns:
        A ctypes instance for array indexing, otherwise the common ctype of
        the resolvable operand types.
    """
    if isinstance(self.op, Op.ArrayRef):
        # Indexing an array bound in the environment: the element type is
        # recorded on the bound value's ``_dtype_`` attribute.
        if isinstance(self.left, SymbolRef) and env is not None \
                and env._has_key(self.left.name):
            # renamed from ``type`` to avoid shadowing the builtin
            dtype = env._lookup(self.left.name)._dtype_
            return get_c_type_from_numpy_dtype(dtype)()

    # FIXME: integer promotions and stuff like that
    def resolve(node):
        # Resolve one operand's type: prefer the node's own get_type(),
        # then an environment binding, then a static ``type`` attribute.
        # (Was duplicated verbatim for left and right.)
        if hasattr(node, 'get_type'):
            return node.get_type()
        if isinstance(node, SymbolRef) and env is not None \
                and env._has_key(node.name):
            return env._lookup(node.name)
        if hasattr(node, 'type'):
            return node.type
        return None

    left_type = resolve(self.left)
    right_type = resolve(self.right)

    # Materialize the list: Py3 ``filter`` returns a lazy iterator, which
    # breaks any len()/indexing inside get_common_ctype.
    return get_common_ctype(
        [t for t in (right_type, left_type) if t is not None])
def transform(self, py_ast, program_config):
    """Lower the Python kernel into an OpenMP C file for array-scalar ops."""
    # Argument metadata captured at specialization time.
    arg_cfg = program_config[0]
    n_elems = np.prod(arg_cfg.size)

    arr_ptr = np.ctypeslib.ndpointer(arg_cfg.dtype, arg_cfg.ndim,
                                     arg_cfg.shape)
    elem_type = get_c_type_from_numpy_dtype(arg_cfg.dtype)()
    scalar_type = get_c_type_from_numpy_dtype(
        np.dtype(arg_cfg.scalar_type))()

    # Translate the user's Python function into the C kernel ``apply``.
    kernel_fn = PyBasicConversions().visit(py_ast.body[0])
    kernel_fn.name = 'apply'
    kernel_fn.params[0].type = elem_type
    kernel_fn.params[1].type = scalar_type
    # TODO: figure out which data type to actually preserve
    kernel_fn.return_type = elem_type

    # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)
    omp_loop = StringTemplate(r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(arr[i], scalar);
        }
        """, {'length': Constant(n_elems)})

    entry = FunctionDecl(
        None, FUNC_NAME,
        params=[SymbolRef("arr", arr_ptr()),
                SymbolRef("scalar", scalar_type),
                SymbolRef("output", arr_ptr())],
        defn=[omp_loop])

    generated = CFile("generated",
                      [CppInclude("omp.h"),
                       CppInclude("stdio.h"),
                       kernel_fn,
                       entry],
                      'omp')
    return [generated]
def transform(self, py_ast, program_config):
    """Generate the OpenMP C translation for an elementwise array-scalar op."""
    cfg = program_config[0]  # argument metadata from the specializer
    total = np.prod(cfg.size)

    nd_ptr = np.ctypeslib.ndpointer(cfg.dtype, cfg.ndim, cfg.shape)
    elem_t = get_c_type_from_numpy_dtype(cfg.dtype)()
    scalar_t = get_c_type_from_numpy_dtype(np.dtype(cfg.scalar_type))()

    # The user's Python function becomes the C-level ``apply`` kernel.
    fn = PyBasicConversions().visit(py_ast.body[0])
    fn.name = 'apply'
    fn.params[0].type = elem_t
    fn.params[1].type = scalar_t
    # TODO: figure out which data type to actually preserve
    fn.return_type = elem_t

    # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)
    body_tpl = StringTemplate(
        r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(arr[i], scalar);
        }
        """,
        {'length': Constant(total)})

    params = [SymbolRef("arr", nd_ptr()),
              SymbolRef("scalar", scalar_t),
              SymbolRef("output", nd_ptr())]
    contents = [CppInclude("omp.h"),
                CppInclude("stdio.h"),
                fn,
                FunctionDecl(None, FUNC_NAME, params=params,
                             defn=[body_tpl])]
    return [CFile("generated", contents, 'omp')]
def finalize(self, transform_result, program_config):
    """Wrap the generated tree in a callable concrete specialized function."""
    tree = transform_result[0]
    # Recover the argument typing recorded at specialization time.
    arg_cfg = program_config[0]
    arr_ptr = np.ctypeslib.ndpointer(arg_cfg.dtype, arg_cfg.ndim,
                                     arg_cfg.shape)
    scalar_ctype = get_c_type_from_numpy_dtype(
        np.dtype(arg_cfg.scalar_type))
    # Entry signature: (array in, scalar, array out) -> void
    entry_type = CFUNCTYPE(None, arr_ptr, scalar_ctype, arr_ptr)
    concrete = ConcreteElemWiseArrayScalarOp()
    return concrete.finalize(Project([tree]), FUNC_NAME, entry_type)
def finalize(self, transform_result, program_config):
    """Produce the concrete callable for the generated array-scalar op."""
    generated_tree = transform_result[0]
    cfg = program_config[0]  # argument type metadata
    nd_ptr = np.ctypeslib.ndpointer(cfg.dtype, cfg.ndim, cfg.shape)
    scalar_t = get_c_type_from_numpy_dtype(np.dtype(cfg.scalar_type))
    # void entry point: (input array, scalar, output array)
    signature = CFUNCTYPE(None, nd_ptr, scalar_t, nd_ptr)
    fn = ConcreteElemWiseArrayScalarOp()
    return fn.finalize(Project([generated_tree]), FUNC_NAME, signature)
def transform(self, py_ast, program_config):
    """Lower the Python kernel into an OpenMP C file for array-array ops."""
    cfg = program_config[0]  # argument metadata from the specializer
    count = np.prod(cfg.size)

    buf_ptr = np.ctypeslib.ndpointer(cfg.dtype, cfg.ndim, cfg.shape)
    elem_t = get_c_type_from_numpy_dtype(cfg.dtype)()

    # Convert the user's Python function into the C kernel ``apply``;
    # both operands and the result share the array element type.
    fn = PyBasicConversions().visit(py_ast.body[0])
    fn.name = 'apply'
    fn.params[0].type = elem_t
    fn.params[1].type = elem_t
    fn.return_type = elem_t

    omp_loop = StringTemplate(
        r"""
        #pragma omp parallel for
        for (int i = 0; i < $length; i++) {
            output[i] = apply(input1[i], input2[i]);
        }
        """,
        {'length': Constant(count)})

    entry = FunctionDecl(None, FUNC_NAME,
                         params=[SymbolRef("input1", buf_ptr()),
                                 SymbolRef("input2", buf_ptr()),
                                 SymbolRef("output", buf_ptr())],
                         defn=[omp_loop])
    return [CFile("generated",
                  [CppInclude("omp.h"),
                   CppInclude("stdio.h"),
                   fn,
                   entry],
                  'omp')]
def transform(self, py_ast, program_config):
    """Generate an OpenMP C file that mean-centers each PFOV segment.

    For every 2-D layer, the data is partitioned into PFOV segments; each
    segment's average is computed and subtracted from its elements.
    """
    cfg = program_config[0]  # input metadata from the specializer
    layers = np.prod(cfg.num_frames)
    height = np.prod(cfg.data_height)
    per_layer = np.prod(cfg.size // layers)
    pfov_len = np.prod(cfg.segment_length)

    elem_t = get_c_type_from_numpy_dtype(cfg.dtype)()
    in_ptr = np.ctypeslib.ndpointer(cfg.dtype, cfg.ndim, cfg.shape)
    # Output is flattened to a single column of the full size.
    out_ptr = np.ctypeslib.ndpointer(cfg.dtype, 1, (cfg.size, 1))

    # Extract and type the kernel function, naming it ``apply``.
    fn = PyBasicConversions().visit(py_ast).find(FunctionDecl)
    fn.return_type = elem_t
    fn.params[0].type = elem_t
    fn.params[1].type = elem_t
    fn.name = 'apply'

    pfov_count = int(per_layer / pfov_len)

    # TODO: TIME TO START CPROFILING THINGS!
    reduction = StringTemplate(
        r"""
        #pragma omp parallel for collapse(2)
        for (int level = 0; level < $num_2d_layers; level++) {
            for (int i=0; i<$num_pfovs ; i++) {
                int level_offset = level * $layer_length;
                double avg = 0.0;
                // #pragma omp parallel for reduction (+:avg)
                for (int j=0; j<$pfov_length; j++) {
                    int in_layer_offset = ($pfov_length * i + j) / ($layer_length / $data_height);
                    int index = (in_layer_offset + ($pfov_length * i + j) * $data_height) % $layer_length;
                    // printf ("Index: %i, I: %i, J: %i\n", index, i, j);
                    avg += input_arr[level_offset + index];
                }
                avg = avg / $pfov_length;
                // #pragma omp parallel for
                for (int j=0; j<$pfov_length; j++) {
                    int in_layer_offset = ($pfov_length * i + j) / ($layer_length / $data_height);
                    int index = (in_layer_offset + ($pfov_length * i + j) * $data_height) % $layer_length;
                    output_arr[level_offset + index] = input_arr[level_offset + index] - avg;
                }
            }
        }
        """,
        {
            'num_2d_layers': Constant(layers),
            'layer_length': Constant(per_layer),
            'num_pfovs': Constant(pfov_count),
            'pfov_length': Constant(pfov_len),
            'data_height': Constant(height),
        })

    entry = FunctionDecl(None, REDUCTION_FUNC_NAME,
                         params=[SymbolRef("input_arr", in_ptr()),
                                 SymbolRef("output_arr", out_ptr())],
                         defn=[reduction])
    return [CFile("generated",
                  [CppInclude("omp.h"),
                   CppInclude("stdio.h"),
                   fn,
                   entry],
                  'omp')]
def transform(self, tree, program_config):
    """Build the OpenCL kernel plus C controller for a work-group reduction.

    Args:
        tree: Python AST of the user's binary reduction function.
        program_config: specializer config; element 0 carries dtype/shape.

    Returns:
        [OclFile, CFile]: the generated kernel and its host-side launcher.
    """
    # NOTE(review): result unused below; call kept in case
    # config_to_dirname has caching side effects -- confirm before removing.
    dirname = self.config_to_dirname(program_config)
    A = program_config[0]
    len_A = np.prod(A.shape)
    # ctype class for the element type of the parameters.
    data_type = get_c_type_from_numpy_dtype(A.dtype)
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)

    # Lower the user's function to C and name it ``apply``.
    apply_one = PyBasicConversions().visit(tree).find(FunctionDecl)
    apply_one.name = 'apply'
    apply_one.return_type = data_type()
    apply_one.params[0].type = data_type()
    apply_one.params[1].type = data_type()

    # Number of elements each work item is responsible for reducing.
    responsible_size = int(len_A / WORK_GROUP_SIZE)

    # Controller kernel: each work item folds its strided slice of A into
    # localData; work item 0 then folds localData into output_buf[0].
    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("localData", pointer()).set_local()],
        defn=[
            Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
            Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
            Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
            # BUG FIX: was ``A.dtype is np.int32`` -- identity comparison of
            # a dtype object against the scalar type is always False, so the
            # accumulator was always declared float. Use equality instead.
            Assign(SymbolRef('localResult',
                             (ct.c_int() if A.dtype == np.int32
                              else ct.c_float())),
                   ArrayRef(SymbolRef('A'), SymbolRef('globalId'))),
            For(Assign(SymbolRef('offset', ct.c_int()), Constant(1)),
                Lt(SymbolRef('offset'), Constant(responsible_size)),
                PostInc(SymbolRef('offset')),
                [
                    # Consistency fix: wrap the callee name in SymbolRef as
                    # the final-reduction branch below already does.
                    Assign(SymbolRef('localResult'),
                           FunctionCall(
                               SymbolRef(apply_one.name),
                               [SymbolRef('localResult'),
                                ArrayRef(SymbolRef('A'),
                                         Add(SymbolRef('globalId'),
                                             Mul(SymbolRef('offset'),
                                                 Constant(WORK_GROUP_SIZE))))])),
                ]),
            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                   SymbolRef('localResult')),
            barrier(CLK_LOCAL_MEM_FENCE()),
            If(Eq(SymbolRef('globalId'), Constant(0)),
               [Assign(SymbolRef('localResult'),
                       FunctionCall(SymbolRef(apply_one.name),
                                    [SymbolRef('localResult'),
                                     ArrayRef(SymbolRef('localData'),
                                              Constant(x))]))
                for x in range(1, WORK_GROUP_SIZE)] +
               [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)),
                       SymbolRef('localResult'))])
        ]).set_kernel()

    # Hardcoded OpenCL host code that binds buffers and launches the kernel.
    control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel,
                       cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                   &global, &local, 0, NULL, NULL);
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant((len_A + WORK_GROUP_SIZE -
                             (len_A % WORK_GROUP_SIZE)) / 2),
              'length': Constant(len_A),
              })

    ocl_kernel = OclFile("kernel", [apply_one, apply_kernel])
    c_controller = CFile("generated", [control])
    return [ocl_kernel, c_controller]
def transform(self, tree, program_config):
    """Build the OpenCL kernel plus C controller for a work-group reduction.

    Args:
        tree: Python AST whose body[0] is the user's binary reduction fn.
        program_config: specializer config; element 0 carries dtype/shape.

    Returns:
        [OclFile, CFile]: the generated kernel and its host-side launcher.
    """
    A = program_config[0]
    len_A = np.prod(A.shape)
    inner_type = get_c_type_from_numpy_dtype(A.dtype)()
    pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)

    # Lower the user's function to C; operands/result share the element type.
    apply_one = PyBasicConversions().visit(tree.body[0])
    apply_one.return_type = inner_type
    apply_one.params[0].type = inner_type
    apply_one.params[1].type = inner_type

    # Number of elements each work item is responsible for reducing.
    responsible_size = int(len_A / WORK_GROUP_SIZE)

    # BUG FIX: the accumulator was hardcoded ``ct.c_int()``, which truncates
    # float data in the generated kernel; derive it from A's dtype, matching
    # the kernel parameter types above.
    # NOTE(review): the host code below still sizes localData with
    # ``sizeof(int)``; correct for 4-byte element types -- confirm for others.
    apply_kernel = FunctionDecl(
        None, "apply_kernel",
        params=[SymbolRef("A", pointer()).set_global(),
                SymbolRef("output_buf", pointer()).set_global(),
                SymbolRef("localData", pointer()).set_local()],
        defn=[
            Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
            Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
            Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
            Assign(SymbolRef('localResult',
                             get_c_type_from_numpy_dtype(A.dtype)()),
                   ArrayRef(SymbolRef('A'), SymbolRef('globalId')))
        ] +
        # Each work item folds its strided slice of A into localResult.
        [Assign(SymbolRef('localResult'),
                FunctionCall(SymbolRef('apply'),
                             [SymbolRef('localResult'),
                              ArrayRef(SymbolRef('A'),
                                       Add(SymbolRef('globalId'),
                                           Constant(i * WORK_GROUP_SIZE)))]))
         for i in range(1, responsible_size)] +
        [
            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                   SymbolRef('localResult')),
            barrier(CLK_LOCAL_MEM_FENCE()),
            # Work item 0 folds every localData slot into output_buf[0].
            If(Eq(SymbolRef('globalId'), Constant(0)),
               [Assign(SymbolRef('localResult'),
                       FunctionCall(SymbolRef('apply'),
                                    [SymbolRef('localResult'),
                                     ArrayRef(SymbolRef('localData'),
                                              Constant(x))]))
                for x in range(1, WORK_GROUP_SIZE)] +
               [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)),
                       SymbolRef('localResult'))])
        ]).set_kernel()

    kernel = OclFile("kernel", [apply_one, apply_kernel])

    # Hardcoded OpenCL host code that binds buffers and launches the kernel.
    control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel,
                       cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            cl_mem swap;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                   &global, &local, 0, NULL, NULL);
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant((len_A + WORK_GROUP_SIZE -
                             (len_A % WORK_GROUP_SIZE)) / 2),
              'length': Constant(len_A)
              })

    c_controller = CFile("generated", [control])
    return [kernel, c_controller]