def transform(self, py_ast, program_cfg):
    """Build the generated C file for ``py_ast``.

    The first definition of the converted tree receives an extra ``retval``
    parameter, its statements are rewritten by ``MapTransformer`` and wrapped
    in a single ``for (int i = 0; i < length; i++)`` loop carrying the
    ``ivdep`` pragma, and the tree is then passed through
    ``DeclarationFiller`` and ``HwachaVectorize``.

    :param py_ast: Python AST handed to ``PyBasicConversions``.
    :param program_cfg: ``(arg_cfg, tune_cfg)`` pair; only ``arg_cfg`` is read.
    :return: one-element list containing the ``CFile`` named ``"generated"``.
    """
    arg_cfg, tune_cfg = program_cfg
    tree = PyBasicConversions().visit(py_ast)
    kernel = tree.body[0]
    dtype_by_name = {}
    # The output parameter is given the same ctypes type as the first argument.
    kernel.params.append(C.SymbolRef("retval", arg_cfg[0]()))

    # Annotate arguments
    for param, arg_type in zip(kernel.params, arg_cfg):
        param.type = arg_type()
        dtype_by_name[param.name] = arg_type._dtype_

    # Total element count of the first argument drives the loop bound.
    length = np.prod(arg_cfg[0]._shape_)
    mapper = MapTransformer("i", dtype_by_name, "retval")
    loop_body = [mapper.visit(stmt) for stmt in kernel.defn]
    loop_var_decl = C.Assign(C.SymbolRef("i", ct.c_int()), C.Constant(0))
    loop_test = C.Lt(C.SymbolRef("i"), C.Constant(length))
    loop_step = C.PostInc(C.SymbolRef("i"))
    kernel.defn = [
        C.For(loop_var_decl, loop_test, loop_step,
              body=loop_body, pragma="ivdep")
    ]

    tree = DeclarationFiller().visit(tree)
    # HwachaVectorize receives this list and the file body includes whatever
    # ends up in it.
    defns = []
    tree = HwachaVectorize(dtype_by_name, defns).visit(tree)

    prelude = (
        "#include <stdlib.h>",
        "#include <stdint.h>",
        "#include <assert.h>",
        'extern "C" void __hwacha_body(void);',
    )
    file_body = [StringTemplate(line) for line in prelude]
    file_body += defns
    file_body.append(tree)
    return [CFile("generated", file_body)]
def visit_If(self, node):
    """Replicate an ``if`` statement once per unrolled iteration.

    The statement is left alone unless it mentions the loop variable
    (``self.target_var``) or one of ``self.unrolled_vars``.  Otherwise
    ``self.factor`` deep copies are produced; in copy ``i`` every unrolled
    variable ``v`` becomes ``v_i`` and the loop variable becomes

    * ``target_var + i`` when ``self.unroll_type == 0``
    * ``factor * target_var + i`` when ``self.unroll_type == 1``

    :param node: the ``if`` node being visited.
    :return: ``node`` unchanged, or the list of rewritten copies.
    :raises AssertionError: if ``self.unroll_type`` is neither 0 nor 1.
    """
    tracked = list(self.unrolled_vars) + [self.target_var]
    if not any(util.contains_symbol(node, var) for var in tracked):
        return node

    copies = []
    for i in range(self.factor):
        stmt = deepcopy(node)
        for var in self.unrolled_vars:
            stmt = util.replace_symbol(
                var, C.SymbolRef(var + "_" + str(i)), stmt)
        if self.unroll_type == 0:
            index = C.Add(C.SymbolRef(self.target_var), C.Constant(i))
        elif self.unroll_type == 1:
            index = C.Add(
                C.Mul(C.Constant(self.factor),
                      C.SymbolRef(self.target_var)),
                C.Constant(i))
        else:
            # The previous `assert (false)` referenced an undefined name and
            # surfaced as NameError; raise the intended failure explicitly so
            # it also survives `python -O`.
            raise AssertionError(
                "unsupported unroll_type: %r" % (self.unroll_type,))
        copies.append(util.replace_symbol(self.target_var, index, stmt))
    return copies
def _gen_reduce_for_loop(self, loop, var, size):
    """Build an OpenCL reduction kernel for ``var`` and return its launch code.

    The kernel sums ``self.batch_size`` slices of ``size`` floats each into
    the first slice of the buffer.  It is compiled immediately, stored in
    ``self.kernels`` under a freshly generated name, and has
    ``self.cl_buffers[var]`` bound as its only argument.

    :param loop: unused here; kept so the signature is unchanged.
    :param var: name of the buffer to reduce (key into ``self.cl_buffers``).
    :param size: number of elements per slice; also the global work size.
    :return: ``StringTemplate`` with the host-side enqueue/finish calls.
    """
    kernel_name = self._gen_unique_kernel_name()
    # The template must stay multi-line: `#pragma` is a preprocessor
    # directive and has to begin its own source line.
    kernel_src = StringTemplate("""
        __kernel void $kernel_name(__global float * $arr) {
          int x = get_global_id(0);
          float sum = $arr[x];
          #pragma unroll
          for (int i = 1; i < $batch_size; ++ i) {
            sum += $arr[i * $size + x];
          }
          $arr[x] = sum;
        }
        """, {'batch_size': C.Constant(self.batch_size),
              'arr': C.SymbolRef(var),
              'size': C.Constant(size),
              'kernel_name': C.SymbolRef(kernel_name)})
    program = cl.clCreateProgramWithSource(
        latte.config.cl_ctx, kernel_src.codegen()).build()
    kernel = program[kernel_name]
    self.kernels[kernel_name] = kernel
    kernel.setarg(0, self.cl_buffers[var], ctypes.sizeof(cl.cl_mem))
    # Doubled braces are literal braces for str.format; the innermost pair is
    # the placeholder, yielding `{<size>}` as the array initializer.
    launch_code = """
        size_t global_size_{kernel_name}[1] = {{{looplen1}}};
        clEnqueueNDRangeKernel(queue, {kernel_name}, 1, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL);
        clFinish(queue);
        """.format(kernel_name=kernel_name, looplen1=size)
    return StringTemplate(launch_code)
def visit_BinaryOp(self, node):
    """Replicate an assignment once per unrolled iteration.

    Only assignments whose right-hand side mentions the loop variable
    (``self.target_var``) or one of ``self.unrolled_vars`` are touched.  When
    the assignment target carries a type (i.e. it is a declaration), its name
    is added to ``self.unrolled_vars`` so each copy gets its own ``name_i``.

    In copy ``i`` the loop variable becomes ``target_var + i`` for
    ``unroll_type == 0`` and ``factor * target_var + i`` for
    ``unroll_type == 1``.

    :param node: the binary-operation node being visited.
    :return: ``node`` unchanged, or the list of rewritten copies.
    :raises AssertionError: if ``self.unroll_type`` is neither 0 nor 1.
    """
    if not isinstance(node.op, C.Op.Assign):
        return node

    tracked = list(self.unrolled_vars) + [self.target_var]
    if not any(util.contains_symbol(node.right, var) for var in tracked):
        return node

    if hasattr(node.left, 'type') and node.left.type is not None:
        self.unrolled_vars.add(node.left.name)

    copies = []
    for i in range(self.factor):
        stmt = deepcopy(node)
        for var in self.unrolled_vars:
            stmt = util.replace_symbol(
                var, C.SymbolRef(var + "_" + str(i)), stmt)
        if self.unroll_type == 0:
            index = C.Add(C.SymbolRef(self.target_var), C.Constant(i))
        elif self.unroll_type == 1:
            index = C.Add(
                C.Mul(C.Constant(self.factor),
                      C.SymbolRef(self.target_var)),
                C.Constant(i))
        else:
            # The previous `assert (false)` referenced an undefined name and
            # surfaced as NameError; raise the intended failure explicitly so
            # it also survives `python -O`.
            raise AssertionError(
                "unsupported unroll_type: %r" % (self.unroll_type,))
        copies.append(util.replace_symbol(self.target_var, index, stmt))
    return copies
def _gen_reduce_for_loop(self, loop, var, size):
    """Return the ``parallel_for`` code that reduces ``_<var>`` across batches.

    The emitted code sums ``self.batch_size`` slices of ``size`` floats each
    into the first slice of ``_<var>``.  The ``ContinueNode`` / ``make_edge``
    scaffolding is emitted as commented-out lines only.

    :param loop: loop node; ``loop.test.right.value`` and
        ``loop.incr.value.value`` fill the (commented-out) edge loop.
    :param var: array name; the generated code refers to it as ``_<var>``.
    :param size: number of elements per slice.
    :return: ``StringTemplate`` holding the generated code.
    """
    looplen1 = loop.test.right
    loopincr = loop.incr.value.value
    # The template must stay multi-line: `//` comments run to end of line and
    # `#pragma` directives have to begin their own line, so collapsing this
    # text would disable or break the generated code.
    return StringTemplate("""
        //{
        //  ContinueNode *$reduce_node = new ContinueNode(&graph, [=]() {
          parallel_for(0,$size,
            [=](int low, int high) {
              #pragma simd
              for (int x = low; x < high; ++x) {
                float sum = _$arr[x];
                #pragma unroll
                for (int i = 1; i < $batch_size; ++ i) {
                  sum += _$arr[i * $size + x];
                }
                _$arr[x] = sum;
              }
            });
        //  });
        //  for (int i = 0; i < $looplen1; i+=$loopincr) {
        //    make_edge($node_list[i], $reduce_node);
        //  }
        //};
        """, {'size': C.Constant(size),
              'batch_size': C.Constant(self.batch_size),
              'arr': C.SymbolRef(var),
              'node_list': C.SymbolRef("node_list_"),
              'reduce_node': C.SymbolRef("reduce_node_"),
              'looplen1': C.Constant(looplen1.value),
              'loopincr': C.Constant(loopincr)
              })
def insert_malloc(body, shape, name, dtype, _global=False):
    """Prepend an aligned-allocation declaration for ``name`` to ``body``.

    The inserted statement declares ``name`` as a pointer to an array whose
    dimensions are ``shape[1:]`` and initialises it with
    ``_mm_malloc(prod(shape) * sizeof(T), 64)``, where ``T`` is the C type
    derived from ``dtype``.

    :param body: statement list that is mutated in place.
    :param shape: full array shape; the leading dimension is not part of the
        pointer type.
    :param name: identifier to declare.
    :param dtype: numpy dtype of the elements.
    :param _global: when true, the type is prefixed with ``__global ``.
    """
    def c_type_name():
        # Text of the element's C type, e.g. for use in sizeof() and casts.
        return ctree.types.codegen_type(
            ctree.types.get_c_type_from_numpy_dtype(dtype)())

    trailing_dims = "".join("[{}]".format(extent) for extent in shape[1:])

    element_count = 1
    for extent in shape:
        element_count = element_count * extent

    byte_count = C.Mul(
        C.Constant(element_count),
        C.FunctionCall(C.SymbolRef('sizeof'), [c_type_name()]))
    allocation = C.FunctionCall(
        C.SymbolRef('_mm_malloc'), [byte_count, C.Constant(64)])

    substitutions = {
        "arg_name0": C.SymbolRef(name),
        "arg_name1": allocation,
        "shape": C.SymbolRef(trailing_dims),
        "cast": C.SymbolRef(trailing_dims),
        "type": C.SymbolRef(c_type_name()),
        "global": C.SymbolRef("__global " if _global else ""),
    }
    declaration = StringTemplate(
        "$global$type (* $arg_name0)$shape = ($global$type (*)$cast) $arg_name1;",
        substitutions)
    body.insert(0, declaration)
def test_mul_by_1(self):
    """Multiplying by the constant 1 on either side folds to the operand."""
    operand_orders = (
        (C.Constant(1), C.SymbolRef("b")),
        (C.SymbolRef("b"), C.Constant(1)),
    )
    for lhs, rhs in operand_orders:
        folded = ConstantFold().visit(C.Mul(lhs, rhs))
        self.assertEqual(folded, C.SymbolRef("b"))
def test_add_zero(self):
    """Adding the constant 0 on either side folds to the operand."""
    operand_orders = (
        (C.SymbolRef("a"), C.Constant(0)),
        (C.Constant(0), C.SymbolRef("a")),
    )
    for lhs, rhs in operand_orders:
        folded = ConstantFold().visit(C.Add(lhs, rhs))
        self.assertEqual(folded, C.SymbolRef("a"))
def test_sub_zero(self):
    """``a - 0`` folds to ``a``; ``0 - a`` folds to the unary negation."""
    folder = ConstantFold()
    minus_zero = folder.visit(C.Sub(C.SymbolRef("a"), C.Constant(0)))
    self.assertEqual(minus_zero, C.SymbolRef("a"))

    folder = ConstantFold()
    zero_minus = folder.visit(C.Sub(C.Constant(0), C.SymbolRef("a")))
    # Compared through their generated text rather than node equality.
    expected_text = str(C.Sub(C.SymbolRef("a")))
    self.assertEqual(str(zero_minus), expected_text)
def test_recursive_fold(self):
    """Nested constants fold first, then the resulting ``0 + b`` folds."""
    two_minus_two = C.Add(C.Constant(2), C.Constant(-2))
    assignment = C.Assign(
        C.SymbolRef("c"), C.Add(two_minus_two, C.SymbolRef("b")))
    folded = ConstantFold().visit(assignment)
    expected = C.Assign(C.SymbolRef("c"), C.SymbolRef("b"))
    # Compared through their generated text rather than node equality.
    self.assertEqual(str(folded), str(expected))
def visit_Call(self, node):
    """Rewrite a call to ``sgemm`` into a ``cblas_sgemm`` call.

    For a call whose callee is the plain name ``sgemm``:

    * the callee is renamed to ``cblas_sgemm``;
    * the first two arguments (boolean transpose flags) are replaced with
      ``CBLAS_TRANSPOSE`` casts of 112 (flag is true) or 111 (otherwise);
    * a ``CBLAS_LAYOUT`` cast of 101 is inserted as the new first argument.

    NOTE(review): 101/111/112 are presumably the CBLAS enum values for
    row-major / no-transpose / transpose -- confirm against ``cblas.h``.

    Every other call is returned untouched.

    :param node: the call node being visited.
    :return: the same node, possibly mutated in place.
    """
    # Attribute calls such as `obj.method()` have no `.id`; they can never be
    # the bare `sgemm` name, so pass them through instead of crashing.
    if getattr(node.func, "id", None) != "sgemm":
        return node

    node.func.id = "cblas_sgemm"
    for position in range(2):
        flag = node.args[position]
        # Python 2 parses `True` as a Name (`.id == "True"`); Python 3 parses
        # it as a constant node whose `.value` is the `True` singleton.
        is_transposed = (getattr(flag, "id", None) == "True"
                         or getattr(flag, "value", None) is True)
        node.args[position] = C.Cast(
            CBLAS_TRANSPOSE(),
            C.Constant(112) if is_transposed else C.Constant(111))
    node.args.insert(0, C.Cast(CBLAS_LAYOUT(), C.Constant(101)))
    return node
def visit_For(self, node):
    """Unroll the loop over ``self.target_var`` by ``self.factor``.

    Nested statements are visited first.  For the targeted loop the increment
    becomes ``target_var += factor`` and the body is expanded by
    ``UnrollStatements``.  If the loop's upper bound equals the factor, the
    loop itself is dropped and its body is returned with the loop variable
    replaced by the constant 0.

    :param node: the ``for`` node being visited.
    :return: the (possibly modified) loop, or a list of statements when the
        loop was eliminated.
    """
    node.body = [self.visit(child) for child in node.body]

    if node.init.left.name != self.target_var:
        return node

    node.incr = C.AddAssign(
        C.SymbolRef(self.target_var), C.Constant(self.factor))
    unroller = UnrollStatements(self.target_var, self.factor)
    expanded = [unroller.visit(child) for child in node.body]
    node.body = util.flatten(expanded)

    if node.test.right.value != self.factor:
        return node

    # One pass of the unrolled body covers the whole range, so the loop
    # variable can only ever be 0.
    inlined = []
    for stmt in node.body:
        inlined.append(
            util.replace_symbol(node.init.left.name, C.Constant(0), stmt))
    return inlined
def visit_BinaryOp(self, node):
    """Rewrite array subscripts that refer to entries of ``self.cfg_dict``.

    * ``name[i, j, ...]`` (tuple index) on a configured array: the tuple is
      replaced by the flat index from ``self.gen_loop_index``.
    * ``name[v]`` where ``v`` is a key of ``self.loop_var_map``: the mapped
      loop variables are used to build the flat index the same way.
    * ``name.attr[k]`` on a configured object: folded to a constant by reading
      the attribute and indexing it with ``k``.

    Anything else has its operands visited recursively.

    NOTE(review): the nesting below was reconstructed from a collapsed
    one-line form; in particular the position of the early ``return node``
    should be confirmed against the original file.

    :raises NotImplementedError: for an attribute subscript whose base name is
        not in ``self.cfg_dict``.
    """
    if isinstance(node.op, C.Op.ArrayRef):
        if isinstance(node.left, C.SymbolRef):
            target = node.left.name
            if target in self.cfg_dict:
                # From here on `target` is the configured object, not its name.
                target = self.cfg_dict[target]
                # if type(target) in {int, float}:
                #     return C.Constant(target)
                if isinstance(node.right, ast.Tuple):
                    loopvars = node.right.elts
                    loopvars = tuple(var.name for var in loopvars)
                    node.right = self.gen_loop_index(
                        loopvars, target.shape)
                elif isinstance(node.right, C.SymbolRef):
                    if node.right.name in self.loop_var_map:
                        loopvars = self.loop_var_map[node.right.name]
                        node.right = self.gen_loop_index(
                            loopvars, target.shape)
                return node
        if isinstance(node.left, ast.Attribute):
            if node.left.value.name in self.cfg_dict:
                # Evaluate `obj.attr[k]` now and emit the value as a constant.
                attr = getattr(self.cfg_dict[node.left.value.name],
                               node.left.attr)
                return C.Constant(attr[node.right.value])
            else:
                raise NotImplementedError()
    node.left = self.visit(node.left)
    node.right = self.visit(node.right)
    return node
def visit_For(self, node):
    """Replace a loop that runs exactly once with its body.

    A loop qualifies when it has the shape
    ``for (v = INIT; v < TEST; v += INCR)`` with constant ``INIT``, ``TEST``
    and ``INCR`` on the same variable ``v``, and
    ``INIT < TEST <= INIT + INCR``.  Its body is then returned with ``v``
    replaced by the constant ``INIT``.

    Compared with the earlier sentinel-based check this

    * no longer treats the value ``-1`` as "not found", so bounds or steps of
      -1 are handled like any other constant;
    * leaves zero-trip loops (``INIT >= TEST``) untouched instead of inlining
      a body that would never have executed;
    * requires the test and increment to refer to the initialised variable.

    :param node: the ``for`` node being visited (children are visited first).
    :return: ``node`` itself, or the list of inlined body statements.
    """
    node.body = util.flatten([self.visit(s) for s in node.body])

    loop_var = init = test = incr = None

    if isinstance(node.init, C.BinaryOp) and \
            isinstance(node.init.op, C.Op.Assign) and \
            isinstance(node.init.left, C.SymbolRef) and \
            isinstance(node.init.right, C.Constant):
        loop_var = node.init.left.name
        init = node.init.right.value

    if isinstance(node.test, C.BinaryOp) and \
            isinstance(node.test.op, C.Op.Lt) and \
            isinstance(node.test.left, C.SymbolRef) and \
            isinstance(node.test.right, C.Constant) and \
            node.test.left.name == loop_var:
        test = node.test.right.value

    if isinstance(node.incr, C.AugAssign) and \
            isinstance(node.incr.op, C.Op.Add) and \
            isinstance(node.incr.target, C.SymbolRef) and \
            isinstance(node.incr.value, C.Constant) and \
            node.incr.target.name == loop_var:
        incr = node.incr.value.value

    if init is None or test is None or incr is None:
        return node

    # First iteration runs (init < test) and the second would not
    # (init + incr >= test); this also implies incr > 0.
    if init < test <= init + incr:
        return [util.replace_symbol(loop_var, C.Constant(init), s)
                for s in node.body]
    return node
def gen_loop_nest(self, loopvars, cfg):
    """Create one nested ``for`` loop per leading dimension of ``cfg.shape``.

    Each loop runs its variable from 0 up to (excluding) the matching extent
    in steps of one.  The shape is also recorded in ``self.loop_shape_map``
    under the ``loopvars`` key.

    :param loopvars: loop variable names, outermost first (used as a dict key).
    :param cfg: object exposing ``shape``.
    :return: ``(outermost_loop, innermost_loop)``; they are the same node when
        only one loop is generated.
    """
    def counting_loop(var, extent, statements):
        # for (int var = 0; var < extent; var++) { statements }
        return C.For(
            C.Assign(C.SymbolRef(var, ct.c_int()), C.Constant(0)),
            C.Lt(C.SymbolRef(var), C.Constant(extent)),
            C.PostInc(C.SymbolRef(var)),
            statements)

    open_body = []
    outermost = counting_loop(loopvars[0], cfg.shape[0], open_body)
    innermost = outermost
    for var, extent in zip(loopvars[1:], cfg.shape[1:]):
        innermost = counting_loop(var, extent, [])
        # Hang the new loop inside the previous level, then descend into it.
        open_body.append(innermost)
        open_body = innermost.body
    self.loop_shape_map[loopvars] = cfg.shape
    return outermost, innermost
def block_loop(self, node):
    """Split ``node`` into a block loop plus an intra-block loop.

    A new outer loop, whose variable name is the original name written twice
    (``i`` -> ``ii``), is inserted at the front of ``self.nest``.  It covers
    the original range in steps of ``self.block_factor`` and has ``[None]``
    as its body.  ``node`` is then narrowed in place to run from the
    block variable up to ``fmin(block_var + block_factor, original_bound)``.

    :param node: the ``for`` loop to block; mutated in place.
    """
    inner_name = node.init.left.name
    block_var = inner_name + inner_name
    lower_bound = node.init.right
    upper_bound = node.test.right

    block_loop_node = C.For(
        C.Assign(C.SymbolRef(block_var, node.init.left.type), lower_bound),
        C.Lt(C.SymbolRef(block_var), upper_bound),
        C.AddAssign(C.SymbolRef(block_var), C.Constant(self.block_factor)),
        [None])
    self.nest.insert(0, block_loop_node)

    # The original loop now iterates only inside the current block.
    node.init.right = C.SymbolRef(block_var)
    block_end = C.Add(C.SymbolRef(block_var), C.Constant(self.block_factor))
    node.test.right = C.FunctionCall(
        C.SymbolRef("fmin"), [block_end, upper_bound])
def gen_loop_index(self, loopvars, shape):
    """Build the flat index expression for ``loopvars`` over ``shape``.

    The last loop variable is used as-is; every earlier variable ``k`` is
    multiplied by the product of ``shape[k + 1:]`` and added in front, giving
    ``v0 * s0 + (v1 * s1 + (... + v_last))``.

    :param loopvars: sequence of loop variable names (must be non-empty).
    :param shape: extents of the indexed array.
    :return: the index expression node.
    """
    expr = C.SymbolRef(loopvars[-1])
    # Walk from the second-to-last variable back to the first.
    for axis in range(len(loopvars) - 2, -1, -1):
        stride = C.Constant(np.prod(shape[axis + 1:]))
        scaled = C.Mul(C.SymbolRef(loopvars[axis]), stride)
        expr = C.Add(scaled, expr)
    return expr
def rewrite_arg(self, arg):
    """Rewrite one subscript of an array reference for prefetching.

    ``arg`` is either a (possibly nested) array subscript or the address of
    one.  Starting at the outermost subscript, the chain of ``.left`` nodes is
    followed ``-1 - self.dim`` times; along the way the first
    ``self.prefetch_num_zeroes`` indices are overwritten with 0.  The index
    reached is then replaced with
    ``(prefetch_loop_var + prefetch_constant) * prefetch_multiplier``.

    The unused ``old_expr`` local and the commented-out alternative formula
    were removed; the generated expression is unchanged.

    NOTE(review): ``self.dim`` is presumably a negative index (-1 selects the
    outermost subscript); any other kind of ``arg`` fails with
    ``AttributeError`` exactly as before.

    :param arg: expression to rewrite; mutated in place.
    :return: ``arg`` when it already was an address-of expression, otherwise
        ``C.Ref(arg)`` for a bare array subscript.
    """
    is_subscript = (isinstance(arg, C.BinaryOp)
                    and isinstance(arg.op, C.Op.ArrayRef))
    is_address_of_subscript = (isinstance(arg, C.UnaryOp)
                               and isinstance(arg.op, C.Op.Ref)
                               and isinstance(arg.arg, C.BinaryOp)
                               and isinstance(arg.arg.op, C.Op.ArrayRef))
    if is_address_of_subscript:
        curr_node = arg.arg
    elif is_subscript:
        curr_node = arg
    else:
        curr_node = None

    idx = self.dim
    num_zeroes = self.prefetch_num_zeroes
    while idx != -1:
        if num_zeroes > 0:
            curr_node.right = C.Constant(0)
            num_zeroes -= 1
        curr_node = curr_node.left
        idx += 1

    curr_node.right = C.Mul(
        C.Add(C.SymbolRef(self.prefetch_loop_var),
              C.SymbolRef(self.prefetch_constant)),
        C.SymbolRef(self.prefetch_multiplier))

    if is_subscript:
        # The caller receives an address, so wrap a bare subscript.
        return C.Ref(arg)
    return arg
def visit_For(self, node):
    """Unroll the loop over ``self.target_var`` by ``self.factor``.

    Nested statements are visited first.  For the targeted loop:

    * ``unroll_type == 0``: the increment becomes ``target_var += factor``;
    * ``unroll_type == 1``: the constant upper bound is divided by
      ``factor`` (it must divide evenly).

    The body is then expanded by ``UnrollStatements``.

    The validation used to rely on ``assert`` statements, which disappear
    under ``python -O``; it now raises explicitly with the same exception
    type.  A duplicated increment assignment was also dropped.

    :param node: the ``for`` node being visited.
    :return: the same node, modified in place.
    :raises AssertionError: if ``unroll_type`` is neither 0 nor 1, or if the
        upper bound is not a multiple of ``factor`` for ``unroll_type == 1``.
    """
    node.body = [self.visit(s) for s in node.body]
    if node.init.left.name != self.target_var:
        return node

    if self.unroll_type == 0:
        node.incr = C.AddAssign(C.SymbolRef(self.target_var),
                                C.Constant(self.factor))
    elif self.unroll_type == 1:
        trip_count = node.test.right.value
        if trip_count % self.factor != 0:
            raise AssertionError(
                "loop bound %r is not a multiple of unroll factor %r"
                % (trip_count, self.factor))
        node.test.right.value = trip_count // self.factor
    else:
        raise AssertionError(
            "unsupported unroll_type: %r" % (self.unroll_type,))

    visitor = UnrollStatements(self.target_var, self.factor,
                               self.unroll_type)
    node.body = util.flatten([visitor.visit(s) for s in node.body])
    return node
def replace_loopvars_as_constants(self, node):
    """Substitute known loop variables in a Python expression with constants.

    ``self.loop_vars`` holds ``(name, value, ...)`` entries.  A ``Name`` whose
    identifier matches an entry is replaced by ``C.Constant(value)`` (first
    match wins); ``BinOp`` nodes are processed recursively on both operands.
    Every other node is returned unchanged.

    :param node: Python AST expression node.
    :return: the replacement constant or the (possibly mutated) ``node``.
    """
    if isinstance(node, ast.BinOp):
        node.left = self.replace_loopvars_as_constants(node.left)
        node.right = self.replace_loopvars_as_constants(node.right)
        return node

    if not isinstance(node, ast.Name):
        return node

    matches = (entry for entry in self.loop_vars if entry[0] == node.id)
    for entry in matches:
        return C.Constant(entry[1])
    return node
def transform(self, tree, program_cfg):
    """Generate the C source of an ``im2col`` routine for one input shape.

    The kernel, stride and padding sizes come from the instance; the channel
    count, height and width come from ``arg_cfg[0]``.  All of them are baked
    into the function body as constants.

    :param tree: not used.
    :param program_cfg: ``(arg_cfg, tune_cfg)``; ``arg_cfg[0]`` unpacks into
        ``(channels, height, width)`` and ``arg_cfg[1]`` is the type used
        for both pointer parameters.
    :return: one-element list with the ``C.CFile`` named ``'im2col'``.
    """
    arg_cfg, _ = program_cfg
    channels, height, width = arg_cfg[0]

    instance_fields = ('pad_h', 'pad_w', 'stride_h', 'stride_w',
                       'kernel_h', 'kernel_w')
    substitutions = {
        field: C.Constant(getattr(self, field)) for field in instance_fields
    }
    substitutions['channels'] = C.Constant(channels)
    substitutions['height'] = C.Constant(height)
    substitutions['width'] = C.Constant(width)

    params = [
        C.SymbolRef("data_im", arg_cfg[1]()),
        C.SymbolRef("data_col", arg_cfg[1]()),
    ]

    # Adjacent literals concatenate into the exact single-line template text.
    body_text = (
        " int stride_h = $stride_h;"
        " int stride_w = $stride_w;"
        " int pad_h = $pad_h;"
        " int pad_w = $pad_w;"
        " int kernel_h = $kernel_h;"
        " int kernel_w = $kernel_w;"
        " int channels = $channels;"
        " int height = $height;"
        " int width = $width;"
        " int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;"
        " int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;"
        " int channels_col = channels * kernel_h * kernel_w;"
        " for (int c = 0; c < channels_col; ++c) {"
        " int w_offset = c % kernel_w;"
        " int h_offset = (c / kernel_w) % kernel_h;"
        " int c_im = c / kernel_h / kernel_w;"
        " for (int h = 0; h < height_col; ++h) {"
        " for (int w = 0; w < width_col; ++w) {"
        " int h_pad = h * stride_h - pad_h + h_offset;"
        " int w_pad = w * stride_w - pad_w + w_offset;"
        " if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)"
        " data_col[(c * height_col + h) * width_col + w] ="
        " data_im[(c_im * height + h_pad) * width + w_pad];"
        " else"
        " data_col[(c * height_col + h) * width_col + w] = 0;"
        " } } } "
    )
    im2col = C.FunctionDecl(
        None, C.SymbolRef("im2col"), params,
        [StringTemplate(body_text, substitutions)])
    return [C.CFile('im2col', [im2col])]
def visit_For(self, node):
    """Insert prefetch calls after matching vector loads in one loop.

    Only the loop whose variable is ``self.enclosing_loop_var`` is modified.
    Inside it, up to ``self.prefetch_count`` statements of the form
    ``x = _mm..._load_/_set1/_broadcast...(arg, ...)`` whose first argument
    satisfies ``check_name(arg, self.prefetch_field)`` each get two new
    statements: a prefetch call on the rewritten address plus
    ``prefetch_offset_var``, and an update that advances
    ``prefetch_offset_var`` by ``self.prefetch_offset``.

    The new statements go right after the load, unless this loop is not
    ``self.prefetch_dest_loop``; then they are appended to the class-level
    ``HoistPrefetch.escape_body`` list and a declaration of
    ``prefetch_offset_var`` is appended to ``InitPrefetcher.init_body``.

    NOTE(review): the nesting below was reconstructed from a collapsed
    one-line form; confirm the placement of ``added_code = True`` and of the
    two ``where_to_add.append`` calls against the original file.
    """
    node.body = util.flatten([self.visit(s) for s in node.body])
    if node.init.left.name == self.enclosing_loop_var:
        new_body = []
        added_code = False
        # Local budget: at most this many prefetches are emitted per loop.
        prefetch_count = self.prefetch_count
        for stmt in node.body:
            new_body.append(stmt)
            if prefetch_count > 0 and isinstance(stmt, C.BinaryOp) and isinstance(stmt.op, C.Op.Assign) and \
               isinstance(stmt.right, C.FunctionCall) and "_mm" in stmt.right.func.name \
               and ("_load_" in stmt.right.func.name or "_set1" in stmt.right.func.name or "_broadcast" in stmt.right.func.name):
                # Result is discarded; looks like a debugging leftover.
                ast.dump(stmt.right.args[0])
                if check_name(stmt.right.args[0], self.prefetch_field):
                    # Work on a copy so the original load is left untouched.
                    array_ref = deepcopy(stmt.right.args[0])
                    new_array_ref = self.rewrite_arg(array_ref)
                    where_to_add = new_body
                    prefetch_count -= 1
                    if node.init.left.name != self.prefetch_dest_loop:
                        # Hoisted: collected on the class for another pass to
                        # place (presumably HoistPrefetch -- verify).
                        where_to_add = HoistPrefetch.escape_body
                        added_code = True
                    where_to_add.append(
                        C.FunctionCall(
                            C.SymbolRef(prefetch_symbol_table[
                                self.cacheline_hint]), [
                                    C.Add(new_array_ref,
                                          C.SymbolRef("prefetch_offset_var"))
                                ]))
                    where_to_add.append(
                        C.Assign(
                            C.SymbolRef("prefetch_offset_var"),
                            C.Add(C.SymbolRef("prefetch_offset_var"),
                                  C.Constant(self.prefetch_offset))))
        if added_code:
            # The hoisted code needs the offset variable declared up front.
            InitPrefetcher.init_body.append(
                C.Assign(
                    C.SymbolRef("prefetch_offset_var", ctypes.c_int()),
                    C.Constant(0)))
        node.body = new_body
    return node
def _gen_reduce_for_loop(self, loop, var, size):
    """Return the OpenMP loop that reduces ``_<var>`` across batches.

    The emitted code sums ``self.batch_size`` slices of ``size`` floats each
    into the first slice of ``_<var>``.

    :param loop: loop node; ``loop.test.right.value`` and
        ``loop.incr.value.value`` are passed to the template, although the
        template text does not reference them.
    :param var: array name; the generated code refers to it as ``_<var>``.
    :param size: number of elements per slice.
    :return: ``StringTemplate`` holding the generated code.
    """
    looplen1 = loop.test.right
    loopincr = loop.incr.value.value
    # The template must stay multi-line: each `#pragma` is a preprocessor
    # directive and has to begin its own source line.
    return StringTemplate("""
        #pragma omp parallel for simd
        for (int x = 0; x < $size; ++x) {
          float sum = _$arr[x];
          #pragma unroll
          for (int i = 1; i < $batch_size; ++ i) {
            sum += _$arr[i * $size + x];
          }
          _$arr[x] = sum;
        }
        """, {'size': C.Constant(size),
              'batch_size': C.Constant(self.batch_size),
              'arr': C.SymbolRef(var),
              'node_list': C.SymbolRef("node_list_"),
              'reduce_node': C.SymbolRef("reduce_node_"),
              'looplen1': C.Constant(looplen1.value),
              'loopincr': C.Constant(loopincr)
              })
def visit_FunctionDecl(self, node):
    """Wrap every top-level loop of a function with wall-clock timing.

    For the ``k``-th ``for`` statement directly in the function body,
    ``times[k] -= omp_get_wtime()`` is placed before it and
    ``times[k] += omp_get_wtime()`` after it.  The body is prefixed with a
    ``times`` declaration and a ``calloc_doubles(count)`` assignment, and
    suffixed with one ``printf`` per timed loop.

    :param node: function declaration; its ``defn`` list is replaced.
    :return: the same node.
    """
    def timer_slot(index):
        # times[index]
        return C.ArrayRef(C.SymbolRef('times'), C.Constant(index))

    instrumented = []
    loops_seen = 0
    for statement in node.defn:
        if not isinstance(statement, (ast.For, C.For)):
            instrumented.append(statement)
            continue
        start = C.SubAssign(timer_slot(loops_seen),
                            C.FunctionCall('omp_get_wtime', []))
        stop = C.AddAssign(timer_slot(loops_seen),
                           C.FunctionCall('omp_get_wtime', []))
        instrumented.extend([start, statement, stop])
        loops_seen += 1

    allocate = C.Assign(
        C.SymbolRef('times'),
        C.FunctionCall(C.SymbolRef('calloc_doubles'),
                       [C.Constant(loops_seen)]))
    declare = C.Assign(C.SymbolRef("*times", ctypes.c_double()),
                       C.Constant(0))

    reports = [
        C.FunctionCall(
            C.SymbolRef('printf'),
            [C.String("\ttimes[%d] = %g\\n"), C.Constant(slot),
             timer_slot(slot)])
        for slot in range(loops_seen)
    ]

    # Declaration first, then the allocation, then the timed body and reports.
    node.defn = [declare, allocate] + instrumented + reports
    return node
def visit_BinaryOp(self, node):
    """Fold a binary operation after folding both of its operands.

    Two constant operands are evaluated immediately through ``op_map``.
    Otherwise additions, subtractions and multiplications are delegated to
    ``fold_add``, ``fold_sub`` and ``fold_mul``; every other operator is
    returned as-is.

    :param node: binary-operation node; operands are replaced in place.
    :return: the folded node.
    """
    node.left = self.visit(node.left)
    node.right = self.visit(node.right)

    lhs, rhs = node.left, node.right
    both_constant = isinstance(lhs, C.Constant) and isinstance(rhs, C.Constant)
    if both_constant:
        evaluate = op_map[node.op.__class__]
        return C.Constant(evaluate(lhs.value, rhs.value))

    if isinstance(node.op, C.Op.Add):
        return self.fold_add(node)
    if isinstance(node.op, C.Op.Sub):
        return self.fold_sub(node)
    if isinstance(node.op, C.Op.Mul):
        return self.fold_mul(node)
    return node
def rewrite_arg(self, arg):
    """Offset one subscript of an array reference by a constant.

    ``arg`` is either a (possibly nested) array subscript or the address of
    one.  Starting at the outermost subscript, the chain of ``.left`` nodes is
    followed until the counter that starts at ``self.dim`` reaches -1; along
    the way the first ``self.prefetch_num_zeroes`` indices are overwritten
    with 0.  The index reached has ``self.prefetch_constant`` added to it.

    :param arg: expression to rewrite; mutated in place.
    :return: ``arg`` when it already was an address-of expression, otherwise
        ``C.Ref(arg)`` for a bare array subscript.
    """
    subscript = None
    if isinstance(arg, C.UnaryOp) and isinstance(arg.op, C.Op.Ref) \
            and isinstance(arg.arg, C.BinaryOp) \
            and isinstance(arg.arg.op, C.Op.ArrayRef):
        subscript = arg.arg
    elif isinstance(arg, C.BinaryOp) and isinstance(arg.op, C.Op.ArrayRef):
        subscript = arg

    depth = self.dim
    zeroes_left = self.prefetch_num_zeroes
    while depth != -1:
        if zeroes_left > 0:
            subscript.right = C.Constant(0)
            zeroes_left -= 1
        subscript = subscript.left
        depth += 1

    subscript.right = C.Add(subscript.right,
                            C.Constant(self.prefetch_constant))

    needs_address = (isinstance(arg, C.BinaryOp)
                     and isinstance(arg.op, C.Op.ArrayRef))
    return C.Ref(arg) if needs_address else arg
def visit_BinaryOp(self, node):
    """Propagate known values through assignments and arithmetic.

    * ``name = expr`` where ``expr`` has a known value: the value is recorded
      in ``self.seen`` and the statement is dropped (``None`` is returned).
    * ``/``, ``*``, ``%``, ``+``, ``-`` with two known operands: replaced by
      the computed constant.
    * ``x * 0`` / ``0 * x`` become ``0``; ``0 + x`` / ``x + 0`` become ``x``.

    Integer division and remainder are folded with C semantics (quotient
    truncated toward zero, remainder takes the sign of the dividend) so the
    constant matches what the generated C code would have computed.  A zero
    divisor is left unfolded rather than raising ``ZeroDivisionError`` while
    transforming.

    :param node: binary-operation node; operands are replaced in place.
    :return: ``None``, a constant node, one of the operands, or ``node``.
    """
    def c_divmod(dividend, divisor):
        # Python's // floors; C truncates toward zero.
        quotient = abs(dividend) // abs(divisor)
        if (dividend < 0) != (divisor < 0):
            quotient = -quotient
        return quotient, dividend - divisor * quotient

    node.left = self.visit(node.left)
    node.right = self.visit(node.right)

    if isinstance(node.op, C.Op.Assign) and isinstance(
            node.left, C.SymbolRef):
        value = self._get_value(node.right)
        if value is not None:
            self.seen[node.left.name] = value
            return None
    elif isinstance(node.op, C.Op.Div):
        left = self._get_value(node.left)
        right = self._get_value(node.right)
        if left is not None and right is not None and right != 0:
            if isinstance(left, int) and isinstance(right, int):
                return C.Constant(c_divmod(left, right)[0])
            else:
                return C.Constant(left / right)
    elif isinstance(node.op, C.Op.Mul):
        left = self._get_value(node.left)
        right = self._get_value(node.right)
        if left is not None and right is not None:
            return C.Constant(left * right)
        elif left == 0 or right == 0:
            return C.Constant(0)
    elif isinstance(node.op, C.Op.Mod):
        left = self._get_value(node.left)
        right = self._get_value(node.right)
        if left is not None and right is not None and right != 0:
            if isinstance(left, int) and isinstance(right, int):
                return C.Constant(c_divmod(left, right)[1])
            return C.Constant(left % right)
    elif isinstance(node.op, C.Op.Add):
        left = self._get_value(node.left)
        right = self._get_value(node.right)
        if left is not None and right is not None:
            return C.Constant(left + right)
        elif left == 0:
            return node.right
        elif right == 0:
            return node.left
    elif isinstance(node.op, C.Op.Sub):
        left = self._get_value(node.left)
        right = self._get_value(node.right)
        if left is not None and right is not None:
            return C.Constant(left - right)
    return node
def visit_FunctionDef(self, node):
    """Type the function's parameters and materialise collected declarations.

    The body statements are visited (results stored in ``node.defn``), which
    may fill ``self.decls``.  Each parameter except ``self`` is looked up in
    ``self.symbol_table`` and turned into a typed ``C.SymbolRef``.  Entries of
    ``self.decls`` that are arrays become additional parameters; all others
    become typed assignments inserted at the front of ``node.body``, with
    ``True``/``False`` emitted as the ints 1/0.

    :param node: Python function definition; mutated in place.
    :return: the same node.
    """
    self.decls = {}
    node.defn = [self.visit(stmt) for stmt in node.body]

    def array_pointer(array):
        # Pointer type carrying the array's dtype, rank and shape.
        return np.ctypeslib.ndpointer(array.dtype, array.ndim, array.shape)()

    typed_params = []
    for param in node.args.args:
        # Python 3 stores the parameter name in `.arg`, Python 2 in `.id`.
        arg_name = param.arg if sys.version_info > (3, 0) else param.id
        if arg_name == 'self':
            continue
        bound_value = self.symbol_table[arg_name]
        if isinstance(bound_value, Array):
            c_type = array_pointer(bound_value)
        else:
            c_type = get_ctype(bound_value)
        typed_params.append(C.SymbolRef(arg_name, c_type))

    for name, value in self.decls.items():
        if isinstance(value, Array):
            c_type = array_pointer(value)
            # Address read retained from the original; its result is unused.
            value = value.ctypes.data
            typed_params.append(C.SymbolRef(name, c_type))
            continue

        if value is True:
            value, c_type = 1, ct.c_int()
        elif value is False:
            value, c_type = 0, ct.c_int()
        else:
            c_type = get_ctype(value)
        node.body.insert(
            0, C.Assign(C.SymbolRef(name, c_type), C.Constant(value)))

    node.args.args = typed_params
    return node
def visit_Subscript(self, node):
    """Resolve constant subscripts and flatten tuple subscripts.

    * ``name[<number>]`` where ``name`` is in ``self.decls``: the entry is
      removed from ``self.decls`` and the selected element is returned as a
      constant.
    * ``base[i, j, ...]``: the tuple index is replaced by a single expression
      in which each index is multiplied by the product of the trailing
      dimensions of the indexed value's shape.

    NOTE(review): the nesting below was reconstructed from a collapsed
    one-line form; confirm that the tuple branch sits outside the
    ``C.SymbolRef`` check.  The code also relies on ``node.slice.value`` and
    ``ast.Num``, i.e. the pre-3.9 AST layout.
    """
    node.slice = self.visit(node.slice)
    node.value = self.visit(node.value)
    if isinstance(node.value, C.SymbolRef):
        # Evaluate subscripts of constants immediately (i.e. tuple indices)
        if node.value.name in self.decls:
            if isinstance(node.slice.value, ast.Num):
                # pop(): the declaration is consumed by this constant.
                value = self.decls.pop(node.value.name)
                return C.Constant(value[node.slice.value.n])
    if isinstance(node.slice.value, ast.Tuple):
        if isinstance(node.value, C.SymbolRef):
            value = self.decls[node.value.name]
        else:
            value = self.eval_in_table(node.value)
        index = node.slice.value.elts
        # Last given index, scaled by the dimensions that were not indexed.
        c_index = ast.BinOp(index[-1], ast.Mult(),
                            ast.Num(np.prod(value.shape[len(index):])))
        # Remaining indices from inner to outer, each scaled by the product
        # of the dimensions after it.
        for dim in reversed(range(len(index) - 1)):
            c_index = ast.BinOp(
                ast.BinOp(index[dim],
                          ast.Mult(),
                          ast.Num(np.prod(value.shape[dim + 1:]))),
                ast.Add(), c_index)
        node.slice.value = c_index
    return node
def __init__(self, kernel_size, stride, padding, shape):
    """Store the convolution geometry for the column-to-image transform.

    The base class is initialised with a constant-zero node.

    :param kernel_size: ``(kernel_h, kernel_w)`` pair.
    :param stride: ``(stride_h, stride_w)`` pair.
    :param padding: ``(pad_h, pad_w)`` pair.
    :param shape: stored unchanged as ``self.shape``.
    """
    super(Col2Im, self).__init__(C.Constant(0))
    kernel_h, kernel_w = kernel_size
    self.kernel_h = kernel_h
    self.kernel_w = kernel_w
    stride_h, stride_w = stride
    self.stride_h = stride_h
    self.stride_w = stride_w
    pad_h, pad_w = padding
    self.pad_h = pad_h
    self.pad_w = pad_w
    self.shape = shape