def insert_free(name):
    """Return a StringTemplate that emits `_mm_free(<name>);` for buffer `name`."""
    free_call = C.FunctionCall(C.SymbolRef('_mm_free'), [C.SymbolRef(name)])
    return StringTemplate("$free;", {"free": free_call})
def visit_AugAssign(self, node):
    """Rewrite `target op= value` into AVX intrinsic calls when vectorizing.

    When self.vectorize is False the node is visited and returned unchanged.
    Otherwise:
      * targets indexed by the vectorized loop variable become masked
        store+binop sequences (mm256_store_ps),
      * `x += a * b` becomes a fused multiply-add (_mm256_fmadd_ps),
      * `x += <vector call>` becomes _mm256_add_ps.
    """
    node.value = self.visit(node.value)
    if not self.vectorize:
        node.target = self.visit(node.target)
        return node
    if util.contains_symbol(node.target, self.loop_var):
        # Target is indexed by the vector loop: store the combined value back.
        return simd_macros.mm256_store_ps(
            node.target,
            C.BinaryOp(self.visit(node.target), node.op, node.value))
    elif isinstance(node.op, C.Op.Add) and isinstance(node.value, C.BinaryOp) and \
            isinstance(node.value.op, C.Op.Mul):
        # if not isinstance(node.target, C.SymbolRef):
        #     node.value = C.FunctionCall(C.SymbolRef("vsum"), [node.value])
        #     return node
        # else:
        # `target += left * right`  ->  target = fmadd(left, right, target)
        return C.Assign(
            node.target,
            C.FunctionCall(
                C.SymbolRef("_mm256_fmadd_ps"),
                [node.value.left, node.value.right, node.target]))
    elif isinstance(node.op, C.Op.Add) and isinstance(
            node.value, C.FunctionCall):
        # TODO: Verify the callee really is a vector intrinsic
        return C.Assign(
            node.target,
            C.FunctionCall(C.SymbolRef("_mm256_add_ps"),
                           [node.value, node.target]))
    elif isinstance(node.target, C.BinaryOp) and isinstance(
            node.target.op, C.Op.ArrayRef):
        # Array-ref targets not covered by the cases above are unsupported.
        raise NotImplementedError()
    node.target = self.visit(node.target)
    return node
def gen_loop_index(self, loopvars, shape):
    """Build a row-major flattened index expression from nested loop variables.

    Produces `lv0 * prod(shape[1:]) + lv1 * prod(shape[2:]) + ... + lvN`.
    """
    index_expr = C.SymbolRef(loopvars[-1])
    axis = len(loopvars) - 2
    while axis >= 0:
        stride = C.Constant(np.prod(shape[axis + 1:]))
        term = C.Mul(C.SymbolRef(loopvars[axis]), stride)
        index_expr = C.Add(term, index_expr)
        axis -= 1
    return index_expr
def transform(self, py_ast, program_cfg):
    """Lower a Python map kernel to a C file vectorized for Hwacha.

    Converts the Python AST to C, appends an output parameter `retval`,
    wraps the body in a flat `for (i = 0; i < length; i++)` loop with an
    `ivdep` pragma, fills declarations, and runs HwachaVectorize, which
    appends any generated definitions to `defns`.
    """
    arg_cfg, tune_cfg = program_cfg
    tree = PyBasicConversions().visit(py_ast)
    param_dict = {}
    # Output buffer shares the type of the first input argument.
    tree.body[0].params.append(C.SymbolRef("retval", arg_cfg[0]()))
    # Annotate arguments
    for param, type in zip(tree.body[0].params, arg_cfg):
        param.type = type()
        param_dict[param.name] = type._dtype_
    # Total element count; the kernel iterates the flattened array.
    length = np.prod(arg_cfg[0]._shape_)
    transformer = MapTransformer("i", param_dict, "retval")
    body = list(map(transformer.visit, tree.body[0].defn))
    tree.body[0].defn = [C.For(
        C.Assign(C.SymbolRef("i", ct.c_int()), C.Constant(0)),
        C.Lt(C.SymbolRef("i"), C.Constant(length)),
        C.PostInc(C.SymbolRef("i")),
        body=body,
        pragma="ivdep"
    )]
    tree = DeclarationFiller().visit(tree)
    defns = []
    # HwachaVectorize may emit extra top-level definitions into `defns`.
    tree = HwachaVectorize(param_dict, defns).visit(tree)
    file_body = [
        StringTemplate("#include <stdlib.h>"),
        StringTemplate("#include <stdint.h>"),
        StringTemplate("#include <assert.h>"),
        StringTemplate("extern \"C\" void __hwacha_body(void);"),
    ]
    file_body.extend(defns)
    file_body.append(tree)
    return [CFile("generated", file_body)]
def rewrite_arg(self, arg):
    """Rewrite an array-reference argument's index for software prefetching.

    Walks up the nested ArrayRef chain, zeroing out the innermost
    `prefetch_num_zeroes` indices, then replaces the index at the target
    dimension with `(prefetch_loop_var + prefetch_constant) * prefetch_multiplier`.
    Accepts either `&a[...][...]` (Ref of ArrayRef) or a bare ArrayRef.
    """
    if isinstance(arg, C.UnaryOp) and isinstance(
            arg.op, C.Op.Ref) and isinstance(
            arg.arg, C.BinaryOp) and isinstance(
            arg.arg.op, C.Op.ArrayRef):
        curr_node = arg.arg
    elif isinstance(arg, C.BinaryOp) and isinstance(arg.op, C.Op.ArrayRef):
        curr_node = arg
    else:
        # NOTE(review): if neither shape matches, curr_node stays None and the
        # walk below raises AttributeError — presumably callers only pass the
        # two shapes above; confirm at call sites.
        curr_node = None
    idx = self.dim
    num_zeroes = self.prefetch_num_zeroes
    # NOTE(review): terminates only when idx reaches -1, so self.dim is
    # presumably a negative offset counted from the innermost axis — confirm.
    while (idx + 1 != 0):
        if num_zeroes > 0:
            curr_node.right = C.Constant(0)
            num_zeroes -= 1
        curr_node = curr_node.left
        idx += 1
    old_expr = curr_node.right
    #if isinstance(old_expr, C.BinaryOp) and isinstance(old_expr.op, C.Op.Add):
    #    old_expr = old_expr.left
    #new_expr = C.Add(old_expr, C.Mul(C.Add(C.SymbolRef(self.prefetch_loop_var), C.SymbolRef(self.prefetch_constant)), C.SymbolRef(self.prefetch_multiplier)))
    new_expr = C.Mul(
        C.Add(C.SymbolRef(self.prefetch_loop_var),
              C.SymbolRef(self.prefetch_constant)),
        C.SymbolRef(self.prefetch_multiplier))
    curr_node.right = new_expr
    # Bare array refs must be passed by address to the prefetch intrinsic.
    if isinstance(arg, C.BinaryOp) and isinstance(arg.op, C.Op.ArrayRef):
        return C.Ref(arg)
    return arg
def visit_If(self, node):
    """Duplicate an `if` statement `self.factor` times for loop unrolling.

    If the statement references any unrolled variable or the unroll target
    variable, emit one copy per unroll iteration with variables renamed to
    their per-iteration versions (`var_0`, `var_1`, ...) and the target
    index offset according to the unroll type:
      * type 0: `target + i`
      * type 1: `factor * target + i`
    Returns the list of copies, or the untouched node when no relevant
    symbol appears.

    Raises NotImplementedError for an unknown unroll_type.
    """
    check = [
        util.contains_symbol(node, var)
        for var in list(self.unrolled_vars) + [self.target_var]
    ]
    if any(check):
        body = []
        for i in range(self.factor):
            stmt = deepcopy(node)
            for var in self.unrolled_vars:
                stmt = util.replace_symbol(var, C.SymbolRef(var + "_" + str(i)),
                                           stmt)
            if self.unroll_type == 0:
                body.append(
                    util.replace_symbol(
                        self.target_var,
                        C.Add(C.SymbolRef(self.target_var), C.Constant(i)),
                        stmt))
            elif self.unroll_type == 1:
                body.append(
                    util.replace_symbol(
                        self.target_var,
                        C.Add(
                            C.Mul(C.Constant(self.factor),
                                  C.SymbolRef(self.target_var)),
                            C.Constant(i)),
                        stmt))
            else:
                # BUG FIX: was `assert (false)`, which raised NameError
                # (`false` is undefined in Python) instead of signalling
                # the real problem.
                raise NotImplementedError(
                    "unsupported unroll_type: {}".format(self.unroll_type))
        return body
    return node
def visit_BinaryOp(self, node):
    """Duplicate an assignment `self.factor` times for loop unrolling.

    For `lhs = rhs` statements whose right-hand side references an unrolled
    variable or the unroll target variable, emit one copy per unroll
    iteration with variables renamed to their per-iteration versions and
    the target index offset per the unroll type (see visit_If).  A typed
    left-hand side is itself registered as a newly unrolled variable.
    Returns the list of copies, or the untouched node otherwise.

    Raises NotImplementedError for an unknown unroll_type.
    """
    if isinstance(node.op, C.Op.Assign):
        check = [
            util.contains_symbol(node.right, var)
            for var in list(self.unrolled_vars) + [self.target_var]
        ]
        if any(check):
            body = []
            if hasattr(node.left, 'type') and node.left.type is not None:
                # A declaration inside the unrolled region: its uses must be
                # renamed per-iteration too.
                self.unrolled_vars.add(node.left.name)
            for i in range(self.factor):
                stmt = deepcopy(node)
                for var in self.unrolled_vars:
                    stmt = util.replace_symbol(
                        var, C.SymbolRef(var + "_" + str(i)), stmt)
                if self.unroll_type == 0:
                    body.append(
                        util.replace_symbol(
                            self.target_var,
                            C.Add(C.SymbolRef(self.target_var), C.Constant(i)),
                            stmt))
                elif self.unroll_type == 1:
                    body.append(
                        util.replace_symbol(
                            self.target_var,
                            C.Add(
                                C.Mul(C.Constant(self.factor),
                                      C.SymbolRef(self.target_var)),
                                C.Constant(i)),
                            stmt))
                else:
                    # BUG FIX: was `assert (false)`, which raised NameError
                    # (`false` is undefined in Python).
                    raise NotImplementedError(
                        "unsupported unroll_type: {}".format(self.unroll_type))
            return body
    return node
def _gen_reduce_for_loop(self, loop, var, size):
    """Emit a parallel batch-reduction over `var`: column-sum the per-batch
    copies `_var[i * size + x]` into `_var[x]` using parallel_for.

    The flow-graph (ContinueNode/make_edge) plumbing is kept commented out
    in the template; `looplen1`/`loopincr` are only consumed by that
    commented-out section.
    """
    looplen1 = loop.test.right
    loopincr = loop.incr.value.value
    return StringTemplate("""
    //{
    //  ContinueNode *$reduce_node = new ContinueNode(&graph, [=]() {
          parallel_for(0,$size,
            [=](int low, int high) {
              #pragma simd
              for (int x = low; x < high; ++x) {
                float sum = _$arr[x];
                #pragma unroll
                for (int i = 1; i < $batch_size; ++ i) {
                  sum += _$arr[i * $size + x];
                }
                _$arr[x] = sum;
              }
            });
    //  });
    //  for (int i = 0; i < $looplen1; i+=$loopincr) {
    //    make_edge($node_list[i], $reduce_node);
    //  }
    //};
    """, {'size': C.Constant(size),
          'batch_size': C.Constant(self.batch_size),
          'arr': C.SymbolRef(var),
          'node_list': C.SymbolRef("node_list_"),
          'reduce_node': C.SymbolRef("reduce_node_"),
          'looplen1': C.Constant(looplen1.value),
          'loopincr': C.Constant(loopincr)
          })
def _gen_reduce_for_loop(self, loop, var, size):
    """Emit an OpenCL batch-reduction: build and compile a kernel that sums
    the per-batch copies `arr[i * size + x]` into `arr[x]`, bind its buffer
    argument, and return the host-side enqueue/finish code.
    """
    # NOTE(review): looplen1/loopincr are computed but unused below — the
    # host template's global size is formatted from `size` instead; confirm
    # whether they were meant to drive the launch bounds.
    looplen1 = loop.test.right
    loopincr = loop.incr.value.value
    kernel_name = self._gen_unique_kernel_name()
    kernel_src = StringTemplate("""
        __kernel void $kernel_name(__global float * $arr) {
            int x = get_global_id(0);
            float sum = $arr[x];
            #pragma unroll
            for (int i = 1; i < $batch_size; ++ i) {
              sum += $arr[i * $size + x];
            }
            $arr[x] = sum;
        }
        """, {'batch_size': C.Constant(self.batch_size),
              'arr': C.SymbolRef(var),
              'size': C.Constant(size),
              'kernel_name': C.SymbolRef(kernel_name)})
    program = cl.clCreateProgramWithSource(
        latte.config.cl_ctx, kernel_src.codegen()).build()
    kernel = program[kernel_name]
    # Keep the kernel alive and bind the device buffer for `var` as arg 0.
    self.kernels[kernel_name] = kernel
    kernel.setarg(0, self.cl_buffers[var], ctypes.sizeof(cl.cl_mem))
    return StringTemplate(
        """
        size_t global_size_{kernel_name}[1] = {{{looplen1}}};
        clEnqueueNDRangeKernel(queue, {kernel_name}, 1, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL);
        clFinish(queue);
        """.format(
            kernel_name=kernel_name,
            looplen1=size)
    )
def test_add_zero(self):
    """x + 0 and 0 + x both fold to x."""
    for operands in ((C.SymbolRef("a"), C.Constant(0)),
                     (C.Constant(0), C.SymbolRef("a"))):
        folded = ConstantFold().visit(C.Add(*operands))
        self.assertEqual(folded, C.SymbolRef("a"))
def test_sub_zero(self):
    """a - 0 folds to a; 0 - a folds to unary negation of a."""
    folded = ConstantFold().visit(C.Sub(C.SymbolRef("a"), C.Constant(0)))
    self.assertEqual(folded, C.SymbolRef("a"))
    folded = ConstantFold().visit(C.Sub(C.Constant(0), C.SymbolRef("a")))
    # Compare rendered forms: single-operand Sub is the unary-minus node.
    self.assertEqual(str(folded), str(C.Sub(C.SymbolRef("a"))))
def test_mul_by_1(self):
    """1 * b and b * 1 both fold to b."""
    for operands in ((C.Constant(1), C.SymbolRef("b")),
                     (C.SymbolRef("b"), C.Constant(1))):
        folded = ConstantFold().visit(C.Mul(*operands))
        self.assertEqual(folded, C.SymbolRef("b"))
def test_recursive_fold(self):
    """Folding recurses: (2 + -2) + b collapses to b inside an assignment."""
    zero_sum = C.Add(C.Constant(2), C.Constant(-2))
    tree = C.Assign(C.SymbolRef("c"), C.Add(zero_sum, C.SymbolRef("b")))
    folded = ConstantFold().visit(tree)
    expected = C.Assign(C.SymbolRef("c"), C.SymbolRef("b"))
    self.assertEqual(str(folded), str(expected))
def visit_For(self, node):
    """Forward SIMD registers between stores and later loads in a loop body.

    First pass over the flattened body:
      * vector stores (`_mm*_store*`) whose destination is a copy-source in
        self.replace_map are recorded in the def-use map (du_map), keyed by
        the stored array reference; stores of a call result are first split
        into a register assignment plus a store of that register,
      * vector loads (`_mm*_load*`) from a matching copy-sink are looked up
        in du_map — on a hit, the load is dropped and the loaded name is
        aliased to the previously stored register via sym_map.
    A second pass recursively visits the rewritten body.
    """
    node.body = util.flatten([s for s in node.body])
    new_body = []
    for stmt in node.body:
        if isinstance(stmt, C.FunctionCall) and "_mm" in stmt.func.name \
                and "_store" in stmt.func.name and inReplaceMapSource(stmt.args[0], self.replace_map):
            if isinstance(stmt.args[1], C.SymbolRef):
                # Store of a register: remember ref->register in du_map.
                sym_arr_ref = extract_reference(stmt.args)
                store_in_du_map(sym_arr_ref)
                reg = stmt.args[1]
                self.seen[reg.name] = None
                new_body.append(stmt)
            elif isinstance(stmt.args[1], C.FunctionCall) and "_mm" in stmt.func.name:
                # Store of a computed value: materialize it in a fresh
                # register, then store that register.
                tmp = self._gen_register()
                new_body.append(C.Assign(C.SymbolRef(tmp, get_simd_type()()),
                                         deepcopy(stmt.args[1])))
                new_body.append(C.FunctionCall(C.SymbolRef(stmt.func.name),
                                               [stmt.args[0], C.SymbolRef(tmp, None)]))
                sym_arr_ref = extract_reference(
                    C.FunctionCall(C.SymbolRef(stmt.func.name),
                                   [stmt.args[0], C.SymbolRef(tmp, None)]).args)
                store_in_du_map(sym_arr_ref)
                # if stmt.args[0].type:
                #     self.seen[reg.name] = stmt.args[0].type
                #else:
                self.seen[tmp] = None
        elif isinstance(stmt, C.BinaryOp) and \
                isinstance(stmt.op, C.Op.Assign) and \
                isinstance(stmt.left, C.SymbolRef) and \
                isinstance(stmt.right, C.FunctionCall) and "_mm" in stmt.right.func.name and "_load" in stmt.right.func.name and inReplaceMapSink(stmt.right.args[0], self.replace_map):
            #print(stmt.right.args[0])
            source = get_alias(stmt.right.args, self.replace_map)
            #print(source)
            if (source is not None):
                sym_arr_ref = construct_arr_reference(source, deepcopy(stmt.right.args))
                if in_du_map(sym_arr_ref):
                    reg = get_register(sym_arr_ref)
                    #print(reg.name)
                    if str(reg.name) in self.seen:
                        # Hit: alias the loaded symbol to the stored register
                        # and drop the redundant load.
                        #print(reg.name)
                        sym_map[stmt.left.name] = reg
                    else:
                        new_body.append(stmt)
                else:
                    new_body.append(stmt)
            else:
                new_body.append(stmt)
        else:
            new_body.append(stmt)
    node.body = util.flatten([self.visit(s) for s in new_body])
    return node
def visit_AugAssign(self, node):
    """Rewrite `target op= value` to vector stores, transposing buffers whose
    innermost index is not the vectorized loop variable.

    When the loop variable indexes a non-innermost dimension, the indexing
    chain is rewritten so the loop variable becomes the innermost index, the
    buffer is renamed `<name>_transposed`, and the transposed dimension is
    recorded in self.transposed_buffers (conflicting dimensions for the same
    buffer are rejected).
    """
    node.value = self.visit(node.value)
    if util.contains_symbol(node.target, self.loop_var):
        if not util.contains_symbol(node.target.right, self.loop_var):
            # Loop var indexes an outer dimension: transpose needed.
            target = self.visit(deepcopy(node.target))
            curr_node = node.target
            idx = 1
            # Walk up until the dimension indexed by the loop variable.
            while curr_node.left.right.name != self.loop_var:
                curr_node = curr_node.left
                idx += 1
            # Splice that dimension out and re-append the loop var innermost.
            curr_node.left = curr_node.left.left
            node.target = C.ArrayRef(node.target, C.SymbolRef(self.loop_var))
            while not isinstance(curr_node, C.SymbolRef):
                curr_node = curr_node.left
            if curr_node.name in self.transposed_buffers and self.transposed_buffers[
                    curr_node.name] != idx:
                raise NotImplementedError()
            self.transposed_buffers[curr_node.name] = idx
            curr_node.name += "_transposed"
            # NOTE(review): `node.target.value == 0.0` reads an attribute of
            # the ArrayRef node, not the constant — presumably
            # `node.target.right.value` was intended; confirm against C.BinaryOp.
            if isinstance(node.target.right, C.Constant) and node.target.value == 0.0:
                return store_ps(node.target.left,
                                C.BinaryOp(target, node.op, node.value))
            else:
                return store_ps(C.Ref(node.target),
                                C.BinaryOp(target, node.op, node.value))
        else:
            # Loop var already innermost: plain vector store.
            if isinstance(node.target.right, C.Constant) and node.target.value == 0.0:
                return store_ps(
                    node.target.left,
                    C.BinaryOp(self.visit(node.target), node.op, node.value))
            else:
                return store_ps(
                    C.Ref(node.target),
                    C.BinaryOp(self.visit(node.target), node.op, node.value))
    elif isinstance(node.op, C.Op.Add) and isinstance(
            node.value, C.FunctionCall):
        # TODO: Verify the callee really is a vector intrinsic
        return C.Assign(
            node.target,
            C.FunctionCall(C.SymbolRef("_mm256_add_ps"),
                           [node.value, node.target]))
    elif isinstance(node.target, C.BinaryOp) and isinstance(
            node.target.op, C.Op.ArrayRef):
        raise NotImplementedError(node)
    node.target = self.visit(node.target)
    return node
def visit(self, node):
    """Insert SIMD broadcasts for scalar definitions used in vector context.

    For each assignment in a body: a typed scalar definition of a tracked
    variable gets a companion register initialized via broadcast_ss, and the
    mapping scalar-name -> register is recorded in self.defs so later
    statements use the register.  Other statements have known scalars
    substituted, and scalar right-hand sides assigned to vector-typed
    left-hand sides are wrapped in broadcast_ss.
    """
    node = super().visit(node)
    if hasattr(node, 'body'):
        # [collector.visit(s) for s in node.body]
        newbody = []
        for s in node.body:
            if isinstance(s, C.BinaryOp) and isinstance(s.op, C.Op.Assign):
                # Anand - needs more work 27th June 2017
                if isinstance(s.left, C.SymbolRef) and (s.left.type is not None) and s.left.name in self.variables \
                        and s.left.name not in self.defs:
                    # First typed definition of a tracked scalar: emit a
                    # broadcast into a fresh vector register after it.
                    y = self._gen_register()
                    new_stmt = C.Assign(
                        C.SymbolRef(y, get_simd_type(s.left.type)()),
                        broadcast_ss(C.SymbolRef(s.left.name, None), s.left.type))
                    newbody.append(s)
                    newbody.append(new_stmt)
                    self.defs[s.left.name] = C.SymbolRef(y, None)
                    self.symbol_table[y] = get_simd_type(s.left.type)()
                else:
                    # Substitute previously broadcast scalars, then broadcast
                    # scalar RHS values assigned into vector-typed LHS.
                    for i in self.defs:
                        s = replace_symbol(i, self.defs[i], s)
                    if (isinstance(s.left.type, get_simd_type(ctypes.c_int()))
                            or isinstance(
                                s.left.type,
                                get_simd_type(
                                    ctypes.c_float()))) and isinstance(
                                        s.right, C.SymbolRef):
                        s.right = broadcast_ss(
                            C.SymbolRef(s.right.name, None), s.left.type)
                    elif isinstance(s.left, C.SymbolRef) and s.left.name in self.symbol_table and\
                            (isinstance(self.symbol_table[s.left.name], get_simd_type(ctypes.c_int())) or
                             isinstance(self.symbol_table[s.left.name], get_simd_type(ctypes.c_float()))) and isinstance(s.right, C.SymbolRef):
                        s.right = broadcast_ss(
                            C.SymbolRef(s.right.name, None),
                            self.symbol_table[s.left.name])
                    newbody.append(s)
            else:
                for i in self.defs:
                    s = replace_symbol(i, self.defs[i], s)
                newbody.append(s)
        node.body = util.flatten(newbody)
    return node
def transform(self, tree, program_cfg):
    """Generate a C file containing an `im2col` function for convolution
    lowering, with padding/stride/kernel/channel parameters baked in as
    constants from this transform's configuration and `arg_cfg`.
    """
    arg_cfg, tune_cfg = program_cfg
    channels, height, width = arg_cfg[0]
    # All geometry parameters are compiled in as literal constants.
    cfg = {
        'pad_h': C.Constant(self.pad_h),
        'pad_w': C.Constant(self.pad_w),
        'stride_h': C.Constant(self.stride_h),
        'stride_w': C.Constant(self.stride_w),
        'kernel_h': C.Constant(self.kernel_h),
        'kernel_w': C.Constant(self.kernel_w),
        'channels': C.Constant(channels),
        'height': C.Constant(height),
        'width': C.Constant(width),
    }
    im2col = C.FunctionDecl(
        None,
        C.SymbolRef("im2col"),
        [C.SymbolRef("data_im", arg_cfg[1]()),
         C.SymbolRef("data_col", arg_cfg[1]())],
        [StringTemplate("""
            int stride_h = $stride_h;
            int stride_w = $stride_w;
            int pad_h = $pad_h;
            int pad_w = $pad_w;
            int kernel_h = $kernel_h;
            int kernel_w = $kernel_w;
            int channels = $channels;
            int height = $height;
            int width = $width;
            int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
            int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
            int channels_col = channels * kernel_h * kernel_w;
            for (int c = 0; c < channels_col; ++c) {
              int w_offset = c % kernel_w;
              int h_offset = (c / kernel_w) % kernel_h;
              int c_im = c / kernel_h / kernel_w;
              for (int h = 0; h < height_col; ++h) {
                for (int w = 0; w < width_col; ++w) {
                  int h_pad = h * stride_h - pad_h + h_offset;
                  int w_pad = w * stride_w - pad_w + w_offset;
                  if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
                    data_col[(c * height_col + h) * width_col + w] =
                      data_im[(c_im * height + h_pad) * width + w_pad];
                  else
                    data_col[(c * height_col + h) * width_col + w] = 0;
                }
              }
            }
            """, cfg)])
    return [C.CFile('im2col', [im2col])]
def store_epi32(target, value):
    """Emit the packed-int32 store intrinsic for the configured vector ISA.

    Raises KeyError for an unrecognized latte.config.vec_config.
    """
    intrinsic_by_isa = {
        "AVX": "_mm256_store_epi32",
        "AVX-2": "_mm256_store_epi32",
        "AVX-512": "_mm512_store_epi32",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), [target, value])
def load_ps(arg):
    """Emit the packed-float load intrinsic for the configured vector ISA.

    Raises KeyError for an unrecognized latte.config.vec_config.
    """
    intrinsic_by_isa = {
        "AVX": "_mm256_load_ps",
        "AVX-2": "_mm256_load_ps",
        "AVX-512": "_mm512_load_ps",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), [arg])
def set_zero_ps():
    """Emit the zero-initializing packed-float intrinsic for the configured ISA.

    Raises KeyError for an unrecognized latte.config.vec_config.
    """
    intrinsic_by_isa = {
        "AVX": "_mm256_setzero_ps",
        "AVX-2": "_mm256_setzero_ps",
        "AVX-512": "_mm512_setzero_ps",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), [])
def visit_BinaryOp(self, node):
    """Vectorize array references with respect to self.loop_var.

    Array refs containing the loop variable become vector loads (or gain an
    extra inner index when not vectorizing); their buffer name and the
    dimension offset of the loop variable are recorded in
    self.vectorized_buffers.  Array refs without the loop variable become
    broadcasts (set1).  Non-ArrayRef ops recurse into both operands.
    """
    if isinstance(node.op, C.Op.ArrayRef):
        if util.contains_symbol(node, self.loop_var):
            # Count how many dimensions separate the loop var from innermost.
            idx = 0
            curr_node = node
            while not isinstance(curr_node.right, C.SymbolRef) or \
                    curr_node.right.name != self.loop_var:
                idx += 1
                curr_node = curr_node.left
            # Walk down to the buffer's base symbol to record its name.
            while not isinstance(curr_node, C.SymbolRef):
                curr_node = curr_node.left
            self.vectorized_buffers[curr_node.name] = idx
            if self.vectorize:
                return simd_macros.mm256_load_ps(node)
            else:
                return C.ArrayRef(node, C.SymbolRef("_neuron_index_1_inner"))
        else:
            # Loop-invariant operand: broadcast across the vector lanes.
            if self.vectorize:
                return simd_macros.mm256_set1_ps(node)
            else:
                return node
    node.left = self.visit(node.left)
    node.right = self.visit(node.right)
    return node
def simd_add(left, right):
    """Emit the packed-float add intrinsic for the configured vector ISA.

    Raises KeyError for an unrecognized latte.config.vec_config.
    """
    intrinsic_by_isa = {
        "AVX": "_mm256_add_ps",
        "AVX-2": "_mm256_add_ps",
        "AVX-512": "_mm512_add_ps",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), [left, right])
def get_register(sym_arr_ref):
    """Return the register symbol recorded in du_map for `sym_arr_ref`.

    Entries in du_map pair an array reference with the register name that
    last stored to it.  Returns a fresh C.SymbolRef for the register, or
    None when no entry matches.
    """
    # Idiom fix: iterate entries directly instead of `for i in range(len(...))`.
    for entry in du_map:
        if ref_equal(entry[0], sym_arr_ref):
            return C.SymbolRef(entry[1])
    return None
def visit_For(self, node):
    """Unroll the loop over self.target_var by self.factor.

    * type 0: bump the loop increment to the unroll factor,
    * type 1: shrink the trip count (which must divide evenly by the factor),
    then expand the body statements via UnrollStatements.  Other loops are
    only recursed into.

    Raises NotImplementedError for an unknown unroll_type.
    """
    node.body = [self.visit(s) for s in node.body]
    if node.init.left.name == self.target_var:
        if self.unroll_type == 0:
            # BUG FIX: this assignment was duplicated verbatim; assigning
            # node.incr once is sufficient.
            node.incr = C.AddAssign(C.SymbolRef(self.target_var),
                                    C.Constant(self.factor))
        elif self.unroll_type == 1:
            assert (node.test.right.value % self.factor == 0)
            node.test.right.value = node.test.right.value // self.factor
        else:
            raise NotImplementedError(
                "unsupported unroll_type: {}".format(self.unroll_type))
        visitor = UnrollStatements(self.target_var, self.factor,
                                   self.unroll_type)
        node.body = util.flatten([visitor.visit(s) for s in node.body])
    return node
def simd_fma(*args):
    """Emit the fused multiply-add intrinsic for the configured vector ISA.

    Expects exactly three operands (a, b, c) for a*b + c.
    Raises KeyError for an unrecognized latte.config.vec_config.
    """
    assert len(args) == 3
    intrinsic_by_isa = {
        "AVX": "_mm256_fmadd_ps",
        "AVX-2": "_mm256_fmadd_ps",
        "AVX-512": "_mm512_fmadd_ps",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), list(args))
def test_no_folding(self):
    """Purely symbolic expressions pass through ConstantFold unchanged."""
    for make_op in (C.Add, C.Sub, C.Mul, C.Div):
        tree = make_op(C.SymbolRef("a"), C.SymbolRef("b"))
        self.assertEqual(tree, ConstantFold().visit(tree))
def insert_malloc(body, shape, name, dtype, _global=False):
    """Prepend to `body` a declaration of `name` as a multidimensional array
    pointer backed by a 64-byte-aligned `_mm_malloc` of prod(shape) elements.

    The generated C has the shape `type (* name)[d1][d2]... = (type (*)[d1]...)
    _mm_malloc(size * sizeof(type), 64);`, optionally prefixed with
    `__global` for OpenCL.
    """
    # Dimensions after the first appear in the pointer's array type.
    shape_str = "".join("[{}]".format(d) for d in shape[1:])
    size = 1
    for d in shape:
        size *= d
    body.insert(
        0,
        StringTemplate(
            "$global$type (* $arg_name0)$shape = ($global$type (*)$cast) $arg_name1;",
            {
                "arg_name0": C.SymbolRef(name),
                "arg_name1": C.FunctionCall(C.SymbolRef('_mm_malloc'), [
                    C.Mul(
                        C.Constant(size),
                        C.FunctionCall(C.SymbolRef('sizeof'), [
                            ctree.types.codegen_type(
                                ctree.types.get_c_type_from_numpy_dtype(dtype)
                                ())
                        ])),
                    C.Constant(64)
                ]),
                "shape": C.SymbolRef(shape_str),
                "cast": C.SymbolRef(shape_str),
                "type": C.SymbolRef(
                    ctree.types.codegen_type(
                        ctree.types.get_c_type_from_numpy_dtype(dtype)())),
                "global": C.SymbolRef("__global " if _global else "")
            }))
def visit_Attribute(self, node):
    """Lift `obj.attr` loads on known objects into named constants.

    When the attribute's base name is present in self.symbol_table, the
    attribute's current value is recorded in self.decls under the name
    `<base>_<attr>` and a symbol reference to that name is returned.
    All other attribute nodes are returned untouched.
    """
    if not isinstance(node.ctx, ast.Load):
        return node
    base = node.value.id
    if base not in self.symbol_table:
        return node
    const_name = "_".join((base, node.attr))
    self.decls[const_name] = getattr(self.symbol_table[base], node.attr)
    return C.SymbolRef(const_name)
def visit_FunctionCall(self, node):
    """Lower Python-level builtin calls to C/OpenCL equivalents.

    * `rand()` becomes a 0..1 float (`rand() / (float)RAND_MAX`) on the CPU
      path, or an inline linear-congruential expression seeded per work-item
      on the OpenCL path,
    * `max(a, b)` becomes the C `MAX(a, b)` macro.
    Everything else passes through unchanged.
    """
    # CLEANUP: the original repeated the same `isinstance(node.func,
    # C.SymbolRef)` guard twice in sequence; merged into one block
    # (behavior unchanged — the rand branch always returns).
    if isinstance(node.func, C.SymbolRef):
        if node.func.name == "rand":
            if "OPENCL" in latte.config.parallel_strategy:
                import struct
                platform_c_maxint = 2 ** (struct.Struct('i').size * 8 - 1) - 1
                return StringTemplate("((({} + get_global_id(0)) * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1)) >> 16".format(int(random.random() * platform_c_maxint)))
            return C.Div(node, C.Cast(ctypes.c_float(), C.SymbolRef("RAND_MAX")))
        # ANAND: 10/11/2016 convert python max to C MAX
        if node.func.name == "max":
            return C.FunctionCall(C.SymbolRef("MAX"),
                                  [node.args[0], node.args[1]])
    return node
def collect_args_and_insert_casts(self, kernel_args, body):
    """Collect array references used by `body` into kernel_args and build the
    corresponding `__global`-qualified pointer parameters, inserting a cast
    into the body for each buffer.  Returns the parameter list.
    """
    for stmt in body:
        CollectArrayReferences(kernel_args).visit(stmt)
    params = []
    for arg in kernel_args:
        buf = self.buffers[arg]
        ptr_type = np.ctypeslib.ndpointer(buf.dtype, buf.ndim, buf.shape)
        param = C.SymbolRef("_" + arg, ptr_type())
        param.set_global()
        params.append(param)
        util.insert_cast(body, buf.shape[1:], arg, buf.dtype, _global=True)
    return params