def visit_AugAssign(self, node):
    """Lower an augmented assignment (`target op= value`) to SIMD intrinsics.

    When vectorization is disabled the node is visited recursively and
    returned unchanged.  Otherwise the target/value pattern selects the
    intrinsic: a store over the vectorized loop variable, a fused
    multiply-add for `target += a * b`, or a vector add when the value is
    already an intrinsic call.
    """
    node.value = self.visit(node.value)
    if not self.vectorize:
        # Scalar path: no intrinsic rewriting, just recurse.
        node.target = self.visit(node.target)
        return node
    if util.contains_symbol(node.target, self.loop_var):
        # Target is indexed by the vectorized loop variable: emit
        # store(target, load(target) op value).
        return simd_macros.mm256_store_ps(
            node.target,
            C.BinaryOp(self.visit(node.target), node.op, node.value))
    elif isinstance(node.op, C.Op.Add) and isinstance(node.value, C.BinaryOp) and \
            isinstance(node.value.op, C.Op.Mul):
        # target += a * b  ->  target = fmadd(a, b, target)
        return C.Assign(
            node.target,
            C.FunctionCall(
                C.SymbolRef("_mm256_fmadd_ps"),
                [node.value.left, node.value.right, node.target]))
    elif isinstance(node.op, C.Op.Add) and isinstance(
            node.value, C.FunctionCall):
        # TODO: Verfiy it's a vector intrinsic
        return C.Assign(
            node.target,
            C.FunctionCall(C.SymbolRef("_mm256_add_ps"),
                           [node.value, node.target]))
    elif isinstance(node.target, C.BinaryOp) and isinstance(
            node.target.op, C.Op.ArrayRef):
        # Array-ref targets not matching the cases above are unsupported.
        raise NotImplementedError()
    node.target = self.visit(node.target)
    return node
def insert_malloc(body, shape, name, dtype, _global=False):
    """Prepend to `body` a declaration of `name` as a 64-byte-aligned
    multidimensional array allocated with _mm_malloc.

    `shape[0]` is folded into the element count; the remaining dimensions
    become the declared array extents.  `_global` adds the OpenCL
    `__global` qualifier.
    """
    inner_dims = "".join("[{}]".format(extent) for extent in shape[1:])
    n_elems = 1
    for extent in shape:
        n_elems *= extent
    # C type name for the numpy dtype, e.g. "float".
    c_type = ctree.types.codegen_type(
        ctree.types.get_c_type_from_numpy_dtype(dtype)())
    malloc_call = C.FunctionCall(
        C.SymbolRef('_mm_malloc'),
        [
            C.Mul(
                C.Constant(n_elems),
                C.FunctionCall(C.SymbolRef('sizeof'), [
                    ctree.types.codegen_type(
                        ctree.types.get_c_type_from_numpy_dtype(dtype)())
                ])),
            C.Constant(64)
        ])
    decl = StringTemplate(
        "$global$type (* $arg_name0)$shape = ($global$type (*)$cast) $arg_name1;",
        {
            "arg_name0": C.SymbolRef(name),
            "arg_name1": malloc_call,
            "shape": C.SymbolRef(inner_dims),
            "cast": C.SymbolRef(inner_dims),
            "type": C.SymbolRef(c_type),
            "global": C.SymbolRef("__global " if _global else ""),
        })
    body.insert(0, decl)
def visit_For(self, node):
    """Forward stored vector registers to later loads within this loop.

    Scans the loop body for `_mm*_store*` calls whose destination is in the
    replace map, records the stored register in a def-use map, and replaces
    subsequent matching `_mm*_load*` reads with the register (recorded in
    `sym_map`) so the round-trip through memory is elided.
    """
    node.body = util.flatten([s for s in node.body])
    new_body = []
    for stmt in node.body:
        if isinstance(stmt, C.FunctionCall) and "_mm" in stmt.func.name \
                and "_store" in stmt.func.name and inReplaceMapSource(stmt.args[0], self.replace_map):
            if isinstance(stmt.args[1], C.SymbolRef):
                # Storing a register directly: remember which register
                # holds this array location.
                sym_arr_ref = extract_reference(stmt.args)
                store_in_du_map(sym_arr_ref)
                reg = stmt.args[1]
                self.seen[reg.name] = None
                new_body.append(stmt)
            elif isinstance(stmt.args[1], C.FunctionCall) and "_mm" in stmt.func.name:
                # Storing the result of an intrinsic: materialize it into a
                # fresh register first so the register can be forwarded.
                tmp = self._gen_register()
                new_body.append(C.Assign(C.SymbolRef(tmp, get_simd_type()()),
                                         deepcopy(stmt.args[1])))
                new_body.append(C.FunctionCall(C.SymbolRef(stmt.func.name),
                                               [stmt.args[0], C.SymbolRef(tmp, None)]))
                sym_arr_ref = extract_reference(
                    C.FunctionCall(C.SymbolRef(stmt.func.name),
                                   [stmt.args[0], C.SymbolRef(tmp, None)]).args)
                store_in_du_map(sym_arr_ref)
                self.seen[tmp] = None
            # NOTE(review): a store whose value is neither a SymbolRef nor a
            # FunctionCall is silently dropped from new_body here — confirm
            # that case cannot occur.
        elif isinstance(stmt, C.BinaryOp) and \
                isinstance(stmt.op, C.Op.Assign) and \
                isinstance(stmt.left, C.SymbolRef) and \
                isinstance(stmt.right, C.FunctionCall) and "_mm" in stmt.right.func.name and "_load" in stmt.right.func.name and inReplaceMapSink(stmt.right.args[0], self.replace_map):
            # reg = _mm*_load*(addr): if the same location was stored above,
            # alias the destination symbol to the stored register and drop
            # the load; otherwise keep the statement.
            source = get_alias(stmt.right.args, self.replace_map)
            if (source is not None):
                sym_arr_ref = construct_arr_reference(source, deepcopy(stmt.right.args))
                if in_du_map(sym_arr_ref):
                    reg = get_register(sym_arr_ref)
                    if str(reg.name) in self.seen:
                        sym_map[stmt.left.name] = reg
                    else:
                        new_body.append(stmt)
                else:
                    new_body.append(stmt)
            else:
                new_body.append(stmt)
        else:
            new_body.append(stmt)
    node.body = util.flatten([self.visit(s) for s in new_body])
    return node
def load_ps(arg):
    """Return an aligned packed-float load call for the configured ISA."""
    intrinsic_by_isa = {
        "AVX": "_mm256_load_ps",
        "AVX-2": "_mm256_load_ps",
        "AVX-512": "_mm512_load_ps",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), [arg])
def set_zero_ps():
    """Return a zero-initializing packed-float call for the configured ISA."""
    intrinsic_by_isa = {
        "AVX": "_mm256_setzero_ps",
        "AVX-2": "_mm256_setzero_ps",
        "AVX-512": "_mm512_setzero_ps",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), [])
def store_epi32(target, value):
    """Return a packed 32-bit integer store call for the configured ISA."""
    intrinsic_by_isa = {
        "AVX": "_mm256_store_epi32",
        "AVX-2": "_mm256_store_epi32",
        "AVX-512": "_mm512_store_epi32",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), [target, value])
def simd_add(left, right):
    """Return a packed-float addition call for the configured ISA."""
    intrinsic_by_isa = {
        "AVX": "_mm256_add_ps",
        "AVX-2": "_mm256_add_ps",
        "AVX-512": "_mm512_add_ps",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), [left, right])
def insert_free(name):
    """Return a statement releasing an _mm_malloc'd buffer via _mm_free."""
    free_call = C.FunctionCall(C.SymbolRef('_mm_free'), [C.SymbolRef(name)])
    return StringTemplate("$free;", {"free": free_call})
def visit(self, node):
    """Promote repeated vector loads within a body to registers.

    For each statement, every `_mm*_load*` expression found by
    VectorLoadCollector is hoisted into a fresh SIMD register assignment;
    subsequent uses of the same load are rewritten to read the register.
    Stores into locations already held in a register are turned into
    register assignments and the real stores are re-emitted once at the
    end of the body.

    Fix: the final dispatch used `assert (false)` — lowercase `false` is a
    NameError in Python, masking the intended assertion failure.  Replaced
    with `assert False` plus a message.
    """
    node = super().visit(node)
    if hasattr(node, 'body'):
        new_body = []
        seen = {}      # load-expr codegen -> (register, load node, intrinsic name)
        stores = []    # deferred (target, register, store intrinsic name)
        collector = VectorLoadCollector()
        for s in node.body:
            collector.visit(s)
            for stmt in collector.loads.keys():
                if stmt not in seen:
                    # First sighting of this load: bind it to a register.
                    reg = self._gen_register()
                    load_node, number, func = collector.loads[stmt]
                    seen[stmt] = (reg, load_node, func)
                    self.sym[reg] = get_simd_type()()
                    new_body.append(
                        C.Assign(
                            C.SymbolRef(reg, get_simd_type()()),
                            C.FunctionCall(C.SymbolRef(func), [load_node])))
            if isinstance(
                    s, C.FunctionCall
            ) and "_mm" in s.func.name and "_store" in s.func.name:
                if s.args[0].codegen() in seen:
                    # Defer the store; keep the value in the register.
                    stores.append((s.args[0], seen[s.args[0].codegen()][0],
                                   s.func.name))
                    s = C.Assign(
                        C.SymbolRef(seen[s.args[0].codegen()][0]), s.args[1])
            for stmt in seen.keys():
                reg, load_node, func = seen[stmt]
                replacer = VectorLoadReplacer(
                    C.FunctionCall(C.SymbolRef(func), [load_node]).codegen(),
                    C.SymbolRef(reg))
                s = replacer.visit(s)
            new_body.append(s)
        # Re-emit the deferred stores from their registers.
        for target, value, name in stores:
            if "epi32" in name:
                new_body.append(store_epi32(target, C.SymbolRef(value)))
            elif "ps" in name:
                new_body.append(store_ps(target, C.SymbolRef(value)))
            else:
                assert False, "unsupported store intrinsic: " + name
        node.body = util.flatten(new_body)
    return node
def simd_fma(*args):
    """Return a fused multiply-add call (a*b + c) for the configured ISA.

    Expects exactly three operand nodes.
    """
    assert len(args) == 3
    intrinsic_by_isa = {
        "AVX": "_mm256_fmadd_ps",
        "AVX-2": "_mm256_fmadd_ps",
        "AVX-512": "_mm512_fmadd_ps",
    }
    intrinsic = intrinsic_by_isa[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(intrinsic), list(args))
def broadcast_ss(arg):
    """Broadcast a scalar float across all lanes for the configured ISA."""
    isa = latte.config.vec_config
    if isa == "AVX-512":
        # AVX-512 doesn't support broadcast, use set1_ps and remove Ref node
        assert isinstance(arg, C.UnaryOp) and isinstance(arg.op, C.Op.Ref)
        arg = arg.arg
    intrinsic_by_isa = {
        "AVX": "_mm256_broadcast_ss",
        "AVX-2": "_mm256_broadcast_ss",
        "AVX-512": "_mm512_set1_ps",
    }
    return C.FunctionCall(C.SymbolRef(intrinsic_by_isa[isa]), [arg])
def visit_AugAssign(self, node):
    """Lower an augmented assignment to vector stores, transposing buffers
    whose innermost index is not the vectorized loop variable.

    When the target contains the loop variable but not in its innermost
    (rightmost) index, the referenced buffer is marked for transposition
    (renamed `*_transposed`) so the loop variable becomes the contiguous
    dimension, then a vector store is emitted.
    """
    node.value = self.visit(node.value)
    if util.contains_symbol(node.target, self.loop_var):
        if not util.contains_symbol(node.target.right, self.loop_var):
            target = self.visit(deepcopy(node.target))
            # Walk up the ArrayRef chain to find which dimension (idx,
            # counted from the innermost) is indexed by the loop variable.
            curr_node = node.target
            idx = 1
            while curr_node.left.right.name != self.loop_var:
                curr_node = curr_node.left
                idx += 1
            # Splice out the loop-var dimension and re-append it innermost.
            curr_node.left = curr_node.left.left
            node.target = C.ArrayRef(node.target, C.SymbolRef(self.loop_var))
            # Descend to the base buffer symbol.
            while not isinstance(curr_node, C.SymbolRef):
                curr_node = curr_node.left
            # A buffer may only be transposed along one dimension.
            if curr_node.name in self.transposed_buffers and self.transposed_buffers[
                    curr_node.name] != idx:
                raise NotImplementedError()
            self.transposed_buffers[curr_node.name] = idx
            curr_node.name += "_transposed"
            # NOTE(review): `node.target.value` alongside the
            # isinstance(node.target.right, C.Constant) check looks like it
            # was meant to be `node.target.right.value` — confirm.
            if isinstance(node.target.right, C.Constant) and node.target.value == 0.0:
                return store_ps(node.target.left,
                                C.BinaryOp(target, node.op, node.value))
            else:
                return store_ps(C.Ref(node.target),
                                C.BinaryOp(target, node.op, node.value))
        else:
            # NOTE(review): same suspected `node.target.right.value` typo here.
            if isinstance(node.target.right, C.Constant) and node.target.value == 0.0:
                return store_ps(
                    node.target.left,
                    C.BinaryOp(self.visit(node.target), node.op, node.value))
            else:
                return store_ps(
                    C.Ref(node.target),
                    C.BinaryOp(self.visit(node.target), node.op, node.value))
    elif isinstance(node.op, C.Op.Add) and isinstance(
            node.value, C.FunctionCall):
        # TODO: Verfiy it's a vector intrinsic
        return C.Assign(
            node.target,
            C.FunctionCall(C.SymbolRef("_mm256_add_ps"),
                           [node.value, node.target]))
    elif isinstance(node.target, C.BinaryOp) and isinstance(
            node.target.op, C.Op.ArrayRef):
        raise NotImplementedError(node)
    node.target = self.visit(node.target)
    return node
def visit_FunctionDecl(self, node):
    """Instrument every top-level loop of the function with wall-clock timing.

    Each loop is bracketed by `times[i] -= omp_get_wtime()` /
    `times[i] += omp_get_wtime()`, the `times` array is allocated with
    `calloc_doubles(count)` at function entry, and a `printf` of every
    entry is appended at the end.

    Fix: the `omp_get_wtime` callee was passed as a bare string to
    C.FunctionCall; every other call site in this file wraps the callee in
    C.SymbolRef, which the ctree code generator expects.
    """
    new_body = []
    count = 0
    for statement in node.defn:
        if isinstance(statement, ast.For) or isinstance(statement, C.For):
            # times[count] -= omp_get_wtime();  <loop>;  times[count] += omp_get_wtime();
            pre = C.SubAssign(
                C.ArrayRef(C.SymbolRef('times'), C.Constant(count)),
                C.FunctionCall(C.SymbolRef('omp_get_wtime'), []))
            post = C.AddAssign(
                C.ArrayRef(C.SymbolRef('times'), C.Constant(count)),
                C.FunctionCall(C.SymbolRef('omp_get_wtime'), []))
            new_body.append(pre)
            new_body.append(statement)
            new_body.append(post)
            count = count + 1
        else:
            new_body.append(statement)
    # times = calloc_doubles(count); declared as double *times.
    alloc = C.Assign(
        C.SymbolRef('times'),
        C.FunctionCall(C.SymbolRef('calloc_doubles'), [C.Constant(count)]))
    new_body.insert(0, alloc)
    new_body.insert(
        0, C.Assign(C.SymbolRef("*times", ctypes.c_double()), C.Constant(0)))
    for i in range(0, count):
        print_stmt = C.FunctionCall(C.SymbolRef('printf'), [
            C.String("\ttimes[%d] = %g\\n"),
            C.Constant(i),
            C.ArrayRef(C.SymbolRef('times'), C.Constant(i))
        ])
        new_body.append(print_stmt)
    node.defn = new_body
    return node
def gen_vector_cmp_instruction(dest, src1, src2, type_map, symbol_map):
    """Emit `dest = <vector greater-than compare mask>(src1, src2)`.

    The intrinsic is selected by the (matching) SIMD types of both
    operands; mixed or unknown operand types trip the final assertion.
    """
    src1_type = get_type(src1, type_map, symbol_map)
    src2_type = get_type(src2, type_map, symbol_map)
    assert (src1_type is not None)
    assert (src2_type is not None)
    # Ordered dispatch: both operands must be instances of the same width/kind.
    dispatch = (
        (simd.types.m256, "_mm256_cmp_ps_mask"),
        (simd.types.m512, "_mm512_cmp_ps_mask"),
        (simd.types.m256i, "_mm256_cmp_epi32_mask"),
        (simd.types.m512i, "_mm512_cmp_epi32_mask"),
    )
    for vec_type, intrinsic in dispatch:
        if isinstance(src1_type, vec_type) and isinstance(src2_type, vec_type):
            return C.Assign(
                dest,
                C.FunctionCall(
                    C.SymbolRef(intrinsic),
                    [src1, src2, C.SymbolRef("_MM_CMPINT_GT", None)]))
    assert (False)
def gen_mask_move_instruction(dest, src1, selector, src2, type_map, symbol_map):
    """Emit `dest = <masked move>(src1, selector, src2)`.

    Lanes selected by `selector` take `src2`, the rest keep `src1`.  The
    intrinsic is chosen by the matching SIMD types of the two sources.
    """
    src1_type = get_type(src1, type_map, symbol_map)
    src2_type = get_type(src2, type_map, symbol_map)
    #assert(src1_type == src2_type)
    assert (src1_type is not None)
    assert (src2_type is not None)
    dispatch = (
        (simd.types.m256, "_mm256_mask_mov_ps"),
        (simd.types.m512, "_mm512_mask_mov_ps"),
        (simd.types.m256i, "_mm256_mask_mov_epi32"),
        (simd.types.m512i, "_mm512_mask_mov_epi32"),
    )
    for vec_type, intrinsic in dispatch:
        if isinstance(src1_type, vec_type) and isinstance(src2_type, vec_type):
            return C.Assign(
                dest,
                C.FunctionCall(C.SymbolRef(intrinsic),
                               [src1, selector, src2]))
    assert (False)
def broadcast_ss(arg, type_):
    """Broadcast a scalar (int or float, per `type_`) across vector lanes.

    `type_` is a ctypes instance; c_int selects the integer broadcast
    intrinsics, anything else the float ones.
    """
    if isinstance(type_, ctypes.c_int):
        if latte.config.vec_config == "AVX-512":
            # AVX-512 doesn't support broadcast, use set1_ps and remove Ref node
            if isinstance(arg, C.UnaryOp) and isinstance(arg.op, C.Op.Ref):
                arg = arg.arg
        # NOTE(review): this map has no "AVX" key, so an integer broadcast
        # under plain AVX config raises KeyError — confirm intended.
        return C.FunctionCall(
            C.SymbolRef({
                "AVX-2": "_mm256_broadcastd_epi32",
                "AVX-512": "_mm512_set1_epi32",
            }[latte.config.vec_config]), [arg])
    else:
        if latte.config.vec_config == "AVX-512":
            # AVX-512 doesn't support broadcast, use set1_ps and remove Ref node
            if isinstance(arg, C.UnaryOp) and isinstance(arg.op, C.Op.Ref):
                arg = arg.arg
        return C.FunctionCall(
            C.SymbolRef({
                "AVX": "_mm256_broadcast_ss",
                "AVX-2": "_mm256_broadcast_ss",
                "AVX-512": "_mm512_set1_ps",
            }[latte.config.vec_config]), [arg])
def block_loop(self, node):
    """Tile `node` by self.block_factor, inserting the outer tile loop into
    self.nest and rewriting `node` to iterate over one tile.
    """
    # Doubling the name ("x" -> "xx") produces the fresh induction variable
    # for the outer (tile) loop.
    loopvar = node.init.left.name
    loopvar += loopvar
    # Outer loop: for (xx = init; xx < bound; xx += block_factor)
    self.nest.insert(
        0,
        C.For(
            C.Assign(C.SymbolRef(loopvar, node.init.left.type),
                     node.init.right),
            C.Lt(C.SymbolRef(loopvar), node.test.right),
            C.AddAssign(C.SymbolRef(loopvar), C.Constant(self.block_factor)),
            [None]))
    # Inner loop now runs from xx to fmin(xx + block_factor, original bound),
    # which also handles the trailing partial tile.
    node.init.right = C.SymbolRef(loopvar)
    node.test.right = C.FunctionCall(C.SymbolRef("fmin"), [
        C.Add(C.SymbolRef(loopvar), C.Constant(self.block_factor)),
        node.test.right
    ])
def visit_FunctionCall(self, node):
    """Lower Python-level builtin calls to their C/OpenCL equivalents.

    `rand()` becomes a uniform float in [0, 1) (`rand() / (float)RAND_MAX`),
    or a linear-congruential expression seeded per work-item under OpenCL.
    `max(a, b)` becomes the C `MAX` macro.  Anything else passes through.

    Fix: the original repeated the `isinstance(node.func, C.SymbolRef)`
    check in a second, redundant nested `if`; both name tests now live
    under a single guard.
    """
    if isinstance(node.func, C.SymbolRef):
        if node.func.name == "rand":
            if "OPENCL" in latte.config.parallel_strategy:
                import struct
                platform_c_maxint = 2 ** (struct.Struct('i').size * 8 - 1) - 1
                # Java-style LCG keyed off a random seed plus the work-item id.
                return StringTemplate("((({} + get_global_id(0)) * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1)) >> 16".format(int(random.random() * platform_c_maxint)))
            return C.Div(node, C.Cast(ctypes.c_float(), C.SymbolRef("RAND_MAX")))
        # ANAND: 10/11/2016 Adding following utility to convert python max to c max
        if node.func.name == "max":
            return C.FunctionCall(C.SymbolRef("MAX"),
                                  [node.args[0], node.args[1]])
    return node
def visit_For(self, node):
    """Insert software prefetches after qualifying vector loads, optionally
    hoisting them out of the loop.

    Inside the enclosing loop named by `self.enclosing_loop_var`, up to
    `self.prefetch_count` statements of the form `reg = _mm*_load_/_set1/
    _broadcast(...)` on `self.prefetch_field` get a prefetch of the
    rewritten address plus a running `prefetch_offset_var`, which is then
    advanced by `self.prefetch_offset`.  When this is not the destination
    loop, the prefetch code is emitted into HoistPrefetch.escape_body and
    the offset variable's initialization is registered with
    InitPrefetcher.

    Fix: removed a stray `ast.dump(stmt.right.args[0])` whose return value
    was discarded — a debug leftover doing wasted work per statement.
    """
    node.body = util.flatten([self.visit(s) for s in node.body])
    if node.init.left.name == self.enclosing_loop_var:
        new_body = []
        added_code = False
        prefetch_count = self.prefetch_count
        for stmt in node.body:
            new_body.append(stmt)
            if prefetch_count > 0 and isinstance(stmt, C.BinaryOp) and isinstance(stmt.op, C.Op.Assign) and \
                    isinstance(stmt.right, C.FunctionCall) and "_mm" in stmt.right.func.name \
                    and ("_load_" in stmt.right.func.name or "_set1" in stmt.right.func.name
                         or "_broadcast" in stmt.right.func.name):
                if check_name(stmt.right.args[0], self.prefetch_field):
                    array_ref = deepcopy(stmt.right.args[0])
                    new_array_ref = self.rewrite_arg(array_ref)
                    where_to_add = new_body
                    prefetch_count -= 1
                    if node.init.left.name != self.prefetch_dest_loop:
                        # Hoist: emit into the escape body instead of inline.
                        where_to_add = HoistPrefetch.escape_body
                        added_code = True
                    where_to_add.append(
                        C.FunctionCall(
                            C.SymbolRef(prefetch_symbol_table[
                                self.cacheline_hint]), [
                                    C.Add(new_array_ref,
                                          C.SymbolRef("prefetch_offset_var"))
                                ]))
                    where_to_add.append(
                        C.Assign(
                            C.SymbolRef("prefetch_offset_var"),
                            C.Add(C.SymbolRef("prefetch_offset_var"),
                                  C.Constant(self.prefetch_offset))))
        if added_code:
            InitPrefetcher.init_body.append(
                C.Assign(
                    C.SymbolRef("prefetch_offset_var", ctypes.c_int()),
                    C.Constant(0)))
        node.body = new_body
    return node
def visit_For(self, node):
    """Insert a software prefetch after each qualifying vector load.

    Inside the enclosing loop named by `self.enclosing_loop_var`, up to
    `self.prefetch_count` statements of the form `reg = _mm*_load_/_set1/
    _broadcast(...)` on `self.prefetch_field` are followed by a prefetch
    of the rewritten array reference, using the intrinsic selected by
    `self.cacheline_hint`.

    Fix: removed a stray `ast.dump(stmt.right.args[0])` whose return value
    was discarded — a debug leftover doing wasted work per statement.
    """
    node.body = util.flatten([self.visit(s) for s in node.body])
    if node.init.left.name == self.enclosing_loop_var:
        new_body = []
        prefetch_count = self.prefetch_count
        for stmt in node.body:
            new_body.append(stmt)
            if prefetch_count > 0 and isinstance(stmt, C.BinaryOp) and isinstance(stmt.op, C.Op.Assign) and \
                    isinstance(stmt.right, C.FunctionCall) and "_mm" in stmt.right.func.name \
                    and ("_load_" in stmt.right.func.name or "_set1" in stmt.right.func.name
                         or "_broadcast" in stmt.right.func.name):
                if check_name(stmt.right.args[0], self.prefetch_field):
                    array_ref = deepcopy(stmt.right.args[0])
                    new_array_ref = self.rewrite_arg(array_ref)
                    prefetch_count -= 1
                    new_body.append(
                        C.FunctionCall(
                            C.SymbolRef(prefetch_symbol_table[
                                self.cacheline_hint]), [new_array_ref]))
        node.body = new_body
    return node
def visit_FunctionCall(self, node):
    """Visit every argument and rebuild the call from deep copies of the
    visited results, leaving the callee untouched."""
    visited_args = []
    for arg in node.args:
        visited_args.append(deepcopy(self.visit(arg)))
    return C.FunctionCall(node.func, visited_args)
def visit_FunctionDecl(self, node):
    """Outline each top-level loop of `node` into its own function.

    For every loop statement in the body, the buffers it references are
    collected (self.visit populates self.seen), a standalone function
    taking those buffers as pointer parameters is generated (with
    `__assume_aligned` hints and casts prepended), and the loop is
    replaced by a call to it.  Generated definitions accumulate in
    self.new_funcs; their prototypes in self.func_headers.  Any
    `pre_trans` statements attached to loops are hoisted to the front of
    the function.

    Fixes: the prototype parameter list was built with
    `args2[0].join(...)`, which used the first argument's *name* as the
    join separator — omitting that argument from the list and garbling
    multi-argument prototypes; it is now a plain ", "-join over all
    arguments.  Also removed dead locals (`temp`, `count`, an unused
    `args` list) and a no-op loop over the argument names.
    """
    new_body = []
    pre = []
    _id = 2
    for statement in node.defn:
        self.seen = set()
        if isinstance(statement, ast.For) or isinstance(statement, C.For):
            if hasattr(statement,
                       'pre_trans') and statement.pre_trans is not None:
                # Hoisted setup statements run once, before all outlined calls.
                pre.extend(statement.pre_trans)
            # Populate self.seen with the buffer names this loop touches.
            self.visit(statement)
            arg_names = []
            call_args = []
            for var in self.seen:
                arg_names.append(var)
                call_args.append(C.SymbolRef("_" + var))
            func_name = self.name + str(_id)
            _id = _id + 1
            arg_bufs = [self.buffers[var] for var in arg_names]
            type_sig = [
                np.ctypeslib.ndpointer(buf.dtype, buf.ndim, buf.shape)
                for buf in arg_bufs
            ]
            params = [
                C.SymbolRef("_" + arg, typ())
                for arg, typ in zip(arg_names, type_sig)
            ]
            outlined_func_call = C.FunctionCall(C.SymbolRef(func_name),
                                                call_args)
            outlined_body = []
            for arg in arg_names:
                buf = self.buffers[arg]
                # Alignment hint first, then the pointer cast for this buffer.
                outlined_body.insert(
                    0,
                    StringTemplate("__assume_aligned({}, 64);\n".format(arg)))
                util.insert_cast(outlined_body, buf.shape[1:], arg, buf.dtype)
            outlined_body.append(statement)
            func_decl = C.FunctionDecl(None, C.SymbolRef(func_name), params,
                                       outlined_body)
            # Prototype parameter list, e.g. "float* a, float* b".
            shape_str = ", ".join(
                "{}* {}".format(
                    ctree.types.codegen_type(
                        ctree.types.get_c_type_from_numpy_dtype(
                            self.buffers[d].dtype)()), d)
                for d in arg_names)
            self.func_headers.append(
                StringTemplate(
                    "void $func ($args);", {
                        "func": C.SymbolRef(func_name),
                        "args": C.SymbolRef(shape_str)
                    }))
            self.new_funcs.append(func_decl)
            new_body.append(outlined_func_call)
        else:
            new_body.append(statement)
    new_body = pre + new_body
    node.defn = new_body
    return node
def visit_For(self, node):
    """Lower a parallel for loop into an OpenCL kernel launch.

    A parallel loop whose body consists entirely of parallel sub-loops is
    flattened into 2-D NDRange kernels (one per sub-loop); otherwise the
    loop body becomes a single 1-D kernel.  Loop variables are rebound
    from get_global_id inside the kernel.  Reduction variables, if any,
    get a follow-up reduction loop.  Returns the replacement statements.

    Raises NotImplementedError for non-parallel loops.
    """
    if hasattr(node, 'parallel') and node.parallel:
        # Supports depth one nesting with collapse
        loopvar1 = node.init.left.name
        looplen1 = node.test.right
        to_return = []
        if all(isinstance(s, C.For) and hasattr(s, 'parallel') and s.parallel
               for s in node.body):
            # Collapsed 2-D case: one kernel per inner parallel loop.
            for s in node.body:
                body = s.body
                kernel_args = set()
                loopvar2 = s.init.left.name
                looplen2 = s.test.right
                kernel_name = self._gen_unique_kernel_name()
                params = self.collect_args_and_insert_casts(kernel_args, body)
                # Bind both loop variables from the 2-D global id.
                body.insert(0, C.Assign(
                    C.SymbolRef(loopvar1, ctypes.c_int()),
                    C.FunctionCall(C.SymbolRef("get_global_id"),
                                   [C.Constant(0)])
                ))
                body.insert(0, C.Assign(
                    C.SymbolRef(loopvar2, ctypes.c_int()),
                    C.FunctionCall(C.SymbolRef("get_global_id"),
                                   [C.Constant(1)])
                ))
                kernel_src = C.FunctionDecl(None, C.SymbolRef(kernel_name),
                                            params, body)
                kernel_src.set_kernel()
                self.build_kernel(kernel_src, kernel_name, kernel_args)
                to_return.append(StringTemplate(
                    """
                    size_t global_size_{kernel_name}[2] = {{{looplen1}, {looplen2}}};
                    clEnqueueNDRangeKernel(queue, {kernel_name}, 2, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL);
                    clFinish(queue);
                    """.format(
                        kernel_name=kernel_name,
                        looplen1=looplen1,
                        looplen2=looplen2)
                ))
        else:
            # 1-D case: the whole loop body becomes one kernel.
            kernel_args = set()
            body = node.body
            kernel_name = self._gen_unique_kernel_name()
            params = self.collect_args_and_insert_casts(kernel_args, body)
            body.insert(0, C.Assign(
                C.SymbolRef(loopvar1, ctypes.c_int()),
                C.FunctionCall(C.SymbolRef("get_global_id"),
                               [C.Constant(0)])
            ))
            kernel_src = C.FunctionDecl(None, C.SymbolRef(kernel_name),
                                        params, body)
            kernel_src.set_kernel()
            self.build_kernel(kernel_src, kernel_name, kernel_args)
            to_return.append(StringTemplate(
                """
                size_t global_size_{kernel_name}[1] = {{{looplen1}}};
                clEnqueueNDRangeKernel(queue, {kernel_name}, 1, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL);
                clFinish(queue);
                """.format(
                    kernel_name=kernel_name,
                    looplen1=looplen1)
            ))
        if hasattr(node, 'reduce_vars') and len(node.reduce_vars) > 0:
            # Emit a post-kernel reduction loop per reduction variable.
            for var in node.reduce_vars:
                size = np.prod(self.buffers[var].shape[1:])
                to_return.append(self._gen_reduce_for_loop(node, var, size))
        return to_return
    else:
        raise NotImplementedError(node)
    # NOTE(review): unreachable — both branches above return or raise.
    return node