def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info): from loopy.target.c import POD # uses the correct complex type temp_var_decl = POD(self, decl_info.dtype, decl_info.name) shape = decl_info.shape if temp_var.scope == temp_var_scope.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # above in expr to code mapper) _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs( ) shape = lsize + shape if shape: from cgen import ArrayOf ecm = self.get_expression_to_code_mapper(codegen_state) temp_var_decl = ArrayOf( temp_var_decl, ecm(p.flattened_product(shape), prec=PREC_NONE, type_context="i")) return temp_var_decl
def generate_c_instruction_code(codegen_state, insn):
    """Generate a :class:`cgen.Block` for a ``CInstruction``.

    Inames referenced by the instruction are bound to local variables of the
    kernel's index dtype (unless the bare iname symbol already works), then
    the instruction's literal C code is appended line by line.
    """
    kernel = codegen_state.kernel

    if codegen_state.vectorization_info is not None:
        raise Unvectorizable("C instructions cannot be vectorized")

    from loopy.target.c import POD
    from cgen import Block, Initializer, Line
    from pymbolic.primitives import Variable

    stmts = []

    for name, iname_expr in insn.iname_exprs:
        if (isinstance(iname_expr, Variable)
                and name not in codegen_state.var_subst_map):
            # The bare symbol will work without an explicit binding.
            continue

        stmts.append(Initializer(
            POD(codegen_state.ast_builder, kernel.index_dtype, name),
            codegen_state.expression_to_code_mapper(
                iname_expr, prec=PREC_NONE, type_context="i")))

    if stmts:
        stmts.append(Line())

    for code_line in insn.code.split("\n"):
        stmts.append(Line(code_line))

    return Block(stmts)
def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written):
    """Return a restrict-pointer declarator for an array argument.

    Arguments that are never written get an additional ``const`` wrapper.
    """
    from cgen import Const, RestrictPointer

    pointer_decl = RestrictPointer(POD(self, dtype, name))
    if is_written:
        return pointer_decl
    return Const(pointer_decl)
def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
    """Emit an initialized (optionally ``const``) scalar declaration."""
    from cgen import Const, Initializer

    target_decl = POD(self, dtype, name)
    if is_const:
        target_decl = Const(target_decl)
    return Initializer(target_decl, val_str)
def get_global_arg_decl(self, name, shape, dtype, is_written):
    """Return a restrict-pointer declarator for a global array argument.

    Read-only arguments are wrapped in ``const``.
    """
    from cgen import Const, RestrictPointer

    decl = RestrictPointer(POD(self, dtype, name))
    return decl if is_written else Const(decl)
def get_constant_arg_decl(self, name, shape, dtype, is_written):
    """Return a restrict-pointer declarator for a constant-memory argument.

    Read-only arguments are wrapped in ``const``.
    """
    from loopy.target.c import POD  # uses the correct complex type
    from cgen import Const, RestrictPointer

    decl = RestrictPointer(POD(self, dtype, name))
    return decl if is_written else Const(decl)
def get_global_arg_decl(self, name, shape, dtype, is_written):
    """Return a CUDA restrict-pointer declarator for a global argument.

    Read-only arguments are wrapped in ``const``.
    """
    from loopy.target.c import POD  # uses the correct complex type
    from cgen import Const
    from cgen.cuda import CudaRestrictPointer

    decl = CudaRestrictPointer(POD(self, dtype, name))
    return decl if is_written else Const(decl)
def get_value_arg_decl(self, name, shape, dtype, is_written):
    """Return a declarator for a pass-by-value scalar argument.

    Read-only values become ``const``; under the Fortran ABI the argument
    is additionally passed by pointer.
    """
    assert shape == ()

    from cgen import Const, Pointer

    decl = POD(self, dtype, name)
    if not is_written:
        decl = Const(decl)
    if self.target.fortran_abi:
        # Fortran passes everything by reference.
        decl = Pointer(decl)
    return decl
def get_global_arg_decl(self, name, shape, dtype, is_written):
    """Return an ISPC ``uniform`` pointer declarator for a global argument.

    Read-only arguments are wrapped in ``const``; the whole declaration is
    marked ``uniform``.
    """
    from loopy.target.c import POD  # uses the correct complex type
    from cgen import Const
    from cgen.ispc import ISPCUniform, ISPCUniformPointer

    decl = ISPCUniformPointer(POD(self, dtype, name))
    if not is_written:
        decl = Const(decl)
    return ISPCUniform(decl)
def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner):
    """Emit a C ``for`` loop over *iname* from *lbound* to *ubound* (inclusive)."""
    from pymbolic import var
    from pymbolic.primitives import Comparison
    from pymbolic.mapper.stringifier import PREC_NONE
    from cgen import For, InlineInitializer

    ecm = codegen_state.expression_to_code_mapper

    loop_init = InlineInitializer(
            POD(self, iname_dtype, iname),
            ecm(lbound, PREC_NONE, "i"))
    # Upper bound is inclusive, hence "<=".
    loop_cond = ecm(Comparison(var(iname), "<=", ubound), PREC_NONE, "i")
    return For(loop_init, loop_cond, "++%s" % iname, inner)
def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info):
    """Build a declarator for one temporary variable.

    Read-only temporaries are declared ``const``; shaped ones are declared
    as flat arrays sized by the product of their extents.
    """
    decl = POD(self, decl_info.dtype, decl_info.name)

    if temp_var.read_only:
        from cgen import Const
        decl = Const(decl)

    shape = decl_info.shape
    if shape:
        from cgen import ArrayOf
        ecm = self.get_expression_to_code_mapper(codegen_state)
        size_code = ecm(p.flattened_product(shape),
                prec=PREC_NONE, type_context="i")
        decl = ArrayOf(decl, size_code)

    return decl
def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
        lbound, ubound, inner):
    """Emit an ISPC ``for`` loop over a ``uniform`` *iname* (inclusive bounds)."""
    from loopy.target.c import POD
    from pymbolic.mapper.stringifier import PREC_NONE
    from cgen import For, InlineInitializer
    from cgen.ispc import ISPCUniform

    ecm = codegen_state.expression_to_code_mapper

    loop_init = InlineInitializer(
            ISPCUniform(POD(self, iname_dtype, iname)),
            ecm(lbound, PREC_NONE, "i"))
    # Upper bound is inclusive, hence "<=".
    loop_cond = ecm(p.Comparison(var(iname), "<=", ubound), PREC_NONE, "i")
    return For(loop_init, loop_cond, "++%s" % iname, inner)
def idi_to_cgen_declarator(self, kernel, idi):
    """Map an implemented-data-info record *idi* to a cgen declarator.

    Dispatches on what *idi* describes: an offset/stride scalar, an iname
    argument, or a regular kernel variable (array or scalar).
    """
    from loopy.kernel.data import InameArg
    if (idi.offset_for_name is not None
            or idi.stride_for_name_and_axis is not None):
        # Offsets and strides are read-only scalars passed alongside
        # their array.
        assert not idi.is_written
        from cgen import Const
        return Const(POD(self, idi.dtype, idi.name))

    elif issubclass(idi.arg_class, InameArg):
        return InameArg(idi.name, idi.dtype).get_arg_decl(self)

    else:
        # idi.name may carry a suffix relative to the base variable name
        # (e.g. for separated array pieces); look up the base descriptor.
        name = idi.base_name or idi.name
        var_descr = kernel.get_var_descriptor(name)

        from loopy.kernel.data import ArrayBase
        if isinstance(var_descr, ArrayBase):
            # Pass only the suffix part of the name on to the array's
            # own declarator logic.
            return var_descr.get_arg_decl(
                    self,
                    idi.name[len(name):],
                    idi.shape,
                    idi.dtype,
                    idi.is_written)
        else:
            return var_descr.get_arg_decl(self)
def get_temporary_decl(self, knl, sched_index, temp_var, decl_info):
    """Build a declarator for one temporary variable.

    Private temporaries are duplicated across the local (work-group) size;
    shaped temporaries become arrays sized by the product of their extents.
    """
    from loopy.target.c import POD  # uses the correct complex type
    from loopy.kernel.data import temp_var_scope

    decl = POD(self, decl_info.dtype, decl_info.name)

    shape = decl_info.shape
    if temp_var.scope == temp_var_scope.PRIVATE:
        # FIXME: This is a pretty coarse way of deciding what
        # private temporaries get duplicated. Refine? (See also
        # above in expr to code mapper)
        _, lsize = knl.get_grid_size_upper_bounds_as_exprs()
        shape = lsize + shape

    if shape:
        from cgen import ArrayOf
        extent = " * ".join(str(s) for s in shape)
        decl = ArrayOf(decl, extent)

    return decl
def get_temporary_decls(self, codegen_state, schedule_index):
    """Generate declarations for all temporaries used by the current sub-kernel.

    Returns a list of cgen nodes: first the shared ``char`` base-storage
    arrays (aligned, sized to the max of their users), then the individual
    temporary declarations (plain declarations, or pointer initializers
    into their base storage).
    """
    from loopy.kernel.data import AddressSpace

    kernel = codegen_state.kernel

    base_storage_decls = []
    temp_decls = []

    # {{{ declare temporaries

    base_storage_sizes = {}
    base_storage_to_scope = {}
    base_storage_to_align_bytes = {}

    from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
    # Getting the temporary variables that are needed for the current
    # sub-kernel.
    from loopy.schedule.tools import (temporaries_read_in_subkernel,
                                      temporaries_written_in_subkernel)
    subkernel = kernel.linearization[schedule_index].kernel_name
    sub_knl_temps = (temporaries_read_in_subkernel(kernel, subkernel)
                     | temporaries_written_in_subkernel(kernel, subkernel))

    # Sort by name for deterministic output.
    for tv in sorted(kernel.temporary_variables.values(),
                     key=lambda tv: tv.name):
        decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

        if not tv.base_storage:
            for idi in decl_info:
                # global temp vars are mapped to arguments or global declarations
                if tv.address_space != AddressSpace.GLOBAL and (
                        tv.name in sub_knl_temps):
                    decl = self.wrap_temporary_decl(
                        self.get_temporary_decl(codegen_state,
                            schedule_index, tv, idi),
                        tv.address_space)

                    if tv.initializer is not None:
                        # Initializers are only allowed on read-only temps.
                        assert tv.read_only
                        decl = Initializer(
                            decl,
                            generate_array_literal(codegen_state, tv,
                                tv.initializer))

                    temp_decls.append(decl)
        else:
            # Temporary lives inside a shared base-storage blob.
            assert tv.initializer is None

            if (tv.address_space == AddressSpace.GLOBAL
                    and codegen_state.is_generating_device_code):
                # global temps trigger no codegen in the device code
                continue

            offset = 0
            base_storage_sizes.setdefault(tv.base_storage, []).append(tv.nbytes)
            base_storage_to_scope.setdefault(tv.base_storage,
                    []).append(tv.address_space)

            # Vector axes must be aligned to the full vector width.
            align_size = tv.dtype.itemsize

            from loopy.kernel.array import VectorArrayDimTag
            for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                if isinstance(dim_tag, VectorArrayDimTag):
                    align_size *= axis_len

            base_storage_to_align_bytes.setdefault(tv.base_storage,
                    []).append(align_size)

            for idi in decl_info:
                # cast_decl is only used to render the C cast text; the
                # actual variable declaration is temp_var_decl.
                cast_decl = POD(self, idi.dtype, "")
                temp_var_decl = POD(self, idi.dtype, idi.name)

                cast_decl = self.wrap_temporary_decl(
                        cast_decl, tv.address_space)
                temp_var_decl = self.wrap_temporary_decl(
                        temp_var_decl, tv.address_space)

                if tv._base_storage_access_may_be_aliasing:
                    ptrtype = _ConstPointer
                else:
                    # The 'restrict' part of this is a complete lie--of course
                    # all these temporaries are aliased. But we're promising to
                    # not use them to shovel data from one representation to the
                    # other. That counts, right?
                    ptrtype = _ConstRestrictPointer

                cast_decl = ptrtype(cast_decl)
                temp_var_decl = ptrtype(temp_var_decl)

                cast_tp, cast_d = cast_decl.get_decl_pair()
                # Initialize the pointer into the base-storage blob at the
                # running byte offset.
                temp_var_decl = Initializer(
                    temp_var_decl,
                    "({} {}) ({} + {})".format(" ".join(cast_tp), cast_d,
                        tv.base_storage, offset))

                temp_decls.append(temp_var_decl)

                from pytools import product
                offset += (idi.dtype.itemsize
                        * product(si for si in idi.shape))

    ecm = self.get_expression_to_code_mapper(codegen_state)

    for bs_name, bs_sizes in sorted(base_storage_sizes.items()):
        bs_var_decl = Value("char", bs_name)
        from pytools import single_valued
        # All users of one base storage must agree on its address space.
        bs_var_decl = self.wrap_temporary_decl(
            bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

        # FIXME: Could try to use isl knowledge to simplify max.
        if all(isinstance(bs, int) for bs in bs_sizes):
            bs_size_max = max(bs_sizes)
        else:
            bs_size_max = p.Max(tuple(bs_sizes))

        bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

        alignment = max(base_storage_to_align_bytes[bs_name])
        bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

        base_storage_decls.append(bs_var_decl)

    # }}}

    result = base_storage_decls + temp_decls

    if result:
        result.append(Line())

    return result
def get_temporary_decls(self, codegen_state, schedule_index):
    """Generate declarations for all temporaries used by the current sub-kernel.

    Returns a list of cgen nodes: the shared ``char`` base-storage arrays
    (aligned, sized to the max of their users) followed by the individual
    temporary declarations (plain declarations, or pointer initializers
    into their base storage).
    """
    from loopy.kernel.data import AddressSpace

    kernel = codegen_state.kernel

    base_storage_decls = []
    temp_decls = []

    # {{{ declare temporaries

    base_storage_sizes = {}
    base_storage_to_scope = {}
    base_storage_to_align_bytes = {}

    from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line

    # Getting the temporary variables that are needed for the current
    # sub-kernel.
    from loopy.schedule.tools import (
            temporaries_read_in_subkernel,
            temporaries_written_in_subkernel)
    subkernel = kernel.schedule[schedule_index].kernel_name
    sub_knl_temps = (
            temporaries_read_in_subkernel(kernel, subkernel)
            | temporaries_written_in_subkernel(kernel, subkernel))

    # Sort by name for deterministic output.
    for tv in sorted(
            six.itervalues(kernel.temporary_variables),
            key=lambda tv: tv.name):
        decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

        if not tv.base_storage:
            for idi in decl_info:
                # global temp vars are mapped to arguments or global declarations
                if tv.address_space != AddressSpace.GLOBAL and (
                        tv.name in sub_knl_temps):
                    decl = self.wrap_temporary_decl(
                            self.get_temporary_decl(
                                codegen_state, schedule_index, tv, idi),
                            tv.address_space)

                    if tv.initializer is not None:
                        # Initializers are only allowed on read-only temps.
                        assert tv.read_only
                        decl = Initializer(decl, generate_array_literal(
                            codegen_state, tv, tv.initializer))

                    temp_decls.append(decl)

        else:
            # Temporary lives inside a shared base-storage blob.
            assert tv.initializer is None

            offset = 0
            base_storage_sizes.setdefault(tv.base_storage, []).append(
                    tv.nbytes)
            base_storage_to_scope.setdefault(tv.base_storage, []).append(
                    tv.address_space)

            # Vector axes must be aligned to the full vector width.
            align_size = tv.dtype.itemsize

            from loopy.kernel.array import VectorArrayDimTag
            for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                if isinstance(dim_tag, VectorArrayDimTag):
                    align_size *= axis_len

            base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                    align_size)

            for idi in decl_info:
                # cast_decl is only used to render the C cast text; the
                # actual variable declaration is temp_var_decl.
                cast_decl = POD(self, idi.dtype, "")
                temp_var_decl = POD(self, idi.dtype, idi.name)

                cast_decl = self.wrap_temporary_decl(cast_decl,
                        tv.address_space)
                temp_var_decl = self.wrap_temporary_decl(
                        temp_var_decl, tv.address_space)

                if tv._base_storage_access_may_be_aliasing:
                    ptrtype = _ConstPointer
                else:
                    # The 'restrict' part of this is a complete lie--of course
                    # all these temporaries are aliased. But we're promising to
                    # not use them to shovel data from one representation to the
                    # other. That counts, right?
                    ptrtype = _ConstRestrictPointer

                cast_decl = ptrtype(cast_decl)
                temp_var_decl = ptrtype(temp_var_decl)

                cast_tp, cast_d = cast_decl.get_decl_pair()
                # Initialize the pointer into the base-storage blob at the
                # running byte offset.
                temp_var_decl = Initializer(
                        temp_var_decl,
                        "(%s %s) (%s + %s)" % (
                            " ".join(cast_tp), cast_d,
                            tv.base_storage, offset))

                temp_decls.append(temp_var_decl)

                from pytools import product
                offset += (
                        idi.dtype.itemsize
                        * product(si for si in idi.shape))

    ecm = self.get_expression_to_code_mapper(codegen_state)

    for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
        bs_var_decl = Value("char", bs_name)
        from pytools import single_valued
        # All users of one base storage must agree on its address space.
        bs_var_decl = self.wrap_temporary_decl(
                bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

        # FIXME: Could try to use isl knowledge to simplify max.
        if all(isinstance(bs, int) for bs in bs_sizes):
            bs_size_max = max(bs_sizes)
        else:
            bs_size_max = p.Max(tuple(bs_sizes))

        bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

        alignment = max(base_storage_to_align_bytes[bs_name])
        bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

        base_storage_decls.append(bs_var_decl)

    # }}}

    result = base_storage_decls + temp_decls

    if result:
        result.append(Line())

    return result
def get_temporary_decls(self, codegen_state, schedule_index):
    """Generate declarations for all of the kernel's temporaries.

    Returns a list of cgen nodes: the shared ``char`` base-storage arrays
    (aligned, sized to the max of their users) followed by the individual
    temporary declarations (plain declarations, or pointer initializers
    into their base storage).
    """
    from loopy.kernel.data import temp_var_scope

    kernel = codegen_state.kernel

    base_storage_decls = []
    temp_decls = []

    # {{{ declare temporaries

    base_storage_sizes = {}
    base_storage_to_scope = {}
    base_storage_to_align_bytes = {}

    from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line

    # Sort by name for deterministic output.
    for tv in sorted(six.itervalues(kernel.temporary_variables),
            key=lambda tv: tv.name):
        decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

        if not tv.base_storage:
            for idi in decl_info:
                # global temp vars are mapped to arguments or global declarations
                if tv.scope != temp_var_scope.GLOBAL:
                    decl = self.wrap_temporary_decl(
                            self.get_temporary_decl(codegen_state,
                                schedule_index, tv, idi),
                            tv.scope)

                    if tv.initializer is not None:
                        # Initializers are only allowed on read-only temps.
                        assert tv.read_only
                        decl = Initializer(
                                decl,
                                generate_array_literal(codegen_state, tv,
                                    tv.initializer))

                    temp_decls.append(decl)

        else:
            # Temporary lives inside a shared base-storage blob.
            assert tv.initializer is None

            offset = 0
            base_storage_sizes.setdefault(tv.base_storage, []).append(tv.nbytes)
            base_storage_to_scope.setdefault(tv.base_storage, []).append(tv.scope)

            # Vector axes must be aligned to the full vector width.
            align_size = tv.dtype.itemsize

            from loopy.kernel.array import VectorArrayDimTag
            for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                if isinstance(dim_tag, VectorArrayDimTag):
                    align_size *= axis_len

            base_storage_to_align_bytes.setdefault(tv.base_storage,
                    []).append(align_size)

            for idi in decl_info:
                # cast_decl is only used to render the C cast text; the
                # actual variable declaration is temp_var_decl.
                cast_decl = POD(self, idi.dtype, "")
                temp_var_decl = POD(self, idi.dtype, idi.name)

                cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope)
                temp_var_decl = self.wrap_temporary_decl(
                        temp_var_decl, tv.scope)

                # The 'restrict' part of this is a complete lie--of course
                # all these temporaries are aliased. But we're promising to
                # not use them to shovel data from one representation to the
                # other. That counts, right?
                cast_decl = _ConstRestrictPointer(cast_decl)
                temp_var_decl = _ConstRestrictPointer(temp_var_decl)

                cast_tp, cast_d = cast_decl.get_decl_pair()
                # Initialize the pointer into the base-storage blob at the
                # running byte offset.
                temp_var_decl = Initializer(
                        temp_var_decl,
                        "(%s %s) (%s + %s)" % (" ".join(cast_tp), cast_d,
                            tv.base_storage, offset))

                temp_decls.append(temp_var_decl)

                from pytools import product
                offset += (idi.dtype.itemsize
                        * product(si for si in idi.shape))

    ecm = self.get_expression_to_code_mapper(codegen_state)

    for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
        bs_var_decl = Value("char", bs_name)
        from pytools import single_valued
        # All users of one base storage must agree on its scope.
        bs_var_decl = self.wrap_temporary_decl(
                bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

        # FIXME: Could try to use isl knowledge to simplify max.
        if all(isinstance(bs, int) for bs in bs_sizes):
            bs_size_max = max(bs_sizes)
        else:
            bs_size_max = p.Max(tuple(bs_sizes))

        bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

        alignment = max(base_storage_to_align_bytes[bs_name])
        bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

        base_storage_decls.append(bs_var_decl)

    # }}}

    result = base_storage_decls + temp_decls

    if result:
        result.append(Line())

    return result
def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
        lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
    """Emit an OpenCL atomic read-modify-write as a compare-and-swap loop.

    Generates ``do { old = lhs; new = rhs(old); } while (cmpxchg(&lhs, old,
    new) != old);`` using ``atomic_cmpxchg``/``atom_cmpxchg`` for 32-/64-bit
    operands. Floating-point operands are reinterpreted through an
    integer type of matching size, since OpenCL cmpxchg only works on
    integers.

    :raises LoopyError: for unsupported atomic sizes or variable kinds.
    :raises NotImplementedError: for unsupported dtypes.
    """
    from pymbolic.mapper.stringifier import PREC_NONE

    # FIXME: Could detect operations, generate atomic_{add,...} when
    # appropriate.

    if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
            np.int32, np.int64, np.float32, np.float64]:
        from cgen import Block, DoWhile, Assign
        from loopy.target.c import POD
        old_val_var = codegen_state.var_name_generator("loopy_old_val")
        new_val_var = codegen_state.var_name_generator("loopy_new_val")

        # Consolidated import (TemporaryVariable was previously imported
        # twice in this function).
        from loopy.kernel.data import (
                TemporaryVariable, ArrayArg, AddressSpace)
        ecm = codegen_state.expression_to_code_mapper.with_assignments({
            old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
            new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
            })

        lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

        from pymbolic.mapper.substitutor import make_subst_func
        from pymbolic import var
        from loopy.symbolic import SubstitutionMapper

        # Re-express the RHS in terms of the snapshot of the old value.
        subst = SubstitutionMapper(
                make_subst_func({lhs_expr: var(old_val_var)}))
        rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE,
                type_context=rhs_type_context,
                needed_dtype=lhs_dtype)

        if lhs_dtype.numpy_dtype.itemsize == 4:
            func_name = "atomic_cmpxchg"
        elif lhs_dtype.numpy_dtype.itemsize == 8:
            func_name = "atom_cmpxchg"
        else:
            raise LoopyError("unexpected atomic size")

        cast_str = ""
        old_val = old_val_var
        new_val = new_val_var

        if lhs_dtype.numpy_dtype.kind == "f":
            # cmpxchg only exists for integers: reinterpret the float bits
            # through a same-size integer type.
            if lhs_dtype.numpy_dtype == np.float32:
                ctype = "int"
            elif lhs_dtype.numpy_dtype == np.float64:
                ctype = "long"
            else:
                # Unreachable given the dtype check above; raise instead of
                # 'assert False' so it survives python -O.
                raise AssertionError()

            # The cast needs the correct OpenCL address-space qualifier.
            if (isinstance(lhs_var, ArrayArg)
                    and lhs_var.address_space == AddressSpace.GLOBAL):
                var_kind = "__global"
            elif (isinstance(lhs_var, ArrayArg)
                    and lhs_var.address_space == AddressSpace.LOCAL):
                var_kind = "__local"
            elif (isinstance(lhs_var, TemporaryVariable)
                    and lhs_var.address_space == AddressSpace.LOCAL):
                var_kind = "__local"
            elif (isinstance(lhs_var, TemporaryVariable)
                    and lhs_var.address_space == AddressSpace.GLOBAL):
                var_kind = "__global"
            else:
                # BUG FIX: the format string previously contained a single
                # "%s" but was applied to a two-element tuple, so this line
                # raised TypeError instead of the intended LoopyError.
                raise LoopyError("unexpected kind of variable '%s' in "
                        "atomic operation: '%s'"
                        % (lhs_var.name, type(lhs_var).__name__))

            old_val = "*(%s *) &" % ctype + old_val
            new_val = "*(%s *) &" % ctype + new_val
            cast_str = "(%s %s *) " % (var_kind, ctype)

        return Block([
            POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                old_val_var),
            POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                new_val_var),
            DoWhile(
                "%(func_name)s("
                "%(cast_str)s&(%(lhs_expr)s), "
                "%(old_val)s, "
                "%(new_val)s"
                ") != %(old_val)s"
                % {
                    "func_name": func_name,
                    "cast_str": cast_str,
                    "lhs_expr": lhs_expr_code,
                    "old_val": old_val,
                    "new_val": new_val,
                    },
                Block([
                    Assign(old_val_var, lhs_expr_code),
                    Assign(new_val_var, rhs_expr_code),
                    ])),
            ])
    else:
        raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
        lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
    """Emit a CUDA atomic read-modify-write.

    When the RHS is a sum, uses ``atomicAdd`` directly; otherwise emits a
    compare-and-swap loop using ``atomicCAS``. Floating-point operands in
    the CAS path are reinterpreted through a same-size integer type.

    :raises NotImplementedError: for unsupported dtypes.
    """
    from pymbolic.primitives import Sum
    from cgen import Statement
    from pymbolic.mapper.stringifier import PREC_NONE

    if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
            np.int32, np.int64, np.float32, np.float64]:
        # atomicAdd
        if isinstance(rhs_expr, Sum):
            ecm = self.get_expression_to_code_mapper(codegen_state)

            # NOTE(review): this assumes rhs has the shape lhs + delta and
            # strips lhs from the sum; confirm that an rhs Sum not
            # containing lhs cannot reach this path.
            new_rhs_expr = Sum(tuple(c for c in rhs_expr.children
                                     if c != lhs_expr))
            lhs_expr_code = ecm(lhs_expr)
            rhs_expr_code = ecm(new_rhs_expr)

            return Statement("atomicAdd(&{}, {})".format(
                lhs_expr_code, rhs_expr_code))
        else:
            from cgen import Block, DoWhile, Assign
            from loopy.target.c import POD
            old_val_var = codegen_state.var_name_generator("loopy_old_val")
            new_val_var = codegen_state.var_name_generator("loopy_new_val")

            from loopy.kernel.data import TemporaryVariable
            ecm = codegen_state.expression_to_code_mapper.with_assignments(
                    {
                        old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
                        new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
                        })

            lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

            from pymbolic.mapper.substitutor import make_subst_func
            from pymbolic import var
            from loopy.symbolic import SubstitutionMapper

            # Re-express the RHS in terms of the snapshot of the old value.
            subst = SubstitutionMapper(
                    make_subst_func({lhs_expr: var(old_val_var)}))
            rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE,
                    type_context=rhs_type_context,
                    needed_dtype=lhs_dtype)

            cast_str = ""
            old_val = old_val_var
            new_val = new_val_var

            if lhs_dtype.numpy_dtype.kind == "f":
                # atomicCAS only exists for integers: reinterpret the float
                # bits through a same-size integer type.
                if lhs_dtype.numpy_dtype == np.float32:
                    ctype = "int"
                elif lhs_dtype.numpy_dtype == np.float64:
                    ctype = "long"
                else:
                    raise AssertionError()

                old_val = "*(%s *) &" % ctype + old_val
                new_val = "*(%s *) &" % ctype + new_val
                cast_str = "(%s *) " % (ctype)

            return Block([
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    old_val_var),
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    new_val_var),
                DoWhile(
                    "atomicCAS("
                    "%(cast_str)s&(%(lhs_expr)s), "
                    "%(old_val)s, "
                    "%(new_val)s"
                    ") != %(old_val)s"
                    % {
                        "cast_str": cast_str,
                        "lhs_expr": lhs_expr_code,
                        "old_val": old_val,
                        "new_val": new_val,
                        },
                    Block([
                        Assign(old_val_var, lhs_expr_code),
                        Assign(new_val_var, rhs_expr_code),
                        ])),
                ])
    else:
        raise NotImplementedError("atomic update for '%s'" % lhs_dtype)