def split_access_axis(expr):
    """Rewrite the subscript *expr* so that its split axis becomes two axes.

    The axis number and the storage order (``"F"`` or ``"C"``) are looked up
    in ``array_to_rest`` by the name of the subscripted array.  The index at
    that axis is replaced by an inner index, and an outer index is inserted
    right after it (``"F"``) or right before it (``"C"``).

    ``array_to_rest``, ``auto_split_inames``, ``kernel``, ``split_vars``,
    ``var_name_gen`` and ``count`` are not defined here; they are expected to
    be provided by the surrounding scope.

    :arg expr: a subscript expression whose aggregate has a ``name``.
    :returns: a new subscript of the same aggregate with one more index.
    :raises RuntimeError: if inames are split automatically and the index on
        the split axis is not a single variable, or if the order is neither
        ``"F"`` nor ``"C"``.
    """
    axis_nr, order = array_to_rest[expr.aggregate.name]

    idx = expr.index
    if not isinstance(idx, tuple):
        idx = (idx,)
    idx = list(idx)

    axis_idx = idx[axis_nr]

    if auto_split_inames:
        from pymbolic.primitives import Variable

        if not isinstance(axis_idx, Variable):
            raise RuntimeError("found access '%s' in which axis %d is not a "
                    "single variable--cannot split "
                    "(Have you tried to do the split yourself, manually, "
                    "beforehand? If so, you shouldn't.)"
                    % (expr, axis_nr))

        split_iname = axis_idx.name
        assert split_iname in kernel.all_inames()

        # Reuse the outer/inner names if this iname was already encountered,
        # so that all accesses using it agree on the new iname names.
        try:
            outer_iname, inner_iname = split_vars[split_iname]
        except KeyError:
            outer_iname = var_name_gen(split_iname+"_outer")
            inner_iname = var_name_gen(split_iname+"_inner")
            split_vars[split_iname] = outer_iname, inner_iname

        inner_index = Variable(inner_iname)
        outer_index = Variable(outer_iname)

    else:
        from loopy.symbolic import simplify_using_aff
        inner_index = simplify_using_aff(kernel, axis_idx % count)
        outer_index = simplify_using_aff(kernel, axis_idx // count)

    idx[axis_nr] = inner_index

    # The insertion position must be relative to *this* array's split axis
    # (axis_nr), i.e. the slot that now holds the inner index.
    if order == "F":
        idx.insert(axis_nr+1, outer_index)
    elif order == "C":
        idx.insert(axis_nr, outer_index)
    else:
        raise RuntimeError("order '%s' not understood" % order)

    return expr.aggregate.index(tuple(idx))
def split_access_axis(expr):
    """Split one axis of the array access *expr* into an inner and outer axis.

    ``array_to_rest`` maps the accessed array's name to the pair
    ``(axis_nr, order)``.  The index found at ``axis_nr`` is replaced by the
    inner index; the outer index is inserted directly behind it for order
    ``"F"`` and directly in front of it for order ``"C"``.

    The names ``array_to_rest``, ``auto_split_inames``, ``kernel``,
    ``split_vars``, ``var_name_gen`` and ``count`` are free variables of this
    function and must come from the enclosing scope.

    :arg expr: a subscript expression whose aggregate has a ``name``.
    :returns: the rewritten subscript, with one additional index.
    :raises RuntimeError: when automatic iname splitting is requested but the
        index on the split axis is not a plain variable, or when the order is
        not understood.
    """
    axis_nr, order = array_to_rest[expr.aggregate.name]

    idx = expr.index
    if not isinstance(idx, tuple):
        idx = (idx, )
    idx = list(idx)

    axis_idx = idx[axis_nr]

    if auto_split_inames:
        from pymbolic.primitives import Variable

        if not isinstance(axis_idx, Variable):
            raise RuntimeError(
                "found access '%s' in which axis %d is not a "
                "single variable--cannot split "
                "(Have you tried to do the split yourself, manually, "
                "beforehand? If so, you shouldn't.)" % (expr, axis_nr))

        split_iname = axis_idx.name
        assert split_iname in kernel.all_inames()

        # One (outer, inner) name pair per split iname, shared by all
        # accesses that use that iname.
        try:
            outer_iname, inner_iname = split_vars[split_iname]
        except KeyError:
            outer_iname = var_name_gen(split_iname + "_outer")
            inner_iname = var_name_gen(split_iname + "_inner")
            split_vars[split_iname] = outer_iname, inner_iname

        inner_index = Variable(inner_iname)
        outer_index = Variable(outer_iname)

    else:
        from loopy.symbolic import simplify_using_aff
        inner_index = simplify_using_aff(kernel, axis_idx % count)
        outer_index = simplify_using_aff(kernel, axis_idx // count)

    idx[axis_nr] = inner_index

    # Position the outer index relative to axis_nr, the axis that was just
    # overwritten with the inner index.
    if order == "F":
        idx.insert(axis_nr + 1, outer_index)
    elif order == "C":
        idx.insert(axis_nr, outer_index)
    else:
        raise RuntimeError("order '%s' not understood" % order)

    return expr.aggregate.index(tuple(idx))
def map_subscript(self, expr, expn_state):
    """Map a subscript, simplifying every mapped index.

    Each entry of ``expr.index_tuple`` is first mapped recursively with
    *expn_state* and then passed through ``simplify_using_aff`` together with
    ``self.kernel``.  The aggregate is mapped recursively as well.

    :returns: a new :class:`pymbolic.primitives.Subscript`.
    """
    from loopy.symbolic import simplify_using_aff
    from pymbolic.primitives import Subscript

    simplified_indices = []
    for index in expr.index_tuple:
        mapped_index = self.rec(index, expn_state)
        simplified_indices.append(
                simplify_using_aff(self.kernel, mapped_index))

    mapped_aggregate = self.rec(expr.aggregate, expn_state)
    return Subscript(mapped_aggregate, tuple(simplified_indices))
def get_arg_descriptor_for_expression(kernel, expr):
    """
    :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor`
        describing the argument expression *expr* which occurs
        in a call in the code of *kernel*.

    If *expr* is a :class:`loopy.symbolic.SubArrayRef`, the descriptor carries
    the address space of the referenced variable, one fixed-stride dim tag
    per swept iname and a shape derived from the bounds of the swept inames.
    Any other expression is run through :class:`ExpressionIsScalarChecker`
    and described by a :class:`ValueArgDescriptor`.

    :raises LoopyError: if the variable referenced by a sub-array ref is
        neither a temporary variable nor an array argument.
    """
    from loopy.symbolic import (SubArrayRef, pw_aff_to_expr,
            SweptInameStrideCollector)
    from loopy.kernel.data import TemporaryVariable, ArrayArg

    if not isinstance(expr, SubArrayRef):
        ExpressionIsScalarChecker(kernel)(expr)
        return ValueArgDescriptor()

    name = expr.subscript.aggregate.name
    arg = kernel.get_var_descriptor(name)

    if not isinstance(arg, (TemporaryVariable, ArrayArg)):
        # Report the name of the offending array; it was already looked up
        # above.
        raise LoopyError("unsupported argument type "
                "'%s' of '%s' in call statement"
                % (type(arg).__name__, name))

    aspace = arg.address_space

    from loopy.kernel.array import FixedStrideArrayDimTag as DimTag

    # This helps in identifying identities like
    # "2*(i//2) + i%2" := "i"
    # See the kernel in
    # test_callables.py::test_shape_translation_through_sub_array_refs

    from loopy.symbolic import simplify_using_aff
    linearized_index = simplify_using_aff(
            kernel,
            sum(dim_tag.stride*iname
                for dim_tag, iname in zip(arg.dim_tags,
                                          expr.subscript.index_tuple)))

    strides_as_dict = SweptInameStrideCollector(
            tuple(iname.name for iname in expr.swept_inames)
            )(linearized_index)

    sub_dim_tags = tuple(
            # Not all swept inames necessarily occur in the expression.
            DimTag(strides_as_dict.get(iname, 0))
            for iname in expr.swept_inames)

    # Extent of each swept iname: (upper bound - lower bound) + 1.
    sub_shape = []
    for iname in expr.swept_inames:
        bounds = kernel.get_iname_bounds(iname.name)
        sub_shape.append(
                pw_aff_to_expr(
                    bounds.upper_bound_pw_aff - bounds.lower_bound_pw_aff)
                + 1)

    return ArrayArgDescriptor(
            address_space=aspace,
            dim_tags=sub_dim_tags,
            shape=tuple(sub_shape))
def map_variable(self, expr, type_context):
    """Map a bare (unsubscripted) variable reference.

    * A name found in ``codegen_state.var_subst_map`` is replaced by its
      recursively mapped substitute (prefixed with a ``/* name */`` comment
      when ``annotate_inames`` is set).
    * A zero-dimensional array argument becomes a subscript: through
      ``make_subscript`` when it has an offset, ``name[0]`` otherwise.
      Arrays of any other shape may not be referenced without a subscript.
    * Value arguments under the Fortran ABI, and temporaries that use base
      storage or live in the global address space, are accessed as
      ``name[0]``.
    * Everything else is emitted as a plain variable.

    The name goes through ``kernel.mangle_symbol`` before being emitted in
    the last two cases.

    :raises RuntimeError: for an unsubscripted reference to an array
        argument whose shape is not ``()``.
    """
    from loopy.kernel.data import ValueArg, AddressSpace

    name = expr.name
    subst_map = self.codegen_state.var_subst_map

    if name in subst_map:
        if self.kernel.options.annotate_inames:
            return var(
                    "/* {} */ {}".format(
                        name,
                        self.rec(subst_map[name], type_context)))
        return self.rec(subst_map[name], type_context)

    # Whether the emitted variable has to be dereferenced as ``x[0]``.
    index_with_zero = False

    if name in self.kernel.arg_dict:
        arg = self.kernel.arg_dict[name]

        from loopy.kernel.array import ArrayBase
        if isinstance(arg, ArrayBase):
            is_zero_dimensional = arg.shape == ()
            if not is_zero_dimensional:
                raise RuntimeError("unsubscripted reference to array '%s'"
                        % name)

            if not arg.offset:
                return var(name)[0]

            from loopy.kernel.array import _apply_offset
            from loopy.symbolic import simplify_using_aff

            offset_subscript = _apply_offset(0, name, arg)
            return self.make_subscript(
                    arg,
                    var(name),
                    simplify_using_aff(
                        self.kernel, self.rec(offset_subscript, "i")))

        index_with_zero = bool(
                isinstance(arg, ValueArg) and self.fortran_abi)

    elif name in self.kernel.temporary_variables:
        temporary = self.kernel.temporary_variables[name]
        index_with_zero = bool(
                temporary.base_storage
                or temporary.address_space == AddressSpace.GLOBAL)

    mangled = self.kernel.mangle_symbol(
            self.codegen_state.ast_builder, name)
    if mangled is not None:
        _, c_name = mangled
        reference = var(c_name)
    else:
        reference = var(name)

    if index_with_zero:
        return reference[0]
    return reference
def split_access_axis(expr):
    """Replace the index on axis ``axis_nr`` of *expr* by two indices.

    The original index ``i`` on that axis becomes ``i % count`` (inner) and
    ``i // count`` (outer), both simplified with ``simplify_using_aff``.
    For order ``"F"`` the outer index follows the inner one, for order
    ``"C"`` it precedes it.

    ``axis_nr``, ``order``, ``count`` and ``kernel`` are taken from the
    enclosing scope.

    :raises RuntimeError: if the order is neither ``"F"`` nor ``"C"``.
    """
    raw_index = expr.index
    if isinstance(raw_index, tuple):
        indices = list(raw_index)
    else:
        indices = [raw_index]

    split_index = indices[axis_nr]

    from loopy.symbolic import simplify_using_aff
    inner_index = simplify_using_aff(kernel, split_index % count)
    outer_index = simplify_using_aff(kernel, split_index // count)

    if order == "C":
        outer_position = axis_nr
    elif order == "F":
        outer_position = axis_nr+1
    else:
        raise RuntimeError("order '%s' not understood" % order)

    indices[axis_nr] = inner_index
    indices.insert(outer_position, outer_index)

    return expr.aggregate.index(tuple(indices))
def split_access_axis(expr):
    """Turn the index on the split axis of *expr* into an inner/outer pair.

    With ``i`` the index found on axis ``axis_nr``, the inner index is
    ``i % count`` and the outer index is ``i // count``; both are passed
    through ``simplify_using_aff``.  The inner index takes the place of
    ``i``; the outer index is inserted behind it (order ``"F"``) or in front
    of it (order ``"C"``).

    ``axis_nr``, ``order``, ``count`` and ``kernel`` are free variables
    supplied by the enclosing scope.

    :raises RuntimeError: for any order other than ``"F"`` and ``"C"``.
    """
    index_list = expr.index
    if not isinstance(index_list, tuple):
        index_list = (index_list, )
    index_list = list(index_list)

    old_axis_index = index_list[axis_nr]

    from loopy.symbolic import simplify_using_aff
    remainder = simplify_using_aff(kernel, old_axis_index % count)
    quotient = simplify_using_aff(kernel, old_axis_index // count)

    index_list[axis_nr] = remainder

    outer_goes_after_inner = order == "F"
    if not outer_goes_after_inner and order != "C":
        raise RuntimeError("order '%s' not understood" % order)

    if outer_goes_after_inner:
        index_list.insert(axis_nr + 1, quotient)
    else:
        index_list.insert(axis_nr, quotient)

    return expr.aggregate.index(tuple(index_list))
def map_subscript(self, expr, enclosing_prec, type_context):
    """Generate the code string for the subscript *expr*.

    Subscripts whose aggregate is not a plain variable are emitted verbatim
    as ``aggregate[index]``.  Otherwise the array is looked up with
    ``find_array`` and ``get_access_info`` determines how it is accessed:

    * image arguments are read with ``read_imagef``, with the indices given
      in reverse order,
    * global arguments and temporaries are emitted as ``name[subscript]``,
      or, when there is no subscript, as ``*name`` (global argument) or
      ``name`` (temporary); a vector index, if any, is applied with
      ``add_vector_access``.

    :returns: a string of generated code.
    :raises NotImplementedError: for images whose dtype is not ``float32``,
        ``float64`` or a vector dtype.
    """
    def base_impl(expr, enclosing_prec, type_context):
        return self.parenthesize_if_needed(
                "%s[%s]" % (
                    self.rec(expr.aggregate, PREC_CALL, type_context),
                    self.rec(expr.index, PREC_NONE, 'i')),
                enclosing_prec, PREC_CALL)

    from pymbolic.primitives import Variable
    if not isinstance(expr.aggregate, Variable):
        return base_impl(expr, enclosing_prec, type_context)

    ary = self.find_array(expr)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate

    from loopy.symbolic import simplify_using_aff
    index_tuple = tuple(
            simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

    access_info = get_access_info(self.kernel.target, ary, index_tuple,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

    from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

    if isinstance(ary, ImageArg):
        # Use index_tuple rather than index: a single (non-tuple) index
        # cannot be reversed with [::-1].
        base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
                % (ary.name, ary.dimensions,
                    ", ".join(self.rec(idx, PREC_NONE, 'i')
                        for idx in expr.index_tuple[::-1])))

        if ary.dtype.numpy_dtype == np.float32:
            return base_access+".x"
        if self.kernel.target.is_vector_dtype(ary.dtype):
            return base_access
        elif ary.dtype.numpy_dtype == np.float64:
            return "as_double(%s.xy)" % base_access
        else:
            raise NotImplementedError(
                    "non-floating-point images not supported for now")

    elif isinstance(ary, (GlobalArg, TemporaryVariable)):
        if len(access_info.subscripts) == 0:
            if isinstance(ary, GlobalArg):
                # unsubscripted global args are pointers
                result = "*" + access_info.array_name
            else:
                # unsubscripted temp vars are scalars
                result = access_info.array_name

        else:
            subscript, = access_info.subscripts
            result = self.parenthesize_if_needed(
                    "%s[%s]" % (
                        access_info.array_name,
                        self.rec(subscript, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

        if access_info.vector_index is not None:
            return self.codegen_state.ast_builder.add_vector_access(
                result, access_info.vector_index)
        else:
            return result

    else:
        assert False
def emit_assignment(self, codegen_state, insn):
    """Generate the code for the assignment instruction *insn*.

    Ordinarily this returns a :class:`cgen.Assign` of the generated
    right-hand side to the generated left-hand side.

    If the instruction carries the tag ``"!streaming_store"``, a
    ``streaming_store(array + offset, rhs)`` statement is returned instead.
    For that, the linearized subscript of the assignee must contain the
    iname tagged with local axis 0 exactly once as a bare term (i.e. with
    stride 1); that term is left out of ``offset``.  A right-hand side that
    does not depend on that iname is wrapped in ``broadcast(..., 0)``.

    :raises NotImplementedError: for atomic instructions.
    :raises LoopyError: if a streaming store is requested for an unsupported
        array type, has no single subscript, does not have stride 1 in the
        local-axis-0 iname, or uses a short-vector data type.
    """
    kernel = codegen_state.kernel
    ecm = codegen_state.expression_to_code_mapper

    assignee_var_name, = insn.assignee_var_names()

    lhs_var = kernel.get_var_descriptor(assignee_var_name)
    lhs_dtype = lhs_var.dtype

    if insn.atomicity:
        raise NotImplementedError("atomic ops in ISPC")

    from loopy.expression import dtype_to_type_context
    from pymbolic.mapper.stringifier import PREC_NONE

    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
    rhs_code = ecm(insn.expression, prec=PREC_NONE,
            type_context=rhs_type_context,
            needed_dtype=lhs_dtype)

    lhs = insn.assignee

    # {{{ handle streaming stores

    if "!streaming_store" in insn.tags:
        ary = ecm.find_array(lhs)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

        # The substitution map lives on the codegen state passed in.
        access_info = get_access_info(kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, codegen_state.var_subst_map),
                codegen_state.vectorization_info)

        from loopy.kernel.data import GlobalArg, TemporaryVariable

        if not isinstance(ary, (GlobalArg, TemporaryVariable)):
            raise LoopyError("array type not supported in ISPC: %s"
                    % type(ary).__name__)

        if len(access_info.subscripts) != 1:
            raise LoopyError("streaming stores must have a subscript")
        subscript, = access_info.subscripts

        from pymbolic.primitives import Sum, flattened_sum, Variable
        if isinstance(subscript, Sum):
            terms = subscript.children
        else:
            # A subscript that is not a sum is its own single term.
            terms = (subscript,)

        new_terms = []

        from loopy.kernel.data import LocalIndexTag
        from loopy.symbolic import get_dependencies

        def is_l0_iname(name):
            # True if *name* is tagged with local axis 0.
            tag = kernel.iname_to_tag.get(name)
            return isinstance(tag, LocalIndexTag) and tag.axis == 0

        saw_l0 = False
        for term in terms:
            if isinstance(term, Variable) and is_l0_iname(term.name):
                if saw_l0:
                    raise LoopyError("streaming store must have stride 1 "
                            "in local index, got: %s" % subscript)
                saw_l0 = True
                continue

            # Any other term must not involve the local-axis-0 iname at all.
            for dep in get_dependencies(term):
                if is_l0_iname(dep):
                    raise LoopyError("streaming store must have stride 1 "
                            "in local index, got: %s" % subscript)

            new_terms.append(term)

        if not saw_l0:
            raise LoopyError("streaming store must have stride 1 in "
                    "local index, got: %s" % subscript)

        if access_info.vector_index is not None:
            raise LoopyError("streaming store may not use a short-vector "
                    "data type")

        rhs_has_programindex = any(
                is_l0_iname(dep)
                for dep in get_dependencies(insn.expression))

        if not rhs_has_programindex:
            rhs_code = "broadcast(%s, 0)" % rhs_code

        from cgen import Statement
        return Statement(
                "streaming_store(%s + %s, %s)"
                % (
                    access_info.array_name,
                    ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
                    rhs_code))

    # }}}

    from cgen import Assign
    return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
def map_subscript(self, expr, type_context):
    """Map the subscript *expr* to an expression for the generated code.

    A subscript whose aggregate is not a plain variable is mapped
    structurally.  Otherwise the array is looked up with ``find_array`` and
    ``get_access_info`` decides how it is accessed:

    * image arguments become a ``read_imagef`` call whose coordinate vector
      holds the indices in reverse order, padded with zeros,
    * array arguments, constant arguments and temporaries become a
      subscript of the array name; without a subscript they become
      ``name[0]`` (arguments and temporaries with base storage) or ``name``
      (other temporaries).  A vector index, if any, is applied with
      ``add_vector_access``.

    :raises LoopyError: for images with an unsupported number of target axes.
    :raises NotImplementedError: for images whose dtype is not ``float32``,
        ``float64`` or a vector dtype.
    """
    def make_var(name):
        # Carry the tag of a tagged aggregate over to the new variable.
        from loopy import TaggedVariable
        if not isinstance(expr.aggregate, TaggedVariable):
            return var(name)
        return TaggedVariable(name, expr.aggregate.tag)

    from pymbolic.primitives import Variable
    if not isinstance(expr.aggregate, Variable):
        mapped_aggregate = self.rec(expr.aggregate, type_context)
        return mapped_aggregate[self.rec(expr.index, 'i')]

    ary = self.find_array(expr)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate
    from loopy.symbolic import simplify_using_aff

    simplified_indices = tuple(
            simplify_using_aff(self.kernel, index)
            for index in expr.index_tuple)

    def evaluate_with_substitutions(e):
        return evaluate(e, self.codegen_state.var_subst_map)

    access_info = get_access_info(
            self.kernel.target, ary, simplified_indices,
            evaluate_with_substitutions,
            self.codegen_state.vectorization_info)

    from loopy.kernel.data import (
            ImageArg, ArrayArg, TemporaryVariable, ConstantArg)

    if isinstance(ary, ImageArg):
        num_target_axes = ary.num_target_axes()

        if num_target_axes == 3:
            idx_vec_type, vec_width = "float4", 4
        elif num_target_axes in [1, 2]:
            idx_vec_type, vec_width = "float2", 2
        else:
            raise LoopyError("unsupported number (%d) of target axes in image"
                    % num_target_axes)

        zero_padding = (0,) * (vec_width-num_target_axes)
        coordinates = expr.index_tuple[::-1] + zero_padding

        base_access = var("read_imagef")(
                var(ary.name),
                var("loopy_sampler"),
                var("(%s)" % idx_vec_type)(*self.rec(coordinates, 'i')))

        if ary.dtype.numpy_dtype == np.float32:
            return base_access.attr("x")
        if self.kernel.target.is_vector_dtype(ary.dtype):
            return base_access
        if ary.dtype.numpy_dtype == np.float64:
            return var("as_double")(base_access.attr("xy"))
        raise NotImplementedError(
                "non-floating-point images not supported for now")

    if isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)):
        if len(access_info.subscripts) == 0:
            # Unsubscripted args are pointers and need dereferencing;
            # unsubscripted temporaries are scalars unless they use
            # base_storage.
            is_pointer = (
                    isinstance(ary, (ConstantArg, ArrayArg))
                    or (isinstance(ary, TemporaryVariable)
                        and ary.base_storage))
            if is_pointer:
                result = make_var(access_info.array_name)[0]
            else:
                result = make_var(access_info.array_name)
        else:
            subscript, = access_info.subscripts
            result = make_var(access_info.array_name)[
                    simplify_using_aff(
                        self.kernel, self.rec(subscript, 'i'))]

        if access_info.vector_index is None:
            return result
        return self.codegen_state.ast_builder.add_vector_access(
                result, access_info.vector_index)

    assert False
def map_subscript(self, expr):
    """Count the array access *expr* and classify its access pattern.

    Accesses to anything that is not a global-memory kernel argument are not
    counted themselves; only the counts of their index expressions are
    returned.  Other accesses add one count under the key
    ``(dtype, kind)``, with *kind* being

    * ``'uniform'`` if no iname in the index carries a local index tag,
    * ``'nonconsecutive'`` if local inames occur but none on axis 0, if the
      axis-0 local iname has a coefficient other than 1 in some index, or if
      it appears on an axis with a fixed stride other than 1,
    * ``'consecutive'`` otherwise.

    The counts of the index expressions are added in every case.
    """
    name = expr.aggregate.name  # name of array

    if name not in self.knl.arg_dict:
        # this is a temporary variable
        return self.rec(expr.index)

    array = self.knl.arg_dict[name]
    if not isinstance(array, lp.GlobalArg):
        # this array is not in global memory
        return self.rec(expr.index)

    index = expr.index  # could be tuple or scalar index
    if not isinstance(index, tuple):
        index = (index,)

    from loopy.symbolic import get_dependencies
    from loopy.kernel.data import LocalIndexTag
    my_inames = get_dependencies(index) & self.knl.all_inames()

    def counted_as(kind):
        # One access of this kind plus whatever the index itself contributes.
        return ToCountMap(
                {(self.type_inf(expr), kind): 1}
                ) + self.rec(expr.index)

    # Look for local inames, in particular the one on axis 0.
    saw_local_iname = False
    lid0_iname = None
    for iname in my_inames:
        tag = self.knl.iname_to_tag.get(iname)
        if not isinstance(tag, LocalIndexTag):
            continue
        saw_local_iname = True
        if tag.axis == 0:
            lid0_iname = iname
            break  # there will be only one local id 0

    if not saw_local_iname:
        return counted_as('uniform')

    if lid0_iname is None:
        # only non-zero local id(s) found, assume non-consecutive access
        return counted_as('nonconsecutive')

    from loopy.symbolic import CoefficientCollector, simplify_using_aff
    from pymbolic.primitives import Variable
    from loopy.kernel.array import FixedStrideArrayDimTag

    # Check the coefficient of the axis-0 local iname on every axis.
    for axis_index, axis_tag in zip(index, array.dim_tags):
        coeffs = CoefficientCollector()(
                simplify_using_aff(self.knl, axis_index))

        try:
            lid0_coeff = coeffs[Variable(lid0_iname)]
        except KeyError:
            # this axis does not involve the axis-0 local iname
            continue

        if lid0_coeff != 1:
            return counted_as('nonconsecutive')

        # Coefficient is 1; the access is still strided if the axis is.
        if (isinstance(axis_tag, FixedStrideArrayDimTag)
                and axis_tag.stride != 1):
            return counted_as('nonconsecutive')

        # otherwise keep going: another index could contain the iname

    # stride 1 for every occurrence of the axis-0 local iname
    return counted_as('consecutive')
def map_subscript(self, expr, enclosing_prec, type_context):
    """Return the generated-code string for the subscript *expr*.

    If the aggregate is not a plain variable, ``aggregate[index]`` is
    emitted directly.  Otherwise the accessed array is found with
    ``find_array`` and the access is described by ``get_access_info``:

    * an image argument is read through ``read_imagef``; its indices are
      passed in reverse order,
    * a global argument or temporary is written as ``name[subscript]``.
      Without a subscript, a global argument is dereferenced (``*name``)
      and a temporary is used by name.  If a vector index is present, it is
      applied with ``add_vector_access``.

    :returns: a string of generated code.
    :raises NotImplementedError: if an image has a dtype other than
        ``float32``, ``float64`` or a vector dtype.
    """
    def base_impl(expr, enclosing_prec, type_context):
        return self.parenthesize_if_needed(
                "%s[%s]" % (
                    self.rec(expr.aggregate, PREC_CALL, type_context),
                    self.rec(expr.index, PREC_NONE, 'i')),
                enclosing_prec, PREC_CALL)

    from pymbolic.primitives import Variable
    if not isinstance(expr.aggregate, Variable):
        return base_impl(expr, enclosing_prec, type_context)

    ary = self.find_array(expr)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate

    from loopy.symbolic import simplify_using_aff
    index_tuple = tuple(
            simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

    access_info = get_access_info(self.kernel.target, ary, index_tuple,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

    from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

    if isinstance(ary, ImageArg):
        # index_tuple is always a tuple, so reversing it also works for a
        # subscript with a single index.
        reversed_indices = expr.index_tuple[::-1]
        base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
                % (ary.name, ary.dimensions,
                    ", ".join(self.rec(idx, PREC_NONE, 'i')
                        for idx in reversed_indices)))

        if ary.dtype.numpy_dtype == np.float32:
            return base_access+".x"
        if self.kernel.target.is_vector_dtype(ary.dtype):
            return base_access
        elif ary.dtype.numpy_dtype == np.float64:
            return "as_double(%s.xy)" % base_access
        else:
            raise NotImplementedError(
                    "non-floating-point images not supported for now")

    elif isinstance(ary, (GlobalArg, TemporaryVariable)):
        if len(access_info.subscripts) == 0:
            if isinstance(ary, GlobalArg):
                # unsubscripted global args are pointers
                result = "*" + access_info.array_name
            else:
                # unsubscripted temp vars are scalars
                result = access_info.array_name

        else:
            subscript, = access_info.subscripts
            result = self.parenthesize_if_needed(
                    "%s[%s]" % (
                        access_info.array_name,
                        self.rec(subscript, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

        if access_info.vector_index is not None:
            return self.codegen_state.ast_builder.add_vector_access(
                result, access_info.vector_index)
        else:
            return result

    else:
        assert False
def emit_assignment(self, codegen_state, insn):
    """Generate the code for the assignment instruction *insn*.

    By default a :class:`cgen.Assign` of the generated right-hand side to
    the generated left-hand side is returned.

    For instructions tagged ``"!streaming_store"`` a
    ``streaming_store(array + offset, rhs)`` statement is generated
    instead.  The linearized subscript of the assignee must then contain
    the iname tagged with local axis 0 exactly once as a bare term (stride
    1); this term is not part of ``offset``, all other terms are.  If the
    right-hand side does not depend on an iname tagged with local axis 0,
    it is wrapped in ``broadcast(..., 0)``.

    :raises NotImplementedError: for atomic instructions.
    :raises LoopyError: if a streaming store targets an unsupported array
        type, does not have exactly one subscript, does not have stride 1 in
        the local-axis-0 iname, or uses a short-vector data type.
    """
    kernel = codegen_state.kernel
    ecm = codegen_state.expression_to_code_mapper

    assignee_var_name, = insn.assignee_var_names()

    lhs_var = kernel.get_var_descriptor(assignee_var_name)
    lhs_dtype = lhs_var.dtype

    if insn.atomicity:
        raise NotImplementedError("atomic ops in ISPC")

    from loopy.expression import dtype_to_type_context
    from pymbolic.mapper.stringifier import PREC_NONE

    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
    rhs_code = ecm(insn.expression, prec=PREC_NONE,
            type_context=rhs_type_context,
            needed_dtype=lhs_dtype)

    lhs = insn.assignee

    # {{{ handle streaming stores

    if "!streaming_store" in insn.tags:
        ary = ecm.find_array(lhs)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

        access_info = get_access_info(kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, codegen_state.var_subst_map),
                codegen_state.vectorization_info)

        from loopy.kernel.data import ArrayArg, TemporaryVariable

        if not isinstance(ary, (ArrayArg, TemporaryVariable)):
            raise LoopyError("array type not supported in ISPC: %s"
                    % type(ary).__name__)

        if len(access_info.subscripts) != 1:
            raise LoopyError("streaming stores must have a subscript")
        subscript, = access_info.subscripts

        from pymbolic.primitives import Sum, flattened_sum, Variable
        if isinstance(subscript, Sum):
            terms = subscript.children
        else:
            # A subscript that is not a sum is its own single term.
            terms = (subscript,)

        new_terms = []

        from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type
        from loopy.symbolic import get_dependencies

        saw_l0 = False
        for term in terms:
            if (isinstance(term, Variable)
                    and kernel.iname_tags_of_type(term.name, LocalIndexTag)):
                tag, = kernel.iname_tags_of_type(
                        term.name, LocalIndexTag, min_num=1, max_num=1)
                if tag.axis == 0:
                    if saw_l0:
                        raise LoopyError(
                                "streaming store must have stride 1 in "
                                "local index, got: %s" % subscript)
                    saw_l0 = True
                    continue

            # Every term other than the bare local-axis-0 iname (including
            # variables tagged with another local axis) stays part of the
            # offset, provided it does not involve local axis 0.
            for dep in get_dependencies(term):
                if filter_iname_tags_by_type(
                        kernel.iname_to_tags.get(dep, []), LocalIndexTag):
                    tag, = filter_iname_tags_by_type(
                            kernel.iname_to_tags.get(dep, []),
                            LocalIndexTag, 1)
                    if tag.axis == 0:
                        raise LoopyError(
                                "streaming store must have stride 1 in "
                                "local index, got: %s" % subscript)

            new_terms.append(term)

        if not saw_l0:
            raise LoopyError("streaming store must have stride 1 in "
                    "local index, got: %s" % subscript)

        if access_info.vector_index is not None:
            raise LoopyError("streaming store may not use a short-vector "
                    "data type")

        # The clause binding *dep* must come first: the iterable of the
        # leftmost ``for`` is evaluated before any loop variable exists.
        rhs_has_programindex = any(
                isinstance(tag, LocalIndexTag) and tag.axis == 0
                for dep in get_dependencies(insn.expression)
                for tag in kernel.iname_tags(dep))

        if not rhs_has_programindex:
            rhs_code = "broadcast(%s, 0)" % rhs_code

        from cgen import Statement
        return Statement(
                "streaming_store(%s + %s, %s)"
                % (
                    access_info.array_name,
                    ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
                    rhs_code))

    # }}}

    from cgen import Assign
    return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)