def kernel(self, target: TargetBase, typed: bool = True) -> LoopKernel:
    """Assemble and return the loop kernel for this object.

    :param target: code-generation target handed through to ``make_kernel``.
    :param typed: if *True*, additionally run dtype inference using the
        dtypes reported by ``self.kernel_dtypes()``.
    """
    instructions = "\n".join(self.kernel_isns())
    knl = make_kernel(self.kernel_domains(), instructions, self.kernel_data(),
                      target=target)
    # Reductions generated by the subclass hooks may reuse iname names;
    # make them unique before anything else touches the kernel.
    knl = make_reduction_inames_unique(knl)
    knl.name = self.__class__.__name__
    if typed:
        knl = add_and_infer_dtypes(knl, self.kernel_dtypes())
    return knl
def test_double_sum_made_unique(ctx_factory):
    """Check that make_reduction_inames_unique leaves a kernel with two
    equivalent double reductions computing the same value."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 20
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }",
        [
            "a = sum((i,j), i*j)",
            "b = sum(i, sum(j, i*j))",
        ],
        assumptions="n>=1")

    knl = lp.make_reduction_inames_unique(knl)
    print(knl)

    evt, (a, b) = knl(queue, n=n)

    expected = sum(row * col for row in range(n) for col in range(n))
    assert a.get() == expected
    assert b.get() == expected
def test_double_sum_made_unique(ctx_factory):
    """Verify both the joint and the nested double-sum formulations still
    produce sum_{i,j} i*j after reduction inames are made unique."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 20
    instructions = [
        "a = sum((i,j), i*j)",
        "b = sum(i, sum(j, i*j))",
    ]
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }",
        instructions,
        assumptions="n>=1")
    knl = lp.make_reduction_inames_unique(knl)
    print(knl)

    evt, (a, b) = knl(queue, n=n)

    # Reference value computed with plain Python loops.
    ref = 0
    for i in range(n):
        for j in range(n):
            ref += i*j

    assert a.get() == ref
    assert b.get() == ref
def fixup_utoprim(params, pflag, G, P):
    """Repair primitive variables in cells where U_to_P inversion failed.

    Cells flagged in *pflag* are replaced by a distance-weighted average of
    their (successfully inverted) neighbors.  Two cached loopy kernels do the
    work: one computes the weighted neighbor sums, one applies the fix.

    :param params: run-configuration dict; reads ``params['queue']`` (an
        OpenCL command queue) and ``params['debug']``.
    :param pflag: per-cell inversion-failure flags (device array; nonzero
        means the cell is bad — presumably set by U_to_P, TODO confirm).
    :param G: grid object; provides ``shapes``, ``slices`` and ``NG``.
    :param P: primitive-variable device array; fixed in place.
    :returns: *P* (the same array, after in-place fixup).
    """
    sh = G.shapes
    s = G.slices
    if params['debug']:
        # Count of flagged cells in the physical (non-ghost) region.
        nbad_utop = np.sum(pflag.get()[s.bulk] != 0)
        print("Fixing {} bad cells".format(nbad_utop))

    # Make sure we are not using ill defined physical corner regions
    # TODO can this be forgotten? U_to_P only updates the bulk, and bounds should not touch physical corners
    #zero_corners(params, G, pflag)

    # Scratch arrays for the weighted neighbor sums.
    # NOTE(review): the local name `sum` shadows the Python builtin for the
    # rest of this function; it matches the kernel argument name below.
    sum = cl_array.zeros(params['queue'], sh.grid_primitives, dtype=np.float64)
    wsum = cl_array.zeros(params['queue'], sh.grid_scalar, dtype=np.float64)

    # Kernels are compiled once and cached at module level.
    global knl_fixup_utoprim_sums, knl_fixup_utoprim_fix
    if knl_fixup_utoprim_sums is None:
        # TODO these should really be combined and the check on wsum inlined
        # That's gonna be a project
        code_sums = add_ghosts("""
        # TODO if statements here to speed up evaluation?
        w(l, m, n) := not(pflag[i+l,j+m,k+n]) / (abs(l) + abs(m) + abs(n) + 1)
        wsum[i, j, k] = reduce(sum, (l,m,n), w(l,m,n))
        sum[p, i, j, k] = reduce(sum, (l,m,n), w(l,m,n) * P[p, i+l, j+m, k+n])
        """)
        code_fixup = add_ghosts("""
        P[p, i, j, k] = if(pflag[i, j, k] == 0, P[p, i, j, k], sum[p, i, j, k] / wsum[i, j, k])
        """)
        knl_fixup_utoprim_sums = lp.make_kernel(sh.isl_grid_primitives_fixup, code_sums,
                                                [*primsArrayArgs("P", "sum"),
                                                 *scalarArrayArgs("wsum"),
                                                 *scalarArrayArgs("pflag", dtype=np.int32)],
                                                assumptions=sh.assume_grid)
        knl_fixup_utoprim_sums = spec_prims_kernel(knl_fixup_utoprim_sums, sh.bulk_primitives, ng=G.NG)
        # Roll our own optimization here as this is the only convolution kernel we got
        knl_fixup_utoprim_sums = lp.split_iname(knl_fixup_utoprim_sums, "k", 8, outer_tag="g.0", inner_tag="l.0")
        knl_fixup_utoprim_sums = lp.split_iname(knl_fixup_utoprim_sums, "j", 8, outer_tag="g.1", inner_tag="l.1")
        knl_fixup_utoprim_sums = lp.split_iname(knl_fixup_utoprim_sums, "i", 8, outer_tag="g.2", inner_tag="l.2")
        knl_fixup_utoprim_sums = lp.make_reduction_inames_unique(knl_fixup_utoprim_sums)
        # TODO these are some feisty prefetches. Leaving them for later
        # knl_fixup_utoprim_sums = lp.tag_inames(knl_fixup_utoprim_sums, "p:unr")
        # knl_fixup_utoprim_sums = lp.add_prefetch(knl_fixup_utoprim_sums, "pflag", "i_inner,j_inner,k_inner",
        #                                          default_tag="l.auto")
        # knl_fixup_utoprim_sums = lp.add_prefetch(knl_fixup_utoprim_sums, "P", "i_inner,j_inner,k_inner,l,m,n",
        #                                          default_tag="l.auto")

        # TODO The prefetches on this are not working either, look at that
        knl_fixup_utoprim_fix = lp.make_kernel(sh.isl_grid_primitives, code_fixup,
                                               [*primsArrayArgs("P", "sum"),
                                                *scalarArrayArgs("wsum"),
                                                *scalarArrayArgs("pflag", dtype=np.int32)],
                                               assumptions=sh.assume_grid)
        knl_fixup_utoprim_fix = tune_prims_kernel(knl_fixup_utoprim_fix, shape=sh.bulk_primitives, ng=G.NG)
        print("Compiled fixup_utoprim")

    # Pass 1: accumulate weighted neighbor sums into `sum`/`wsum`.
    evt, _ = knl_fixup_utoprim_sums(params['queue'], P=P, pflag=pflag, sum=sum, wsum=wsum)
    evt.wait()
    if params['debug']:
        # A (near-)zero weight sum means a bad cell had no good neighbors.
        if np.any(wsum.get()[s.bulk] < 1.e-10):
            # TODO don't die on this when we hit prod
            raise ValueError("fixup_utoprim found no usable neighbors!")

    # Pass 2: overwrite flagged cells with the weighted averages.
    evt, _ = knl_fixup_utoprim_fix(params['queue'], P=P, pflag=pflag, sum=sum, wsum=wsum)

    if params['debug']:
        # TODO count what we fixed
        # NOTE(review): this is always 0 as written (placeholder for the TODO
        # above), so the warning below can never fire.
        nleft_utop = nbad_utop - nbad_utop
        if nleft_utop > 0:
            print("Cells STILL BAD after fixup_utoprim: {}".format(nleft_utop))

    # Reset the pflag, because we tried our best and that's what counts
    # TODO necessary? See above about new copy
    #pflag.fill(0)

    return P
def generate_loopy(
        result: Union[Array, DictOfNamedArrays, Dict[str, Array]],
        target: Optional[LoopyTarget] = None,
        options: Optional[lp.Options] = None,
        *,
        cl_device: Optional["pyopencl.Device"] = None,
        array_tag_t_to_not_propagate: FrozenSet[Type[Tag]] = frozenset(
            [ImplStored, Named, PrefixNamed]),
        axis_tag_t_to_not_propagate: FrozenSet[Type[Tag]] = frozenset(),
        ) -> BoundProgram:
    r"""Code generation entry point.

    :param result: Outputs of the computation.
    :param target: Code generation target.
    :param options: Code generation options for the kernel.
    :returns: A :class:`pytato.target.BoundProgram` wrapping the generated
        :mod:`loopy` program.

    If *result* is a :class:`dict` or a :class:`pytato.DictOfNamedArrays` and
    *options* is not supplied, then the Loopy option
    :attr:`~loopy.Options.return_dict` will be set to *True*. If it is
    supplied, :attr:`~loopy.Options.return_dict` must already be set to
    *True*.

    .. note::

        :mod:`pytato` metadata :math:`\mapsto` :mod:`loopy` metadata semantics:

        - Inames that index over an :class:`~pytato.array.Array`'s axis in the
          allocation instruction are tagged with the corresponding
          :class:`~pytato.array.Axis`'s tags. The caller may choose to not
          propagate axis tags of type *axis_tag_t_to_not_propagate*.
        - :attr:`pytato.Array.tags` of inputs/outputs in *outputs* would be
          copied over to the tags of the corresponding
          :class:`loopy.ArrayArg`. The caller may choose to not propagate
          array tags of type *array_tag_t_to_not_propagate*.
        - Arrays tagged with :class:`pytato.tags.ImplStored` would have their
          tags copied over to the tags of corresponding
          :class:`loopy.TemporaryVariable`. The caller may choose to not
          propagate array tags of type *array_tag_t_to_not_propagate*.
    """
    result_is_dict = isinstance(result, (dict, DictOfNamedArrays))
    orig_outputs: DictOfNamedArrays = normalize_outputs(result)
    del result

    if target is None:
        target = LoopyPyOpenCLTarget(device=cl_device)
    else:
        if cl_device is not None:
            raise TypeError("may not pass both 'target' and 'cl_device'")

    # Lower the expression DAG to a per-name computation order.
    preproc_result = preprocess(orig_outputs, target)
    outputs = preproc_result.outputs
    compute_order = preproc_result.compute_order

    if options is None:
        options = lp.Options(return_dict=result_is_dict)
    elif isinstance(options, dict):
        from warnings import warn
        warn(
            "Passing a dict for options is deprecated and will stop working "
            "in 2022. Pass an actual loopy.Options object instead.",
            DeprecationWarning, stacklevel=2)
        options = lp.Options(**options)

    if options.return_dict != result_is_dict:
        # FIX: the message previously referenced a nonexistent attribute
        # "options.result_is_dict"; the checked attribute is `return_dict`.
        raise ValueError("options.return_dict is expected to match "
                "whether the returned value is a dictionary")

    state = get_initial_codegen_state(target, options)

    from pytato.transform import InputGatherer
    ing = InputGatherer()

    # Reserve the names of all named inputs and outputs so generated
    # temporaries cannot collide with them.
    state.var_name_gen.add_names({
        input_expr.name
        for name in compute_order
        for input_expr in ing(outputs[name].expr)
        if isinstance(input_expr, (Placeholder, SizeParam, DataWrapper))
        if input_expr.name is not None
    })

    state.var_name_gen.add_names(outputs)

    cg_mapper = CodeGenMapper(array_tag_t_to_not_propagate,
                              axis_tag_t_to_not_propagate)

    # Generate code for outputs.
    for name in compute_order:
        expr = outputs[name].expr
        insn_id = add_store(name, expr, cg_mapper(expr, state),
                state, cg_mapper)
        # replace "expr" with the created stored variable
        state.results[expr] = StoredResult(name, expr.ndim,
                                           frozenset([insn_id]))

    # Why call make_reduction_inames_unique?
    # Consider pt.generate_loopy(pt.sum(x) + pt.sum(x)), the generated program
    # would be a single instruction with rhs: `_pt_subst() + _pt_subst()`.
    # The result of pt.sum(x) is cached => same instance of InlinedResult is
    # emitted for both invocations and we would be required to avoid such
    # reduction iname collisions.
    program = lp.make_reduction_inames_unique(state.program)

    return target.bind_program(
            program=program,
            bound_arguments=preproc_result.bound_arguments)