def __init__(self, energy, Stepper, mpl=1., dtype=np.float64):
    """
    Initialise the scale-factor state and build the stepper that evolves it.

    :arg energy: Passed, together with the initial ``a``, to
        ``self.adot_friedmann_1`` to obtain the initial ``adot``.

    :arg Stepper: The time-stepper class to instantiate. Its direct bases
        are inspected for ``LowStorageRKStepper`` and its class dictionary
        for ``num_copies``.

    :arg mpl: Stored as ``self.mpl``. Defaults to ``1.``.

    :arg dtype: The datatype of the arrays and of the stepper.
        Defaults to ``np.float64``.
    """
    from pystella.step import LowStorageRKStepper
    from pystella import Field
    from pymbolic import var
    from pystella import DisableLogging
    from loopy.target.c.c_execution import logger as c_logger

    self.mpl = mpl

    # only *direct* subclasses of LowStorageRKStepper count as low-storage
    self.is_low_storage = LowStorageRKStepper in Stepper.__bases__

    # num_copies is looked up on the class itself (not inherited)
    copies = Stepper.__dict__.get("num_copies", 1)
    self.a = np.ones((copies,), dtype=dtype)
    self.adot = self.adot_friedmann_1(self.a, energy)
    self.hubble = self.adot / self.a

    # low-storage steppers see length-one arrays indexed at 0,
    # all others see shapeless symbols
    if self.is_low_storage:
        arg_shape, slc = (1,), (0,)
    else:
        arg_shape, slc = (), ()

    sym_a = Field("a", indices=[], shape=arg_shape)[slc]
    sym_adot = Field("adot", indices=[], shape=arg_shape)[slc]
    sym_energy = var("energy")
    sym_pressure = var("pressure")

    rhs_dict = {
        sym_a: sym_adot,
        sym_adot: self.addot_friedmann_2(sym_a, sym_energy, sym_pressure),
    }

    with DisableLogging(c_logger):  # silence GCCToolchain warning
        self.stepper = Stepper(
            rhs_dict,
            rank_shape=(0, 0, 0),
            halo_shape=0,
            dtype=dtype,
            target=lp.ExecutableCTarget(),
        )
def make_shift_kernel(self, **kwargs):
    """
    Build the kernel stored as ``self.shifter``, which writes
    ``scale * f + shift`` into ``tmp``.

    The instruction dictionary is also kept as ``self.shift_dict``.
    Any keyword arguments are forwarded to ``ElementWiseMap``.
    """
    # both fields are declared with a zero offset
    f = Field("f", offset=0)
    tmp = Field("tmp", offset=0)
    from pymbolic import var
    # symbolic scalars appearing in the kernel expression
    shift = var("shift")
    scale = var("scale")
    self.shift_dict = {tmp: scale * f + shift}
    # NOTE(review): as written this is a list holding a literal Ellipsis;
    # the real argument list appears to have been elided -- confirm
    args = [...]
    from pystella import ElementWiseMap
    self.shifter = ElementWiseMap(self.shift_dict, args=args, **kwargs)
def rhs_dict(self):
    """
    Assemble the dictionary mapping each component of ``self.f`` and of
    ``self.f.dot`` to the symbolic expression for its time derivative.

    :returns: A :class:`dict` with two entries per scalar field:
        ``f[n] -> f.dot[n]`` and
        ``f.dot[n] -> lap(f[n]) - 2 H f.dot[n] - a**2 dV/df[n]``.
    """
    field = self.f
    hubble = Field("hubble", indices=[])
    scale_factor = Field("a", indices=[])
    potential = self.potential(field)

    equations = {}
    for n in range(self.nscalars):
        velocity = field.dot[n]
        friction = 2 * hubble * velocity
        force = scale_factor**2 * diff(potential, field[n])

        equations[field[n]] = velocity
        equations[velocity] = field.lap[n] - friction - force

    return equations
def __init__(self, decomp, num_bins, dtype, **kwargs):
    """
    Set up a linearly binned and a logarithmically binned histogram of a
    field ``f``, plus the reduction (``self.get_min_max``) computing the
    extrema that define the bin ranges.

    :arg decomp: Forwarded to the parent constructor and to ``Reduction``.

    :arg num_bins: The number of bins; bin expressions are clipped to
        ``[0, num_bins - 1]``.

    :arg dtype: Forwarded to the parent constructor.

    The keyword argument ``halo_shape`` (default ``0``) sets the offset of
    ``f`` and is passed on to ``Reduction``; remaining keyword arguments go
    to both the parent constructor and ``Reduction``.
    """
    from pymbolic import parse
    import pymbolic.functions as pf

    halo_shape = kwargs.pop("halo_shape", 0)
    f = Field("f", offset=halo_shape)
    log_abs_f = pf.log(pf.fabs(f))

    max_f, min_f = parse("max_f, min_f")
    max_log_f, min_log_f = parse("max_log_f, min_log_f")

    def clip(expr):
        # clamp from above by the last bin, then from below by zero
        lower, upper = parse("min, max")
        capped = lower(expr, num_bins - 1)
        return upper(capped, 0)

    linear_bin = (f - min_f) / (max_f - min_f)
    log_bin = (log_abs_f - min_log_f) / (max_log_f - min_log_f)

    histograms = {
        "linear": (clip(linear_bin * num_bins), 1),
        "log": (clip(log_bin * num_bins), 1),
    }
    super().__init__(decomp, histograms, num_bins, dtype, **kwargs)

    reducers = {
        "max_f": [(f, "max")],
        "min_f": [(f, "min")],
        "max_log_f": [(log_abs_f, "max")],
        "min_log_f": [(log_abs_f, "min")],
    }
    self.get_min_max = Reduction(decomp, reducers, halo_shape=halo_shape,
                                 **kwargs)
def test_reduction(ctx_factory, grid_shape, proc_shape, dtype, op,
                   _grid_shape, pass_grid_dims, timing=False):
    """
    Compare ``ps.Reduction`` (using a temporary instruction ``x = f/2 + .31``)
    against ``reducer.reduce_array`` applied to the same expression.

    :arg ctx_factory: If truthy, called to create the OpenCL context;
        otherwise ``ps.choose_device_and_make_context`` is used.

    :arg op: The reduction operation; for ``"avg"`` the reference value is
        divided by the total number of gridpoints.

    :arg _grid_shape: If truthy, overrides ``grid_shape``.

    :arg pass_grid_dims: Whether to pass ``rank_shape`` and ``grid_size``
        to the ``ps.Reduction`` constructor.

    :arg timing: Whether to also time the reduction and print the result
        on rank 0.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    from pymbolic import var
    from pystella import Field

    tmp_insns = [(var("x"), Field("f") / 2 + .31)]

    reducers = {}
    reducers["avg"] = [(var("x"), op)]

    if pass_grid_dims:
        # np.prod, not np.product: the latter alias was removed in NumPy 2.0
        reducer = ps.Reduction(mpi, reducers, rank_shape=rank_shape,
                               tmp_instructions=tmp_insns,
                               grid_size=np.prod(grid_shape))
    else:
        reducer = ps.Reduction(mpi, reducers, tmp_instructions=tmp_insns)

    f = clr.rand(queue, rank_shape, dtype=dtype)

    import pyopencl.tools as clt
    pool = clt.MemoryPool(clt.ImmediateAllocator(queue))

    result = reducer(queue, f=f, allocator=pool)
    avg = result["avg"]

    avg_test = reducer.reduce_array(f / 2 + .31, op)
    if op == "avg":
        avg_test /= np.prod(grid_shape)

    rtol = 5e-14 if dtype == np.float64 else 1e-5
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction innaccurate for {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        t = timer(lambda: reducer(queue, f=f, allocator=pool), ntime=1000)
        if mpi.rank == 0:
            print(
                f"reduction took {t:.3f} ms for {grid_shape=}, {proc_shape=}")
            bandwidth = f.nbytes / 1024**3 / t * 1000
            print(f"Bandwidth = {bandwidth:.1f} GB/s")
def make_residual_kernel(self, MapKernel, **kwargs):
    """
    Build the kernel stored as ``self.residual``, which writes ``rho - lhs``
    into a field ``r_<name>`` for every entry of ``self.lhs_dict``.

    :arg MapKernel: The kernel class to instantiate; it receives the
        instruction dictionary, ``args`` and any extra keyword arguments.
    """
    residual_dict = {
        Field("r_" + unknown.child.name, offset="h"): rho - lhs
        for unknown, (lhs, rho) in self.lhs_dict.items()
    }

    kernel_args = self.unknown_args + self.rho_args + self.residual_args
    self.residual = MapKernel(residual_dict, args=kernel_args, **kwargs)
def make_stepper(self, MapKernel, **kwargs):
    """
    Build the kernel stored as ``self.stepper``.

    For every unknown in ``self.lhs_dict`` a field ``tmp_<name>`` (with the
    unknown's own offset) is assigned the result of ``self.step_operator``.
    The instruction dictionary is kept as ``self.step_dict``.

    :arg MapKernel: The kernel class to instantiate; it receives the
        instruction dictionary, ``args`` and any extra keyword arguments.
    """
    self.step_dict = {}
    for unknown, (lhs, rho) in self.lhs_dict.items():
        scratch = Field("tmp_" + unknown.child.name, offset=unknown.offset)
        self.step_dict[scratch] = self.step_operator(unknown, lhs, rho)

    kernel_args = self.unknown_args + self.rho_args + self.temp_args
    self.stepper = MapKernel(self.step_dict, args=kernel_args, **kwargs)
def test_get_field_args(proc_shape):
    """
    Check that ``get_field_args`` returns the expected kernel arguments for
    a dict, a single expression and lists of expressions.

    Skipped unless ``proc_shape == (1, 1, 1)``.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test field only on one rank")

    from pystella import Field, DynamicField, get_field_args
    import loopy as lp

    x = Field("x", offset=(1, 2, 3))
    y = Field("y", offset="h")
    z = DynamicField("z", shape=(2, "a"))

    true_args = [
        lp.GlobalArg("x", shape="(Nx+2, Ny+4, Nz+6)", offset=lp.auto),
        lp.GlobalArg("y", shape="(Nx+2*h, Ny+2*h, Nz+2*h)", offset=lp.auto),
        lp.GlobalArg("z", shape="(2, a, Nx, Ny, Nz)", offset=lp.auto),
        lp.GlobalArg("dzdx", shape="(2, a, 3, Nx, Ny, Nz)", offset=lp.auto),
    ]

    def lists_equal(first, second):
        # order-insensitive comparison: each list must contain the other
        return (all(item in second for item in first)
                and all(item in first for item in second))

    cases = [
        {x: y, y: x * z + z.pd[0]},
        x * y + z + z.pd[2],
        [x, y, y * z**2, 3 + z.pd[0] + z.pd[1]],
        [shift_fields(x, (1, 2, 3)), y + z.pd[0], y * z**2],
    ]
    for expressions in cases:
        args = get_field_args(expressions)
        assert lists_equal(args, true_args)
def test_collect_field_indices(proc_shape):
    """
    Check the set of index names returned by ``collect_field_indices`` for
    a dict and for lists of expressions.

    Skipped unless ``proc_shape == (1, 1, 1)``.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test field only on one rank")

    from pystella import Field, DynamicField
    from pystella.field import collect_field_indices

    x = Field("x", offset=(1, 2, 3))
    y = Field("y", indices=("i", "x"), offset="h")
    z = DynamicField("z", shape=(2, "a"))

    # (expressions, expected index names)
    cases = [
        ({x: y, y: x * z + z.pd[0]}, {"i", "j", "k", "x"}),
        ([x, z], {"i", "j", "k"}),
        ([shift_fields(x, (1, 2, 3)), y + z.pd[0], y * z**2],
         {"i", "j", "k", "x"}),
    ]
    for expressions, expected in cases:
        indices = collect_field_indices(expressions)
        assert indices == expected
def test_reduction_with_new_shape(ctx_factory, grid_shape, proc_shape, dtype,
                                  op, _grid_shape, timing=False):
    """
    Check that one ``ps.Reduction`` instance gives accurate results when
    called first with an array of the original rank shape and then with one
    whose grid is halved along every axis.

    :arg ctx_factory: If truthy, called to create the OpenCL context;
        otherwise ``ps.choose_device_and_make_context`` is used.

    :arg op: The reduction operation; for ``"avg"`` the reference value is
        divided by the total number of gridpoints.

    :arg _grid_shape: If truthy, overrides ``grid_shape``.

    :arg timing: Unused by this test.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    from pystella import Field

    reducers = {}
    reducers["avg"] = [(Field("f"), op)]

    reducer = ps.Reduction(mpi, reducers)

    f = clr.rand(queue, rank_shape, dtype=dtype)
    result = reducer(queue, f=f)
    avg = result["avg"]

    avg_test = reducer.reduce_array(f, op)
    if op == "avg":
        # np.prod, not np.product: the latter alias was removed in NumPy 2.0
        avg_test /= np.prod(grid_shape)

    rtol = 5e-14 if dtype == np.float64 else 1e-5
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction innaccurate for {grid_shape=}, {proc_shape=}"

    # test call to reducer with new shape
    grid_shape = tuple(Ni // 2 for Ni in grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    f = clr.rand(queue, rank_shape, dtype=dtype)
    result = reducer(queue, f=f)
    avg = result["avg"]

    avg_test = reducer.reduce_array(f, op)
    if op == "avg":
        avg_test /= np.prod(grid_shape)

    rtol = 5e-14 if dtype == np.float64 else 1e-5
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction w/new shape innaccurate for {grid_shape=}, {proc_shape=}"
def make_lhs_kernel(self, MapKernel, **kwargs):
    """
    Build the kernel stored as ``self.lhs_correction``.

    For the ``n``-th entry of ``self.lhs_dict`` the left-hand side is first
    evaluated into the temporary ``tmp_lhs[n]``; ``rho`` is then assigned
    ``r_<name> + tmp_lhs[n]``.

    :arg MapKernel: The kernel class to instantiate; it receives the
        instruction dictionary, ``tmp_instructions``, ``args`` and any
        extra keyword arguments.
    """
    from pymbolic import var

    tmp_lhs = var("tmp_lhs")
    temporaries = {}
    corrections = {}

    for n, (unknown, (lhs, rho)) in enumerate(self.lhs_dict.items()):
        slot = tmp_lhs[n]
        temporaries[slot] = lhs
        residual = Field("r_" + unknown.child.name, offset="h")
        corrections[rho] = residual + slot

    kernel_args = self.unknown_args + self.rho_args + self.residual_args
    self.lhs_correction = MapKernel(corrections,
                                    tmp_instructions=temporaries,
                                    args=kernel_args, **kwargs)
def make_resid_stats(self, decomp, queue, dtype, **kwargs):
    """
    Build two reductions over the residual fields ``r_<name>`` of every
    argument in ``self.unknown_args``:

    * ``self.resid_stats``: the maximum of ``fabs(r)`` and the average of
      ``r**2``;
    * ``self.avg_resid``: the average of ``r``.

    :arg decomp: Forwarded to ``Reduction``.

    :arg queue: Not used by this method.

    :arg dtype: Not used by this method.

    Any remaining keyword arguments are forwarded to both reductions.
    """
    from pymbolic import var
    from pystella import Reduction

    # fabs is referenced as a plain symbol rather than imported from
    # pymbolic.functions
    fabs = var("fabs")

    reducers, avg_reducers = {}, {}
    for arg in self.unknown_args:
        name = arg.name
        resid = Field("r_" + name, offset="h")
        reducers[name] = [(fabs(resid), "max"), (resid**2, "avg")]
        avg_reducers[name] = [(resid, "avg")]

    reduction_args = self.residual_args
    self.resid_stats = Reduction(decomp, reducers, args=reduction_args,
                                 **kwargs)
    self.avg_resid = Reduction(decomp, avg_reducers, args=reduction_args,
                               **kwargs)
def rhs_dict(self):
    """
    Assemble the dictionary mapping each component of ``self.hij`` and of
    ``self.hij.dot`` to the symbolic expression for its time derivative.

    Components are enumerated over ``1 <= i <= j <= 3`` and flattened with
    ``tensor_index``; the source term sums
    ``stress_tensor(i, j, drop_trace=True)`` over ``self.sectors``.

    :returns: A :class:`dict` with two entries per tensor component.
    """
    tensor = self.hij
    hubble = Field("hubble", indices=[])

    upper_triangle = [(i, j) for i in range(1, 4) for j in range(i, 4)]

    equations = {}
    for i, j in upper_triangle:
        comp = tensor_index(i, j)
        Sij = sum(
            sector.stress_tensor(i, j, drop_trace=True)
            for sector in self.sectors)

        velocity = tensor.dot[comp]
        friction = 2 * hubble * velocity
        source = 16 * np.pi * Sij

        equations[tensor[comp]] = velocity
        equations[velocity] = tensor.lap[comp] - friction + source

    return equations
def stress_tensor(self, mu, nu, drop_trace=False):
    """
    Return the symbolic ``(mu, nu)`` component of the stress tensor built
    from the derivatives ``self.f.d(fld, index)`` of all scalar fields.

    :arg mu: The first tensor index.

    :arg nu: The second tensor index.

    :arg drop_trace: If *True*, return only the sum over fields of
        ``f.d(fld, mu) * f.d(fld, nu)``; otherwise also add the covariant
        metric component times the Lagrangian. Defaults to *False*.
    """
    f = self.f
    a = Field("a", indices=[])

    Tmunu = sum(
        f.d(fld, mu) * f.d(fld, nu) for fld in range(self.nscalars))

    if drop_trace:
        return Tmunu

    # contravariant metric, used to contract the field derivatives
    inv_a_sq = 1 / a**2
    upper_metric = np.diag((-1 / a**2, inv_a_sq, inv_a_sq, inv_a_sq))

    contraction = sum(
        sum(upper_metric[al, be] * f.d(fld, al) * f.d(fld, be)
            for al in range(4) for be in range(4))
        for fld in range(self.nscalars))
    lag = -contraction / 2 - self.potential(self.f)

    # covariant metric, multiplying the Lagrangian
    lower_metric = np.diag((-a**2, a**2, a**2, a**2))
    return Tmunu + lower_metric[mu, nu] * lag
def __init__(self, decomp, halo_shape, **kwargs):
    """
    Set up reductions over a field ``f``: ``"mean"`` (of ``f``) and
    ``"variance"`` (of ``f**2``) and, if requested, its extrema.

    :arg decomp: Forwarded to the parent constructor.

    :arg halo_shape: Forwarded to the parent constructor.

    The keyword argument ``max_min`` (default *False*, stored as
    ``self.min_max``) additionally enables the ``"max"``, ``"min"``,
    ``"abs_max"`` and ``"abs_min"`` reductions. Remaining keyword arguments
    are forwarded to the parent constructor.
    """
    from pystella import Field

    self.min_max = kwargs.pop("max_min", False)

    f = Field("f", offset="h")
    reducers = {
        "mean": [f],
        "variance": [f**2],
    }

    if self.min_max:
        # fabs is referenced as a plain symbol rather than imported from
        # pymbolic.functions
        from pymbolic import var
        fabs = var("fabs")

        reducers["max"] = [(f, "max")]
        reducers["min"] = [(f, "min")]
        reducers["abs_max"] = [(fabs(f), "max")]
        reducers["abs_min"] = [(fabs(f), "min")]

    self.reducers = reducers
    super().__init__(decomp, reducers, halo_shape=halo_shape, **kwargs)
def InterpolationBase(even_coefs, odd_coefs, StencilKernel, halo_shape,
                      **kwargs):
    """
    A base function for generating an interpolation kernel.

    :arg even_coefs: The coefficients representing the interpolation formula
        for gridpoints on the coarse and fine grid which coincide in space.
        Follows the convention of :func:`pystella.derivs.centered_diff`
        (since the interpolation is applied recursively in each dimension).

    :arg odd_coefs: Same as ``even_coefs``, but for points on the fine grid
        which lie between points on the coarse grid.

    :arg StencilKernel: The stencil mapper to create an instance of.

    :arg halo_shape: The number of halo layers on (both sides of) each axis
        of the computational grid.
        Currently must be an :class:`int`.

    :arg correct: A :class:`bool` determining whether to produce a kernel
        which corrects an output array by the interpolated array, or to only
        perform strict interpolation.
        Defaults to *False*.

    :returns: An instance of ``StencilKernel`` which executes the requested
        interpolation.
    """
    import itertools
    from pymbolic import parse, var
    from pymbolic.primitives import Remainder

    i, j, k = parse("i, j, k")
    f1 = Field("f1", offset="h")
    tmp = var("tmp")

    def coefs_for(bit):
        # odd fine-grid points fall between coarse points, even ones coincide
        return odd_coefs if bit else even_coefs

    # one temporary per parity combination of the fine-grid index (i, j, k)
    tmp_insns = {}
    for parity in itertools.product((0, 1), repeat=3):
        px, py, pz = parity
        total = 0
        for dx, wx in coefs_for(px).items():
            for dy, wy in coefs_for(py).items():
                for dz, wz in coefs_for(pz).items():
                    coarse = Field("f2", offset="h",
                                   indices=((i + dx) // 2,
                                            (j + dy) // 2,
                                            (k + dz) // 2))
                    total += wx * wy * wz * coarse
        tmp_insns[tmp[parity]] = total

    # select the temporary matching the parity of each index
    rx, ry, rz = (Remainder(ind, 2) for ind in (i, j, k))
    interpolated = tmp[rx, ry, rz]

    correct = kwargs.pop("correct", False)
    interp_dict = {f1: (f1 + interpolated) if correct else interpolated}

    args = [
        lp.GlobalArg("f1", shape="(Nx+2*h, Ny+2*h, Nz+2*h)"),
        lp.GlobalArg("f2", shape="(Nx//2+2*h, Ny//2+2*h, Nz//2+2*h)"),
    ]

    return StencilKernel(interp_dict, tmp_instructions=tmp_insns, args=args,
                         prefetch_args=["f2"], halo_shape=halo_shape,
                         **kwargs)
def test_relax(ctx_factory, grid_shape, proc_shape, h, dtype, Solver,
               timing=False):
    """
    Run a relaxation ``Solver`` on two test problems (``lap(f) = rho`` and
    ``lap(f2) - f2 = rho2``) and check that the error decreases at least
    linearly and that the residual's power spectrum is progressively
    smoothed.

    Skipped when the smallest grid dimension is below 128.

    :arg ctx_factory: If truthy, called to create the OpenCL context;
        otherwise ``ps.choose_device_and_make_context`` is used.

    :arg h: The halo size, also selecting the Laplacian stencil
        coefficients.

    :arg Solver: The relaxation solver class under test.

    :arg timing: Unused by this test.
    """
    if min(grid_shape) < 128:
        pytest.skip("test_relax needs larger grids, for now")

    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    L = 10
    dx = L / grid_shape[0]
    dk = 2 * np.pi / L

    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)
    spectra = ps.PowerSpectra(mpi, fft, (dk, ) * 3, L**3)
    # np.prod, not np.product: the latter alias was removed in NumPy 2.0
    statistics = ps.FieldStatistics(mpi, h, rank_shape=rank_shape,
                                    grid_size=np.prod(grid_shape))

    def get_laplacian(f):
        from pystella.derivs import _lap_coefs, centered_diff
        lap_coefs = _lap_coefs[h]
        from pymbolic import var
        return sum([
            centered_diff(f, lap_coefs, direction=mu, order=2)
            for mu in range(1, 4)
        ]) / var("dx")**2

    test_problems = {}

    from pystella import Field
    f = Field("f", offset="h")
    rho = Field("rho", offset="h")
    test_problems[f] = (get_laplacian(f), rho)

    f = Field("f2", offset="h")
    rho = Field("rho2", offset="h")
    test_problems[f] = (get_laplacian(f) - f, rho)

    solver = Solver(mpi, queue, test_problems, halo_shape=h, dtype=dtype,
                    fixed_parameters=dict(omega=1 / 2))

    def zero_mean_array():
        f0 = clr.rand(queue, grid_shape, dtype)
        f = clr.rand(queue, tuple(ni + 2 * h for ni in rank_shape), dtype)
        mpi.scatter_array(queue, f0, f, root=0)
        avg = statistics(f)["mean"]
        f = f - avg
        mpi.share_halos(queue, f)
        return f

    f = zero_mean_array()
    rho = zero_mean_array()
    tmp = cla.zeros_like(f)

    f2 = zero_mean_array()
    rho2 = zero_mean_array()
    tmp2 = cla.zeros_like(f)

    num_iterations = 1000
    errors = {"f": [], "f2": []}
    first_mode_zeroed = {"f": [], "f2": []}
    for _ in range(0, num_iterations, 2):
        solver(mpi, queue, iterations=2, dx=np.array(dx),
               f=f, tmp_f=tmp, rho=rho,
               f2=f2, tmp_f2=tmp2, rho2=rho2)

        err = solver.get_error(queue,
                               f=f, r_f=tmp, rho=rho,
                               f2=f2, r_f2=tmp2, rho2=rho2, dx=np.array(dx))
        for k, v in err.items():
            errors[k].append(v)

        for key, resid in zip(["f", "f2"], [tmp, tmp2]):
            spectrum = spectra(resid, k_power=0)
            # only rank 0 records the first mode whose power is negligible
            if mpi.rank == 0:
                max_amp = np.max(spectrum)
                first_zero = np.argmax(spectrum[1:] < 1e-30 * max_amp)
                first_mode_zeroed[key].append(first_zero)

    for errs in errors.values():
        errs = np.array(errs)
        iters = np.arange(1, errs.shape[0] + 1)
        assert (errs[10:, 0] * iters[10:] / errs[0, 0] < 1.).all(), \
            "relaxation not converging at least linearly for " \
            f"{grid_shape=}, {h=}, {proc_shape=}"

    first_mode_zeroed = mpi.bcast(first_mode_zeroed, root=0)
    for x in first_mode_zeroed.values():
        x = np.array(list(x))[2:]
        assert (x[1:] <= x[:-1]).all() and np.min(x) < np.max(x) / 5, \
            f"relaxation not smoothing error {grid_shape=}, {h=}, {proc_shape=}"
def __init__(self, fft, effective_k, dk, dx):
    """
    Precompute effective momenta and build the projection and polarization
    kernels stored as attributes of this object.

    :arg fft: The FFT object. This method reads its ``sub_k``, ``rdtype``,
        ``cdtype`` and ``grid_shape`` attributes and calls
        ``fft.shape(True)``.

    :arg effective_k: Either a callable with signature ``(k, dx)`` returning
        the effective momentum, or a non-callable value: a nonzero value
        ``h`` selects ``FirstCenteredDifference(h).get_eigenvalues``, while
        ``0`` selects the identity ``k -> k``.

    :arg dk: Indexable by axis (``0, 1, 2``); multiplies the integer momenta
        of the corresponding axis.

    :arg dx: Indexable by axis (``0, 1, 2``); passed as the second argument
        to ``effective_k``.

    The kernels created are ``transversify_knl``, ``vec_to_pol_knl``,
    ``pol_to_vec_knl``, ``vec_decomp_knl``, ``vec_decomp_knl_times_abs_k``,
    ``tt_knl``, ``tensor_to_pol_knl`` and ``pol_to_tensor_knl``.
    """
    self.fft = fft

    # resolve a non-callable effective_k into a function of (k, dx)
    if not callable(effective_k):
        if effective_k != 0:
            from pystella.derivs import FirstCenteredDifference
            h = effective_k
            effective_k = FirstCenteredDifference(h).get_eigenvalues
        else:
            def effective_k(k, dx):  # pylint: disable=function-redefined
                return k

    queue = self.fft.sub_k["momenta_x"].queue
    # host copies of the per-axis momenta, cast to integers
    sub_k = list(x.get().astype("int") for x in self.fft.sub_k.values())
    eff_mom_names = ("eff_mom_x", "eff_mom_y", "eff_mom_z")
    self.eff_mom = {}
    for mu, (name, kk) in enumerate(zip(eff_mom_names, sub_k)):
        eff_k = effective_k(dk[mu] * kk.astype(fft.rdtype), dx[mu])
        # zero the effective momentum where |k| equals half the grid size
        # and where k is zero
        eff_k[abs(sub_k[mu]) == fft.grid_shape[mu] // 2] = 0.
        eff_k[sub_k[mu] == 0] = 0.

        import pyopencl.array as cla
        self.eff_mom[name] = cla.to_device(queue, eff_k)

    from pymbolic import var, parse
    from pymbolic.primitives import If, Comparison, LogicalAnd
    from pystella import Field
    indices = parse("i, j, k")
    # symbolic effective momenta: each per-axis array indexed by its own index
    eff_k = tuple(
        var(array)[mu] for array, mu in zip(eff_mom_names, indices))
    fabs, sqrt, conj = parse("fabs, sqrt, conj")
    kmag = sqrt(sum(kk**2 for kk in eff_k))

    from pystella import ElementWiseMap
    vector = Field("vector", shape=(3, ))
    vector_T = Field("vector_T", shape=(3, ))

    # true where all three effective momenta are below 1e-14 in magnitude
    kvec_zero = LogicalAnd(
        tuple(Comparison(fabs(eff_k[mu]), "<", 1e-14) for mu in range(3)))

    # note: write all output via private temporaries to allow for in-place

    div = var("div")
    div_insn = [(div, sum(eff_k[mu] * vector[mu] for mu in range(3)))]
    self.transversify_knl = ElementWiseMap(
        {
            vector_T[mu]: If(kvec_zero, 0,
                             vector[mu] - eff_k[mu] / kmag**2 * div)
            for mu in range(3)
        },
        tmp_instructions=div_insn,
        lsize=(32, 1, 1),
        rank_shape=fft.shape(True),
    )

    import loopy as lp

    def assign(asignee, expr, **kwargs):
        # loopy assignment with default inames and sync options, which
        # keyword arguments may override
        default = dict(within_inames=frozenset(("i", "j", "k")),
                       no_sync_with=[("*", "any")])
        default.update(kwargs)
        return lp.Assignment(asignee, expr, **default)

    # from here on kmag is a kernel temporary rather than an expression
    kmag, Kappa = parse("kmag, Kappa")
    eps_insns = [
        assign(kmag, sqrt(sum(kk**2 for kk in eff_k))),
        assign(Kappa, sqrt(sum(kk**2 for kk in eff_k[:2])))
    ]

    zero = fft.cdtype.type(0)
    kx_ky_zero = LogicalAnd(
        tuple(Comparison(fabs(eff_k[mu]), "<", 1e-10) for mu in range(2)))
    kz_nonzero = Comparison(fabs(eff_k[2]), ">", 1e-10)

    # components of eps, with special cases where kx and ky both vanish
    eps = var("eps")
    eps_insns.extend([
        assign(
            eps[0],
            If(kx_ky_zero,
               If(kz_nonzero, fft.cdtype.type(1 / 2**.5), zero),
               (eff_k[0] * eff_k[2] / kmag - 1j * eff_k[1]) / Kappa / 2**.5)),
        assign(
            eps[1],
            If(kx_ky_zero,
               If(kz_nonzero, fft.cdtype.type(1j / 2**(1 / 2)), zero),
               (eff_k[1] * eff_k[2] / kmag + 1j * eff_k[0]) / Kappa / 2**.5)),
        assign(eps[2], If(kx_ky_zero, zero, -Kappa / kmag / 2**.5))
    ])

    plus, minus, lng = Field("plus"), Field("minus"), Field("lng")

    plus_tmp, minus_tmp = parse("plus_tmp, minus_tmp")
    pol_isns = [(plus_tmp, sum(vector[mu] * conj(eps[mu]) for mu in range(3))),
                (minus_tmp, sum(vector[mu] * eps[mu] for mu in range(3)))]

    # NOTE(review): the trailing literal Ellipsis suggests further
    # arguments were elided from this listing -- confirm
    args = [
        lp.TemporaryVariable("kmag"),
        lp.TemporaryVariable("Kappa"),
        lp.TemporaryVariable("eps", shape=(3, )),
        ...
    ]

    self.vec_to_pol_knl = ElementWiseMap(
        {
            plus: plus_tmp,
            minus: minus_tmp
        },
        tmp_instructions=eps_insns + pol_isns,
        args=args,
        lsize=(32, 1, 1),
        rank_shape=fft.shape(True),
    )

    vector_tmp = var("vector_tmp")
    vec_insns = [(vector_tmp[mu], plus * eps[mu] + minus * conj(eps[mu]))
                 for mu in range(3)]

    self.pol_to_vec_knl = ElementWiseMap(
        {vector[mu]: vector_tmp[mu] for mu in range(3)},
        tmp_instructions=eps_insns + vec_insns,
        args=args,
        lsize=(32, 1, 1),
        rank_shape=fft.shape(True),
    )

    ksq = sum(kk**2 for kk in eff_k)
    lng_rhs = If(kvec_zero, 0, -div / ksq * 1j)
    self.vec_decomp_knl = ElementWiseMap(
        {
            plus: plus_tmp,
            minus: minus_tmp,
            lng: lng_rhs
        },
        tmp_instructions=eps_insns + pol_isns + div_insn,
        args=args,
        lsize=(32, 1, 1),
        rank_shape=fft.shape(True),
    )

    # same decomposition, but lng is divided by sqrt(ksq) instead of ksq
    lng_rhs = If(kvec_zero, 0, -div / ksq**.5 * 1j)
    self.vec_decomp_knl_times_abs_k = ElementWiseMap(
        {
            plus: plus_tmp,
            minus: minus_tmp,
            lng: lng_rhs
        },
        tmp_instructions=eps_insns + pol_isns + div_insn,
        args=args,
        lsize=(32, 1, 1),
        rank_shape=fft.shape(True),
    )

    from pystella.sectors import tensor_index as tid

    eff_k_hat = tuple(kk / sqrt(sum(kk**2 for kk in eff_k)) for kk in eff_k)
    hij = Field("hij", shape=(6, ))
    hij_TT = Field("hij_TT", shape=(6, ))

    # P_ab = delta_ab - khat_a khat_b, for the components with a <= b
    Pab = var("P")
    Pab_insns = [(Pab[tid(a, b)],
                  (If(Comparison(a, "==", b), 1, 0)
                   - eff_k_hat[a - 1] * eff_k_hat[b - 1]))
                 for a in range(1, 4) for b in range(a, 4)]

    hij_TT_tmp = var("hij_TT_tmp")
    TT_insns = [(hij_TT_tmp[tid(a, b)],
                 sum((Pab[tid(a, c)] * Pab[tid(d, b)]
                      - Pab[tid(a, b)] * Pab[tid(c, d)] / 2) * hij[tid(c, d)]
                     for c in range(1, 4) for d in range(1, 4)))
                for a in range(1, 4) for b in range(a, 4)]
    # note: where conditionals (branch divergence) go can matter:
    # this kernel is twice as fast when putting the branching in the global
    # write, rather than when setting hij_TT_tmp
    write_insns = [(hij_TT[tid(a, b)], If(kvec_zero, 0, hij_TT_tmp[tid(a, b)]))
                   for a in range(1, 4) for b in range(a, 4)]
    self.tt_knl = ElementWiseMap(
        write_insns,
        tmp_instructions=Pab_insns + TT_insns,
        lsize=(32, 1, 1),
        rank_shape=fft.shape(True),
    )

    tensor_to_pol_insns = {
        plus: sum(hij[tid(c, d)] * conj(eps[c - 1]) * conj(eps[d - 1])
                  for c in range(1, 4) for d in range(1, 4)),
        minus: sum(hij[tid(c, d)] * eps[c - 1] * eps[d - 1]
                   for c in range(1, 4) for d in range(1, 4))
    }
    self.tensor_to_pol_knl = ElementWiseMap(
        tensor_to_pol_insns,
        tmp_instructions=eps_insns,
        args=args,
        lsize=(32, 1, 1),
        rank_shape=fft.shape(True),
    )

    pol_to_tensor_insns = {
        hij[tid(a, b)]: (plus * eps[a - 1] * eps[b - 1]
                         + minus * conj(eps[a - 1]) * conj(eps[b - 1]))
        for a in range(1, 4) for b in range(a, 4)
    }
    self.pol_to_tensor_knl = ElementWiseMap(
        pol_to_tensor_insns,
        tmp_instructions=eps_insns,
        args=args,
        lsize=(32, 1, 1),
        rank_shape=fft.shape(True),
    )
def RestrictionBase(coefs, StencilKernel, halo_shape, **kwargs):
    """
    A base function for generating a restriction kernel.

    :arg coefs: The coefficients representing the restriction formula.
        Follows the convention of :func:`pystella.derivs.centered_diff`
        (since the restriction is applied recursively in each dimension).

    :arg StencilKernel: The stencil mapper to create an instance of.

    :arg halo_shape: The number of halo layers on (both sides of) each axis
        of the computational grid.
        Currently must be an :class:`int`.

    :arg lsize: The shape of prefetched arrays in shared memory.
        See :class:`~pystella.ElementWiseMap`.
        Defaults to ``(4, 4, 4)``.

    :arg correct: A :class:`bool` determining whether to produce a kernel
        which corrects an output array by the restricted array, or to only
        perform strict restriction.
        Defaults to *False*.

    :returns: An instance of ``StencilKernel`` which executes the requested
        restriction.
    """
    from pymbolic import parse, var

    lsize = kwargs.pop("lsize", (4, 4, 4))

    # ensure grid dimensions are *not* passed, as they will be misinterpreted
    for dim_name in ("Nx", "Ny", "Nz"):
        kwargs.pop(dim_name, None)

    # three-dimensional stencil as the outer product of the 1-D coefficients
    restrict_coefs = {
        (a, b, c): c_a * c_b * c_c
        for a, c_a in coefs.items()
        for b, c_b in coefs.items()
        for c, c_c in coefs.items()
    }

    i, j, k = parse("i, j, k")
    f1 = Field("f1", offset="h", indices=(2 * i, 2 * j, 2 * k))
    f2 = Field("f2", offset="h")

    tmp = var("tmp")
    tmp_dict = {tmp: expand_stencil(f1, restrict_coefs)}

    correct = kwargs.pop("correct", False)
    restrict_dict = {f2: (f2 - tmp) if correct else tmp}

    args = [
        lp.GlobalArg("f1", shape="(2*Nx+2*h, 2*Ny+2*h, 2*Nz+2*h)"),
        lp.GlobalArg("f2", shape="(Nx+2*h, Ny+2*h, Nz+2*h)"),
    ]

    # NOTE(review): isinstance() tests whether StencilKernel is an *instance*
    # of Stencil; if callers pass the class itself, prefetching of f1 is
    # never requested -- confirm whether issubclass() was intended
    if isinstance(StencilKernel, Stencil):
        prefetch = {"prefetch_args": ["f1"]}
    else:
        prefetch = {}

    return StencilKernel(restrict_dict, tmp_instructions=tmp_dict, args=args,
                         halo_shape=halo_shape, lsize=lsize,
                         **prefetch, **kwargs)
def test_multigrid(ctx_factory, grid_shape, proc_shape, h, dtype, Solver, MG,
                   timing=False):
    """
    Run a multigrid solver on two test problems (``lap(f) = rho`` and
    ``lap(f2) - f2 = rho2``) and check the errors of the last two cycles
    against a tolerance.

    :arg ctx_factory: If truthy, called to create the OpenCL context;
        otherwise ``ps.choose_device_and_make_context`` is used.

    :arg h: The halo size, also selecting the Laplacian stencil
        coefficients.

    :arg Solver: The relaxation solver class wrapped by the multigrid
        solver.

    :arg MG: The multigrid solver class under test. The number of cycles
        and the tolerance depend on whether it is ``MultiGridSolver``.

    :arg timing: Unused by this test.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    L = 10
    dx = L / grid_shape[0]

    # np.prod, not np.product: the latter alias was removed in NumPy 2.0
    statistics = ps.FieldStatistics(mpi, h, rank_shape=rank_shape,
                                    grid_size=np.prod(grid_shape))

    def get_laplacian(f):
        from pystella.derivs import _lap_coefs, centered_diff
        lap_coefs = _lap_coefs[h]
        from pymbolic import var
        return sum([centered_diff(f, lap_coefs, direction=mu, order=2)
                    for mu in range(1, 4)]) / var("dx")**2

    test_problems = {}

    from pystella import Field
    f = Field("f", offset="h")
    rho = Field("rho", offset="h")
    test_problems[f] = (get_laplacian(f), rho)

    f = Field("f2", offset="h")
    rho = Field("rho2", offset="h")
    test_problems[f] = (get_laplacian(f) - f, rho)

    solver = Solver(mpi, queue, test_problems, halo_shape=h, dtype=dtype,
                    fixed_parameters=dict(omega=1/2))
    mg = MG(solver=solver, halo_shape=h, dtype=dtype)

    def zero_mean_array():
        f0 = clr.rand(queue, grid_shape, dtype)
        f = clr.rand(queue, tuple(ni + 2*h for ni in rank_shape), dtype)
        mpi.scatter_array(queue, f0, f, root=0)
        avg = statistics(f)["mean"]
        f = f - avg
        mpi.share_halos(queue, f)
        return f

    f = zero_mean_array()
    rho = zero_mean_array()

    f2 = zero_mean_array()
    rho2 = zero_mean_array()

    poisson_errs = []
    helmholtz_errs = []
    num_v_cycles = 15 if MG == MultiGridSolver else 10
    for _ in range(num_v_cycles):
        errs = mg(mpi, queue, dx0=dx, f=f, rho=rho, f2=f2, rho2=rho2)
        poisson_errs.append(errs[-1][-1]["f"])
        helmholtz_errs.append(errs[-1][-1]["f2"])

    for name, cycle_errs in zip(["poisson", "helmholtz"],
                                [poisson_errs, helmholtz_errs]):
        tol = 1e-6 if MG == MultiGridSolver else 1e-15
        # both message fragments must be f-strings, or {name} stays literal
        assert cycle_errs[-1][1] < tol and cycle_errs[-2][1] < 10*tol, \
            f"multigrid solution to {name} eqn is inaccurate for " \
            f"{grid_shape=}, {h=}, {proc_shape=}"