def get_divergence_error(vector):
    """Return the (max, average) relative divergence of ``vector``.

    Each of the three components of ``vector`` is passed through
    ``fft.idft`` into the scratch array ``vector_x``; the divergence is
    written to ``div`` and the three diagonal derivatives to ``pdx``.
    The error is ``|div|`` normalised by ``sum_mu |pdx[mu]|``, reduced
    once with ``max`` and once with ``sum``.

    Relies on ``fft``, ``derivs``, ``queue``, ``vector_x``, ``div``,
    ``pdx``, ``clm`` and ``cla`` being available from the enclosing scope.
    """
    for axis in range(3):
        fft.idft(vector[axis], vector_x[axis])

    derivs.divergence(queue, vector_x, div)

    # Component `axis` differentiated along its own direction only.
    for axis, out_name in enumerate(("pdx", "pdy", "pdz")):
        derivs(queue, fx=vector_x[axis], **{out_name: pdx[axis]})

    norm = sum(clm.fabs(pdx[axis]) for axis in range(3))

    div_max = cla.max(clm.fabs(div))
    max_err = div_max / cla.max(norm)

    div_sum = cla.sum(clm.fabs(div))
    avg_err = div_sum / cla.sum(norm)

    return max_err, avg_err
def get_divergence_errors(hij):
    """Return per-row (max, average) relative divergence errors of ``hij``.

    For each row index ``i`` in 1..3, the components
    ``hij[tensor_id(i, j)]`` for ``j`` in 1..3 are passed through
    ``fft.idft`` into ``vector_x``. The divergence of that row is then
    compared against the sum of the absolute diagonal derivatives.

    Relies on ``fft``, ``derivs``, ``queue``, ``vector_x``, ``div``,
    ``pdx``, ``tensor_id``, ``clm`` and ``cla`` from the enclosing scope.

    :returns: two ``numpy`` arrays of length three: the max-norm errors
        and the sum-norm errors.
    """
    deriv_kwargs = ("pdx", "pdy", "pdz")
    max_errors, avg_errors = [], []

    for row in (1, 2, 3):
        for col in (1, 2, 3):
            fft.idft(hij[tensor_id(row, col)], vector_x[col - 1])

        derivs.divergence(queue, vector_x, div)

        # Component `axis` differentiated along its own direction only.
        for axis, out_name in enumerate(deriv_kwargs):
            derivs(queue, fx=vector_x[axis], **{out_name: pdx[axis]})

        norm = sum(clm.fabs(pdx[axis]) for axis in range(3))

        row_max = cla.max(clm.fabs(div)) / cla.max(norm)
        max_errors.append(row_max)

        row_avg = cla.sum(clm.fabs(div)) / cla.sum(norm)
        avg_errors.append(row_avg)

    return np.array(max_errors), np.array(avg_errors)
def test_outoforderqueue_clmath(ctx_factory):
    """Check chained ``clmath`` calls give correct results on an
    out-of-order command queue.

    The test is skipped when the out-of-order queue cannot be created.
    """
    context = ctx_factory()
    try:
        ooo_mode = cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
        queue = cl.CommandQueue(context, properties=ooo_mode)
    except Exception:
        pytest.skip("out-of-order queue not available")

    host_in = np.random.rand(10**6).astype(np.dtype('float32'))
    dev_in = cl_array.to_device(queue, host_in)

    # testing that clmath functions wait for and create events
    dev_sin = clmath.sin(dev_in * 5)
    dev_out = clmath.fabs(dev_sin)
    queue.finish()

    result = dev_out.get()
    expected = np.abs(np.sin(host_in * 5))
    mean_abs_err = np.abs(result - expected).mean()
    assert mean_abs_err < 1e-5
def test_outoforderqueue_clmath(ctx_factory):
    """Verify ``clmath.fabs(clmath.sin(...))`` against NumPy when the
    operations are enqueued on an out-of-order command queue.

    Skips if creating the out-of-order queue raises.
    """
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(
            context,
            properties=(
                cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
            ),
        )
    except Exception:
        pytest.skip("out-of-order queue not available")

    single = np.dtype('float32')
    samples = np.random.rand(10**6).astype(single)
    samples_dev = cl_array.to_device(queue, samples)

    # testing that clmath functions wait for and create events
    computed_dev = clmath.fabs(clmath.sin(samples_dev * 5))
    queue.finish()
    computed = computed_dev.get()

    reference = np.abs(np.sin(samples * 5))
    deviation = np.abs(computed - reference)
    assert deviation.mean() < 1e-5
def test_spectral_poisson(ctx_factory, grid_shape, proc_shape, h, dtype,
                          timing=False):
    """Check that ``ps.SpectralPoissonSolver`` inverts the discrete operator.

    A zero-mean random source ``rho`` is solved for ``fx`` at several
    values of ``m_squared``. The residual ``lap(fx) - rho - m_squared * fx``
    is then compared against ``|rho|`` in both max and sum norms.

    :arg ctx_factory: callable returning an OpenCL context; when falsy,
        ``ps.choose_device_and_make_context()`` is used instead.
    :arg grid_shape: global grid shape.
    :arg proc_shape: process-grid shape handed to ``ps.DomainDecomposition``.
    :arg h: halo width. ``h == 0`` selects ``ps.SpectralCollocator`` with
        eigenvalues ``-k**2``; otherwise ``ps.FiniteDifferencer`` with the
        eigenvalues of ``SecondCenteredDifference(h)``.
    :arg dtype: floating-point dtype; tolerances are tighter for
        ``np.float64``.
    :arg timing: when true, time the solver and print the result on rank 0.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (3, 5, 7)
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))
    dk = tuple(2 * np.pi / Li for Li in L)

    if h == 0:
        def get_evals_2(k, dx):
            return - k**2

        derivs = ps.SpectralCollocator(fft, dk)
    else:
        from pystella.derivs import SecondCenteredDifference
        get_evals_2 = SecondCenteredDifference(h).get_eigenvalues
        derivs = ps.FiniteDifferencer(mpi, h, dx, stream=False)

    solver = ps.SpectralPoissonSolver(fft, dk, dx, get_evals_2)

    # fx carries a halo of width h on every side; rho and lap do not.
    pencil_shape = tuple(ni + 2*h for ni in rank_shape)

    # np.prod replaces np.product, which was removed in NumPy 2.0.
    statistics = ps.FieldStatistics(mpi, 0, rank_shape=rank_shape,
                                    grid_size=np.prod(grid_shape))

    fx = cla.empty(queue, pencil_shape, dtype)
    rho = clr.rand(queue, rank_shape, dtype)
    rho -= statistics(rho)["mean"]
    lap = cla.empty(queue, rank_shape, dtype)
    rho_h = rho.get()

    for m_squared in (0, 1.2, 19.2):
        solver(queue, fx, rho, m_squared=m_squared)
        fx_h = fx.get()
        if h > 0:
            # Strip the halo so fx_h lines up with rho_h and lap.
            fx_h = fx_h[h:-h, h:-h, h:-h]

        derivs(queue, fx=fx, lap=lap)

        diff = np.fabs(lap.get() - rho_h - m_squared * fx_h)
        max_err = np.max(diff) / cla.max(clm.fabs(rho))
        avg_err = np.sum(diff) / cla.sum(clm.fabs(rho))

        max_rtol = 1e-12 if dtype == np.float64 else 1e-4
        avg_rtol = 1e-13 if dtype == np.float64 else 1e-5

        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"solution inaccurate for {h=}, {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        # Times the solver at the last m_squared value of the loop above.
        elapsed = timer(
            lambda: solver(queue, fx, rho, m_squared=m_squared), ntime=10)
        if mpi.rank == 0:
            print(f"poisson took {elapsed:.3f} ms for "
                  f"{grid_shape=}, {proc_shape=}")
def test_step(ctx_factory, proc_shape, dtype, Stepper):
    """Check accuracy and convergence order of ``Stepper`` on ``y' = y**n``.

    For each exponent ``n`` the ODE is integrated up to ``t = 0.1`` with
    three step sizes and compared against the closed-form solution. The
    smallest step must meet a tolerance, and the error ratio between
    successive step sizes must be at least 90% of the ratio implied by
    ``stepper.expected_order``.

    Only runs for ``proc_shape == (1, 1, 1)``; skipped otherwise.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test step only on one rank")

    ctx = ctx_factory() if ctx_factory else ps.choose_device_and_make_context()
    queue = cl.CommandQueue(ctx)

    from pystella.step import LowStorageRKStepper
    is_low_storage = LowStorageRKStepper in Stepper.__bases__

    rank_shape = (1, 1, 8)
    init_vals = np.linspace(1, 3, 8)

    # Low-storage steppers act on a single array; the others carry
    # `num_copies` leading copies, of which the first holds the solution.
    if is_low_storage:
        y = cla.zeros(queue, rank_shape, dtype)
        y[0, 0, :] = init_vals
        y0 = y.copy()
    else:
        full_shape = (Stepper.num_copies, ) + rank_shape
        y = cla.zeros(queue, full_shape, dtype)
        y[0, 0, 0, :] = init_vals
        y0 = y[0].copy()

    dtlist = [.1, .05, .025]
    dt_pairs = list(zip(dtlist[:-1], dtlist[1:]))

    for n in [-1., -2., -3., -4.]:
        def sol(y0, t):
            # Exact solution of y' = y**n with y(0) = y0.
            return ((-1 + n) * (-t + y0**(1 - n) / (-1 + n)))**(1 / (1 - n))

        max_errs = {}
        for dt in dtlist:
            _y = ps.Field("y")
            stepper = Stepper({_y: _y**n}, dt=dt, halo_shape=0,
                              rank_shape=rank_shape)

            # Reset the state to the initial condition.
            if is_low_storage:
                y[0, 0, :] = init_vals
            else:
                y[0, 0, 0, :] = init_vals

            t = 0
            rel_errs = []
            while t < .1:
                for stage in range(stepper.num_stages):
                    stepper(stage, queue=queue, y=y, filter_args=True)
                t += dt

                exact = sol(y0, t)
                if is_low_storage:
                    step_err = cla.max(clm.fabs(1. - exact / y))
                else:
                    step_err = cla.max(clm.fabs(1. - exact / y[0]))
                rel_errs.append(step_err.get())

            max_errs[dt] = np.max(rel_errs)

        order = stepper.expected_order
        print(f"{order=}, {n=}")
        print(max_errs)
        print([max_errs[coarse] / max_errs[fine]
               for coarse, fine in dt_pairs])

        rtol = dtlist[-1]**order if dtype == np.float64 else 1e-1
        finest_err = list(max_errs.values())[-1]
        assert finest_err < rtol, \
            f"Stepper solution inaccurate for {n=}"

        for coarse, fine in dt_pairs:
            ratio = max_errs[coarse] / max_errs[fine]
            assert ratio > .9 * (coarse/fine)**order, \
                f"Stepper convergence failing for {n=}"
def U_to_P(params, G, U, P, Pout=None, iter_max=None):
    """Recover primitive variables from conserved variables ``U``.

    Outline of what the body does:

    1. Writes the primitive B-field into ``Pout`` via ``G.vecdivbygeom``.
    2. Builds the four-vectors and scalars (``Bcon``, ``Qcov``, ``Bsq``,
       ``QdB``, ``Qtsq``, ``D``, ``Ep``) used by the root-find.
    3. Takes an initial guess for ``Wp`` from ``Wp_func``, applies one
       derivative-based step (kernel ``knl_utop_prep``), then runs up to
       ``iter_max`` secant iterations (kernel ``knl_utop_iter``).
    4. Writes RHO, UU, U1..U3 into ``Pout`` wherever ``eflag`` is zero
       (kernel ``knl_utop_set``).

    The loopy kernels and the ``ncov``/``ncon``/``lgdet`` arrays are cached
    in module-level globals and built on first use.

    :param params: dict of run parameters. Reads ``'queue'``, ``'gam'`` and
        ``'n_prim'``; optionally ``'invert_iter_max'`` (default 8),
        ``'invert_err_tol'`` (default 1e-8), ``'invert_iter_delta'``
        (default 1e-5), ``'debug'`` and ``'electrons'``. If ``'gamma_max'``
        is absent it is inserted with value 25, i.e. the caller's dict is
        modified.
    :param G: grid object; this function uses its ``slices``, ``shapes``,
        ``gdet_d``, ``lapse_d``, ``NG`` attributes and its ``vecdivbygeom``,
        ``timesgeom``, ``vectimesgeom``, ``raise_grid``, ``lower_grid`` and
        ``dot`` methods.
    :param U: conserved variables, indexed with ``G.slices``.
    :param P: primitives, passed to ``Wp_func`` for the initial guess and
        copied to create ``Pout`` when ``Pout`` is None.
    :param Pout: optional array to receive the result. When supplied it is
        written in place, and entries that are not updated keep whatever
        the caller put there.
    :param iter_max: optional cap on the number of secant iterations;
        overrides ``params['invert_iter_max']``.
    :returns: ``(Pout, eflag)``. Within this function ``eflag`` is set to
        -100 where ``U[s.RHO] < 0``, incremented by 1 where the iteration
        did not converge, and set to 6 / 7 / 8 for negative ``rho0`` /
        negative ``u`` / both. ``eflag`` is also handed to ``Wp_func``,
        ``err_eqn`` and ``gamma_func``, which may set further values
        (not visible here).
    :raises ValueError: only when ``params['debug']`` is truthy, if a flag
        is set in the bulk after the initial guess or if ``gamma < 1``
        anywhere in the bulk.
    """
    s = G.slices
    sh = G.shapes

    # Set some default parameters, but allow overrides
    if iter_max is None:
        if 'invert_iter_max' in params:
            iter_max = params['invert_iter_max']
        else:
            iter_max = 8

    if 'invert_err_tol' in params:
        err_tol = params['invert_err_tol']
    else:
        err_tol = 1.e-8

    if 'invert_iter_delta' in params:
        delta = params['invert_iter_delta']
    else:
        delta = 1.e-5

    # NOTE: this writes into the caller's params dict.
    if 'gamma_max' not in params:
        params['gamma_max'] = 25

    # Don't overwrite old memory by default, just allow using old values
    # Caller can pass new/old memory but is responsible that it contain logical values if we don't update it
    if Pout is None:
        Pout = P.copy()

    # Update the primitive B-fields
    G.vecdivbygeom(params['queue'], u=U[s.B3VEC], g=G.gdet_d[Loci.CENT.value], out=Pout[s.B3VEC])

    # Cached constant quantities
    # (presumably initialised to None at module level -- TODO confirm)
    global ncov, ncon, lgdet
    if ncov is None:
        # For later on
        ncov = cl_array.zeros(params['queue'], sh.grid_vector, dtype=np.float64)
        # Only component 0 of ncov is filled: 1.0 times -lapse.
        G.timesgeom(params['queue'], u=cl_array.empty_like(ncov[0]).fill(1.0), g=-G.lapse_d[Loci.CENT.value], out=ncov[0])
        ncon = G.raise_grid(ncov)
        lgdet = G.lapse_d[Loci.CENT.value] / G.gdet_d[Loci.CENT.value]

    # Eflag will indicate inversion failures
    # Define a generic kernel so we can split out flagging in the future
    # Use it to catch negative density early on
    eflag = cl_array.zeros(params['queue'], sh.grid_scalar, dtype=np.int32)
    global knl_set_eflag
    if knl_set_eflag is None:
        code = add_ghosts(
            """eflag[i,j,k] = if(var[i,j,k] < 0, flag, eflag[i,j,k])""")
        knl_set_eflag = lp.make_kernel(
            sh.isl_grid_scalar, code,
            [
                *scalarArrayArgs("eflag", dtype=np.int32),
                *scalarArrayArgs("var"),
                lp.ValueArg("flag", dtype=np.int32),
                ...
            ],
            default_offset=lp.auto)
        knl_set_eflag = tune_grid_kernel(knl_set_eflag, sh.bulk_scalar, ng=G.NG)

    # Flag zones with negative conserved density as -100.
    evt, _ = knl_set_eflag(params['queue'], var=U[s.RHO], eflag=eflag, flag=-100)

    # Convert from conserved variables to four-vectors
    Bcon = cl_array.zeros(params['queue'], sh.grid_vector, dtype=np.float64)
    G.vectimesgeom(params['queue'], u=U[s.B3VEC], g=lgdet, out=Bcon[1:])

    Qcov = cl_array.empty_like(Bcon)
    G.timesgeom(params['queue'], u=(U[s.UU] - U[s.RHO]), g=lgdet, out=Qcov[0])
    G.vectimesgeom(params['queue'], u=U[s.U3VEC], g=lgdet, out=Qcov[1:])

    Bcov = G.lower_grid(Bcon)
    Qcon = G.raise_grid(Qcov)

    # This will have fringes of zeros still!
    Bsq = G.dot(Bcon, Bcov)
    QdB = G.dot(Bcon, Qcov)
    Qdotn = G.dot(Qcon, ncov)
    Qsq = G.dot(Qcon, Qcov)

    Qtsq = Qsq + Qdotn**2

    Qtcon = cl_array.empty_like(Qcon)
    for i in range(4):
        Qtcon[i] = Qcon[i] + ncon[i] * Qdotn

    # Set up eqn for W', the energy density
    D = cl_array.zeros_like(Qsq)
    G.timesgeom(params['queue'], u=U[s.RHO], g=lgdet, out=D)
    Ep = -Qdotn - D

    # Drop references to intermediates that are no longer needed.
    del Bcov, Qcon, Qcov

    # Numerical rootfinding
    # Take guesses from primitives
    Wp = Wp_func(params, G, P, Loci.CENT, eflag)

    # Trap on any failures so far if debugging. They're very rare.
    if 'debug' in params and params['debug']:
        if np.any(eflag.get()[s.bulk] != 0):
            raise ValueError("Unexpected flag set!")

    # Step around the guess & evaluate errors
    h = delta * Wp  # TODO stable enough? Need fancy subtraction from iharm3d?
    errp = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp + h, eflag)
    err = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag)
    errm = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp - h, eflag)

    # Preserve Wp/err before updating them below
    Wp1 = Wp.copy()
    err1 = err.copy()

    # First-step kernel: uses centred first and second differences of the
    # error (errp/err/errm) to update Wp, with the correction and the step
    # size both clipped.
    global knl_utop_prep
    if knl_utop_prep is None:
        # TODO keep an accumulator here to avoid that costly any() call?
        code = add_ghosts("""
        # TODO put error/prep in here, not calling below
        # Attempt a Halley/Muller/Bailey/Press step
        dedW := (errp[i,j,k] - errm[i,j,k]) / (2 * h[i,j,k])
        dedW2 := (errp[i,j,k] - 2.*err[i,j,k] + errm[i,j,k]) / (h[i,j,k]**2)
        # Limit size of 2nd derivative correction
        # Loopy trick common in HARM: define intermediate variables (xt, xt2, xt3...) to impose clipping
        # This allows assignments or substitutions while keeping dependencies straight
        ft := 0.5*err[i,j,k]*dedW2/(dedW**2)
        ft2 := if(ft > 0.3, 0.3, ft)
        f := if(ft2 < -0.3, -0.3, ft2)
        # Limit size of step
        dWt := -err[i,j,k] / dedW / (1. - f)
        dWt2 := if(dWt < -0.5*Wp[i,j,k], -0.5*Wp[i,j,k], dWt)
        dW := if(dWt2 > 2.0*Wp[i,j,k], 2.0*Wp[i,j,k], dWt2)
        Wp[i,j,k] = Wp[i,j,k] + dW {id=wp}
        # Guarantee we take one step in every bulk zone
        stop_flag[i,j,k] = 0
        # This would avoid the step where there's convergence, but would require taking 2*dW above or similar
        #stop_flag[i,j,k] = (fabs(dW / Wp[i,j,k]) < err_tol) + \
        #                   (fabs(err[i,j,k] / Wp[i,j,k]) < err_tol) {dep=wp,nosync=wp}
        """)
        knl_utop_prep = lp.make_kernel(
            sh.isl_grid_scalar, code,
            [
                *scalarArrayArgs("err", "errp", "errm", "h", "Wp"),
                *scalarArrayArgs("stop_flag", dtype=np.int8),
                ...
            ],
            assumptions=sh.assume_grid, seq_dependencies=True)
        knl_utop_prep = lp.fix_parameters(knl_utop_prep, err_tol=err_tol)
        knl_utop_prep = tune_grid_kernel(knl_utop_prep, sh.bulk_scalar, ng=G.NG)
        print("Compiled utop_prep")

    # Fill stop_flag with 1 so we don't have to worry about ghost zones taking steps
    stop_flag = cl_array.empty(params['queue'], sh.grid_scalar, dtype=np.int8).fill(1)
    evt, _ = knl_utop_prep(params['queue'], err=err, errp=errp, errm=errm, h=h, Wp=Wp, stop_flag=stop_flag)
    evt.wait()

    # Re-evaluate the error at the updated Wp, writing into the existing err array.
    err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag, out=err)

    # Iteration kernel for 1Dw solver
    global knl_utop_iter
    if knl_utop_iter is None:
        code = add_ghosts("""
        # Evaluate whether we need to do any of this
        <> go = not(stop_flag[i,j,k]) {id=insn_go}
        # Normal secant increment is dW. Limit guess to between 0.5 and 2 times current value
        dWt := (Wp1[i,j,k] - Wp[i,j,k]) * err[i,j,k] / (err[i,j,k] - err1[i,j,k])
        dWt2 := if(dWt < -0.5*Wp[i,j,k], -0.5*Wp[i,j,k], dWt)
        <> dW = if(dWt2 > 2.0*Wp[i,j,k], 2.0*Wp[i,j,k], dWt2) {id=dw,if=go}
        # Preserve last values, after use but before any changes
        Wp1[i,j,k] = Wp[i,j,k] {id=wp1,nosync=dw,if=go}
        err1[i,j,k] = err[i,j,k] {nosync=dw,if=go}
        # Update Wp. Err will be updated outside kernel
        Wp[i,j,k] = Wp[i,j,k] + dW {id=wp,nosync=dw:wp1,if=go}
        # Set flag not to continue in zones that have converged
        stop_flag[i,j,k] = if(fabs(dW / Wp[i,j,k]) < err_tol, 1, stop_flag[i,j,k]) {nosync=dw:wp:insn_go,if=go}
        # For the future, when we've defined err_eqn for loopy kernels
        # err = err_eqn(Bsq, D, Ep, QdB, Qtsq, Wp, gam, gamma_max, eflag) {if=go}
        # stop_flag[i,j,k] = stop_flag[i,j,k] + (fabs(err[] / Wp[]) < err_tol) {if=go}
        """)
        knl_utop_iter = lp.make_kernel(
            sh.isl_grid_scalar, code,
            [
                *scalarArrayArgs("Wp", "Wp1", "err", "err1"),
                *scalarArrayArgs("stop_flag", dtype=np.int8),
                ...
            ],
            assumptions=sh.assume_grid, default_offset=lp.auto, seq_dependencies=True)
        knl_utop_iter = lp.fix_parameters(knl_utop_iter, err_tol=err_tol)
        knl_utop_iter = tune_grid_kernel(knl_utop_iter, sh.bulk_scalar, ng=G.NG)
        print("Compiled utop_iter")

    # Iterate at least once to set new values from first step
    # TODO Needed now we set Wp, err1 right?
    for niter in range(iter_max):
        #print("U_to_P iter")
        evt, _ = knl_utop_iter(params['queue'], Wp=Wp, Wp1=Wp1, err=err, err1=err1, stop_flag=stop_flag)
        err = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag)
        # TODO there may be better/faster if/reduction statements here...
        # A zone also stops once its relative error is below err_tol.
        stop_flag |= (clm.fabs(err / Wp) < err_tol)
        # Leave the loop as soon as every zone has its stop flag set.
        if cl_array.min(stop_flag) >= 1:
            break

    # If secant method failed to converge, do not set primitives other than B
    eflag += (stop_flag == 0)

    del Wp1, err, err1, stop_flag

    # Find utsq, gamma, rho0 from Wp
    gamma = gamma_func(params, G, Bsq, D, QdB, Qtsq, Wp, eflag)

    if 'debug' in params and params['debug']:
        if np.any(gamma.get()[s.bulk] < 1.):
            raise ValueError("gamma < 1 failure!")

    # Find the scalars
    global knl_utop_set
    if knl_utop_set is None:
        code = add_ghosts(
            replace_prim_names("""
        rho0 := D[i,j,k] / gamma[i,j,k]
        W := Wp[i,j,k] + D[i,j,k]
        w := W / (gamma[i,j,k]**2)
        pres := (w - rho0) * (gam - 1.) / gam
        u := w - (rho0 + pres)
        # Set flag if prims are < 0
        eflag[i,j,k] = if((u < 0)*(rho0 < 0), 8, if(u < 0, 7, if(rho0 < 0, 6, eflag[i,j,k]))) {id=ef}
        # Don't update flagged primitives (necessary? Could skip the branch if fixup does ok)
        <> set = not(eflag[i,j,k]) {dep=ef,nosync=ef}
        P[RHO,i,j,k] = rho0 {if=set}
        P[UU,i,j,k] = u {if=set}
        P[U1,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[1,i,j,k] + QdB[i,j,k] * Bcon[1,i,j,k] / W) {if=set}
        P[U2,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[2,i,j,k] + QdB[i,j,k] * Bcon[2,i,j,k] / W) {if=set}
        P[U3,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[3,i,j,k] + QdB[i,j,k] * Bcon[3,i,j,k] / W) {if=set}
        """))
        # NOTE(review): these instructions read U, but U is not passed to
        # knl_utop_set in the call below, and this snippet does not go
        # through replace_prim_names -- confirm the electrons path works.
        if 'electrons' in params and params['electrons']:
            code += add_ghosts("""
            P[KEL,i,j,k] = U[KEL,i,j,k]/U[RHO,i,j,k]
            P[KTOT,i,j,k] = U[KTOT,i,j,k]/U[RHO,i,j,k]
            """)
        knl_utop_set = lp.make_kernel(
            sh.isl_grid_scalar, code,
            [
                *primsArrayArgs("P"),
                *vecArrayArgs("Qtcon", "Bcon"),
                *scalarArrayArgs("D", "gamma", "Wp", "Bsq", "QdB"),
                *scalarArrayArgs("eflag", dtype=np.int32),
                ...
            ],
            assumptions=sh.assume_grid, default_offset=lp.auto)
        knl_utop_set = lp.fix_parameters(knl_utop_set, gam=params['gam'], nprim=params['n_prim'], ndim=4)
        knl_utop_set = tune_grid_kernel(knl_utop_set, sh.bulk_scalar, ng=G.NG)
        print("Compiled utop_set")

    # The kernel's P argument is bound to Pout, so results land in the output array.
    evt, _ = knl_utop_set(params['queue'], P=Pout, Qtcon=Qtcon, Bcon=Bcon, D=D, gamma=gamma, Wp=Wp,
                          Bsq=Bsq, QdB=QdB, eflag=eflag)
    evt.wait()

    del Qtcon, Bcon, D, gamma, Wp, Bsq, QdB

    # Trap on flags early in test problems
    # NOTE(review): the count below covers the whole eflag array (no s.bulk
    # slice), although the message says "in bulk" -- confirm which is meant.
    if 'debug' in params and params['debug']:
        n_nonzero = np.count_nonzero(eflag.get())
        if n_nonzero > 0:
            print("Nonzero eflag in bulk: {}\nFlags: {}".format(
                n_nonzero, np.argwhere(eflag.get() != 0)))

    return Pout, eflag