Ejemplo n.º 1
0
    def get_divergence_error(vector):
        """Return the relative (max, mean) divergence of ``vector``.

        Each of the three components of ``vector`` is passed through
        ``fft.idft`` into the enclosing-scope buffer ``vector_x``.  The
        divergence of ``vector_x`` is written to ``div`` and normalised by the
        sum of the absolute values of the three diagonal derivatives
        (d/dx of component 0, d/dy of component 1, d/dz of component 2), which
        are stored in ``pdx``.

        :returns: ``(max_err, avg_err)``, the ratio of maxima and the ratio of
            sums of ``|div|`` to that normalisation.
        """
        for axis in (0, 1, 2):
            fft.idft(vector[axis], vector_x[axis])

        derivs.divergence(queue, vector_x, div)

        # One derivative per component, each along its own axis.
        for axis, out_name in enumerate(("pdx", "pdy", "pdz")):
            derivs(queue, fx=vector_x[axis], **{out_name: pdx[axis]})

        abs_derivs = [clm.fabs(pdx[axis]) for axis in range(3)]
        norm = sum(abs_derivs)
        abs_div = clm.fabs(div)

        return cla.max(abs_div) / cla.max(norm), cla.sum(abs_div) / cla.sum(norm)
Ejemplo n.º 2
0
    def get_divergence_errors(hij):
        """Return relative divergence errors for rows 1..3 of the tensor ``hij``.

        For each row index ``i`` the three components
        ``hij[tensor_id(i, 1..3)]`` are passed through ``fft.idft`` into the
        enclosing-scope buffer ``vector_x``.  The divergence of that vector is
        written to ``div`` and normalised by the sum of the absolute values of
        its three diagonal derivatives, which are stored in ``pdx``.

        :returns: ``(max_errors, avg_errors)`` as two NumPy arrays with one
            entry per row.
        """
        max_errors, avg_errors = [], []

        for i in (1, 2, 3):
            for mu in range(3):
                fft.idft(hij[tensor_id(i, mu + 1)], vector_x[mu])

            derivs.divergence(queue, vector_x, div)

            # One derivative per component, each along its own axis.
            for mu, out_name in enumerate(("pdx", "pdy", "pdz")):
                derivs(queue, fx=vector_x[mu], **{out_name: pdx[mu]})

            abs_derivs = [clm.fabs(pdx[mu]) for mu in range(3)]
            norm = sum(abs_derivs)
            abs_div = clm.fabs(div)

            max_errors.append(cla.max(abs_div) / cla.max(norm))
            avg_errors.append(cla.sum(abs_div) / cla.sum(norm))

        return np.array(max_errors), np.array(avg_errors)
Ejemplo n.º 3
0
def test_outoforderqueue_clmath(ctx_factory):
    """Check chained clmath calls on an out-of-order command queue.

    The test is skipped when creating the out-of-order queue fails.
    """
    context = ctx_factory()
    try:
        ooo_mode = cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
        queue = cl.CommandQueue(context, properties=ooo_mode)
    except Exception:
        pytest.skip("out-of-order queue not available")

    host_in = np.random.rand(10**6).astype(np.dtype('float32'))
    dev_in = cl_array.to_device(queue, host_in)

    # testing that clmath functions wait for and create events
    scaled = dev_in * 5
    dev_out = clmath.fabs(clmath.sin(scaled))
    queue.finish()

    result = dev_out.get()
    expected = np.abs(np.sin(host_in * 5))
    mean_abs_err = np.abs(result - expected).mean()
    assert mean_abs_err < 1e-5
Ejemplo n.º 4
0
def test_outoforderqueue_clmath(ctx_factory):
    """Compare fabs(sin(5*a)) computed via clmath with the NumPy result.

    The computation runs on an out-of-order command queue; the test is
    skipped when such a queue cannot be created.
    """
    context = ctx_factory()
    try:
        props = cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
        queue = cl.CommandQueue(context, properties=props)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype('float32'))
    reference = np.abs(np.sin(a * 5))

    a_gpu = cl_array.to_device(queue, a)
    # testing that clmath functions wait for and create events
    sin_gpu = clmath.sin(a_gpu * 5)
    b_gpu = clmath.fabs(sin_gpu)
    queue.finish()

    deviation = np.abs(b_gpu.get() - reference)
    assert deviation.mean() < 1e-5
Ejemplo n.º 5
0
def test_spectral_poisson(ctx_factory, grid_shape, proc_shape, h, dtype,
                          timing=False):
    """Check that ``ps.SpectralPoissonSolver`` inverts the discrete Laplacian.

    A zero-mean random source ``rho`` is generated, the solver is run for
    several values of ``m_squared``, and the residual
    ``lap(fx) - rho - m_squared * fx`` is compared against ``|rho|`` in both
    the max and the sum norm.

    :arg ctx_factory: callable returning an OpenCL context; when falsy, a
        context is chosen with ``ps.choose_device_and_make_context``.
    :arg grid_shape: global number of grid points per axis.
    :arg proc_shape: process-grid shape passed to ``ps.DomainDecomposition``.
    :arg h: halo width.  ``h == 0`` selects spectral derivatives
        (``ps.SpectralCollocator``); otherwise centered finite differences
        with matching eigenvalues are used.
    :arg dtype: floating-point dtype of the device arrays; tolerances are
        tighter for ``np.float64``.
    :arg timing: when true, additionally time the solver and print the result
        on rank 0.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (3, 5, 7)
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))
    dk = tuple(2 * np.pi / Li for Li in L)

    if h == 0:
        def get_evals_2(k, dx):
            return - k**2

        derivs = ps.SpectralCollocator(fft, dk)
    else:
        from pystella.derivs import SecondCenteredDifference
        get_evals_2 = SecondCenteredDifference(h).get_eigenvalues
        derivs = ps.FiniteDifferencer(mpi, h, dx, stream=False)

    solver = ps.SpectralPoissonSolver(fft, dk, dx, get_evals_2)

    # The solution array carries h halo layers on each side of every axis.
    pencil_shape = tuple(ni + 2*h for ni in rank_shape)

    # np.prod, not np.product: the latter alias was removed in NumPy 2.0.
    statistics = ps.FieldStatistics(mpi, 0, rank_shape=rank_shape,
                                    grid_size=np.prod(grid_shape))

    fx = cla.empty(queue, pencil_shape, dtype)
    rho = clr.rand(queue, rank_shape, dtype)
    # Subtract the mean so the source has zero average.
    rho -= statistics(rho)["mean"]
    lap = cla.empty(queue, rank_shape, dtype)
    rho_h = rho.get()

    for m_squared in (0, 1.2, 19.2):
        solver(queue, fx, rho, m_squared=m_squared)
        fx_h = fx.get()
        if h > 0:
            # Strip the halo so fx_h matches the shape of lap and rho.
            fx_h = fx_h[h:-h, h:-h, h:-h]

        derivs(queue, fx=fx, lap=lap)

        diff = np.fabs(lap.get() - rho_h - m_squared * fx_h)
        max_err = np.max(diff) / cla.max(clm.fabs(rho))
        avg_err = np.sum(diff) / cla.sum(clm.fabs(rho))

        max_rtol = 1e-12 if dtype == np.float64 else 1e-4
        avg_rtol = 1e-13 if dtype == np.float64 else 1e-5

        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"solution inaccurate for {h=}, {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        # m_squared still holds the last value from the loop above.
        time = timer(lambda: solver(queue, fx, rho, m_squared=m_squared), ntime=10)

        if mpi.rank == 0:
            print(f"poisson took {time:.3f} ms for {grid_shape=}, {proc_shape=}")
Ejemplo n.º 6
0
def test_step(ctx_factory, proc_shape, dtype, Stepper):
    """Check accuracy and convergence order of ``Stepper`` on dy/dt = y**n.

    For each exponent ``n`` the equation is integrated until ``t`` reaches 0.1
    with three successively halved time steps.  The largest relative deviation
    from the closed-form expression in ``exact`` is recorded per time step.
    The test asserts that the error at the finest step is below a tolerance
    and that the error ratio between consecutive steps is at least
    ``0.9 * (a/b)**order``, where ``order`` is ``stepper.expected_order``.

    Only runs for ``proc_shape == (1, 1, 1)``; otherwise the test is skipped.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test step only on one rank")

    ctx = ctx_factory() if ctx_factory else ps.choose_device_and_make_context()
    queue = cl.CommandQueue(ctx)

    from pystella.step import LowStorageRKStepper
    is_low_storage = LowStorageRKStepper in Stepper.__bases__

    rank_shape = (1, 1, 8)
    init_vals = np.linspace(1, 3, 8)

    # Non-low-storage steppers carry an extra leading axis of num_copies
    # copies; the initial data always goes into (the first copy of) y.
    if is_low_storage:
        init_index = (0, 0, slice(None))
        y = cla.zeros(queue, rank_shape, dtype)
        y[init_index] = init_vals
        y0 = y.copy()
    else:
        init_index = (0, 0, 0, slice(None))
        y = cla.zeros(queue, (Stepper.num_copies, ) + rank_shape, dtype)
        y[init_index] = init_vals
        y0 = y[0].copy()

    dtlist = [.1, .05, .025]
    dt_pairs = list(zip(dtlist[:-1], dtlist[1:]))

    for n in [-1., -2., -3., -4.]:

        def exact(y_init, t):
            # Closed-form reference for dy/dt = y**n starting from y_init.
            base = (-1 + n) * (-t + y_init**(1 - n) / (-1 + n))
            return base**(1 / (1 - n))

        max_errs = {}
        for dt in dtlist:
            _y = ps.Field("y")
            stepper = Stepper({_y: _y**n},
                              dt=dt,
                              halo_shape=0,
                              rank_shape=rank_shape)

            # Reset the state before every run.
            y[init_index] = init_vals

            t = 0
            errs = []
            while t < .1:
                for s in range(stepper.num_stages):
                    stepper(s, queue=queue, y=y, filter_args=True)
                t += dt

                current = y if is_low_storage else y[0]
                rel_dev = clm.fabs(1. - exact(y0, t) / current)
                errs.append(cla.max(rel_dev).get())

            max_errs[dt] = np.max(errs)

        order = stepper.expected_order
        print(f"{order=}, {n=}")
        print(max_errs)
        print([max_errs[a] / max_errs[b] for a, b in dt_pairs])

        rtol = dtlist[-1]**order if dtype == np.float64 else 1e-1
        finest_err = list(max_errs.values())[-1]
        assert finest_err < rtol, \
            f"Stepper solution inaccurate for {n=}"

        for a, b in dt_pairs:
            assert max_errs[a] / max_errs[b] > .9 * (a/b)**order, \
                f"Stepper convergence failing for {n=}"
Ejemplo n.º 7
0
def U_to_P(params, G, U, P, Pout=None, iter_max=None):
    """Recover primitive variables from the conserved variables ``U``.

    The primitive B-field is obtained directly by dividing ``U[s.B3VEC]`` by
    the cell-centred ``gdet``.  The remaining primitives come from a 1D root
    find on ``Wp``: one derivative-based step built from ``err_eqn`` evaluated
    at ``Wp`` and ``Wp +/- h``, followed by up to ``iter_max`` secant
    iterations.  Zones that end with a nonzero ``eflag`` do not get their
    non-B primitives written by the final kernel.

    Parameters:
        params: dict of run parameters.  Reads 'queue', 'gam' and 'n_prim';
            optionally 'invert_iter_max' (default 8), 'invert_err_tol'
            (default 1e-8), 'invert_iter_delta' (default 1e-5), 'debug' and
            'electrons'.  'gamma_max' is inserted with value 25 when absent,
            so the caller's dict is modified.
        G: grid object supplying ``slices``, ``shapes``, ``NG``, the device
            arrays ``gdet_d`` / ``lapse_d`` and the geometry helpers called
            below.
        U: conserved variables, indexed with the entries of ``G.slices``.
        P: primitives; passed to ``Wp_func`` for the initial guess of ``Wp``
            and copied to create ``Pout`` when none is given.
        Pout: optional output array for the primitives.  Entries that are not
            updated here keep whatever the caller put in them.
        iter_max: optional cap on the number of secant iterations; overrides
            params['invert_iter_max'].

    Returns:
        (Pout, eflag), where ``eflag`` is an int32 device array that is
        nonzero in zones flagged by one of the steps below.

    Raises:
        ValueError: only when params['debug'] is truthy -- if any bulk zone
            is flagged before the root find, or if ``gamma < 1`` in any bulk
            zone afterwards.

    Note:
        The loopy kernels and the ``ncov`` / ``ncon`` / ``lgdet`` arrays are
        built on first use and kept in module globals.  ``err_tol``, ``gam``
        and ``n_prim`` are fixed into the kernels at that point, so
        (presumably, unless the globals are reset elsewhere) later calls with
        different values keep using the first ones.
    """

    s = G.slices
    sh = G.shapes

    # Set some default parameters, but allow overrides
    if iter_max is None:
        if 'invert_iter_max' in params:
            iter_max = params['invert_iter_max']
        else:
            iter_max = 8

    if 'invert_err_tol' in params:
        err_tol = params['invert_err_tol']
    else:
        err_tol = 1.e-8

    if 'invert_iter_delta' in params:
        delta = params['invert_iter_delta']
    else:
        delta = 1.e-5

    # Written back into params; presumably read by the helpers that receive
    # params below -- confirm.
    if 'gamma_max' not in params:
        params['gamma_max'] = 25

    # Don't overwrite old memory by default, just allow using old values
    # Caller can pass new/old memory but is responsible that it contain logical values if we don't update it
    if Pout is None:
        Pout = P.copy()

    # Update the primitive B-fields
    G.vecdivbygeom(params['queue'],
                   u=U[s.B3VEC],
                   g=G.gdet_d[Loci.CENT.value],
                   out=Pout[s.B3VEC])

    # Cached constant quantities
    global ncov, ncon, lgdet
    if ncov is None:
        # For later on
        # Only component 0 of ncov is filled (with -lapse); the others stay 0.
        ncov = cl_array.zeros(params['queue'],
                              sh.grid_vector,
                              dtype=np.float64)
        G.timesgeom(params['queue'],
                    u=cl_array.empty_like(ncov[0]).fill(1.0),
                    g=-G.lapse_d[Loci.CENT.value],
                    out=ncov[0])
        ncon = G.raise_grid(ncov)
        lgdet = G.lapse_d[Loci.CENT.value] / G.gdet_d[Loci.CENT.value]

    # Eflag will indicate inversion failures
    # Define a generic kernel so we can split out flagging in the future
    # Use it to catch negative density early on
    eflag = cl_array.zeros(params['queue'], sh.grid_scalar, dtype=np.int32)
    global knl_set_eflag
    if knl_set_eflag is None:
        code = add_ghosts(
            """eflag[i,j,k] = if(var[i,j,k] < 0, flag, eflag[i,j,k])""")
        knl_set_eflag = lp.make_kernel(
            sh.isl_grid_scalar,
            code, [
                *scalarArrayArgs("eflag", dtype=np.int32),
                *scalarArrayArgs("var"),
                lp.ValueArg("flag", dtype=np.int32), ...
            ],
            default_offset=lp.auto)
        knl_set_eflag = tune_grid_kernel(knl_set_eflag,
                                         sh.bulk_scalar,
                                         ng=G.NG)

    # Zones with negative conserved density get flag -100.
    evt, _ = knl_set_eflag(params['queue'],
                           var=U[s.RHO],
                           eflag=eflag,
                           flag=-100)

    # Convert from conserved variables to four-vectors
    # Bcon[0] is left at zero; only the spatial components are filled.
    Bcon = cl_array.zeros(params['queue'], sh.grid_vector, dtype=np.float64)
    G.vectimesgeom(params['queue'], u=U[s.B3VEC], g=lgdet, out=Bcon[1:])

    Qcov = cl_array.empty_like(Bcon)
    G.timesgeom(params['queue'], u=(U[s.UU] - U[s.RHO]), g=lgdet, out=Qcov[0])
    G.vectimesgeom(params['queue'], u=U[s.U3VEC], g=lgdet, out=Qcov[1:])

    Bcov = G.lower_grid(Bcon)
    Qcon = G.raise_grid(Qcov)

    # This will have fringes of zeros still!
    Bsq = G.dot(Bcon, Bcov)
    QdB = G.dot(Bcon, Qcov)
    Qdotn = G.dot(Qcon, ncov)
    Qsq = G.dot(Qcon, Qcov)

    Qtsq = Qsq + Qdotn**2

    Qtcon = cl_array.empty_like(Qcon)
    for i in range(4):
        Qtcon[i] = Qcon[i] + ncon[i] * Qdotn

    # Set up eqn for W', the energy density
    D = cl_array.zeros_like(Qsq)
    G.timesgeom(params['queue'], u=U[s.RHO], g=lgdet, out=D)
    Ep = -Qdotn - D

    # Drop references to temporaries that are no longer needed.
    del Bcov, Qcon, Qcov

    # Numerical rootfinding
    # Take guesses from primitives
    Wp = Wp_func(params, G, P, Loci.CENT, eflag)
    # Trap on any failures so far if debugging.  They're very rare.
    if 'debug' in params and params['debug']:
        if np.any(eflag.get()[s.bulk] != 0):
            raise ValueError("Unexpected flag set!")

    # Step around the guess & evaluate errors
    h = delta * Wp  # TODO stable enough?  Need fancy subtraction from iharm3d?
    errp = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp + h, eflag)
    err = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag)
    errm = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp - h, eflag)

    # Preserve Wp/err before updating them below
    Wp1 = Wp.copy()
    err1 = err.copy()

    global knl_utop_prep
    if knl_utop_prep is None:
        # TODO keep an accumulator here to avoid that costly any() call?
        code = add_ghosts("""
        # TODO put error/prep in here, not calling below

        # Attempt a Halley/Muller/Bailey/Press step
        dedW := (errp[i,j,k] - errm[i,j,k]) / (2 * h[i,j,k])
        dedW2 := (errp[i,j,k] - 2.*err[i,j,k] + errm[i,j,k]) / (h[i,j,k]**2)

        # Limit size of 2nd derivative correction
        # Loopy trick common in HARM: define intermediate variables (xt, xt2, xt3...) to impose clipping
        # This allows assignments or substitutions while keeping dependencies straight
        ft := 0.5*err[i,j,k]*dedW2/(dedW**2)
        ft2 := if(ft > 0.3, 0.3, ft)
        f := if(ft2 < -0.3, -0.3, ft2)

        # Limit size of step
        dWt := -err[i,j,k] / dedW / (1. - f)
        dWt2 := if(dWt < -0.5*Wp[i,j,k], -0.5*Wp[i,j,k], dWt)
        dW := if(dWt2 > 2.0*Wp[i,j,k], 2.0*Wp[i,j,k], dWt2)

        Wp[i,j,k] = Wp[i,j,k] + dW {id=wp}

        # Guarantee we take one step in every bulk zone
        stop_flag[i,j,k] = 0
        # This would avoid the step where there's convergence, but would require taking 2*dW above or similar
        #stop_flag[i,j,k] = (fabs(dW / Wp[i,j,k]) < err_tol) + \
        #                   (fabs(err[i,j,k] / Wp[i,j,k]) < err_tol) {dep=wp,nosync=wp}
        """)
        knl_utop_prep = lp.make_kernel(
            sh.isl_grid_scalar,
            code, [
                *scalarArrayArgs("err", "errp", "errm", "h", "Wp"),
                *scalarArrayArgs("stop_flag", dtype=np.int8), ...
            ],
            assumptions=sh.assume_grid,
            seq_dependencies=True)
        knl_utop_prep = lp.fix_parameters(knl_utop_prep, err_tol=err_tol)
        knl_utop_prep = tune_grid_kernel(knl_utop_prep,
                                         sh.bulk_scalar,
                                         ng=G.NG)
        print("Compiled utop_prep")

    # Fill stop_flag with 1 so we don't have to worry about ghost zones taking steps
    stop_flag = cl_array.empty(params['queue'], sh.grid_scalar,
                               dtype=np.int8).fill(1)

    evt, _ = knl_utop_prep(params['queue'],
                           err=err,
                           errp=errp,
                           errm=errm,
                           h=h,
                           Wp=Wp,
                           stop_flag=stop_flag)
    evt.wait()

    # Re-evaluate the error at the stepped Wp, writing into the existing array.
    err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag, out=err)

    # Iteration kernel for 1Dw solver
    global knl_utop_iter
    if knl_utop_iter is None:
        code = add_ghosts("""
        # Evaluate whether we need to do any of this
        <> go = not(stop_flag[i,j,k]) {id=insn_go}

        # Normal secant increment is dW. Limit guess to between 0.5 and 2 times current value
        dWt := (Wp1[i,j,k] - Wp[i,j,k]) * err[i,j,k] / (err[i,j,k] - err1[i,j,k])
        dWt2 := if(dWt < -0.5*Wp[i,j,k], -0.5*Wp[i,j,k], dWt)
        <> dW = if(dWt2 > 2.0*Wp[i,j,k], 2.0*Wp[i,j,k], dWt2) {id=dw,if=go}

        # Preserve last values, after use but before any changes
        Wp1[i,j,k] = Wp[i,j,k] {id=wp1,nosync=dw,if=go}
        err1[i,j,k] = err[i,j,k] {nosync=dw,if=go}

        # Update Wp.  Err will be updated outside kernel
        Wp[i,j,k] = Wp[i,j,k] + dW {id=wp,nosync=dw:wp1,if=go}

        # Set flag not to continue in zones that have converged
        stop_flag[i,j,k] = if(fabs(dW / Wp[i,j,k]) < err_tol, 1, stop_flag[i,j,k]) {nosync=dw:wp:insn_go,if=go}

        # For the future, when we've defined err_eqn for loopy kernels
        # err = err_eqn(Bsq, D, Ep, QdB, Qtsq, Wp, gam, gamma_max, eflag) {if=go}
        # stop_flag[i,j,k] = stop_flag[i,j,k] + (fabs(err[] / Wp[]) < err_tol) {if=go}
        """)
        knl_utop_iter = lp.make_kernel(
            sh.isl_grid_scalar,
            code, [
                *scalarArrayArgs("Wp", "Wp1", "err", "err1"),
                *scalarArrayArgs("stop_flag", dtype=np.int8), ...
            ],
            assumptions=sh.assume_grid,
            default_offset=lp.auto,
            seq_dependencies=True)
        knl_utop_iter = lp.fix_parameters(knl_utop_iter, err_tol=err_tol)
        knl_utop_iter = tune_grid_kernel(knl_utop_iter,
                                         sh.bulk_scalar,
                                         ng=G.NG)
        print("Compiled utop_iter")

    # Iterate at least once to set new values from first step
    # TODO Needed now we set Wp, err1 right?
    for niter in range(iter_max):
        #print("U_to_P iter")
        evt, _ = knl_utop_iter(params['queue'],
                               Wp=Wp,
                               Wp1=Wp1,
                               err=err,
                               err1=err1,
                               stop_flag=stop_flag)
        # Unlike the out=err call above, this rebinds err to a fresh result.
        err = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag)
        # TODO there may be better/faster if/reduction statements here...
        stop_flag |= (clm.fabs(err / Wp) < err_tol)
        # Stop early once every zone (ghosts were pre-filled with 1) is done.
        if cl_array.min(stop_flag) >= 1:
            break

    # If secant method failed to converge, do not set primitives other than B
    # Adds 1 to eflag in every zone whose stop_flag is still 0.
    eflag += (stop_flag == 0)

    del Wp1, err, err1, stop_flag

    # Find utsq, gamma, rho0 from Wp
    gamma = gamma_func(params, G, Bsq, D, QdB, Qtsq, Wp, eflag)

    if 'debug' in params and params['debug']:
        if np.any(gamma.get()[s.bulk] < 1.):
            raise ValueError("gamma < 1 failure!")

    # Find the scalars
    global knl_utop_set
    if knl_utop_set is None:
        code = add_ghosts(
            replace_prim_names("""
        rho0 := D[i,j,k] / gamma[i,j,k]
        W := Wp[i,j,k] + D[i,j,k]
        w := W / (gamma[i,j,k]**2)
        pres := (w - rho0) * (gam - 1.) / gam
        u := w - (rho0 + pres)

        # Set flag if prims are < 0
        eflag[i,j,k] = if((u < 0)*(rho0 < 0), 8, if(u < 0, 7, if(rho0 < 0, 6, eflag[i,j,k]))) {id=ef}

        # Don't update flagged primitives (necessary?  Could skip the branch if fixup does ok)
        <> set = not(eflag[i,j,k]) {dep=ef,nosync=ef}

        P[RHO,i,j,k] = rho0 {if=set}
        P[UU,i,j,k] = u     {if=set}
        P[U1,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[1,i,j,k] + QdB[i,j,k] * Bcon[1,i,j,k] / W) {if=set}
        P[U2,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[2,i,j,k] + QdB[i,j,k] * Bcon[2,i,j,k] / W) {if=set}
        P[U3,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[3,i,j,k] + QdB[i,j,k] * Bcon[3,i,j,k] / W) {if=set}
        """))
        # NOTE(review): the electron instructions below read an array U, but
        # the knl_utop_set call further down does not pass U, and this snippet
        # is not run through replace_prim_names like the one above -- confirm
        # the 'electrons' path actually works.
        if 'electrons' in params and params['electrons']:
            code += add_ghosts("""
            P[KEL,i,j,k] = U[KEL,i,j,k]/U[RHO,i,j,k]
            P[KTOT,i,j,k] = U[KTOT,i,j,k]/U[RHO,i,j,k]
            """)
        knl_utop_set = lp.make_kernel(
            sh.isl_grid_scalar,
            code, [
                *primsArrayArgs("P"), *vecArrayArgs("Qtcon", "Bcon"),
                *scalarArrayArgs("D", "gamma", "Wp", "Bsq", "QdB"),
                *scalarArrayArgs("eflag", dtype=np.int32), ...
            ],
            assumptions=sh.assume_grid,
            default_offset=lp.auto)
        knl_utop_set = lp.fix_parameters(knl_utop_set,
                                         gam=params['gam'],
                                         nprim=params['n_prim'],
                                         ndim=4)
        knl_utop_set = tune_grid_kernel(knl_utop_set, sh.bulk_scalar, ng=G.NG)
        print("Compiled utop_set")

    evt, _ = knl_utop_set(params['queue'],
                          P=Pout,
                          Qtcon=Qtcon,
                          Bcon=Bcon,
                          D=D,
                          gamma=gamma,
                          Wp=Wp,
                          Bsq=Bsq,
                          QdB=QdB,
                          eflag=eflag)
    evt.wait()
    del Qtcon, Bcon, D, gamma, Wp, Bsq, QdB

    # Trap on flags early in test problems
    # NOTE(review): the count covers the whole eflag array, not eflag[s.bulk]
    # as in the earlier debug checks, although the message says "in bulk".
    if 'debug' in params and params['debug']:
        n_nonzero = np.count_nonzero(eflag.get())
        if n_nonzero > 0:
            print("Nonzero eflag in bulk: {}\nFlags: {}".format(
                n_nonzero, np.argwhere(eflag.get() != 0)))

    return Pout, eflag