Esempio n. 1
0
def test_variable_size_matrix_mul(ctx_factory):
    """Check a run-time-sized matrix multiply against the reference kernel."""
    ctx = ctx_factory()
    size = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, dict(a=np.float32, b=np.float32))

    # Keep the untransformed kernel for verification.
    ref_knl = knl

    # Tile i/j onto the workgroup grid; block k for prefetching.
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1",
                         slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0",
                         slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    # Stage tiles of both inputs.
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
                        op_count=[2*size**3/1e9], op_label=["GFlops"],
                        parameters={"n": size})
Esempio n. 2
0
def test_variable_size_matrix_mul(ctx_factory):
    """Auto-test a variable-size matmul kernel against its reference."""
    ctx = ctx_factory()

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, dict(a=np.float32, b=np.float32))

    ref_knl = knl

    # Map i and j to the two workgroup/grid axes.
    for iname, outer, inner in (("i", "g.0", "l.1"),
                                ("j", "g.1", "l.0")):
        knl = lp.split_iname(knl, iname, 16,
                             outer_tag=outer, inner_tag=inner,
                             slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
                        op_count=[2*n**3/1e9], op_label=["GFlops"],
                        parameters={"n": n})
Esempio n. 3
0
def network_time_step(
        model: model.BaseKernel,
        coupling: coupling.BaseCoupling,
        scheme: scheme.TimeStepScheme,
        target: lp.target.TargetBase=None,
        ):
    """Fuse model, network-coupling and time-stepping kernels into a single
    loopy kernel that runs a sequential time-stepping loop.

    :arg model: model whose per-node kernel is generated.
    :arg coupling: coupling function used to build the network kernel.
    :arg scheme: time-stepping scheme applied to the computed drift.
    :arg target: loopy code-generation target; defaults to
        ``utils.default_target()`` when *None*.
    :returns: the fused, batched loopy kernel.

    NOTE(review): the parameter names shadow the ``model``/``coupling``/
    ``scheme`` modules referenced in the annotations; the annotations are
    evaluated at definition time, so this works, but it is fragile.
    """
    target = target or utils.default_target()
    # fuse kernels
    kernels = [
        model.kernel(target),
        network.Network(model, coupling).kernel(target),
        # scheme kernel needs the number of state variables fixed in
        lp.fix_parameters(scheme.kernel(target), nsvar=len(model.state_sym)),
    ]
    # (array name, from-kernel index, to-kernel index) triples describing
    # which kernel produces each array and which consumes it — presumably
    # matching lp.fuse_kernels' data_flow convention; TODO confirm
    data_flow = [
        ('input', 1, 0),
        ('diffs', 0, 2),
        ('drift', 0, 2),
        ('state', 2, 0)
    ]
    knl = lp.fuse_kernels(kernels, data_flow=data_flow)
    # time step
    # Wrap the fused kernel in a sequential batch loop over 'i_step',
    # and index time circularly with a caller-supplied offset i_step_0.
    knl = lp.to_batched(knl, 'nstep', [], 'i_step', sequential=True)
    new_i_time = pm.parse('(i_step + i_step_0) % ntime')
    knl = lp.fix_parameters(knl, i_time=new_i_time)
    knl.args.append(lp.ValueArg('i_step_0', np.uintc))
    knl = lp.add_dtypes(knl, {'i_step_0': np.uintc})
    return knl
Esempio n. 4
0
def test_variable_size_matrix_mul(ctx_factory):
    """Variable-size matmul; skipped on pocl, where it crashes."""
    ctx = ctx_factory()
    dev = ctx.devices[0]

    if not dev.image_support or dev.platform.name == "Portable Computing Language":
        pytest.skip("crashes on pocl")

    mat_size = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, dict(a=np.float32, b=np.float32))

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1",
                         slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0",
                         slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    # Prefetch tiles of both inputs.
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
                        op_count=[2*mat_size**3/1e9], op_label=["GFlops"],
                        parameters={"n": mat_size})
Esempio n. 5
0
def make_knl():
    """Build a fused Kuramoto-network time-stepping kernel for the Numba
    target.

    :returns: a ``(knl, osc)`` tuple: the fused, batched loopy kernel and
        the configured Kuramoto oscillator model.
    """
    target = NumbaTarget()

    # build individual kernels
    osc = model.Kuramoto()
    osc.dt = 1.0
    # presumably 10 Hz expressed in rad per time unit (ms) — TODO confirm
    osc.const['omega'] = 10.0 * 2.0 * np.pi / 1e3
    osc_knl = osc.kernel(target)

    cfun = coupling.Kuramoto(osc)
    # leave coupling strength 'a' as a symbolic run-time parameter
    cfun.param['a'] = pm.parse('a')
    net = network.Network(osc, cfun)
    net_knl = net.kernel(target)

    scm = scheme.EulerStep(osc.dt)
    scm_knl = scm.kernel(target)
    # the scheme kernel needs the state-variable count fixed in
    scm_knl = lp.fix_parameters(scm_knl, nsvar=len(osc.state_sym))

    # fuse kernels
    knls = osc_knl, net_knl, scm_knl
    # (array, from-kernel index, to-kernel index) — presumably matching
    # lp.fuse_kernels' data_flow convention; TODO confirm
    data_flow = [('input', 1, 0), ('diffs', 0, 2), ('drift', 0, 2),
                 ('state', 2, 0)]
    knl = lp.fuse_kernels(knls, data_flow=data_flow)

    # and time step
    # Sequential batch loop over 'i_step'; time index wraps modulo ntime,
    # offset by the caller-supplied i_step_0.
    knl = lp.to_batched(knl, 'nstep', [], 'i_step', sequential=True)
    knl = lp.fix_parameters(knl,
                            i_time=pm.parse('(i_step + i_step_0) % ntime'))
    knl.args.append(lp.ValueArg('i_step_0', np.uintc))
    knl = lp.add_dtypes(knl, {'i_step_0': np.uintc})

    return knl, osc
Esempio n. 6
0
def test_variable_size_matrix_mul(ctx_factory):
    """Variable-size matmul with auto-tagged local prefetches; pocl skipped."""
    ctx = ctx_factory()
    dev = ctx.devices[0]

    if not dev.image_support or dev.platform.name == "Portable Computing Language":
        pytest.skip("crashes on pocl")

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, dict(a=np.float32, b=np.float32))

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1",
                         slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0",
                         slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    # Prefetch both inputs; let loopy pick local-axis tags.
    for ary, fetch_inames in (("a", ["k_inner", "i_inner"]),
                              ("b", ["j_inner", "k_inner"])):
        knl = lp.add_prefetch(knl, ary, fetch_inames, default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
                        op_count=[2*n**3/1e9], op_label=["GFlops"],
                        parameters={"n": n})
Esempio n. 7
0
 def test_simple(self):
     """Doubling kernel on the Numba target; result checked against NumPy."""
     knl = lp.make_kernel("{ [i]: 0<=i<n }",
                          "out[i] = 2*a[i]",
                          target=NumbaTarget())
     typed = lp.add_dtypes(knl, {'a': np.float32})
     a, out = np.zeros((2, 10), np.float32)
     a[:] = np.r_[:a.size]
     typed(a, 10, out)
     np.testing.assert_allclose(out, a * 2)
Esempio n. 8
0
 def test_simple_kernel(self):
     """Generate C code for a doubling kernel and run it via CompiledKernel."""
     typed = lp.add_dtypes(
         lp.make_kernel("{ [i]: 0<=i<n }",
                        "out[i] = 2*a[i]",
                        target=CTarget()),
         {'a': np.float32})
     code, _ = lp.generate_code(typed)
     fn = CompiledKernel(typed)  # noqa
     a, out = np.zeros((2, 10), np.float32)
     a[:] = np.r_[:a.size]
     fn(a, 10, out)
     np.testing.assert_allclose(out, a * 2)
Esempio n. 9
0
def test_force_outer_iname_for_scan():
    """Scan realization must fail without — and succeed with — an explicit
    outer iname."""
    knl = lp.add_dtypes(
        lp.make_kernel(
            "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
            "out[i] = product(j, a[j]) {inames=i:k}"),
        {"a": np.float32})

    # TODO: Maybe this deserves to work?
    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(knl, force_scan=True)

    knl = lp.realize_reduction(knl, force_scan=True,
                               force_outer_iname_for_scan="i")
Esempio n. 10
0
def test_rob_stroud_bernstein(ctx_factory):
    """Generate code for a Bernstein-basis evaluation kernel (Rob Stroud
    example) with unrolled alpha loops and ILP over elements, and print
    the generated source.
    """
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    knl = lp.make_kernel(
            "{[el, i2, alpha1,alpha2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
            """
            for el,i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    for alpha2
                        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                                {id=write_tmp,dep=init_w:aind_init}
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            """,
            [
                # Must declare coeffs to have "no" shape, to keep loopy
                # from trying to figure it out the shape automatically.

                lp.GlobalArg("coeffs", None, shape=None),
                "..."
                ],
            assumptions="deg>=0 and nels>=1",
            target=lp.PyOpenCLTarget(ctx.devices[0])
            )

    # Fix problem sizes, then parallelize elements: 16-wide local axis,
    # with pairs of workgroups ILP'd along g.0.
    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
            slabs=(0, 1))
    # Unroll both alpha loops; quadrature points on the second local axis.
    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
    knl = lp.add_dtypes(knl, dict(
                qpts=np.float32,
                coeffs=np.float32,
                tmp=np.float32,
                ))
    print(lp.generate_code_v2(knl))
Esempio n. 11
0
def test_force_outer_iname_for_scan():
    """Without force_outer_iname_for_scan the scan is ambiguous and raises;
    naming "i" as the outer iname makes realization succeed."""
    kernel = lp.make_kernel(
        "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
        "out[i] = product(j, a[j]) {inames=i:k}")
    kernel = lp.add_dtypes(kernel, {"a": np.float32})

    # TODO: Maybe this deserves to work?
    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(kernel, force_scan=True)

    kernel = lp.realize_reduction(kernel, force_scan=True,
                                  force_outer_iname_for_scan="i")
Esempio n. 12
0
    def get_kernel(self):
        """Assemble the loopy kernel evaluating this P2P interaction for
        explicit (tgtindices, srcindices) index pairs, writing one entry of
        each ``result_i`` array per pair.

        :returns: the fully assembled, dtype-annotated loopy kernel.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
        kernel_exprs = self.get_kernel_exprs(result_names)
        # One flat result array per value dtype, all indexed by 'nresult'.
        arguments = (self.get_default_src_tgt_arguments() + [
            lp.GlobalArg("srcindices", None, shape="nresult"),
            lp.GlobalArg("tgtindices", None, shape="nresult"),
            lp.ValueArg("nresult", None)
        ] + [
            lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
            for i, dtype in enumerate(self.value_dtypes)
        ])

        loopy_knl = lp.make_kernel(
            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
            self.get_kernel_scaling_assignments()
            # NOTE: itgt, isrc need to always be defined in case a statement
            # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
            # places like get_kernel_exprs)
            + [
                """
                for imat
                    <> itgt = tgtindices[imat]
                    <> isrc = srcindices[imat]

                    <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
            """
            ] + [
                """
                    <> is_self = (isrc == target_to_source[itgt])
                """ if self.exclude_self else ""
            ] + loopy_insns + kernel_exprs + [
                """
                    result_{i}[imat] = \
                        knl_{i}_scaling * pair_result_{i} \
                            {{id_prefix=write_p2p}}
                """.format(i=iknl) for iknl in range(len(self.kernels))
            ] + ["end"],
            arguments,
            assumptions="nresult>=1",
            silenced_warnings="write_race(write_p2p*)",
            name=self.name,
            fixed_parameters=dict(dim=self.dim),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        # Unroll all dimension loops; counts get concrete int32 dtypes.
        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.add_dtypes(loopy_knl,
                                  dict(nsources=np.int32, ntargets=np.int32))

        # Let each constituent kernel apply its own final tweaks.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
Esempio n. 13
0
File: p2p.py Project: inducer/sumpy
    def get_kernel(self):
        """Build the loopy kernel that evaluates this P2P interaction at
        user-supplied (tgtindices, srcindices) pairs, storing one value per
        pair in each ``result_i`` array.

        :returns: the assembled loopy kernel, with iname tags and integer
            dtypes applied.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
        kernel_exprs = self.get_kernel_exprs(result_names)
        # Index arrays plus one flat output array per value dtype.
        arguments = (
            self.get_default_src_tgt_arguments()
            + [
                lp.GlobalArg("srcindices", None, shape="nresult"),
                lp.GlobalArg("tgtindices", None, shape="nresult"),
                lp.ValueArg("nresult", None)
            ]
            + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
             for i, dtype in enumerate(self.value_dtypes)])

        loopy_knl = lp.make_kernel(
            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
            self.get_kernel_scaling_assignments()
            # NOTE: itgt, isrc need to always be defined in case a statement
            # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
            # places like get_kernel_exprs)
            + ["""
                for imat
                    <> itgt = tgtindices[imat]
                    <> isrc = srcindices[imat]

                    <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
            """]
            + ["""
                    <> is_self = (isrc == target_to_source[itgt])
                """ if self.exclude_self else ""]
            + loopy_insns + kernel_exprs
            + ["""
                    result_{i}[imat] = \
                        knl_{i}_scaling * pair_result_{i} \
                            {{id_prefix=write_p2p}}
                """.format(i=iknl)
                for iknl in range(len(self.kernels))]
            + ["end"],
            arguments,
            assumptions="nresult>=1",
            silenced_warnings="write_race(write_p2p*)",
            name=self.name,
            fixed_parameters=dict(dim=self.dim),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        # Unroll dimension loops; annotate the integer size arguments.
        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.add_dtypes(loopy_knl,
            dict(nsources=np.int32, ntargets=np.int32))

        # Give each constituent kernel a chance to adjust the result.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
Esempio n. 14
0
def test_scan_data_types(ctx_factory, dtype):
    """Prefix-sum scan should round-trip for the parametrized dtype."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("{[i,j]: 0<=i<n and 0<=j<=i }",
                         "res[i] = reduce(sum, j, a[j])",
                         assumptions="n>=1")
    knl = lp.add_dtypes(knl, {"a": dtype})
    knl = lp.realize_reduction(knl, force_scan=True)

    data = np.random.randn(20).astype(dtype)
    evt, (res,) = knl(queue, a=data)

    assert np.allclose(res, np.cumsum(data))
Esempio n. 15
0
def test_scan_data_types(ctx_factory, dtype):
    """Scan (cumulative sum) correctness check across dtypes."""
    queue = cl.CommandQueue(ctx_factory())

    scan_knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<=i }",
            "res[i] = reduce(sum, j, a[j])",
            assumptions="n>=1")

    sample = np.random.randn(20).astype(dtype)
    scan_knl = lp.add_dtypes(scan_knl, {"a": dtype})
    scan_knl = lp.realize_reduction(scan_knl, force_scan=True)
    evt, (res,) = scan_knl(queue, a=sample)

    assert np.allclose(res, np.cumsum(sample))
Esempio n. 16
0
def test_scan_library(ctx_factory, op_name, np_op):
    """Exercise scan realization for a library reduction operation.

    :arg op_name: loopy reduction operation name interpolated into the kernel.
    :arg np_op: matching NumPy reference operation applied to each prefix.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("{[i,j]: 0<=i<n and 0<=j<=i }",
                         "res[i] = reduce(%s, j, a[j])" % op_name,
                         assumptions="n>=1")

    a = np.random.randn(20)
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; use
    # np.float64, which is what np.random.randn produces anyway.
    knl = lp.add_dtypes(knl, dict(a=np.float64))
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (res, ) = knl(queue, a=a)

    assert np.allclose(res,
                       np.array([np_op(a[:i + 1]) for i in range(len(a))]))
Esempio n. 17
0
def test_funny_shape_matrix_mul(ctx_factory):
    """Matmul whose three extents n, m, ell all differ, using
    extract_subst + precompute instead of add_prefetch."""
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    m = n + 12
    ell = m + 12

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         name="matmul",
                         assumptions="n,m,ell >= 1")
    knl = lp.add_dtypes(knl, dict(a=np.float32, b=np.float32))

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    # Express both input accesses as substitution rules, then precompute
    # them with auto-assigned local tags (equivalent to add_prefetch).
    for rule, access in (("a_acc", "a[i1,i2]"), ("b_acc", "b[i1,i2]")):
        knl = lp.extract_subst(knl, rule, access, parameters="i1, i2")
    for rule, fetch_inames in (("a_acc", "k_inner,i_inner"),
                               ("b_acc", "j_inner,k_inner")):
        knl = lp.precompute(knl,
                            rule,
                            fetch_inames,
                            precompute_outer_inames="i_outer, j_outer, k_outer",
                            default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl,
                        ctx,
                        knl,
                        op_count=[2 * n**3 / 1e9],
                        op_label=["GFlops"],
                        parameters={"n": n, "m": m, "ell": ell})
Esempio n. 18
0
def test_scan_library(ctx_factory, op_name, np_op):
    """Scan-library test: each prefix reduction is compared against np_op.

    :arg op_name: loopy reduction operation name for the kernel.
    :arg np_op: NumPy reference applied to each prefix of the input.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<=i }",
            "res[i] = reduce(%s, j, a[j])" % op_name,
            assumptions="n>=1")

    a = np.random.randn(20)
    # np.float was removed in NumPy 1.24; np.float64 matches randn's output.
    knl = lp.add_dtypes(knl, dict(a=np.float64))
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (res,) = knl(queue, a=a)

    assert np.allclose(res, np.array(
            [np_op(a[:i+1]) for i in range(len(a))]))
Esempio n. 19
0
def test_parallel_multi_output_reduction(ctx_factory):
    """argmax reduction run with i parallelized over the local axis,
    producing both the max value and its index."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<128}", """
                max_val, max_indices = argmax(i, abs(a[i]), i)
                """)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.add_dtypes(knl, {"a": np.float64})

    with cl.CommandQueue(ctx) as queue:
        data = np.random.rand(128)
        out, (max_index, max_val) = knl(queue, a=data)

        assert max_val == np.max(data)
        assert max_index == np.argmax(np.abs(data))
Esempio n. 20
0
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    """Local-parallel prefix sum where the scan iname starts at 1."""
    queue = cl.CommandQueue(ctx_factory())

    knl = lp.make_kernel("[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}", """
        out[i-1] = sum(j, a[j]**2)
        """, "...")

    knl = lp.fix_parameters(knl, n=16)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    # Realize the scan first, then any remaining reductions.
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dtypes(knl, {"a": int})

    evt, (result,) = knl(queue, a=np.arange(1, 17))

    assert (result == np.cumsum(np.arange(1, 17)**2)).all()
Esempio n. 21
0
def test_parallel_multi_output_reduction(ctx_factory):
    """argmax kernel with an explicitly realized reduction; checks that
    both outputs (value and index) match NumPy."""
    knl = lp.make_kernel(
                "{[i]: 0<=i<128}",
                """
                max_val, max_indices = argmax(i, abs(a[i]), i)
                """)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.add_dtypes(knl, {"a": np.float64})
    knl = lp.realize_reduction(knl)

    with cl.CommandQueue(ctx_factory()) as queue:
        data = np.random.rand(128)
        out, (max_index, max_val) = knl(queue, a=data)

        assert max_val == np.max(data)
        assert max_index == np.argmax(np.abs(data))
Esempio n. 22
0
def test_local_parallel_scan(ctx_factory, n):
    """Work-group-parallel prefix sum of squares for a given n."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("[n] -> {[i,j]: 0<=i<n and 0<=j<=i}", """
        out[i] = sum(j, a[j]**2)
        """, "...")

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    # Realize the scan, then any leftover reductions.
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dtypes(knl, {"a": int})

    print(knl)

    evt, (result,) = knl(queue, a=np.arange(n))
    assert (result == np.cumsum(np.arange(n)**2)).all()
Esempio n. 23
0
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    """Like the plain local scan test, but the scan index begins at 1."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    kernel = lp.make_kernel(
        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
        """
        out[i-1] = sum(j, a[j]**2)
        """,
        "..."
        )
    kernel = lp.fix_parameters(kernel, n=16)
    kernel = lp.tag_inames(kernel, dict(i="l.0"))
    kernel = lp.realize_reduction(kernel, force_scan=True)
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dtypes(kernel, dict(a=int))

    evt, (out,) = kernel(queue, a=np.arange(1, 17))
    assert (out == np.cumsum(np.arange(1, 17)**2)).all()
Esempio n. 24
0
def test_funny_shape_matrix_mul(ctx_factory):
    """Matmul with three distinct extents, fetches staged via
    extract_subst/precompute rather than add_prefetch."""
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    m = n + 12
    ell = m + 12

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_dtypes(knl, dict(a=np.float32, b=np.float32))

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    # Turn the a/b accesses into substitution rules, then precompute
    # them into auto-tagged temporaries.
    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto")

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl,
        op_count=[2*n**3/1e9], op_label=["GFlops"],
        parameters={"n": n, "m": m, "ell": ell})
Esempio n. 25
0
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Monte-Carlo-style sum of philox4x32-generated values, reduced across
    the whole grid via a precomputed global-memory staging array, verified
    against an untransformed reference kernel.
    """
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }", """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    # The untransformed kernel serves as the correctness reference.
    ref_knl = knl
    ref_knl = lp.add_dtypes(ref_knl, {"n": np.int32})

    # Split i into (outer, workgroup, lane) and push the two inner levels
    # inward so each level of the reduction can be realized separately.
    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    # Stage the outer-reduction argument in global memory.
    knl = lp.precompute(knl,
                        "red_i_outer_arg",
                        "i_outer",
                        temporary_address_space=lp.AddressSpace.GLOBAL,
                        default_tag="l.auto")
    knl = lp.preprocess_kernel(knl)
    # Make the accumulator update wait for the staging barrier.
    knl = lp.add_dependency(knl, "writes:acc_i_outer",
                            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": size})
Esempio n. 26
0
def test_local_parallel_scan(ctx_factory, n):
    """Prefix sum of squares with the scan iname mapped onto l.0."""
    queue = cl.CommandQueue(ctx_factory())

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
        """
        out[i] = sum(j, a[j]**2)
        """,
        "..."
        )

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dtypes(knl, dict(a=int))

    print(knl)

    evt, (a,) = knl(queue, a=np.arange(n))
    assert (a == np.cumsum(np.arange(n)**2)).all()
Esempio n. 27
0
 def _dtype_and_code(self, knl, **extra_dtypes):
     """Attach float32 in/out dtypes (plus any overrides) and return the
     generated source code for *knl*."""
     dtype_map = {'in': np.float32, 'out': np.float32}
     dtype_map.update(extra_dtypes)
     code, _ = lp.generate_code(lp.add_dtypes(knl, dtype_map))
     return code
Esempio n. 28
0
def call_loopy(translation_unit: "lp.TranslationUnit",
               bindings: Dict[str, ArrayOrScalar],
               entrypoint: Optional[str] = None) -> LoopyCall:
    """
    Invokes an entry point of a :class:`loopy.TranslationUnit` on the array inputs as
    specified by *bindings*.

    Restrictions on the structure of ``translation_unit[entrypoint]``:

    * array arguments of ``translation_unit[entrypoint]`` must be either
      input-only or output-only.
    * all input-only arguments of ``translation_unit[entrypoint]`` must appear in
      *bindings*.
    * output-only arguments of ``translation_unit[entrypoint]`` must *not*
      appear in *bindings*; they are produced by the call.
    * if *translation_unit* has been declared with multiple entrypoints,
      *entrypoint* can not be *None*.

    :arg translation_unit: the translation unit to call.
    :arg bindings: mapping from argument names of ``translation_unit[entrypoint]``
      to :class:`pytato.array.Array`.
    :arg entrypoint: the entrypoint of the ``translation_unit`` parameter.
    :raises ValueError: if *entrypoint* cannot be inferred, if the kernel has
      an in/out (stateful) argument, or if *bindings* is inconsistent with
      the kernel's argument list.
    """
    if entrypoint is None:
        if len(translation_unit.entrypoints) != 1:
            raise ValueError("cannot infer entrypoint")

        entrypoint, = translation_unit.entrypoints

    translation_unit = translation_unit.with_entrypoints(entrypoint)

    # {{{ sanity checks

    # Use a generator, not a list, so the scan short-circuits on the first
    # offending argument.
    if any(arg.is_input and arg.is_output
           for arg in translation_unit[entrypoint].args):
        # Pytato DAG cannot have stateful nodes.
        raise ValueError("Cannot call a kernel with side-effects.")

    for name in bindings:
        if name not in translation_unit[entrypoint].arg_dict:
            raise ValueError(f"Kernel '{entrypoint}' got an unexpected input: "
                             f"'{name}'.")
        if translation_unit[entrypoint].arg_dict[name].is_output:
            raise ValueError(
                f"Kernel '{entrypoint}' got an output arg '{name}' "
                f"as input.")

    # {{{ perform shape inference here

    bindings = extend_bindings_with_shape_inference(
        translation_unit[entrypoint], pmap(bindings))

    # }}}

    for arg in translation_unit[entrypoint].args:
        if arg.is_input:
            if arg.name not in bindings:
                raise ValueError(f"Kernel '{entrypoint}' expects an input"
                                 f" '{arg.name}'")

            arg_binding = bindings[arg.name]

            if isinstance(arg, (lp.ArrayArg, lp.ConstantArg)):
                if not isinstance(arg_binding, Array):
                    raise ValueError(f"Argument '{arg.name}' expected to be a "
                                     f"pytato.Array, got {type(arg_binding)}.")
            else:
                assert isinstance(arg, lp.ValueArg)
                if not (isinstance(arg_binding, Number) or
                        (isinstance(arg_binding, Array)
                         and arg_binding.shape == ())):
                    raise ValueError(f"Argument '{arg.name}' expected to be a "
                                     " number or a scalar expression, got "
                                     f"{type(arg_binding)}.")

    # }}}

    # {{{ infer types of the translation_unit

    # Fill in dtypes the kernel left unspecified from the bound values.
    for name, ary in bindings.items():
        if translation_unit[entrypoint].arg_dict[name].dtype not in [
                lp.auto, None
        ]:
            continue

        if isinstance(ary, Array):
            translation_unit = lp.add_dtypes(translation_unit,
                                             {name: ary.dtype})
        else:
            assert isinstance(ary, Number)
            translation_unit = lp.add_dtypes(translation_unit,
                                             {name: type(ary)})

    translation_unit = lp.infer_unknown_types(translation_unit)

    # }}}

    # {{{ infer shapes of the translation_unit

    translation_unit = lp.infer_arg_descr(translation_unit)

    # }}}

    translation_unit = translation_unit.with_entrypoints(frozenset())

    return LoopyCall(translation_unit, bindings, entrypoint)
Esempio n. 29
0
File: p2p.py Project: inducer/sumpy
    def get_kernel(self):
        """Assemble the loopy kernel that accumulates this P2P interaction
        over box lists: for every target box, iterate its source boxes and
        add each kernel's contribution into ``result[i, itgt]``.

        :returns: the assembled loopy kernel with iname tags and axis tags
            applied.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
        kernel_exprs = self.get_kernel_exprs(result_names)
        # Box bookkeeping arrays are declared shapeless so loopy does not
        # try to infer their shapes; strength/result use 'sep' leading axes.
        arguments = (
            self.get_default_src_tgt_arguments()
            + [
                lp.GlobalArg("box_target_starts",
                    None, shape=None),
                lp.GlobalArg("box_target_counts_nonchild",
                    None, shape=None),
                lp.GlobalArg("box_source_starts",
                    None, shape=None),
                lp.GlobalArg("box_source_counts_nonchild",
                    None, shape=None),
                lp.GlobalArg("source_box_starts",
                    None, shape=None),
                lp.GlobalArg("source_box_lists",
                    None, shape=None),
                lp.GlobalArg("strength", None,
                    shape="nstrengths, nsources", dim_tags="sep,C"),
                lp.GlobalArg("result", None,
                    shape="nkernels, ntargets", dim_tags="sep,C"),
                "..."
            ])

        loopy_knl = lp.make_kernel([
            "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
            "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
            "{[itgt, isrc, idim]: \
                itgt_start <= itgt < itgt_end and \
                isrc_start <= isrc < isrc_end and \
                0 <= idim < dim}",
            ],
            self.get_kernel_scaling_assignments()
            + ["""
                for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]

                    for itgt
                    for isrc
                        <> d[idim] = \
                            targets[idim, itgt] - sources[idim, isrc] {dup=idim}
            """] + ["""
                        <> is_self = (isrc == target_to_source[itgt])
                    """ if self.exclude_self else ""]
            + loopy_insns + kernel_exprs
            + ["    end"]
            + ["""
                    result[{i}, itgt] = result[{i}, itgt] + \
                        knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
                        {{id_prefix=write_csr}}
                """.format(i=iknl)
                for iknl in range(len(self.kernels))]
            + ["""
                    end
                end
                end
            """],
            arguments,
            assumptions="ntgt_boxes>=1",
            name=self.name,
            silenced_warnings="write_race(write_csr*)",
            fixed_parameters=dict(
                dim=self.dim,
                nstrengths=self.strength_count,
                nkernels=len(self.kernels)),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        # Annotate integer counts, unroll dimension loops, and split the
        # leading (kernel/strength) axes of targets/sources as 'sep'.
        loopy_knl = lp.add_dtypes(loopy_knl,
            dict(nsources=np.int32, ntargets=np.int32))

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
        loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")

        # Let each constituent kernel apply its own final adjustments.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
Esempio n. 30
0
def test_rob_stroud_bernstein_full(ctx_factory):
    """Build, pickle-roundtrip, and code-generate a two-stage accumulation
    kernel (per the test name, a Bernstein-basis evaluation in the style of
    Rob Stroud -- TODO confirm against the referenced algorithm).

    The kernel first accumulates ``tmp[alpha1, i2]`` from ``coeffs`` with a
    running weight ``w`` and a flat index ``aind``, then accumulates
    ``result[el, i1_2, i2_2]`` from ``tmp`` with a second running weight
    ``w2``.  Only code generation is exercised; the kernel is never run.
    """
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }",
        """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end

            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg  {id=w2_init}

                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2]  {id=res2,dep=w2_init}
                    end

                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)  \
                            {id=w2_update, dep=res2}
                end
            end
        end
        """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure it out the shape automatically.

            lp.GlobalArg("coeffs", None, shape=None),
            "..."
            ],
        assumptions="deg>=0 and nels>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0])
        )

    # Fix the symbolic parameters so the domain sizes become concrete.
    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    # Disabled parallelization strategy, kept for reference.
    if 0:
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
                slabs=(0, 1))
        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    # Round-trip through pickle to verify the kernel object is picklable.
    from pickle import dumps, loads
    knl = loads(dumps(knl))

    knl = lp.add_dtypes(knl,
            dict(
                qpts=np.float32,
                tmp=np.float32,
                coeffs=np.float32,
                result=np.float32,
                ))
    print(lp.generate_code_v2(knl))
Esempio n. 31
0
# Loopy instruction text for the per-node model update: a phase variable
# theta_i is advanced with a coupling term, stochastic noise (sig * rand),
# wrapped into [0, 2*pi) via wrap_2_pi, and accumulated into tavg.
# The final state write is marked {nosync=*} to silence loopy's
# synchronization checking for that instruction.
# NOTE(review): looks like a Kuramoto-style phase oscillator -- confirm.
model_raw = """
theta_i = theta_i + dt * (omega + coupling_value * rec_n * sum) 	
theta_i = theta_i + (sig * rand) 					
theta_i = wrap_2_pi(theta_i) 					
tavg[i_node] = tavg[i_node] + sin(theta_i) 	
state[i_node] = theta_i {nosync=*}	
"""
node = 10
#Raw coupling
# Build the coupling kernel over an i_node x j_node domain.
# `coupling_raw` and `target` are assumed to be defined earlier in the
# original example (not visible here) -- verify before running.
couplingknl = lp.make_kernel("{ [i_node, j_node]: 0<=i_node, j_node<n_node}",
                             coupling_raw,
                             target=target)
couplingknl = lp.add_dtypes(
    couplingknl, {
        "lengths": np.float32,
        "state": np.float32,
        "weights": np.float32,
        "theta_i": np.float32,
        "rec_speed_dt": np.float32
    })
# Split the inner j_node loop (factor 1) and map it to local id 0.
couplingknl = lp.split_iname(couplingknl, "j_node", 1, outer_tag='l.0')
#Raw model
# Build the model kernel over the i_node domain using the text above.
modelknl = lp.make_kernel("{ [i_node]: 0<=i_node<n_node}",
                          model_raw,
                          target=target)
modelknl = lp.add_dtypes(modelknl, {
    "state": np.float32,
    "theta_i": np.float32,
    "tavg": np.float32
})
# Fuse
# Pair of kernels to be fused (fusion itself happens past this excerpt).
knls = couplingknl, modelknl
Esempio n. 32
0
		for j_node
			<float32> wij = weights[j_node]					{id = coupling3, dep=coupling1:coupling2}
			if wij != 0.0
                		<int> dij = lengths[j_node] * rec_speed_dt		{id = coupling4, dep=coupling3}
                		<float32> theta_j = state[j_node]
                		sum = sum + wij * sin(theta_j - theta_i)
			end
		end

		theta_i = theta_i + dt * (omega + coupling_value * rec_n * sum) 	{id = out1, dep=coupling4}	
		theta_i = theta_i + (sig * rand) 					{id = out2, dep=out1}	
		theta_i = wrap_2_pi(theta_i) 						{id = out3, dep=out2}							
		tavg[i_node] = tavg[i_node] + sin(theta_i) 				{id = out4, dep=out3}
		state[i_node] = theta_i							{dep=*coupling1}						
	end 

	""",
                        assumptions="n_node>=0")

# Attach concrete dtypes to the kernel's array arguments so loopy can
# generate code without further type inference.
kernel = lp.add_dtypes(
    kernel,
    dict(tavg=np.float32,
         state=np.float32,
         weights=np.float32,
         lengths=np.float32))
# Retarget the kernel to CUDA and emit both host- and device-side code.
kernel = kernel.copy(target=lp.CudaTarget())
code = lp.generate_code_v2(kernel)
print(kernel)
print(code.host_code())
print(code.device_code())
Esempio n. 33
0
    def get_kernel(self):
        """Assemble and return the loopy kernel for boxwise pairwise
        (point-to-point) evaluation.

        The generated kernel loops over target boxes, then over each target
        box's list of source boxes (CSR-style via ``source_box_starts`` /
        ``source_box_lists``), and for every target/source point pair forms
        the coordinate difference ``d`` and evaluates the instructions from
        ``get_loopy_insns_and_result_names``.  Per-kernel partial results are
        reduced over ``isrc`` and accumulated into ``result[i, itgt]``.

        Returns the fully transformed loopy kernel (dtypes attached, ``idim``
        loops unrolled, targets/sources/strength/result tagged as separate
        C-ordered arrays, and each kernel's ``prepare_loopy_kernel`` applied).
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
        kernel_exprs = self.get_kernel_exprs(result_names)
        # Box/list bookkeeping arrays carry no declared shape; "..." lets
        # loopy infer the remaining arguments.
        arguments = (self.get_default_src_tgt_arguments() + [
            lp.GlobalArg("box_target_starts", None, shape=None),
            lp.GlobalArg("box_target_counts_nonchild", None, shape=None),
            lp.GlobalArg("box_source_starts", None, shape=None),
            lp.GlobalArg("box_source_counts_nonchild", None, shape=None),
            lp.GlobalArg("source_box_starts", None, shape=None),
            lp.GlobalArg("source_box_lists", None, shape=None),
            lp.GlobalArg("strength",
                         None,
                         shape="nstrengths, nsources",
                         dim_tags="sep,C"),
            lp.GlobalArg(
                "result", None, shape="nkernels, ntargets", dim_tags="sep,C"),
            "..."
        ])

        # The instruction text is spliced together from: scaling assignments,
        # the box-loop preamble, an optional self-interaction marker, the
        # generated per-kernel instructions, and the result write-back.
        loopy_knl = lp.make_kernel(
            [
                "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
                "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
                "{[itgt, isrc, idim]: \
                itgt_start <= itgt < itgt_end and \
                isrc_start <= isrc < isrc_end and \
                0 <= idim < dim}",
            ],
            self.get_kernel_scaling_assignments() + [
                """
                for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]

                    for itgt
                    for isrc
                        <> d[idim] = \
                            targets[idim, itgt] - sources[idim, isrc] {dup=idim}
            """
            ] + [
                # Only emitted when self-interaction must be excluded; the
                # generated instructions are expected to consume `is_self`.
                """
                        <> is_self = (isrc == target_to_source[itgt])
                    """ if self.exclude_self else ""
            ] + loopy_insns + kernel_exprs + ["    end"] + [
                """
                    result[{i}, itgt] = result[{i}, itgt] + \
                        knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
                        {{id_prefix=write_csr}}
                """.format(i=iknl) for iknl in range(len(self.kernels))
            ] + [
                """
                    end
                end
                end
            """
            ],
            arguments,
            assumptions="ntgt_boxes>=1",
            name=self.name,
            # The write_csr accumulations are known not to race; silence
            # loopy's write-race warning for them.
            silenced_warnings="write_race(write_csr*)",
            fixed_parameters=dict(dim=self.dim,
                                  nstrengths=self.strength_count,
                                  nkernels=len(self.kernels)),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        loopy_knl = lp.add_dtypes(loopy_knl,
                                  dict(nsources=np.int32, ntargets=np.int32))

        # Unroll the (small, fixed-size) dimension loops; store coordinates
        # and results as separate per-axis C-ordered arrays.
        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
        loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")

        # Give each constituent kernel a chance to apply its own tweaks.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl