コード例 #1
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_variable_size_temporary():
    knl = lp.make_kernel(''' { [i,j]: 0<=i,j<n } ''',
                         ''' out[i] = sum(j, a[i,j])''')

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    knl = lp.add_prefetch(knl, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        lp.generate_code(k)
コード例 #2
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_variable_size_temporary():
    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         ''' out[i] = sum(j, a[i,j])''')

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    knl = lp.add_prefetch(
            knl, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        lp.generate_code(k)
コード例 #3
0
ファイル: test_transform.py プロジェクト: cmsquared/loopy
def test_vectorize(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
コード例 #4
0
ファイル: test_apps.py プロジェクト: inducer/loopy
def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
    #assumptions="n mod 16=0")
    knl = lp.split_iname(knl,
            "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl,
            "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "u",
            ["i_inner", "j_inner"],
            fetch_bounding_box=True,
            default_tag="l.auto")

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)

    assert "double" not in code
コード例 #5
0
def test_vectorize(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}", """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)],
                             4,
                             split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=30))
コード例 #6
0
    def __test(loop_size, vec_width):
        knl = lp.make_kernel(
            '{{[i]: 0 <= i < {}}}'.format(loop_size),
            """
            <> x = 1.0
            a1[0] = a1[0] + x {id=set}
            ... lbarrier {id=wait, dep=set}
            for i
                a1[0] = a1[0] + 1 {id=a1, dep=set:wait, nosync=set}
            end
            """, [
                lp.GlobalArg(
                    'a1', shape=(loop_size, ), order='C', dtype=np.float32)
            ],
            target=lp.OpenCLTarget(),
            silenced_warnings=['no_device_in_pre_codegen_checks'])
        loopy_opts = type('', (object, ), {
            'depth': vec_width,
            'order': 'C',
            'use_atomic_doubles': True
        })
        knl = lp.split_iname(knl, 'i', vec_width, inner_tag='l.0')

        # feed through deep specializer
        _, ds = get_deep_specializer(loopy_opts,
                                     atomic_ids=['a1'],
                                     split_ids=['set'],
                                     use_atomics=True,
                                     is_write_race=True,
                                     split_size=loop_size)
        knl = ds(knl)

        val = np.minimum(loop_size, vec_width)
        assert 'x / {:.1f}f'.format(val) in lp.generate_code(knl)[0]
コード例 #7
0
ファイル: test_domain.py プロジェクト: cmsquared/loopy
def test_divisibility_assumption(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<n}",
            [
                "b[i] = 2*a[i]"
                ],
            [
                lp.GlobalArg("a", np.float32, shape=("n",)),
                lp.GlobalArg("b", np.float32, shape=("n",)),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1 and (exists zz: n = 16*zz)")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16)

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "if" not in code

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters={"n": 16**3})
コード例 #8
0
def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
    #assumptions="n mod 16=0")
    knl = lp.split_iname(knl,
            "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl,
            "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "u",
            ["i_inner", "j_inner"],
            fetch_bounding_box=True,
            default_tag="l.auto")

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)

    assert "double" not in code
コード例 #9
0
ファイル: test_domain.py プロジェクト: yueyedeai/loopy
def test_divisibility_assumption(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<n}",
            [
                "b[i] = 2*a[i]"
                ],
            [
                lp.GlobalArg("a", np.float32, shape=("n",)),
                lp.GlobalArg("b", np.float32, shape=("n",)),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1 and (exists zz: n = 16*zz)")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16)

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "if" not in code

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters={"n": 16**3})
コード例 #10
0
ファイル: tests.py プロジェクト: umarbrowser/tvb-hpc
 def test_simple_kernel(self):
     knl = lp.make_kernel("{ [i]: 0<=i<n }",
                          "out[i] = 2*a[i]",
                          target=CTarget())
     typed = lp.add_dtypes(knl, {'a': np.float32})
     code, _ = lp.generate_code(typed)
     fn = CompiledKernel(typed)  # noqa
     a, out = np.zeros((2, 10), np.float32)
     a[:] = np.r_[:a.size]
     fn(a, 10, out)
     np.testing.assert_allclose(out, a * 2)
コード例 #11
0
ファイル: numba.py プロジェクト: walterroach/tvb-hpc
 def get_kernel_executor(self, knl, *args, **kwargs):
     code, _ = lp.generate_code(knl)
     if self.no_jit:
         code = '\n'.join([
             line for line in code.split('\n') if line != '@_lpy_numba.jit'
         ])
         for i, line in enumerate(code.split('\n')):
             print(i + 1, line)
     LOG.debug(code)
     ns = {}
     exec(code, ns)
     return ns[knl.name]
コード例 #12
0
ファイル: test_target.py プロジェクト: tj-sun/loopy
def test_cuda_target():
    from loopy.target.cuda import CudaTarget

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."],
        target=CudaTarget())

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    print(
        lp.generate_code(lp.get_one_scheduled_kernel(
            lp.preprocess_kernel(knl)))[0])
コード例 #13
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_multiple_writes_to_local_temporary():
    # Loopy would previously only handle barrier insertion correctly if exactly
    # one instruction wrote to each local temporary. This tests that multiple
    # writes are OK.

    knl = lp.make_kernel(
        "{[i,e]: 0<=i<5 and 0<=e<nelements}", """
        <> temp[i, 0] = 17
        temp[i, 1] = 15
        """)
    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        code, _ = lp.generate_code(k)
        print(code)
コード例 #14
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_multiple_writes_to_local_temporary():
    # Loopy would previously only handle barrier insertion correctly if exactly
    # one instruction wrote to each local temporary. This tests that multiple
    # writes are OK.

    knl = lp.make_kernel(
        "{[i,e]: 0<=i<5 and 0<=e<nelements}",
        """
        <> temp[i, 0] = 17
        temp[i, 1] = 15
        """)
    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        code, _ = lp.generate_code(k)
        print(code)
コード例 #15
0
def get_code(knl, opts=None):
    """
    Returns the device code for a :class:`loopy.LoopKernel` or
    fixes alreay generated code

    Parameters
    ----------
    knl : :class:`loopy.LoopKernel` or str
        The kernel to generate code for.  If knl is a string, it is assumed
        to be pregenerated code, and only the editor script must be called
    opts: :class:`loopy_options`
        The options used in created the kernel -- used to detect platform specific
        fixes.  Ignored if not supplied

    Returns
    -------
    code: str
        Generated device code

    Notes
    -----
    The kernel's Target and name should be set for proper functioning
    """

    if isinstance(knl, str):
        code = knl
    else:
        code, _ = lp.generate_code(knl)

    extra_subs = {}
    if opts is None:
        # ignore
        pass
    elif opts.lang == 'opencl' and ('intel' in opts.platform.name.lower() and
                                    ((opts.order == 'C' and opts.width) or
                                     (opts.order == 'F' and opts.depth) or
                                     (opts.order == 'F' and opts.width))):
        # If True, this is a finite-difference Jacobian on an Intel OpenCL platform
        # Hence we have to tell the codefixer about the intel bug
        # https://software.intel.com/en-us/forums/opencl/topic/748841
        extra_subs[
            r'__kernel void __attribute__ \(\(reqd_work_group_size\(\d+, 1, 1'
            r'\)\)\) species_rates_kernel'] = r'void species_rates_kernel'

    return codefix('stdin', text_in=code, extra_subs=extra_subs)
コード例 #16
0
ファイル: test_domain.py プロジェクト: shigh/loopy
def test_eq_constraint(ctx_factory):
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    knl = lp.make_kernel("{[i,j]: 0<= i,j < 32}", ["a[i] = b[i]"], [
        lp.GlobalArg("a", np.float32, shape=(1000, )),
        lp.GlobalArg("b", np.float32, shape=(1000, ))
    ])

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for knl in kernel_gen:
        print(lp.generate_code(knl))
コード例 #17
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_type_inference_no_artificial_doubles(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i]: 0<=i<n}",
                         """
                <> bb = a[i] - b[i]
                c[i] = bb
                """, [
                             lp.GlobalArg("a", np.float32, shape=("n", )),
                             lp.GlobalArg("b", np.float32, shape=("n", )),
                             lp.GlobalArg("c", np.float32, shape=("n", )),
                             lp.ValueArg("n", np.int32),
                         ],
                         assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "double" not in code
コード例 #18
0
ファイル: test_target.py プロジェクト: cmsquared/loopy
def test_cuda_target():
    from loopy.target.cuda import CudaTarget

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i] = 2*a[i]",
            [
                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
                "..."
                ],
            target=CudaTarget())

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    print(
            lp.generate_code(
                lp.get_one_scheduled_kernel(
                    lp.preprocess_kernel(knl)))[0])
コード例 #19
0
ファイル: compiler.py プロジェクト: walterroach/tvb-hpc
 def __init__(self, knl: lp.LoopKernel, comp: Compiler = None):
     assert isinstance(knl.target, CTarget)
     self.knl = knl
     self.code, _ = lp.generate_code(knl)
     self.comp = comp or Compiler()
     self.dll = self.comp.build(self.code)
     self.func_decl, = generate_header(knl)
     self._arg_info = []
     # TODO knl.args[:].dtype is sufficient
     self._visit_func_decl(self.func_decl)
     self.name = self.knl.name
     restype = self.func_decl.subdecl.typename
     if restype == 'void':
         self.restype = None
     else:
         raise ValueError('Unhandled restype %r' % (restype, ))
     self._fn = getattr(self.dll, self.name)
     self._fn.restype = self.restype
     self._fn.argtypes = [ctype for name, ctype in self._arg_info]
     self._prepared_call_cache = weakref.WeakKeyDictionary()
コード例 #20
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_type_inference_no_artificial_doubles(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
                <> bb = a[i] - b[i]
                c[i] = bb
                """,
            [
                lp.GlobalArg("a", np.float32, shape=("n",)),
                lp.GlobalArg("b", np.float32, shape=("n",)),
                lp.GlobalArg("c", np.float32, shape=("n",)),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "double" not in code
コード例 #21
0
ファイル: test_domain.py プロジェクト: cmsquared/loopy
def test_eq_constraint(ctx_factory):
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i,j]: 0<= i,j < 32}",
            [
                "a[i] = b[i]"
                ],
            [
                lp.GlobalArg("a", np.float32, shape=(1000,)),
                lp.GlobalArg("b", np.float32, shape=(1000,))
                ])

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for knl in kernel_gen:
        print(lp.generate_code(knl))
コード例 #22
0
ファイル: tests.py プロジェクト: umarbrowser/tvb-hpc
 def _dtype_and_code(self, knl, **extra_dtypes):
     dtypes = {'in': np.float32, 'out': np.float32}
     dtypes.update(extra_dtypes)
     knl = lp.add_dtypes(knl, dtypes)
     code, _ = lp.generate_code(knl)
     return code
コード例 #23
0
 def code(self, *args, **kwargs):
     knl = self.kernel(*args, **kwargs)
     code, _ = generate_code(knl)
     return code
コード例 #24
0
ファイル: test_sem.py プロジェクト: cmsquared/loopy
def test_laplacian_lmem_ilp(ctx_factory):
    # This does not lead to practical/runnable code (out of lmem), but it's an
    # excellent stress test for the code generator. :)

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n,
            [
                "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
                "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
                "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",

                "lap[e,i,j,k]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
            name="semlap", assumptions="K>=1")


    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])

    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    #print seq_knl
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    for knl in kernel_gen:
        print(lp.generate_code(knl))
コード例 #25
0
def test_laplacian_lmem_ilp(ctx_factory):
    # This does not lead to practical/runnable code (out of lmem), but it's an
    # excellent stress test for the code generator. :)

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n, [
            "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
            "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
            "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",
            "lap[e,i,j,k]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
            "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
        ], [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])

    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    #print seq_knl
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    for knl in kernel_gen:
        print(lp.generate_code(knl))
コード例 #26
0
ファイル: sparse.py プロジェクト: yueyedeai/loopy
import loopy as lp
import numpy as np

k = lp.make_kernel(["{ [i] : 0 <= i < m }", "{ [j] : 0 <= j < length }"], """
    for i
        <> rowstart = rowstarts[i]
        <> rowend = rowstarts[i+1]
        <> length = rowend - rowstart
        y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]])
    end
    """)

k = lp.add_and_infer_dtypes(k, {
    "values,x": np.float64,
    "rowstarts,colindices": k.index_dtype
})
print(lp.generate_code(k)[0])
コード例 #27
0
 def get_kernel_executor(self, knl, *args, **kwargs):
     code, _ = lp.generate_code(knl)
     LOG.debug(code)
     ns = {}
     exec(code, ns)
     return ns[knl.name]
コード例 #28
0
ファイル: sparse.py プロジェクト: inducer/loopy
import loopy as lp
import numpy as np

k = lp.make_kernel([
    "{ [i] : 0 <= i < m }",
    "{ [j] : 0 <= j < length }"],
    """
    for i
        <> rowstart = rowstarts[i]
        <> rowend = rowstarts[i+1]
        <> length = rowend - rowstart
        y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]])
    end
    """)

k = lp.add_and_infer_dtypes(k, {
    "values,x": np.float64, "rowstarts,colindices": k.index_dtype
    })
print(lp.generate_code(k)[0])