Beispiel #1
0
def test_fp_prec_comparison():
    # FIXME: This test should succeed even when the number is exactly
    # representable in single precision.
    #
    # https://gitlab.tiker.net/inducer/loopy/issues/187

    fortran_src_dp = """
        subroutine assign_scalar(a)
          real*8 a(1)

          a(1) = 1.1d0
        end
        """

    prg_dp = lp.parse_fortran(fortran_src_dp)

    fortran_src_sp = """
        subroutine assign_scalar(a)
          real*8 a(1)

          a(1) = 1.1
        end
        """

    prg_sp = lp.parse_fortran(fortran_src_sp)

    assert prg_sp != prg_dp
Beispiel #2
0
def test_fuse_kernels(ctx_factory):
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv, = lp.parse_fortran(
        fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv, = lp.parse_fortran(
        fortran_template.format(inner=yd_line, name="yderiv"))
    xyderiv, = lp.parse_fortran(
        fortran_template.format(inner=(xd_line + "\n" + yd_line),
                                name="xyderiv"))

    knl = lp.fuse_kernels((xderiv, yderiv))
    knl = lp.prioritize_loops(knl, "e,i,j,k")

    assert len(knl.temporary_variables) == 2

    # This is needed for correctness, otherwise ordering could foul things up.
    knl = lp.assignment_to_subst(knl, "prev")
    knl = lp.assignment_to_subst(knl, "prev_0")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv,
                        ctx,
                        knl,
                        parameters=dict(nelements=20, ndofs=4))
Beispiel #3
0
def test_fuse_kernels(ctx_factory):
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv = lp.parse_fortran(
        fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv = lp.parse_fortran(
        fortran_template.format(inner=yd_line, name="yderiv"))
    xyderiv = lp.parse_fortran(
        fortran_template.format(inner=(xd_line + "\n" + yd_line),
                                name="xyderiv"))

    knl = lp.fuse_kernels((xderiv["xderiv"], yderiv["yderiv"]),
                          data_flow=[("result", 0, 1)])
    knl = knl.with_kernel(
        lp.prioritize_loops(knl["xderiv_and_yderiv"], "e,i,j,k"))

    assert len(knl["xderiv_and_yderiv"].temporary_variables) == 2

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv,
                        ctx,
                        knl,
                        parameters=dict(nelements=20, ndofs=4))
Beispiel #4
0
def test_fuse_kernels(ctx_factory):
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv, = lp.parse_fortran(
            fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv, = lp.parse_fortran(
            fortran_template.format(inner=yd_line, name="yderiv"))
    xyderiv, = lp.parse_fortran(
            fortran_template.format(
                inner=(xd_line + "\n" + yd_line), name="xyderiv"))

    knl = lp.fuse_kernels((xderiv, yderiv))
    knl = lp.set_loop_priority(knl, "e,i,j,k")

    assert len(knl.temporary_variables) == 2

    # This is needed for correctness, otherwise ordering could foul things up.
    knl = lp.assignment_to_subst(knl, "prev")
    knl = lp.assignment_to_subst(knl, "prev_0")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
Beispiel #5
0
def test_assignment_to_subst_two_defs(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a = inp(i)
            out(i) = 5*a
            a = 3*inp(n)
            out2(i) = 6*a
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ref_knl = knl

    knl = lp.assignment_to_subst(knl, "a")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Beispiel #6
0
def test_tagged(ctx_factory):
    fortran_src = """
        subroutine rot_norm(out, alpha, out2, inp, inp2, n)
          implicit none
          real*8 a, b, r, out(n), out2(n), inp(n), inp2(n)
          real*8 alpha
          integer n, i

          do i = 1, n
            !$loopy begin tagged: input
            a = cos(alpha)*inp(i) + sin(alpha)*inp2(i)
            b = -sin(alpha)*inp(i) + cos(alpha)*inp2(i)
            !$loopy end tagged: input

            r = sqrt(a**2 + b**2)
            a = a/r
            b = b/r

            out(i) = a
            out2(i) = b
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2
Beispiel #7
0
def test_assignment_to_subst_two_defs(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a = inp(i)
            out(i) = 5*a
            a = 3*inp(n)
            out2(i) = 6*a
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ref_knl = knl

    knl = lp.assignment_to_subst(knl, "a")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Beispiel #8
0
def test_assignment_to_subst_indices(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a(n), out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a(i) = 6*inp(i)
          enddo

          do i = 1, n
            out(i) = 5*a(i)
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.fix_parameters(knl, n=5)

    ref_knl = knl

    assert "a" in knl.temporary_variables
    knl = lp.assignment_to_subst(knl, "a")
    assert "a" not in knl.temporary_variables

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl)
Beispiel #9
0
def test_if(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, b, out(n), out2(n), inp(n)
          integer n, i, j

          do i = 1, n
            a = inp(i)
            if (a.ge.3) then
                b = 2*a
                do j = 1,3
                    b = 3 * b
                end do
                out(i) = 5*b
            else
                out(i) = 4*a
            endif
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ref_knl = knl

    knl = lp.assignment_to_subst(knl, "a")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Beispiel #10
0
def test_tagged(ctx_factory):
    fortran_src = """
        subroutine rot_norm(out, alpha, out2, inp, inp2, n)
          implicit none
          real*8 a, b, r, out(n), out2(n), inp(n), inp2(n)
          real*8 alpha
          integer n, i

          do i = 1, n
            !$loopy begin tagged: input
            a = cos(alpha)*inp(i) + sin(alpha)*inp2(i)
            b = -sin(alpha)*inp(i) + cos(alpha)*inp2(i)
            !$loopy end tagged: input

            r = sqrt(a**2 + b**2)
            a = a/r
            b = b/r

            out(i) = a
            out2(i) = b
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2
Beispiel #11
0
def test_assignment_to_subst_indices(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a(n), out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a(i) = 6*inp(i)
          enddo

          do i = 1, n
            out(i) = 5*a(i)
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.fix_parameters(knl, n=5)

    ref_knl = knl

    assert "a" in knl.temporary_variables
    knl = lp.assignment_to_subst(knl, "a")
    assert "a" not in knl.temporary_variables

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl)
Beispiel #12
0
def test_if(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, b, out(n), out2(n), inp(n)
          integer n, i, j

          do i = 1, n
            a = inp(i)
            if (a.ge.3) then
                b = 2*a
                do j = 1,3
                    b = 3 * b
                end do
                out(i) = 5*b
            else
                out(i) = 4*a
            endif
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ref_knl = knl

    knl = lp.assignment_to_subst(knl, "a")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Beispiel #13
0
def test_fuse_kernels(ctx_factory):
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv, = lp.parse_fortran(
            fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv, = lp.parse_fortran(
            fortran_template.format(inner=yd_line, name="yderiv"))
    xyderiv, = lp.parse_fortran(
            fortran_template.format(
                inner=(xd_line + "\n" + yd_line), name="xyderiv"))

    knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)])
    knl = lp.prioritize_loops(knl, "e,i,j,k")

    assert len(knl.temporary_variables) == 2

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
Beispiel #14
0
def test_matmul(ctx_factory, buffer_inames):
    ctx = ctx_factory()

    if (buffer_inames and
            ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)
    knl = lp.assume(knl, "n mod 32 = 0")
    knl = lp.assume(knl, "m mod 32 = 0")
    knl = lp.assume(knl, "ell mod 16 = 0")

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            precompute_outer_inames='i_outer, j_outer, k_outer',
            default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            precompute_outer_inames='i_outer, j_outer, k_outer',
            default_tag="l.auto")

    knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
            init_expression="0", store_expression="base+buffer")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
Beispiel #15
0
def test_precompute_some_exist(ctx_factory):
    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    knl = lp.parse_fortran(fortran_src)

    assert len(knl["dgemm"].domains) == 1

    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 8)
    knl = lp.assume(knl, "n mod 8 = 0")
    knl = lp.assume(knl, "m mod 8 = 0")
    knl = lp.assume(knl, "ell mod 8 = 0")

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl,
                        "a_acc",
                        "k_inner,i_inner",
                        precompute_inames="ktemp,itemp",
                        precompute_outer_inames="i_outer, j_outer, k_outer",
                        default_tag="l.auto")
    knl = lp.precompute(knl,
                        "b_acc",
                        "j_inner,k_inner",
                        precompute_inames="itemp,k2temp",
                        precompute_outer_inames="i_outer, j_outer, k_outer",
                        default_tag="l.auto")

    ref_knl = knl

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl,
                        ctx,
                        knl,
                        parameters=dict(n=128, m=128, ell=128))
Beispiel #16
0
def test_matmul(ctx_factory, buffer_inames):
    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,l,a,b,c)
          implicit none
          real*8 a(m,l),b(l,n),c(m,n)
          integer m,n,k,i,j,l

          do j = 1,n
            do i = 1,m
              do k = 1,l
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)
    knl = lp.assume(knl, "n mod 32 = 0")
    knl = lp.assume(knl, "m mod 32 = 0")
    knl = lp.assume(knl, "l mod 16 = 0")

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner")

    knl = lp.buffer_array(knl,
                          "c",
                          buffer_inames=buffer_inames,
                          init_expression="0",
                          store_expression="base+buffer")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl,
                        ctx,
                        knl,
                        parameters=dict(n=128, m=128, l=128))
Beispiel #17
0
def test_batched_sparse():
    fortran_src = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                  x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end

        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.split_iname(knl, "i", 128)
    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
    knl = lp.add_prefetch(knl, "values",
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "colindices",
            default_tag="l.auto")
    knl = lp.fix_parameters(knl, nvecs=4)
Beispiel #18
0
def test_batched_sparse():
    fortran_src = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                  x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end

        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.split_iname(knl, "i", 128)
    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
    knl = lp.add_prefetch(knl, "values",
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "colindices",
            default_tag="l.auto")
    knl = lp.fix_parameters(knl, nvecs=4)
Beispiel #19
0
def test_matmul(ctx_factory, buffer_inames):
    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,l,a,b,c)
          implicit none
          real*8 a(m,l),b(l,n),c(m,n)
          integer m,n,k,i,j,l

          do j = 1,n
            do i = 1,m
              do k = 1,l
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)
    knl = lp.assume(knl, "n mod 32 = 0")
    knl = lp.assume(knl, "m mod 32 = 0")
    knl = lp.assume(knl, "l mod 16 = 0")

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner")

    knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
            init_expression="0", store_expression="base+buffer")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
Beispiel #20
0
def test_precompute_some_exist(ctx_factory):
    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    knl = lp.split_iname(knl, "i", 8,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 8)
    knl = lp.assume(knl, "n mod 8 = 0")
    knl = lp.assume(knl, "m mod 8 = 0")
    knl = lp.assume(knl, "ell mod 8 = 0")

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            precompute_inames="ktemp,itemp",
            default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            precompute_inames="itemp,k2temp",
            default_tag="l.auto")

    ref_knl = knl

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
Beispiel #21
0
def test_fill_const(ctx_factory):
    fortran_src = """
        subroutine fill(out, a, n)
          implicit none

          real*8 a, out(n)
          integer n, i

          do i = 1, n
            out(i) = 3.45
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ctx = ctx_factory()

    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5, a=5))
Beispiel #22
0
def test_fill_const(ctx_factory):
    fortran_src = """
        subroutine fill(out, a, n)
          implicit none

          real*8 a, out(n)
          integer n, i

          do i = 1, n
            out(i) = 3.45
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ctx = ctx_factory()

    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5, a=5))
Beispiel #23
0
def test_division_in_shapes(ctx_factory):
    fortran_src = """
        subroutine halve(m, a)
            implicit none
            integer m, i, j
            real*8 a(m/2,m/2)
            do i = 1,m/2
                do j = 1,m/2
                    a(i, j) = 2*a(i, j)
                end do
            end do
        end subroutine
        """
    t_unit = lp.parse_fortran(fortran_src)
    ref_t_unit = t_unit

    print(t_unit)

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit, parameters=dict(m=128))
Beispiel #24
0
def test_assign_double_precision_scalar_as_rational(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    fortran_src = """
        subroutine assign_scalar(a)
          real*8 a(1)

          a(1) = 11
          a(1) = a(1) / 10
        end
        """

    t_unit = lp.parse_fortran(fortran_src)

    a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F")
    t_unit(queue, a=a_dev)

    abs_err = abs(a_dev.get()[0] - 1.1)
    assert abs_err < 1e-15
Beispiel #25
0
def test_domain_fusion_imperfectly_nested():
    fortran_src = """
        subroutine imperfect(n, m, a, b)
            implicit none
            integer i, j, n, m
            real a(n), b(n,n)

            do i=1, n
                a(i) = i
                do j=1, m
                    b(i,j) = i*j
                end do
            end do
        end subroutine
        """

    t_unit = lp.parse_fortran(fortran_src)
    # If n > 0 and m == 0, a single domain would be empty,
    # leading (incorrectly) to no assignments to 'a'.
    assert len(t_unit["imperfect"].domains) > 1
Beispiel #26
0
def test_assign_double_precision_scalar(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    fortran_src = """
        subroutine assign_scalar(a)
          real*8 a(1)

          a(1) = 1.1d0
        end
        """

    t_unit = lp.parse_fortran(fortran_src)
    print(lp.generate_code_v2(t_unit).device_code())
    assert "1.1;" in lp.generate_code_v2(t_unit).device_code()

    a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F")
    t_unit(queue, a=a_dev)

    abs_err = abs(a_dev.get()[0] - 1.1)
    assert abs_err < 1e-15
Beispiel #27
0
def test_asterisk_in_shape(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, out(n), out2(n), inp(*)
          integer n, i

          do i = 1, n
            a = inp(n)
            out(i) = 5*a
            out2(i) = 6*a
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl(queue, inp=np.array([1, 2, 3.]), n=3)
Beispiel #28
0
def test_asterisk_in_shape(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, out(n), out2(n), inp(*)
          integer n, i

          do i = 1, n
            a = inp(n)
            out(i) = 5*a
            out2(i) = 6*a
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl(queue, inp=np.array([1, 2, 3.]), n=3)
Beispiel #29
0
def test_fortran_subroutines():
    fortran_src = """
        subroutine twice(n, a)
          implicit none
          real*8  a(n)
          integer i,n

          do i=1,n
            a(i) = a(i) * 2
          end do
        end subroutine

        subroutine twice_cross(n, a, i)
          implicit none
          integer i, n
          real*8  a(n,n)

          call twice(n, a(1:n, i))
          call twice(n, a(i, 1:n))
        end subroutine
        """
    t_unit = lp.parse_fortran(fortran_src).with_entrypoints("twice_cross")
    print(lp.generate_code_v2(t_unit).device_code())
    def fortran_kernel(self, line, cell):
        result = lp.parse_fortran(cell)

        for knl in result:
            self.shell.user_ns[knl.name] = knl
Beispiel #31
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
        knl
        for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
Beispiel #32
0
    def fortran_kernel(self, line, cell):
        result = lp.parse_fortran(cell)

        for knl in result:
            self.shell.user_ns[knl.name] = knl
Beispiel #33
0
 def fortran_kernel(self, line, cell):
     result = lp.parse_fortran(cell)
     self.shell.user_ns["prog"] = result
Beispiel #34
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
           knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
           if "KernelR" in knl.name or "KernelS" in knl.name
           ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
          fix_euler_parameters,
          set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                  ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                  ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                  ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                  "tag:{knl_tag} and reads:{flux_var}"
                  .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst", flux_inames + ilp_inames,
                temporary_name=flux_store_name,
                precompute_inames=flux_precomp_inames + flux_ilp_inames,
                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
          fetch_bounding_box=True, default_tag="for",
          init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
          rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
          rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
          Q_dim_field_inner="vec",
          Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
          vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
         "-cl-denorms-are-zero",
         "-cl-fast-relaxed-math",
         "-cl-finite-math-only",
         "-cl-mad-enable",
         "-cl-no-signed-zeros",
         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
            quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)