Ejemplo n.º 1
0
def trans(psy):
    '''
    Transformation routine for use with PSyclone. Applies the OpenCL
    transform to the first Invoke in the psy object.

    :param psy: the PSy object which this script will transform.
    :type psy: :py:class:`psyclone.psyGen.PSy`
    :returns: the transformed PSy object.
    :rtype: :py:class:`psyclone.psyGen.PSy`

    '''

    # Get the Schedule associated with the first Invoke
    invoke = psy.invokes.invoke_list[0]
    sched = invoke.schedule

    # Convert any kernel accesses to global data into arguments
    ktrans = KernelGlobalsToArguments()
    for kern in sched.kernels():
        ktrans.apply(kern)

    # Transform the Schedule
    cltrans = OCLTrans()
    cltrans.apply(sched, options={"end_barrier": True})

    # Provide kernel-specific OpenCL optimization options
    move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kern in sched.kernels():
        # Move the PSy-layer loop boundaries inside the kernel as a kernel
        # mask, this allows to iterate through the whole domain
        move_boundaries_trans.apply(kern)
        # Specify the OpenCL queue and workgroup size of the kernel
        kern.set_opencl_options({"queue_number": 1, 'local_size': 4})

    return psy
Ejemplo n.º 2
0
def test_set_kern_args_real_grid_property():
    ''' Check that we generate correct code to set a real scalar grid
    property. '''
    psy, _ = get_invoke("driver_test.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen)
    expected = '''\
    SUBROUTINE compute_kernel_code_set_args(kernel_obj, out_fld, in_out_fld, \
in_fld, dx, dx_1, gphiu, xstart, xstop, ystart, ystop)
      USE clfortran, ONLY: clSetKernelArg
      USE iso_c_binding, ONLY: c_sizeof, c_loc, c_intptr_t
      USE ocl_utils_mod, ONLY: check_status
      INTEGER(KIND=c_intptr_t), intent(in), target :: out_fld, in_out_fld, \
in_fld, dx, gphiu
      REAL(KIND=go_wp), intent(in), target :: dx_1
      INTEGER, intent(in), target :: xstart, xstop, ystart, ystop'''
    assert expected in generated_code
Ejemplo n.º 3
0
def trans(psy):
    '''
    Transformation routine for use with PSyclone. Applies the OpenCL
    transform to the first Invoke in the psy object.

    :param psy: the PSy object which this script will transform.
    :type psy: :py:class:`psyclone.psyGen.PSy`
    :returns: the transformed PSy object.
    :rtype: :py:class:`psyclone.psyGen.PSy`

    '''
    ocl_trans = OCLTrans()
    fold_trans = FoldConditionalReturnExpressionsTrans()
    move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans()

    # Get the Schedule associated with the first Invoke
    invoke = psy.invokes.invoke_list[0]
    sched = invoke.schedule

    # Transform the Schedule
    ocl_trans.apply(sched, options={"end_barrier": True})

    # Provide kernel-specific OpenCL optimization options
    for kern in sched.kernels():
        # Move the PSy-layer loop boundaries inside the kernel as a kernel
        # mask, this allows to iterate through the whole domain
        move_boundaries_trans.apply(kern)
        # Change the syntax to remove the return statements introduced by the
        # previous transformation
        fold_trans.apply(kern.get_kernel_schedule())
        # Specify the OpenCL queue and workgroup size of the kernel
        kern.set_opencl_options({"queue_number": 1, 'local_size': 4})

    return psy
Ejemplo n.º 4
0
def test_set_kern_args(kernel_outputdir):
    ''' Check that we generate the necessary code to set kernel arguments. '''
    psy, _ = get_invoke("single_invoke_two_kernels.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen)
    # Check we've only generated one set-args routine with arguments:
    # kernel object + kernel arguments + boundary values
    assert generated_code.count("SUBROUTINE compute_cu_code_set_args("
                                "kernel_obj, cu_fld, p_fld, u_fld, xstart, "
                                "xstop, ystart, ystop)") == 1
    # Declarations
    expected = '''\
      USE clfortran, ONLY: clSetKernelArg
      USE iso_c_binding, ONLY: c_sizeof, c_loc, c_intptr_t
      USE ocl_utils_mod, ONLY: check_status
      INTEGER(KIND=c_intptr_t), intent(in), target :: cu_fld, p_fld, u_fld
      INTEGER, intent(in), target :: xstart, xstop, ystart, ystop
      INTEGER ierr
      INTEGER(KIND=c_intptr_t), target :: kernel_obj'''
    assert expected in generated_code
    expected = '''\
      ! Set the arguments for the compute_cu_code OpenCL Kernel
      ierr = clSetKernelArg(kernel_obj, 0, C_SIZEOF(cu_fld), C_LOC(cu_fld))
      CALL check_status('clSetKernelArg: arg 0 of compute_cu_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 1, C_SIZEOF(p_fld), C_LOC(p_fld))
      CALL check_status('clSetKernelArg: arg 1 of compute_cu_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 2, C_SIZEOF(u_fld), C_LOC(u_fld))
      CALL check_status('clSetKernelArg: arg 2 of compute_cu_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 3, C_SIZEOF(xstart), C_LOC(xstart))
      CALL check_status('clSetKernelArg: arg 3 of compute_cu_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 4, C_SIZEOF(xstop), C_LOC(xstop))
      CALL check_status('clSetKernelArg: arg 4 of compute_cu_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 5, C_SIZEOF(ystart), C_LOC(ystart))
      CALL check_status('clSetKernelArg: arg 5 of compute_cu_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 6, C_SIZEOF(ystop), C_LOC(ystop))
      CALL check_status('clSetKernelArg: arg 6 of compute_cu_code', ierr)
    END SUBROUTINE compute_cu_code_set_args'''
    assert expected in generated_code

    # The call to the set_args matches the expected kernel signature with
    # the boundary values converted to 0-indexing
    assert ("CALL compute_cu_code_set_args(kernel_compute_cu_code, "
            "cu_fld%device_ptr, p_fld%device_ptr, u_fld%device_ptr, "
            "xstart - 1, xstop - 1, "
            "ystart - 1, ystop - 1)" in generated_code)

    # There is also only one version of the set_args for the second kernel
    assert generated_code.count("SUBROUTINE time_smooth_code_set_args("
                                "kernel_obj, u_fld, unew_fld, uold_fld, "
                                "xstart_1, xstop_1, ystart_1, ystop_1)") == 1
    assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
def test_validation():
    ''' Check that the transformation can only be applied to routine nodes '''
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    with pytest.raises(TransformationError) as info:
        trans.apply(None)
    assert("Error in GOMoveIterationBoundariesInsideKernelTrans "
           "transformation. This transformation can only be applied to "
           "'GOKern' nodes, but found 'NoneType'." in str(info.value))
Ejemplo n.º 6
0
def test_field_arguments(kernel_outputdir):
    # pylint: disable=unused-argument
    ''' Test that with an invoke transformed to OpenCL that has fields,
    all the fields are initialized into OpenCL buffers, the data is copied
    in, and a function to get the data back from the device is also
    generated.'''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen).lower()

    # The array size expression always uses the same field, in this case p_fld
    size_expression = "int(p_fld%grid%nx*p_fld%grid%ny, 8)"

    # For each of the invoke fields, add a conditional block that:
    # - Creates a OpenCL rw buffer.
    # - Writes the field data into the buffer.
    # - Marks data_on_device flag as true.
    # - Points the read_from_device_f attribute to the read_from_device
    # local function.
    # - Blocks OpenCL Queue until the data copy has finished.
    single_invoke_fields = ["cu_fld", "p_fld", "u_fld"]
    for field in single_invoke_fields:
        expected = (
            "      if (.not. {0}%data_on_device) then\n"
            "        size_in_bytes = " + size_expression +
            "*c_sizeof({0}%data(1,1))\n"
            "        ! create buffer on device\n"
            "        {0}%device_ptr = create_rw_buffer(size_in_bytes)\n"
            "        ierr = clenqueuewritebuffer(cmd_queues(1), "
            "{0}%device_ptr, cl_true, 0_8, size_in_bytes, "
            "c_loc({0}%data), 0, c_null_ptr, "
            "c_loc(write_event))\n"
            "        {0}%data_on_device = .true.\n"
            "        {0}%read_from_device_f => read_from_device\n"
            "        ! block until data copies have finished\n"
            "        ierr = clfinish(cmd_queues(1))\n"
            "      end if\n").format(field)
        assert expected in generated_code

    # Check that the read_from_device routine has also been generated.
    expected = ("    subroutine read_from_device(from, to, nx, ny, width)\n"
                "      use iso_c_binding, only: c_intptr_t\n"
                "      use fortcl, only: read_buffer\n"
                "      integer(kind=c_intptr_t), intent(in) :: from\n"
                "      real(kind=go_wp), intent(inout), dimension(:,:) :: to\n"
                "      integer, intent(in) :: nx, ny, width\n"
                "      call read_buffer(from, to, int(width*ny, kind=8))\n"
                "    end subroutine read_from_device\n")
    assert expected in generated_code
Ejemplo n.º 7
0
def test_opencl_options_validation():
    ''' Check that OpenCL options which are not supported provide appropiate
    errors.
    '''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()

    # Unsupported options are not accepted
    with pytest.raises(TransformationError) as err:
        otrans.apply(sched, options={'unsupported': 1})
    assert "InvokeSchedule does not support the OpenCL option 'unsupported'." \
        in str(err.value)

    # end_barrier option must be a boolean
    with pytest.raises(TransformationError) as err:
        otrans.apply(sched, options={'end_barrier': 1})
    assert "InvokeSchedule OpenCL option 'end_barrier' should be a boolean." \
        in str(err.value)

    # enable_profiling option must be a boolean
    with pytest.raises(TransformationError) as err:
        otrans.apply(sched, options={'enable_profiling': 1})
    assert ("InvokeSchedule OpenCL option 'enable_profiling' should be a "
            "boolean." in str(err.value))

    # out_of_order option must be a boolean
    with pytest.raises(TransformationError) as err:
        otrans.apply(sched, options={'out_of_order': 1})
    assert "InvokeSchedule OpenCL option 'out_of_order' should be a boolean." \
        in str(err.value)

    # Unsupported kernel options are not accepted
    with pytest.raises(AttributeError) as err:
        sched.coded_kernels()[0].set_opencl_options({'unsupported': 1})
    assert "CodedKern does not support the OpenCL option 'unsupported'." \
        in str(err.value)

    # local_size must be an integer
    with pytest.raises(TypeError) as err:
        sched.coded_kernels()[0].set_opencl_options({'local_size': 'a'})
    assert "CodedKern OpenCL option 'local_size' should be an integer." \
        in str(err.value)

    # queue_number must be an integer
    with pytest.raises(TypeError) as err:
        sched.coded_kernels()[0].set_opencl_options({'queue_number': 'a'})
    assert "CodedKern OpenCL option 'queue_number' should be an integer." \
        in str(err.value)
Ejemplo n.º 8
0
def test_set_kern_float_arg():
    ''' Check that we generate correct code to set a real, scalar kernel
    argument. '''
    psy, _ = get_invoke("single_invoke_scalar_float_arg.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen)
    # This set_args has a name clash on xstop (one is a grid property and the
    # other a loop boundary). One of they should appear as 'xstop_1'
    expected = '''\
    SUBROUTINE bc_ssh_code_set_args(kernel_obj, a_scalar, ssh_fld, xstop, \
tmask, xstart, xstop_1, ystart, ystop)
      USE clfortran, ONLY: clSetKernelArg
      USE iso_c_binding, ONLY: c_sizeof, c_loc, c_intptr_t
      USE ocl_utils_mod, ONLY: check_status
      INTEGER(KIND=c_intptr_t), intent(in), target :: ssh_fld, tmask
      INTEGER, intent(in), target :: xstop
      REAL(KIND=go_wp), intent(in), target :: a_scalar
      INTEGER, intent(in), target :: xstart, xstop_1, ystart, ystop
      INTEGER ierr
      INTEGER(KIND=c_intptr_t), target :: kernel_obj
'''
    assert expected in generated_code
    expected = '''\
      ! Set the arguments for the bc_ssh_code OpenCL Kernel
      ierr = clSetKernelArg(kernel_obj, 0, C_SIZEOF(a_scalar), C_LOC(a_scalar))
      CALL check_status('clSetKernelArg: arg 0 of bc_ssh_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 1, C_SIZEOF(ssh_fld), C_LOC(ssh_fld))
      CALL check_status('clSetKernelArg: arg 1 of bc_ssh_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 2, C_SIZEOF(xstop), C_LOC(xstop))
      CALL check_status('clSetKernelArg: arg 2 of bc_ssh_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 3, C_SIZEOF(tmask), C_LOC(tmask))
      CALL check_status('clSetKernelArg: arg 3 of bc_ssh_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 4, C_SIZEOF(xstart), C_LOC(xstart))
      CALL check_status('clSetKernelArg: arg 4 of bc_ssh_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 5, C_SIZEOF(xstop_1), C_LOC(xstop_1))
      CALL check_status('clSetKernelArg: arg 5 of bc_ssh_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 6, C_SIZEOF(ystart), C_LOC(ystart))
      CALL check_status('clSetKernelArg: arg 6 of bc_ssh_code', ierr)
      ierr = clSetKernelArg(kernel_obj, 7, C_SIZEOF(ystop), C_LOC(ystop))
      CALL check_status('clSetKernelArg: arg 7 of bc_ssh_code', ierr)
    END SUBROUTINE bc_ssh_code_set_args'''
    assert expected in generated_code
Ejemplo n.º 9
0
def test_opencl_options_effects():
    ''' Check that the OpenCL options produce the expected changes in the
    PSy layer.
    '''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen)

    # By default there is 1 queue, with an end barrier and local_size is 64
    assert "localsize = (/64, 1/)" in generated_code
    assert "ierr = clEnqueueNDRangeKernel(cmd_queues(1), " \
        "kernel_compute_cu_code, 2, C_NULL_PTR, C_LOC(globalsize), " \
        "C_LOC(localsize), 0, C_NULL_PTR, C_NULL_PTR)" in generated_code
    assert "! Block until all kernels have finished\n" \
        "      ierr = clFinish(cmd_queues(1))" in generated_code
    assert "ierr = clFinish(cmd_queues(2))" not in generated_code

    # Change kernel local_size to 4
    sched.coded_kernels()[0].set_opencl_options({'local_size': 4})
    generated_code = str(psy.gen)
    assert "localsize = (/4, 1/)" in generated_code

    # Change kernel queue to 2 (the barrier should then also go up to 2)
    sched.coded_kernels()[0].set_opencl_options({'queue_number': 2})
    generated_code = str(psy.gen)
    assert "ierr = clEnqueueNDRangeKernel(cmd_queues(2), " \
        "kernel_compute_cu_code, 2, C_NULL_PTR, C_LOC(globalsize), " \
        "C_LOC(localsize), 0, C_NULL_PTR, C_NULL_PTR)" in generated_code
    assert "! Block until all kernels have finished\n" \
        "      ierr = clFinish(cmd_queues(1))\n" \
        "      ierr = clFinish(cmd_queues(2))\n" in generated_code
    assert "ierr = clFinish(cmd_queues(3))" not in generated_code

    # Remove barrier at the end of the Invoke
    otrans.apply(sched, options={'end_barrier': False})
    generated_code = str(psy.gen)
    assert "! Block until all kernels have finished" not in generated_code
    assert "ierr = clFinish(cmd_queues(2))" not in generated_code
Ejemplo n.º 10
0
def test_opencl_kernel_output_file_with_suffix(kernel_outputdir):
    '''Check that an OpenCL file named modulename_kernelname_0 is
    generated without the _code suffix in the kernelname
    '''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    _ = psy.gen  # Generates the OpenCL kernels as a side-effect.

    assert os.path.exists(
        os.path.join(str(kernel_outputdir), "compute_cu_compute_cu_0.cl"))
Ejemplo n.º 11
0
def test_invoke_use_stmts_and_decls(kernel_outputdir, monkeypatch, debug_mode):
    ''' Test that generating code for OpenCL results in the correct
    module use statements and declarations. '''
    api_config = Config.get().api_conf("gocean1.0")
    monkeypatch.setattr(api_config, "_debug_mode", debug_mode)
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule

    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen).lower()
    expected = '''\
    subroutine invoke_0_compute_cu(cu_fld, p_fld, u_fld)
      use fortcl, only: create_rw_buffer\n'''

    # When in debug mode, import also the check_status function
    if debug_mode:
        expected += "      use ocl_utils_mod, only: check_status\n"

    expected += '''\
      use fortcl, only: get_num_cmd_queues, get_cmd_queues, get_kernel_by_name
      use clfortran
      use iso_c_binding
      type(r2d_field), intent(inout), target :: cu_fld, p_fld, u_fld
      integer xstart, xstop, ystart, ystop
      integer(kind=c_size_t), target :: localsize(2)
      integer(kind=c_size_t), target :: globalsize(2)
      integer(kind=c_intptr_t), target :: write_event
      integer(kind=c_size_t) size_in_bytes
      integer(kind=c_intptr_t), target, save :: kernel_compute_cu_code
      logical, save :: first_time=.true.
      integer ierr
      integer(kind=c_intptr_t), pointer, save :: cmd_queues(:)
      integer, save :: num_cmd_queues
      '''
    assert expected in generated_code
    assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
Ejemplo n.º 12
0
def test_grid_proprty(kernel_outputdir):
    # pylint: disable=unused-argument
    ''' Test that using nx and ny from the gocean property dictionary
    works.'''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen).lower()
    assert "globalsize = (/p_fld%grid%nx, p_fld%grid%ny/)" in generated_code
    expected = "size_in_bytes = int(p_fld%grid%nx*p_fld%grid%ny, 8)*" \
               "c_sizeof(p_fld%data(1,1))"
    assert expected in generated_code
Ejemplo n.º 13
0
def test_invoke_opencl_initialisation(kernel_outputdir):
    ''' Test that generating code for OpenCL results in the correct
    OpenCL first time initialisation code '''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen).lower()

    # Test that the necessary variables are declared at the beginning
    # of the invoke
    expected = '''\
      integer(kind=c_size_t), target :: localsize(2)
      integer(kind=c_size_t), target :: globalsize(2)
      integer(kind=c_intptr_t), target :: write_event
      integer(kind=c_size_t) size_in_bytes
      integer(kind=c_intptr_t), target, save :: kernel_compute_cu_code
      logical, save :: first_time=.true.
      integer ierr
      integer(kind=c_intptr_t), pointer, save :: cmd_queues(:)
      integer, save :: num_cmd_queues'''
    assert expected in generated_code

    # Test that a conditional 'first_time' code is generated with the
    # expected initialisation statements
    expected = '''\
      if (first_time) then
        first_time = .false.
        ! ensure opencl run-time is initialised for this psy-layer module
        call psy_init
        num_cmd_queues = get_num_cmd_queues()
        cmd_queues => get_cmd_queues()
        kernel_compute_cu_code = get_kernel_by_name("compute_cu_code")
      end if'''
    assert expected in generated_code
    assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
Ejemplo n.º 14
0
def trans(psy):
    '''
    Transformation routine for use with PSyclone. Converts any global-variable
    accesses into kernel arguments and then applies the OpenCL transformation
    to the PSy layer.

    :param psy: the PSy object which this script will transform.
    :type psy: :py:class:`psyclone.psyGen.PSy`
    :returns: the transformed PSy object.
    :rtype: :py:class:`psyclone.psyGen.PSy`

    '''

    # Get the necessary transformations
    tinfo = TransInfo()
    globaltrans = tinfo.get_trans_name('KernelGlobalsToArguments')
    move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans()
    cltrans = tinfo.get_trans_name('OCLTrans')

    for invoke in psy.invokes.invoke_list:
        print("Converting to OpenCL invoke: " + invoke.name)
        schedule = invoke.schedule

        # Skip invoke_2 as its time_smooth_code kernel contains a
        # module variable (alpha) which is not dealt with by the
        # KernelGlobalsToArguments transformation, see issue #826.
        if invoke.name == "invoke_2":
            continue

        # Remove the globals from inside each kernel and move PSy-layer
        # loop boundaries inside the kernel as a mask.
        for kern in schedule.kernels():
            print("Update kernel: " + kern.name)
            move_boundaries_trans.apply(kern)
            globaltrans.apply(kern)

        # Transform invoke to OpenCL
        cltrans.apply(schedule)

    return psy
Ejemplo n.º 15
0
def test_opencl_code_generation_with_boundary_mask():
    ''' Tests that OpenCL kernel generated after applying the
    GOMoveIterationBoundariesInsideKernelTrans has the 4 boundary values as
    kernel arguments and has a masking statement at the beginning of the
    executable code.
    '''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0, dist_mem=False)
    sched = psy.invokes.invoke_list[0].schedule
    kernel = sched.children[0].loop_body[0].loop_body[0]  # compute_cu kernel
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    trans.apply(kernel)
    kschedule = kernel.get_kernel_schedule()

    expected_code = ("__kernel void compute_cu_code(\n"
                     "  __global double * restrict cu,\n"
                     "  __global double * restrict p,\n"
                     "  __global double * restrict u,\n"
                     "  int xstart,\n"
                     "  int xstop,\n"
                     "  int ystart,\n"
                     "  int ystop\n"
                     "  ){\n"
                     "  int cuLEN1 = get_global_size(0);\n"
                     "  int cuLEN2 = get_global_size(1);\n"
                     "  int pLEN1 = get_global_size(0);\n"
                     "  int pLEN2 = get_global_size(1);\n"
                     "  int uLEN1 = get_global_size(0);\n"
                     "  int uLEN2 = get_global_size(1);\n"
                     "  int i = get_global_id(0);\n"
                     "  int j = get_global_id(1);\n"
                     "  if ((((i < xstart) || (i > xstop)) || ((j < ystart) ||"
                     " (j > ystop)))) {\n"
                     "    return;\n"
                     "  }\n"
                     "  cu[j * cuLEN1 + i] = ((0.5e0 * (p[j * pLEN1 + (i + 1)]"
                     " + p[j * pLEN1 + i])) * u[j * uLEN1 + i]);\n"
                     "}\n\n")

    openclwriter = OpenCLWriter()
    assert expected_code == openclwriter(kschedule)
Ejemplo n.º 16
0
def test_psy_init_with_options(kernel_outputdir):
    ''' Check that we create a psy_init() routine that sets-up the
    OpenCL environment with the provided non-default options. '''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched,
                 options={
                     "end_barrier": True,
                     "enable_profiling": True,
                     "out_of_order": True
                 })
    generated_code = str(psy.gen)
    assert "CALL ocl_env_init(1, ocl_device_num, .True., .True.)\n" \
        in generated_code
    assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
Ejemplo n.º 17
0
def test_opencl_multi_invoke_options_validation(option_to_check):
    ''' Check that the OpenCL options constrains are enforced when there are
    multiple invokes.
    '''
    psy, _ = get_invoke("test12_two_invokes_two_kernels.f90", API, idx=0)
    invoke1_schedule = psy.invokes.invoke_list[0].schedule
    invoke2_schedule = psy.invokes.invoke_list[1].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in invoke1_schedule.coded_kernels():
        trans.apply(kernel)
    for kernel in invoke2_schedule.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(invoke1_schedule, options={option_to_check: False})
    otrans.apply(invoke2_schedule, options={option_to_check: True})
    with pytest.raises(NotImplementedError) as err:
        _ = str(psy.gen)
    assert ("The current implementation creates a single OpenCL context for "
            "all the invokes which needs certain OpenCL options to match "
            "between invokes. Found '{0}' with unmatching values between "
            "invokes.".format(option_to_check) in str(err.value))
Ejemplo n.º 18
0
def trans(psy):
    ''' Transform the schedule for OpenCL generation '''

    # Import transformations
    tinfo = TransInfo()
    globaltrans = tinfo.get_trans_name('KernelImportsToArguments')
    move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans()
    cltrans = GOOpenCLTrans()

    # Get the invoke routine
    schedule = psy.invokes.get('invoke_0').schedule

    # Map the kernels by their name to different OpenCL queues. The multiple
    # command queues can be executed concurrently while each command queue
    # executes in-order its kernels. This provides functional parallelism
    # when kernels don't have dependencies between them.
    qmap = {
        'continuity_code': 1,
        'momentum_u_code': 2,
        'momentum_v_code': 3,
        'bc_ssh_code': 1,
        'bc_solid_u_code': 2,
        'bc_solid_v_code': 3,
        'bc_flather_u_code': 2,
        'bc_flather_v_code': 3,
        'field_copy_code': 1,
        'next_sshu_code': 1,
        'next_sshv_code': 1
    }

    # Remove global variables from inside each kernel, pass the boundary
    # values as arguments to the kernel and set the OpenCL work size to 64,
    # which is required for performance (with OpenCL < 1.2 this requires
    # the resulting application to be executed with DL_ESM_ALIGNMENT=64).
    # Technically the OpenCL global_size (which is controlled by
    # DL_ESM_ALIGNMENT) must be divisible by the work_group_size (which
    # is set to 64 in the psyclone script) in OpenCL implementations < 2.0.
    # But from OpenCL 2.0 the standard says its not necessary anymore.
    # In practice it is safe to always use it as most implementations
    # are lacking in this aspect.
    # If using a different WORK_GROUP_SIZE, make sure to update the
    # DL_ESM_ALIGNMENT to match.
    for kern in schedule.kernels():
        print(kern.name)
        globaltrans.apply(kern)
        if MOVE_BOUNDARIES:
            move_boundaries_trans.apply(kern)
        if FUCTIONAL_PARALLELISM:
            kern.set_opencl_options({
                'local_size': WORK_GROUP_SIZE,
                'queue_number': qmap[kern.name]
            })
        else:
            kern.set_opencl_options({'local_size': WORK_GROUP_SIZE})

    # Transform invoke to OpenCL
    cltrans.apply(schedule)

    if XILINX_CONFIG_FILE:
        # Create a Xilinx Compiler Configuration file
        path = Config.get().kernel_output_dir
        with open(os.path.join(path, "xilinx.cfg"), "w") as cfgfile:
            cfgfile.write("# Xilinx FPGA configuration file\n")
            # cfgfile.write("[connectivity]\n")
            # cfgfile.write("# Create 2 CU of the given kernels\n")
            # cfgfile.write("nk=continuity_code:2\n")
            # cfgfile.write("nk=momentum_u_code:2\n")
            # cfgfile.write("nk=momentum_v_code:2\n")
            # cfgfile.write("nk=bc_ssh_code:2\n")

            # cfgfile.write("\n[hls]\n")
            # cfgfile.write("# Assign CUs to different SLRs\n")
            # cfgfile.write("slr=momentum_u_code_1:SLR0\n")
            # cfgfile.write("slr=momentum_u_code_2:SLR0\n")
            # cfgfile.write("slr=momentum_v_code_1:SLR2\n")
            # cfgfile.write("slr=momentum_v_code_2:SLR2\n")

    return psy
Ejemplo n.º 19
0
def test_psy_init(kernel_outputdir, monkeypatch):
    ''' Check that we create a psy_init() routine that sets-up the
    OpenCL environment. '''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen)
    expected = (
        "    SUBROUTINE psy_init()\n"
        "      USE fortcl, ONLY: ocl_env_init, add_kernels\n"
        "      CHARACTER(LEN=30) kernel_names(1)\n"
        "      INTEGER :: ocl_device_num=1\n"
        "      LOGICAL, save :: initialised=.False.\n"
        "      ! Check to make sure we only execute this routine once\n"
        "      IF (.not. initialised) THEN\n"
        "        initialised = .True.\n"
        "        ! Initialise the OpenCL environment/device\n"
        "        CALL ocl_env_init(1, ocl_device_num, .False., .False.)\n"
        "        ! The kernels this PSy layer module requires\n"
        "        kernel_names(1) = \"compute_cu_code\"\n"
        "        ! Create the OpenCL kernel objects. Expects to find all of "
        "the compiled\n"
        "        ! kernels in FORTCL_KERNELS_FILE.\n"
        "        CALL add_kernels(1, kernel_names)\n"
        "      END IF\n"
        "    END SUBROUTINE psy_init\n")
    assert expected in generated_code
    assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)

    # Test with a non-default number of OpenCL queues
    sched.coded_kernels()[0].set_opencl_options({'queue_number': 5})
    generated_code = str(psy.gen)
    expected = (
        "    SUBROUTINE psy_init()\n"
        "      USE fortcl, ONLY: ocl_env_init, add_kernels\n"
        "      CHARACTER(LEN=30) kernel_names(1)\n"
        "      INTEGER :: ocl_device_num=1\n"
        "      LOGICAL, save :: initialised=.False.\n"
        "      ! Check to make sure we only execute this routine once\n"
        "      IF (.not. initialised) THEN\n"
        "        initialised = .True.\n"
        "        ! Initialise the OpenCL environment/device\n"
        "        CALL ocl_env_init(5, ocl_device_num, .False., .False.)\n"
        "        ! The kernels this PSy layer module requires\n"
        "        kernel_names(1) = \"compute_cu_code\"\n"
        "        ! Create the OpenCL kernel objects. Expects to find all of "
        "the compiled\n"
        "        ! kernels in FORTCL_KERNELS_FILE.\n"
        "        CALL add_kernels(1, kernel_names)\n"
        "      END IF\n"
        "    END SUBROUTINE psy_init\n")
    assert expected in generated_code
    assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)

    # Test with a different configuration value for OCL_DEVICES_PER_NODE
    # that needs a mod() and a get_rank() expression.
    monkeypatch.setattr(Config.get(), "_ocl_devices_per_node", 2)
    generated_code = str(psy.gen)
    expected = (
        "    SUBROUTINE psy_init()\n"
        "      USE parallel_mod, ONLY: get_rank\n"
        "      USE fortcl, ONLY: ocl_env_init, add_kernels\n"
        "      CHARACTER(LEN=30) kernel_names(1)\n"
        "      INTEGER :: ocl_device_num=1\n"
        "      LOGICAL, save :: initialised=.False.\n"
        "      ! Check to make sure we only execute this routine once\n"
        "      IF (.not. initialised) THEN\n"
        "        initialised = .True.\n"
        "        ! Initialise the OpenCL environment/device\n"
        "        ocl_device_num = mod(get_rank() - 1, 2) + 1\n"
        "        CALL ocl_env_init(5, ocl_device_num, .False., .False.)\n"
        "        ! The kernels this PSy layer module requires\n"
        "        kernel_names(1) = \"compute_cu_code\"\n"
        "        ! Create the OpenCL kernel objects. Expects to find all of "
        "the compiled\n"
        "        ! kernels in FORTCL_KERNELS_FILE.\n"
        "        CALL add_kernels(1, kernel_names)\n"
        "      END IF\n"
        "    END SUBROUTINE psy_init\n")
    assert expected in generated_code
    assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
def test_go_move_iteration_boundaries_inside_kernel_trans():
    ''' Tests that the GOMoveIterationBoundariesInsideKernelTrans
    transformation for the GOcean API adds the 4 boundary values as kernel
    arguments and adds a masking statement at the beginning of the code.
    '''
    psy, _ = get_invoke("single_invoke.f90", API, idx=0, dist_mem=False)
    sched = psy.invokes.invoke_list[0].schedule
    kernel = sched.children[0].loop_body[0].loop_body[0]  # compute_cu kernel
    num_args = len(kernel.arguments.args)

    # Add some name conflicting symbols in the Invoke and the Kernel
    kernel.root.symbol_table.new_symbol("xstop")
    kernel.get_kernel_schedule().symbol_table.new_symbol("ystart")

    # Apply the transformation
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    trans.apply(kernel)

    # Check that the kernel call have been transformed:
    # - Immediately before the loop there are the boundaries assignments
    assert isinstance(sched.children[0], Assignment)
    assert sched.children[0].lhs.symbol.name == "xstart"
    assert isinstance(sched.children[1], Assignment)
    assert sched.children[1].lhs.symbol.name == "xstop_1"
    assert isinstance(sched.children[2], Assignment)
    assert sched.children[2].lhs.symbol.name == "ystart"
    assert isinstance(sched.children[3], Assignment)
    assert sched.children[3].lhs.symbol.name == "ystop"

    # - The loops have been transformed
    assert isinstance(sched.children[4], GOLoop)
    assert sched.children[4].field_space == "go_every"
    assert sched.children[4].iteration_space == "go_all_pts"
    assert isinstance(sched.children[4].loop_body[0], GOLoop)
    assert sched.children[4].loop_body[0].field_space == "go_every"
    assert sched.children[4].loop_body[0].iteration_space == "go_all_pts"

    # -  And the appropriate arguments have been added to the kernel call
    assert len(kernel.arguments.args) == num_args + 4
    assert kernel.arguments.args[-4].name == "xstart"
    assert kernel.arguments.args[-4].argument_type == "scalar"
    assert kernel.arguments.args[-3].name == "xstop_1"
    assert kernel.arguments.args[-3].argument_type == "scalar"
    assert kernel.arguments.args[-2].name == "ystart"
    assert kernel.arguments.args[-1].argument_type == "scalar"
    assert kernel.arguments.args[-1].name == "ystop"
    assert kernel.arguments.args[-1].argument_type == "scalar"

    # Check that the kernel subroutine has been transformed:
    kschedule = kernel.get_kernel_schedule()

    # - It has the boundary conditions mask
    assert isinstance(kschedule.children[0], IfBlock)
    assert str(kschedule.children[0].condition) == (
        "BinaryOperation[operator:'OR']\n"
        "BinaryOperation[operator:'OR']\n"
        "BinaryOperation[operator:'LT']\n"
        "Reference[name:'i']\n"
        "Reference[name:'xstart']\n"
        "BinaryOperation[operator:'GT']\n"
        "Reference[name:'i']\n"
        "Reference[name:'xstop']\n"
        "BinaryOperation[operator:'OR']\n"
        "BinaryOperation[operator:'LT']\n"
        "Reference[name:'j']\n"
        "Reference[name:'ystart_1']\n"
        "BinaryOperation[operator:'GT']\n"
        "Reference[name:'j']\n"
        "Reference[name:'ystop']")
    assert isinstance(kschedule.children[0].if_body[0], Return)

    # - It has the boundary symbol as kernel arguments
    assert isinstance(kschedule.symbol_table.lookup("xstart").interface,
                      ArgumentInterface)
    assert isinstance(kschedule.symbol_table.lookup("xstop").interface,
                      ArgumentInterface)
    assert isinstance(kschedule.symbol_table.lookup("ystart_1").interface,
                      ArgumentInterface)
    assert isinstance(kschedule.symbol_table.lookup("ystop").interface,
                      ArgumentInterface)
def test_go_move_iteration_boundaries_inside_kernel_two_kernels_apply_twice():
    ''' Tests that the GOMoveIterationBoundariesInsideKernelTrans
    transformation for the GOcean API produces the expected code when the
    invoke has two kernels and the transformation is applied twice.
    We check that the kernels don't use the same boundary values (some are
    postfixed with a number) and that kernels don't duplicate boundary
    arguments themself when applying the transformation twice.
    '''
    psy, _ = get_invoke("single_invoke_two_kernels.f90", API, idx=0,
                        dist_mem=False)
    sched = psy.invokes.invoke_list[0].schedule

    # Apply the transformation twice
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)
        trans.apply(kernel)

    expected = '''subroutine invoke_0(cu_fld, p_fld, u_fld, u_fld, unew_fld, \
uold_fld)
  use compute_cu_mod, only : compute_cu_code
  use time_smooth_mod, only : time_smooth_code
  type(r2d_type), intent(out) :: cu_fld
  type(r2d_type), intent(in) :: p_fld
  type(r2d_type), intent(in) :: u_fld
  type(r2d_type), intent(in) :: unew_fld
  type(r2d_type), intent(inout) :: uold_fld
  integer :: j
  integer :: xstart
  integer :: xstop
  integer :: ystart
  integer :: ystop
  integer :: xstart_1
  integer :: xstop_1
  integer :: ystart_1
  integer :: ystop_1
  integer :: i
  integer :: i_1

  xstart = cu_fld%internal%xstart
  xstop = cu_fld%internal%xstop
  ystart = cu_fld%internal%ystart
  ystop = cu_fld%internal%ystop
  do j = 1, SIZE(cu_fld%data, 2), 1
    do i = 1, SIZE(cu_fld%data, 1), 1
      call compute_cu_code(i, j, cu_fld%data, p_fld%data, u_fld%data, xstart, \
xstop, ystart, ystop)
    enddo
  enddo
  xstart_1 = 1
  xstop_1 = SIZE(uold_fld%data, 1)
  ystart_1 = 1
  ystop_1 = SIZE(uold_fld%data, 2)
  do j = 1, SIZE(uold_fld%data, 2), 1
    do i_1 = 1, SIZE(uold_fld%data, 1), 1
      call time_smooth_code(i_1, j, u_fld%data, unew_fld%data, uold_fld%data, \
xstart_1, xstop_1, ystart_1, ystop_1)
    enddo
  enddo

end subroutine invoke_0
'''

    writer = FortranWriter()
    sched.lower_to_language_level()
    assert writer(sched) == expected
Ejemplo n.º 22
0
def test_invoke_opencl_kernel_call(kernel_outputdir, monkeypatch, debug_mode):
    ''' Check that the Invoke OpenCL produce the expected kernel enqueue
    statement to launch OpenCL kernels. '''
    api_config = Config.get().api_conf("gocean1.0")
    monkeypatch.setattr(api_config, "_debug_mode", debug_mode)
    psy, _ = get_invoke("single_invoke.f90", API, idx=0)
    sched = psy.invokes.invoke_list[0].schedule
    # Currently, moving the boundaries inside the kernel is a prerequisite
    # for the GOcean gen_ocl() code generation.
    trans = GOMoveIterationBoundariesInsideKernelTrans()
    for kernel in sched.coded_kernels():
        trans.apply(kernel)

    otrans = OCLTrans()
    otrans.apply(sched)
    generated_code = str(psy.gen)

    # Set up globalsize and localsize values
    expected = '''\
      globalsize = (/p_fld%grid%nx, p_fld%grid%ny/)
      localsize = (/64, 1/)'''

    if debug_mode:
        # Check that the globalsize first dimension is a multiple of
        # the localsize first dimension
        expected += '''
      IF (mod(p_fld%grid%nx, 64) .ne. 0) THEN
        CALL check_status("Global size is not a multiple of local size \
(mandatory in OpenCL < 2.0).", -1)
      END IF'''
    assert expected in generated_code

    # Call the set_args subroutine with the boundaries corrected for the
    # OpenCL 0-indexing
    expected += '''
      CALL compute_cu_code_set_args(kernel_compute_cu_code, \
cu_fld%device_ptr, p_fld%device_ptr, u_fld%device_ptr, \
xstart - 1, xstop - 1, \
ystart - 1, ystop - 1)'''

    expected += '''
      ! Launch the kernel'''

    if debug_mode:
        # Check that there is no pending error in the queue before launching
        # the kernel
        expected += '''
      ierr = clFinish(cmd_queues(1))
      CALL check_status('Errors before compute_cu_code launch', ierr)'''

    expected += '''
      ierr = clEnqueueNDRangeKernel(cmd_queues(1), kernel_compute_cu_code, \
2, C_NULL_PTR, C_LOC(globalsize), C_LOC(localsize), 0, C_NULL_PTR, \
C_NULL_PTR)
      !'''

    if debug_mode:
        # Check that there are no errors during the kernel launch or during
        # the execution of the kernel.
        expected += '''
      CALL check_status('compute_cu_code clEnqueueNDRangeKernel', ierr)
      ierr = clFinish(cmd_queues(1))
      CALL check_status('Errors during compute_cu_code', ierr)'''

    assert expected in generated_code
    assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)