def trans(psy):
    '''PSyclone transformation script (passed to generate() via the
    -s option). Decorates every loop in invoke_0's schedule with an
    OpenMP DO directive and then wraps them all in one OpenMP
    PARALLEL region.'''
    from psyclone.psyGen import TransInfo
    registry = TransInfo()
    omp_loop = registry.get_trans_name('GOceanOMPLoopTrans')
    omp_region = registry.get_trans_name('OMPParallelTrans')
    sched = psy.invokes.get('invoke_0').schedule
    # Uncomment to inspect the schedule before transformation:
    # sched.view()
    # Apply the OpenMP Loop transformation to every loop in the
    # schedule.
    for loop in sched.children:
        updated, _ = omp_loop.apply(loop)
        sched = updated
    # Enclose all of the loops within a single OpenMP PARALLEL region.
    updated, _ = omp_region.apply(sched.children)
    psy.invokes.get('invoke_0').schedule = updated
    return psy
def trans(psy):
    '''Transformation entry point.'''
    dist_mem = Config.get().distributed_memory
    registry = TransInfo()
    par_loop_trans = registry.get_trans_name('GOceanOMPParallelLoopTrans')
    omp_loop_trans = registry.get_trans_name('GOceanOMPLoopTrans')
    omp_region_trans = registry.get_trans_name('OMPParallelTrans')
    inline_trans = registry.get_trans_name('KernelModuleInline')
    sched = psy.invokes.get('invoke_0').schedule
    # Module-inline every kernel in this Schedule.
    for kern in sched.kernels():
        inline_trans.apply(kern)
    if dist_mem:
        # With distributed memory, every Loop gets its own
        # OMP PARALLEL DO.
        for node in sched.children:
            if isinstance(node, Loop):
                par_loop_trans.apply(node)
    else:
        # Without distributed memory, give every child an OMP DO and
        # then enclose them all within a single OMP PARALLEL region.
        for node in sched.children:
            omp_loop_trans.apply(node)
        omp_region_trans.apply(sched.children)
    return psy
def trans(psy):
    '''Take the supplied psy object, apply OpenACC transformations to
    the schedule of invoke_0 and return the new psy object.'''
    registry = TransInfo()
    acc_parallel = registry.get_trans_name('ACCParallelTrans')
    acc_loop = registry.get_trans_name('ACCLoopTrans')
    acc_enter_data = registry.get_trans_name('ACCEnterDataTrans')
    acc_routine = registry.get_trans_name('ACCRoutineTrans')
    imports_to_args = registry.get_trans_name('KernelImportsToArguments')
    inline_trans = registry.get_trans_name('KernelModuleInline')
    sched = psy.invokes.get('invoke_0').schedule
    # Apply the OpenACC Loop transformation (collapsing two loops) to
    # every loop in the schedule.
    for node in sched.children:
        if isinstance(node, Loop):
            acc_loop.apply(node, {"collapse": 2})
    # Put all of the loops in a single parallel region and then add an
    # enter-data directive to manage device memory.
    acc_parallel.apply(sched)
    acc_enter_data.apply(sched)
    # ACCRoutineTrans requires that kernels access no global data, so
    # convert any imports to arguments first, then module-inline each
    # kernel.
    for kern in sched.coded_kernels():
        imports_to_args.apply(kern)
        acc_routine.apply(kern)
        inline_trans.apply(kern)
    return psy
def trans(psy):
    '''Transformation routine for use with PSyclone. Moves any
    global-variable accesses into kernel arguments and then applies
    the OpenCL transformation to the PSy layer.

    :param psy: the PSy object which this script will transform.
    :type psy: :py:class:`psyclone.psyGen.PSy`
    :returns: the transformed PSy object.
    :rtype: :py:class:`psyclone.psyGen.PSy`

    '''
    # Look up the transformations we need.
    registry = TransInfo()
    globals_to_args = registry.get_trans_name('KernelGlobalsToArguments')
    move_boundaries = GOMoveIterationBoundariesInsideKernelTrans()
    opencl_trans = registry.get_trans_name('OCLTrans')
    for invoke in psy.invokes.invoke_list:
        print("Converting to OpenCL invoke: " + invoke.name)
        sched = invoke.schedule
        if invoke.name == "invoke_2":
            # invoke_2's time_smooth_code kernel accesses a module
            # variable (alpha) that KernelGlobalsToArguments cannot
            # handle - see issue #826 - so leave it untransformed.
            continue
        # Strip the globals out of each kernel and move the PSy-layer
        # loop boundaries inside the kernel as a mask.
        for kern in sched.kernels():
            print("Update kernel: " + kern.name)
            move_boundaries.apply(kern)
            globals_to_args.apply(kern)
        # Convert this invoke to OpenCL.
        opencl_trans.apply(sched)
    return psy
def trans(psy):
    '''Transformation script entry point: module-inlines every coded
    kernel found in the schedule of invoke_0.'''
    inline_trans = TransInfo().get_trans_name('KernelModuleInline')
    for kern in psy.invokes.get('invoke_0').schedule.coded_kernels():
        inline_trans.apply(kern)
    return psy
from __future__ import print_function
from psyclone.parse.algorithm import parse
from psyclone.psyGen import PSyFactory
from psyclone.psyGen import TransInfo

# Parse the algorithm file and build the PSy layer for the dynamo0.1 API.
API = "dynamo0.1"
_, INVOKE_INFO = parse("dynamo_algorithm_mod.F90", api=API)
PSY = PSyFactory(API).create(INVOKE_INFO)
print(PSY.gen)
print(PSY.invokes.names)

# Look up the transformations we need.
TINFO = TransInfo()
print(TINFO.list)
FUSE = TINFO.get_trans_name('LoopFuse')
OMP = TINFO.get_trans_name('OMPParallelLoopTrans')

# Fuse the first two loops of invoke_0 and then OpenMP-parallelise the
# fused loop.
SCHED = PSY.invokes.get('invoke_0').schedule
SCHED.view()
FUSED, _ = FUSE.apply(SCHED.children[0], SCHED.children[1])
FUSED.view()
PARALLELISED, _ = OMP.apply(FUSED.children[0])
PARALLELISED.view()
PSY.invokes.get('invoke_0').schedule = PARALLELISED

# Show the (untransformed) schedule of the second invoke.
SCHED = PSY.invokes.get('invoke_1_v2_kernel_type').schedule
SCHED.view()
# Example script: loop-fuse the shallow-water benchmark and prepare it for
# OpenACC. NOTE(review): this block appears truncated - it ends at the
# "fuse all inner loops" comment with no further statements visible.
if __name__ == "__main__":
    from psyclone.parse.algorithm import parse
    from psyclone.psyGen import PSyFactory, TransInfo
    # Build the PSy layer (no distributed memory) for the GOcean 1.0 API.
    API = "gocean1.0"
    _, INVOKEINFO = parse("shallow_alg.f90", api=API)
    PSY = PSyFactory(API, distributed_memory=False).create(INVOKEINFO)
    print(PSY.gen)
    print(PSY.invokes.names)
    SCHEDULE = PSY.invokes.get('invoke_0').schedule
    SCHEDULE.view()
    # Look up the loop-fusion and OpenACC transformations.
    TRANS_INFO = TransInfo()
    print(TRANS_INFO.list)
    FUSE_TRANS = TRANS_INFO.get_trans_name('LoopFuse')
    PTRANS = TRANS_INFO.get_trans_name('ACCParallelTrans')
    DTRANS = TRANS_INFO.get_trans_name('ACCEnterDataTrans')
    LTRANS = TRANS_INFO.get_trans_name('ACCLoopTrans')
    # invoke0
    # fuse all outer loops
    # Each apply() returns a new schedule; successive fusions therefore
    # operate on the schedule produced by the previous one.
    LF1_SCHEDULE, _ = FUSE_TRANS.apply(SCHEDULE.children[0],
                                       SCHEDULE.children[1])
    LF2_SCHEDULE, _ = FUSE_TRANS.apply(LF1_SCHEDULE.children[0],
                                       LF1_SCHEDULE.children[1])
    LF3_SCHEDULE, _ = FUSE_TRANS.apply(LF2_SCHEDULE.children[0],
                                       LF2_SCHEDULE.children[1])
    LF3_SCHEDULE.view()
    # fuse all inner loops
# Example script: repeatedly loop-fuse the shallow-water benchmark.
# Fix: the original used Python-2-only print statements (``print X``),
# which are a SyntaxError under Python 3; converted to print() calls and
# added the __future__ import for Python-2 compatibility, matching the
# style of the other example scripts in this collection.
from __future__ import print_function
from psyclone.parse import parse
from psyclone.psyGen import PSyFactory, TransInfo

# Build the PSy layer for the GOcean 1.0 API.
API = "gocean1.0"
_, INVOKEINFO = parse("shallow_alg.f90", api=API)
PSY = PSyFactory(API).create(INVOKEINFO)
# Print the vanilla, generated Fortran
print(PSY.gen)
print(PSY.invokes.names)
SCHEDULE = PSY.invokes.get('invoke_0').schedule
SCHEDULE.view()
TRANS_INFO = TransInfo()
print(TRANS_INFO.list)
FUSE_TRANS = TRANS_INFO.get_trans_name('LoopFuse')
# fuse all outer loops - each apply() returns a new schedule, so each
# fusion operates on the result of the previous one.
LF1_SCHED, _ = FUSE_TRANS.apply(SCHEDULE.children[0],
                                SCHEDULE.children[1])
LF2_SCHED, _ = FUSE_TRANS.apply(LF1_SCHED.children[0],
                                LF1_SCHED.children[1])
LF3_SCHED, _ = FUSE_TRANS.apply(LF2_SCHED.children[0],
                                LF2_SCHED.children[1])
LF3_SCHED.view()
# fuse all inner loops (the children of the single fused outer loop)
LF4_SCHED, _ = FUSE_TRANS.apply(LF3_SCHED.children[0].children[0],
                                LF3_SCHED.children[0].children[1])
LF5_SCHED, _ = FUSE_TRANS.apply(LF4_SCHED.children[0].children[0],
                                LF4_SCHED.children[0].children[1])
LF6_SCHED, _ = FUSE_TRANS.apply(LF5_SCHED.children[0].children[0],
                                LF5_SCHED.children[0].children[1])
LF6_SCHED.view()
from psyclone.parse import parse
from psyclone.psyGen import PSyFactory, TransInfo

if __name__ == "__main__":
    from psyclone.nemo import NemoKern

    # Build the PSy layer for the NEMO tracer-advection benchmark.
    API = "nemo"
    _, INVOKE_INFO = parse("tra_adv.F90", api=API)
    PSY = PSyFactory(API).create(INVOKE_INFO)
    print(PSY.gen)
    print("Invokes found:")
    print(PSY.invokes.names)

    SCHED = PSY.invokes.get('tra_adv').schedule
    SCHED.view()

    OMP_TRANS = TransInfo().get_trans_name('OMPParallelLoopTrans')

    # OpenMP-parallelise every "levels" loop that contains a kernel.
    # TODO loop.kernel method needs extending to cope with multiple
    # kernels.
    for loop in SCHED.loops():
        kernel_list = loop.walk(loop.children, NemoKern)
        if kernel_list and loop.loop_type == "levels":
            SCHED, _ = OMP_TRANS.apply(loop)

    SCHED.view()
    PSY.invokes.get('tra_adv').schedule = SCHED
    print(PSY.gen)
# Example script: loop-fuse the shallow-water benchmark and OpenMP-
# parallelise the result.
# NOTE(review): the print statements below are Python-2-only syntax and
# will not run under Python 3; left as-is because this block is truncated
# (it ends mid-statement) and cannot be safely rewritten from here.
from psyclone.parse import parse
from psyclone.psyGen import PSyFactory, TransInfo
# Build the PSy layer for the GOcean 1.0 API.
API = "gocean1.0"
_, INVOKEINFO = parse("shallow_alg.f90", api=API)
PSY = PSyFactory(API).create(INVOKEINFO)
print PSY.gen
print PSY.invokes.names
SCHEDULE = PSY.invokes.get('invoke_0').schedule
SCHEDULE.view()
# Look up the loop-fusion and OpenMP transformations.
TRANS_INFO = TransInfo()
print TRANS_INFO.list
FUSE_TRANS = TRANS_INFO.get_trans_name('LoopFuse')
OMP_TRANS = TRANS_INFO.get_trans_name('GOceanOMPParallelLoopTrans')
# invoke0
# fuse all outer loops
# Each apply() returns a new schedule; successive fusions operate on the
# schedule produced by the previous one.
LF1_SCHEDULE, _ = FUSE_TRANS.apply(SCHEDULE.children[0],
                                   SCHEDULE.children[1])
LF2_SCHEDULE, _ = FUSE_TRANS.apply(LF1_SCHEDULE.children[0],
                                   LF1_SCHEDULE.children[1])
LF3_SCHEDULE, _ = FUSE_TRANS.apply(LF2_SCHEDULE.children[0],
                                   LF2_SCHEDULE.children[1])
LF3_SCHEDULE.view()
# fuse all inner loops
LF4_SCHEDULE, _ = FUSE_TRANS.apply(LF3_SCHEDULE.children[0].children[0],
                                   LF3_SCHEDULE.children[0].children[1])
LF5_SCHEDULE, _ = FUSE_TRANS.apply(LF4_SCHEDULE.children[0].children[0],
# Example script: loop-fuse the shallow-water benchmark and prepare it
# for OpenACC. NOTE(review): this block appears truncated - it ends at
# the "fuse all inner loops" comment with no further statements visible.
if __name__ == "__main__":
    from psyclone.parse import parse
    from psyclone.psyGen import PSyFactory, TransInfo
    # Build the PSy layer for the GOcean 1.0 API.
    api = "gocean1.0"
    _, invokeinfo = parse("shallow_alg.f90", api=api)
    psy = PSyFactory(api).create(invokeinfo)
    print(psy.gen)
    print(psy.invokes.names)
    schedule = psy.invokes.get('invoke_0').schedule
    schedule.view()
    # Look up the loop-fusion and OpenACC transformations.
    trans_info = TransInfo()
    print(trans_info.list)
    fuse_trans = trans_info.get_trans_name('LoopFuse')
    ptrans = trans_info.get_trans_name('ACCParallelTrans')
    dtrans = trans_info.get_trans_name('ACCDataTrans')
    ltrans = trans_info.get_trans_name('ACCLoopTrans')
    # invoke0
    # fuse all outer loops
    # Each apply() returns a new schedule; successive fusions operate on
    # the schedule produced by the previous one.
    lf1_schedule, _ = fuse_trans.apply(schedule.children[0],
                                       schedule.children[1])
    lf2_schedule, _ = fuse_trans.apply(lf1_schedule.children[0],
                                       lf1_schedule.children[1])
    lf3_schedule, _ = fuse_trans.apply(lf2_schedule.children[0],
                                       lf2_schedule.children[1])
    lf3_schedule.view()
    # fuse all inner loops
def trans(psy):
    ''' Transform the schedule for OpenCL generation '''
    # Import transformations
    tinfo = TransInfo()
    globaltrans = tinfo.get_trans_name('KernelImportsToArguments')
    move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans()
    cltrans = GOOpenCLTrans()
    # Get the invoke routine
    schedule = psy.invokes.get('invoke_0').schedule
    # Map the kernels by their name to different OpenCL queues. The multiple
    # command queues can be executed concurrently while each command queue
    # executes in-order its kernels. This provides functional parallelism
    # when kernels don't have dependencies between them.
    qmap = {
        'continuity_code': 1,
        'momentum_u_code': 2,
        'momentum_v_code': 3,
        'bc_ssh_code': 1,
        'bc_solid_u_code': 2,
        'bc_solid_v_code': 3,
        'bc_flather_u_code': 2,
        'bc_flather_v_code': 3,
        'field_copy_code': 1,
        'next_sshu_code': 1,
        'next_sshv_code': 1
    }
    # Remove global variables from inside each kernel, pass the boundary
    # values as arguments to the kernel and set the OpenCL work size to 64,
    # which is required for performance (with OpenCL < 1.2 this requires
    # the resulting application to be executed with DL_ESM_ALIGNMENT=64).
    # Technically the OpenCL global_size (which is controlled by
    # DL_ESM_ALIGNMENT) must be divisible by the work_group_size (which
    # is set to 64 in the psyclone script) in OpenCL implementations < 2.0.
    # But from OpenCL 2.0 the standard says its not necessary anymore.
    # In practice it is safe to always use it as most implementations
    # are lacking in this aspect.
    # If using a different WORK_GROUP_SIZE, make sure to update the
    # DL_ESM_ALIGNMENT to match.
    # NOTE(review): MOVE_BOUNDARIES, FUCTIONAL_PARALLELISM,
    # WORK_GROUP_SIZE and XILINX_CONFIG_FILE are module-level constants
    # defined outside this view - "FUCTIONAL" looks like a typo of
    # "FUNCTIONAL" but cannot be renamed without seeing its definition.
    for kern in schedule.kernels():
        print(kern.name)
        globaltrans.apply(kern)
        if MOVE_BOUNDARIES:
            move_boundaries_trans.apply(kern)
        if FUCTIONAL_PARALLELISM:
            # Assign each kernel to its command queue from qmap so that
            # independent kernels can run concurrently.
            kern.set_opencl_options({
                'local_size': WORK_GROUP_SIZE,
                'queue_number': qmap[kern.name]
            })
        else:
            kern.set_opencl_options({'local_size': WORK_GROUP_SIZE})
    # Transform invoke to OpenCL
    cltrans.apply(schedule)
    if XILINX_CONFIG_FILE:
        # Create a Xilinx Compiler Configuration file
        path = Config.get().kernel_output_dir
        with open(os.path.join(path, "xilinx.cfg"), "w") as cfgfile:
            cfgfile.write("# Xilinx FPGA configuration file\n")
            # cfgfile.write("[connectivity]\n")
            # cfgfile.write("# Create 2 CU of the given kernels\n")
            # cfgfile.write("nk=continuity_code:2\n")
            # cfgfile.write("nk=momentum_u_code:2\n")
            # cfgfile.write("nk=momentum_v_code:2\n")
            # cfgfile.write("nk=bc_ssh_code:2\n")
            # cfgfile.write("\n[hls]\n")
            # cfgfile.write("# Assign CUs to different SLRs\n")
            # cfgfile.write("slr=momentum_u_code_1:SLR0\n")
            # cfgfile.write("slr=momentum_u_code_2:SLR0\n")
            # cfgfile.write("slr=momentum_v_code_1:SLR2\n")
            # cfgfile.write("slr=momentum_v_code_2:SLR2\n")
    return psy
print(psy.gen) # List the various invokes that the PSy layer contains print(psy.invokes.names) # Get the loop schedule associated with one of these # invokes schedule = psy.invokes.get('invoke_0_v3_kernel_type').schedule schedule.view() # Get the list of possible loop transformations t = TransInfo() print(t.list) # Create an OpenMPLoop-transformation object ol = t.get_trans_name('OMPParallelLoopTrans') # Apply it to the loop schedule of the selected invoke ol.apply(schedule.children[0]) schedule.view() # Generate the Fortran code for the new PSy layer print(psy.gen) schedule = psy.invokes.get('invoke_1_v3_solver_kernel_type').schedule schedule.view() ol.apply(schedule.children[0]) schedule.view() print(psy.gen)
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
# Author R. Ford STFC Daresbury Lab
from __future__ import print_function
from psyclone.parse.algorithm import parse
from psyclone.psyGen import PSyFactory
from psyclone.psyGen import TransInfo

# Build the PSy layer for the dynamo0.1 API and show the generated code.
api = "dynamo0.1"
ast, invoke_info = parse("dynamo_algorithm_mod.F90", api=api)
psy = PSyFactory(api).create(invoke_info)
print(psy.gen)
print(psy.invokes.names)

schedule = psy.invokes.get('invoke_0').schedule
schedule.view()

# Look up the loop-fusion transformation.
tinfo = TransInfo()
print(tinfo.list)
fuse = tinfo.get_trans_name('LoopFuse')
schedule.view()

# Fuse the first two loops of invoke_0 and store the new schedule back.
fused_schedule, _ = fuse.apply(schedule.children[0], schedule.children[1])
fused_schedule.view()
psy.invokes.get('invoke_0').schedule = fused_schedule
print(psy.gen)
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
# Authors: R. Ford and A. R. Porter, STFC Daresbury Lab
from __future__ import print_function
from psyclone.parse.algorithm import parse
from psyclone.psyGen import PSyFactory
from psyclone.psyGen import TransInfo

# Build the PSy layer for the dynamo0.1 API and show the generated code.
api = "dynamo0.1"
ast, invoke_info = parse("dynamo_algorithm_mod.F90", api=api)
psy = PSyFactory(api).create(invoke_info)
print(psy.gen)
print(psy.invokes.names)

schedule = psy.invokes.get('invoke_0').schedule
schedule.view()

# Look up the loop-fusion transformation.
tinfo = TransInfo()
print(tinfo.list)
fuse = tinfo.get_trans_name('LoopFuseTrans')
schedule.view()

# Fuse the first two loops of invoke_0 and store the new schedule back.
fused_schedule, _ = fuse.apply(schedule.children[0], schedule.children[1])
fused_schedule.view()
psy.invokes.get('invoke_0').schedule = fused_schedule
print(psy.gen)
# Example script: make implicit loops explicit in the NEMO traldf_iso
# benchmark and then OpenMP-parallelise the "levels" loops.
# Fix: the result of OMP_TRANS.apply() was assigned to an unused
# lowercase `sched` and thereby discarded; it is now assigned back to
# SCHED, consistent with the analogous tra_adv script in this
# collection, so the viewed/stored schedule is the transformed one.
# (`parse`, `PSyFactory` and `TransInfo` are imported at the top of the
# file, outside this view.)
if __name__ == "__main__":
    from psyclone.nemo import NemoKern, NemoImplicitLoop
    # Build the PSy layer for the NEMO API.
    API = "nemo"
    _, INVOKEINFO = parse("traldf_iso.F90", api=API)
    PSY = PSyFactory(API).create(INVOKEINFO)
    print(PSY.gen)
    print("Invokes found:")
    print(PSY.invokes.names)
    SCHED = PSY.invokes.get('tra_ldf_iso').schedule
    SCHED.view()
    TRANS_INFO = TransInfo()
    print(TRANS_INFO.list)
    OMP_TRANS = TRANS_INFO.get_trans_name('OMPParallelLoopTrans')
    DO_TRANS = TRANS_INFO.get_trans_name('NemoExplicitLoopTrans')
    # Transform each implicit loop to make the outermost loop explicit
    for loop in SCHED.loops():
        if isinstance(loop, NemoImplicitLoop):
            _, _ = DO_TRANS.apply(loop)
    # OpenMP-parallelise every "levels" loop that contains a kernel.
    for loop in SCHED.loops():
        # TODO loop.kernel method needs extending to cope with
        # multiple kernels
        kernels = loop.walk(loop.children, NemoKern)
        if kernels and loop.loop_type == "levels":
            SCHED, _ = OMP_TRANS.apply(loop)
    SCHED.view()
    PSY.invokes.get('tra_ldf_iso').schedule = SCHED