def __init__(self, cfg):
    super().__init__(cfg)

    from pyfr.backends.hip.compiler import HIPRTC
    from pyfr.backends.hip.driver import HIP

    # Load and wrap HIP and HIPRTC
    self.hip = HIP()
    self.hiprtc = HIPRTC()

    # Get the desired HIP device
    devid = cfg.get('backend-hip', 'device-id', 'local-rank')
    if not re.match(r'(local-rank|\d+)$', devid):
        raise ValueError('Invalid device-id')

    # Handle the local-rank case
    if devid == 'local-rank':
        devid = str(get_local_rank())

    # Set the device
    self.hip.set_device(int(devid))

    # Get its properties
    self.props = self.hip.device_properties(int(devid))

    # Take the required alignment to be 128 bytes
    self.alignb = 128

    # Take the SoA size to be 32 elements
    self.soasz = 32
    self.csubsz = self.soasz

    # Get the MPI runtime type
    self.mpitype = cfg.get('backend-hip', 'mpi-type', 'standard')
    if self.mpitype not in {'standard', 'hip-aware'}:
        raise ValueError('Invalid HIP backend MPI type')

    from pyfr.backends.hip import (blasext, gimmik, packing, provider,
                                   rocblas, types)

    # Register our data types
    self.base_matrix_cls = types.HIPMatrixBase
    self.const_matrix_cls = types.HIPConstMatrix
    self.matrix_cls = types.HIPMatrix
    self.matrix_slice_cls = types.HIPMatrixSlice
    self.queue_cls = types.HIPQueue
    self.view_cls = types.HIPView
    self.xchg_matrix_cls = types.HIPXchgMatrix
    self.xchg_view_cls = types.HIPXchgView

    # Instantiate the base kernel providers
    kprovs = [provider.HIPPointwiseKernelProvider,
              blasext.HIPBlasExtKernels,
              packing.HIPPackingKernels,
              gimmik.HIPGiMMiKKernels,
              rocblas.HIPRocBLASKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
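The 'local-rank' device-id handled above relies on get_local_rank(), which is not defined in this listing. The sketch below illustrates the idea under the assumption of an mpi4py-style communicator: ranks sharing a host are enumerated and each selects the device matching its position on that host. It is an illustration of the pattern only, not PyFR's actual helper.

# Hedged sketch of a 'local-rank' computation; not PyFR's implementation
from mpi4py import MPI


def local_rank_sketch(comm=MPI.COMM_WORLD):
    # Gather (host, rank) pairs from every rank in the communicator
    host = MPI.Get_processor_name()
    peers = comm.allgather((host, comm.rank))

    # Our local rank is our position amongst the ranks on this host
    return sorted(r for h, r in peers if h == host).index(comm.rank)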
def __init__(self, cfg):
    super(CUDABackend, self).__init__(cfg)

    # Get the desired CUDA device
    devid = cfg.get('backend-cuda', 'device-id', 'round-robin')
    if not re.match(r'(round-robin|local-rank|\d+)$', devid):
        raise ValueError('Invalid device-id')

    # Handle the local-rank case
    if devid == 'local-rank':
        devid = str(get_local_rank())

    # In the non round-robin case set CUDA_DEVICE to be the desired
    # CUDA device number (used by pycuda.autoinit)
    os.environ.pop('CUDA_DEVICE', None)
    if devid != 'round-robin':
        os.environ['CUDA_DEVICE'] = devid

    # Create a CUDA context
    from pycuda.autoinit import context
    import pycuda.driver as cuda

    # Take the required alignment to be 128 bytes
    self.alignb = 128

    # Some CUDA devices share L1 cache and shared memory; on these
    # devices CUDA allows us to specify a preference between L1
    # cache and shared memory.  For the sake of CUBLAS (which
    # benefits greatly from more shared memory but fails to
    # declare its preference) we set the global default to
    # PREFER_SHARED.
    context.set_cache_config(cuda.func_cache.PREFER_SHARED)

    from pyfr.backends.cuda import (blasext, cublas, packing, provider,
                                    types)

    # Register our data types
    self.base_matrix_cls = types.CUDAMatrixBase
    self.const_matrix_cls = types.CUDAConstMatrix
    self.matrix_cls = types.CUDAMatrix
    self.matrix_bank_cls = types.CUDAMatrixBank
    self.matrix_rslice_cls = types.CUDAMatrixRSlice
    self.mpi_matrix_cls = types.CUDAMPIMatrix
    self.mpi_view_cls = types.CUDAMPIView
    self.queue_cls = types.CUDAQueue
    self.view_cls = types.CUDAView

    # Template lookup
    self.lookup = DottedTemplateLookup('pyfr.backends.cuda.kernels')

    # Instantiate the base kernel providers
    kprovs = [provider.CUDAPointwiseKernelProvider,
              blasext.CUDABlasExtKernels,
              packing.CUDAPackingKernels,
              cublas.CUDACUBLASKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
def __init__(self, cfg):
    super().__init__(cfg)

    import pymic as mic

    # Get the device ID to use
    devid = cfg.get('backend-mic', 'device-id', 'local-rank')

    # Handle the local-rank case
    if devid == 'local-rank':
        devid = str(get_local_rank())

    # Get a handle to the desired device
    self.dev = mic.devices[int(devid)]

    # Default stream
    self.sdflt = self.dev.get_default_stream()

    # Take the alignment requirement to be 64 bytes
    self.alignb = 64

    from pyfr.backends.mic import (blasext, cblas, packing, provider,
                                   types)

    # Register our data types
    self.base_matrix_cls = types.MICMatrixBase
    self.const_matrix_cls = types.MICConstMatrix
    self.matrix_cls = types.MICMatrix
    self.matrix_bank_cls = types.MICMatrixBank
    self.matrix_rslice_cls = types.MICMatrixRSlice
    self.queue_cls = types.MICQueue
    self.view_cls = types.MICView
    self.xchg_matrix_cls = types.MICXchgMatrix
    self.xchg_view_cls = types.MICXchgView

    # Template lookup
    self.lookup = DottedTemplateLookup(
        'pyfr.backends.mic.kernels',
        fpdtype=self.fpdtype, alignb=self.alignb
    )

    # Kernel provider classes
    kprovcls = [provider.MICPointwiseKernelProvider,
                blasext.MICBlasExtKernels,
                packing.MICPackingKernels,
                cblas.MICCBLASKernels]
    self._providers = [k(self) for k in kprovcls]

    # Pointwise kernels
    self.pointwise = self._providers[0]
def __init__(self, cfg):
    super(OpenCLBackend, self).__init__(cfg)

    import pyopencl as cl

    # Get the platform/device info from the config file
    platid = cfg.get('backend-opencl', 'platform-id', '0').lower()
    devid = cfg.get('backend-opencl', 'device-id', 'local-rank').lower()
    devtype = cfg.get('backend-opencl', 'device-type', 'all').upper()

    # Handle the local-rank case
    if devid == 'local-rank':
        devid = str(get_local_rank())

    # Map the device type to the corresponding PyOpenCL constant
    devtype = getattr(cl.device_type, devtype)

    # Determine the OpenCL platform to use
    for i, platform in enumerate(cl.get_platforms()):
        if platid == str(i) or platid == platform.name.lower():
            break
    else:
        raise ValueError('No suitable OpenCL platform found')

    # Determine the OpenCL device to use
    for i, device in enumerate(platform.get_devices(devtype)):
        if devid == str(i) or devid == device.name.lower():
            break
    else:
        raise ValueError('No suitable OpenCL device found')

    # Create an OpenCL context on this device
    self.ctx = cl.Context([device])

    # Create a queue for initialisation-type operations
    self.qdflt = cl.CommandQueue(self.ctx)

    # Compute the alignment requirement for the context
    self.alignb = device.mem_base_addr_align // 8

    from pyfr.backends.opencl import (blasext, clblas, packing, provider,
                                      types)

    # Register our data types
    self.base_matrix_cls = types.OpenCLMatrixBase
    self.const_matrix_cls = types.OpenCLConstMatrix
    self.matrix_cls = types.OpenCLMatrix
    self.matrix_bank_cls = types.OpenCLMatrixBank
    self.matrix_rslice_cls = types.OpenCLMatrixRSlice
    self.mpi_matrix_cls = types.OpenCLMPIMatrix
    self.mpi_view_cls = types.OpenCLMPIView
    self.queue_cls = types.OpenCLQueue
    self.view_cls = types.OpenCLView

    # Template lookup
    self.lookup = DottedTemplateLookup('pyfr.backends.opencl.kernels')

    # Instantiate the base kernel providers
    kprovs = [provider.OpenCLPointwiseKernelProvider,
              blasext.OpenCLBlasExtKernels,
              packing.OpenCLPackingKernels,
              clblas.OpenCLClBLASKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
def __init__(self, cfg):
    super().__init__(cfg)

    from pyfr.backends.cuda.compiler import NVRTC
    from pyfr.backends.cuda.driver import CUDA, CUDAError

    # Load and wrap CUDA and NVRTC
    self.cuda = CUDA()
    self.nvrtc = NVRTC()

    # Get the desired CUDA device
    devid = cfg.get('backend-cuda', 'device-id', 'round-robin')
    if not re.match(r'(round-robin|local-rank|\d+)$', devid):
        raise ValueError('Invalid device-id')

    # For round-robin try each device until we find one that works
    if devid == 'round-robin':
        for i in range(self.cuda.device_count()):
            try:
                self.cuda.set_device(i)
                break
            except CUDAError:
                pass
        else:
            raise RuntimeError('Unable to create a CUDA context')
    elif devid == 'local-rank':
        self.cuda.set_device(get_local_rank())
    else:
        self.cuda.set_device(int(devid))

    # Take the required alignment to be 128 bytes
    self.alignb = 128

    # Take the SoA size to be 32 elements
    self.soasz = 32

    # Get the MPI runtime type
    self.mpitype = cfg.get('backend-cuda', 'mpi-type', 'standard')
    if self.mpitype not in {'standard', 'cuda-aware'}:
        raise ValueError('Invalid CUDA backend MPI type')

    # Some CUDA devices share L1 cache and shared memory; on these
    # devices CUDA allows us to specify a preference between L1
    # cache and shared memory.  For the sake of CUBLAS (which
    # benefits greatly from more shared memory but fails to
    # declare its preference) we set the global default to
    # PREFER_SHARED.
    self.cuda.set_cache_pref(prefer_shared=True)

    from pyfr.backends.cuda import (blasext, cublas, gimmik, packing,
                                    provider, types)

    # Register our data types
    self.base_matrix_cls = types.CUDAMatrixBase
    self.const_matrix_cls = types.CUDAConstMatrix
    self.matrix_cls = types.CUDAMatrix
    self.matrix_bank_cls = types.CUDAMatrixBank
    self.matrix_slice_cls = types.CUDAMatrixSlice
    self.queue_cls = types.CUDAQueue
    self.view_cls = types.CUDAView
    self.xchg_matrix_cls = types.CUDAXchgMatrix
    self.xchg_view_cls = types.CUDAXchgView

    # Instantiate the base kernel providers
    kprovs = [provider.CUDAPointwiseKernelProvider,
              blasext.CUDABlasExtKernels,
              packing.CUDAPackingKernels,
              gimmik.CUDAGiMMiKKernels,
              cublas.CUDACUBLASKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
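For context, a backend such as the one above is typically constructed from a PyFR-style configuration whose section and key names mirror the cfg.get() calls in __init__. The Inifile helper and get_backend factory used below are assumptions about the surrounding PyFR API, so treat this as a sketch rather than a definitive recipe.

# Hedged usage sketch; Inifile and get_backend are assumed PyFR helpers
from pyfr.backends import get_backend
from pyfr.inifile import Inifile

cfg = Inifile("""
[backend]
precision = double

[backend-cuda]
device-id = local-rank
mpi-type = cuda-aware
""")

# Construct the CUDA backend from the in-memory configuration
backend = get_backend('cuda', cfg)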
def __init__(self, cfg):
    super().__init__(cfg)

    import pyopencl as cl

    # Get the platform/device info from the config file
    platid = cfg.get('backend-opencl', 'platform-id', '0').lower()
    devid = cfg.get('backend-opencl', 'device-id', 'local-rank').lower()
    devtype = cfg.get('backend-opencl', 'device-type', 'all').upper()

    # Handle the local-rank case
    if devid == 'local-rank':
        devid = str(get_local_rank())

    # Map the device type to the corresponding PyOpenCL constant
    devtype = getattr(cl.device_type, devtype)

    # Determine the OpenCL platform to use
    for i, platform in enumerate(cl.get_platforms()):
        if platid == str(i) or platid == platform.name.lower():
            break
    else:
        raise ValueError('No suitable OpenCL platform found')

    # Determine the OpenCL device to use
    for i, device in enumerate(platform.get_devices(devtype)):
        if devid == str(i) or devid == device.name.lower():
            break
    else:
        raise ValueError('No suitable OpenCL device found')

    # Determine if the device supports double precision arithmetic
    if self.fpdtype == np.float64 and not device.double_fp_config:
        raise ValueError('Device does not support double precision')

    # Create an OpenCL context on this device
    self.ctx = cl.Context([device])

    # Create a queue for initialisation-type operations
    self.qdflt = cl.CommandQueue(self.ctx)

    # Compute the alignment requirement for the context
    self.alignb = device.mem_base_addr_align // 8

    # Compute the SoA size
    self.soasz = 2*self.alignb // np.dtype(self.fpdtype).itemsize
    self.csubsz = self.soasz

    from pyfr.backends.opencl import (blasext, clblast, gimmik, packing,
                                      provider, types)

    # Register our data types
    self.base_matrix_cls = types.OpenCLMatrixBase
    self.const_matrix_cls = types.OpenCLConstMatrix
    self.matrix_cls = types.OpenCLMatrix
    self.matrix_bank_cls = types.OpenCLMatrixBank
    self.matrix_slice_cls = types.OpenCLMatrixSlice
    self.queue_cls = types.OpenCLQueue
    self.view_cls = types.OpenCLView
    self.xchg_matrix_cls = types.OpenCLXchgMatrix
    self.xchg_view_cls = types.OpenCLXchgView

    # Instantiate the base kernel providers
    kprovs = [provider.OpenCLPointwiseKernelProvider,
              blasext.OpenCLBlasExtKernels,
              packing.OpenCLPackingKernels,
              gimmik.OpenCLGiMMiKKernels,
              clblast.OpenCLCLBlastKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
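The SoA size computed above is simply twice the base-address alignment expressed in elements of the working precision; the worked example below checks the arithmetic for the common case of a 128-byte alignment.

# Standalone check of the SoA size arithmetic used in the OpenCL backend
import numpy as np


def soa_size(alignb, fpdtype):
    # Two aligned blocks' worth of elements of the given floating type
    return 2*alignb // np.dtype(fpdtype).itemsize


# With 128-byte alignment: 32 doubles or 64 single-precision floats
assert soa_size(128, np.float64) == 32
assert soa_size(128, np.float32) == 64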
def __init__(self, cfg):
    super().__init__(cfg)

    import pyopencl as cl

    # Get the platform/device info from the config file
    platid = cfg.get('backend-opencl', 'platform-id', '0').lower()
    devid = cfg.get('backend-opencl', 'device-id', 'local-rank').lower()
    devtype = cfg.get('backend-opencl', 'device-type', 'all').upper()

    # Handle the local-rank case
    if devid == 'local-rank':
        devid = str(get_local_rank())

    # Map the device type to the corresponding PyOpenCL constant
    devtype = getattr(cl.device_type, devtype)

    # Determine the OpenCL platform to use
    for i, platform in enumerate(cl.get_platforms()):
        if platid == str(i) or platid == platform.name.lower():
            break
    else:
        raise ValueError('No suitable OpenCL platform found')

    # Determine the OpenCL device to use
    for i, device in enumerate(platform.get_devices(devtype)):
        if devid == str(i) or devid == device.name.lower():
            break
    else:
        raise ValueError('No suitable OpenCL device found')

    # Determine if the device supports double precision arithmetic
    if self.fpdtype == np.float64 and not device.double_fp_config:
        raise ValueError('Device does not support double precision')

    # Create an OpenCL context on this device
    self.ctx = cl.Context([device])

    # Create a queue for initialisation-type operations
    self.qdflt = cl.CommandQueue(self.ctx)

    # Compute the alignment requirement for the context
    self.alignb = device.mem_base_addr_align // 8

    # Compute the SoA size
    self.soasz = 2*self.alignb // np.dtype(self.fpdtype).itemsize

    from pyfr.backends.opencl import (blasext, clblas, gimmik, packing,
                                      provider, types)

    # Register our data types
    self.base_matrix_cls = types.OpenCLMatrixBase
    self.const_matrix_cls = types.OpenCLConstMatrix
    self.matrix_cls = types.OpenCLMatrix
    self.matrix_bank_cls = types.OpenCLMatrixBank
    self.matrix_rslice_cls = types.OpenCLMatrixRSlice
    self.queue_cls = types.OpenCLQueue
    self.view_cls = types.OpenCLView
    self.xchg_matrix_cls = types.OpenCLXchgMatrix
    self.xchg_view_cls = types.OpenCLXchgView

    # Instantiate the base kernel providers
    kprovs = [provider.OpenCLPointwiseKernelProvider,
              blasext.OpenCLBlasExtKernels,
              packing.OpenCLPackingKernels,
              gimmik.OpenCLGiMMiKKernels,
              clblas.OpenCLClBLASKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]