Example #1
    def __init__(self, cfg):
        super().__init__(cfg)

        from pyfr.backends.hip.compiler import HIPRTC
        from pyfr.backends.hip.driver import HIP

        # Load and wrap HIP and HIPRTC
        self.hip = HIP()
        self.hiprtc = HIPRTC()

        # Get the desired HIP device
        devid = cfg.get('backend-hip', 'device-id', 'local-rank')
        if not re.match(r'(local-rank|\d+)$', devid):
            raise ValueError('Invalid device-id')

        # Handle the local-rank case
        if devid == 'local-rank':
            devid = str(get_local_rank())

        # Set the device
        self.hip.set_device(int(devid))

        # Get its properties
        self.props = self.hip.device_properties(int(devid))

        # Take the required alignment to be 128 bytes
        self.alignb = 128

        # Take the SoA size to be 32 elements
        self.soasz = 32
        self.csubsz = self.soasz

        # Get the MPI runtime type
        self.mpitype = cfg.get('backend-hip', 'mpi-type', 'standard')
        if self.mpitype not in {'standard', 'hip-aware'}:
            raise ValueError('Invalid HIP backend MPI type')

        from pyfr.backends.hip import (blasext, gimmik, packing, provider,
                                       rocblas, types)

        # Register our data types
        self.base_matrix_cls = types.HIPMatrixBase
        self.const_matrix_cls = types.HIPConstMatrix
        self.matrix_cls = types.HIPMatrix
        self.matrix_slice_cls = types.HIPMatrixSlice
        self.queue_cls = types.HIPQueue
        self.view_cls = types.HIPView
        self.xchg_matrix_cls = types.HIPXchgMatrix
        self.xchg_view_cls = types.HIPXchgView

        # Instantiate the base kernel providers
        kprovs = [
            provider.HIPPointwiseKernelProvider, blasext.HIPBlasExtKernels,
            packing.HIPPackingKernels, gimmik.HIPGiMMiKKernels,
            rocblas.HIPRocBLASKernels
        ]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
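
The cfg.get calls above read an INI-style configuration. A minimal sketch of the matching section (the section and option names are taken from the code; the values are only illustrative):

    [backend-hip]
    ; 'local-rank' or an explicit device number
    device-id = local-rank
    ; 'standard' or 'hip-aware'
    mpi-type = standard
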
Example #2
    def __init__(self, cfg):
        super().__init__(cfg)

        # Get the desired CUDA device
        devid = cfg.get('backend-cuda', 'device-id', 'round-robin')
        if not re.match(r'(round-robin|local-rank|\d+)$', devid):
            raise ValueError('Invalid device-id')

        # Handle the local-rank case
        if devid == 'local-rank':
            devid = str(get_local_rank())

        # In the non-round-robin case, set CUDA_DEVICE to the desired
        # CUDA device number (used by pycuda.autoinit)
        os.environ.pop('CUDA_DEVICE', None)
        if devid != 'round-robin':
            os.environ['CUDA_DEVICE'] = devid

        # Create a CUDA context
        from pycuda.autoinit import context
        import pycuda.driver as cuda

        # Take the required alignment to be 128 bytes
        self.alignb = 128

        # Some CUDA devices share L1 cache and shared memory; on these
        # devices CUDA allows us to specify a preference between L1
        # cache and shared memory.  For the sake of CUBLAS (which
        # benefits greatly from more shared memory but fails to
        # declare its preference) we set the global default to
        # PREFER_SHARED.
        context.set_cache_config(cuda.func_cache.PREFER_SHARED)

        from pyfr.backends.cuda import (blasext, cublas, packing, provider,
                                        types)

        # Register our data types
        self.base_matrix_cls = types.CUDAMatrixBase
        self.const_matrix_cls = types.CUDAConstMatrix
        self.matrix_cls = types.CUDAMatrix
        self.matrix_bank_cls = types.CUDAMatrixBank
        self.matrix_rslice_cls = types.CUDAMatrixRSlice
        self.mpi_matrix_cls = types.CUDAMPIMatrix
        self.mpi_view_cls = types.CUDAMPIView
        self.queue_cls = types.CUDAQueue
        self.view_cls = types.CUDAView

        # Template lookup
        self.lookup = DottedTemplateLookup('pyfr.backends.cuda.kernels')

        # Instantiate the base kernel providers
        kprovs = [
            provider.CUDAPointwiseKernelProvider, blasext.CUDABlasExtKernels,
            packing.CUDAPackingKernels, cublas.CUDACUBLASKernels
        ]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
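
The device-id check relies on re.match anchoring at the start of the string, with the trailing $ in the pattern anchoring the end, so only the two keywords or a bare integer are accepted. A standalone check:

    import re

    pat = r'(round-robin|local-rank|\d+)$'
    for s in ('round-robin', 'local-rank', '3', '3a', 'gpu0'):
        print(s, bool(re.match(pat, s)))
    # round-robin True, local-rank True, 3 True, 3a False, gpu0 False
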
Example #3
    def __init__(self, cfg):
        super().__init__(cfg)

        import pymic as mic

        # Get the device ID to use
        devid = cfg.get('backend-mic', 'device-id', 'local-rank')

        # Handle the local-rank case
        if devid == 'local-rank':
            devid = str(get_local_rank())

        # Get a handle to the desired device
        self.dev = mic.devices[int(devid)]

        # Default stream
        self.sdflt = self.dev.get_default_stream()

        # Take the alignment requirement to be 64 bytes
        self.alignb = 64

        from pyfr.backends.mic import (blasext, cblas, packing, provider,
                                       types)

        # Register our data types
        self.base_matrix_cls = types.MICMatrixBase
        self.const_matrix_cls = types.MICConstMatrix
        self.matrix_cls = types.MICMatrix
        self.matrix_bank_cls = types.MICMatrixBank
        self.matrix_rslice_cls = types.MICMatrixRSlice
        self.queue_cls = types.MICQueue
        self.view_cls = types.MICView
        self.xchg_matrix_cls = types.MICXchgMatrix
        self.xchg_view_cls = types.MICXchgView

        # Template lookup
        self.lookup = DottedTemplateLookup(
            'pyfr.backends.mic.kernels',
            fpdtype=self.fpdtype, alignb=self.alignb
        )

        # Kernel provider classes
        kprovcls = [provider.MICPointwiseKernelProvider,
                    blasext.MICBlasExtKernels,
                    packing.MICPackingKernels,
                    cblas.MICCBLASKernels]
        self._providers = [k(self) for k in kprovcls]

        # Pointwise kernels
        self.pointwise = self._providers[0]
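
Here get_local_rank resolves 'local-rank' to a rank within the node. One common way to compute such a value (an assumption for illustration, not necessarily PyFR's implementation) is a shared-memory split of the world communicator:

    from mpi4py import MPI

    # Split COMM_WORLD into one communicator per shared-memory node
    # and take our rank within it; this gives a node-local rank
    local_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED)
    local_rank = local_comm.rank
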
Example #4
    def __init__(self, cfg):
        super().__init__(cfg)

        import pyopencl as cl

        # Get the platform/device info from the config file
        platid = cfg.get('backend-opencl', 'platform-id', '0').lower()
        devid = cfg.get('backend-opencl', 'device-id', 'local-rank').lower()
        devtype = cfg.get('backend-opencl', 'device-type', 'all').upper()

        # Handle the local-rank case
        if devid == 'local-rank':
            devid = str(get_local_rank())

        # Map the device type to the corresponding PyOpenCL constant
        devtype = getattr(cl.device_type, devtype)

        # Determine the OpenCL platform to use
        for i, platform in enumerate(cl.get_platforms()):
            if platid == str(i) or platid == platform.name.lower():
                break
        else:
            raise ValueError('No suitable OpenCL platform found')

        # Determine the OpenCL device to use
        for i, device in enumerate(platform.get_devices(devtype)):
            if devid == str(i) or devid == device.name.lower():
                break
        else:
            raise ValueError('No suitable OpenCL device found')

        # Create an OpenCL context on this device
        self.ctx = cl.Context([device])

        # Create a queue for initialisation-type operations
        self.qdflt = cl.CommandQueue(self.ctx)

        # Compute the alignment requirement for the context
        self.alignb = device.mem_base_addr_align // 8

        from pyfr.backends.opencl import (blasext, clblas, packing, provider,
                                          types)

        # Register our data types
        self.base_matrix_cls = types.OpenCLMatrixBase
        self.const_matrix_cls = types.OpenCLConstMatrix
        self.matrix_cls = types.OpenCLMatrix
        self.matrix_bank_cls = types.OpenCLMatrixBank
        self.matrix_rslice_cls = types.OpenCLMatrixRSlice
        self.mpi_matrix_cls = types.OpenCLMPIMatrix
        self.mpi_view_cls = types.OpenCLMPIView
        self.queue_cls = types.OpenCLQueue
        self.view_cls = types.OpenCLView

        # Template lookup
        self.lookup = DottedTemplateLookup('pyfr.backends.opencl.kernels')

        # Instantiate the base kernel providers
        kprovs = [
            provider.OpenCLPointwiseKernelProvider,
            blasext.OpenCLBlasExtKernels, packing.OpenCLPackingKernels,
            clblas.OpenCLClBLASKernels
        ]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
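
Both matching loops above use Python's for/else idiom: the else clause runs only if the loop finished without hitting break. The same idiom in isolation (a minimal sketch with a stand-in Item type):

    from collections import namedtuple

    Item = namedtuple('Item', 'name')

    def select(items, key):
        # Accept a positional index ('0', '1', ...) or a
        # case-insensitive name; no break means no match
        for i, item in enumerate(items):
            if key == str(i) or key == item.name.lower():
                break
        else:
            raise ValueError('No suitable item found')
        return item

    print(select([Item('Intel'), Item('NVIDIA')], 'nvidia').name)
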
Example #5
    def __init__(self, cfg):
        super().__init__(cfg)

        from pyfr.backends.cuda.compiler import NVRTC
        from pyfr.backends.cuda.driver import CUDA, CUDAError

        # Load and wrap CUDA and NVRTC
        self.cuda = CUDA()
        self.nvrtc = NVRTC()

        # Get the desired CUDA device
        devid = cfg.get('backend-cuda', 'device-id', 'round-robin')
        if not re.match(r'(round-robin|local-rank|\d+)$', devid):
            raise ValueError('Invalid device-id')

        # For round-robin try each device until we find one that works
        if devid == 'round-robin':
            for i in range(self.cuda.device_count()):
                try:
                    self.cuda.set_device(i)
                    break
                except CUDAError:
                    pass
            else:
                raise RuntimeError('Unable to create a CUDA context')
        elif devid == 'local-rank':
            self.cuda.set_device(get_local_rank())
        else:
            self.cuda.set_device(int(devid))

        # Take the required alignment to be 128 bytes
        self.alignb = 128

        # Take the SoA size to be 32 elements
        self.soasz = 32

        # Get the MPI runtime type
        self.mpitype = cfg.get('backend-cuda', 'mpi-type', 'standard')
        if self.mpitype not in {'standard', 'cuda-aware'}:
            raise ValueError('Invalid CUDA backend MPI type')

        # Some CUDA devices share L1 cache and shared memory; on these
        # devices CUDA allows us to specify a preference between L1
        # cache and shared memory.  For the sake of CUBLAS (which
        # benefits greatly from more shared memory but fails to
        # declare its preference) we set the global default to
        # PREFER_SHARED.
        self.cuda.set_cache_pref(prefer_shared=True)

        from pyfr.backends.cuda import (blasext, cublas, gimmik, packing,
                                        provider, types)

        # Register our data types
        self.base_matrix_cls = types.CUDAMatrixBase
        self.const_matrix_cls = types.CUDAConstMatrix
        self.matrix_cls = types.CUDAMatrix
        self.matrix_bank_cls = types.CUDAMatrixBank
        self.matrix_slice_cls = types.CUDAMatrixSlice
        self.queue_cls = types.CUDAQueue
        self.view_cls = types.CUDAView
        self.xchg_matrix_cls = types.CUDAXchgMatrix
        self.xchg_view_cls = types.CUDAXchgView

        # Instantiate the base kernel providers
        kprovs = [
            provider.CUDAPointwiseKernelProvider, blasext.CUDABlasExtKernels,
            packing.CUDAPackingKernels, gimmik.CUDAGiMMiKKernels,
            cublas.CUDACUBLASKernels
        ]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
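
The round-robin branch combines for/else with try/except: each device is attempted in turn, and the else clause fires only if none could be initialised. The same pattern in isolation (a sketch, with init standing in for cuda.set_device):

    def first_usable(n, init):
        # Try devices 0..n-1 in order; keep the first that initialises
        for i in range(n):
            try:
                init(i)
            except Exception:
                continue
            break
        else:
            raise RuntimeError('Unable to initialise any device')
        return i
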
Example #6
    def __init__(self, cfg):
        super().__init__(cfg)

        import pyopencl as cl

        # Get the platform/device info from the config file
        platid = cfg.get('backend-opencl', 'platform-id', '0').lower()
        devid = cfg.get('backend-opencl', 'device-id', 'local-rank').lower()
        devtype = cfg.get('backend-opencl', 'device-type', 'all').upper()

        # Handle the local-rank case
        if devid == 'local-rank':
            devid = str(get_local_rank())

        # Map the device type to the corresponding PyOpenCL constant
        devtype = getattr(cl.device_type, devtype)

        # Determine the OpenCL platform to use
        for i, platform in enumerate(cl.get_platforms()):
            if platid == str(i) or platid == platform.name.lower():
                break
        else:
            raise ValueError('No suitable OpenCL platform found')

        # Determine the OpenCL device to use
        for i, device in enumerate(platform.get_devices(devtype)):
            if devid == str(i) or devid == device.name.lower():
                break
        else:
            raise ValueError('No suitable OpenCL device found')

        # Determine if the device supports double precision arithmetic
        if self.fpdtype == np.float64 and not device.double_fp_config:
            raise ValueError('Device does not support double precision')

        # Create an OpenCL context on this device
        self.ctx = cl.Context([device])

        # Create a queue for initialisation-type operations
        self.qdflt = cl.CommandQueue(self.ctx)

        # Compute the alignment requirement for the context
        self.alignb = device.mem_base_addr_align // 8

        # Compute the SoA size
        self.soasz = 2 * self.alignb // np.dtype(self.fpdtype).itemsize
        self.csubsz = self.soasz

        from pyfr.backends.opencl import (blasext, clblast, gimmik, packing,
                                          provider, types)

        # Register our data types
        self.base_matrix_cls = types.OpenCLMatrixBase
        self.const_matrix_cls = types.OpenCLConstMatrix
        self.matrix_cls = types.OpenCLMatrix
        self.matrix_bank_cls = types.OpenCLMatrixBank
        self.matrix_slice_cls = types.OpenCLMatrixSlice
        self.queue_cls = types.OpenCLQueue
        self.view_cls = types.OpenCLView
        self.xchg_matrix_cls = types.OpenCLXchgMatrix
        self.xchg_view_cls = types.OpenCLXchgView

        # Instantiate the base kernel providers
        kprovs = [
            provider.OpenCLPointwiseKernelProvider,
            blasext.OpenCLBlasExtKernels, packing.OpenCLPackingKernels,
            gimmik.OpenCLGiMMiKKernels, clblast.OpenCLCLBlastKernels
        ]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
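
The SoA size follows directly from the device's reported base address alignment. For example, with mem_base_addr_align reported as 1024 bits and double precision (assumed values for a worked example):

    import numpy as np

    mem_base_addr_align = 1024           # bits, as reported by the device
    alignb = mem_base_addr_align // 8    # 128 bytes
    soasz = 2 * alignb // np.dtype(np.float64).itemsize
    print(alignb, soasz)                 # 128 32
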
Example #7
    def __init__(self, cfg):
        super().__init__(cfg)

        import pyopencl as cl

        # Get the platform/device info from the config file
        platid = cfg.get('backend-opencl', 'platform-id', '0').lower()
        devid = cfg.get('backend-opencl', 'device-id', 'local-rank').lower()
        devtype = cfg.get('backend-opencl', 'device-type', 'all').upper()

        # Handle the local-rank case
        if devid == 'local-rank':
            devid = str(get_local_rank())

        # Map the device type to the corresponding PyOpenCL constant
        devtype = getattr(cl.device_type, devtype)

        # Determine the OpenCL platform to use
        for i, platform in enumerate(cl.get_platforms()):
            if platid == str(i) or platid == platform.name.lower():
                break
        else:
            raise ValueError('No suitable OpenCL platform found')

        # Determine the OpenCL device to use
        for i, device in enumerate(platform.get_devices(devtype)):
            if devid == str(i) or devid == device.name.lower():
                break
        else:
            raise ValueError('No suitable OpenCL device found')

        # Determine if the device supports double precision arithmetic
        if self.fpdtype == np.float64 and not device.double_fp_config:
            raise ValueError('Device does not support double precision')

        # Create an OpenCL context on this device
        self.ctx = cl.Context([device])

        # Create a queue for initialisation-type operations
        self.qdflt = cl.CommandQueue(self.ctx)

        # Compute the alignment requirement for the context
        self.alignb = device.mem_base_addr_align // 8

        # Compute the SoA size
        self.soasz = 2 * self.alignb // np.dtype(self.fpdtype).itemsize

        from pyfr.backends.opencl import (blasext, clblas, gimmik, packing,
                                          provider, types)

        # Register our data types
        self.base_matrix_cls = types.OpenCLMatrixBase
        self.const_matrix_cls = types.OpenCLConstMatrix
        self.matrix_cls = types.OpenCLMatrix
        self.matrix_bank_cls = types.OpenCLMatrixBank
        self.matrix_rslice_cls = types.OpenCLMatrixRSlice
        self.queue_cls = types.OpenCLQueue
        self.view_cls = types.OpenCLView
        self.xchg_matrix_cls = types.OpenCLXchgMatrix
        self.xchg_view_cls = types.OpenCLXchgView

        # Instantiate the base kernel providers
        kprovs = [provider.OpenCLPointwiseKernelProvider,
                  blasext.OpenCLBlasExtKernels,
                  packing.OpenCLPackingKernels,
                  gimmik.OpenCLGiMMiKKernels,
                  clblas.OpenCLClBLASKernels]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
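
Valid platform-id and device-id values for the matching loops above can be listed directly with pyopencl, together with the double-precision support that the fpdtype check relies on (a small standalone sketch):

    import pyopencl as cl

    for i, platform in enumerate(cl.get_platforms()):
        print(i, platform.name)
        for j, device in enumerate(platform.get_devices()):
            fp64 = 'fp64' if device.double_fp_config else 'no fp64'
            print(' ', j, device.name, fp64)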