Example #1
    def __init__(self, cfg):
        super(CUDABackend, self).__init__(cfg)

        # Get the desired CUDA device
        devid = cfg.get('backend-cuda', 'device-id', 'round-robin')
        if not re.match(r'(round-robin|local-rank|\d+)$', devid):
            raise ValueError('Invalid device-id')

        # Handle the local-rank case
        if devid == 'local-rank':
            devid = str(get_local_rank())

        # In the non round-robin case set CUDA_DEVICE to be the desired
        # CUDA device number (used by pycuda.autoinit)
        os.environ.pop('CUDA_DEVICE', None)
        if devid != 'round-robin':
            os.environ['CUDA_DEVICE'] = devid

        # Create a CUDA context
        from pycuda.autoinit import context
        import pycuda.driver as cuda

        # Take the required alignment to be 128 bytes
        self.alignb = 128

        # Some CUDA devices share L1 cache and shared memory; on these
        # devices CUDA allows us to specify a preference between L1
        # cache and shared memory.  For the sake of CUBLAS (which
        # benefits greatly from more shared memory but fails to
        # declare its preference) we set the global default to
        # PREFER_SHARED.
        context.set_cache_config(cuda.func_cache.PREFER_SHARED)

        from pyfr.backends.cuda import (blasext, cublas, packing, provider,
                                        types)

        # Register our data types
        self.base_matrix_cls = types.CUDAMatrixBase
        self.const_matrix_cls = types.CUDAConstMatrix
        self.matrix_cls = types.CUDAMatrix
        self.matrix_bank_cls = types.CUDAMatrixBank
        self.matrix_rslice_cls = types.CUDAMatrixRSlice
        self.mpi_matrix_cls = types.CUDAMPIMatrix
        self.mpi_view_cls = types.CUDAMPIView
        self.queue_cls = types.CUDAQueue
        self.view_cls = types.CUDAView

        # Template lookup
        self.lookup = DottedTemplateLookup('pyfr.backends.cuda.kernels')

        # Instantiate the base kernel providers
        kprovs = [
            provider.CUDAPointwiseKernelProvider, blasext.CUDABlasExtKernels,
            packing.CUDAPackingKernels, cublas.CUDACUBLASKernels
        ]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
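
The device-id value read at the top of this example comes from the [backend-cuda] section of the solver's INI-style configuration file. As a rough illustration of the three accepted forms (the exact file layout depends on the project and version), a configuration fragment might look like:

    [backend-cuda]
    ; one of: round-robin, local-rank, or an explicit CUDA device number
    device-id = local-rank

With local-rank each MPI rank on a node is pinned to its own device, while round-robin leaves CUDA_DEVICE unset and lets pycuda.autoinit pick a device.
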
Example #2
    def __init__(self, cfg):
        super(CUDABackend, self).__init__(cfg)

        # Create a CUDA context
        from pycuda.autoinit import context as cuda_ctx

        # Some CUDA devices share L1 cache and shared memory; on these
        # devices CUDA allows us to specify a preference between L1
        # cache and shared memory.  For the sake of CUBLAS (which
        # benefits greatly from more shared memory but fails to
        # declare its preference) we set the global default to
        # PREFER_SHARED.
        from pycuda.driver import func_cache
        cuda_ctx.set_cache_config(func_cache.PREFER_SHARED)

        # For introspection to work it must always be possible to
        # import the CUDABackend (even if CUDA is unavailable on the
        # system).  As many of our types/providers depend on the CUDA
        # runtime we import these here, locally, at the time of
        # instantiation.
        from pyfr.backends.cuda import (blasext, blockmats, cublas, packing,
                                        provider, types)

        # Register our data types
        self.block_diag_matrix_cls = types.CUDABlockDiagMatrix
        self.const_matrix_cls = types.CUDAConstMatrix
        self.matrix_cls = types.CUDAMatrix
        self.matrix_bank_cls = types.CUDAMatrixBank
        self.matrix_rslice_cls = types.CUDAMatrixRSlice
        self.mpi_matrix_cls = types.CUDAMPIMatrix
        self.mpi_view_cls = types.CUDAMPIView
        self.queue_cls = types.CUDAQueue
        self.view_cls = types.CUDAView

        # Template lookup
        self.lookup = DottedTemplateLookup('pyfr.backends.cuda.kernels')

        # Instantiate the base kernel providers
        kprovs = [provider.CUDAPointwiseKernelProvider,
                  blockmats.BlockDiagMatrixKernels,
                  blasext.CUDABlasExtKernels,
                  packing.CUDAPackingKernels,
                  cublas.CUDACublasKernels]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
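
The comment about introspection in the example above describes a deferred-import pattern: everything that needs the CUDA runtime is imported inside __init__ rather than at module level, so the backend module can still be imported (for instance, when enumerating the available backends) on a machine without CUDA. A minimal sketch of the idea, with a hypothetical class name:

    class LazyCUDABackend:
        def __init__(self):
            # pycuda.autoinit initialises CUDA and creates a context as a
            # side effect of being imported.  Doing the import here rather
            # than at module level defers any ImportError (or driver error)
            # until the backend is actually instantiated, so the module
            # containing this class can always be imported for discovery.
            import pycuda.autoinit
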
Example #3
    def __init__(self, cfg):
        super(CUDABackend, self).__init__(cfg)

        # Create a CUDA context
        from pycuda.autoinit import context

        # Take the required alignment to be 128 bytes
        self.alignb = 128

        # Some CUDA devices share L1 cache and shared memory; on these
        # devices CUDA allows us to specify a preference between L1
        # cache and shared memory.  For the sake of CUBLAS (which
        # benefits greatly from more shared memory but fails to
        # declare its preference) we set the global default to
        # PREFER_SHARED.
        from pycuda.driver import func_cache
        context.set_cache_config(func_cache.PREFER_SHARED)

        from pyfr.backends.cuda import (blasext, cublas, packing, provider,
                                        types)

        # Register our data types
        self.const_matrix_cls = types.CUDAConstMatrix
        self.matrix_cls = types.CUDAMatrix
        self.matrix_bank_cls = types.CUDAMatrixBank
        self.matrix_rslice_cls = types.CUDAMatrixRSlice
        self.mpi_matrix_cls = types.CUDAMPIMatrix
        self.mpi_view_cls = types.CUDAMPIView
        self.queue_cls = types.CUDAQueue
        self.view_cls = types.CUDAView

        # Template lookup
        self.lookup = DottedTemplateLookup('pyfr.backends.cuda.kernels')

        # Instantiate the base kernel providers
        kprovs = [provider.CUDAPointwiseKernelProvider,
                  blasext.CUDABlasExtKernels,
                  packing.CUDAPackingKernels,
                  cublas.CUDACUBLASKernels]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
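
Every example sets the context-wide cache preference for the benefit of CUBLAS. Both preferences named in the comments are ordinary pycuda calls; a small self-contained sketch (it assumes a CUDA-capable machine with pycuda installed):

    import pycuda.autoinit
    import pycuda.driver as cuda

    # Context-wide default, as in the examples above: on devices where L1
    # cache and shared memory share the same physical storage, kernels
    # that do not state their own preference get the larger shared-memory
    # partition.
    pycuda.autoinit.context.set_cache_config(cuda.func_cache.PREFER_SHARED)

    # The opposite preference is available for codes that are L1-bound.
    # pycuda.autoinit.context.set_cache_config(cuda.func_cache.PREFER_L1)
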
Example #4
    def __init__(self, cfg):
        super().__init__(cfg)

        # Get the desired CUDA device
        devid = cfg.get('backend-cuda', 'device-id', 'local-rank')
        if not re.match(r'(round-robin|local-rank|\d+)$', devid):
            raise ValueError('Invalid device-id')

        # In the non round-robin case set CUDA_DEVICE to be the desired
        # CUDA device number (used by pycuda.autoinit)
        os.environ.pop('CUDA_DEVICE', None)

        # Handle the local-rank case
        if devid == 'local-rank':
            devord = str(get_local_rank())
            os.environ['CUDA_DEVICE'] = devord
        # Handle an explicit device number
        elif devid != 'round-robin':
            os.environ['CUDA_DEVICE'] = devid

        # Create a CUDA context
        from pycuda.autoinit import context
        import pycuda.driver as cuda

        # The commented-out block below does not work with multiple GPUs
        # per node
        """
        if devid == 'local-rank':
            import pycuda.driver as cuda            
            cuda.init()
            cudadevice = cuda.Device(int(devord))
            context = cudadevice.make_context()
            #import atexit
            #atexit.register(context.pop)
        elif devid == 'round-robin':
            from pycuda.autoinit import context
            import pycuda.driver as cuda
        """

        # Take the required alignment to be 128 bytes
        self.alignb = 128

        # Take the SoA size to be 32 elements
        self.soasz = 32

        # Get the MPI runtime type
        self.mpitype = cfg.get('backend-cuda', 'mpi-type', 'standard')
        if self.mpitype not in {'standard', 'cuda-aware'}:
            raise ValueError('Invalid CUDA backend MPI type')

        # Some CUDA devices share L1 cache and shared memory; on these
        # devices CUDA allows us to specify a preference between L1
        # cache and shared memory.  For the sake of CUBLAS (which
        # benefits greatly from more shared memory but fails to
        # declare its preference) we set the global default to
        # PREFER_SHARED.
        context.set_cache_config(cuda.func_cache.PREFER_SHARED)
        #self.context = context

        from frfs.backends.cuda import (blasext, cublas, gimmik, packing,
                                        provider, types)

        # Register our data types
        self.base_matrix_cls = types.CUDAMatrixBase
        self.const_matrix_cls = types.CUDAConstMatrix
        self.matrix_cls = types.CUDAMatrix
        self.matrix_bank_cls = types.CUDAMatrixBank
        self.matrix_rslice_cls = types.CUDAMatrixRSlice
        self.queue_cls = types.CUDAQueue
        self.view_cls = types.CUDAView
        self.xchg_matrix_cls = types.CUDAXchgMatrix
        self.xchg_view_cls = types.CUDAXchgView

        # Instantiate the base kernel providers
        kprovs = [provider.CUDAPointwiseKernelProvider,
                  blasext.CUDABlasExtKernels,
                  packing.CUDAPackingKernels,
                  gimmik.CUDAGiMMiKKernels,
                  cublas.CUDACUBLASKernels]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
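
The local-rank handling in Examples #1 and #4 relies on a get_local_rank() helper imported from elsewhere in each project. A hedged sketch of what such a helper typically does (the real implementations may differ) is to split the world communicator into per-node communicators with mpi4py:

    from mpi4py import MPI

    def get_local_rank():
        # Rank of this process among the processes running on the same
        # node, obtained by splitting MPI_COMM_WORLD into shared-memory
        # (i.e. per-node) sub-communicators.
        local_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED)
        return local_comm.rank

With one MPI rank per GPU, setting CUDA_DEVICE to this value maps rank n on a node to CUDA device n, which is exactly what the local-rank branch above achieves.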