class CUDASparseContext(object):
    """GPU state for forward-Euler steps with sparse matrices.

    Bundles the cuSPARSE and cuBLAS handles together with the device
    copies of the interaction matrix, the decay matrix and the state
    vector, so that repeated calls to :func:`do_step` do not re-upload
    anything.

    Args:
      int_m: interaction matrix; must expose ``shape``, ``nnz``, ``data``,
        ``indptr`` and ``indices`` (presumably a ``scipy.sparse`` CSR
        matrix -- confirm against the caller)
      dec_m: decay matrix with the same attributes as ``int_m``
      device_id (int): index of the CUDA device to select

    Raises:
      Exception: if ``config['FP_precision']`` is neither 32 nor 64, or if
        the ``accelerate`` CUDA modules can not be imported
    """

    def __init__(self, int_m, dec_m, device_id=0):
        precision = config['FP_precision']
        if precision == 32:
            self.fl_pr = np.float32
        elif precision == 64:
            self.fl_pr = np.float64
        else:
            raise Exception(
                "CUDASparseContext(): Unknown precision specified.")

        #======================================================================
        # Setup GPU stuff and upload data to it
        #======================================================================
        try:
            from accelerate.cuda.blas import Blas
            import accelerate.cuda.sparse as cusparse
            from accelerate.cuda import cuda
        except ImportError:
            raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " +
                            "installed.\nCan not use GPU.")

        # Honour the requested device; this used to be hard-coded to 0,
        # which silently ignored the ``device_id`` argument.
        cuda.select_device(device_id)
        self.cuda = cuda
        self.cusp = cusparse.Sparse()
        self.cubl = Blas()
        self.set_matrices(int_m, dec_m)

    def set_matrices(self, int_m, dec_m):
        """Upload both matrices to the device and allocate the scratch vector.

        Stores the CSR components (values, row pointers, column indices)
        and the number of non-zeros of each matrix, creates the matrix
        descriptor with zero-based indexing and allocates ``cu_delta_phi``
        with ``int_m.shape[0]`` elements.

        Args:
          int_m: interaction matrix (see class docstring)
          dec_m: decay matrix (see class docstring)
        """
        import accelerate.cuda.sparse as cusparse

        to_device = self.cuda.to_device

        self.m, self.n = int_m.shape

        self.int_m_nnz = int_m.nnz
        self.int_m_csrValA = to_device(int_m.data.astype(self.fl_pr))
        self.int_m_csrRowPtrA = to_device(int_m.indptr)
        self.int_m_csrColIndA = to_device(int_m.indices)

        self.dec_m_nnz = dec_m.nnz
        self.dec_m_csrValA = to_device(dec_m.data.astype(self.fl_pr))
        self.dec_m_csrRowPtrA = to_device(dec_m.indptr)
        self.dec_m_csrColIndA = to_device(dec_m.indices)

        self.descr = self.cusp.matdescr()
        self.descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO

        # Scratch vector for the derivative. It needs no initialisation
        # because the first csrmv in do_step() is called with beta = 0.
        self.cu_delta_phi = self.cuda.device_array((self.m,),
                                                   dtype=self.fl_pr)

    def set_phi(self, phi):
        """Upload the state vector ``phi`` to the device.

        The vector is converted to the configured floating point precision
        first. Must be called before :func:`do_step` or :func:`get_phi`,
        since those read ``cu_curr_phi``.
        """
        self.cu_curr_phi = self.cuda.to_device(phi.astype(self.fl_pr))

    def get_phi(self):
        """Return a host copy of the current device state vector."""
        return self.cu_curr_phi.copy_to_host()

    def do_step(self, rho_inv, dX):
        """Advance the device state vector by one forward-Euler step.

        Performs, entirely on the device::

            delta_phi = int_m . phi
            delta_phi = rho_inv * (dec_m . phi) + delta_phi
            phi       = dX * delta_phi + phi

        Args:
          rho_inv (float): scale factor applied to the decay term
          dX (float): step size
        """
        one = self.fl_pr(1.0)
        zero = self.fl_pr(0.0)

        # delta_phi = int_m . phi  (beta = 0 discards the old delta_phi)
        self.cusp.csrmv(trans='N', m=self.m, n=self.n,
                        nnz=self.int_m_nnz, descr=self.descr,
                        alpha=one,
                        csrVal=self.int_m_csrValA,
                        csrRowPtr=self.int_m_csrRowPtrA,
                        csrColInd=self.int_m_csrColIndA,
                        x=self.cu_curr_phi,
                        beta=zero, y=self.cu_delta_phi)

        # delta_phi += rho_inv * dec_m . phi  (beta = 1 accumulates)
        self.cusp.csrmv(trans='N', m=self.m, n=self.n,
                        nnz=self.dec_m_nnz, descr=self.descr,
                        alpha=self.fl_pr(rho_inv),
                        csrVal=self.dec_m_csrValA,
                        csrRowPtr=self.dec_m_csrRowPtrA,
                        csrColInd=self.dec_m_csrColIndA,
                        x=self.cu_curr_phi,
                        beta=one, y=self.cu_delta_phi)

        # phi += dX * delta_phi
        self.cubl.axpy(alpha=self.fl_pr(dX),
                       x=self.cu_delta_phi, y=self.cu_curr_phi)
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs,
                    mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None,
                    prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_
    implementation of forward-euler integration.

    Function requires a working ``accelerate.cuda`` installation. It is
    typically slower compared to :func:`kern_MKL_sparse` but it depends on
    your hardware.

    Each step computes, on the device::

        delta_phi = int_m . phi + rho_inv[step] * (dec_m . phi)
        phi       = phi + dX[step] * delta_phi

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes
        :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values
        :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix`; it is
        converted with ``astype`` and uploaded as is, so presumably it has
        to be dense here -- TODO confirm
      dec_m (numpy.array): decay matrix :eq:`dec_matrix`, handled like
        ``int_m``
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: not used by this solver
      mu_egrid: not used by this solver
      mu_dEdX: not used by this solver
      mu_lidx_nsp: not used by this solver
      prog_bar (object,optional): handle to :class:`ProgressBar` object;
        its ``update(step)`` is called once per step

    Returns:
      tuple: ``(phi, [])`` where ``phi`` is the state vector
      :math:`\\Phi(X_{nsteps})` after integration as a host array and the
      second element is always an empty list

    Raises:
      Exception: if ``config['FP_precision']`` is neither 32 nor 64, or if
        the ``accelerate`` CUDA modules can not be imported
      NotImplementedError: if ``config['enable_muon_energy_loss']`` is set
    """
    from time import time

    precision = config['FP_precision']
    if precision == 32:
        fl_pr = np.float32
    elif precision == 64:
        fl_pr = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    if config['enable_muon_energy_loss']:
        raise NotImplementedError(
            'kern_CUDA_dense(): ' +
            'Energy loss not imlemented for this solver.')

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from accelerate.cuda.blas import Blas
        from accelerate.cuda import cuda
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")

    cubl = Blas()
    m, n = int_m.shape

    # NOTE(review): the uploads are issued on a dedicated stream while the
    # BLAS handle is created without one -- confirm that ordering between
    # the two is guaranteed on the targeted CUDA setup.
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(fl_pr), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(fl_pr), stream)
    cu_curr_phi = cuda.to_device(phi.astype(fl_pr), stream)
    # Scratch vector; left uninitialised because the first gemv of every
    # step overwrites it (beta = 0).
    cu_delta_phi = cuda.device_array(phi.shape, dtype=fl_pr)

    one = fl_pr(1.0)
    zero = fl_pr(0.0)

    start = time()

    for step in range(nsteps):
        if prog_bar:
            prog_bar.update(step)

        # delta_phi = int_m . phi
        cubl.gemv(trans='N', m=m, n=n, alpha=one, A=cu_int_m,
                  x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # delta_phi += rho_inv[step] * dec_m . phi
        cubl.gemv(trans='N', m=m, n=n, alpha=fl_pr(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # phi += dX[step] * delta_phi
        cubl.axpy(alpha=fl_pr(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    # Without any steps there is no per-iteration time to report; the
    # unguarded division used to raise ZeroDivisionError for nsteps == 0.
    if nsteps > 0:
        print("Performance: {0:6.2f}ms/iteration".format(
            1e3 * (time() - start) / float(nsteps)))

    return cu_curr_phi.copy_to_host(), []