def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs,
                    prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_
    implementation of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically
    slower compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    The floating point precision of all device arrays and of the scalar
    factors handed to cuBLAS is selected by ``config['CUDA_precision']``
    (``32`` or ``64``).

    In every step ``i`` the kernel issues three cuBLAS calls::

        delta  = gemv('T', int_m, phi)                     # beta = 0
        delta += rho_inv[i] * gemv('T', dec_m, phi)        # beta = 1
        phi   += dX[i] * delta                             # axpy

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in
        g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values
        :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix`; it is
        uploaded to the GPU and passed to the dense ``gemv`` routine
      dec_m (numpy.array): decay matrix :eq:`dec_matrix`; it is multiplied
        using the dimensions taken from ``int_m.shape``
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: not used by this kernel
      prog_bar (object,optional): handle to :class:`ProgressBar` object;
        its ``update(step)`` method is called once per step

    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration,
      copied back from the device in the configured precision

    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or
        if the :mod:`numbapro` CUDA libraries can not be imported
    """

    precision_bits = config['CUDA_precision']
    if precision_bits == 32:
        calc_precision = np.float32
    elif precision_bits == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not "
                        "installed.\nCan not use GPU.")

    cubl = Blas()
    m, n = int_m.shape

    # NOTE(review): the uploads are queued on `stream`, while `Blas()` is
    # created without a stream argument -- confirm that the ordering between
    # the copies and the first gemv call is guaranteed on the target setup.
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    # Scalars are cast with the same dtype as the device arrays. Casting them
    # to float32 unconditionally would round the step sizes and inverse
    # densities to single precision even in 64-bit mode.
    one = calc_precision(1.0)
    zero = calc_precision(0.0)

    for step in range(nsteps):
        if prog_bar:
            prog_bar.update(step)

        # delta_phi = op(int_m) * phi   (beta = 0 overwrites the old delta)
        cubl.gemv(trans='T', m=m, n=n, alpha=one, A=cu_int_m,
                  x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # delta_phi += rho_inv[step] * op(dec_m) * phi   (beta = 1 accumulates)
        cubl.gemv(trans='T', m=m, n=n, alpha=calc_precision(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # phi += dX[step] * delta_phi   (forward-euler update)
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi,
                  y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()