Beispiel #1
0
class CUDASparseContext(object):
    def __init__(self, int_m, dec_m, device_id=0):

        if config['FP_precision'] == 32:
            self.fl_pr = np.float32
        elif config['FP_precision'] == 64:
            self.fl_pr = np.float64
        else:
            raise Exception(
                "CUDASparseContext(): Unknown precision specified.")
        #======================================================================
        # Setup GPU stuff and upload data to it
        #======================================================================
        try:
            from accelerate.cuda.blas import Blas
            import accelerate.cuda.sparse as cusparse
            from accelerate.cuda import cuda
        except ImportError:
            raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " +
                            "installed.\nCan not use GPU.")

        cuda.select_device(0)
        self.cuda = cuda
        self.cusp = cusparse.Sparse()
        self.cubl = Blas()
        self.set_matrices(int_m, dec_m)

    def set_matrices(self, int_m, dec_m):
        import accelerate.cuda.sparse as cusparse
        from accelerate.cuda import cuda

        self.m, self.n = int_m.shape
        self.int_m_nnz = int_m.nnz
        self.int_m_csrValA = cuda.to_device(int_m.data.astype(self.fl_pr))
        self.int_m_csrRowPtrA = cuda.to_device(int_m.indptr)
        self.int_m_csrColIndA = cuda.to_device(int_m.indices)

        self.dec_m_nnz = dec_m.nnz
        self.dec_m_csrValA = cuda.to_device(dec_m.data.astype(self.fl_pr))
        self.dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr)
        self.dec_m_csrColIndA = cuda.to_device(dec_m.indices)

        self.descr = self.cusp.matdescr()
        self.descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO
        self.cu_delta_phi = self.cuda.device_array_like(
            np.zeros(self.m, dtype=self.fl_pr))
        print np.zeros(self.m, dtype=self.fl_pr).shape

    def set_phi(self, phi):
        self.cu_curr_phi = self.cuda.to_device(phi.astype(self.fl_pr))

    def get_phi(self):
        return self.cu_curr_phi.copy_to_host()

    def do_step(self, rho_inv, dX):

        self.cusp.csrmv(trans='N',
                        m=self.m,
                        n=self.n,
                        nnz=self.int_m_nnz,
                        descr=self.descr,
                        alpha=self.fl_pr(1.0),
                        csrVal=self.int_m_csrValA,
                        csrRowPtr=self.int_m_csrRowPtrA,
                        csrColInd=self.int_m_csrColIndA,
                        x=self.cu_curr_phi,
                        beta=self.fl_pr(0.0),
                        y=self.cu_delta_phi)
        # print np.sum(cu_curr_phi.copy_to_host())
        self.cusp.csrmv(trans='N',
                        m=self.m,
                        n=self.n,
                        nnz=self.dec_m_nnz,
                        descr=self.descr,
                        alpha=self.fl_pr(rho_inv),
                        csrVal=self.dec_m_csrValA,
                        csrRowPtr=self.dec_m_csrRowPtrA,
                        csrColInd=self.dec_m_csrColIndA,
                        x=self.cu_curr_phi,
                        beta=self.fl_pr(1.0),
                        y=self.cu_delta_phi)
        self.cubl.axpy(alpha=self.fl_pr(dX),
                       x=self.cu_delta_phi,
                       y=self.cu_curr_phi)
Beispiel #2
0
def kern_CUDA_dense(nsteps,
                    dX,
                    rho_inv,
                    int_m,
                    dec_m,
                    phi,
                    grid_idcs,
                    mu_egrid=None,
                    mu_dEdX=None,
                    mu_lidx_nsp=None,
                    prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """

    fl_pr = None
    if config['FP_precision'] == 32:
        fl_pr = np.float32
    elif config['FP_precision'] == 64:
        fl_pr = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    # if config['enable_muon_energyloss']:
    #     raise NotImplementedError('kern_CUDA_dense(): ' +
    #         'Energy loss not imlemented for this solver.')

    if config['enable_muon_energy_loss']:
        raise NotImplementedError(
            'kern_CUDA_dense(): ' +
            'Energy loss not imlemented for this solver.')

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from accelerate.cuda.blas import Blas
        from accelerate.cuda import cuda
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(fl_pr), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(fl_pr), stream)
    cu_curr_phi = cuda.to_device(phi.astype(fl_pr), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=fl_pr)

    from time import time
    start = time()

    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        cubl.gemv(trans='N',
                  m=m,
                  n=n,
                  alpha=fl_pr(1.0),
                  A=cu_int_m,
                  x=cu_curr_phi,
                  beta=fl_pr(0.0),
                  y=cu_delta_phi)
        cubl.gemv(trans='N',
                  m=m,
                  n=n,
                  alpha=fl_pr(rho_inv[step]),
                  A=cu_dec_m,
                  x=cu_curr_phi,
                  beta=fl_pr(1.0),
                  y=cu_delta_phi)
        cubl.axpy(alpha=fl_pr(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) /
                                                     float(nsteps))

    return cu_curr_phi.copy_to_host(), []