Example 1
import numpy as np

# `config` is the package-level configuration dict (e.g. {'CUDA_precision': 32}),
# assumed to be defined in the surrounding module.

def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation 
    of forward-euler integration.
    
    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.
    
    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs (list): indices of intermediate integration steps (unused in this kernel)
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """
    
    calc_precision = None
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")    
    
    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libraries not " +
                        "installed.\nCannot use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)
    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # delta_phi = int_m^T . phi
        cubl.gemv(trans='T', m=m, n=n, alpha=calc_precision(1.0), A=cu_int_m,
            x=cu_curr_phi, beta=calc_precision(0.0), y=cu_delta_phi)
        # delta_phi += rho_inv[step] * dec_m^T . phi
        cubl.gemv(trans='T', m=m, n=n, alpha=calc_precision(rho_inv[step]),
            A=cu_dec_m, x=cu_curr_phi, beta=calc_precision(1.0), y=cu_delta_phi)
        # phi += dX[step] * delta_phi  (forward-Euler step, scalars cast to
        # the configured precision so they match the device arrays)
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
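
# A minimal pure-NumPy cross-check of the recurrence the kernel evaluates,
#   phi_{i+1} = phi_i + dX_i * (int_m + rho_inv_i * dec_m)^T . phi_i,
# useful for validating the GPU result on small problems. The toy inputs in
# the usage comment below are illustrative assumptions, not data from the
# original package.
def euler_reference(nsteps, dX, rho_inv, int_m, dec_m, phi):
    """Forward-Euler reference implementation on the CPU."""
    phi = phi.copy()
    for step in range(nsteps):
        # same two matrix-vector products as the gemv calls above
        delta_phi = int_m.T.dot(phi) + rho_inv[step] * dec_m.T.dot(phi)
        phi = phi + dX[step] * delta_phi
    return phi

# Usage sketch (hypothetical sizes):
#   rng = np.random.RandomState(0)
#   int_m, dec_m = rng.rand(8, 8) * 1e-3, rng.rand(8, 8) * 1e-3
#   phi_ref = euler_reference(100, np.full(100, 0.1), np.ones(100),
#                             int_m, dec_m, rng.rand(8))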
Example 2
import numpy as np

# Imports assumed by this excerpt; boruvka_minho_gpu, compute_cuda_grid_dim,
# getWeightsOfEdges_gpu, compute_lifetimes_CUDA, removeEdges,
# getGraphFromEdges_gpu and connected_comps_gpu are helpers from the same package.
from numbapro import cuda
from numbapro.cudalib.cublas import Blas
from numbapro.cudalib.sorting import RadixSort

def sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight=None,
                        MAX_TPB=256, stream=None):
    """
    Input are device arrays.
    Inputs:
     dest, weight, fe 		: device arrays
     disconnect_weight 		: weight between unconnected vertices
     mst 					: list of edges in MST
     MAX_TPB 				: number of threads per block
     stream 				: CUDA stream to use
    TODO:
     - argmax is from cuBlas and only works with 32/64 floats. Make this work 
       with any type.
     - 
    """

    if disconnect_weight is None:
        disconnect_weight = weight.max()

    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    mst, n_edges = boruvka_minho_gpu(dest, weight, fe, od,
                                     MAX_TPB=MAX_TPB, stream=myStream,
                                     returnDevAry=True)

    # Allocate array for the mst weights.
    h_n_edges = int(n_edges.getitem(0, stream=myStream)) # edges to keep in MST
    mst_weights = cuda.device_array(h_n_edges, dtype=weight.dtype)    

    # Get array with only the considered weights in the MST
    # and remove those edges in the MST edge list
    mstGrid = compute_cuda_grid_dim(h_n_edges, MAX_TPB)
    d_weight = cuda.to_device(weight, stream=myStream)
    getWeightsOfEdges_gpu[mstGrid, MAX_TPB, myStream](mst, n_edges, d_weight,
                                                      mst_weights)

    # Sort the MST weights. There are no repeated edges at this
    # point since the output MST is like a directed graph.
    sorter = RadixSort(maxcount=mst_weights.size, dtype=mst_weights.dtype,
                       stream=myStream)
    sortedWeightArgs = sorter.argsort(mst_weights)

    # Allocate array for the lifetimes.
    lifetimes = cuda.device_array(mst_weights.size - 1, dtype=mst_weights.dtype)
    compute_lifetimes_CUDA[mstGrid, MAX_TPB, myStream](mst_weights, lifetimes)

    maxer = Blas(myStream)
    arg_max_lt = maxer.amax(lifetimes)
    max_lt = lifetimes.getitem(arg_max_lt)

    # lifetime between unconnected vertices and the weakest link in the MST
    # (mst_weights was sorted in place, so its last element is the largest weight)
    lt_threshold = disconnect_weight - mst_weights.getitem(mst_weights.size - 1)

    # if the maximum lifetime is greater than or equal to the threshold,
    # cut the tree there
    if max_lt >= lt_threshold:
        # from arg_max_lt onward all edges are discarded
        n_discarded = lifetimes.size - arg_max_lt + 1

        # remove the discarded edges from the MST edge list
        removeGrid = compute_cuda_grid_dim(n_discarded, MAX_TPB)
        removeEdges[removeGrid, MAX_TPB, myStream](mst, sortedWeightArgs,
                                                   n_discarded)

        # compute new amount of edges and update it
        new_n_edges = h_n_edges - n_discarded
        cuda.to_device(np.array([new_n_edges], dtype=n_edges.dtype),
                       to=n_edges,
                       stream=myStream)

    ngraph = getGraphFromEdges_gpu(dest, weight, fe, od, edges=mst,
                                   n_edges=n_edges, MAX_TPB=MAX_TPB,
                                   stream=myStream)

    ndest, nweight, nfe, nod = ngraph

    labels = connected_comps_gpu(ndest, nweight, nfe, nod,
                                 MAX_TPB=512, stream=myStream)

    del ndest, nweight, nfe, nod, lifetimes

    return labels
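
# A minimal usage sketch, assuming the CSR-like layout the Boruvka helpers
# expect: dest/weight list every directed edge, while fe and od hold each
# vertex's first-edge offset and out-degree. The toy triangle graph and the
# int32/float32 dtypes are illustrative assumptions, not part of the original
# package; running this requires a CUDA device.

# undirected triangle: 0-1 (w=1), 0-2 (w=2), 1-2 (w=5), both directions stored
dest = cuda.to_device(np.array([1, 2, 0, 2, 0, 1], dtype=np.int32))
weight = cuda.to_device(np.array([1., 2., 1., 5., 2., 5.], dtype=np.float32))
fe = cuda.to_device(np.array([0, 2, 4], dtype=np.int32))  # first edge per vertex
od = cuda.to_device(np.array([2, 2, 2], dtype=np.int32))  # out-degree per vertex

labels = sl_mst_lifetime_gpu(dest, weight, fe, od, MAX_TPB=256)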
Example 3
import numpy as np

# `config` is the package-level configuration dict (e.g. {'CUDA_precision': 32}),
# assumed to be defined in the surrounding module.

def kern_CUDA_sparse(nsteps, dX, rho_inv, int_m, dec_m,
                     phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuSPARSE <https://developer.nvidia.com/cusparse>`_ implementation 
    of forward-euler integration.
    
    Function requires a working :mod:`numbapro` installation.
    
    Note:
      Currently some bug in :mod:`numbapro` introduces unnecessary array copies and
      slows down the execution tremendously. 
    
    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs (list): indices of intermediate integration steps (unused in this kernel)
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """
    calc_precision = None
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_sparse(): Unknown precision specified.")    
    print ("kern_CUDA_sparse(): Warning, the performance is slower than " + 
           "dense cuBLAS or any type of MKL.")
    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib import cusparse  # @UnresolvedImport
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_sparse(): Numbapro CUDA libraries not " +
                        "installed.\nCannot use GPU.")
    cusp = cusparse.Sparse()
    cubl = Blas()
    m, n = int_m.shape
    # upload CSR components of the interaction matrix
    int_m_nnz = int_m.nnz
    int_m_csrValA = cuda.to_device(int_m.data.astype(calc_precision))
    int_m_csrRowPtrA = cuda.to_device(int_m.indptr)
    int_m_csrColIndA = cuda.to_device(int_m.indices)
    
    # upload CSR components of the decay matrix
    dec_m_nnz = dec_m.nnz
    dec_m_csrValA = cuda.to_device(dec_m.data.astype(calc_precision))
    dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr)
    dec_m_csrColIndA = cuda.to_device(dec_m.indices)
    
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision))
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    descr = cusp.matdescr()
    descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO
    
    for step in xrange(nsteps):
        if prog_bar and (step % 5 == 0):
            prog_bar.update(step)
        # delta_phi = int_m^T . phi
        cusp.csrmv(trans='T', m=m, n=n, nnz=int_m_nnz,
                   descr=descr,
                   alpha=calc_precision(1.0),
                   csrVal=int_m_csrValA,
                   csrRowPtr=int_m_csrRowPtrA,
                   csrColInd=int_m_csrColIndA,
                   x=cu_curr_phi, beta=calc_precision(0.0), y=cu_delta_phi)
        # delta_phi += rho_inv[step] * dec_m^T . phi
        cusp.csrmv(trans='T', m=m, n=n, nnz=dec_m_nnz,
                   descr=descr,
                   alpha=calc_precision(rho_inv[step]),
                   csrVal=dec_m_csrValA,
                   csrRowPtr=dec_m_csrRowPtrA,
                   csrColInd=dec_m_csrColIndA,
                   x=cu_curr_phi, beta=calc_precision(1.0), y=cu_delta_phi)
        # phi += dX[step] * delta_phi  (forward-Euler step, scalars cast to
        # the configured precision so they match the device arrays)
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
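
# A minimal usage sketch: the kernel reads the CSR attributes .nnz, .data,
# .indptr and .indices, so scipy.sparse CSR matrices are natural inputs.
# The toy sizes below are illustrative assumptions; running the kernel
# itself additionally requires numbapro, a CUDA device and a populated
# `config` dict.
from scipy.sparse import random as sparse_random

dim, nsteps = 64, 100
int_m = sparse_random(dim, dim, density=0.05, format='csr')
dec_m = sparse_random(dim, dim, density=0.05, format='csr')
dX = np.full(nsteps, 0.1)        # step sizes in g/cm**2
rho_inv = np.ones(nsteps)        # inverse densities along the trajectory
phi0 = np.ones(dim)              # initial state vector

# phi_final = kern_CUDA_sparse(nsteps, dX, rho_inv, int_m, dec_m, phi0, [])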