def gemm_v1():
    '''
    Note that all arrays are in Fortran order.
    '''
    print("Version 1".center(80, '='))
    # Prepare arrays for input
    A = np.array(np.arange(N**2, dtype=np.float32).reshape(N, N), order='F')
    B = np.array(np.arange(N) + 10, dtype=A.dtype, order='F')
    D = np.zeros_like(A, order='F')

    # NumPy
    start = timer()
    E = np.dot(A, np.diag(B))
    numpy_time = timer() - start
    print("Numpy took %f seconds" % numpy_time)

    # cuBLAS
    blas = Blas()

    start = timer()
    blas.gemm('N', 'N', N, N, N, 1.0, A, np.diag(B), 1.0, D)
    cuda_time = timer() - start

    print("CUBLAS took %f seconds" % cuda_time)
    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))
def gemm_v2():
    """
    Let GEMM transpose the input matrices so that they can be in C order,
    originally.  Note that the output matrix is still in Fortran array.
    The string arguments in gemm tells it to apply transformation on the input
    matrices.
    See argument description in:
        http://docs.continuum.io/accelerate/cublas#blas-level-2
    """
    print("Version 2".center(80, '='))
    # Prepare arrays for input
    A = np.array(np.arange(N**2, dtype=np.float32).reshape(N, N))
    B = np.array(np.arange(N) + 10, dtype=A.dtype)
    D = np.zeros_like(A, order='F')

    # NumPy
    start = timer()
    E = np.dot(A, np.diag(B))
    numpy_time = timer() - start
    print("Numpy took %f seconds" % numpy_time)

    # cuBLAS
    blas = Blas()

    start = timer()
    blas.gemm('T', 'T', N, N, N, 1.0, A, np.diag(B), 1.0, D)
    cuda_time = timer() - start

    print("CUBLAS took %f seconds" % cuda_time)
    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))
Example 6
def mscAsianOption(C_P, S0, K, T, r, sigma, M, I):
    if C_P == 'C':
        par = 1.0
    elif C_P == 'P':
        par = -1.0
    else:
        raise ValueError("C_P must be 'C' (call) or 'P' (put)")

    prng = rand.PRNG(seed=10000)
    blas = Blas()  # cuBLAS handle, used below for GPU-accelerated sums
    dt = T / M
    S = np.zeros((M + 1, I))
    ArithSum = np.zeros((1, I))
    GeoSum = np.ones((1, I))
    S[0] = S0
    sigma_x = sigma * sqrt(float((M + 1) * (2 * M + 1)) / (6 * M**2))
    r_x = float((r - 0.5 * sigma**2) * (M + 1)) / (2 * M) + 0.5 * sigma_x**2

    d1 = (log(float(S0) / K) +
          (r_x + 0.5 * sigma_x**2) * T) / (sigma_x * sqrt(T))
    d2 = d1 - sigma_x * sqrt(T)
    S_geoc = par * S0 * exp(r_x * T) * stats.norm.cdf(par * d1)
    K_geoc = par * K * stats.norm.cdf(par * d2)
    Vgeoc = exp(-r * T) * (S_geoc - K_geoc)

    for t in range(1, M + 1):
        z = np.empty(I)
        prng.normal(z, mean=0, sigma=1)
        S[t] = S[t - 1] * np.exp((r - 0.5 * sigma**2) * dt +
                                 sigma * sqrt(dt) * z)

    for i in range(1, M, 2):  # assumes M is even
        ArithSum += S[i] + S[i + 1]  # running arithmetic sum over the path
        GeoSum *= S[i] * S[i + 1]    # running geometric product over the path

    X = np.maximum(par * ArithSum / M - par * K, 0.0)
    Y = np.maximum(par * np.power(GeoSum, 1.0 / M) - par * K, 0.0)
    theta = np.cov(X, Y)[0][1] / np.var(Y)
    D = exp(-r * T)
    Varith = D * blas.asum(X[0]) / I  # asum sums |x|; payoffs are non-negative
    Vgeo = D * blas.asum(Y[0]) / I

    # V = Varith+theta*(Vgeoc - Vgeo)
    Varray = D * X + theta * (Vgeoc - D * Y)
    Vmean = blas.asum(Varray[0]) / I
    Vstd = np.std(Varray)
    Vconf = [Vmean - 1.96 * Vstd / sqrt(I), Vmean + 1.96 * Vstd / sqrt(I)]
    return Vmean, Vconf, Varith, Vgeo
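A hypothetical invocation of mscAsianOption with illustrative parameter values (M should be even so the pairwise summation loop covers the whole path):

V, conf, Va, Vg = mscAsianOption('C', S0=100.0, K=100.0, T=1.0,
                                 r=0.05, sigma=0.2, M=50, I=100000)
print("Price %f, 95%% CI %s" % (V, conf))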
Example 7
def mscBasketOption(C_P, S10, S20, K, T, r, sigma1, sigma2, cov, I):
    if C_P == 'C':
        par = 1.0
    elif C_P == 'P':
        par = -1.0
    else:
        raise ValueError("C_P must be 'C' (call) or 'P' (put)")

    prng = rand.PRNG(seed=10000)
    blas = Blas()  # cuBLAS handle for GPU-accelerated sums
    D = exp(-r * T)

    z1 = np.empty(I)
    z2 = np.empty(I)
    prng.normal(z1, mean=0, sigma=1)
    prng.normal(z2, mean=0, sigma=1)

    z3 = cov * z1 + sqrt(1 - cov**2) * z2

    S1T = S10 * np.exp((r - 0.5 * sigma1**2) * T + sigma1 * sqrt(T) * z1)
    S2T = S20 * np.exp((r - 0.5 * sigma2**2) * T + sigma2 * sqrt(T) * z3)

    Sa = (S1T + S2T) * 0.5
    Sg = np.sqrt(S1T * S2T)
    hTa = np.maximum(par * Sa - par * K, 0)
    hTg = np.maximum(par * Sg - par * K, 0)
    Va = D * blas.asum(hTa) / I
    Vg = D * blas.asum(hTg) / I

    B0 = sqrt(S10 * S20)
    Bsigma = 0.5 * sqrt(sigma1**2 + sigma2**2 + 2 * sigma1 * sigma2 * cov)
    Bu = r - (sigma1**2 + sigma2**2) / (2 * 2) + 0.5 * Bsigma**2
    d1_x_numerator = log(float(B0) / K) + (Bu + 0.5 * Bsigma**2) * T
    d1_x = d1_x_numerator / (Bsigma * sqrt(T))
    d2_x = d1_x - Bsigma * sqrt(T)

    CB_numerator = par * B0 * exp(Bu * T) * stats.norm.cdf(
        par * d1_x) - par * K * stats.norm.cdf(par * d2_x)
    Vgc = D * CB_numerator

    theta = np.cov(hTa, hTg)[0][1] / np.var(hTg)

    # V = Va + theta*(Vgc - Vg)
    Varray = D * hTa + theta * (Vgc - D * hTg)
    Vmean = blas.asum(Varray) / I
    Vstd = np.std(Varray)
    Vconf = [Vmean - 1.96 * Vstd / sqrt(I), Vmean + 1.96 * Vstd / sqrt(I)]
    return Vmean, Vconf, Va, Vg, Vgc
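Again a hypothetical call with illustrative values; cov is the correlation between the two Brownian drivers and must lie in [-1, 1]:

V, conf, Va, Vg, Vgc = mscBasketOption('C', S10=100.0, S20=95.0, K=100.0,
                                       T=1.0, r=0.05, sigma1=0.2,
                                       sigma2=0.3, cov=0.5, I=100000)
print("Price %f, 95%% CI %s" % (V, conf))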
Example 8
def simple(data, covfct, u, N=0, nugget=0):

    # calculate the matrices K, and k
    K, k, P = kmatrices(data, covfct, u, N)
    print(len(K))
    # calculate the kriging weights
    # (BLAS gemv computes y = alpha * op(A) x + beta * y)
    weights = Blas().gemv('N', len(K), len(K), 1.0, K, K, 0.0, P)
    weights = np.array(weights)

    # calculate k' * K * k for
    # the kriging variance
    kvar = k.T * weights

    # mean of the variable
    mu = np.mean(data[:, 2])

    # calculate the residuals
    residuals = P[:, 2] - mu

    # calculate the estimation
    estimation = np.dot(weights.T, residuals) + mu

    # calculate the sill and the
    # kriging standard deviation
    sill = np.var(data[:, 2])
    kvar = float(sill + nugget - kvar)
    kstd = np.sqrt(kvar)

    return float(estimation), kstd
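A hypothetical call, assuming kmatrices and a covariance-function callable covfct are defined elsewhere in the module; data is an (n, 3) array of x, y, value rows and u is the estimation point:

est, kstd = simple(data, covfct, u=(10.0, 20.0), N=6, nugget=0.0)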
Example 9

def gemm(A, B, dD):
    '''
    Note that all host arrays are in Fortran order; dD is the output matrix,
    already allocated on the device.
    '''
    N = A.shape[0]  # square matrices

    # cuBLAS
    blas = Blas()

    start = timer()
    blas.gemm('N', 'N', N, N, N, 1.0, A, B, 1.0, dD)
    cuda_time = timer() - start

    D = dD.copy_to_host()
    print("CUBLAS took %f seconds" % cuda_time)
    E = np.dot(A, B)  # NumPy reference result for the error check
    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))
    return D
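A hypothetical driver for this helper; the output buffer must be allocated on the device first (sizes and values are illustrative):

from accelerate.cuda import cuda

n = 256
A = np.asfortranarray(np.random.rand(n, n).astype(np.float32))
B = np.asfortranarray(np.random.rand(n, n).astype(np.float32))
dD = cuda.to_device(np.zeros((n, n), dtype=np.float32, order='F'))
D = gemm(A, B, dD)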
Example 10
class CUDASparseContext(object):
    def __init__(self, int_m, dec_m, device_id=0):

        if config['FP_precision'] == 32:
            self.fl_pr = np.float32
        elif config['FP_precision'] == 64:
            self.fl_pr = np.float64
        else:
            raise Exception(
                "CUDASparseContext(): Unknown precision specified.")
        #======================================================================
        # Setup GPU stuff and upload data to it
        #======================================================================
        try:
            from accelerate.cuda.blas import Blas
            import accelerate.cuda.sparse as cusparse
            from accelerate.cuda import cuda
        except ImportError:
            raise Exception("kern_CUDA_sparse(): Numbapro CUDA libraries not "
                            "installed.\nCannot use GPU.")

        cuda.select_device(device_id)
        self.cuda = cuda
        self.cusp = cusparse.Sparse()
        self.cubl = Blas()
        self.set_matrices(int_m, dec_m)

    def set_matrices(self, int_m, dec_m):
        import accelerate.cuda.sparse as cusparse
        from accelerate.cuda import cuda

        self.m, self.n = int_m.shape
        self.int_m_nnz = int_m.nnz
        self.int_m_csrValA = cuda.to_device(int_m.data.astype(self.fl_pr))
        self.int_m_csrRowPtrA = cuda.to_device(int_m.indptr)
        self.int_m_csrColIndA = cuda.to_device(int_m.indices)

        self.dec_m_nnz = dec_m.nnz
        self.dec_m_csrValA = cuda.to_device(dec_m.data.astype(self.fl_pr))
        self.dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr)
        self.dec_m_csrColIndA = cuda.to_device(dec_m.indices)

        self.descr = self.cusp.matdescr()
        self.descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO
        self.cu_delta_phi = self.cuda.device_array_like(
            np.zeros(self.m, dtype=self.fl_pr))
        print(np.zeros(self.m, dtype=self.fl_pr).shape)

    def set_phi(self, phi):
        self.cu_curr_phi = self.cuda.to_device(phi.astype(self.fl_pr))

    def get_phi(self):
        return self.cu_curr_phi.copy_to_host()

    def do_step(self, rho_inv, dX):

        self.cusp.csrmv(trans='N',
                        m=self.m,
                        n=self.n,
                        nnz=self.int_m_nnz,
                        descr=self.descr,
                        alpha=self.fl_pr(1.0),
                        csrVal=self.int_m_csrValA,
                        csrRowPtr=self.int_m_csrRowPtrA,
                        csrColInd=self.int_m_csrColIndA,
                        x=self.cu_curr_phi,
                        beta=self.fl_pr(0.0),
                        y=self.cu_delta_phi)
        # print np.sum(cu_curr_phi.copy_to_host())
        self.cusp.csrmv(trans='N',
                        m=self.m,
                        n=self.n,
                        nnz=self.dec_m_nnz,
                        descr=self.descr,
                        alpha=self.fl_pr(rho_inv),
                        csrVal=self.dec_m_csrValA,
                        csrRowPtr=self.dec_m_csrRowPtrA,
                        csrColInd=self.dec_m_csrColIndA,
                        x=self.cu_curr_phi,
                        beta=self.fl_pr(1.0),
                        y=self.cu_delta_phi)
        self.cubl.axpy(alpha=self.fl_pr(dX),
                       x=self.cu_delta_phi,
                       y=self.cu_curr_phi)
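For reference, one do_step(rho_inv, dX) call performs the update below; a dense NumPy sketch, where int_m and dec_m stand in for the dense counterparts of the uploaded CSR matrices:

delta_phi = int_m.dot(phi) + rho_inv * dec_m.dot(phi)  # the two csrmv calls
phi += dX * delta_phi                                  # the axpy call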
Example 11
def kern_CUDA_dense(nsteps,
                    dX,
                    rho_inv,
                    int_m,
                    dec_m,
                    phi,
                    grid_idcs,
                    mu_egrid=None,
                    mu_dEdX=None,
                    mu_lidx_nsp=None,
                    prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      tuple: state vector :math:`\\Phi(X_{nsteps})` after integration, plus an
      empty list (this kernel does not record intermediate grid solutions)
    """

    fl_pr = None
    if config['FP_precision'] == 32:
        fl_pr = np.float32
    elif config['FP_precision'] == 64:
        fl_pr = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    if config['enable_muon_energy_loss']:
        raise NotImplementedError(
            'kern_CUDA_dense(): ' +
            'Energy loss not implemented for this solver.')

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from accelerate.cuda.blas import Blas
        from accelerate.cuda import cuda
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libraries not "
                        "installed.\nCannot use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(fl_pr), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(fl_pr), stream)
    cu_curr_phi = cuda.to_device(phi.astype(fl_pr), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=fl_pr)

    from time import time
    start = time()

    for step in range(nsteps):
        if prog_bar:
            prog_bar.update(step)
        cubl.gemv(trans='N',
                  m=m,
                  n=n,
                  alpha=fl_pr(1.0),
                  A=cu_int_m,
                  x=cu_curr_phi,
                  beta=fl_pr(0.0),
                  y=cu_delta_phi)
        cubl.gemv(trans='N',
                  m=m,
                  n=n,
                  alpha=fl_pr(rho_inv[step]),
                  A=cu_dec_m,
                  x=cu_curr_phi,
                  beta=fl_pr(1.0),
                  y=cu_delta_phi)
        cubl.axpy(alpha=fl_pr(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) /
                                                     float(nsteps))

    return cu_curr_phi.copy_to_host(), []
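For intuition, the GPU loop above implements the same forward-Euler update as this pure-NumPy sketch (dense matrices, no timing or progress bar):

for step in range(nsteps):
    delta_phi = int_m.dot(phi) + rho_inv[step] * dec_m.dot(phi)
    phi = phi + dX[step] * delta_phi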