def gemm_v1():
    '''
    Note that all arrays are in Fortran order.
    '''
    print("Version 1".center(80, '='))
    # Prepare arrays for input
    A = np.array(np.arange(N**2, dtype=np.float32).reshape(N, N), order='F')
    B = np.array(np.arange(N) + 10, dtype=A.dtype, order='F')
    D = np.zeros_like(A, order='F')

    # NumPy
    start = timer()
    E = np.dot(A, np.diag(B))
    numpy_time = timer() - start
    print("Numpy took %f seconds" % numpy_time)

    # cuBLAS
    blas = Blas()

    start = timer()
    blas.gemm('N', 'N', N, N, N, 1.0, A, np.diag(B), 1.0, D)
    cuda_time = timer() - start

    print("CUBLAS took %f seconds" % cuda_time)
    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))

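# gemm_v1 and gemm_v2 rely on module-level names (np, N, timer, Blas). A
# minimal, hypothetical setup that makes the two snippets runnable could look
# like this; the value of N is an illustrative assumption, not taken from the
# original module.
import numpy as np
from timeit import default_timer as timer
from accelerate.cuda.blas import Blas

N = 1024  # matrix dimension used by gemm_v1/gemm_v2
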
def gemm_v2():
    """
    Let GEMM transpose the input matrices so that they can be supplied in
    C order. Note that the output matrix is still a Fortran-ordered array.

    The string arguments to gemm tell it which transformation to apply to
    each input matrix.

    See the argument description in:
        http://docs.continuum.io/accelerate/cublas#blas-level-2
    """
    print("Version 2".center(80, '='))
    # Prepare arrays for input
    A = np.array(np.arange(N**2, dtype=np.float32).reshape(N, N))
    B = np.array(np.arange(N) + 10, dtype=A.dtype)
    D = np.zeros_like(A, order='F')

    # NumPy
    start = timer()
    E = np.dot(A, np.diag(B))
    numpy_time = timer() - start
    print("Numpy took %f seconds" % numpy_time)

    # cuBLAS
    blas = Blas()

    start = timer()
    blas.gemm('T', 'T', N, N, N, 1.0, A, np.diag(B), 1.0, D)
    cuda_time = timer() - start

    print("CUBLAS took %f seconds" % cuda_time)
    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))

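# Why the 'T' flags in gemm_v2 work: a C-ordered NumPy array handed to the
# column-major cuBLAS routines is interpreted as its transpose, so asking
# gemm to transpose it again recovers the original matrix. A NumPy-only
# sketch of that identity (array values are illustrative):
import numpy as np

A_c = np.arange(6, dtype=np.float32).reshape(2, 3)   # 2x3 matrix, C order
buf = A_c.ravel(order='C')                            # the raw row-major buffer
as_col_major = buf.reshape(3, 2, order='F')           # reread the buffer column-major
assert np.array_equal(as_col_major, A_c.T)            # column-major view == transpose
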
def mscAsianOption(C_P, S0, K, T, r, sigma, M, I):
    if C_P == 'C':
        par = 1.0
    elif C_P == 'P':
        par = -1.0
    else:
        return ("the value cannot be calculated")

    prng = rand.PRNG(seed=10000)
    prngx = Blas()

    dt = T / M
    S = np.zeros((M + 1, I))
    ArithSum = np.zeros((1, I))
    GeoSum = np.ones((1, I))
    S[0] = S0

    # Closed-form price of the geometric-average Asian option (control variate)
    sigma_x = sigma * sqrt(float((M + 1) * (2 * M + 1)) / (6 * M**2))
    r_x = float((r - 0.5 * sigma**2) * (M + 1)) / (2 * M) + 0.5 * sigma_x**2
    d1 = (log(float(S0) / K) + (r_x + 0.5 * sigma_x**2) * T) / (sigma_x * sqrt(T))
    d2 = d1 - sigma_x * sqrt(T)
    S_geoc = par * S0 * exp(r_x * T) * stats.norm.cdf(par * d1)
    K_geoc = par * K * stats.norm.cdf(par * d2)
    Vgeoc = exp(-r * T) * (S_geoc - K_geoc)

    # Simulate the price paths
    for t in range(1, M + 1):
        z = np.empty(I)
        prng.normal(z, mean=0, sigma=1)
        S[t] = S[t - 1] * np.exp((r - 0.5 * sigma**2) * dt +
                                 sigma * sqrt(dt) * z)

    for i in range(1, M, 2):
        ArithSum += S[i] + S[i + 1]  # ts: time series
        GeoSum *= S[i] * S[i + 1]

    X = np.maximum(par * ArithSum / M - par * K, 0.0)
    Y = np.maximum(par * np.power(GeoSum, 1.0 / M) - par * K, 0.0)
    theta = np.cov(X, Y)[0][1] / np.var(Y)

    D = exp(-r * T)
    Varith = D * prngx.asum(X[0]) / I
    Vgeo = D * prngx.asum(Y[0]) / I
    # V = Varith + theta * (Vgeoc - Vgeo)
    Varray = D * X + theta * (Vgeoc - D * Y)
    Vmean = prngx.asum(Varray[0]) / I
    Vstd = np.std(Varray)
    Vconf = [Vmean - 1.96 * Vstd / sqrt(I), Vmean + 1.96 * Vstd / sqrt(I)]

    return Vmean, Vconf, Varith, Vgeo

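# A hypothetical driver for mscAsianOption. The function relies on
# module-level names (np, exp/log/sqrt, stats, rand, Blas); the imports below
# are an assumption about how the original module provides them, and the
# option parameters are purely illustrative.
import numpy as np
from math import exp, log, sqrt
from scipy import stats
import accelerate.cuda.rand as rand
from accelerate.cuda.blas import Blas

if __name__ == '__main__':
    # At-the-money call, 1-year maturity, 100 monitoring dates, 1e5 paths
    Vmean, Vconf, Varith, Vgeo = mscAsianOption(
        'C', S0=100.0, K=100.0, T=1.0, r=0.05, sigma=0.2, M=100, I=100000)
    print("arithmetic Asian call (control variate): %.4f, 95%% CI %s"
          % (Vmean, Vconf))
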
def mscBasketOption(C_P, S10, S20, K, T, r, sigma1, sigma2, cov, I):
    if C_P == 'C':
        par = 1.0
    elif C_P == 'P':
        par = -1.0
    else:
        return ("the value cannot be calculated")

    prng = rand.PRNG(seed=10000)
    prngx = Blas()
    D = exp(-r * T)

    # Correlated terminal prices for the two assets
    z1 = np.empty(I)
    z2 = np.empty(I)
    prng.normal(z1, mean=0, sigma=1)
    prng.normal(z2, mean=0, sigma=1)
    z3 = cov * z1 + sqrt(1 - cov**2) * z2
    S1T = S10 * np.exp((r - 0.5 * sigma1**2) * T + sigma1 * sqrt(T) * z1)
    S2T = S20 * np.exp((r - 0.5 * sigma2**2) * T + sigma2 * sqrt(T) * z3)

    Sa = (S1T + S2T) * 0.5
    Sg = np.sqrt(S1T * S2T)
    hTa = np.maximum(par * Sa - par * K, 0)
    hTg = np.maximum(par * Sg - par * K, 0)
    Va = D * prngx.asum(hTa) / I
    Vg = D * prngx.asum(hTg) / I

    # Closed-form price of the geometric basket option (control variate)
    B0 = sqrt(S10 * S20)
    Bsigma = 0.5 * sqrt(sigma1**2 + sigma2**2 + 2 * sigma1 * sigma2 * cov)
    Bu = r - (sigma1**2 + sigma2**2) / (2 * 2) + 0.5 * Bsigma**2
    d1_x_nominator = log(float(B0) / K) + (Bu + 0.5 * Bsigma**2) * T
    d1_x = d1_x_nominator / (Bsigma * sqrt(T))
    d2_x = d1_x - Bsigma * sqrt(T)
    CB_nominator = par * B0 * exp(Bu * T) * stats.norm.cdf(
        par * d1_x) - par * K * stats.norm.cdf(par * d2_x)
    Vgc = D * CB_nominator

    theta = np.cov(hTa, hTg)[0][1] / np.var(hTg)
    # V = Va + theta * (Vgc - Vg)
    Varray = D * hTa + theta * (Vgc - D * hTg)
    Vmean = prngx.asum(Varray) / I
    Vstd = np.std(Varray)
    Vconf = [Vmean - 1.96 * Vstd / sqrt(I), Vmean + 1.96 * Vstd / sqrt(I)]

    return Vmean, Vconf, Va, Vg, Vgc

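# Under the same assumed imports as the sketch after mscAsianOption, a
# hypothetical call pricing a two-asset basket put with 30% correlation.
# All parameter values are illustrative.
if __name__ == '__main__':
    Vmean, Vconf, Va, Vg, Vgc = mscBasketOption(
        'P', S10=100.0, S20=95.0, K=100.0, T=1.0, r=0.05,
        sigma1=0.2, sigma2=0.3, cov=0.3, I=100000)
    print("basket put (control variate): %.4f, 95%% CI %s" % (Vmean, Vconf))
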
def simple(data, covfct, u, N=0, nugget=0):
    # calculate the matrices K, and k
    K, k, P = kmatrices(data, covfct, u, N)
    print(len(K))

    # calculate the kriging weights
    weights = Blas().gemv('N', len(K), len(K), 1.0, K, K, 0.0, P)
    weights = np.array(weights)

    # calculate k' * K * k for
    # the kriging variance
    kvar = k.T * weights

    # mean of the variable
    mu = np.mean(data[:, 2])

    # calculate the residuals
    residuals = P[:, 2] - mu

    # calculate the estimation
    estimation = np.dot(weights.T, residuals) + mu

    # calculate the sill and the
    # kriging standard deviation
    sill = np.var(data[:, 2])
    kvar = float(sill + nugget - kvar)
    kstd = np.sqrt(kvar)

    return float(estimation), kstd

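# A hypothetical call of the simple-kriging estimator above. `data` is an
# (n, 3) array of (x, y, value) samples, `covfct` a covariance function of
# lag distance, and `u` the location to estimate; kmatrices is assumed to be
# provided by the surrounding kriging module. Names and values here are
# illustrative only.
import numpy as np

data = np.array([[0.0, 0.0, 1.2],
                 [1.0, 0.0, 0.8],
                 [0.0, 1.0, 1.1],
                 [1.0, 1.0, 0.9]])
covfct = lambda h: np.exp(-h)   # toy exponential covariance model
est, kstd = simple(data, covfct, u=(0.5, 0.5), N=4)
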
def gemm(A, B, dD):
    '''
    Note that all arrays are in Fortran order.
    '''
    N = A.shape[0]  # square matrices

    # cuBLAS
    blas = Blas()

    start = timer()
    blas.gemm('N', 'N', N, N, N, 1.0, A, B, 1.0, dD)
    cuda_time = timer() - start

    D = dD.copy_to_host()
    print("CUBLAS took %f seconds" % cuda_time)
    return D

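# A hypothetical driver for the device-array gemm wrapper above: allocate
# Fortran-ordered host matrices, push them to the GPU, and hand the device
# output buffer to gemm(). The matrix size and contents are illustrative.
import numpy as np
from accelerate.cuda import cuda
from accelerate.cuda.blas import Blas
from timeit import default_timer as timer

n = 512
A_h = np.asfortranarray(np.random.rand(n, n).astype(np.float32))
B_h = np.asfortranarray(np.random.rand(n, n).astype(np.float32))
dA = cuda.to_device(A_h)
dB = cuda.to_device(B_h)
dD = cuda.to_device(np.zeros_like(A_h, order='F'))
D = gemm(dA, dB, dD)
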
class CUDASparseContext(object):

    def __init__(self, int_m, dec_m, device_id=0):
        if config['FP_precision'] == 32:
            self.fl_pr = np.float32
        elif config['FP_precision'] == 64:
            self.fl_pr = np.float64
        else:
            raise Exception(
                "CUDASparseContext(): Unknown precision specified.")
        #======================================================================
        # Setup GPU stuff and upload data to it
        #======================================================================
        try:
            from accelerate.cuda.blas import Blas
            import accelerate.cuda.sparse as cusparse
            from accelerate.cuda import cuda
        except ImportError:
            raise Exception("kern_CUDA_sparse(): Numbapro CUDA libraries not " +
                            "installed.\nCannot use GPU.")
        cuda.select_device(device_id)
        self.cuda = cuda
        self.cusp = cusparse.Sparse()
        self.cubl = Blas()
        self.set_matrices(int_m, dec_m)

    def set_matrices(self, int_m, dec_m):
        import accelerate.cuda.sparse as cusparse
        from accelerate.cuda import cuda

        # Upload the CSR components of both matrices to the device
        self.m, self.n = int_m.shape
        self.int_m_nnz = int_m.nnz
        self.int_m_csrValA = cuda.to_device(int_m.data.astype(self.fl_pr))
        self.int_m_csrRowPtrA = cuda.to_device(int_m.indptr)
        self.int_m_csrColIndA = cuda.to_device(int_m.indices)

        self.dec_m_nnz = dec_m.nnz
        self.dec_m_csrValA = cuda.to_device(dec_m.data.astype(self.fl_pr))
        self.dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr)
        self.dec_m_csrColIndA = cuda.to_device(dec_m.indices)

        self.descr = self.cusp.matdescr()
        self.descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO

        self.cu_delta_phi = self.cuda.device_array_like(
            np.zeros(self.m, dtype=self.fl_pr))
        print(np.zeros(self.m, dtype=self.fl_pr).shape)

    def set_phi(self, phi):
        self.cu_curr_phi = self.cuda.to_device(phi.astype(self.fl_pr))

    def get_phi(self):
        return self.cu_curr_phi.copy_to_host()

    def do_step(self, rho_inv, dX):
        # delta_phi = int_m * phi
        self.cusp.csrmv(trans='N', m=self.m, n=self.n, nnz=self.int_m_nnz,
                        descr=self.descr, alpha=self.fl_pr(1.0),
                        csrVal=self.int_m_csrValA,
                        csrRowPtr=self.int_m_csrRowPtrA,
                        csrColInd=self.int_m_csrColIndA,
                        x=self.cu_curr_phi, beta=self.fl_pr(0.0),
                        y=self.cu_delta_phi)
        # print np.sum(cu_curr_phi.copy_to_host())
        # delta_phi += rho_inv * dec_m * phi
        self.cusp.csrmv(trans='N', m=self.m, n=self.n, nnz=self.dec_m_nnz,
                        descr=self.descr, alpha=self.fl_pr(rho_inv),
                        csrVal=self.dec_m_csrValA,
                        csrRowPtr=self.dec_m_csrRowPtrA,
                        csrColInd=self.dec_m_csrColIndA,
                        x=self.cu_curr_phi, beta=self.fl_pr(1.0),
                        y=self.cu_delta_phi)
        # phi += dX * delta_phi
        self.cubl.axpy(alpha=self.fl_pr(dX),
                       x=self.cu_delta_phi,
                       y=self.cu_curr_phi)

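# A hypothetical end-to-end use of CUDASparseContext: build two sparse CSR
# matrices, upload them, and advance the state vector a few Euler steps.
# Matrix contents, step sizes and densities are illustrative only; `config`
# is assumed to be the module-level configuration dict used above.
import numpy as np
from scipy.sparse import csr_matrix

dim = 1000
int_m = csr_matrix(np.diag(-np.ones(dim)))        # toy interaction matrix
dec_m = csr_matrix(np.diag(-0.1 * np.ones(dim)))  # toy decay matrix

ctx = CUDASparseContext(int_m, dec_m)
ctx.set_phi(np.ones(dim))
for step in range(10):
    ctx.do_step(rho_inv=1.0, dX=0.1)
phi = ctx.get_phi()
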
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, mu_egrid=None, mu_dEdX=None,
                    mu_lidx_nsp=None, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_
    implementation of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically
    slower than :func:`kern_MKL_sparse`, but this depends on your hardware.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      prog_bar (object,optional): handle to :class:`ProgressBar` object

    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """

    fl_pr = None
    if config['FP_precision'] == 32:
        fl_pr = np.float32
    elif config['FP_precision'] == 64:
        fl_pr = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    if config['enable_muon_energy_loss']:
        raise NotImplementedError(
            'kern_CUDA_dense(): ' +
            'Energy loss not implemented for this solver.')

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from accelerate.cuda.blas import Blas
        from accelerate.cuda import cuda
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libraries not " +
                        "installed.\nCannot use GPU.")

    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(fl_pr), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(fl_pr), stream)
    cu_curr_phi = cuda.to_device(phi.astype(fl_pr), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=fl_pr)

    from time import time
    start = time()

    for step in range(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # delta_phi = int_m * phi
        cubl.gemv(trans='N', m=m, n=n, alpha=fl_pr(1.0), A=cu_int_m,
                  x=cu_curr_phi, beta=fl_pr(0.0), y=cu_delta_phi)
        # delta_phi += rho_inv * dec_m * phi
        cubl.gemv(trans='N', m=m, n=n, alpha=fl_pr(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=fl_pr(1.0),
                  y=cu_delta_phi)
        # phi += dX * delta_phi
        cubl.axpy(alpha=fl_pr(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    print("Performance: {0:6.2f}ms/iteration".format(
        1e3 * (time() - start) / float(nsteps)))

    return cu_curr_phi.copy_to_host(), []

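# A hypothetical call of kern_CUDA_dense. The dense interaction/decay
# matrices, density profile and initial state below are toy stand-ins for the
# physics matrices the docstring refers to; `config` is again assumed to be
# the module-level configuration dict.
import numpy as np

dim = 500
nsteps = 100
int_m = -np.eye(dim)                 # toy interaction matrix
dec_m = -0.1 * np.eye(dim)           # toy decay matrix
phi0 = np.ones(dim)                  # initial state vector
dX = np.full(nsteps, 1.0)            # step sizes in g/cm**2
rho_inv = np.full(nsteps, 1.0)       # inverse density along the path

phi, _ = kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m, phi0,
                         grid_idcs=[])
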