def reduce_by_key(input_data, chunk_id, literal, length):  # step 3
    """Run the flag and reduce kernels, then keep only the flagged entries.

    A flag array of ``length`` ones is uploaded together with ``chunk_id``
    and ``literal``.  ``produce_flag`` fills the flags, after which
    ``reduce_by_key_gpu`` is launched with hop = 1, 2, 4, 8, 16.  The code
    relies on ``to_host`` refreshing the host arrays ``flag``, ``literal``
    and ``chunk_id`` in place.

    Args:
        input_data: indexable sequence with at least ``length`` entries.
        chunk_id: host array uploaded to the device and copied back.
        literal: host array uploaded to the device and copied back.
        length: number of entries to process.

    Returns:
        tuple: ``(numpy.array(kept input_data), numpy.array(kept chunk_id),
        list of kept literal)`` where "kept" means ``flag[i]`` is non-zero.
    """
    flag = numpy.ones(length, dtype='int32')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)

    produce_flag[1, tpb](input_data, d_chunk_id, length, d_flag)
    d_flag.to_host(stream)
    # Wait for the asynchronous copy before looking at the flags on the host.
    stream.synchronize()
    print('flag:')
    print(flag)

    is_finish = numpy.zeros(length, dtype='int32')
    hop = 1
    while hop < 32:  # only 32 because the length of a word in binary form is 32
        reduce_by_key_gpu[1, tpb](d_literal, d_flag, is_finish, hop, length)
        hop *= 2
    d_literal.to_host(stream)
    d_chunk_id.to_host(stream)
    stream.synchronize()

    reduced_input_data = []
    reduced_chunk_id = []
    reduced_literal = []
    for i in range(length):
        if flag[i]:
            reduced_input_data.append(input_data[i])
            reduced_chunk_id.append(chunk_id[i])
            reduced_literal.append(literal[i])
    return (numpy.array(reduced_input_data),
            numpy.array(reduced_chunk_id),
            reduced_literal)
def tests():
    """Time a handful of matrix operations on NumPy, cuBLAS, a CUDA kernel
    and cudamat, printing each elapsed time.

    The function takes no arguments and returns nothing; all results are
    printed.
    """
    # --- NumPy reference -------------------------------------------------
    a = np.random.rand(300, 500)
    b = np.random.rand(500, 300)
    start = timer()
    c = np.dot(a, b)
    nptime = timer() - start
    print('nptime', nptime)

    # --- cuBLAS gemm -----------------------------------------------------
    x = np.array(np.random.rand(600, 1500), dtype='float32', order='F')
    y = np.array(np.random.rand(1500, 300), dtype='float32', order='F')
    z = np.zeros((1000, 1000), order='F', dtype='float32')
    stream = cuda.stream()
    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dz = cuda.to_device(z)
    start = timer()
    # NOTE(review): the m/n/k passed here (1000, 1500, 1000) do not match the
    # shapes of x (600x1500), y (1500x300) and z (1000x1000) -- confirm the
    # intended dimensions before trusting this timing.
    blas.gemm('N', 'N', 1000, 1500, 1000, 1.0, dx, dy, 0.0, dz)
    cutime = timer() - start
    print('cutime', cutime)
    print(dz[0])

    # --- element-wise tanh kernel ---------------------------------------
    c = np.ones((1000, 1000), order='F', dtype='float32')
    print(c.shape)
    dc = cuda.to_device(c)
    blockDim = (30, 30)
    # Ceiling division so the grid covers the whole matrix; ``//`` keeps the
    # grid dimensions integral.
    gridDim = ((c.shape[0] + blockDim[0] - 1) // blockDim[0],
               (c.shape[1] + blockDim[1] - 1) // blockDim[1])
    start = timer()
    mtanh[gridDim, blockDim, stream](dc)
    tantime = timer() - start
    print('tantime', tantime)
    dc.copy_to_host(c, stream=stream)
    stream.synchronize()
    print(c)

    # --- cudamat ----------------------------------------------------------
    y = cm.CUDAMatrix(np.ones((1000, 1000)))
    start = timer()
    cm.tanh(y)
    cmtan = timer() - start
    print('cmtan', cmtan)

    x = cm.CUDAMatrix(np.random.rand(1000, 1500))
    y = cm.CUDAMatrix(np.random.rand(1500, 1000))
    start = timer()
    cm.dot(x, y)
    cmtime = timer() - start
    print('cmtime', cmtime)
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Launch ``cu_monte_carlo_pricer`` on ``paths`` using one CUDA stream.

    Normal random numbers and Sobol seeds are generated on the device, the
    whole ``paths`` array is uploaded, the kernel is launched once and the
    result is copied back with ``to_host``.

    Args:
        paths: host array; ``paths.shape[0]`` is the number of paths.
        dt: time step.
        interest: interest rate used for the drift term ``c0``.
        volatility: volatility used for ``c0`` and ``c1``.

    Returns:
        None.  The code relies on ``to_host`` writing the result back into
        ``paths``.
    """
    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))

    # All transfers, generators and the kernel share this stream.
    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(n, dtype=np.uint32, stream=stream)
    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    griddim = gridsz, 1
    blockdim = blksz, 1, 1
    cu_monte_carlo_pricer[griddim, blockdim, stream](
        d_paths, dt, c0, c1, d_normdist, d_seed)

    d_paths.to_host(stream)
    stream.synchronize()
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance ``paths`` one column at a time by calling ``step``.

    Column 0 is uploaded as the starting state.  For every later column a
    fresh batch of normal noise is drawn, ``step`` writes into a device
    buffer taken from the ``MM`` pool, and that buffer is copied back into
    the matching column of ``paths``.

    Args:
        paths: 2-D host array; rows are paths, columns are time steps.
        dt: time step.
        interest: interest rate used for the drift term ``c0``.
        volatility: volatility used for ``c0`` and ``c1``.

    Returns:
        None.  ``paths`` is filled in place from column 1 onwards.
    """
    n = paths.shape[0]
    mm = MM(shape=n, dtype=np.double, prealloc=5)

    # Computed but not consumed below: ``step`` is invoked without an
    # explicit launch configuration.
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        # NOTE(review): buffers taken with mm.get() are never handed back to
        # the pool in this function -- confirm whether MM needs an explicit
        # release once d_last is no longer used.
        d_last = d_paths
    stream.synchronize()
def reduce_by_key(input_data, chunk_id, literal, length):  # step 3
    """Run the flag and reduce kernels, then keep only the flagged entries.

    A flag array of ``length`` ones is uploaded together with ``chunk_id``
    and ``literal``.  ``produce_flag`` fills the flags, after which
    ``reduce_by_key_gpu`` is launched with hop = 1, 2, 4, 8, 16.  The code
    relies on ``to_host`` refreshing the host arrays ``flag``, ``literal``
    and ``chunk_id`` in place.

    Args:
        input_data: indexable sequence with at least ``length`` entries.
        chunk_id: host array uploaded to the device and copied back.
        literal: host array uploaded to the device and copied back.
        length: number of entries to process.

    Returns:
        tuple: ``(numpy.array(kept input_data), numpy.array(kept chunk_id),
        list of kept literal)`` where "kept" means ``flag[i]`` is non-zero.
    """
    flag = numpy.ones(length, dtype='int32')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)

    produce_flag[1, tpb](input_data, d_chunk_id, length, d_flag)
    d_flag.to_host(stream)
    # Wait for the asynchronous copy before looking at the flags on the host.
    stream.synchronize()
    print('flag:')
    print(flag)

    is_finish = numpy.zeros(length, dtype='int32')
    hop = 1
    while hop < 32:  # only 32 because the length of a word in binary form is 32
        reduce_by_key_gpu[1, tpb](d_literal, d_flag, is_finish, hop, length)
        hop *= 2
    d_literal.to_host(stream)
    d_chunk_id.to_host(stream)
    stream.synchronize()

    reduced_input_data = []
    reduced_chunk_id = []
    reduced_literal = []
    for i in range(length):
        if flag[i]:
            reduced_input_data.append(input_data[i])
            reduced_chunk_id.append(chunk_id[i])
            reduced_literal.append(literal[i])
    return (numpy.array(reduced_input_data),
            numpy.array(reduced_chunk_id),
            reduced_literal)
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Launch ``cu_monte_carlo_pricer`` on ``paths`` using one CUDA stream.

    Normal random numbers and Sobol seeds are generated on the device, the
    whole ``paths`` array is uploaded, the kernel is launched once and the
    result is copied back with ``to_host``.

    Args:
        paths: host array; ``paths.shape[0]`` is the number of paths.
        dt: time step.
        interest: interest rate used for the drift term ``c0``.
        volatility: volatility used for ``c0`` and ``c1``.

    Returns:
        None.  The code relies on ``to_host`` writing the result back into
        ``paths``.
    """
    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))

    # All transfers, generators and the kernel share this stream.
    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(n, dtype=np.uint32, stream=stream)
    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    griddim = gridsz, 1
    blockdim = blksz, 1, 1
    cu_monte_carlo_pricer[griddim, blockdim, stream](
        d_paths, dt, c0, c1, d_normdist, d_seed)

    d_paths.to_host(stream)
    stream.synchronize()
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance ``paths`` column by column with a pre-configured ``step`` kernel.

    Column 0 is uploaded as the starting state.  For every later column a
    fresh batch of normal noise is drawn, the configured kernel writes into
    a device buffer taken from the ``MM`` pool, and that buffer is copied
    back into the matching column of ``paths``.

    Args:
        paths: 2-D host array; rows are paths, columns are time steps.
        dt: time step.
        interest: interest rate used for the drift term ``c0``.
        volatility: volatility used for ``c0`` and ``c1``.

    Returns:
        None.  ``paths`` is filled in place from column 1 onwards.
    """
    n = paths.shape[0]
    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    step_cfg = step[gridsz, blksz, stream]

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        # The previous column's buffer is no longer needed; return it to the
        # pool so later mm.get() calls can reuse it.
        # NOTE(review): reconstructed call -- confirm MM exposes
        # free(buffer, stream=...).
        mm.free(d_last, stream=stream)
        d_last = d_paths
    stream.synchronize()
def radix_sort(arr, rid):
    """Drive the per-bit passes of a GPU radix sort over ``arr`` and ``rid``.

    For each of ``len(bin(ATTR_CARD_MAX))`` passes the function uploads the
    inputs, builds the zero/one lists with ``get_list``, reduces the zero
    list to a scalar ``base`` with ``sum_reduction``, scans both lists with
    ``Blelloch_scan_caller`` and finally launches ``array_adjust``.

    Args:
        arr: host array of keys.
        rid: host array that is passed alongside ``arr`` to ``array_adjust``.

    Returns:
        None.  NOTE(review): presumably ``array_adjust`` rearranges ``arr``
        and ``rid`` in place -- confirm against the kernel.
    """
    length = numpy.int64(len(arr))
    # NOTE(review): len(bin(x)) also counts the '0b' prefix, so this is two
    # larger than the true bit count -- confirm this padding is intended.
    bin_length = max(len(bin(length - 1)), len(bin(TPB_MAX - 1)))
    thread_num = numpy.int64(math.pow(2, bin_length))
    block_num = max(thread_num // TPB_MAX, 1)

    stream = cuda.stream()
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')

    iter_num = len(bin(ATTR_CARD_MAX))
    for i in range(iter_num):
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list, stream)
        d_one_list = cuda.to_device(one_list, stream)

        # get one_list and zero_list
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        stream.synchronize()

        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num, base_reduction_block_size](
            d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()

        # base for the scan of one_list: total of the per-block partial sums
        base = 0
        for partial in tmp_out:
            base += partial

        Blelloch_scan_caller(d_zero_list, d_one_list, base)
        array_adjust[block_num, TPB_MAX](
            arr, d_arr, rid, d_rid, zero_list, one_list,
            d_zero_list, d_one_list, length)
def main():
    """Run a Jacobi relaxation on a 4096 x 4096 mesh and print progress.

    The first column of both grids is set to 1.0, the grids are uploaded
    once, and ``jocabi_relax_core`` is launched until the maximum absolute
    value in the error grid drops to ``tol`` or ``iter_max`` iterations have
    run.  Progress is printed every 100 iterations and the total runtime at
    the end.
    """
    NN = 4096
    NM = 4096

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM
    iter_max = 1000

    tol = 1.0e-6
    error = 1.0

    # Boundary condition: first column of both grids is 1.0.
    A[:n, 0] = 1.0
    Anew[:n, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    timer = time.time()
    iteration = 0

    blockdim = (tpb, tpb)
    # Floor division keeps the grid dimensions integral.
    griddim = (NN // blockdim[0], NM // blockdim[1])

    error_grid = np.zeros(griddim)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)          # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and iteration < iter_max:
        assert error_grid.dtype == np.float64

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

        derror_grid.to_host(stream)

        # error_grid is available on host
        stream.synchronize()

        error = np.abs(error_grid).max()

        # swap dA and dAnew
        dA, dAnew = dAnew, dA

        if iteration % 100 == 0:
            print("%5d, %0.6f (elapsed: %f s)"
                  % (iteration, error, time.time() - timer))

        iteration += 1

    runtime = time.time() - timer
    print(" total: %f s" % runtime)
def main():
    """Run a Jacobi relaxation on a 4096 x 4096 mesh and print progress.

    The first column of both grids is set to 1.0, the grids are uploaded
    once, and ``jocabi_relax_core`` is launched with 32 x 32 blocks until
    the maximum absolute value in the error grid drops to ``tol`` or
    ``iter_max`` iterations have run.  Progress is printed every 100
    iterations and the total runtime at the end.
    """
    NN = 4096
    NM = 4096

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM
    iter_max = 1000

    tol = 1.0e-6
    error = 1.0

    # Boundary condition: first column of both grids is 1.0.
    A[:n, 0] = 1.0
    Anew[:n, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    timer = time.time()
    iteration = 0

    blockdim = (32, 32)
    # Floor division keeps the grid dimensions integral.
    griddim = (NN // blockdim[0], NM // blockdim[1])

    # One error entry per mesh cell in this variant.
    error_grid = np.zeros_like(A)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)          # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and iteration < iter_max:
        assert error_grid.dtype == np.float64

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

        derror_grid.to_host(stream)

        # error_grid is available on host
        stream.synchronize()

        error = np.abs(error_grid).max()

        # swap dA and dAnew
        dA, dAnew = dAnew, dA

        if iteration % 100 == 0:
            print("%5d, %0.6f (elapsed: %f s)"
                  % (iteration, error, time.time() - timer))

        iteration += 1

    runtime = time.time() - timer
    print(" total: %f s" % runtime)
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance ``paths`` column by column, split across two CUDA streams.

    The rows of ``paths`` are divided into ``num_streams`` contiguous
    partitions.  Each partition gets its own stream, random generator,
    noise buffer and configured ``cu_step`` kernel.  For every column after
    the first, each partition draws noise, runs the kernel and copies its
    slice back into ``paths``.

    Args:
        paths: 2-D host array; rows are paths, columns are time steps.
        dt: time step.
        interest: interest rate used for the drift term ``c0``.
        volatility: volatility used for ``c0`` and ``c1``.

    Returns:
        None.  ``paths`` is filled in place from column 1 onwards.
    """
    n = paths.shape[0]

    num_streams = 2

    # Split the rows into equally wide partitions; the last one is clipped
    # to n.
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, d_last, d_paths, d_norm in zip(steplist, d_lastlist,
                                                 d_pathslist, d_normlist):
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            # Return each partition's previous buffer to the pool.
            # NOTE(review): reconstructed call -- confirm MM exposes
            # free(buffer, stream=...).
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
def get_indexList(path, attr_selected):
    """Look up the selected attribute values and build the index list on the GPU.

    Two pickle files located via ``bitmap_pickle.get_pic_path(path)`` are
    read, the selected attribute names are translated to their numeric ids
    (``'All'`` becomes -1), and ``index_gpu`` is launched over the bitmap
    lists returned by ``get_attr``.

    Args:
        path: location handed to ``bitmap_pickle.get_pic_path``.
        attr_selected: per-attribute sequences of selected values; indexed
            ``attr_selected[i]`` for ``i`` in ``range(attr_num)``.

    Returns:
        tuple: ``(index_list, elapsed_seconds)``.  When the number of
        attribute inputs does not match ``attr_num`` an empty int32 array is
        returned as ``index_list``.
    """
    path1, path2, attr_num = bitmap_pickle.get_pic_path(path)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with trusted index files.
    with open(path1, 'rb') as f1:  # read data_map.pkl
        attr_map = pickle.load(f1)
        attr_list = pickle.load(f1)  # unused, but must be read to advance the file
        attr_total = pickle.load(f1)

    with open(path2, 'rb') as f2:  # read bitmap_pic.pkl
        lists = pickle.load(f2)
        key = pickle.load(f2)
        offset = pickle.load(f2)

    # attr_input is a list that stores the numbers of input attributes
    # attr_num is the total number of attributes
    # attr_total is the total number of data/31
    attr_input = [[] for _ in range(attr_num)]
    for i in range(attr_num):
        for attri in attr_selected[i]:
            if attri in attr_map[i]:
                attr_input[i].append(attr_map[i][attri])
            elif attri == 'All':
                attr_input[i].append(-1)
        # 'All' is redundant once explicit values are present.
        if len(attr_input[i]) > 1 and (-1 in attr_input[i]):
            attr_input[i].remove(-1)
    print(attr_input)

    search_start_time = time.time()
    if len(attr_input) != attr_num:  # there might be a wrong input in
        print('No eligible projects')
        return numpy.zeros(0, dtype='int32'), time.time() - search_start_time

    tpb = 1024
    blocknum = 1
    # attr_mul is the number that each thread need to be performed
    attr_mul = (attr_total + (tpb * blocknum - 1)) // (tpb * blocknum)

    index_list = numpy.zeros(attr_total * 31, dtype='int32')
    bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key, offset)

    stream = cuda.stream()
    d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream)
    d_index_list = cuda.to_device(numpy.array(index_list), stream)
    index_gpu[blocknum, tpb, stream](
        d_bitmap_list, d_index_list, attr_num, attr_total, attr_mul)
    index_list = d_index_list.copy_to_host()
    stream.synchronize()

    search_end_time = time.time()
    return index_list, search_end_time - search_start_time
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs,
                    prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically
    slower compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: accepted for interface compatibility; not used by this kernel
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        numbapro cannot be imported.
    """
    calc_precision = None
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda, float32  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    # NOTE(review): the scalars below are always wrapped in float32, even
    # when calc_precision is float64 -- confirm this is intended.
    for step in range(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # delta_phi = int_m^T . phi
        cubl.gemv(trans='T', m=m, n=n, alpha=float32(1.0), A=cu_int_m,
                  x=cu_curr_phi, beta=float32(0.0), y=cu_delta_phi)
        # delta_phi += rho_inv[step] * dec_m^T . phi
        cubl.gemv(trans='T', m=m, n=n, alpha=float32(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=float32(1.0),
                  y=cu_delta_phi)
        # phi += dX[step] * delta_phi
        cubl.axpy(alpha=float32(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
def main():
    """Benchmark four induced-velocity implementations and print the results.

    Random vortex positions and strengths are generated, then
    ``induced_velocity`` (labelled "Numpy"), ``induced_velocity2``
    ("Numba"), ``induced_velocity3`` ("GPU") and ``induced_velocity4``
    ("GPU smem") are timed.  Each later result is compared with the first
    one and the speed-up relative to it is printed.
    """
    vort = np.array(np.random.rand(2 * n), dtype=dtype).reshape((n, 2))
    gamma = np.array(np.random.rand(n), dtype=dtype)

    # --- reference run ----------------------------------------------------
    vel = np.zeros_like(vort)
    start = timer()
    induced_velocity(vort, vort, gamma, vel)
    numpy_time = timer() - start
    print("n = %d" % n)
    print("Numpy".center(40, "="))
    print("Time: %f seconds" % numpy_time)

    # --- second CPU implementation -----------------------------------------
    vel2 = np.zeros_like(vort)
    start = timer()
    induced_velocity2(vort, vort, gamma, vel2)
    numba_time = timer() - start
    print("Numba".center(40, "="))
    print("Time: %f seconds" % numba_time)
    error = np.max(np.max(np.abs(vel2 - vel)))
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / numba_time))

    # --- GPU kernel ---------------------------------------------------------
    stream = cuda.stream()
    d_vort = cuda.to_device(vort, stream)
    d_gamma = cuda.to_device(gamma, stream)
    vel3 = np.zeros_like(vort)
    d_vel = cuda.to_device(vel3, stream)
    griddim = (n - 1) // blksize + 1
    start = timer()
    induced_velocity3[griddim, blksize, stream](d_vort, d_vort, d_gamma, d_vel)
    d_vel.to_host(stream)
    # Launch and copy are queued on the stream; wait for them so the timing
    # and the comparison below see the finished result.
    stream.synchronize()
    gpu_time = timer() - start
    error = np.max(np.max(np.abs(vel3 - vel)))
    print("GPU".center(40, "="))
    print("Time: %f seconds" % gpu_time)
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / gpu_time))

    # --- second GPU kernel --------------------------------------------------
    vel4 = np.zeros_like(vort)
    d_vel2 = cuda.to_device(vel4, stream)
    start = timer()
    induced_velocity4[griddim, blksize, stream](d_vort, d_vort, d_gamma, d_vel2)
    d_vel2.to_host(stream)
    stream.synchronize()
    gpu2_time = timer() - start
    error = np.max(np.max(np.abs(vel4 - vel)))
    print("GPU smem".center(40, "="))
    print("Time: %f seconds" % gpu2_time)
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / gpu2_time))
def get_indexList(path, attr_selected):
    """Look up the selected attribute values and build the index list on the GPU.

    Two pickle files located via ``bitmap_pickle.get_pic_path(path)`` are
    read, the selected attribute names are translated to their numeric ids
    (``'All'`` becomes -1), and ``index_gpu`` is launched over the bitmap
    lists returned by ``get_attr``.

    Args:
        path: location handed to ``bitmap_pickle.get_pic_path``.
        attr_selected: per-attribute sequences of selected values; indexed
            ``attr_selected[i]`` for ``i`` in ``range(attr_num)``.

    Returns:
        tuple: ``(index_list, elapsed_seconds)``.  When the number of
        attribute inputs does not match ``attr_num`` an empty int32 array is
        returned as ``index_list``.
    """
    path1, path2, attr_num = bitmap_pickle.get_pic_path(path)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with trusted index files.
    with open(path1, 'rb') as f1:  # read data_map.pkl
        attr_map = pickle.load(f1)
        attr_list = pickle.load(f1)  # unused, but must be read to advance the file
        attr_total = pickle.load(f1)

    with open(path2, 'rb') as f2:  # read bitmap_pic.pkl
        lists = pickle.load(f2)
        key = pickle.load(f2)
        offset = pickle.load(f2)

    # attr_input is a list that stores the numbers of input attributes
    # attr_num is the total number of attributes
    # attr_total is the total number of data/31
    attr_input = [[] for _ in range(attr_num)]
    for i in range(attr_num):
        for attri in attr_selected[i]:
            if attri in attr_map[i]:
                attr_input[i].append(attr_map[i][attri])
            elif attri == 'All':
                attr_input[i].append(-1)
        # 'All' is redundant once explicit values are present.
        if len(attr_input[i]) > 1 and (-1 in attr_input[i]):
            attr_input[i].remove(-1)
    print(attr_input)

    search_start_time = time.time()
    if len(attr_input) != attr_num:  # there might be a wrong input in
        print('No eligible projects')
        return numpy.zeros(0, dtype='int32'), time.time() - search_start_time

    tpb = 1024
    blocknum = 1
    # attr_mul is the number that each thread need to be performed
    attr_mul = (attr_total + (tpb * blocknum - 1)) // (tpb * blocknum)

    index_list = numpy.zeros(attr_total * 31, dtype='int32')
    bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key, offset)

    stream = cuda.stream()
    d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream)
    d_index_list = cuda.to_device(numpy.array(index_list), stream)
    index_gpu[blocknum, tpb, stream](
        d_bitmap_list, d_index_list, attr_num, attr_total, attr_mul)
    index_list = d_index_list.copy_to_host()
    stream.synchronize()

    search_end_time = time.time()
    return index_list, search_end_time - search_start_time
def main():
    """Time-step a 128 x 128 flow grid on the GPU and print the elapsed time.

    Each of the ``nt`` steps computes ``P`` on the host with ``ppe``,
    launches ``CudaU`` and copies ``U`` and ``V`` back.  The elapsed wall
    time for all steps is printed at the end.
    """
    flowtime = 0.1
    nx = 128
    ny = 128
    dx = 2.0 / (nx - 1)
    dy = 2.0 / (ny - 1)
    dt = dx / 50  # ensures stability for a given mesh fineness

    rho = 1.0
    nu = .1

    # number of timesteps required to reach the specified total flowtime
    nt = int(flowtime / dt)

    U = numpy.zeros((nx, ny), dtype=numpy.float32)
    U[-1, :] = 1
    V = numpy.zeros((nx, ny), dtype=numpy.float32)
    P = numpy.zeros((ny, nx), dtype=numpy.float32)

    UN = numpy.zeros((nx, ny), dtype=numpy.float32)
    VN = numpy.zeros((nx, ny), dtype=numpy.float32)

    griddim = nx, ny
    # NOTE(review): 768 x 768 threads per block looks far above what a CUDA
    # block normally allows -- confirm this launch configuration.
    blockdim = 768, 768, 1

    t1 = time.time()

    # Target the GPU to begin calculation
    stream = cuda.stream()
    d_U = cuda.to_device(U, stream)
    d_V = cuda.to_device(V, stream)
    d_UN = cuda.to_device(UN, stream)
    d_VN = cuda.to_device(VN, stream)

    for i in range(nt):
        P = ppe(rho, dt, dx, dy, U, V, P)
        CudaU[griddim, blockdim, stream](
            d_U, d_V, P, d_UN, d_VN, dx, dy, dt, rho, nu)
        # Bring U and V back so the next ppe() call sees the updated fields.
        d_U.to_host(stream)
        d_V.to_host(stream)
        stream.synchronize()

    t2 = time.time()

    print("Completed grid of %d by %d in %.6f seconds" % (nx, ny, t2 - t1))

    x = numpy.linspace(0, 2, nx)
    y = numpy.linspace(0, 2, ny)
    Y, X = numpy.meshgrid(y, x)
def produce_fill(reduced_input_data, reduced_chunk_id, reduced_length):  # step 4
    """Run the head and fill kernels and return the chunk ids as int32.

    ``produce_head`` fills a head array from ``reduced_input_data``;
    ``produce_fill_gpu`` then works on ``reduced_chunk_id`` using that head
    array.  The code relies on ``to_host`` refreshing the host array
    ``reduced_chunk_id`` in place.

    Args:
        reduced_input_data: host array uploaded for ``produce_head``.
        reduced_chunk_id: host array of chunk ids; uploaded and copied back.
        reduced_length: number of entries to process.

    Returns:
        numpy.ndarray: ``reduced_chunk_id`` converted to ``int32``.
    """
    head = numpy.ones(reduced_length, dtype='int32')
    stream = cuda.stream()
    d_head = cuda.to_device(head, stream)
    d_reduced_input_data = cuda.to_device(reduced_input_data, stream)

    # produce head
    produce_head[1, tpb](d_reduced_input_data, d_head, reduced_length)
    d_head.to_host(stream)
    stream.synchronize()

    d_reduced_chunk_id = cuda.to_device(reduced_chunk_id, stream)
    produce_fill_gpu[1, tpb](
        d_head, d_reduced_chunk_id, reduced_chunk_id, reduced_length)
    d_reduced_chunk_id.to_host(stream)
    stream.synchronize()

    # convert to int32 because the range a fill_word can describe is 0~(2^31-1)
    return numpy.array(reduced_chunk_id, dtype='int32')
def main():
    """Set up the simulation parameters and launch ``VectorizedMonteCarlo``.

    The kernel is launched once on a dedicated stream with a 1-D
    configuration sized for ``OPT_N`` threads, and the function waits for it
    to finish.  Nothing is returned.
    """
    timeintial = time.time()

    OPT_N = 4000000

    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    stream = cuda.stream()

    ###### Initialize Parameters ######
    strike = 80
    t = 1
    expiry = 10
    spot = 105
    sigma = .3
    rate = .03
    dividend = 0
    # Alpha apparently measures performance compared to the projected performance
    alpha = .69

    ## TODO: Figure out what to set dt to
    dt = 1
    Vbar = 0.02
    xi = .025

    # Steps in time (the earlier N = 10 was overwritten before use)
    N = 100
    # Number of simulations (the earlier M = 100 was overwritten before use)
    M = 2000

    beta1 = -.88
    beta2 = -.42
    beta3 = -.0003

    sigma2 = sigma ** 2

    # The values below are computed but not passed to the kernel.
    alphadt = alpha * dt
    xisdt = xi * np.sqrt(dt)
    erddt = np.exp((rate - dividend) * dt)
    egam1 = np.exp(2 * (rate - dividend) * dt)
    egam2 = -2 * erddt + 1
    eveg1 = np.exp(-alpha * dt)
    eveg2 = Vbar - Vbar * eveg1

    tau = expiry - t

    VectorizedMonteCarlo[griddim, blockdim, stream](
        spot, rate, sigma, expiry, N, M, strike, sigma2, Vbar, dt, xi,
        alpha, dividend, tau)
    stream.synchronize()
def __init__(self, gpuID=None, stream=None):
    """Select the GPU, set up the CUDA stream and create the cuBLAS handle.

    Args:
        gpuID: optional device index; must satisfy
            ``0 <= gpuID < len(cuda.list_devices())``.  When given, the
            current context is closed and that device is selected.
        stream: optional existing ``numba.cuda.cudadrv.driver.Stream``.  A
            new stream is created when omitted.

    Raises:
        ValueError: if ``gpuID`` is out of range.
        AssertionError: if ``stream`` is given but is not a ``Stream``.
    """
    if gpuID is not None:
        if 0 <= gpuID < len(cuda.list_devices()):
            cuda.close()
            cuda.select_device(gpuID)
        else:
            raise ValueError('GPU ID not found')

    # NOTE(review): the attribute name ``self.stream`` is reconstructed --
    # confirm against the rest of the class.
    if stream is None:
        self.stream = cuda.stream()
    else:
        assert isinstance(stream, numba.cuda.cudadrv.driver.Stream)
        self.stream = stream

    self.blas = numbapro.cudalib.cublas.Blas(stream=self.stream)
    self.blockdim = 32
    self.blockdim2 = (32, 32)
def main():
    """Time-step a 128 x 128 flow grid on the GPU and print the elapsed time.

    Each of the ``nt`` steps computes ``P`` on the host with ``ppe``,
    launches ``CudaU`` and copies ``U`` and ``V`` back.  The elapsed wall
    time for all steps is printed at the end.
    """
    flowtime = 0.1
    nx = 128
    ny = 128
    dx = 2.0 / (nx - 1)
    dy = 2.0 / (ny - 1)
    dt = dx / 50  # ensures stability for a given mesh fineness

    rho = 1.0
    nu = .1

    # number of timesteps required to reach the specified total flowtime
    nt = int(flowtime / dt)

    U = numpy.zeros((nx, ny), dtype=numpy.float32)
    U[-1, :] = 1
    V = numpy.zeros((nx, ny), dtype=numpy.float32)
    P = numpy.zeros((ny, nx), dtype=numpy.float32)

    UN = numpy.zeros((nx, ny), dtype=numpy.float32)
    VN = numpy.zeros((nx, ny), dtype=numpy.float32)

    griddim = nx, ny
    # NOTE(review): 768 x 768 threads per block looks far above what a CUDA
    # block normally allows -- confirm this launch configuration.
    blockdim = 768, 768, 1

    t1 = time.time()

    # Target the GPU to begin calculation
    stream = cuda.stream()
    d_U = cuda.to_device(U, stream)
    d_V = cuda.to_device(V, stream)
    d_UN = cuda.to_device(UN, stream)
    d_VN = cuda.to_device(VN, stream)

    for i in range(nt):
        P = ppe(rho, dt, dx, dy, U, V, P)
        CudaU[griddim, blockdim, stream](
            d_U, d_V, P, d_UN, d_VN, dx, dy, dt, rho, nu)
        # Bring U and V back so the next ppe() call sees the updated fields.
        d_U.to_host(stream)
        d_V.to_host(stream)
        stream.synchronize()

    t2 = time.time()

    print("Completed grid of %d by %d in %.6f seconds" % (nx, ny, t2 - t1))

    x = numpy.linspace(0, 2, nx)
    y = numpy.linspace(0, 2, ny)
    Y, X = numpy.meshgrid(y, x)
def spca_simpler(Vd, epsilon=0.1, d=3, k=10):
    """Search random directions for the best k-sparse vector.

    ``numSamples = ceil((4 / epsilon) ** d)`` random unit vectors ``c`` of
    length ``d`` are drawn.  For each one, ``a = Vd.dot(c)`` is computed and
    its k selected entries are scored by their Euclidean norm.  The
    candidate with the largest norm wins.

    Args:
        Vd: array with ``Vd.shape[0] == p`` rows; multiplied by ``d x 1``
            sample vectors.
        epsilon: accuracy parameter controlling the number of samples.
        d: length of each random sample vector.
        k: number of entries selected per candidate.

    Returns:
        numpy.ndarray: ``(p, 1)`` array that is zero except at the selected
        indices of the best candidate, where it holds the normalised values.
    """
    p = Vd.shape[0]
    numSamples = int(math.ceil((4. / epsilon) ** d))
    print(numSamples)

    ##actual algorithm
    opt_x = np.zeros((p, 1))
    opt_v = -np.inf

    # Prepare CUDA
    prng = curand.PRNG()
    custr = cuda.stream()

    # GENERATE ALL RANDOM SAMPLES BEFORE
    C = np.empty((d, numSamples), dtype=float_dtype)
    prng.normal(C.ravel(), mean=0, sigma=1)

    sorter = RadixSort(maxcount=Vd.shape[0], dtype=Vd.dtype, stream=custr,
                       descending=True)

    for i in range(numSamples):
        c = C[:, i:i + 1]
        c = c / np.linalg.norm(c)
        a = Vd.dot(c)

        # NOTE(review): the code relies on argselect leaving the k selected
        # values in a[:k] and returning their original indices -- confirm
        # against the RadixSort API.
        I = sorter.argselect(k, a[:, 0])
        val = np.linalg.norm(a[:k])  # index to get k largest

        if val > opt_v:
            opt_v = val
            opt_x = np.zeros((p, 1), dtype=float_dtype)
            opt_x[I] = a[:k] / val

    return opt_x
def produce_fill(reduced_input_data, reduced_chunk_id, reduced_length):  # step 4
    """Run the head and fill kernels and return the chunk ids as int32.

    ``produce_head`` fills a head array from ``reduced_input_data``;
    ``produce_fill_gpu`` then works on ``reduced_chunk_id`` using that head
    array.  The code relies on ``to_host`` refreshing the host array
    ``reduced_chunk_id`` in place.

    Args:
        reduced_input_data: host array uploaded for ``produce_head``.
        reduced_chunk_id: host array of chunk ids; uploaded and copied back.
        reduced_length: number of entries to process.

    Returns:
        numpy.ndarray: ``reduced_chunk_id`` converted to ``int32``.
    """
    head = numpy.ones(reduced_length, dtype='int32')
    stream = cuda.stream()
    d_head = cuda.to_device(head, stream)
    d_reduced_input_data = cuda.to_device(reduced_input_data, stream)

    # produce head
    produce_head[1, tpb](d_reduced_input_data, d_head, reduced_length)
    d_head.to_host(stream)
    stream.synchronize()

    d_reduced_chunk_id = cuda.to_device(reduced_chunk_id, stream)
    produce_fill_gpu[1, tpb](
        d_head, d_reduced_chunk_id, reduced_chunk_id, reduced_length)
    d_reduced_chunk_id.to_host(stream)
    stream.synchronize()

    # convert to int32 because the range a fill_word can describe is 0~(2^31-1)
    return numpy.array(reduced_chunk_id, dtype='int32')
def spca_simpler(Vd, epsilon=0.1, d=3, k=10):
    """Search random directions for the best k-sparse vector.

    ``numSamples = ceil((4 / epsilon) ** d)`` random unit vectors ``c`` of
    length ``d`` are drawn.  For each one, ``a = Vd.dot(c)`` is computed and
    its k selected entries are scored by their Euclidean norm.  The
    candidate with the largest norm wins.

    Args:
        Vd: array with ``Vd.shape[0] == p`` rows; multiplied by ``d x 1``
            sample vectors.
        epsilon: accuracy parameter controlling the number of samples.
        d: length of each random sample vector.
        k: number of entries selected per candidate.

    Returns:
        numpy.ndarray: ``(p, 1)`` array that is zero except at the selected
        indices of the best candidate, where it holds the normalised values.
    """
    p = Vd.shape[0]
    numSamples = int(math.ceil((4. / epsilon) ** d))
    print(numSamples)

    ##actual algorithm
    opt_x = np.zeros((p, 1))
    opt_v = -np.inf

    # Prepare CUDA
    prng = curand.PRNG()
    custr = cuda.stream()

    # GENERATE ALL RANDOM SAMPLES BEFORE
    C = np.empty((d, numSamples), dtype=float_dtype)
    prng.normal(C.ravel(), mean=0, sigma=1)

    sorter = RadixSort(maxcount=Vd.shape[0], dtype=Vd.dtype, stream=custr,
                       descending=True)

    for i in range(numSamples):
        c = C[:, i:i + 1]
        c = c / np.linalg.norm(c)
        a = Vd.dot(c)

        # NOTE(review): the code relies on argselect leaving the k selected
        # values in a[:k] and returning their original indices -- confirm
        # against the RadixSort API.
        I = sorter.argselect(k, a[:, 0])
        val = np.linalg.norm(a[:k])  # index to get k largest

        if val > opt_v:
            opt_v = val
            opt_x = np.zeros((p, 1), dtype=float_dtype)
            opt_x[I] = a[:k] / val

    return opt_x
def compute_block(self):
    """Run ``cu_one_block`` for this block and return the results on the host.

    Uniform random numbers are generated on the device, the proposals and
    omegas held on ``self`` are uploaded, the kernel is launched with
    ``self.grid_dim`` / ``self.threads_per_block`` and the result array is
    copied back.

    Returns:
        A ``(self.size, self.size)`` host array filled by the kernel.
    """
    device_uniforms = curand.uniform(size=N * N, device=True)
    host_results = zeros((self.size, self.size))

    # Uploads, kernel launch and download are all queued on this stream.
    stream = cuda.stream()
    device_proposals = cuda.to_device(self.host_proposals, stream=stream)
    device_omegas = cuda.to_device(self.host_omegas, stream=stream)
    device_results = cuda.device_array_like(host_results, stream=stream)

    cu_one_block[self.grid_dim, self.threads_per_block, stream](
        self.start, device_proposals, device_omegas, device_uniforms,
        device_results, self.size, self.size)

    device_results.copy_to_host(host_results, stream=stream)
    stream.synchronize()
    return host_results
def radix_sort(arr, rid):
    """Drive the per-bit passes of a GPU radix sort over ``arr`` and ``rid``.

    For each of ``len(bin(ATTR_CARD_MAX))`` passes the function uploads the
    inputs, builds the zero/one lists with ``get_list``, reduces the zero
    list to a scalar ``base`` with ``sum_reduction``, scans both lists with
    ``Blelloch_scan_caller`` and finally launches ``array_adjust``.

    Args:
        arr: host array of keys.
        rid: host array that is passed alongside ``arr`` to ``array_adjust``.

    Returns:
        None.  NOTE(review): presumably ``array_adjust`` rearranges ``arr``
        and ``rid`` in place -- confirm against the kernel.
    """
    length = numpy.int64(len(arr))
    # NOTE(review): len(bin(x)) also counts the '0b' prefix, so this is two
    # larger than the true bit count -- confirm this padding is intended.
    bin_length = max(len(bin(length - 1)), len(bin(TPB_MAX - 1)))
    thread_num = numpy.int64(math.pow(2, bin_length))
    block_num = max(thread_num // TPB_MAX, 1)

    stream = cuda.stream()
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')

    iter_num = len(bin(ATTR_CARD_MAX))
    for i in range(iter_num):
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list, stream)
        d_one_list = cuda.to_device(one_list, stream)

        # get one_list and zero_list
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        stream.synchronize()

        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num, base_reduction_block_size](
            d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()

        # base for the scan of one_list: total of the per-block partial sums
        base = 0
        for partial in tmp_out:
            base += partial

        Blelloch_scan_caller(d_zero_list, d_one_list, base)
        array_adjust[block_num, TPB_MAX](
            arr, d_arr, rid, d_rid, zero_list, one_list,
            d_zero_list, d_one_list, length)
def block_increment(start, n):
    """Run ``cuda_div`` then ``block_kernel`` on device 0 and return an n x n result.

    Two vectors of ``n`` normal samples are generated on the device and
    passed to ``cuda_div`` together with an output vector.  That output and
    ``n * n`` uniform samples are then passed to ``block_kernel``, whose
    result is copied back to the host.

    Args:
        start: value forwarded unchanged to ``block_kernel``.
        n: vector length; the result is ``n x n``.

    Returns:
        numpy.ndarray: float32 array of shape ``(n, n)``.
    """
    cuda.select_device(0)
    stream = cuda.stream()
    blockdim = 256
    griddim = n // 256 + 1

    c_host = np.zeros((n, n), dtype=np.float32)
    m_dev = curand.normal(0, 1, n, dtype=np.float32, device=True)
    n_dev = curand.normal(0, 1, n, dtype=np.float32, device=True)
    a_host = np.zeros(n, dtype=np.float32)
    a_dev = cuda.device_array_like(a_host)

    # a_dev stays on the device for the second kernel; it is never copied
    # back to the host.
    cuda_div[griddim, blockdim, stream](m_dev, n_dev, a_dev, n)

    # NOTE(review): m_dev and n_dev are not released explicitly; the original
    # author's guess was that Python garbage collection frees the device
    # memory -- confirm.
    b_dev = curand.uniform((n * n), dtype=np.float32, device=True)
    c_dev = cuda.device_array_like(c_host, stream)
    block_kernel[griddim, blockdim, stream](start, n, a_dev, b_dev, c_dev)

    c_dev.copy_to_host(c_host, stream)
    stream.synchronize()
    return c_host
def mc_cuda(paths, dt, interest, volatility):
    """Advance ``paths`` column by column with the ``step_cuda`` kernel.

    Column 0 is uploaded as the starting prices.  For every later column a
    fresh batch of normal noise is drawn, the kernel writes the new prices
    into a freshly allocated device array, and that array is copied back
    into the matching column of ``paths``.

    Args:
        paths: 2-D host array; rows are paths, columns are time steps.
        dt: time step.
        interest: interest rate used for the drift term ``c0``.
        volatility: volatility used for ``c0`` and ``c1``.

    Returns:
        None.  ``paths`` is filled in place from column 1 onwards.
    """
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # instantiate a CUDA stream for queueing async CUDA cmds
    stream = cuda.stream()
    # instantiate a cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # configure the kernel
    # similar to CUDA-C: step_cuda<<<gridsz, blksz, 0, stream>>>
    step_cfg = step_cuda[gridsz, blksz, stream]

    # transfer the initial prices
    d_last = cuda.to_device(paths[:, 0], stream=stream)
    for j in range(1, paths.shape[1]):
        # call cuRAND to populate d_normdist with gaussian noises
        prng.normal(d_normdist, mean=0, sigma=1)

        # setup memory for new prices
        # device_array_like is like empty_like for GPU
        d_paths = cuda.device_array_like(paths[:, j], stream=stream)

        # invoke step kernel asynchronously
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)

        # transfer memory back to the host
        d_paths.copy_to_host(paths[:, j], stream=stream)
        d_last = d_paths

    # wait for all GPU work to complete
    stream.synchronize()
def score_sequence(seq, pssm, verbose = False, keep_strands = True, benchmark = False, blocks_per_grid = -1, threads_per_block = -1): """ This function will score a sequence of nucleotides based on a PSSM by using a sliding window parallelized on a GPU. Args: seq: This must be an integer representation of the nucleotide sequence, where the alphabet is (A = 0, C = 1, G = 2, T = 3). It must be a vector (1D array) of integers that can be cast to int32 (See: numpy.int32). pssm: This must a vectorized PSSM where every four elements correspond to one position. Make sure this can be cast to an array of float64. verbose: Set this to True to print performance information. benchmark: If set to True, the function will return information about the run in a dictionary at the third output variable. keep_strands: Whether memory should be allocated for storing which strand the scores come from. Set this to False if you just want the scores and the strands array will not be returned. NOTE: If this and benchmark are set to False, then the scores will not be returned in a tuple, meaning: >>> score_sequence blocks_per_grid: This is the blocks per grid that will be assigned to the CUDA kernel. See this SO question for info on choosing this value: It defaults to the length of the sequence or the maximum number of blocks per grid supported by the GPU, whichever is lower. Set this to a negative number threads_per_block: Threads per block. See above. It defaults to 55% of the maximum number of threads per block supported by the GPU, a value determined experimentally. Higher values will likely result in failure to allocate resources to the kernel (since there will not be enough register space for each thread). Returns: scores: 1D float64 array of length (n - w + 1), where n is the length of the sequence and w is the window size. The value at index i of this array corresponds to the score of the n-mer at position i in the sequence. strands: 1D int32 array of length (n - w + 1). 
The value at position i is either 0 or 1 corresponding to the strand of the score at that position where 0 means the forward strand and 1 means reverse. run_info: This is a dictionary that is returned if the benchmark parameter is set to True. It contains the following: >>> run_info.keys() ['memory_used', 'genome_size', 'runtime', 'threads_per_block', 'blocks_per_grid'] Note that the memory_used is rather misleading if running the function more than once. CUDA is optimized to not transfer the same data from the host to the device so it will not always change. It may also unload other assets from memory, so the memory changed can be negative. TODO: Find a better method of calculating memory usage. Example: >>> pssm = np.random.uniform(-7.5, 2.0, 4 * 16) # Window size of 16 >>> seq = np.random.randint(0, 3, 30e6) # Generate random 30 million bp sequence >>> scores, strands, run_info = score_sequence(seq, pssm, benchmark=True, verbose=True) Threads per block = 563 Blocks per grid = 53286 Total threads = 30000018 Scoring... Done. 
Genome size: 3e+07 bp Time: 605.78 ms Speed: 4.95229e+07 bp/sec >>> scores array([-16.97089798, -33.48925866, -21.80381526, ..., -10.27919401, -32.64575614, -23.97110103]) >>> strands array([1, 1, 1, ..., 1, 1, 0]) >>> run_info {'memory_used': 426508288L, 'genome_size': 30000000, 'runtime': 0.28268090518054123, 'threads_per_block': 563, 'blocks_per_grid': 53286} A more interesting interpretation of the run information for performance analysis is the number of bases score per second: >>> print "%g bases/sec" % run_info["genome_size"] / run_info["runtime"] 1.06127e+08 bases/sec """ w = int(pssm.size / 4) # width of PSSM n = int(seq.size) # length of the sequence being scored # Calculate the reverse-complement of the PSSM pssm_r = np.array([pssm[i / 4 + (3 - (i % 4))] for i in range(pssm.size)][::-1]) # Calculate the appropriate threads per block and blocks per grid if threads_per_block <= 0 or blocks_per_grid <= 0: # We don't use the max number of threads to avoid running out of # register space by saturating the streaming multiprocessors # ~55% was found empirically, but your mileage may vary with different GPUs threads_per_block = int(cuda.get_current_device().MAX_BLOCK_DIM_X * 0.55) # We saturate our grid and let the dynamic scheduler assign the blocks # to the discrete CUDA cores/streaming multiprocessors blocks_per_grid = int(math.ceil(float(n) / threads_per_block)) if blocks_per_grid > cuda.get_current_device().MAX_GRID_DIM_X: blocks_per_grid = cuda.get_current_device().MAX_GRID_DIM_X if verbose: print "Threads per block = %d" % threads_per_block print "Blocks per grid = %d" % blocks_per_grid print "Total threads = %d" % (threads_per_block * blocks_per_grid) # Collect benchmarking info s = default_timer() start_mem = cuda.current_context().get_memory_info()[0] # Start a stream stream = # Copy data to device d_pssm = cuda.to_device(pssm.astype(np.float64), stream) d_pssm_r = cuda.to_device(pssm_r.astype(np.float64), stream) d_seq = 
cuda.to_device(seq.astype(np.int32), stream) # Allocate memory on device to store results d_scores = cuda.device_array(n - w + 1, dtype=np.float64, stream=stream) if keep_strands: d_strands = cuda.device_array(n - w + 1, dtype=np.int32, stream=stream) # Run the kernel if keep_strands: cuda_score[blocks_per_grid, threads_per_block](d_pssm, d_pssm_r, d_seq, d_scores, d_strands) else: cuda_score_without_strands[blocks_per_grid, threads_per_block](d_pssm, d_pssm_r, d_seq, d_scores) # Copy results back to host scores = d_scores.copy_to_host(stream=stream) if keep_strands: strands = d_strands.copy_to_host(stream=stream) stream.synchronize() # Collect benchmarking info end_mem = cuda.current_context().get_memory_info()[0] t = default_timer() - s # Output info on the run if verbose parameter is true if verbose: print "Genome size: %g bp" % n print "Time: %.2f ms (using time.%s())" % (t * 1000, default_timer.__name__) print "Speed: %g bp/sec" % (n / t) print "Global memory: %d bytes used (%.2f%% of total)" % \ (start_mem - end_mem, float(start_mem - end_mem) * 100 / cuda.get_current_device().get_context().get_memory_info()[1]) # Return the run information for benchmarking run_info = {"genome_size": n, "runtime": t, "memory_used": start_mem - end_mem, \ "blocks_per_grid": blocks_per_grid, "threads_per_block": threads_per_block} # I'm so sorry BDFL, please don't hunt me down for returning different size # tuples in my function if keep_strands: if benchmark: return (scores, strands, run_info) else: return (scores, strands) else: if benchmark: return (scores, run_info) else: # Careful! This won't return a tuple, so you don't need to do # score_sequence[0] to get the scores return scores
def get_trials(params, n_rep=100000): ''' Generates n_rep number of facilitation curves for Go response for all simulated trials required Parameters ------------- params : sequence (4,) of float k_facGo - scale of fac curve pre_t_mean - average start time before target presentation pre_t_sd - standard deviation of start time before target Returns -------- fac_i : array facilitation curves for all simulated trials t : array sequence of time index ''' pre_t_mean, pre_t_sd = params k_facGo = 0.004 tau_facGo = 1.69 inhib_mean = 1.57 inhib_sd = 0.31 t = np.linspace(-.4, .2, 600, endpoint=False, dtype=np.float32) pre_t = np.array(np.random.normal(pre_t_mean, pre_t_sd, size=n_rep), dtype=np.float32) fac_i_parallel = np.zeros((n_rep, t.size), dtype=np.float32) inhib_tonic_parallel = np.zeros((n_rep, t.size)) inhib_parallel = np.random.normal(inhib_mean, inhib_sd, size=n_rep) inhib_tonic_parallel += inhib_parallel[:,np.newaxis] if PAR_TEST: fac_i = np.zeros((n_rep, t.size), dtype=np.float32) t_start = time() for i in range(n_rep): # for each simulated trial myparams = pre_t[i] #k_facGo, tau_facGo, #fac_i[i] = get_fac(t, myparams) #fac_i[i] = fast.get_fac(t, myparams) t_end = time() s_time = t_end - t_start print "Serial time: %.3f s" % s_time # Used for testing get_fac_parallel, it will fill the array fac_i_parallel #get_fac_parallel(fac_i_parallel, n_rep, t, len(t), k_facGo, tau_facGo, pre_t) # Setup CUDA variables tpb_x = 8 # threads per block in x dimension tpb_y = 8 # threads per block in y dimension block_dim = tpb_x, tpb_y bpg_x = int(n_rep / tpb_x) + 1 # block grid x dimension bpg_y = int(t.size / tpb_y) + 1 # block grid y dimension grid_dim = bpg_x, bpg_y t_start = time() stream = with stream.auto_synchronize(): d_fac = cuda.to_device(fac_i_parallel, stream) d_t = cuda.to_device(t, stream) d_pre_t = cuda.to_device(pre_t, stream) #d_inhib_tonic = cuda.to_device(inhib_tonic_parallel, stream) print "CUDA kernel: Block dim: ({tx}, {ty}), Grid dim: ({gx}, 
{gy})".format(tx=tpb_x, ty=tpb_y, gx=bpg_x, gy=bpg_y) get_fac_cuda[grid_dim, block_dim](d_fac, n_rep, t, len(t), k_facGo, tau_facGo, pre_t) #k_facGo, tau_facGo, removed - defined in get_fac_cuda function input argument #get_inhib_tonic_cuda[] d_fac.to_host(stream) t_end = time() c_time = t_end - t_start print "CUDA time: %.3f s" % c_time if PAR_TEST: print "Difference between fac_i and fac_i_parallel" print (fac_i - fac_i_parallel) print "Close enough? ", np.allclose(fac_i, fac_i_parallel, rtol=0, atol=1e-05) print "Speed up: %.3f x" % (s_time / c_time) return fac_i_parallel, inhib_tonic_parallel, t
d_src = cuda.to_device(src) d_dst = cuda.device_array_like(dst) copy_kernel(d_src, out=d_dst) d_dst.copy_to_host(dst) te = timer() print 'regular', te - ts del d_src, d_dst assert np.allclose(dst, src) # Pinned (pagelocked) memory transfer with cuda.pinned(src, dst): ts = timer() stream = # use stream to trigger async memory transfer d_src = cuda.to_device(src, stream=stream) d_dst = cuda.device_array_like(dst, stream=stream) copy_kernel(d_src, out=d_dst, stream=stream) d_dst.copy_to_host(dst, stream=stream) stream.synchronize() te = timer() print 'pinned', te - ts assert np.allclose(dst, src)
def main(*args):
    """Benchmark ``black_scholes_cuda`` on inputs generated on the GPU.

    Parameters
    ----------
    *args :
        When at least two positional arguments are given, the first one is
        parsed as the iteration count (default 10).
        NOTE(review): the ``>= 2`` threshold is kept from the original; a
        single argument is ignored - confirm this is intended.

    Prints the average time per iteration in milliseconds. The setup time
    is charged once per iteration.
    """
    OPT_N = 4000000
    iterations = 10
    if len(args) >= 2:
        iterations = int(args[0])

    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    # Use cuRAND to generate random numbers directly on the GPU
    # to avoid memory transfers.
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)

    time0 = time.time()

    # malloc
    d_stockPrice = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionStrike = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionYears = cuda.device_array(shape=(OPT_N), dtype=np.float32)

    # Base distribution
    prng.uniform(d_stockPrice)
    prng.uniform(d_optionStrike)
    prng.uniform(d_optionYears)

    stream = cuda.stream()
    # Rescale the uniform samples to the ranges passed below.
    cfg_distribute = c_distribute[griddim, blockdim, stream]
    cfg_distribute(d_stockPrice, 5.0, 30.0)
    cfg_distribute(d_optionStrike, 1.0, 100.0)
    cfg_distribute(d_optionYears, 0.25, 10.)
    stream.synchronize()

    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)
    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)

    time1 = time.time()

    # Preconfigure the kernel as it's called multiple times in a loop.
    cfg_black_scholes_cuda = black_scholes_cuda[griddim, blockdim, stream]
    for i in range(iterations):
        cfg_black_scholes_cuda(d_callResult, d_putResult, d_stockPrice,
                               d_optionStrike, d_optionYears, RISKFREE,
                               VOLATILITY)
        d_callResult.to_host(stream)
        d_putResult.to_host(stream)
    stream.synchronize()

    time2 = time.time()
    # Charge the setup once per iteration. The multiplier used to be a
    # literal 10, which skewed the average whenever iterations != 10.
    dt = (time1 - time0) * iterations + (time2 - time1)
    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))
def main (*args):
    """Compare NumPy, Numba and CUDA Black-Scholes implementations.

    Parameters
    ----------
    *args :
        When at least two positional arguments are given, the first one is
        parsed as the iteration count (default 10).
        NOTE(review): the ``>= 2`` threshold is kept from the original; a
        single argument is ignored - confirm this is intended.

    Prints the average time per iteration of each implementation, then the
    L1 norm and maximum absolute error of the Numba and CUDA call prices
    relative to the NumPy result.
    """
    OPT_N = 4000000
    iterations = 10
    if len(args) >= 2:
        iterations = int(args[0])

    callResultNumpy = np.zeros(OPT_N)
    putResultNumpy = -np.ones(OPT_N)
    stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
    optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
    optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)
    callResultNumba = np.zeros(OPT_N)
    putResultNumba = -np.ones(OPT_N)
    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)

    # --- NumPy ---
    time0 = time.time()
    for i in range(iterations):
        black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                      optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numpy Time: %f msec" % ((1000 * (time1 - time0)) / iterations))

    # --- Numba ---
    time0 = time.time()
    for i in range(iterations):
        black_scholes_numba(callResultNumba, putResultNumba, stockPrice,
                            optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numba Time: %f msec" % ((1000 * (time1 - time0)) / iterations))

    # --- CUDA ---
    time0 = time.time()
    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N)/blockdim[0])), 1
    stream = cuda.stream()
    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)
    d_stockPrice = cuda.to_device(stockPrice, stream)
    d_optionStrike = cuda.to_device(optionStrike, stream)
    d_optionYears = cuda.to_device(optionYears, stream)
    time1 = time.time()
    for i in range(iterations):
        black_scholes_cuda[griddim, blockdim, stream](
            d_callResult, d_putResult, d_stockPrice, d_optionStrike,
            d_optionYears, RISKFREE, VOLATILITY)
        d_callResult.to_host(stream)
        d_putResult.to_host(stream)
    stream.synchronize()
    time2 = time.time()
    # Charge the host-to-device setup once per iteration. The multiplier
    # used to be a literal 10, which skewed the average whenever
    # iterations != 10.
    dt = (time1 - time0) * iterations + (time2 - time1)
    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))

    # --- Accuracy relative to the NumPy result ---
    delta = np.abs(callResultNumpy - callResultNumba)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm: %E" % L1norm)
    print("Max absolute error: %E" % delta.max())

    delta = np.abs(callResultNumpy - callResultNumbapro)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm (Numbapro): %E" % L1norm)
    print("Max absolute error (Numbapro): %E" % delta.max())
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Randomised sparse-PCA search on the GPU using a full segmented sort.

    Random samples are processed in batches of at most ``maxSize``. For each
    sample the kernels compute a vector of length ``Vd.shape[0]``, sort it,
    keep the last ``k`` rows and take their norm; the sample with the
    largest norm seen so far determines the returned vector.

    Parameters
    ----------
    Vd : 2-D array
        Uploaded to the device once and passed to ``batch_matmul`` together
        with the ``(d, numSamples)`` sample matrix.
    epsilon : float
        Controls the number of samples, ``ceil((4 / epsilon) ** d)``.
    d : int
        Number of rows of the random sample matrix.
    k : int
        Number of entries kept per sample.

    Returns
    -------
    numpy.ndarray
        ``(Vd.shape[0], 1)`` array that is zero except at the ``k`` selected
        indices of the best sample, which hold its values divided by its norm.
    """
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon) ** d))
    print(initNumSamples)
    maxSize = 6400

    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples
    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Per-batch device storage.
        dA = cuda.device_array(shape=(p, numSamples), order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(p, numSamples), dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples), order='F',
                               dtype=Vd.dtype)

        # Generate all random samples up front and normalise on the device.
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)
        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)

        # Device-side product of Vd with the samples (the original comment
        # "Replaces: a = ..." is truncated).
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel scales easily because the number of samples is
        # used as the number of blocks.
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominating the time
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * p
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz),
                      divup(dI.shape[1], blksz)),
                     (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # Keep only the last k rows of the sorted values and indices.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)
        custr.synchronize()

        # Find the best sample of this batch first and copy its column back
        # once, instead of copying on every intermediate improvement.
        best = -1
        for i in xrange(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                best = i
        if best >= 0:
            opt_x.fill(0)
            # Only copy what we need
            Ik = dI[:, best].copy_to_host()
            aIk = dA[:, best].copy_to_host().reshape(k, 1)
            opt_x[Ik] = (aIk / opt_v)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
def spca(Vd, epsilon=0.1, d=3, k=10):
    """Randomised sparse-PCA search on the GPU using batched k-selection.

    Random samples are processed in batches of at most ``maxSize``. For each
    sample the kernels compute a vector of length ``Vd.shape[0]``, select
    ``k`` indices from it and compute the norm of the selected entries; the
    sample with the largest norm seen so far determines the returned vector.

    Parameters
    ----------
    Vd : 2-D array
        Uploaded to the device once and passed to ``batch_matmul`` together
        with the ``(d, numSamples)`` sample matrix.
    epsilon : float
        Controls the number of samples, ``int((4 / epsilon) ** d)``.
    d : int
        Number of rows of the random sample matrix.
    k : int
        Number of indices selected per sample.

    Returns
    -------
    numpy.ndarray
        ``(Vd.shape[0], 1)`` array that is zero except at the selected
        indices of the best sample, which hold its values divided by its norm.
    """
    p = Vd.shape[0]
    initNumSamples = int((4. / epsilon) ** d)
    maxSize = 32000

    opt_x = np.zeros((p, 1))
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples
    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Per-batch device storage.
        dA = cuda.device_array(shape=(Vd.shape[0], numSamples), order='F')
        dI = cuda.device_array(shape=(k, numSamples), dtype=np.int16,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=np.float64)
        dC = cuda.device_array(shape=(d, numSamples), order='F')

        # Generate all random samples up front and normalise on the device.
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)
        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)

        # Device-side product of Vd with the samples (the original comment
        # "Replaces: a = ..." is truncated).
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel scales easily because the number of samples is
        # used as the number of blocks.
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominating the time
        batch_k_selection[numSamples, Vd.shape[0], custr](dA, dI, k)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        batch_scatter_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, dI,
                                                                     daInorm)

        aInorm = daInorm.copy_to_host(stream=custr)
        custr.synchronize()

        # Find the best sample of this batch first and fetch its column
        # once, instead of fetching on every intermediate improvement.
        best = -1
        for i in xrange(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                best = i
        if best >= 0:
            opt_x.fill(0)
            # Only copy what we need
            a = gpu_slice(dA, best).reshape(p, 1)
            Ik = gpu_slice(dI, best).reshape(k, 1)
            aIk = a[Ik]
            opt_x[Ik] = (aIk / opt_v)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
def gaussian(method, params, n_rep=DEFAULT_TRIALS, compare=None, dtype=np.float32): """ method, compare can be : "original", "parallel_base", "cuda" dtype float32 is faster for GPU but has lower precision dtype float64 is faster for CPU """ # expand params a_facGo_mean, a_facGo_sd, b_facGo_mean, b_facGo_sd, c_facGo_mean, c_facGo_sd, \ inhib_mean, inhib_sd = params # create data structures t = np.linspace(-.4, .2, 600, endpoint=False).astype(dtype) #tau_facGo = 2 # Currently set, but will need to optomize # generates n_rep random numbers from a normal distribution of mean, sd that given into function a_facGo = np.random.normal(a_facGo_mean, a_facGo_sd, size=n_rep).astype(dtype) b_facGo = np.random.normal(b_facGo_mean, b_facGo_sd, size=n_rep).astype(dtype) c_facGo = np.random.normal(c_facGo_mean, c_facGo_sd, size=n_rep).astype(dtype) inhib_tonic = np.zeros((n_rep, t.size)) inhib = np.random.normal(inhib_mean, inhib_sd, size=n_rep) inhib_tonic += inhib[:,np.newaxis] # sets up empty array of zeros for all simulated trials fac1 = np.zeros((n_rep, t.size)).astype(dtype) facs = [fac1] if compare: fac2 = np.zeros((n_rep, t.size)).astype(dtype) facs = facs + [fac2] # Execute trials and compare performance and results if required tps = [0, 0] # trials per second for each method for fi, f in enumerate([method, compare]): if f: # check if method or comapre is not None fac = facs[fi] # get the right fac t_start = time() if (f == "original"): for i in range(n_rep): # for each simulated trial myparams_fac = a_facGo[i], b_facGo[i], c_facGo[i] # generates curve for that simulated trial fac[i] = _gaussian_original(t, myparams_fac) elif (f == "parallel_base"): _gaussian_parallel_base(fac, n_rep, t, len(t), a_facGo, b_facGo, c_facGo) elif (f == "cuda"): # Setup CUDA variables tpb_x = 8 # threads per block in x dimension tpb_y = 8 # threads per block in y dimension block_dim = tpb_x, tpb_y bpg_x = int(n_rep / tpb_x) + 1 # block grid x dimension bpg_y = int(t.size / tpb_y) + 1 # block 
grid y dimension grid_dim = bpg_x, bpg_y stream = with stream.auto_synchronize(): d_fac = cuda.to_device(fac, stream) d_t = cuda.to_device(t, stream) d_a_facGo = cuda.to_device(a_facGo, stream) d_b_facGo = cuda.to_device(b_facGo, stream) d_c_facGo = cuda.to_device(c_facGo, stream) #print "CUDA kernel: Block dim: ({tx}, {ty}), Grid dim: ({gx}, {gy})".format(tx=tpb_x, ty=tpb_y, gx=bpg_x, gy=bpg_y) if dtype == np.float32: _gaussian_cuda32[grid_dim, block_dim](d_fac, n_rep, d_t, len(t), d_a_facGo, d_b_facGo, d_c_facGo) elif dtype == np.float64: _gaussian_cuda64[grid_dim, block_dim](d_fac, n_rep, d_t, len(t), d_a_facGo, d_b_facGo, d_c_facGo) else: print "Error: CUDA dtype must be np.float32 or np.float64" sys.exit(1) d_fac.to_host(stream) t_diff = time() - t_start tps[fi] = n_rep / t_diff # Check results close enough if compare: close = np.allclose(facs[0], facs[1], rtol=0, atol=1e-05) if not close: print "ERROR: results from method '%s' are not the same as method '%s'" % (method, compare) #print (facs[1] - facs[0]) sys.exit(1); # Summary print "%s trials per second: %.0f" % (method, tps[0]) if compare: print "%s trials per second: %.0f" % (compare, tps[1]) print "Speed up: %.3f x" %(tps[0]/tps[1]) # method / compare print "Results close enough? ", close return fac1, inhib_tonic, t
def main():
    """Convolve an image with a 5x5 Laplacian on CPU and GPU and plot both.

    The image is read from the path in ``sys.argv[1]``; without an argument
    the ``misc.lena()`` test image is used. The CPU path uses
    ``fftconvolve``; the GPU path multiplies the forward FFTs of the image
    and the zero-padded filter and transforms back. Timings for both are
    printed.
    """
    # Build Filter
    laplacian_pts = """
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    """.split()

    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    # Build Image
    try:
        filename = sys.argv[1]
        image = ndimage.imread(filename, flatten=True).astype(np.float32)
    except IndexError:
        image = misc.lena().astype(np.float32)

    print("Image size: %s" % (image.shape,))

    # Filter zero-padded to the image size.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode="same")
    te = timer()
    print("CPU: %.2fs" % (te - ts))

    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print("kernel config: %s x %s" % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for small datasets and should not be
    # included in the measured time.
    cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    stream1 = cuda.stream()
    stream2 = cuda.stream()

    fftplan1 = cufft.FFTPlan(shape=image.shape, itype=np.complex64,
                             otype=np.complex64, stream=stream1)
    fftplan2 = cufft.FFTPlan(shape=image.shape, itype=np.complex64,
                             otype=np.complex64, stream=stream2)

    # pagelock memory
    with cuda.pinned(image_complex, response_complex):
        # We can overlap the transfer of response_complex with the forward
        # FFT on image_complex.
        d_image_complex = cuda.to_device(image_complex, stream=stream1)
        d_response_complex = cuda.to_device(response_complex, stream=stream2)

        fftplan1.forward(d_image_complex, out=d_image_complex)
        fftplan2.forward(d_response_complex, out=d_response_complex)

        # The multiply on stream1 reads the stream2 result.
        stream2.synchronize()

        mult_inplace[blockpergrid, threadperblock, stream1](d_image_complex,
                                                            d_response_complex)
        fftplan1.inverse(d_image_complex, out=d_image_complex)

        # implicitly synchronizes the streams
        # cuFFT's inverse transform is unnormalised, so divide by the
        # number of elements.
        cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)

    te = timer()
    print("GPU: %.2fs" % (te - ts))

    # Plot the results
    # NOTE(review): no plt.show() is visible in this function - confirm the
    # caller displays or saves the figure.
    plt.subplot(1, 2, 1)
    plt.title("CPU")
    plt.imshow(cvimage_cpu, cmap=plt.cm.gray)
    plt.axis("off")

    plt.subplot(1, 2, 2)
    plt.title("GPU")
    plt.imshow(cvimage_gpu, cmap=plt.cm.gray)
    plt.axis("off")
def reduce_by_key(input_data, chunk_id, literal, length): length = numpy.int64(len(input_data)) bin_length = max(len(bin(length-1)),len(bin(tpb-1))) thread_num = numpy.int64(math.pow(2,bin_length)) block_num = max(thread_num/tpb,1) flag = numpy.zeros(thread_num, dtype='int64') arg_useless = numpy.zeros(thread_num, dtype='int64') stream = d_flag = cuda.to_device(flag, stream) d_chunk_id = cuda.to_device(chunk_id, stream) d_literal = cuda.to_device(literal, stream) produce_flag[block_num,tpb](input_data, d_chunk_id, length, d_flag) d_flag.to_host(stream) stream.synchronize() start_pos = numpy.ones(length, dtype='int64') * (-1) radix_sort.Blelloch_scan_caller(d_flag, arg_useless, 0) d_start_pos = cuda.to_device(start_pos, stream) dd_flag = cuda.to_device(flag, stream) print 'flag' print flag[:length] #d_flag.to_host(stream) #print 'd_flag' #print flag[:length] get_startPos[(length-1)/tpb+1, tpb](dd_flag, d_flag, d_start_pos, length) d_start_pos.to_host(stream) stream.synchronize() start_pos = filter(lambda x: x>=0, start_pos) reduced_length = len(start_pos) start_pos = list(start_pos) start_pos.append(length) reduced_input_data = [] reduced_chunk_id = [] reduced_literal =[] for i in xrange(reduced_length): print start_pos[i], start_pos[i+1] data_to_reduce = literal[start_pos[i]:start_pos[i+1]] print data_to_reduce reduce_block_num = (len(data_to_reduce)-1)/tpb + 1 tmp_out = numpy.zeros(reduce_block_num, dtype='uint32') d_tmp_out = cuda.to_device(tmp_out, stream) or_reduction[reduce_block_num, tpb](numpy.array(data_to_reduce), d_tmp_out,len(data_to_reduce)) d_tmp_out.to_host(stream) stream.synchronize() result = 0x00000000 for j in xrange(reduce_block_num): result |= tmp_out[j] reduced_input_data.append(input_data[start_pos[i]]) reduced_chunk_id.append(chunk_id[start_pos[i]]) reduced_literal.append(result) print '************!!!!!!!!!!!!!!!****************' return numpy.array(reduced_input_data), numpy.array(reduced_chunk_id), reduced_literal
def get_pic_path(path): #print 'open source file in bitmap_pickle: '.strip() start = time.time() attr_dict,attr_values,attr_value_NO,attr_list, data_pic_path = data_pickle.openfile(path) end = time.time() #print str(end-start) #print 'index part(get bitmap, keylength and offset): '.strip() start = time.time() attr_num = len(attr_list) lists = [[]for i in xrange(attr_num)] key = [[]for i in xrange(attr_num)] offset = [[]for i in xrange(attr_num)] # attr_num = 1 total_row = len(attr_values[0]) for idx in range(attr_num): input_data = numpy.array(attr_values[idx]) length = input_data.shape[0] rid = numpy.arange(0,length) #step1 sort #print 'time in step1--sort:' start = time.time() radix_sort.radix_sort(input_data,rid) end = time.time() #print str(end-start) cardinality = len(attr_value_NO[idx].items()) literal = numpy.zeros(length, dtype = 'uint32') chunk_id = numpy.zeros(length, dtype = 'int64') #print 'time in step2--produce chId_lit:' start = time.time() stream = #d_rid = cuda.to_device(rid, stream) d_chunk_id = cuda.to_device(chunk_id, stream) d_literal = cuda.to_device(literal, stream) #step2 produce chunk_id and literal produce_chId_lit_gpu[length/tpb+1, tpb](rid, d_literal, d_chunk_id, length) #d_rid.to_host(stream) d_chunk_id.to_host(stream) d_literal.to_host(stream) stream.synchronize() end = time.time() #print str(end-start) #step3 reduce by key(value, chunk_id) #print 'time in step3--reduce by key:' start = time.time() reduced_input_data, reduced_chunk_id, reduced_literal = reduce_by_key(input_data, chunk_id, literal, length) reduced_length = reduced_input_data.shape[0]#row end = time.time() #print str(end-start) #print '##############################reduced############################' #for i in xrange(reduced_length): # print reduced_input_data[i], reduced_chunk_id[i], bin(reduced_literal[i]) #step4 produce 0-Fill word #print 'time in step4--produce 0-fill word:' start = time.time() fill_word, head = produce_fill(reduced_input_data, reduced_chunk_id, 
reduced_length) end = time.time() #print str(end-start) #step 5 & 6: get index by interleaving 0-Fill word and literal(also remove all-zeros word) #print 'time in step5--get out_index & length & offset:' start = time.time() out_index, offsets, key_length = getIdx(fill_word,reduced_literal, reduced_length, head, cardinality) end = time.time() #print str(end-start) lists[idx] = out_index key[idx] = key_length offset[idx] = offsets end = time.time() #print str(end-start) ''' print '*****************index:' print lists print '*****************length:' print key print '*****************offset:' print offset ''' print 'put index result into file: '.strip() start = time.time() bitmap_pic_path = 'bitmap_pic.pkl' f1 = open(bitmap_pic_path, 'wb') pickle.dump(lists, f1, True) pickle.dump(key, f1, True) pickle.dump(offset, f1, True) f1.close() end = time.time() print str(end-start) return data_pic_path, bitmap_pic_path, attr_num
def _update(d):
    """Advance the simulation state ``d`` by one step and return it.

    Reads ``d['step']``, ``d['field']``, ``d['temp']``, ``d['k_x']``,
    ``d['k_y']``, ``d['kappa']``, ``d['spacing']``, ``d['timestep']``,
    ``d['density']``, ``d['soundspeed']``, ``d['abs_exp']`` and
    ``d['source']``. Rebinds ``d['field']['v_x']``, ``d['field']['v_y']``,
    ``d['temp']['p_x']``, ``d['temp']['p_y']`` and ``d['field']['p']``, and
    writes the FFT / gradient buffers under ``d['temp']``.

    The x-direction work is queued on one stream and the y-direction work on
    a second one; the final sum runs on a third.

    NOTE(review): the four ``print`` calls on k_x / spacing and the print of
    the pressure exponent look like leftover debugging output; they are kept
    because removing them would change what the program emits.
    """
    stream1 = cuda.stream()
    stream2 = cuda.stream()
    stream3 = cuda.stream()

    step = d['step']

    # --- Pressure gradient (two steps) ---
    # Calculate FFT of pressure.
    fft(d['field']['p'], d['temp']['fft_p'], stream=stream1)
    stream1.synchronize()

    print(d['k_x'].shape)
    print(d['spacing'].shape)
    print(d['k_x'].dtype)
    print(d['spacing'].dtype)
    # Compute the x exponent once and print that same value; the original
    # evaluated it a second time only for the print.
    # NOTE(review): assumes pressure_gradient_exponent has no side effects.
    ex = pressure_gradient_exponent(d['k_x'], d['spacing'])
    print(ex)
    ey = pressure_gradient_exponent(d['k_y'], d['spacing'])
    pressure_exponent_x = exp(ex, stream=stream1) # This is a constant!!
    pressure_exponent_y = exp(ey, stream=stream2) # This is a constant!!
    stream1.synchronize()
    stream2.synchronize()

    ifft(to_gradient(d['temp']['fft_p'], d['k_x'], d['kappa'], pressure_exponent_x, stream=stream1), d['temp']['d_p_d_x'], stream=stream1)
    ifft(to_gradient(d['temp']['fft_p'], d['k_y'], d['kappa'], pressure_exponent_y, stream=stream2), d['temp']['d_p_d_y'], stream=stream2)

    # --- Velocity ---
    d['field']['v_x'] = velocity_with_pml(d['field']['v_x'], d['temp']['d_p_d_x'], d['timestep'], d['density'], d['abs_exp']['x'], d['source']['v']['x'][step], stream=stream1)
    d['field']['v_y'] = velocity_with_pml(d['field']['v_y'], d['temp']['d_p_d_y'], d['timestep'], d['density'], d['abs_exp']['y'], d['source']['v']['y'][step], stream=stream2)
    stream1.synchronize()
    stream2.synchronize()

    # --- Fourier transform of the velocity ---
    fft(d['field']['v_x'], d['temp']['fft_v_x'], stream=stream1)
    fft(d['field']['v_y'], d['temp']['fft_v_y'], stream=stream2)
    stream1.synchronize()
    stream2.synchronize()

    # --- Velocity gradient ---
    velocity_exponent_x = exp(velocity_gradient_exponent(d['k_x'], d['spacing'], stream=stream1), stream=stream1) # This is a constant!!
    velocity_exponent_y = exp(velocity_gradient_exponent(d['k_y'], d['spacing'], stream=stream2), stream=stream2) # This is a constant!!

    ifft(to_gradient(d['temp']['fft_v_x'], d['k_x'], d['kappa'], velocity_exponent_x, stream=stream1), d['temp']['d_v_d_x'], stream=stream1)
    ifft(to_gradient(d['temp']['fft_v_y'], d['k_y'], d['kappa'], velocity_exponent_y, stream=stream2), d['temp']['d_v_d_y'], stream=stream2)
    stream1.synchronize()
    stream2.synchronize()

    # --- And finally the pressure ---
    d['temp']['p_x'] = pressure_with_pml(d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step], stream=stream1)
    d['temp']['p_y'] = pressure_with_pml(d['temp']['p_y'], d['temp']['d_v_d_y'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['y'], d['source']['p'][step], stream=stream2)
    # Both components must be finished before they are summed on stream3.
    stream1.synchronize()
    stream2.synchronize()

    d['field']['p'] = add(d['temp']['p_x'], d['temp']['p_y'], stream=stream3)
    stream3.synchronize()

    return d
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Search random directions on the GPU and return the best sparse vector.

    Random coefficient vectors are drawn in batches of at most 6400 samples,
    projected through ``Vd`` on the device, sorted per sample, and the sample
    whose ``k`` retained entries have the largest norm (as computed by the
    ``batch_norm`` kernel) is kept.

    Args:
        Vd: 2-D host array; its first dimension fixes the length of the
            returned vector.  # assumes the second dimension equals ``d``
            -- TODO confirm against batch_matmul.
        epsilon: Accuracy parameter; the number of samples drawn is
            ``ceil((4 / epsilon) ** d)``.
        d: Number of rows of the random coefficient matrix.
        k: Number of entries kept per sample.

    Returns:
        A ``(Vd.shape[0], 1)`` array of ``float_dtype`` that is zero except
        at the ``k`` selected indices of the best sample.
    """
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon)**d))
    print(initNumSamples)
    maxSize = 6400  # upper bound on samples processed per batch

    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to the GPU once; it is reused by every batch.
    dVd = cuda.to_device(Vd)
    remaining = initNumSamples

    # All kernels and transfers of this function are queued on one stream.
    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Per-batch device storage.
        dA = cuda.device_array(shape=(Vd.shape[0], numSamples), order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               dtype=np.uint32, order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples), order='F',
                               dtype=Vd.dtype)

        # Generate all random samples up front and normalise on the device.
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)
        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)

        # Device replacement for the host-side product of Vd with each
        # sample; one block per sample.
        # XXX: Vd.shape[0] must be within compute capability requirement.
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Device replacement for np.argsort(a, axis=0): fill dI with
        # indices, then sort each column (segment) of dA together with dI.
        nn = Vd.shape[0]
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * nn
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz),
                      divup(dI.shape[1], blksz)),
                     (blksz, blksz),
                     custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # Keep only the last k rows of the sorted values and indices.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)
        custr.synchronize()

        for i in xrange(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)
                # Only copy the column we actually need.
                Ik = dI[:, i].copy_to_host()
                aIk = dA[:, i].copy_to_host().reshape(k, 1)
                opt_x[Ik] = (aIk / val)

        # Release this batch's device allocations before the next batch.
        del dA, dI, daInorm, dC

    return opt_x
def reduce_by_key(input_data, chunk_id, literal, length):
    """Collapse runs of entries into one entry per run on the GPU.

    The function flags positions with ``produce_flag``, scans the flags,
    derives the start position of every run with ``get_startPos`` and then
    lets ``get_reduced`` build one output element per run.

    Args:
        input_data: Host array of values.  # presumably already sorted by
            the caller -- TODO confirm.
        chunk_id: Host array of chunk ids, uploaded to the device.
        literal: Host array of literal words, uploaded to the device.
        length: Ignored; the length is recomputed from ``input_data``.

    Returns:
        Tuple ``(reduced_input_data, reduced_chunk_id, reduced_literal)``
        with dtypes int32, int64 and uint32 respectively, each holding one
        element per detected start position.
    """
    length = numpy.int64(len(input_data))

    # NOTE(review): len(bin(x)) also counts the '0b' prefix, so thread_num
    # is larger than the next power of two. Left unchanged because the
    # scan/flag kernels' size expectations are not visible here.
    bin_length = max(len(bin(length - 1)), len(bin(tpb - 1)))
    thread_num = numpy.int64(math.pow(2, bin_length))
    block_num = max(thread_num // tpb, 1)

    flag = numpy.zeros(thread_num, dtype='int64')
    arg_useless = numpy.zeros(thread_num, dtype='int64')

    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)

    produce_flag[block_num, tpb](input_data, d_chunk_id, length, d_flag)
    d_flag.to_host(stream)
    stream.synchronize()

    # -1 marks "not a start position"; get_startPos overwrites the rest.
    start_pos = numpy.ones(length, dtype='int64') * (-1)
    radix_sort.Blelloch_scan_caller(d_flag, arg_useless, 0)
    d_start_pos = cuda.to_device(start_pos, stream)
    # The host copy of flag still holds the unscanned flags; the kernel
    # receives both the unscanned (dd_flag) and scanned (d_flag) versions.
    dd_flag = cuda.to_device(flag, stream)
    get_startPos[(length - 1) // tpb + 1, tpb](dd_flag, d_flag, d_start_pos,
                                               length)
    d_start_pos.to_host(stream)
    stream.synchronize()

    start_pos = [pos for pos in start_pos if pos >= 0]
    reduced_length = len(start_pos)
    # Sentinel so that run i spans start_pos[i]:start_pos[i + 1].
    start_pos.append(length)

    reduced_input_data = numpy.zeros(reduced_length, dtype='int32')
    reduced_chunk_id = numpy.zeros(reduced_length, dtype='int64')
    reduced_literal = numpy.zeros(reduced_length, dtype='uint32')

    dd_start_pos = cuda.to_device(numpy.array(start_pos), stream)
    d_reduced_chunk_id = cuda.to_device(reduced_chunk_id, stream)
    d_reduced_literal = cuda.to_device(reduced_literal, stream)
    d_reduced_input_data = cuda.to_device(reduced_input_data, stream)

    block_num = (reduced_length - 1) // tpb + 1
    get_reduced[block_num, tpb](d_literal, dd_start_pos, reduced_length,
                                d_reduced_literal, input_data, d_chunk_id,
                                d_reduced_input_data, d_reduced_chunk_id)

    d_reduced_literal.to_host(stream)
    d_reduced_chunk_id.to_host(stream)
    d_reduced_input_data.to_host(stream)
    stream.synchronize()

    return (numpy.array(reduced_input_data),
            numpy.array(reduced_chunk_id),
            reduced_literal)
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Fill ``paths`` in place with simulated steps, using two CUDA streams.

    The rows of ``paths`` are split into ``num_streams`` contiguous
    partitions. For every column ``j >= 1`` each partition draws normal
    random numbers, runs the ``cu_step`` kernel on the previous column and
    copies the result back into ``paths[:, j]`` on its own stream.

    Args:
        paths: 2-D host array; column 0 holds the initial values and the
            remaining columns are overwritten.
        dt: Time step, passed to the kernel and used in ``c1``.
        interest: Interest rate used in the drift constant ``c0``.
        volatility: Volatility used in ``c0`` and ``c1``.

    Returns:
        None. Results are written into ``paths``.
    """
    n = paths.shape[0]
    num_streams = 2

    # Split the rows into num_streams contiguous (begin, end) ranges.
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    # Pool of preallocated device buffers handed out by mm.get().
    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]
    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist,
                              zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            # NOTE(review): the source text here was a dangling
            # ", stream=strm)". It is restored as handing the previous
            # column's buffer back to the pool on the same stream --
            # confirm the name and signature against the MM class.
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
if price < 0: price = 0 # lower bound return x, price #upload memory to gpu bpg = 50 tpb = 32 nView = 5 steps = 2000 initialPrice = 100 stream = #initialize memory stream # instantiate a cuRAND PRNG prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream) paths = 1 pricePath = [] for j in range(paths): print "Generating path: %s" % j # plotting lists LogReturns, nLogReturns = [0], [0] # log returns, normalized log returns xchange, xcorrelation = [], [] # change in price P, and autocorrelation activeTraders = [] # number of active traders prices = [initialPrice]
print input_data f1 = open('input_data.txt', 'w') f1.write(str(list(input_data))) f2 = open("rid.txt", 'w') f2.write(str(list(rid))) f1.close() f2.close() #cardinality = input_data[-1]+1 cardinality = len(attr_dict['worker_class']) print 'rid:\n',rid literal = numpy.zeros(length, dtype = 'uint32') chunk_id = numpy.zeros(length, dtype = 'int64') stream = d_rid = cuda.to_device(rid, stream) d_chunk_id = cuda.to_device(chunk_id, stream) d_literal = cuda.to_device(literal, stream) #step2 produce chunk_id and literal produce_chId_lit_gpu[length/tpb+1, tpb](d_rid, d_literal, d_chunk_id) d_rid.to_host(stream) d_chunk_id.to_host(stream) d_literal.to_host(stream) stream.synchronize() print chunk_id for i in literal: print i #step3 reduce by key(value, chunk_id) reduced_input_data, reduced_chunk_id, reduced_literal = reduce_by_key(input_data, chunk_id, literal, length) reduced_length = reduced_input_data.shape[0]#row
def main(): cu_discriminant = vectorize([f4(f4, f4, f4), f8(f8, f8, f8)], target='gpu')(poly.discriminant) N = 1e+8 // 2 print 'Data size', N A, B, C = poly.generate_input(N, dtype=np.float32) D = np.empty(A.shape, dtype=A.dtype) stream = print '== One' ts = time() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) dD = cuda.to_device(D, stream, copy=False) cu_discriminant(dA, dB, dC, out=dD, stream=stream) dD.to_host(stream) te = time() total_time = (te - ts) print 'Execution time %.4f' % total_time print 'Throughput %.2f' % (N / total_time) print '== Chunked' chunksize = 1e+7 chunkcount = N // chunksize print 'Chunk size', chunksize sA = np.split(A, chunkcount) sB = np.split(B, chunkcount) sC = np.split(C, chunkcount) sD = np.split(D, chunkcount) device_ptrs = [] ts = time() with stream.auto_synchronize(): for a, b, c, d in zip(sA, sB, sC, sD): dA = cuda.to_device(a, stream) dB = cuda.to_device(b, stream) dC = cuda.to_device(c, stream) dD = cuda.to_device(d, stream, copy=False) cu_discriminant(dA, dB, dC, out=dD, stream=stream) dD.to_host(stream) device_ptrs.extend([dA, dB, dC, dD]) te = time() total_time = (te - ts) print 'Execution time %.4f' % total_time print 'Throughput %.2f' % (N / total_time) if '-verify' in sys.argv[1:]: poly.check_answer(D, A, B, C)
# attr_num = 1 total_row = len(attr_values[0]) for idx in range(attr_num): input_data = numpy.array(attr_values[idx]) length = input_data.shape[0] rid = numpy.arange(0, length, dtype='int64') #step1 sort radix_sort.radix_sort(input_data, rid) print rid print rid.dtype cardinality = len(attr_value_NO[idx].items()) literal = numpy.zeros(length, dtype='uint32') chunk_id = numpy.zeros(length, dtype='int64') stream = #d_rid = cuda.to_device(rid, stream) d_chunk_id = cuda.to_device(chunk_id, stream) d_literal = cuda.to_device(literal, stream) #step2 produce chunk_id and literal produce_chId_lit_gpu[length / tpb + 1, tpb](rid, d_literal, d_chunk_id, length) #d_rid.to_host(stream) d_chunk_id.to_host(stream) d_literal.to_host(stream) stream.synchronize() print '!!!!!!!!!!!!!!!!!!!!!!!!!!chunk_id:!!!!!!!!!!!!!!!!!!!' print chunk_id #step3 reduce by key(value, chunk_id) reduced_input_data, reduced_chunk_id, reduced_literal = reduce_by_key( input_data, chunk_id, literal, length)
def radix_sort(arr, rid): length = numpy.int64(len(arr)) bin_length = max(len(bin(length-1)),len(bin(TPB_MAX-1)))#the bit number of binary form of array length thread_num = numpy.int64(math.pow(2,bin_length)) block_num = max(thread_num/TPB_MAX,1) print 'length: %d'%length print 'bin_length: %d'%bin_length print 'thread_num: %d'%thread_num print 'block_num: %d'%block_num stream = one_list = numpy.zeros(shape=(thread_num), dtype='int64') zero_list = numpy.zeros(shape=(thread_num), dtype='int64') iter_num = len(bin(ATTR_CARD_MAX)) print 'iter_num: %d'%iter_num for i in range(iter_num): print '***************************' print 'iteration_%d:'%i print arr d_arr = cuda.to_device(arr, stream) d_rid = cuda.to_device(rid, stream) d_zero_list = cuda.to_device(zero_list,stream) d_one_list = cuda.to_device(one_list,stream) get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)#get one_list and zero_list d_one_list.to_host(stream) d_zero_list.to_host(stream) stream.synchronize() print 'zero_list:' print zero_list print 'one_list' print one_list base_reduction_block_num = block_num base_reduction_block_size = TPB_MAX print 'base_reduction_block_num: %d'%base_reduction_block_num tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64') d_tmp_out = cuda.to_device(tmp_out, stream) sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out) d_tmp_out.to_host(stream) stream.synchronize() base = 0 #base for the scan of one_list for j in xrange(base_reduction_block_num): base += tmp_out[j] print 'base: %d'%base #then do scanning(one_list and zero_list at the same time) print 'begin scan' Blelloch_scan_caller(d_zero_list, d_one_list, base) print 'scan finished' print #adjust array elements' position print 'begin adjust' print 'zero_list:' print zero_list array_adjust[block_num,TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length) print arr print
def get_pic_path(path): #print 'open source file in bitmap_pickle: '.strip() start = time.time() attr_dict, attr_values, attr_value_NO, attr_list, data_pic_path = data_pickle.openfile( path) end = time.time() #print str(end-start) #print 'index part(get bitmap, keylength and offset): '.strip() start = time.time() attr_num = len(attr_list) lists = [[] for i in xrange(attr_num)] key = [[] for i in xrange(attr_num)] offset = [[] for i in xrange(attr_num)] # attr_num = 1 total_row = len(attr_values[0]) for idx in range(attr_num): input_data = numpy.array(attr_values[idx]) length = input_data.shape[0] rid = numpy.arange(0, length) #step1 sort #print 'time in step1--sort:' start = time.time() radix_sort.radix_sort(input_data, rid) end = time.time() #print str(end-start) cardinality = len(attr_value_NO[idx].items()) literal = numpy.zeros(length, dtype='uint32') chunk_id = numpy.zeros(length, dtype='int64') #print 'time in step2--produce chId_lit:' start = time.time() stream = #d_rid = cuda.to_device(rid, stream) d_chunk_id = cuda.to_device(chunk_id, stream) d_literal = cuda.to_device(literal, stream) #step2 produce chunk_id and literal produce_chId_lit_gpu[length / tpb + 1, tpb](rid, d_literal, d_chunk_id, length) #d_rid.to_host(stream) d_chunk_id.to_host(stream) d_literal.to_host(stream) stream.synchronize() end = time.time() #print str(end-start) #step3 reduce by key(value, chunk_id) #print 'time in step3--reduce by key:' start = time.time() reduced_input_data, reduced_chunk_id, reduced_literal = reduce_by_key( input_data, chunk_id, literal, length) reduced_length = reduced_input_data.shape[0] #row end = time.time() #print str(end-start) #print '##############################reduced############################' #for i in xrange(reduced_length): # print reduced_input_data[i], reduced_chunk_id[i], bin(reduced_literal[i]) #step4 produce 0-Fill word #print 'time in step4--produce 0-fill word:' start = time.time() fill_word, head = produce_fill(reduced_input_data, 
reduced_chunk_id, reduced_length) end = time.time() #print str(end-start) #step 5 & 6: get index by interleaving 0-Fill word and literal(also remove all-zeros word) #print 'time in step5--get out_index & length & offset:' start = time.time() out_index, offsets, key_length = getIdx(fill_word, reduced_literal, reduced_length, head, cardinality) end = time.time() #print str(end-start) lists[idx] = out_index key[idx] = key_length offset[idx] = offsets end = time.time() #print str(end-start) ''' print '*****************index:' print lists print '*****************length:' print key print '*****************offset:' print offset ''' print 'put index result into file: '.strip() start = time.time() bitmap_pic_path = 'bitmap_pic.pkl' f1 = open(bitmap_pic_path, 'wb') pickle.dump(lists, f1, True) pickle.dump(key, f1, True) pickle.dump(offset, f1, True) f1.close() end = time.time() print str(end - start) return data_pic_path, bitmap_pic_path, attr_num
def main(): cu_discriminant = vectorize( [f4(f4, f4, f4), f8(f8, f8, f8)], target='gpu')(poly.discriminant) N = 1e+8 // 2 print 'Data size', N A, B, C = poly.generate_input(N, dtype=np.float32) D = np.empty(A.shape, dtype=A.dtype) stream = print '== One' ts = time() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) dD = cuda.to_device(D, stream, copy=False) cu_discriminant(dA, dB, dC, out=dD, stream=stream) dD.to_host(stream) te = time() total_time = (te - ts) print 'Execution time %.4f' % total_time print 'Throughput %.2f' % (N / total_time) print '== Chunked' chunksize = 1e+7 chunkcount = N // chunksize print 'Chunk size', chunksize sA = np.split(A, chunkcount) sB = np.split(B, chunkcount) sC = np.split(C, chunkcount) sD = np.split(D, chunkcount) device_ptrs = [] ts = time() with stream.auto_synchronize(): for a, b, c, d in zip(sA, sB, sC, sD): dA = cuda.to_device(a, stream) dB = cuda.to_device(b, stream) dC = cuda.to_device(c, stream) dD = cuda.to_device(d, stream, copy=False) cu_discriminant(dA, dB, dC, out=dD, stream=stream) dD.to_host(stream) device_ptrs.extend([dA, dB, dC, dD]) te = time() total_time = (te - ts) print 'Execution time %.4f' % total_time print 'Throughput %.2f' % (N / total_time) if '-verify' in sys.argv[1:]: poly.check_answer(D, A, B, C)