def main():
    N = 2048 * 2048

    # Allocate host memory arrays
    a = np.empty(N)
    b = np.empty(N)
    c = np.empty(N)

    # Initialize host memory
    a.fill(2)
    b.fill(1)
    c.fill(0)

    # Allocate and copy GPU/device memory
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c)

    threads_per_block = 128
    number_of_blocks = 33000

    saxpy[number_of_blocks, threads_per_block](d_a, d_b, d_c)

    d_c.copy_to_host(c)

    # Print out the first and last 5 values of c for a quality check
    print str(c[0:5])
    print str(c[-5:])
def fit(self,X,Budget=None,W=None): self.X = cuda.to_device(X.astype(np.float64,order='F')) self.Budget = cuda.device_array((self.budgetSize,self.X.shape[1]),dtype=np.float64,order='F') self.kx = cuda.device_array((self.budgetSize,self.X.shape[0]),dtype=np.float64,order='F') self.Wkx = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F') self.H = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F') if Budget is None: permutation = np.random.permutation(self.X.shape[0]) self.permutation = cuda.to_device(permutation) initBudget(self.X,self.permutation,self.Budget) else: self.Budget = cuda.to_device(Budget.astype(np.float64,order='F')) self.calculateKB() self.calculateKX() if W is None: self.initW() else: self.W = cuda.to_device(W.astype(np.float64,order='F')) self.t = 0 for i in xrange(self.epochs): print "Epoch " + str(i) samples,features = self.X.shape permutation = getPermutation(samples,self.miniBatchSize) self.permutation = cuda.to_device(permutation) for j in xrange((samples + self.miniBatchSize) / self.miniBatchSize): loadBatch(self.kx,self.permutation,j,self.kxi) self.nextW() self.t += 1 self.predictH()
def getIdx(fill_word,reduced_literal, reduced_length, head, cardinality):#step 5: get index by interleaving fill_word and literal(also remove all-zeros word) bin_length = max(len(bin(2*reduced_length-1)),len(bin(tpb-1)))#the bit number of binary form of array length thread_num = numpy.int64(math.pow(2,bin_length))#Blelloch_scan need the length of scanned array to be even multiple of thread_per_block compact_flag = numpy.ones(thread_num, dtype='int64') index = numpy.ones(2*reduced_length, dtype='uint32') d_index = cuda.to_device(index) d_fill_word = cuda.to_device(fill_word) d_reduced_literal = cuda.to_device(numpy.array(reduced_literal)) d_compact_flag = cuda.to_device(compact_flag) block_num = reduced_length/tpb + 1 getIdx_gpu[block_num, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length) compact_flag = d_compact_flag.copy_to_host() useless_array = numpy.zeros(thread_num, dtype='int64') radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0) out_index_length = d_compact_flag.copy_to_host()[2*reduced_length-1] + 1 out_index = numpy.zeros(out_index_length, dtype='uint32') offsets = [] new_block_num = 2*reduced_length/tpb + 1 scatter_index[new_block_num, tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length) for i in xrange(reduced_length): if head[i]: offsets.append(d_compact_flag.copy_to_host()[2*i]) key_length = numpy.zeros(cardinality, dtype='int64') for i in xrange(cardinality-1): key_length[i] = offsets[i+1] - offsets[i] key_length[cardinality-1] = out_index_length - offsets[cardinality-1] return out_index, numpy.array(offsets), numpy.array(key_length)
def gpumulti(X, mu):
    device = cuda.get_current_device()

    n = len(X)
    X = np.array(X)
    x1 = np.array(X.T[0])
    x2 = np.array(X.T[1])

    bmk = np.arange(len(x1))
    mu = np.array(mu)

    dx1 = cuda.to_device(x1)
    dx2 = cuda.to_device(x2)
    dmu = cuda.to_device(mu)
    dbmk = cuda.to_device(bmk)

    # Set up enough threads for kernel
    tpb = device.WARP_SIZE
    bpg = int(np.ceil(float(n) / tpb))

    cu_worker[bpg, tpb](dx1, dx2, dmu, dbmk)

    bestmukey = dbmk.copy_to_host()

    return bestmukey
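# --- Hedged sketch (not part of the original snippet): gpumulti() above launches a
# cu_worker kernel that is not shown in this collection. Assuming it assigns each
# point (x1[i], x2[i]) to its nearest centre in mu (a k-means style step) and writes
# the winning index into bmk, a minimal version could look like the following.
# With the original NumbaPro builds the import would be `from numbapro import cuda`.
from numba import cuda

@cuda.jit
def cu_worker(x1, x2, mu, bmk):
    i = cuda.grid(1)
    if i < x1.size:                      # guard: the grid may overshoot n
        best_key = 0
        best_dist = 1e38
        for k in range(mu.shape[0]):     # assumes mu is a (k, 2) array of centres
            dx = x1[i] - mu[k, 0]
            dy = x2[i] - mu[k, 1]
            d = dx * dx + dy * dy
            if d < best_dist:
                best_dist = d
                best_key = k
        bmk[i] = best_key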
def radix_sort(arr, rid): length = numpy.int64(len(arr)) bin_length = max(len(bin(length-1)),len(bin(TPB_MAX-1)))#the bit number of binary form of array length thread_num = numpy.int64(math.pow(2,bin_length)) block_num = max(thread_num/TPB_MAX,1) stream = cuda.stream() one_list = numpy.zeros(shape=(thread_num), dtype='int64') zero_list = numpy.zeros(shape=(thread_num), dtype='int64') iter_num = len(bin(ATTR_CARD_MAX)) for i in range(iter_num): d_arr = cuda.to_device(arr, stream) d_rid = cuda.to_device(rid, stream) d_zero_list = cuda.to_device(zero_list,stream) d_one_list = cuda.to_device(one_list,stream) get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)#get one_list and zero_list d_one_list.to_host(stream) d_zero_list.to_host(stream) stream.synchronize() base_reduction_block_num = block_num base_reduction_block_size = TPB_MAX tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64') d_tmp_out = cuda.to_device(tmp_out, stream) sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out) d_tmp_out.to_host(stream) stream.synchronize() base = 0 #base for the scan of one_list for j in xrange(base_reduction_block_num): base += tmp_out[j] Blelloch_scan_caller(d_zero_list, d_one_list, base) array_adjust[block_num,TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
def run_GPU(grid, adjGrid, steps, delay, initDelay, printInd, indSteps): """ Runs the Command-Line interface for a specified number of steps, or forever if the number of steps is specified to be -1. Note that here, grid and adjGrid must be explicitly specified as opposed to passed in as a Game, to enable everything to be run on the GPU. Returns the final grid state. """ step = 0 dim = grid.shape # move arrays to GPU d_grid = cuda.to_device(grid) d_adjGrid = cuda.to_device(adjGrid) blockDim = (32,16) gridDim = (32,8) while step < steps or steps == -1: # print grid if printInd is not -1 and step % printInd is 0: # in order to print grid, first need memory back in CPU d_grid.to_host() printGrid(grid, step, dim) # print index if indSteps is not -1 and step % indSteps is 0: print("Step = " + str(step)) newGrid = np.zeros_like(grid) d_newGrid = cuda.to_device(newGrid) evolve2D_kernel[gridDim, blockDim](d_grid, d_adjGrid, d_newGrid) d_grid = d_newGrid grid = newGrid sleep(delay) if step == 0: # allow initial position to be more easily visible sleep(initDelay) step += 1 d_grid.to_host() return grid
def evaluation_function(factors, opts): start = timer() longest_wavelet, target_samples = opts['longest_wavelet'], opts['target_samples'] window_width = len(target_samples) full_width = window_width + longest_wavelet num_wavelengths = longest_wavelet-2 offsets_per_wavelet = full_width / num_wavelengths num_rows = offsets_per_wavelet * num_wavelengths result = np.zeros(window_width, dtype=np.float32) d_factors = cuda.to_device(factors) d_result = cuda.to_device(result) griddim = full_width, 1 blockdim = 4, 1, 1 compute_samples_configured = compute_sample_kernel.configure(griddim, blockdim) compute_samples_configured(d_factors, longest_wavelet, offsets_per_wavelet, d_result, num_rows) d_result.to_host() generated_samples_sum = sum(result) factors_sum = sum(factors) difference_from_target = math.fabs(sum(target_samples - result)) non_zero_factors = filter(lambda x: x != 0.0, result) fun_value = difference_from_target + 10 * math.fabs(len(non_zero_factors)) print("Value "+str(fun_value)+" generated in " + str((timer() - start)) + " seconds. Sample sum: " + str(generated_samples_sum)+". Factors sum: "+str(factors_sum)) return fun_value
def main(): N = 2048 * 2048 # Allocate host memory arrays a = np.empty(N) b = np.empty(N) c = np.empty(N) # Initialize host memory a.fill(2) b.fill(1) c.fill(0) # Allocate and copy GPU/device memory d_a = cuda.to_device(a) d_b = cuda.to_device(b) d_c = cuda.to_device(c) threads_per_block = 128 number_of_blocks = N / 128 + 1 saxpy [ number_of_blocks, threads_per_block ] ( d_a, d_b, d_c ) d_c.copy_to_host(c) # Print out the first and last 5 values of c for a quality check print c[:5] print c[-5:]
def getIdx(fill_word,reduced_literal, reduced_length):#step 5: get index by interleaving fill_word and literal(also remove all-zeros word) bin_length = max(len(bin(reduced_length-1)),len(bin(tpb-1)))#the bit number of binary form of array length thread_num = numpy.int64(math.pow(2,bin_length))#Blelloch_scan need the length of scanned array to be even multiple of thread_per_block compact_flag = numpy.ones(thread_num, dtype='int64') print thread_num index = numpy.ones(2*reduced_length, dtype='uint32') d_index = cuda.to_device(index) d_fill_word = cuda.to_device(fill_word) d_reduced_literal = cuda.to_device(numpy.array(reduced_literal)) d_compact_flag = cuda.to_device(compact_flag) #print fill_word getIdx_gpu[1, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length) compact_flag = d_compact_flag.copy_to_host() #print compact_flag[0:28] useless_array = numpy.zeros(thread_num, dtype='int64') radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0) out_index_length = d_compact_flag.copy_to_host()[2*reduced_length-1] + 1 print d_compact_flag.copy_to_host()[0:2*reduced_length] print out_index_length out_index = numpy.zeros(out_index_length, dtype='uint32') scatter_index[1,tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length) #for i in out_index: # print bin(i) return out_index
def train(self,ds,epochs,batch_size=10): for epoch in range(epochs): start = timer() count = 0. correct = 0. for i in range(len(ds)/batch_size): count += 1. x = encode(ds[i*batch_size][0],gpu=False) t = encode(ds[i*batch_size][1],gpu=False) for b in range(batch_size-1): x = np.concatenate((x,encode(ds[i*batch_size + b+1][0],gpu=False))) t = np.concatenate((t,encode(ds[i*batch_size + b+1][1],gpu=False))) x = cuda.to_device(x) t = cuda.to_device(t) assert x.shape[1] == self.layers[0] assert t.shape[1] == self.layers[2] print(x.shape) self.forward(x) print('output',decode(self.output)) if decode(self.output) == decode(t): correct += 1. self.backward(t) print("Epoch",epoch,"Time:",timer()-start,'output',decode(self.output), 'Accuracy:',correct/count) if correct/count > 0.99: break
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] mm = MM(shape=n, dtype=np.double, prealloc=5) blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) stream = cuda.stream() prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double, stream=stream) c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) # Configure the kernel # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>> step_cfg = step[gridsz, blksz, stream] d_last = cuda.to_device(paths[:, 0], to=mm.get()) for j in range(1, paths.shape[1]): prng.normal(d_normdist, mean=0, sigma=1) d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get()) step_cfg(d_last, d_paths, dt, c0, c1, d_normdist) d_paths.copy_to_host(paths[:, j], stream=stream) mm.free(d_last, stream=stream) d_last = d_paths stream.synchronize()
def reduce_by_key(input_data, chunk_id, literal, length):#step 3 flag = numpy.ones(length, dtype='int32') stream = cuda.stream() d_flag = cuda.to_device(flag, stream) d_chunk_id = cuda.to_device(chunk_id, stream) d_literal = cuda.to_device(literal, stream) produce_flag[1,tpb](input_data, d_chunk_id, length, d_flag) d_flag.to_host(stream) print 'flag:' print flag stream.synchronize() is_finish = numpy.zeros(length, dtype='int32') hop = 1 while hop<32:#only 32 because the length of a word in binary form is 32 reduce_by_key_gpu[1,tpb](d_literal, d_flag, is_finish, hop, length) hop *= 2 d_literal.to_host(stream) d_chunk_id.to_host(stream) stream.synchronize() reduced_input_data = [] reduced_chunk_id = [] reduced_literal =[] for i in xrange(length): if flag[i]: reduced_input_data.append(input_data[i]) reduced_chunk_id.append(chunk_id[i]) reduced_literal.append(literal[i]) return numpy.array(reduced_input_data), numpy.array(reduced_chunk_id), reduced_literal
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] mm = MM(shape=n, dtype=np.double, prealloc=5) blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) stream = cuda.stream() prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double, stream=stream) c0 = interest - 0.5 * volatility**2 c1 = volatility * math.sqrt(dt) d_last = cuda.to_device(paths[:, 0], to=mm.get()) for j in range(1, paths.shape[1]): prng.normal(d_normdist, mean=0, sigma=1) d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get()) step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream) d_paths.copy_to_host(paths[:, j], stream=stream) mm.free(d_last) d_last = d_paths stream.synchronize()
def tests(): a = np.random.rand(300,500) b = np.random.rand(500,300) start = timer() c = np.dot(a,b) nptime = timer()-start print('nptime',nptime) x = np.array(np.random.rand(600,1500),dtype='float32',order='F') y = np.array(np.random.rand(1500,300),dtype='float32',order='F') z = np.zeros((1000,1000),order='F',dtype='float32') stream = cuda.stream() dx = cuda.to_device(x) dy = cuda.to_device(y) dz = cuda.to_device(z) start = timer() blas.gemm('N','N',1000,1500,1000,1.0,dx,dy,0.0,dz) cutime = timer()-start print('cutime',cutime) #dz.copy_to_host(z) print(dz[0]) c = np.ones((1000,1000),order='F',dtype='float32') print(c.shape) dc = cuda.to_device(c) # blockDim = (256,256) #gridDim = (((1000 + blockDim[0]-1)/blockDim[0]),((1000 + blockDim[1]-1)/blockDim[1])) blockDim = (30,30) gridDim = ((((c.shape[0] + blockDim[0]) - 1) / blockDim[0]), (((c.shape[1] + blockDim[1]) - 1) / blockDim[1])) start = timer() mtanh[gridDim,blockDim,stream](dc) tantime = timer() - start print('tantime',tantime) dc.copy_to_host(c,stream=stream) stream.synchronize() print(c) y = cm.CUDAMatrix(np.ones((1000,1000))) start = timer() cm.tanh(y) cmtan = timer()-start print('cmtan',cmtan) x = cm.CUDAMatrix(np.random.rand(1000,1500)) y = cm.CUDAMatrix(np.random.rand(1500,1000)) start = timer() cm.dot(x,y) cmtime = timer()-start print('cmtime',cmtime)
def getIdx( fill_word, reduced_literal, reduced_length ): #step 5: get index by interleaving fill_word and literal(also remove all-zeros word) bin_length = max(len(bin(reduced_length - 1)), len( bin(tpb - 1))) #the bit number of binary form of array length thread_num = numpy.int64( math.pow(2, bin_length) ) #Blelloch_scan need the length of scanned array to be even multiple of thread_per_block compact_flag = numpy.ones(thread_num, dtype='int64') print thread_num index = numpy.ones(2 * reduced_length, dtype='uint32') d_index = cuda.to_device(index) d_fill_word = cuda.to_device(fill_word) d_reduced_literal = cuda.to_device(numpy.array(reduced_literal)) d_compact_flag = cuda.to_device(compact_flag) #print fill_word getIdx_gpu[1, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length) compact_flag = d_compact_flag.copy_to_host() #print compact_flag[0:28] useless_array = numpy.zeros(thread_num, dtype='int64') radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0) out_index_length = d_compact_flag.copy_to_host()[2 * reduced_length - 1] + 1 print d_compact_flag.copy_to_host()[0:2 * reduced_length] print out_index_length out_index = numpy.zeros(out_index_length, dtype='uint32') scatter_index[1, tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length) #for i in out_index: # print bin(i) return out_index
def reduce_by_key(input_data, chunk_id, literal, length): #step 3 flag = numpy.ones(length, dtype='int32') stream = cuda.stream() d_flag = cuda.to_device(flag, stream) d_chunk_id = cuda.to_device(chunk_id, stream) d_literal = cuda.to_device(literal, stream) produce_flag[1, tpb](input_data, d_chunk_id, length, d_flag) d_flag.to_host(stream) print 'flag:' print flag stream.synchronize() is_finish = numpy.zeros(length, dtype='int32') hop = 1 while hop < 32: #only 32 because the length of a word in binary form is 32 reduce_by_key_gpu[1, tpb](d_literal, d_flag, is_finish, hop, length) hop *= 2 d_literal.to_host(stream) d_chunk_id.to_host(stream) stream.synchronize() reduced_input_data = [] reduced_chunk_id = [] reduced_literal = [] for i in xrange(length): if flag[i]: reduced_input_data.append(input_data[i]) reduced_chunk_id.append(chunk_id[i]) reduced_literal.append(literal[i]) return numpy.array(reduced_input_data), numpy.array( reduced_chunk_id), reduced_literal
def test_scan(): in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32) out_h = np.zeros(NUM_ELEMENTS, dtype=np.uint32) for i in range(0, NUM_ELEMENTS): in_h[i] = NUM_ELEMENTS -i - 1#randint(0, 100) tac1 = time() in_d = cuda.to_device(in_h) out_d = cuda.to_device(out_h) cuda.synchronize() tac2 = time() tk1 = time() for i in range(0, 32): tk1 = time() preScan(out_d, in_d, NUM_ELEMENTS) cuda.synchronize() tk2 = time() print i, tk2 - tk1 tk2 = time() th1 = time() out_d.copy_to_host(out_h) cuda.synchronize() #print "Last = ", out_h[-1] + in_h[-1] th2 = time()
def main(): NN = 4096 NM = 4096 A = np.zeros((NN, NM), dtype=np.float64) Anew = np.zeros((NN, NM), dtype=np.float64) n = NN m = NM iter_max = 1000 tol = 1.0e-6 error = 1.0 for j in range(n): A[j, 0] = 1.0 Anew[j, 0] = 1.0 print "Jacobi relaxation Calculation: %d x %d mesh" % (n, m) timer = time.time() iter = 0 blockdim = (32, 32) griddim = (NN / blockdim[0], NM / blockdim[1]) error_grid = np.zeros_like(A) stream = cuda.stream() dA = cuda.to_device(A, stream) # to device and don't come back dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: assert error_grid.dtype == np.float64 jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid) derror_grid.to_host(stream) # error_grid is available on host stream.synchronize() error = np.abs(error_grid).max() # swap dA and dAnew tmp = dA dA = dAnew dAnew = tmp if iter % 100 == 0: print "%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time() - timer) iter += 1 runtime = time.time() - timer print " total: %f s" % runtime
def main(): NN = 4096 NM = 4096 A = np.zeros((NN, NM), dtype=np.float64) Anew = np.zeros((NN, NM), dtype=np.float64) n = NN m = NM iter_max = 1000 tol = 1.0e-6 error = 1.0 for j in range(n): A[j, 0] = 1.0 Anew[j, 0] = 1.0 print "Jacobi relaxation Calculation: %d x %d mesh" % (n, m) timer = time.time() iter = 0 blockdim = (tpb, tpb) griddim = (NN/blockdim[0], NM/blockdim[1]) error_grid = np.zeros(griddim) stream = cuda.stream() dA = cuda.to_device(A, stream) # to device and don't come back dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: assert error_grid.dtype == np.float64 jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid) derror_grid.to_host(stream) # error_grid is available on host stream.synchronize() error = np.abs(error_grid).max() # swap dA and dAnew tmp = dA dA = dAnew dAnew = tmp if iter % 100 == 0: print "%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time()-timer) iter += 1 runtime = time.time() - timer print " total: %f s" % runtime
def test(n=256, k=30, batch=100):
    """
    Running test between an (n x n) * (n x batch) multiply
    vs an (n x k) * (k x batch) multiply [low-rank factors].
    """
    b = numbapro.cudalib.cublas.Blas()
    G = np.array(np.random.randn(n, n), dtype=np.float32, order='F')
    G2 = np.array(np.random.randn(n, k), dtype=np.float32, order='F')
    d_G = cuda.to_device(G)
    d_G2 = cuda.to_device(G2)
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] num_streams = 2 part_width = int(math.ceil(float(n) / num_streams)) partitions = [(0, part_width)] for i in range(1, num_streams): begin, end = partitions[i - 1] begin, end = end, min(end + (end - begin), n) partitions.append((begin, end)) partlens = [end - begin for begin, end in partitions] mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams) device = cuda.get_current_device() blksz = device.MAX_THREADS_PER_BLOCK gridszlist = [int(math.ceil(float(partlen) / blksz)) for partlen in partlens] strmlist = [cuda.stream() for _ in range(num_streams)] prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm) for strm in strmlist] # Allocate device side array d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm) for partlen, strm in zip(partlens, strmlist)] c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) # Configure the kernel # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>> steplist = [cu_step[gridsz, blksz, strm] for gridsz, strm in zip(gridszlist, strmlist)] d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist)] for j in xrange(1, paths.shape[1]): for prng, d_norm in zip(prnglist, d_normlist): prng.normal(d_norm, mean=0, sigma=1) d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist)] for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)): d_last, d_paths, d_norm = args step(d_last, d_paths, dt, c0, c1, d_norm) for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions): d_paths.copy_to_host(paths[s:e, j], stream=strm) mm.free(d_last, stream=strm) d_lastlist = d_pathslist for strm in strmlist: strm.synchronize()
def getIdx( fill_word, reduced_literal, reduced_length, head, cardinality ): #step 5: get index by interleaving fill_word and literal(also remove all-zeros word) bin_length = max(len(bin(2 * reduced_length - 1)), len( bin(tpb - 1))) #the bit number of binary form of array length thread_num = numpy.int64( math.pow(2, bin_length) ) #Blelloch_scan need the length of scanned array to be even multiple of thread_per_block compact_flag = numpy.ones(thread_num, dtype='int64') print thread_num print reduced_length index = numpy.ones(2 * reduced_length, dtype='uint32') d_index = cuda.to_device(index) d_fill_word = cuda.to_device(fill_word) d_reduced_literal = cuda.to_device(numpy.array(reduced_literal)) d_compact_flag = cuda.to_device(compact_flag) #print fill_word block_num = reduced_length / tpb + 1 getIdx_gpu[block_num, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length) compact_flag = d_compact_flag.copy_to_host() print 'compact:' print compact_flag[0:28] useless_array = numpy.zeros(thread_num, dtype='int64') radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0) out_index_length = d_compact_flag.copy_to_host()[2 * reduced_length - 1] + 1 print d_compact_flag.copy_to_host()[0:2 * reduced_length] print out_index_length out_index = numpy.zeros(out_index_length, dtype='uint32') offset = [] new_block_num = 2 * reduced_length / tpb + 1 scatter_index[new_block_num, tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length) #for i in out_index: print head[-100:-1] for i in xrange(reduced_length): if head[i]: offset.append(d_compact_flag.copy_to_host()[2 * i]) #print offset key_length = numpy.zeros(cardinality, dtype='int64') print '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!' print "cardinality:%d" % cardinality print "len(off_set)%d" % len(offset) for i in xrange(cardinality - 1): key_length[i] = offset[i + 1] - offset[i] key_length[cardinality - 1] = out_index_length - offset[cardinality - 1] print key_length return out_index, numpy.array(offset), numpy.array(key_length)
def get_indexList(path, attr_selected): path1, path2, attr_num = bitmap_pickle.get_pic_path(path) f1 = open(path1, 'rb') # read data_map.pkl try: attr_map = pickle.load(f1) attr_list = pickle.load(f1) attr_total = pickle.load(f1) finally: f1.close() f2 = open(path2, 'rb') # read bitmap_pic.pkl try: lists = pickle.load(f2) key = pickle.load(f2) offset = pickle.load(f2) finally: f2.close() # attr_input is a list that stores the numbers of input attributes # attr_num is the total number of attributes # attr_total is the total number of data/31 attr_input = [[] for i in xrange(attr_num)] for i in xrange(attr_num): for attri in attr_selected[i]: if attri in attr_map[i]: attr_input[i].append(attr_map[i][attri]) elif attri == 'All': attr_input[i].append(-1) if len(attr_input[i]) > 1 and (-1 in attr_input[i]): attr_input[i].remove(-1) print attr_input search_start_time = time.time() if len(attr_input ) != attr_num: # there might be a wrong input in input_test.py print 'No eligible projects' else: tpb = 1024 blocknum = 1 attr_mul = (attr_total + (tpb * blocknum - 1)) / (tpb * blocknum) # attr_mul is the number that each thread need to be performed #print '---index----\nattr_num:%d\nattr_total:%d\nattr_mul:%d\n----------' % (attr_num, attr_total, attr_mul) # attr_num = 1 index_list = numpy.zeros(attr_total * 31, dtype='int32') bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key, offset) stream = cuda.stream() d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream) d_index_list = cuda.to_device(numpy.array(index_list), stream) index_gpu[blocknum, tpb, stream](d_bitmap_list, d_index_list, attr_num, attr_total, attr_mul) index_list = d_index_list.copy_to_host() stream.synchronize() search_end_time = time.time() return index_list, search_end_time - search_start_time
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs, prog_bar=None): """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation of forward-euler integration. Function requires a working :mod:`numbapro` installation. It is typically slower compared to :func:`kern_MKL_sparse` but it depends on your hardware. Args: nsteps (int): number of integration steps dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2 rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}` int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation phi (numpy.array): initial state vector :math:`\\Phi(X_0)` prog_bar (object,optional): handle to :class:`ProgressBar` object Returns: numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration """ calc_precision = None if config['CUDA_precision'] == 32: calc_precision = np.float32 elif config['CUDA_precision'] == 64: calc_precision = np.float64 else: raise Exception("kern_CUDA_dense(): Unknown precision specified.") #======================================================================= # Setup GPU stuff and upload data to it #======================================================================= try: from numbapro.cudalib.cublas import Blas # @UnresolvedImport from numbapro import cuda, float32 # @UnresolvedImport except ImportError: raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " + "installed.\nCan not use GPU.") cubl = Blas() m, n = int_m.shape stream = cuda.stream() cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream) cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream) cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream) cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision) for step in xrange(nsteps): if prog_bar: prog_bar.update(step) cubl.gemv(trans='T', m=m, n=n, alpha=float32(1.0), A=cu_int_m, x=cu_curr_phi, beta=float32(0.0), y=cu_delta_phi) cubl.gemv(trans='T', m=m, n=n, alpha=float32(rho_inv[step]), A=cu_dec_m, x=cu_curr_phi, beta=float32(1.0), y=cu_delta_phi) cubl.axpy(alpha=float32(dX[step]), x=cu_delta_phi, y=cu_curr_phi) return cu_curr_phi.copy_to_host()
def cuda_factor(number, primes):
    device = cuda.get_current_device()
    ffactor = np.asarray([1])
    dfact = cuda.to_device(ffactor)
    d_primes = cuda.to_device(np.asarray(primes))
    tpb = 720
    bpg = 334
    cu_fact[bpg, tpb](d_primes, number, dfact)
    c = dfact.copy_to_host()
    return c
def get_indexList(path, attr_selected): path1, path2, attr_num = bitmap_pickle.get_pic_path(path) f1 = open(path1, 'rb') # read data_map.pkl try: attr_map = pickle.load(f1) attr_list = pickle.load(f1) attr_total = pickle.load(f1) finally: f1.close() f2 = open(path2, 'rb') # read bitmap_pic.pkl try: lists = pickle.load(f2) key = pickle.load(f2) offset = pickle.load(f2) finally: f2.close() # attr_input is a list that stores the numbers of input attributes # attr_num is the total number of attributes # attr_total is the total number of data/31 attr_input = [[] for i in xrange(attr_num)] for i in xrange(attr_num): for attri in attr_selected[i]: if attri in attr_map[i]: attr_input[i].append(attr_map[i][attri]) elif attri == 'All': attr_input[i].append(-1) if len(attr_input[i])>1 and (-1 in attr_input[i]): attr_input[i].remove(-1) print attr_input search_start_time = time.time() if len(attr_input) != attr_num: # there might be a wrong input in input_test.py print 'No eligible projects' else: tpb = 1024 blocknum = 1 attr_mul = (attr_total + (tpb * blocknum - 1))/(tpb * blocknum) # attr_mul is the number that each thread need to be performed #print '---index----\nattr_num:%d\nattr_total:%d\nattr_mul:%d\n----------' % (attr_num, attr_total, attr_mul) # attr_num = 1 index_list = numpy.zeros(attr_total*31, dtype='int32') bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key, offset) stream = cuda.stream() d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream) d_index_list = cuda.to_device(numpy.array(index_list), stream) index_gpu[blocknum, tpb, stream](d_bitmap_list, d_index_list, attr_num, attr_total, attr_mul) index_list = d_index_list.copy_to_host() stream.synchronize() search_end_time = time.time() return index_list, search_end_time-search_start_time
def flush(self, metric_opt, supp_opt): if not self.Vcs: # Nothing to do return metric_opt, supp_opt k = self.k V = self.V topk_list = [] nodect = V.shape[0] numseg = len(self.Vcs) assert nodect assert numseg eachsize = nodect * numseg D = np.zeros(eachsize, dtype=np.float32) # Fill buffer for segmented sort for i, Vc in enumerate(self.Vcs): D[i * nodect:(i + 1) * nodect] = Vc[:, 0] # Prepare for GPU segmented sort dD = cuda.to_device(D) dI = cuda.device_array((numseg, nodect), dtype=np.uint32) blksz = 32 init_indices[(divup(dI.shape[0], blksz), divup(dI.shape[1], blksz)), (blksz, blksz)](dI) if numseg == 1: segments = np.arange(1, dtype=np.int32) else: segments = (np.arange(numseg - 1, dtype=np.int32) + 1) * nodect segmented_sort(dD, dI, cuda.to_device(segments)) for i in range(numseg): topk = dI[i, -k:].copy_to_host() topk_list.append(topk) # Reduce for topk in topk_list: # Assume A is huge metric = np.linalg.norm(V[topk, :]) ** 2 if metric > metric_opt: metric_opt = metric supp_opt = topk # Clear all Vc self.Vcs.clear() return metric_opt, supp_opt
def main(): vort = np.array(np.random.rand(2 * n), dtype=dtype).reshape((n, 2)) gamma = np.array(np.random.rand(n), dtype=dtype) vel = np.zeros_like(vort) start = timer() induced_velocity(vort, vort, gamma, vel) numpy_time = timer() - start print("n = %d" % n) print("Numpy".center(40, "=")) print("Time: %f seconds" % numpy_time) vel2 = np.zeros_like(vort) start = timer() induced_velocity2(vort, vort, gamma, vel2) numba_time = timer() - start print("Numba".center(40, "=")) print("Time: %f seconds" % numba_time) error = np.max(np.max(np.abs(vel2 - vel))) print("Difference: %f" % error) print("Speedup: %f" % (numpy_time / numba_time)) stream = cuda.stream() d_vort = cuda.to_device(vort, stream) d_gamma = cuda.to_device(gamma, stream) vel3 = np.zeros_like(vort) d_vel = cuda.to_device(vel3, stream) # blockdim = (32,32) # griddim = (n // blockdim[0], n // blockdim[1]) griddim = (n - 1) // blksize + 1 start = timer() induced_velocity3[griddim, blksize, stream](d_vort, d_vort, d_gamma, d_vel) d_vel.to_host(stream) gpu_time = timer() - start error = np.max(np.max(np.abs(vel3 - vel))) print("GPU".center(40, "=")) print("Time: %f seconds" % gpu_time) print("Difference: %f" % error) print("Speedup: %f" % (numpy_time / gpu_time)) # print(vel3) vel4 = np.zeros_like(vort) d_vel2 = cuda.to_device(vel4, stream) start = timer() induced_velocity4[griddim, blksize, stream](d_vort, d_vort, d_gamma, d_vel2) d_vel2.to_host(stream) gpu2_time = timer() - start error = np.max(np.max(np.abs(vel4 - vel))) print("GPU smem".center(40, "=")) print("Time: %f seconds" % gpu2_time) print("Difference: %f" % error) print("Speedup: %f" % (numpy_time / gpu2_time))
def make_fp_tree(): #### Allocate host memory offsets, transactions, num_transactions, all_items_in_transactions = readFile( "data.txt") print num_transactions, all_items_in_transactions flist = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.uint32) #### Allocate and initialize GPU/Device memory d_offsets = cuda.to_device(offsets) d_transactions = cuda.to_device(transactions) d_flist = cuda.to_device(flist) threads_per_block = (BLOCK_SIZE, 1) number_of_blocks = (int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1) t1 = time() makeFlistGPU[number_of_blocks, threads_per_block](d_offsets, d_transactions, d_flist, num_transactions, all_items_in_transactions) cuda.synchronize() t2 = time() d_flist.copy_to_host(flist) cuda.synchronize() # # for i in range(0, MAX_UNIQUE_ITEMS): # print i, flist[i] t3 = time() flist_cpu = makeFlist(transactions, all_items_in_transactions) t4 = time() # match = 1 for i in range(1, MAX_UNIQUE_ITEMS): if i not in flist_cpu and flist[i] == 0: continue #print i, flist[i], flist_cpu[i] if flist[i] != flist_cpu[i]: match = -1 break if match == 1: print "Test Passed" else: print "Test Failed" print "Number of transactions = ", num_transactions print "All items in transactions = ", all_items_in_transactions print "GPU time = ", t2 - t1 print "CPU TIME = ", t4 - t3
def encode(word, gpu=True):
    if isinstance(word, basestring):
        if using_embeddings == True:
            return cuda.to_device(vocab[word])
        else:
            x = np.zeros((1, word_idx), dtype='float32')
            x[0][vocab[word]] = 1.
            if gpu == True:
                return cuda.to_device(x)
            else:
                return x
    else:
        return word
def main():
    flowtime = 0.1
    nx = 128
    ny = 128
    dx = 2.0 / (nx - 1)
    dy = 2.0 / (ny - 1)
    dt = dx / 50  ##ensures stability for a given mesh fineness
    rho = 1.0
    nu = .1
    nt = int(flowtime / dt)  ##calculate number of timesteps required to reach a specified total flowtime

    U = numpy.zeros((nx, ny), dtype=numpy.float32)
    U[-1, :] = 1
    V = numpy.zeros((nx, ny), dtype=numpy.float32)
    P = numpy.zeros((ny, nx), dtype=numpy.float32)
    UN = numpy.zeros((nx, ny), dtype=numpy.float32)
    VN = numpy.zeros((nx, ny), dtype=numpy.float32)

    # A 768 x 768 block exceeds the per-block thread limit (1024 threads),
    # so cover the nx x ny mesh with 16 x 16 blocks instead.
    blockdim = 16, 16
    griddim = (int(math.ceil(float(nx) / blockdim[0])),
               int(math.ceil(float(ny) / blockdim[1])))

    t1 = time.time()
    ###Target the GPU to begin calculation
    stream = cuda.stream()
    d_U = cuda.to_device(U, stream)
    d_V = cuda.to_device(V, stream)
    d_UN = cuda.to_device(UN, stream)
    d_VN = cuda.to_device(VN, stream)
    for i in range(nt):
        P = ppe(rho, dt, dx, dy, U, V, P)
        CudaU[griddim, blockdim, stream](d_U, d_V, P, d_UN, d_VN, dx, dy, dt, rho, nu)
        d_U.to_host(stream)
        d_V.to_host(stream)
        stream.synchronize()
    t2 = time.time()

    print "Completed grid of %d by %d in %.6f seconds" % (nx, ny, t2 - t1)

    x = numpy.linspace(0, 2, nx)
    y = numpy.linspace(0, 2, ny)
    Y, X = numpy.meshgrid(y, x)
def produce_fill(reduced_input_data, reduced_chunk_id, reduced_length):  #step 4
    head = numpy.ones(reduced_length, dtype='int32')
    stream = cuda.stream()
    d_head = cuda.to_device(head, stream)
    d_reduced_input_data = cuda.to_device(reduced_input_data, stream)
    produce_head[1, tpb](d_reduced_input_data, d_head, reduced_length)  #produce head
    d_head.to_host(stream)
    stream.synchronize()
    d_reduced_chunk_id = cuda.to_device(reduced_chunk_id, stream)
    produce_fill_gpu[1, tpb](d_head, d_reduced_chunk_id, reduced_chunk_id, reduced_length)
    d_reduced_chunk_id.to_host(stream)
    stream.synchronize()
    #convert to int32 because the range a fill_word can describe is 0~(2^31-1)
    return numpy.array(reduced_chunk_id, dtype='int32')
def convolve(): # Build Filter laplacian_pts = ''' -4 -1 0 -1 -4 -1 2 3 2 -1 0 3 4 3 0 -1 2 3 2 -1 -4 -1 0 -1 -4 '''.split() laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5) image = get_image() print "Image size: %s" % (image.shape,) response = np.zeros_like(image) response[:5, :5] = laplacian # CPU # Use SciPy to perform the FFT convolution ts = timer() cvimage_cpu = fftconvolve(image, laplacian, mode='same') te = timer() print 'CPU: %.2fs' % (te - ts) # GPU threadperblock = 32, 8 blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock) print('kernel config: %s x %s' % (blockpergrid, threadperblock)) # Initialize the cuFFT system. cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64) # Start GPU timer ts = timer() image_complex = image.astype(np.complex64) response_complex = response.astype(np.complex64) d_image_complex = cuda.to_device(image_complex) d_response_complex = cuda.to_device(response_complex) task1(cufft, d_image_complex, d_response_complex) cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape) te = timer() print 'GPU: %.2fs' % (te - ts) return cvimage_cpu, cvimage_gpu
def convolve(): # Build Filter laplacian_pts = ''' -4 -1 0 -1 -4 -1 2 3 2 -1 0 3 4 3 0 -1 2 3 2 -1 -4 -1 0 -1 -4 '''.split() laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5) image = get_image() print "Image size: %s" % (image.shape, ) response = np.zeros_like(image) response[:5, :5] = laplacian # CPU # Use SciPy to perform the FFT convolution ts = timer() cvimage_cpu = fftconvolve(image, laplacian, mode='same') te = timer() print 'CPU: %.2fs' % (te - ts) # GPU threadperblock = 32, 8 blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock) print('kernel config: %s x %s' % (blockpergrid, threadperblock)) # Initialize the cuFFT system. cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64) # Start GPU timer ts = timer() image_complex = image.astype(np.complex64) response_complex = response.astype(np.complex64) d_image_complex = cuda.to_device(image_complex) d_response_complex = cuda.to_device(response_complex) task1(cufft, d_image_complex, d_response_complex) cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape) te = timer() print 'GPU: %.2fs' % (te - ts) return cvimage_cpu, cvimage_gpu
def main(): flowtime = 0.1 nx = 128 ny = 128 dx = 2.0/(nx-1) dy = 2.0/(ny-1) dt = dx/50 ##ensures stability for a given mesh fineness rho = 1.0 nu =.1 nt = int(flowtime/dt) ##calculate number of timesteps required to reach a specified total flowtime U = numpy.zeros((nx,ny), dtype=numpy.float32) U[-1,:] = 1 V = numpy.zeros((nx,ny), dtype=numpy.float32) P = numpy.zeros((ny, nx), dtype=numpy.float32) UN = numpy.zeros((nx,ny), dtype=numpy.float32) VN = numpy.zeros((nx,ny), dtype=numpy.float32) griddim = nx, ny blockdim = 768, 768, 1 #if nx > 767: # griddim = int(math.ceil(float(nx)/blockdim[0])), int(math.ceil(float(ny)/blockdim[0])) t1 = time.time() ###Target the GPU to begin calculation stream = cuda.stream() d_U = cuda.to_device(U, stream) d_V = cuda.to_device(V, stream) d_UN = cuda.to_device(UN, stream) d_VN = cuda.to_device(VN, stream) for i in range(nt): P = ppe(rho, dt, dx, dy, U, V, P) CudaU[griddim, blockdim, stream](d_U, d_V, P, d_UN, d_VN, dx, dy, dt, rho, nu) d_U.to_host(stream) d_V.to_host(stream) stream.synchronize() t2 = time.time() print "Completed grid of %d by %d in %.6f seconds" % (nx, ny, t2-t1) x = numpy.linspace(0,2,nx) y = numpy.linspace(0,2,ny) Y,X = numpy.meshgrid(y,x)
def symbolise(self, last_result=None): self.show_visualisation() bounds = np.empty(shape=(self.height * self.depth, 2), dtype=np.float64) for k in range(self.height): bounds[k * 3] = [0.001, 1] # wavelength as a fraction of the window width bounds[k * 3 + 1] = [-1, 2] # offset where window is 0,1 bounds[k * 3 + 2] = [-1, 1] # amplitude. Can be negative self._working_results = np.zeros(shape=(self.width, self.height), dtype=np.float64) # cuda init self._factors = np.zeros(shape=(self.height * self.depth,), dtype=np.float64) # cuda.profile_start() self._d_working_results = cuda.to_device(self._working_results) grid_dim = self.width / 128, self.height - 1 block_dim = 128, 1, 1 self.compute_samples_configured = compute_sample_gpu.configure(grid_dim, block_dim) with cuda.pinned(self._factors, self._working_results): output = scipy.optimize.differential_evolution(self.evaluation_function_cuda, bounds=bounds, strategy='best2bin', maxiter=20, recombination=0.9, mutation=(0.0001, 0.3), tol=0.01, init='latinhypercube', popsize=15, disp=True, callback=self.step_callback) # cuda.profile_stop() self.evaluation_function_cuda(output.x) # raw_input("Press Enter to continue...") print(output)
def _check_array(a):
    """Checks whether array is valid for moving to gpu and moves data to gpu.

    Parameters
    ----------
    a : array-like
        Array to move to gpu
    """
    ok_dtypes = [np.int, np.float32, np.float64]
    if isinstance(a, np.ndarray):
        a = cuda.to_device(np.array(a, dtype=np.float32, order='F'))
    elif isinstance(a, cuda.cudadrv.devicearray.DeviceNDArray):
        pass
    else:
        a = np.array(a)
        if a.dtype not in ok_dtypes:
            raise ValueError('input of type ' + str(a.dtype) + ' is not supported')
        else:
            a = np.array(a, dtype=np.float32, order='F')
    if a.dtype == np.float32:
        out_dtype = a.dtype
    else:
        raise NotImplementedError
    return (a, out_dtype)
def test_relu_m_v(self): """relu on matrices with newaxis vectors for thresholds.""" a = self.rng.rand(129, 1025).astype(np.float32) t = self.rng.rand(1025).astype(np.float32) out_np = copy.deepcopy(a) out_np[out_np<t[np.newaxis,:]] = 0. out_gp = self.gp.relu(a, thresh=t[np.newaxis,:]).copy_to_host() assert(np.allclose(out_np, out_gp)) a = self.rng.rand(129, 1025).astype(np.float32) t = self.rng.rand(129).astype(np.float32) val = .5 out_np = copy.deepcopy(a) out_np[out_np<t[:,np.newaxis]] = val out_gp = self.gp.relu(a, thresh=t[:,np.newaxis], set_val=val).copy_to_host() assert(np.allclose(out_np, out_gp)) a = self.rng.rand(129, 1025).astype(np.float32) t = self.rng.rand(129).astype(np.float32) val = .5 out_np = copy.deepcopy(a) out_np[out_np<t[:,np.newaxis]] = val out_gp = cuda.to_device(a) self.gp.relu(a, thresh=t[:,np.newaxis], set_val=val, out=out_gp) assert(np.allclose(out_np, out_gp.copy_to_host()))
def scal(self, a, alpha): """Scale a 1D or 2D array by alpha. Parameters ---------- a : array-like Array to scale. alpha : float Scaling factor. """ a, out_dtype = _check_array(a) a_dim = a.shape if a.ndim == 2: a_strides = a.strides a_dtype = a.dtype d_flat_a = _cu_reshape(a, (np.prod(a_dim),), (a_strides[0],), a_dtype) self.blas.scal(alpha, d_flat_a) a = _cu_reshape(d_flat_a, a_dim, a_strides, a_dtype) elif a.ndim == 1: if type(a) == np.ndarray: a = cuda.to_device(a) self.blas.scal(alpha, a) else: raise NotImplementedError return a
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(n, dtype=np.uint32, stream=stream)

    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    griddim = gridsz, 1
    blockdim = blksz, 1, 1
    cu_monte_carlo_pricer[griddim, blockdim, stream](d_paths, dt, c0, c1,
                                                     d_normdist, d_seed)

    d_paths.to_host(stream)
    stream.synchronize()
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] blksz = 512 gridsz = int(math.ceil(float(n) / blksz)) stream = cuda.stream() prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream) qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream) d_normdist = cuda.device_array(n, dtype=np.double, stream=stream) d_seed = cuda.device_array(n, dtype=np.uint32, stream=stream) prng.normal(d_normdist, 0, 1) qrng.generate(d_seed) d_paths = cuda.to_device(paths, stream=stream) c0 = interest - 0.5 * volatility**2 c1 = volatility * math.sqrt(dt) griddim = gridsz, 1 blockdim = blksz, 1, 1 cu_monte_carlo_pricer[griddim, blockdim, stream](d_paths, dt, c0, c1, d_normdist, d_seed) d_paths.to_host(stream) stream.synchronize()
def make_fp_tree(): #### Allocate host memory offsets, transactions, num_transactions, all_items_in_transactions = readFile("data.txt") print num_transactions, all_items_in_transactions flist = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.uint32) #### Allocate and initialize GPU/Device memory d_offsets = cuda.to_device(offsets) d_transactions = cuda.to_device(transactions) d_flist = cuda.to_device(flist) threads_per_block = (BLOCK_SIZE, 1) number_of_blocks = (int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1) t1 = time() makeFlistGPU [number_of_blocks, threads_per_block] (d_offsets, d_transactions, d_flist, num_transactions, all_items_in_transactions) cuda.synchronize() t2 = time() d_flist.copy_to_host(flist) cuda.synchronize() # # for i in range(0, MAX_UNIQUE_ITEMS): # print i, flist[i] t3 = time() flist_cpu = makeFlist(transactions, all_items_in_transactions) t4 = time() # match = 1 for i in range(1, MAX_UNIQUE_ITEMS): if i not in flist_cpu and flist[i] == 0: continue #print i, flist[i], flist_cpu[i] if flist[i] != flist_cpu[i]: match = -1 break if match == 1: print "Test Passed" else: print "Test Failed" print "Number of transactions = ", num_transactions print "All items in transactions = ", all_items_in_transactions print "GPU time = ", t2 - t1 print "CPU TIME = ", t4 - t3
def zeros(n, gpu=True):
    w = None
    if gpu is True:
        w = np.zeros((n[0], n[1]), dtype='float32')
        w = cuda.to_device(w)
    else:
        w = np.zeros((n[0], n[1]))
    return w
def produce_fill(reduced_input_data, reduced_chunk_id, reduced_length): #step 4 head = numpy.ones(reduced_length, dtype='int32') stream = cuda.stream() d_head = cuda.to_device(head, stream) d_reduced_input_data = cuda.to_device(reduced_input_data, stream) produce_head[1, tpb](d_reduced_input_data, d_head, reduced_length) #produce head d_head.to_host(stream) stream.synchronize() d_reduced_chunk_id = cuda.to_device(reduced_chunk_id, stream) produce_fill_gpu[1, tpb](d_head, d_reduced_chunk_id, reduced_chunk_id, reduced_length) d_reduced_chunk_id.to_host(stream) stream.synchronize() #convert to int32 because the range a fill_word can describe is 0~(2^31-1) return numpy.array(reduced_chunk_id, dtype='int32')
def main(): # Allocate host memory a = np.empty([N,N], dtype=np.float32) b = np.empty_like(a) c_cpu = np.empty_like(a) c_gpu = np.empty_like(a) # Initialize host memory for row in range(N): for col in range(N): a[row,col] = row b[row,col] = col+2 c_cpu[row,col] = 0 c_gpu[row,col] = 0 # Allocate and initialize GPU/device memory d_a = cuda.to_device(a) d_b = cuda.to_device(b) d_c = cuda.to_device(c_gpu) # since we're overwriting c on the GPU in # the matrixMul kernel, no need to copy data over threads_per_block = (16, 16) # A 16 x 16 block threads number_of_blocks = ((N / threads_per_block[0]) + 1, (N / threads_per_block[1]) + 1) matrixMulGPU [ number_of_blocks, threads_per_block ] ( d_a, d_b, d_c ) d_c.copy_to_host(c_gpu) # Call the CPU version to check our work matrixMulCPU( a, b, c_cpu ) # Compare the two answers to make sure they are equal error = False for row in range(N): if error: break for col in range(N): if error: break if c_cpu[row,col] != c_gpu[row,col]: print "FOUND ERROR at c[" + str(row) + "," + str(col) + "]" error = True if not error: print "Success!"
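# --- Hedged sketch (not part of the original snippet): the matrixMulGPU kernel
# launched above is not included in this collection. Assuming it computes the dense
# product c = a * b with one thread per output element and a bounds guard for the
# extra partial block, a minimal version could be:
from numba import cuda

@cuda.jit
def matrixMulGPU(a, b, c):
    row, col = cuda.grid(2)
    if row < c.shape[0] and col < c.shape[1]:   # guard: grid is padded by one block
        acc = 0.0
        for k in range(a.shape[1]):
            acc += a[row, k] * b[k, col]
        c[row, col] = acc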
def infer(dictionary, coeffs, stimuli, eta, lamb, nIter, softThresh, adapt): #Get Blas routines bs = cublas.Blas() #Initialize arrays numDict = dictionary.shape[0] numStim = stimuli.shape[0] dataLength = stimuli.shape[1] d_u = cuda.to_device( np.zeros((numStim, numDict), dtype=np.float32, order='F')) d_s = cuda.to_device( np.zeros((numStim, numDict), dtype=np.float32, order='F')) d_b = cuda.to_device( np.zeros((numStim, numDict), dtype=np.float32, order='F')) d_ci = cuda.to_device( np.zeros((numStim, numDict), dtype=np.float32, order='F')) d_c = cuda.to_device( np.zeros((numDict, numDict), dtype=np.float32, order='F')) #Move inputs to GPU d_dictionary = cuda.to_device( np.array(dictionary, dtype=np.float32, order='F')) d_coeffs = cuda.to_device(np.array(coeffs, dtype=np.float32, order='F')) d_stimuli = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F')) blockdim2 = (32, 32) blockdim1 = 32 #griddimc = (int(numDict/blockdim[0]),int(numDict/blockdim[1])) griddimcsub = int(numDict / blockdim1) griddimb = (int(numStim / blockdim2[0]), int(numDict / blockdim2[1])) griddimi = (int(numStim / blockdim2[0]), int(numDict / blockdim2[1])) #Calculate c: overlap of basis functions with each other minus identity #cinit[griddimc,blockdim](d_dictionary,d_c) bs.gemm('N', 'T', numDict, numDict, dataLength, 1., d_dictionary, d_dictionary, 0., d_c) csub[griddimcsub, blockdim1](d_c) #binit[griddimb,blockdim2](d_dictionary,d_stimuli,d_b) bs.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stimuli, d_dictionary, 0., d_b) thresh = np.mean(np.absolute(d_b.copy_to_host()), axis=1) d_thresh = cuda.to_device(thresh) #Update u[i] and s[i] for nIter time steps for kk in xrange(nIter): #Calculate ci: amount other neurons are stimulated times overlap with rest of basis bs.gemm('N', 'N', numStim, numDict, numDict, 1., d_s, d_c, 0., d_ci) iter[griddimi, blockdim2](d_c, d_b, d_ci, d_u, d_s, eta, d_thresh, lamb, adapt, softThresh) u = d_u.copy_to_host() s = d_s.copy_to_host() return (s, u, thresh)
def cuda_factor(number, primes):
    device = cuda.get_current_device()
    ffactor = np.asarray([0] * len(primes))
    dfact = cuda.to_device(ffactor)
    d_primes = cuda.to_device(np.asarray(primes))
    tpb = 720
    bpg = 334
    start = timer()
    cu_fact[bpg, tpb](d_primes, number, dfact)
    total = timer() - start
    print "Time taken : ", total
    c = dfact.copy_to_host()
    k = []
    for d in c:
        if int(d) != 0:
            k.append(int(d))
    return k
def compute_block(self):
    device_uniforms = curand.uniform(size=N * N, device=True)
    host_results = zeros((self.size, self.size))
    stream = cuda.stream()
    device_proposals = cuda.to_device(self.host_proposals, stream=stream)
    device_omegas = cuda.to_device(self.host_omegas, stream=stream)
    device_results = cuda.device_array_like(host_results, stream=stream)
    cu_one_block[self.grid_dim, self.threads_per_block, stream](
        self.start, device_proposals, device_omegas, device_uniforms,
        device_results, self.size, self.size)
    device_results.copy_to_host(host_results, stream=stream)
    stream.synchronize()
    return host_results
def backward(dY, cache): Xe = cache['Xe'] generator_str = cache['generator_str'] dWs = np.zeros(cache['Ws_shape']) gen_caches = cache['gen_caches'] F = cache['F'] dXe = np.zeros(Xe.shape) Generator = decodeGenerator(generator_str) dmmy, gen_cache = gen_caches[0] g_WLSTM = cuda.to_device(np.asfortranarray(gen_cache['WLSTM'])) # backprop each item in the batch grads = {} dt1 = 0 dt2 = 0 t0 = time.time() for i in xrange(len(gen_caches)): t1 = time.time() ix, gen_cache = gen_caches[i] # unpack local_grads = Generator.backward(dY[i], gen_cache, g_WLSTM) dt1 += time.time() - t1 t2 = time.time() dXs = local_grads['dXs'] # intercept the gradients wrt Xi and Xs del local_grads['dXs'] dXi = local_grads['dXi'] del local_grads['dXi'] accumNpDicts( grads, local_grads) # add up the gradients wrt model parameters # now backprop from dXs to the image vector and word vectors dXe[i, :] += dXi # image vector for n, j in enumerate(ix): # and now all the other words dWs[j, :] += dXs[n, :] dt2 += time.time() - t2 #dt = time.time() - t0 #print 'BP :%0.4f' %(dt) dt = time.time() - t0 print 'Backward Pass:%0.4f Others :%0.4f' % (dt1, dt2) t0 = time.time() # finally backprop into the image encoder dWe = F.transpose().dot(dXe) dbe = np.sum(dXe, axis=0, keepdims=True) dt = time.time() - t0 print 'MMult :%0.4f' % (dt) t0 = time.time() accumNpDicts(grads, {'We': dWe, 'be': dbe, 'Ws': dWs}) dt = time.time() - t0 print 'accum 2:%0.4f' % (dt) t0 = time.time() return grads
def infer(learner, stimuli, coeffs=None): #Get Blas routines blas = cublas.Blas() #Initialize arrays numDict = learner.Q.shape[0] numStim = stimuli.shape[0] dataLength = stimuli.shape[1] u = np.zeros((numStim, numDict), dtype=np.float32, order='F') if coeffs is not None: u[:] = np.atleast_2d(coeffs) d_u = cuda.to_device(u) d_s = cuda.to_device( np.zeros((numStim, numDict), dtype=np.float32, order='F')) d_b = cuda.to_device( np.zeros((numStim, numDict), dtype=np.float32, order='F')) d_ci = cuda.to_device( np.zeros((numStim, numDict), dtype=np.float32, order='F')) d_c = cuda.to_device( np.zeros((numDict, numDict), dtype=np.float32, order='F')) #Move inputs to GPU d_dictionary = cuda.to_device( np.array(learner.Q, dtype=np.float32, order='F')) d_stimuli = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F')) blockdim2 = (32, 32) # TODO: experiment, was all 32s blockdim1 = 32 griddimcsub = int(ceil(numDict / blockdim1)) griddimi = (int(ceil(numStim / blockdim2[0])), int(ceil(numDict / blockdim2[1]))) #Calculate c: overlap of basis functions with each other minus identity blas.gemm('N', 'T', numDict, numDict, dataLength, 1., d_dictionary, d_dictionary, 0., d_c) LCALearner.csub[griddimcsub, blockdim1](d_c) blas.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stimuli, d_dictionary, 0., d_b) thresh = np.mean(np.absolute(d_b.copy_to_host()), axis=1) d_thresh = cuda.to_device(thresh) #Update u[i] and s[i] for niter time steps for kk in range(learner.niter): #Calculate ci: amount other neurons are stimulated times overlap with rest of basis blas.gemm('N', 'N', numStim, numDict, numDict, 1., d_s, d_c, 0., d_ci) LCALearner.iterate[griddimi, blockdim2](d_c, d_b, d_ci, d_u, d_s, learner.infrate, d_thresh, learner.min_thresh, learner.adapt, learner.softthresh) u = d_u.copy_to_host() s = d_s.copy_to_host() return s.T, u.T, thresh
def test_histogram(): #Allocate host memory input_h = np.empty(NUM_ELEMENTS, dtype=np.uint32) bins_h = np.zeros(BIN_SIZE, dtype=np.uint32) myprint("Bin Size = " + str(bins_h.size)) ## Initialize host memory for i in range(0, NUM_ELEMENTS): input_h[i] = randint(0, BIN_SIZE - 1) ## Allocate and initialize GPU/device memory input_d = cuda.to_device(input_h) bins_d = cuda.to_device(bins_h) threads_per_block = (BLOCK_SIZE, 1) number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1)#((NUM_ELEMENTS / threads_per_block[0]) + 1, 1) t1 = time() histogramGPU [number_of_blocks, threads_per_block] (input_d, bins_d, NUM_ELEMENTS) cuda.synchronize() t2 = time() bins_d.copy_to_host(bins_h) t3 = time() bins_cpu = makeHist(input_h) t4 = time() # for i in range(0, BIN_SIZE): # print i, bins_h[i], bins_cpu[i] print "GPU time = ", t2 - t1 print "CPU TIME = ", t4 - t3 match = 1 for i in range(0, BIN_SIZE): if bins_h[i] != bins_cpu[i]: match = -1 break if match == 1: print "Test Passed" else: print "Test Failed"
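# --- Hedged sketch (assumption, not the original kernel): test_histogram() above
# relies on a histogramGPU kernel that is not shown. A minimal version matching the
# call signature (input, bins, num_elements), valid because the launch covers every
# element with one thread, uses a global atomic add per input value:
from numba import cuda

@cuda.jit
def histogramGPU(input_d, bins_d, num_elements):
    i = cuda.grid(1)
    if i < num_elements:                      # guard for the padded last block
        cuda.atomic.add(bins_d, input_d[i], 1)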
def radix_sort(arr, rid): length = numpy.int64(len(arr)) bin_length = max(len(bin(length - 1)), len( bin(TPB_MAX - 1))) #the bit number of binary form of array length thread_num = numpy.int64(math.pow(2, bin_length)) block_num = max(thread_num / TPB_MAX, 1) stream = cuda.stream() one_list = numpy.zeros(shape=(thread_num), dtype='int64') zero_list = numpy.zeros(shape=(thread_num), dtype='int64') iter_num = len(bin(ATTR_CARD_MAX)) for i in range(iter_num): d_arr = cuda.to_device(arr, stream) d_rid = cuda.to_device(rid, stream) d_zero_list = cuda.to_device(zero_list, stream) d_one_list = cuda.to_device(one_list, stream) get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list) #get one_list and zero_list d_one_list.to_host(stream) d_zero_list.to_host(stream) stream.synchronize() base_reduction_block_num = block_num base_reduction_block_size = TPB_MAX tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64') d_tmp_out = cuda.to_device(tmp_out, stream) sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out) d_tmp_out.to_host(stream) stream.synchronize() base = 0 #base for the scan of one_list for j in xrange(base_reduction_block_num): base += tmp_out[j] Blelloch_scan_caller(d_zero_list, d_one_list, base) array_adjust[block_num, TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
def gradient_descent(X, Y, theta, alpha, num_iters):
    N = X.size
    NTID = 1024
    NBLK = N // NTID
    assert NBLK * NTID == N

    Ex = np.empty(NBLK, dtype=X.dtype)
    Ey = np.empty(NBLK, dtype=X.dtype)

    theta_x, theta_y = 0, 0

    # -----------------
    # GPU work
    dX = cuda.to_device(X)
    dY = cuda.to_device(Y)
    dEx = cuda.to_device(Ex, copy=False)
    dEy = cuda.to_device(Ey, copy=False)

    griddim = NBLK,
    blockdim = NTID,

    for _ in xrange(num_iters):
        cu_compute_error[griddim, blockdim](dX, dY, dEx, dEy, theta_x, theta_y)

        dEx.to_host()
        dEy.to_host()

        # -----------------
        # CPU work
        error_x = Ex.sum()
        error_y = Ey.sum()

        theta_x = theta_x - alpha * (1.0 / N) * error_x
        theta_y = theta_y - alpha * (1.0 / N) * error_y

    theta[0] = theta_x
    theta[1] = theta_y
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
def task2():
    a = numpy.float32(2.)  # Force value to be float32
    x = numpy.arange(NELEM, dtype='float32')
    y = numpy.arange(NELEM, dtype='float32')

    ### Task2 ###
    # a) Complete the memory transfer for x -> dx, y -> dy
    # b) Allocate device memory for dout
    # c) Transfer for out <- dout
    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dout = cuda.device_array_like(x)

    griddim = NUM_BLOCKS
    blockdim = NUM_THREADS
    saxpy[griddim, blockdim](a, dx, dy, dout)

    out = dout.copy_to_host()

    print "out =", out

    if numpy.allclose(a * x + y, out):
        print "Correct result"
    else:
        print "Incorrect result"
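# --- Hedged sketch (assumption): the saxpy kernel exercised by task2() above is not
# part of this snippet. Matching the launch saxpy[griddim, blockdim](a, dx, dy, dout),
# a guarded single-precision a*x + y kernel could look like:
from numba import cuda

@cuda.jit
def saxpy(a, x, y, out):
    i = cuda.grid(1)
    if i < out.size:          # guard in case the grid overshoots NELEM
        out[i] = a * x[i] + y[i]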
def test_sort(): in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32) #4, 7, 2, 6, 3, 5, 1, 0 out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32) for i in range(0, NUM_ELEMENTS): in_h[i] = NUM_ELEMENTS - i - 1 in_d = cuda.to_device(in_h) out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32) temp_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32) tkg1 = time() for bit_shift in range(0, 32): tk1 = time() #radix_sort(in_d, out_d, temp_d, in_h[NUM_ELEMENTS - 1], bit_shift) preScan(out_d, in_d, NUM_ELEMENTS) tk2 = time() #print bit_shift, tk2 - tk1 in_d = out_d out_d = temp_d temp_d = in_d tkg2 = time() out_d.copy_to_host(out_h) cuda.synchronize() # line = "" # for i in range(0, NUM_ELEMENTS): # line += " " + str(out_h[i]) # # print line in_cpu = [NUM_ELEMENTS - i - 1 for i in range(0, NUM_ELEMENTS)] tc1 = time() in_cpu.sort() tc2 = time() print "GPU Time = ", tkg2 - tkg1 print "CPU Time = ", tc2 - tc1
def device_controller(cid):
    cuda.select_device(cid)                    # bind device to thread
    device = cuda.get_current_device()         # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY)

    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:                        # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks
    if N >= max_thread:
        ngrid = int(ceil(float(N) / max_thread))
        nthread = max_thread
    else:
        ngrid = 1
        nthread = N

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # real CUDA work
    d_data = cuda.to_device(data)                   # transfer to device
    cuda_kernel[ngrid, nthread](d_data, d_data)     # compute inplace
    d_data.copy_to_host(data)                       # transfer to host

    # check result
    if not np.all(data == orig + 1):
        raise ValueError