def grayfication(image):
    forme = image.shape
    aSize = forme[0] * forme[1]
    xdim = np.int32(forme[0])
    ydim = np.int32(forme[1])
    r_img = image[:, :, 0].reshape(aSize, order='F')
    g_img = image[:, :, 1].reshape(aSize, order='F')
    b_img = image[:, :, 2].reshape(aSize, order='F')
    dest = np.zeros(aSize).astype(np.float32)

    # block size: B := dim1*dim2*dim3 = 1024
    # grid size : dim1*dim2*dim3 = ceiling(aSize/B)
    blockX = int(xdim)
    multiplier = aSize / float(blockX)
    if aSize / float(blockX) > int(aSize / float(blockX)):
        gridX = int(multiplier + 1)
    else:
        gridX = int(multiplier)

    # parallel rgb computation + timing
    rgb2gray(drv.Out(dest), drv.InOut(r_img), drv.InOut(g_img), drv.InOut(b_img), ydim,
             block=(blockX, 1, 1), grid=(gridX, 1, 1))
    dest = np.reshape(dest, forme[0:2], order='F')
    return dest

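# The rgb2gray kernel used above is not part of this snippet. A minimal sketch of a
# kernel with a compatible signature is given below; the channel dtype (float32) and
# the luminance weights are assumptions, not taken from the original source.
from pycuda.compiler import SourceModule

_rgb2gray_mod = SourceModule("""
__global__ void rgb2gray(float *dest, float *r, float *g, float *b, int ydim)
{
    // One thread per pixel of the flattened (Fortran-order) image;
    // blockDim.x == xdim, so blockDim.x * ydim is the total pixel count.
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < blockDim.x * ydim)
        dest[i] = 0.299f * r[i] + 0.587f * g[i] + 0.114f * b[i];
}
""")
rgb2gray = _rgb2gray_mod.get_function("rgb2gray")
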
def _draw(self, pts, colors):
    if not pts:
        return False

    imsize = self.imsize
    dt0 = time()

    ind_count = zeros(self.imsize2, npint)
    colors = row_stack(colors).astype(npfloat)
    xy = vstack(pts).astype(npfloat)
    inds = zeros(xy.shape[0], npint)

    self.cuda_agg(npint(inds.shape[0]),
                  npint(imsize),
                  cuda.In(xy),
                  cuda.InOut(inds),
                  cuda.InOut(ind_count),
                  block=(THREADS, 1, 1),
                  grid=(int(inds.shape[0] // THREADS) + 1, 1))

    mask = inds > -1
    if not mask.any():
        print('-- no dots to draw. time: {:0.4f}'.format(time() - dt0))
        return False

    # xy = xy[mask, :]
    inds = inds[mask]
    colors = colors[mask]

    ind_count_map = _build_ind_count(ind_count)
    _ind_count_map = cuda.mem_alloc(ind_count_map.nbytes)
    cuda.memcpy_htod(_ind_count_map, ind_count_map)

    sort_colors = zeros((inds.shape[0], 4), npfloat)
    _sort_colors = cuda.mem_alloc(sort_colors.nbytes)
    cuda.memcpy_htod(_sort_colors, sort_colors)

    self.cuda_agg_bin(npint(inds.shape[0]),
                      _ind_count_map,
                      cuda.In(colors),
                      cuda.In(inds),
                      _sort_colors,
                      block=(THREADS, 1, 1),
                      grid=(int(inds.shape[0] // THREADS) + 1, 1))

    dotn, _ = ind_count_map.shape
    self.cuda_dot(npint(dotn),
                  self._img,
                  _ind_count_map,
                  _sort_colors,
                  block=(THREADS, 1, 1),
                  grid=(int(dotn // THREADS) + 1, 1))

    if self.verbose is not None:
        print('-- drew dots: {:d}. time: {:0.4f}'.format(colors.shape[0], time() - dt0))

    self._updated = True
    return True

def _do_cuda_calculation(self, pos0, vel0, sim_time, kernel_sim_time):
    time = 0
    counter = 1
    iterations = sim_time / kernel_sim_time
    while time < sim_time:
        print(" Kernel execution step %s/%s..." % (counter, iterations), end=" ")
        self._initalize_cuda()
        mod = SourceModule(self.gpu_source)
        do_basins = mod.get_function("basins")
        do_basins(cuda.InOut(pos0[0]), cuda.InOut(pos0[1]),
                  cuda.InOut(vel0[0]), cuda.InOut(vel0[1]),
                  cuda.InOut(self.track_length),
                  cuda.Out(self.result_data),
                  numpy.float32(kernel_sim_time),
                  block=(self.THREADS_PER_BLOCK, self.THREADS_PER_BLOCK, 1),
                  grid=(self.resolution // self.THREADS_PER_BLOCK,
                        self.resolution // self.THREADS_PER_BLOCK))
        self._deactivate_cuda()
        time = time + kernel_sim_time
        counter = counter + 1
        print("done")
    self._save_data()

def f_gpu_observables_func(func, seed, num_trials, aS1, aS2, aEnergy, photonYield,
                           chargeYield, excitonToIonRatio, g1Value, extractionEfficiency,
                           gasGainValue, gasGainWidth, speRes, intrinsicResS1, intrinsicResS2):
    tArgs = [drv.In(seed), drv.In(num_trials),
             drv.InOut(aS1), drv.InOut(aS2),
             drv.In(aEnergy),
             drv.In(photonYield), drv.In(chargeYield),
             drv.In(excitonToIonRatio), drv.In(g1Value),
             drv.In(extractionEfficiency),
             drv.In(gasGainValue), drv.In(gasGainWidth),
             drv.In(speRes), drv.In(intrinsicResS1), drv.In(intrinsicResS2)]
    func(*tArgs, grid=(2048, 1), block=(256, 1, 1))

def crt_multi_aug(self, Xt_to_t1_t, Phi_t1, Theta_t1, dtype='dense'):
    if dtype == 'dense':
        [K_t, J] = Xt_to_t1_t.shape
        K_t1 = Theta_t1.shape[0]
        N = K_t * J
        Para = np.array([K_t, K_t1, J, N], dtype=np.int32)
        Xt_to_t1_t = np.array(Xt_to_t1_t, dtype=np.int32, order='C')
        Xt_to_t1_t1 = np.zeros([K_t1, J], dtype=np.float32, order='C')
        WSZS_t1 = np.zeros([K_t, K_t1], dtype=np.float32, order='C')
        Phi_t1 = np.array(Phi_t1, dtype=np.float32, order='C')
        Theta_t1 = np.array(Theta_t1, dtype=np.float32, order='C')

        if N != 0:
            block_x = int(400)
            grid_x = int(np.floor(N / block_x) + 1)
            randomseed = np.random.rand(N)
            randomseed = np.array(randomseed, dtype=np.float32, order='C')
            func = mod.get_function('Crt_Multi_Sampler')
            func(drv.In(randomseed), drv.In(Para), drv.In(Xt_to_t1_t), drv.In(Phi_t1),
                 drv.In(Theta_t1), drv.InOut(WSZS_t1), drv.InOut(Xt_to_t1_t1),
                 grid=(grid_x, 1), block=(block_x, 1, 1))

        return Xt_to_t1_t1, WSZS_t1

def main():
    # a = numpy.matrix('2 -1 -4 ; 4 1 -2; 6 3 0').astype(numpy.float32)
    a = numpy.random.rand(SIZE_OF_MATRIX, SIZE_OF_MATRIX).astype(numpy.float32)
    # print(a)
    b = a
    c = numpy.zeros((SIZE_OF_MATRIX, SIZE_OF_MATRIX), dtype=numpy.float32)
    lda = numpy.int32(SIZE_OF_MATRIX)

    d_a = cuda.mem_alloc(a.nbytes)
    d_b = cuda.mem_alloc(b.nbytes)
    d_c = cuda.mem_alloc(c.nbytes)
    cuda.memcpy_htod(d_a, a)
    cuda.memcpy_htod(d_b, b)

    print("threads:", number_of_threads, "blocks:", number_of_blocks)

    multiply_matrices = multiply_source.get_function("multiply_matrices")
    multiply_matrices_shared_blocks = multiply_source.get_function(
        "multiply_matrices_shared_blocks")

    multiply_matrices(d_a, d_b, cuda.InOut(c), lda,
                      block=(number_of_threads, number_of_threads, 1),
                      grid=(number_of_blocks, number_of_blocks))
    pycuda.driver.Context.synchronize()

    multiply_matrices_shared_blocks(d_a, d_b, cuda.InOut(c), lda,
                                    block=(number_of_threads, number_of_threads, 1),
                                    grid=(number_of_blocks, number_of_blocks))
    pycuda.driver.Context.synchronize()

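# Hypothetical sanity check (not in the original example): since c is passed through
# cuda.InOut, each launch copies the product back into the host array c, so the result
# can be compared against NumPy directly at the end of main().
def check_result(a, b, c):
    # Tolerances are a guess for float32 accumulation error.
    assert numpy.allclose(c, a.dot(b), rtol=1e-3, atol=1e-3), "GPU result disagrees with NumPy"
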
def do_KMP(text, pattern, pm_table):
    start = cuda.Event()
    end = cuda.Event()
    KMP = mod.get_function("KMP")

    text = np.array(text)
    result = np.zeros(text.size * 2 + 1, dtype=np.uint8)
    result[:] = 35  # 35 == '#'
    result_counter = np.array(0, dtype=np.int32)

    block = (THREADS, 1, 1)
    grid = (int((text.size / pattern.size + THREADS - 1) / THREADS), 1)

    n = pattern.size
    n = np.array(n, dtype=np.int32)
    m = text.size
    m = np.array(m, dtype=np.int32)

    start.record()
    KMP(cuda.In(pattern), cuda.In(text), cuda.In(pm_table), cuda.InOut(result),
        cuda.In(n), cuda.In(m), cuda.InOut(result_counter),
        block=block, grid=grid)
    end.record()
    end.synchronize()
    # print("Time: {}ms".format(start.time_till(end)))

    return (result_counter.item(0), result)

def process_output_gpu(source_package, dataset, index, item):
    ill_map_ldr = source_package['ill_map_ldr']
    ill_map_hdr = source_package['ill_map_hdr']

    # Make cubemap canvas for LDR and HDR
    cubemap_xyz_flt, idx, cubemap_idx, cubemap_weight, cubemap_basis, shc_norm = get_cube_idx(
        ill_map_ldr.shape[1], ill_map_ldr.shape[0])
    cubemap_len = cubemap_xyz_flt.shape[0]

    ill_map_ldr_2d = ill_map_ldr.reshape((-1, 3))
    ill_map_hdr_2d = ill_map_hdr.reshape((-1, 3))
    cubemap_color_ldr = np.empty((cubemap_len, 3), dtype=np.float32)
    cubemap_color_hdr = np.empty((cubemap_len, 3), dtype=np.float32)
    cubemap_color_ldr[idx, :] = ill_map_ldr_2d[cubemap_idx, :]
    cubemap_color_hdr[idx, :] = ill_map_hdr_2d[cubemap_idx, :]

    # The LDR image needs to be converted to linear color space
    srgb_to_linear(cubemap_color_ldr)

    # Debug point for dumping cubemap to point cloud
    if DEBUG:
        cubemap_pc = np.concatenate((cubemap_xyz_flt, cubemap_color_ldr), axis=-1)
        np.save(f'{OUTPUT_PATH}/{dataset}/{index}/cubemap_gpu', cubemap_pc)

    # Calculate the SH coefficients
    cubemap_clr_ldr = cubemap_color_ldr * cubemap_weight
    cubemap_clr_hdr = cubemap_color_hdr * cubemap_weight
    cubemap_clr_ldr = cubemap_clr_ldr.astype(np.float32)
    cubemap_clr_hdr = cubemap_clr_hdr.astype(np.float32)
    len_pixels = cubemap_len // 6
    shc_hdr = np.zeros((9, 3), dtype=np.float64)
    shc_ldr = np.zeros((9, 3), dtype=np.float64)
    make_sh_coefficients(drv.InOut(shc_ldr), drv.InOut(shc_hdr),
                         drv.In(cubemap_basis), drv.In(cubemap_clr_ldr), drv.In(cubemap_clr_hdr),
                         grid=(6, (len_pixels + 1024 - 1) // 1024, 1),
                         block=(1, 1024, 1))

    # Normalize
    shc_ldr = (shc_ldr * shc_norm).reshape(-1).astype(np.float32)
    shc_hdr = (shc_hdr * shc_norm).reshape(-1).astype(np.float32)

    f = open(f'{OUTPUT_PATH}/{dataset}/{index}/shc_ldr.json', 'w')
    f.write(json.dumps(shc_ldr.tolist()))
    f.close()

    f = open(f'{OUTPUT_PATH}/{dataset}/{index}/shc_hdr.json', 'w')
    f.write(json.dumps(shc_hdr.tolist()))
    f.close()

def alg1(self):
    '''
    Implementation of the first Z-value algorithm. At the end of this
    implementation, the hessian function is called internally.

    :return: The array of betahats for each reference spike train as well as
        the confidence interval corresponding to each betahat value.

    Main internal variables:

    * mod_z1: The CUDA kernel of the first algorithm.
    * t1 (in kernel): Backward recurrence time.
    '''
    # The CUDA kernel of the first algorithm.
    self.mod_z1 = SourceModule("""
    #include <stdio.h>
    #include <math.h>

    __global__ void z_function(float *tspamt, float *a, float *isiat, float *tspz,
                               float *z, long p, int maxi,
                               float gm, float alphas, float alphar)
    {
        int m = threadIdx.x;
        int i = blockIdx.y;
        int j = blockIdx.x;
        if (i >= j) {
            float t1;
            int temp = a[m*gridDim.y + i];
            int temp2 = a[m*gridDim.y + j];
            int index = 0;
            t1 = tspamt[m*gridDim.y + temp] - isiat[m*gridDim.y + temp]
                 + isiat[m*gridDim.y + temp2];
            for (int k = m; k < p*maxi; k += p) {
                if (tspz[k] < t1 && tspz[k] != -1 && index < k) {
                    index = k;
                }
            }
            float bwt;
            bwt = t1 - tspz[index];
            z[gridDim.y*gridDim.y*m + gridDim.y*i + j] =
                (1/gm) * ((exp(-bwt/alphas) - exp(-bwt/alphar)) / (alphas - alphar));
        }
    }
    """)

    z1_func = self.mod_z1.get_function("z_function")
    z1_func(cuda.InOut(self.tspamt_d), cuda.InOut(self.a_d), cuda.InOut(self.isiat_d),
            cuda.InOut(self.tspz), cuda.InOut(self.z),
            int_(self.p), int_(self.maxi_d),
            float32(self.gm), float32(self.alphas), float32(self.alphar),
            block=(self.p, 1, 1), grid=(int_(self.laf), int_(self.laf)))
    return self.hessian()

def update_all_individuals(cls, dt):
    grid_x = (cls._next_id + BLOCK_SIZE - 1) // BLOCK_SIZE
    _update_individuals_fn(
        numpy.uint32(cls._next_id),             # unsigned int count
        numpy.float32(dt),                      # float dt
        cuda.InOut(cls.Cuda_Arrays._age),       # float* age
        cuda.InOut(cls.Cuda_Arrays._alive),     # unsigned int* alive
        cuda.In(cls.Cuda_Arrays._death_age),    # float* death_age
        block=(BLOCK_SIZE, 1, 1),
        grid=(grid_x, 1),
        time_kernel=True)

def cudastep(self):
    self.cstep(drv.InOut(self.pos[0]), drv.InOut(self.pos[1]),
               drv.InOut(self.v[0]), drv.InOut(self.v[1]),
               self.N, self.size, self.epsilon, self.width, self.height,
               block=(self.blocksize, 1, 1), grid=(self.gridsize, 1))

def compute_new_pendulum_states_rk4(self, currentStates, numTimeStepsTillFlipData,
                                    numTimeStepsAlreadyExecuted, maxTimeStepsToExecute,
                                    startFromDefaultState):
    logger.info('Computing new pendulum states with Runge-Kutta 4th order method')
    logger.info('time step: ' + str(self.timeStep) + ' seconds')
    logger.info('amount of time already computed: ' +
                str(numTimeStepsAlreadyExecuted * self.timeStep) + ' seconds')
    logger.info('max time to see if pendulum flips: ' +
                str(maxTimeStepsToExecute * self.timeStep) + ' seconds')
    logger.info('amount of time to simulate: ' +
                str((maxTimeStepsToExecute - numTimeStepsAlreadyExecuted) * self.timeStep) +
                ' seconds')

    # Compute the double pendulum fractal image.
    logger.info('Running pendulum simulation kernel...')
    kernelStart = time.time()

    self.computeDoublePendulumFractalFromInitialStatesRK4Function(
        self.npFloatType(self.point1Mass), self.npFloatType(self.point2Mass),
        self.npFloatType(self.pendulum1Length), self.npFloatType(self.pendulum2Length),
        self.npFloatType(self.gravity),
        self.npFloatType(self.angle1Min), self.npFloatType(self.angle1Max),
        self.npFloatType(self.angle2Min), self.npFloatType(self.angle2Max),
        cuda.InOut(currentStates),
        np.int32(startFromDefaultState),
        np.int32(numTimeStepsAlreadyExecuted),
        np.int32(self.numberOfAnglesToTestX), np.int32(self.numberOfAnglesToTestY),
        self.npFloatType(self.timeStep),
        np.int32(maxTimeStepsToExecute),
        cuda.InOut(numTimeStepsTillFlipData),
        # block=(1, 1, 1), grid=(1, 1))
        # block=(2, 2, 1), grid=(1, 1))
        # block=(4, 4, 1), grid=(4, 4))
        # block=(8, 8, 1), grid=(8, 8))
        block=(16, 16, 1), grid=(16, 16))
        # block=(32, 32, 1), grid=(32, 32))

    # Print the time it took to run the kernel.
    timeToExecuteLastKernel = time.time() - kernelStart
    logger.info('Completed pendulum simulation kernel in ' +
                str(timeToExecuteLastKernel) + ' seconds')

def test():
    mask = cv2.imread('cur_mask.jpg')
    mask = mask[:, :, 0]
    # mask_g = mask[:,:,1]
    # mask_b = mask[:,:,2]
    x = []
    y = []

    prev_time = time.time()
    for i in range(len(mask)):
        for j in range(len(mask[0])):
            if mask[i][j] == 2:
                print('x = %d,y = %d' % (j, i))
                y.append(i)
                x.append(j)
    logger.info('process frame time:' + str(time.time() - prev_time))

    prev_time = time.time()
    x1 = min(x)
    y1 = min(y)
    x2 = max(x)
    y2 = max(y)
    logger.info('post process frame time:' + str(time.time() - prev_time))
    print(x1, y1, x2, y2)

    # w = numpy.int32(len(mask[0]))
    # h = numpy.int32(len(mask))
    w = numpy.int64(len(mask[0]))
    h = numpy.int64(len(mask))
    mask_np = numpy.array(mask)
    mask_np = mask_np.reshape(-1).astype(float)
    N = len(mask_np)
    # print(N)
    # print(w)
    # print(h)
    a = numpy.zeros(N, dtype=numpy.float64)
    b = numpy.zeros(N, dtype=numpy.float64)

    nThreads = 1024
    nBlocks = int((N + nThreads - 1) / nThreads)
    print("nBlocks:%d\n" % nBlocks)

    prev_time = time.time()
    func(drv.In(mask_np), drv.InOut(a), drv.InOut(b), w, h,
         block=(nThreads, 1, 1), grid=(nBlocks, ))
    logger.info('gpu process frame time:' + str(time.time() - prev_time))
    print(max(a))
    print(max(b))
    print(a)

def simulate_positions(module, Nobs, N, bounds, radius, d, dN, pa, ps, seed=666, Nthreads=64):
    Nphotons = Nobs * N
    print("Total Threads: %s" % Nphotons)
    assert(Nphotons <= 1.1e8)

    d = np.uint32(d)
    dN = np.uint32(dN)
    radius = np.float32(radius)
    Ndoms = np.uint32(pow((d/dN)*2 + 1, 3) - 1)

    t1 = time.time()
    rng_states = get_rng_states(module, Nphotons, seed=seed)
    t2 = time.time()
    d_list = get_doms(module, Ndoms, radius, d, dN)
    t3 = time.time()

    x = np.random.uniform(bounds[0][0], bounds[0][1], N)
    y = np.random.uniform(bounds[1][0], bounds[1][1], N)
    z = np.random.uniform(bounds[2][0], bounds[2][1], N)
    # print(x)
    # print(y)
    # print(z)
    pInit = np.concatenate([x, y, z]).astype(np.float32)
    t4 = time.time()
    # print("t2-t1: ", t2-t1)
    # print("t3-t2: ", t3-t2)
    # print("t4-t3: ", t4-t3)

    start = time.time()
    datahits = np.zeros(Ndoms*N, dtype=np.int32)
    datahitsNum = -np.ones(Nobs*N, dtype=np.int32)
    datatimes = np.zeros(Nphotons, dtype=np.float32)
    datapositions = np.zeros(Nphotons*3, dtype=np.float32)

    simulate = module.get_function('simulate_positions')
    simulate(np.uint64(Nphotons), np.uint64(Nobs), rng_states, d_list,
             cuda.InOut(datahits), cuda.InOut(datatimes), cuda.InOut(datahitsNum),
             cuda.InOut(datapositions), cuda.In(pInit),
             np.float32(pa), np.float32(ps), np.uint32(Ndoms),
             block=(Nthreads, 1, 1), grid=(Nphotons//Nthreads + 1, 1))
    print("end-start", time.time() - start)
    # print("sumHits: ", sum(datahits))
    # print()

    datahits = np.reshape(np.array(datahits, dtype=float), (N, Ndoms))
    datahitsNum = np.reshape(np.array(datahitsNum, dtype=float), (N, Nobs))
    datatimes = np.reshape(np.array(datatimes, dtype=float), (N, Nobs))
    pInit = np.reshape(pInit, (3, N)).T
    return datahits, datahitsNum, datatimes, pInit

def compute_new_pendulum_states_time_till_flip_adaptive_step_size_method(
        self, currentStates, timeTillFlipData, timeAlreadyExecuted, maxTimeToExecute,
        startFromDefaultState):
    logger.info('Computing new pendulum states with ' + str(self.algorithm.name) + ' method')
    logger.info('Using the "time till flip" kernel')
    logger.info('time step: ' + str(self.timeStep) + ' seconds')
    logger.info('error tolerance: ' + str(self.errorTolerance))
    logger.info('amount of time already computed: ' + str(timeAlreadyExecuted) + ' seconds')
    logger.info('max time to see if pendulum flips: ' + str(maxTimeToExecute) + ' seconds')
    logger.info('amount of time to simulate: ' +
                str(maxTimeToExecute - timeAlreadyExecuted) + ' seconds')

    # Compute the double pendulum fractal image.
    logger.info('Running pendulum simulation kernel...')
    kernelStart = time.time()

    self.computeDoublePendulumFractalWithTimeTillFlipMethodAndAdaptiveStepSize(
        self.npFloatType(self.point1Mass), self.npFloatType(self.point2Mass),
        self.npFloatType(self.pendulum1Length), self.npFloatType(self.pendulum2Length),
        self.npFloatType(self.gravity),
        self.npFloatType(self.angle1Min), self.npFloatType(self.angle1Max),
        self.npFloatType(self.angle2Min), self.npFloatType(self.angle2Max),
        cuda.InOut(currentStates),
        np.int32(startFromDefaultState),
        self.npFloatType(timeAlreadyExecuted),
        np.int32(self.numberOfAnglesToTestX), np.int32(self.numberOfAnglesToTestY),
        self.npFloatType(self.timeStep),
        self.npFloatType(self.errorTolerance),
        self.npFloatType(maxTimeToExecute),
        cuda.InOut(timeTillFlipData),
        # block=(1, 1, 1), grid=(1, 1))
        # block=(2, 2, 1), grid=(1, 1))
        # block=(4, 4, 1), grid=(4, 4))
        # block=(8, 8, 1), grid=(8, 8))
        block=(16, 16, 1), grid=(16, 16))
        # block=(32, 32, 1), grid=(32, 32))

    # Print the time it took to run the kernel.
    timeToExecuteLastKernel = time.time() - kernelStart
    logger.info('Completed pendulum simulation kernel in ' +
                str(timeToExecuteLastKernel) + ' seconds')

def simulate_grid(module, Nobs, N, oversampling, datahits, datatimes, radius, d, dN,
                  pa, ps, seed=666, Nthreads=64):
    Nruns = N * N
    Nphotons = Nobs * Nruns * oversampling
    Nobs *= oversampling
    print("Total Threads: %s" % Nphotons)
    assert(Nphotons <= 1.1e8)

    d = np.uint32(d)
    dN = np.uint32(dN)
    radius = np.float32(radius)
    Ndoms = np.uint32(pow((d/dN)*2 + 1, 3) - 1)

    t1 = time.time()
    rng_states = get_rng_states(module, Nphotons, seed=seed)
    t2 = time.time()
    d_list = get_doms(module, Ndoms, radius, d, dN)
    t3 = time.time()

    x = np.linspace(-20, 20, N)
    y = np.linspace(-20, 20, N)
    X, Y = np.meshgrid(x, y)
    Z = np.zeros(N*N)
    pInit = np.concatenate([X.flatten(), Y.flatten(), Z]).astype(np.float32)
    t4 = time.time()
    print("t2-t1: ", t2-t1)
    print("t3-t2: ", t3-t2)
    print("t4-t3: ", t4-t3)

    start = time.time()
    datahits = np.zeros(Ndoms*Nruns, dtype=np.int32)
    datatimesbinned = np.zeros(Ndoms*Nruns, dtype=np.float32)
    datatimes = np.zeros(Nphotons, dtype=np.float32)
    datapositions = np.zeros(Nphotons*3, dtype=np.float32)

    simulate = module.get_function('simulate_grid')
    simulate(np.uint64(Nphotons), np.uint64(Nobs), rng_states, d_list,
             cuda.InOut(datahits), cuda.InOut(datatimes), cuda.InOut(datatimesbinned),
             cuda.InOut(datapositions), cuda.In(pInit),
             np.float32(pa), np.float32(ps), np.uint32(Ndoms),
             block=(Nthreads, 1, 1), grid=(Nphotons//Nthreads + 1, 1))
    print("end-start", time.time() - start)
    print("sumHits: ", sum(datahits))
    print()

    datahits = np.reshape(np.array(datahits, dtype=float), (Nruns, Ndoms)) / oversampling
    datatimesbinned = np.reshape(np.array(datatimesbinned, dtype=float),
                                 (Nruns, Ndoms)) / oversampling
    return datahits, np.array(datatimes, dtype=float), datatimesbinned, datapositions

def go(scale, block, test_cpu):
    data = np.frombuffer(np.random.bytes(scale * block), dtype=np.uint8)
    print('Done seeding')

    if test_cpu:
        a = time.time()
        cpu_pfxs = np.array([np.sum(data == v) for v in range(256)])
        b = time.time()
        print(cpu_pfxs)
        print('took %g secs on CPU' % (b - a))

    shmem_pfxs = np.zeros(256, dtype=np.int32)
    launch('prefix_scan_8_0_shmem', cuda.In(data), np.int32(block), cuda.InOut(shmem_pfxs),
           block=(32, 16, 1), grid=(scale, 1), l1=1)
    if test_cpu:
        print('it worked? %s' % (np.all(shmem_pfxs == cpu_pfxs)))

    shmeml_pfxs = np.zeros(256, dtype=np.int32)
    launch('prefix_scan_8_0_shmem_lessconf', cuda.In(data), np.int32(block),
           cuda.InOut(shmeml_pfxs), block=(32, 32, 1), grid=(scale, 1), l1=1)
    print('it worked? %s' % (np.all(shmeml_pfxs == shmem_pfxs)))

    popc_pfxs = np.zeros(256, dtype=np.int32)
    launch('prefix_scan_8_0_popc', cuda.In(data), np.int32(block), cuda.InOut(popc_pfxs),
           block=(32, 16, 1), grid=(scale, 1), l1=1)

    popc5_pfxs = np.zeros(32, dtype=np.int32)
    launch('prefix_scan_5_0_popc', cuda.In(data), np.int32(block), cuda.InOut(popc5_pfxs),
           block=(32, 16, 1), grid=(scale, 1), l1=1)

def testfloat3subequal():
    dest = np.copy(a)
    float3subequal(cuda.InOut(dest), cuda.In(b), **size)
    if not np.allclose(a['x'] - b['x'], dest['x']) or \
       not np.allclose(a['y'] - b['y'], dest['y']) or \
       not np.allclose(a['z'] - b['z'], dest['z']):
        assert False

def testfloat3divfloatequal():
    dest = np.copy(a)
    float3divfloatequal(cuda.InOut(dest), c, **size)
    if not np.allclose(a['x'] / c, dest['x']) or \
       not np.allclose(a['y'] / c, dest['y']) or \
       not np.allclose(a['z'] / c, dest['z']):
        assert False

def Crt_Matrix_GPU(Xt_to_t1_t, p, dtype='dense'):
    if dtype == 'dense':
        [K_t, J] = Xt_to_t1_t.shape
        N = K_t * J
        N = np.array(N, dtype=np.int32, order='C')
        Xt_to_t1_t = np.array(Xt_to_t1_t, dtype=np.int32, order='C')
        p = np.array(p, dtype=np.float32, order='C')
        X_t1 = np.zeros([K_t, J], dtype=np.float32, order='C')

        if N != 0:
            block_x = int(400)
            grid_x = int(np.floor(N / block_x) + 1)
            randomseed = np.random.rand(N)
            randomseed = np.array(randomseed, dtype=np.float32, order='C')
            func = mod.get_function('Crt_Sampler')
            func(drv.In(randomseed), drv.In(N), drv.In(Xt_to_t1_t), drv.In(p),
                 drv.InOut(X_t1),
                 grid=(grid_x, 1, 1), block=(block_x, 1, 1))

        return X_t1

def test(N):
    # N = 1024 * 1024 * 90   # float: 4M = 1024 * 1024
    print("N = %d" % N)

    N = np.int32(N)
    a = np.random.randn(N).astype(np.float32)
    b = np.random.randn(N).astype(np.float32)
    # copy a to aa
    aa = np.empty_like(a)
    aa[:] = a

    # GPU run
    nThreads = 256
    nBlocks = int((N + nThreads - 1) / nThreads)
    start = timer()
    func(drv.InOut(a), drv.In(b), N,
         block=(nThreads, 1, 1), grid=(nBlocks, 1))
    run_time = timer() - start
    print("gpu run time %f seconds " % run_time)

    # CPU run
    start = timer()
    aa = (aa * 10 + 2) * ((b + 2) * 10 - 5) * 5
    run_time = timer() - start
    print("cpu run time %f seconds " % run_time)

    # check result
    r = a - aa
    print(min(r), max(r))

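# The global func used above is not defined in this snippet. Judging from the CPU
# reference expression, a kernel that would make the final check pass could look like
# the sketch below; the kernel name and signature are inferred, not original.
from pycuda.compiler import SourceModule

_mod = SourceModule("""
__global__ void scale_and_combine(float *a, float *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        a[i] = (a[i] * 10 + 2) * ((b[i] + 2) * 10 - 5) * 5;  // in place, matching drv.InOut(a)
}
""")
func = _mod.get_function("scale_and_combine")
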
def matAdd(A, B, alpha, beta):
    forme1 = A.shape
    forme2 = B.shape
    if forme1 != forme2:
        sys.exit('matrix dimensions differ')
    aSize = forme1[0] * forme1[1]
    xdim = np.int32(forme1[0])
    ydim = np.int32(forme1[1])
    A = np.reshape(A, aSize, order='F').astype(np.float32)
    B = np.reshape(B, aSize, order='F').astype(np.float32)
    alpha = np.float32(alpha)
    beta = np.float32(beta)

    blockX = int(ydim)
    gridX = int(xdim)

    matrixAddition(drv.InOut(A), drv.In(B), alpha, beta, ydim,
                   block=(blockX, 1, 1), grid=(gridX, 1, 1))
    A = np.reshape(A, forme1, order='F')
    return A

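# The matrixAddition kernel is not included in the snippet. A sketch with a matching
# launch configuration is shown below; the elementwise rule A = alpha*A + beta*B is an
# assumption suggested by the drv.InOut(A)/drv.In(B) usage, not confirmed by the source.
from pycuda.compiler import SourceModule

_matadd_mod = SourceModule("""
__global__ void matrixAddition(float *A, float *B, float alpha, float beta, int ydim)
{
    // Launch uses block=(ydim,1,1), grid=(xdim,1,1): exactly one thread per element.
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    A[i] = alpha * A[i] + beta * B[i];
}
""")
matrixAddition = _matadd_mod.get_function("matrixAddition")
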
def full_scan():
    # TODO: testing how slow a single full scan is with no parallelism
    sequential = SourceModule("""
    #include <stdio.h>

    __global__ void full_scan(unsigned char *img, int line[2])
    {
        int counter = 0;
        for (int y = 0; y < 853; y++) {
            for (int x = 0; x < 1918; x++) {
                if ((img[x*3 + y*1918*3] <= 4) &&
                    (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180) &&
                    (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
                    counter++;
                    if (counter == 50) {
                        line[0] = x;
                        line[1] = y;
                        return;
                    }
                } else {
                    counter = 0;
                }
            }
        }
    }
    """)

    image = cv.imread("test images/crop2.png")
    seq = sequential.get_function("full_scan")
    image_gpu = gpuarray.to_gpu_async(image)
    line = np.array([0, 0], dtype=np.int32)  # the kernel writes two C ints
    timer = time.perf_counter()
    seq(image_gpu, cuda.InOut(line), block=(1, 1, 1))
    print(time.perf_counter() - timer)
    print(line)

def kmeans(matrix, k, maxIterations):
    centroids = initCentroids(matrix, k)
    oldCentroids = None
    iterations = 0

    matrix_gpu = cuda.mem_alloc(matrix.nbytes)
    cuda.memcpy_htod(matrix_gpu, matrix)

    while (not numpy.array_equal(centroids, oldCentroids)) and iterations < maxIterations:
        oldCentroids = centroids
        centroids = numpy.ascontiguousarray(centroids, dtype=numpy.float32)
        labels = numpy.ascontiguousarray(numpy.empty((matrix.shape[0], 2)), dtype=numpy.int32)
        func = mod.get_function("getLabels")
        func(matrix_gpu, cuda.In(centroids), cuda.InOut(labels),
             numpy.int32(matrix.shape[1]), numpy.int32(matrix.shape[0]), numpy.int32(k),
             grid=(6, 10, 1), block=(32, 32, 1))
        centroids = getCentroids(matrix, centroids, labels)
        iterations += 1
        print(iterations)

    return labels

def gpu_process(image, histogram):
    process(cuda.In(image), np.int32(image.size / 4), cuda.InOut(histogram),
            block=(THREADS_PER_BLOCK, 1, 1), grid=(10, 1))
    return histogram

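# The process kernel is not shown. A common shape for such a histogram kernel, assuming
# 4-byte pixels (hence image.size / 4), a 256-bin int32 histogram, and binning on the
# first channel only (all assumptions), is a grid-stride loop with atomicAdd:
from pycuda.compiler import SourceModule

_hist_mod = SourceModule("""
__global__ void process(unsigned char *image, int n_pixels, int *histogram)
{
    // Grid-stride loop: each thread handles several pixels, since the launch uses a
    // fixed grid of 10 blocks regardless of image size.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < n_pixels;
         i += blockDim.x * gridDim.x) {
        unsigned char value = image[i * 4];   // first channel of the pixel (assumed)
        atomicAdd(&histogram[value], 1);      // histogram must hold 256 int32 bins
    }
}
""")
process = _hist_mod.get_function("process")
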
def threshold2(image1, image2, minimum, maximum):
    forme1 = image1.shape
    forme2 = image2.shape
    if np.size(forme1) > 2 and np.size(forme2) > 2:
        sys.exit('Only works on gray images')
    aSize = forme1[0] * forme1[1]
    xdim = np.int32(forme1[0])
    ydim = np.int32(forme1[1])
    dest = np.zeros(aSize).astype(np.float32)
    image2 = image2.reshape(aSize, order='F')
    minval = np.float32(minimum)
    maxval = np.float32(maximum)

    # block size: B := dim1*dim2*dim3 = 1024
    # grid size : dim1*dim2*dim3 = ceiling(aSize/B)
    blockX = int(1024)
    multiplier = aSize / float(1024)
    if aSize / float(1024) > int(aSize / float(1024)):
        gridX = int(multiplier + 1)
    else:
        gridX = int(multiplier)

    # parallel thresholding + timing
    GPUthresholding2(drv.InOut(dest), drv.In(image2), ydim, minval, maxval,
                     block=(blockX, 1, 1), grid=(gridX, 1, 1))
    dest = np.reshape(dest, forme1[0:2], order='F')
    return dest

def cuda_nms(modules, boxes, scores, yxhw=False):
    if not yxhw:
        boxes = to_yxhw(boxes)

    # Prepare data for nms on GPU. After this,
    #   boxes becomes:   [y1,x1,y2,x2,score]
    #   results becomes: [True,...]
    count = boxes.shape[0]
    boxes = np.hstack((boxes, np.expand_dims(scores, axis=1)))
    results = np.array([True] * count, dtype=bool)

    # Perform nms on GPU
    count = boxes.shape[0]
    NMS_GPU = modules.get_function("NMS_GPU")

    # use drv.InOut instead of drv.Out so the value of results can be passed in
    # Setting 1: works only when count <= 1024
    # grid_size, block_size = (1, count, 1), (count, 1, 1)
    # Setting 2: works when count > 1024
    # grid_size, block_size = (count, count, 1), (1, 1, 1)
    # Setting 3: works when count > 1024, faster than Setting 2
    block_len = 32
    grid_len = math.ceil(count / block_len)
    grid_size, block_size = (grid_len, grid_len, 1), (block_len, block_len, 1)

    NMS_GPU(drv.In(boxes), drv.InOut(results), grid=grid_size, block=block_size)
    return list(np.where(results)[0])

def cuda_nms(modules, boxes, scores, yxhw=False):
    if not yxhw:
        boxes = to_yxhw(boxes)

    n_boxes = boxes.shape[0]
    boxes = np.hstack((boxes, np.expand_dims(scores, axis=1)))
    results = np.array([True] * n_boxes, dtype=bool)

    # Perform nms on GPU
    NMS_GPU = modules.get_function("NMS_GPU")

    # use drv.InOut instead of drv.Out so the value of results can be passed in
    # Setting 1: works only when count <= 1024
    # grid_size, block_size = (1, n_boxes, 1), (n_boxes, 1, 1)
    # Setting 3: works when count > 1024, faster than Setting 2
    thread_per_block_dim = 32
    grid_len = math.ceil(n_boxes / thread_per_block_dim)
    grid_size, block_size = ((grid_len, grid_len, 1),
                             (thread_per_block_dim, thread_per_block_dim, 1))

    NMS_GPU(drv.In(boxes), drv.InOut(results), grid=grid_size, block=block_size)
    return list(np.where(results)[0])

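# Both cuda_nms variants expect `modules` to be an already-compiled SourceModule that
# defines NMS_GPU, and to_yxhw() to exist in the calling module. A hypothetical call
# (box layout and IoU threshold are handled inside that kernel) might look like:
import numpy as np

n = 2000
corners = np.random.rand(n, 2).astype(np.float32) * 100
boxes = np.hstack((corners, corners + 10)).astype(np.float32)  # assumed corner layout
scores = np.random.rand(n).astype(np.float32)

keep = cuda_nms(mod, boxes, scores)  # `mod` is the compiled SourceModule (hypothetical name)
print(len(keep))
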
def run(N_STATES=4):
    np.random.seed(42)
    fwdlattice = np.random.rand(N1+N2, N_STATES).astype(np.float32)
    bwdlattice = np.random.rand(N1+N2, N_STATES).astype(np.float32)
    framelogprob = np.random.rand(N1+N2, N_STATES).astype(np.float32)
    log_transmat = np.random.rand(N_STATES, N_STATES).astype(np.float32)
    sequence_lengths = np.array([N1, N2], dtype=np.int32)
    cum_sequence_lengths = np.array([0, N1], dtype=np.int32)
    transcounts = np.zeros((N_STATES, N_STATES), dtype=np.float32)
    n_trajs = 1

    f = mod.get_function('transitioncounts%d' % N_STATES)
    f(cuda.In(fwdlattice), cuda.In(bwdlattice), cuda.In(log_transmat), cuda.In(framelogprob),
      cuda.In(sequence_lengths), cuda.In(cum_sequence_lengths), np.int32(n_trajs),
      cuda.InOut(transcounts), grid=(1, 1), block=(256, 1, 1))
    print('cuda transcounts')
    print(transcounts)

    t2_1 = transitioncounts(fwdlattice[:N1], bwdlattice[:N1], framelogprob[:N1], log_transmat)
    # t2_2 = transitioncounts(fwdlattice[N1:], bwdlattice[N1:], framelogprob[N1:], log_transmat)
    print('reference')
    print(t2_1)

    ref = t2_1
    print('error N_STATES=%d: %f' % (N_STATES, np.linalg.norm(transcounts - ref)))

def test():
    func = SourceModule(source).get_function('log_diag_mvn_likelihood')
    n_samples = 8
    n_states = 9
    n_features = 33

    np.random.seed(42)
    sequences = np.random.rand(n_samples, n_features).astype(np.float32)
    means = np.random.rand(n_states, n_features).astype(np.float32)
    variances = np.random.rand(n_states, n_features).astype(np.float32)
    loglikelihoods = np.zeros((n_samples, n_states), dtype=np.float32)

    func(cuda.In(sequences), cuda.In(means), cuda.In(variances), cuda.In(np.log(variances)),
         np.int32(n_samples), np.int32(n_states), np.int32(n_features),
         cuda.InOut(loglikelihoods), block=(64, 1, 1), grid=(1, 1))
    print('loglikelihoods')
    print(loglikelihoods)

    print('sklearn')
    from sklearn.mixture.gmm import _log_multivariate_normal_density_diag
    r = _log_multivariate_normal_density_diag(sequences, means, variances)
    print(r)
    print(np.abs(r - loglikelihoods) < 1e-4)