def create_quantiles(data, params):
    global quantiles, q_lb, q_ub, mask
    sort_gpu(data)
    if mask.shape != data.shape:
        mask = gpuarray.zeros_like(data)
    n_lb = gpuarray.sum(data < mask)
    n_ub = gpuarray.sum(data > mask)
    fill_lb_quantiles(data, quantiles, n_lb, n_ub, q_lb,
                      block=(quantiles.shape[0], 1, 1))
    fill_ub_quantiles(data, quantiles, n_lb, n_ub, q_ub,
                      block=(quantiles.shape[0], 1, 1))
    q_lb = q_lb.reverse()
    p_ub = n_ub / (n_ub + n_lb)
    del n_lb, n_ub
    return (data, q_lb.get(), q_ub.get(),
            probs * (1 - p_ub.get()), probs * p_ub.get())

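# -- A minimal standalone sketch (assuming PyCUDA and an active CUDA context)
# of the comparison-plus-reduction idiom used above: `data < mask` yields a
# 0/1 array on the GPU, so gpuarray.sum over it counts the matching elements.
# All names below are local to the sketch, not part of the code above.
import numpy as np
import pycuda.autoinit  # noqa: F401  (creates the CUDA context)
import pycuda.gpuarray as gpuarray

data = gpuarray.to_gpu(np.random.randn(1 << 20).astype(np.float32))
mask = gpuarray.zeros_like(data)

# Count elements strictly below/above zero; the explicit dtype keeps the
# accumulator from inheriting the comparison result's narrow dtype.
n_lb = int(gpuarray.sum(data < mask, dtype=np.int32).get())
n_ub = int(gpuarray.sum(data > mask, dtype=np.int32).get())
assert n_lb + n_ub <= data.size  # elements equal to zero fall in neither bin
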
def computeEnergy(D_v, S, T, _Lambda, _gamma_c, Alpha, Beta):
    l, m, n = S.shape
    sum_alpha_beta = gpuarray.zeros_like(D_v)
    sk_linalg.dot(Beta, Alpha, out=sum_alpha_beta)
    GR = grad(T)
    square_matrix(GR, GR)
    G_norm = gpuarray.zeros_like(T)
    sum_three_matrix(GR[0, :, :, :], GR[1, :, :, :], GR[2, :, :, :],
                     G_norm, 1.0, 1.0, 1.0)
    sqrt_matrix(G_norm, G_norm)
    # multiply_matrix(G_norm, _Gamma, G_norm)
    ET = _gamma_c * gpuarray.sum(G_norm)
    SP = gpuarray.zeros_like(S)
    absolute_matrix(S, SP)
    multiply_matrix(SP, _Lambda, SP)
    ES = gpuarray.sum(SP)
    sparse = D_v - S.reshape(l * m * n, 1) - T.reshape(l * m * n, 1) - sum_alpha_beta
    square_matrix(sparse, sparse)
    EL = gpuarray.sum(sparse)
    E = 1 / 2 * EL.get() + ES.get() + ET.get()
    return EL.get(), ES.get(), ET.get(), E

def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance,
                              lengthscale, Z, variational_posterior):
    ARD = (len(lengthscale) != 1)
    N, M, Q = self.get_dimensions(Z, variational_posterior)
    psi1_gpu = self.gpuCache['psi1_gpu']
    psi2n_gpu = self.gpuCache['psi2n_gpu']
    l_gpu = self.gpuCache['l_gpu']
    Z_gpu = self.gpuCache['Z_gpu']
    mu_gpu = self.gpuCache['mu_gpu']
    S_gpu = self.gpuCache['S_gpu']
    gamma_gpu = self.gpuCache['gamma_gpu']
    dvar_gpu = self.gpuCache['dvar_gpu']
    dl_gpu = self.gpuCache['dl_gpu']
    dZ_gpu = self.gpuCache['dZ_gpu']
    dmu_gpu = self.gpuCache['dmu_gpu']
    dS_gpu = self.gpuCache['dS_gpu']
    dgamma_gpu = self.gpuCache['dgamma_gpu']
    grad_l_gpu = self.gpuCache['grad_l_gpu']
    grad_mu_gpu = self.gpuCache['grad_mu_gpu']
    grad_S_gpu = self.gpuCache['grad_S_gpu']
    grad_gamma_gpu = self.gpuCache['grad_gamma_gpu']
    log_denom1_gpu = self.gpuCache['log_denom1_gpu']
    log_denom2_gpu = self.gpuCache['log_denom2_gpu']
    log_gamma_gpu = self.gpuCache['log_gamma_gpu']
    log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']

    if self.GPU_direct:
        dL_dpsi1_gpu = dL_dpsi1
        dL_dpsi2_gpu = dL_dpsi2
        dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
    else:
        dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
        dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
        dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
        dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
        dL_dpsi0_sum = dL_dpsi0.sum()

    self.reset_derivative()
    # t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
    # print 'g_psi1compDer '+str(t)
    # t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
    # print 'g_psi2compDer '+str(t)
    self.g_psi1compDer.prepared_call(
        (self.blocknum, 1), (self.threadnum, 1, 1),
        dvar_gpu.gpudata, dl_gpu.gpudata, dZ_gpu.gpudata, dmu_gpu.gpudata,
        dS_gpu.gpudata, dgamma_gpu.gpudata, dL_dpsi1_gpu.gpudata,
        psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata,
        log_gamma1_gpu.gpudata, np.float64(variance), l_gpu.gpudata,
        Z_gpu.gpudata, mu_gpu.gpudata, S_gpu.gpudata, gamma_gpu.gpudata,
        np.int32(N), np.int32(M), np.int32(Q))
    self.g_psi2compDer.prepared_call(
        (self.blocknum, 1), (self.threadnum, 1, 1),
        dvar_gpu.gpudata, dl_gpu.gpudata, dZ_gpu.gpudata, dmu_gpu.gpudata,
        dS_gpu.gpudata, dgamma_gpu.gpudata, dL_dpsi2_gpu.gpudata,
        psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata,
        log_gamma1_gpu.gpudata, np.float64(variance), l_gpu.gpudata,
        Z_gpu.gpudata, mu_gpu.gpudata, S_gpu.gpudata, gamma_gpu.gpudata,
        np.int32(N), np.int32(M), np.int32(Q))

    dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
    sum_axis(grad_mu_gpu, dmu_gpu, N * Q, self.blocknum)
    dL_dmu = grad_mu_gpu.get()
    sum_axis(grad_S_gpu, dS_gpu, N * Q, self.blocknum)
    dL_dS = grad_S_gpu.get()
    sum_axis(grad_gamma_gpu, dgamma_gpu, N * Q, self.blocknum)
    dL_dgamma = grad_gamma_gpu.get()
    dL_dZ = dZ_gpu.get()
    if ARD:
        sum_axis(grad_l_gpu, dl_gpu, Q, self.blocknum)
        dL_dlengscale = grad_l_gpu.get()
    else:
        dL_dlengscale = gpuarray.sum(dl_gpu).get()

    return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma

def test_sum_allocator(self):
    # FIXME
    from pytest import skip
    skip("https://github.com/inducer/pycuda/issues/163")
    # crashes with
    # terminate called after throwing an instance of 'pycuda::error'
    # what(): explicit_context_dependent failed: invalid device context -
    # no currently active context?

    import pycuda.tools
    pool = pycuda.tools.DeviceMemoryPool()

    rng = np.random.randint(low=512, high=1024)

    a = gpuarray.arange(rng, dtype=np.int32)
    b = gpuarray.sum(a)
    c = gpuarray.sum(a, allocator=pool.allocate)

    # Test that we get the correct results
    assert b.get() == rng * (rng - 1) // 2
    assert c.get() == rng * (rng - 1) // 2

    # Test that result arrays were allocated with the appropriate allocator
    assert b.allocator == a.allocator
    assert c.allocator == pool.allocate

def cuda_run(self, prefix, supportK):
    print('Running Eclat in recursive: number of itemsets found:',
          len(self.support_list), end='\r')
    while supportK:
        itemset, bitvector = supportK.pop(0)
        support = gpuarray.sum(bitvector).get()
        if support >= self.min_support:
            self.support_list[frozenset(sorted(prefix + [itemset]))] = int(support)
            suffix = []
            for itemset_sub, bitvector_sub in supportK:
                if gpuarray.sum(bitvector_sub).get() >= self.min_support:
                    if self.use_optimal:
                        union_bitvector = bitvector.__mul__(bitvector_sub)
                    else:
                        union_bitvector = gpuarray.zeros_like(bitvector)
                        self.multiply(union_bitvector, bitvector, bitvector_sub,
                                      block=self.block, grid=self.grid)
                    if gpuarray.sum(union_bitvector).get() >= self.min_support:
                        suffix.append((itemset_sub, union_bitvector))
            self.cuda_run(prefix + [itemset],
                          sorted(suffix, key=lambda x: int(x[0]), reverse=True))

def apply_mds_parallel2(self):
    print("Applying parallel MDS via SMACOF...")
    current_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    graph_d = gpu.to_gpu(np.float32(self.graph))
    row_sum_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
    score_current_d = gpu.to_gpu(np.float32(np.random.uniform(0, 10, size=self.N)))
    score_next_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
    sigma_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
    delta_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
    mds2_kernel = cuda_compile(_kernel_source, 'mds2_kernel')
    stress = 1
    while stress > 0.001:
        mds2_kernel(graph_d, row_sum_d, score_current_d, score_next_d,
                    sigma_d, delta_d, np.int32(self.N),
                    block=(1024, 1, 1), grid=(int(self.N / 1024 + 1), 1))
        score_current_d = score_next_d
        score_next_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
        stress = gpu.sum(sigma_d).get() / gpu.sum(delta_d).get()
    self.outdata = score_current_d.get()
    print("Time to apply parallel MDS: %6.2f s" % (time.perf_counter() - current_time))

def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False, ddof=0,
              keepdims=False):
    global _global_cublas_allocator

    assert isinstance(ddof, numbers.Integral)

    if axis is None or len(x_gpu.shape) <= 1:
        out_shape = (1,) * len(x_gpu.shape) if keepdims else ()
        if calc_mean == False:
            return gpuarray.sum(x_gpu).reshape(out_shape)
        else:
            return gpuarray.sum(x_gpu).reshape(out_shape) / (x_gpu.dtype.type(x_gpu.size - ddof))

    if axis < 0:
        axis += 2
    if axis > 1:
        raise ValueError('invalid axis')

    if x_gpu.flags.c_contiguous:
        n, m = x_gpu.shape[1], x_gpu.shape[0]
        lda = x_gpu.shape[1]
        trans = "n" if axis == 0 else "t"
        sum_axis, out_axis = (m, n) if axis == 0 else (n, m)
    else:
        n, m = x_gpu.shape[0], x_gpu.shape[1]
        lda = x_gpu.shape[0]
        trans = "t" if axis == 0 else "n"
        sum_axis, out_axis = (n, m) if axis == 0 else (m, n)

    if calc_mean:
        alpha = (1.0 / (sum_axis - ddof))
    else:
        alpha = 1.0
    if x_gpu.dtype == np.complex64:
        gemv = cublas.cublasCgemv
    elif x_gpu.dtype == np.float32:
        gemv = cublas.cublasSgemv
    elif x_gpu.dtype == np.complex128:
        gemv = cublas.cublasZgemv
    elif x_gpu.dtype == np.float64:
        gemv = cublas.cublasDgemv

    alloc = _global_cublas_allocator
    ons = ones((sum_axis, ), x_gpu.dtype, allocator=alloc)

    if keepdims:
        out_shape = (1, out_axis) if axis == 0 else (out_axis, 1)
    else:
        out_shape = (out_axis,)

    if out is None:
        out = gpuarray.empty(out_shape, x_gpu.dtype, alloc)
    else:
        assert out.dtype == x_gpu.dtype
        assert out.size >= out_axis

    gemv(_global_cublas_handle, trans, n, m,
         alpha, x_gpu.gpudata, lda,
         ons.gpudata, 1, 0.0, out.gpudata, 1)
    return out

def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False, ddof=0,
              keepdims=False):
    global _global_cublas_allocator

    assert isinstance(ddof, numbers.Integral)

    if axis is None or len(x_gpu.shape) <= 1:
        out_shape = (1,) * len(x_gpu.shape) if keepdims else ()
        if calc_mean == False:
            return gpuarray.sum(x_gpu).reshape(out_shape)
        else:
            return gpuarray.sum(x_gpu).reshape(out_shape) / (x_gpu.dtype.type(x_gpu.size - ddof))

    if axis < 0:
        axis += 2
    if axis > 1:
        raise ValueError('invalid axis')

    if x_gpu.flags.c_contiguous:
        n, m = x_gpu.shape[1], x_gpu.shape[0]
        lda = x_gpu.shape[1]
        trans = "n" if axis == 0 else "t"
        sum_axis, out_axis = (m, n) if axis == 0 else (n, m)
    else:
        n, m = x_gpu.shape[0], x_gpu.shape[1]
        lda = x_gpu.shape[0]
        trans = "t" if axis == 0 else "n"
        sum_axis, out_axis = (n, m) if axis == 0 else (m, n)

    if calc_mean:
        alpha = (1.0 / (sum_axis - ddof))
    else:
        alpha = 1.0
    if x_gpu.dtype == np.complex64:
        gemv = cublas.cublasCgemv
    elif x_gpu.dtype == np.float32:
        gemv = cublas.cublasSgemv
    elif x_gpu.dtype == np.complex128:
        gemv = cublas.cublasZgemv
    elif x_gpu.dtype == np.float64:
        gemv = cublas.cublasDgemv

    alloc = _global_cublas_allocator
    ons = ones((sum_axis, ), x_gpu.dtype, alloc)

    if keepdims:
        out_shape = (1, out_axis) if axis == 0 else (out_axis, 1)
    else:
        out_shape = (out_axis,)

    if out is None:
        out = gpuarray.empty(out_shape, x_gpu.dtype, alloc)
    else:
        assert out.dtype == x_gpu.dtype
        assert out.size >= out_axis

    gemv(_global_cublas_handle, trans, n, m,
         alpha, x_gpu.gpudata, lda,
         ons.gpudata, 1, 0.0, out.gpudata, 1)
    return out

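# -- A hedged, self-contained sketch of the gemv trick used by _sum_axis
# above: summing a matrix along one axis is a matrix-vector product with a
# vector of ones. Assumes PyCUDA plus scikit-cuda (skcuda) are installed;
# all names below are local to the sketch.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from skcuda import cublas

h = cublas.cublasCreate()
x = np.random.rand(4, 3).astype(np.float32)
x_gpu = gpuarray.to_gpu(x)                      # C-contiguous, shape (4, 3)
ones_gpu = gpuarray.to_gpu(np.ones(4, np.float32))
out_gpu = gpuarray.empty(3, np.float32)

# cuBLAS is column-major, so the C-ordered (4, 3) buffer is seen as a 3x4
# matrix; 'n' (no transpose) then sums over the original axis 0.
cublas.cublasSgemv(h, 'n', 3, 4, np.float32(1.0), x_gpu.gpudata, 3,
                   ones_gpu.gpudata, 1, np.float32(0.0), out_gpu.gpudata, 1)
assert np.allclose(out_gpu.get(), x.sum(axis=0))
cublas.cublasDestroy(h)
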
def updateBC(ul, t):
    updateBCFunc.prepared_call(grid_Nv, block,
                               ul.ptr, self._vm.d_cvx().ptr,
                               self._d_bnd_f0[p].ptr,
                               self._bc_vals_num[p].ptr,
                               self._bc_vals_den[p].ptr, t)
    self._wall_nden[p] = -(gpuarray.sum(self._bc_vals_num[p])
                           / gpuarray.sum(self._bc_vals_den[p]))

def Average_TotalProbabilityP(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    temp = gpuarray.sum(Psi1_GPU * Psi1_GPU.conj()).get()
    temp += gpuarray.sum(Psi2_GPU * Psi2_GPU.conj()).get()
    temp += gpuarray.sum(Psi3_GPU * Psi3_GPU.conj()).get()
    temp += gpuarray.sum(Psi4_GPU * Psi4_GPU.conj()).get()
    return temp * self.dPx * self.dPy

def gpu_sharpen(kernel, orig_image):
    # allocate memory for input and output
    curr_im = np.array(orig_image, dtype=np.float64)
    next_im = np.array(orig_image, dtype=np.float64)

    # Get image data
    height, width = np.int32(orig_image.shape)
    N = height * width
    print("Processing %d x %d image" % (width, height))

    # Allocate device memory and copy host to device
    start_transfer = time.time()
    d_curr = gpu.to_gpu(curr_im)
    d_next = gpu.to_gpu(next_im)
    stop_transfer = time.time()
    host_to_device = stop_transfer - start_transfer
    print("host to device transfer time: " + str(host_to_device))

    # Block size (threads per block)
    b_size = (32, 32, 1)
    # Grid size (blocks per grid)
    g_size = (int(np.ceil(float(width) / float(b_size[0]))),
              int(np.ceil(float(height) / float(b_size[1]))))

    # Initialize the GPU event trackers for timing
    start_gpu_time = cu.Event()
    end_gpu_time = cu.Event()
    start_gpu_time.record()

    # Compute the image's initial mean and variance
    init_mean = np.float64(gpu.sum(d_curr).get()) / N
    var = ReductionKernel(dtype_out=np.float64, neutral="0",
                          reduce_expr="a+b",
                          map_expr="(x[i]-mu)*(x[i]-mu)/size",
                          arguments="double* x, double mu, double size")
    init_variance = var(d_curr, np.float64(init_mean), np.float64(N)).get()

    variance = 0
    total = 0

    # while variance is less than a 20% difference from the initial
    # variance, continue to sharpen
    while variance < 1.2 * init_variance:
        kernel(d_curr, EPSILON, d_next, height, width, block=b_size, grid=g_size)

        # Swap references to the images, next_im => curr_im
        d_curr, d_next = d_next, d_curr

        # calculate mean and variance
        mean = np.float64(gpu.sum(d_curr).get()) / N
        variance = var(d_curr, np.float64(mean), np.float64(N)).get()
        print("Mean = %f, Variance = %f" % (mean, variance))

    end_gpu_time.record()
    end_gpu_time.synchronize()
    gpu_time = start_gpu_time.time_till(end_gpu_time) * 1e-3
    print("GPU Time: %f" % gpu_time)

    return d_curr.get()

def updateBC(ul, t):
    updateBCFunc.prepared_call(grid_Nv, block,
                               ul.ptr, self._vm.d_cvx().ptr,
                               self._vm.d_cvy().ptr, self._vm.d_cvz().ptr,
                               self._bc_vals_num.ptr,
                               self._bc_vals_den.ptr, t)
    self._wall_nden = -(gpuarray.sum(self._bc_vals_num)
                        / gpuarray.sum(self._bc_vals_den))

def Norm_P_GPU(self, Psi1, Psi2, Psi3, Psi4):
    norm = gpuarray.sum(Psi1.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi2.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi3.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi4.__abs__() ** 2).get()
    norm = np.sqrt(norm * self.dPx * self.dPy)
    return norm

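# -- The same norm pattern as a standalone sketch (dPx/dPy replaced by
# explicit grid spacings, and psi standing in for one spinor component):
# accumulate sum(|psi|^2) on the device, then take the square root of the
# Riemann-sum approximation on the host.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

dPx = dPy = 0.05
psi = gpuarray.to_gpu((np.random.randn(256, 256)
                       + 1j * np.random.randn(256, 256)).astype(np.complex128))
norm = np.sqrt(gpuarray.sum(abs(psi) ** 2).get().real * dPx * dPy)
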
def gCOVAR(data1, data2):
    dA1 = gpuarray.to_gpu(data1.astype(np.float32))
    dA2 = gpuarray.to_gpu(data2.astype(np.float32))
    dM1 = gpuarray.sum(dA1) / len(data1)
    dM2 = gpuarray.sum(dA2) / len(data1)
    covar = np.float64(kn.kCOVAR(dA1, dA2, dM1, dM2).get() / len(data1))
    return covar

def softmax(x, deriv=False):
    if deriv:
        return x * (1.0 - x)
    else:
        np_t = np.array([[0.0]])
        # skcuda.misc.max(x).get(np_t)
        # x = x - np_t.ravel()[0]
        gpu.sum(cm.exp(x)).get(np_t)
        return cm.exp(x) / np_t.ravel()[0]

def gCORREL(data1, data2):
    dA1 = gpuarray.to_gpu(data1.astype(np.float32))
    dA2 = gpuarray.to_gpu(data2.astype(np.float32))
    dM1 = gpuarray.sum(dA1) / len(data1)
    dM2 = gpuarray.sum(dA2) / len(data1)
    correl = np.float64(kn.kCOVAR(dA1, dA2, dM1, dM2).get() /
                        (kn.kSTDEV(dA1, dM1).get() * kn.kSTDEV(dA2, dM2).get()) ** .5)
    return correl

def Norm_GPU(self, Psi1, Psi2, Psi3, Psi4):
    norm = gpuarray.sum(Psi1.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi2.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi3.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi4.__abs__() ** 2).get()
    norm = np.sqrt(norm * self.dX * self.dY)
    # print ' norm GPU = ', norm
    return norm

def gen_summary_stats(data):
    lb = data[data < mask]
    ub = data[data > mask]
    prob_ub = ub / lb
    n_ub = ub.size
    n_lb = lb.size
    mean_ub = gpuarray.sum(ub) / n_ub
    mean_lb = gpuarray.sum(lb) / n_lb
    var_ub = (ub - mean_ub) ** 2 / n_ub
    var_lb = (lb - mean_lb) ** 2 / n_lb

def Norm_GPU(self, Psi1, Psi2, Psi3, Psi4):
    norm = gpuarray.sum(Psi1.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi2.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi3.__abs__() ** 2).get()
    norm += gpuarray.sum(Psi4.__abs__() ** 2).get()
    norm = np.sqrt(norm * self.dX * self.dY * self.dZ)
    # print ' norm GPU = ', norm
    return norm

def ERA_probe(self, iters=1):
    exits2_gpu = self.thr.empty_like(self.exits_gpu)
    print('i, eMod, eSup')
    for i in range(iters):
        exits2_gpu = self.Pmod(self.exits_gpu)

        self.error_mod.append(gpuarray.sum(abs(self.exits_gpu - exits2_gpu) ** 2).get() / self.diffNorm)

        exits = exits2_gpu.get()
        self.Psup_probe(exits)

        self.thr.to_device(makeExits2(self.sample, self.probe, self.coords, exits),
                           dest=self.exits_gpu)

        self.error_sup.append(gpuarray.sum(abs(self.exits_gpu - exits2_gpu) ** 2).get() / self.diffNorm)

        update_progress(i / max(1.0, float(iters - 1)), 'ERA probe', i,
                        self.error_mod[-1], self.error_sup[-1])

def bloch_single_step_propagation(self, dbeta):
    """
    Perform a single step propagation with respect to the inverse temperature
    via the Bloch equation. The Wigner function is normalized before being
    returned.
    :param dbeta: (float) the inverse temperature step size
    :return: self.wignerfunction
    """
    self.p2theta_transform()
    self.bloch_expV_bulk(self.wigner_theta_x, dbeta, **self.V_bulk_mapper_params)
    self.bloch_expV_boundary(self.wigner_theta_x, dbeta, **self.V_boundary_mapper_params)
    self.theta2p_transform()

    self.x2lambda_transform()
    self.bloch_expK_bulk(self.wigner_p_lambda, dbeta, **self.K_bulk_mapper_params)
    self.bloch_expK_boundary(self.wigner_p_lambda, dbeta, **self.K_boundary_mapper_params)
    self.lambda2x_transform()

    self.p2theta_transform()
    self.bloch_expV_bulk(self.wigner_theta_x, dbeta, **self.V_bulk_mapper_params)
    self.bloch_expV_boundary(self.wigner_theta_x, dbeta, **self.V_boundary_mapper_params)
    self.theta2p_transform()

    # normalize
    self.wignerfunction /= gpuarray.sum(self.wignerfunction).get() * self.dXdP

    return self.wignerfunction

def get_wigner_time(self, wigner_current, wigner_init, t):
    """
    Calculate the integral:
        int_{H(x, p, t) > -Ip} [wigner_current(x, p) - wigner_init(x, p)] dxdp
    :param wigner_current: gpuarray containing current Wigner function
    :param wigner_init: gpuarray containing initial Wigner function
    :param t: current time
    :return: float
    """
    # If kernel calculating the wigner time is not present, compile it
    try:
        wigner_time_mapper = self._wigner_time_mapper
    except AttributeError:
        # Allocate memory to map
        self._tmp_wigner_time = gpuarray.empty(self.rho.shape, np.float64)

        wigner_time_mapper = self._wigner_time_mapper = SourceModule(
            self.wigner_time_mapper_cuda_code.format(
                cuda_consts=self.cuda_consts, K=self.K, V=self.V
            ),
        ).get_function("Kernel")

    wigner_time_mapper(self._tmp_wigner_time, wigner_current, wigner_init, t,
                       **self.rho_mapper_params)

    return gpuarray.sum(self._tmp_wigner_time).get() * self.wigner_dxdp

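# -- The closing line above is a rectangle-rule integral: sum the integrand
# sampled on the phase-space grid, then multiply by the cell area. A hedged
# standalone sketch of the same pattern (local names, 1D for brevity):
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

x = np.linspace(-5, 5, 1024, endpoint=False)
dx = x[1] - x[0]
f = gpuarray.to_gpu(np.exp(-x ** 2))

integral = gpuarray.sum(f).get() * dx   # approximates int exp(-x^2) dx = sqrt(pi)
assert abs(integral - np.sqrt(np.pi)) < 1e-6
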
def stepFunction():
    global animIter
    if showActivity:
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10), concentrationIn_d, activeBlocks_d,
                           grid=grid2D, block=block2D)
        getActivityKernel(activeBlocks_d, activeThreads_d,
                          grid=grid2D, block=block2D)
    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100. / maxVal, plotData_d, np.uint8(showActivity),
                  activeThreads_d)
    if cudaP == "float":
        [oneIteration_tex() for i in range(nIterationsPerPlot)]
    else:
        [oneIteration_sh() for i in range(nIterationsPerPlot // 2)]
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1

def get_wigner(self):
    """
    Transform the density matrix saved in self.rho into the unnormalized
    Wigner function
    :return: self.wignerfunction
    """
    # Create the density matrix out of the wavefunction
    self.psi2rho(self.wavefunction, self.wignerfunction, **self.wigner_mapper_params)

    # Step 1: Rotate by +45 degrees
    # Shear X
    cufft.fft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax1)
    self.phase_shearX(self.wignerfunction, **self.wigner_mapper_params)
    cufft.ifft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax1)

    # Shear Y
    cufft.fft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax0)
    self.phase_shearY(self.wignerfunction, **self.wigner_mapper_params)
    cufft.ifft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax0)

    # Shear X
    cufft.fft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax1)
    self.phase_shearX(self.wignerfunction, **self.wigner_mapper_params)
    cufft.ifft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax1)

    # Step 2: FFT the Blokhintsev function
    self.sign_flip(self.wignerfunction, **self.wigner_mapper_params)
    cufft.ifft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax0)
    self.sign_flip(self.wignerfunction, **self.wigner_mapper_params)

    # normalize
    self.wignerfunction /= gpuarray.sum(self.wignerfunction).get().real * self.wigner_dXdP

    return self.wignerfunction

def execute(self, solver, stream=None):
    slvr = solver

    # The gaussian shape array can be empty if
    # no gaussian sources were specified.
    gauss = np.intp(0) if np.product(slvr.gauss_shape.shape) == 0 \
        else slvr.gauss_shape
    sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
        else slvr.sersic_shape

    self.kernel(slvr.uvw, slvr.brightness, gauss, sersic,
                slvr.wavelength, slvr.antenna1, slvr.antenna2,
                slvr.jones_scalar, slvr.flag, slvr.weight_vector,
                slvr.model_vis, slvr.observed_vis, slvr.chi_sqrd_result,
                **self.get_kernel_params(slvr))

    # Call the pycuda reduction kernel.
    # Divide by the single sigma squared value if a weight vector
    # is not required. Otherwise the kernel will incorporate the
    # individual sigma squared values into the sum
    gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get()

    if not self.weight_vector:
        slvr.set_X2(gpu_sum / slvr.sigma_sqrd)
    else:
        slvr.set_X2(gpu_sum)

def sum_cuda(a, axis=None):
    """Sum of GPUArray elements in a given axis direction or all elements.

    Parameters
    ----------
    a : gpuarray
        GPUArray with elements to be operated on.
    axis : int
        Axis direction to sum through, all if None.

    Returns
    -------
    gpuarray
        GPUArray sum.

    Notes
    -----
    - This is temporary and not an efficient implementation.

    """
    if axis is not None:
        m, n = a.shape
        if axis == 0:
            func = mod.get_function('sum0_cuda')
            b = pycuda.gpuarray.empty((1, n), dtype=float32)
            func(a, b, int32(m), int32(n), block=(1, m, 1), grid=(n, 1, 1))
        elif axis == 1:
            func = mod.get_function('sum1_cuda')
            b = pycuda.gpuarray.empty((m, 1), dtype=float32)
            func(a, b, int32(m), int32(n), block=(n, 1, 1), grid=(1, m, 1))
        return b
    return cuda_array.sum(a)

def mean_variance(red_kernel, d_data, size):
    '''Return the mean and variance of a 2D array'''
    mean = gpu.sum(d_data, dtype=np.float32).get() / np.float32(size)
    # mean = sum_kernel(d_data, size).get()
    variance = red_kernel(d_data, mean).get() / np.float32(size)
    # print "Mean = %f, Variance = %f" % (mean, variance)
    return mean, variance

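# -- A possible red_kernel for the helper above (a sketch; the original
# kernel is defined elsewhere in its module): a PyCUDA ReductionKernel that
# accumulates squared deviations from a precomputed mean.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpu
from pycuda.reduction import ReductionKernel

sq_dev_sum = ReductionKernel(np.float32, neutral="0",
                             reduce_expr="a+b",
                             map_expr="(x[i]-mu)*(x[i]-mu)",
                             arguments="float *x, float mu")

d_data = gpu.to_gpu(np.random.rand(512, 512).astype(np.float32))
m, v = mean_variance(sq_dev_sum, d_data, d_data.size)
assert np.isclose(v, d_data.get().var(), rtol=1e-3)  # float32 tolerance
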
def execute(self, solver, stream=None):
    slvr = solver

    # The gaussian shape array can be empty if
    # no gaussian sources were specified.
    gauss = np.intp(0) if np.product(slvr.gauss_shape.shape) == 0 \
        else slvr.gauss_shape
    sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
        else slvr.sersic_shape

    self.kernel(slvr.uvw, slvr.brightness, gauss, sersic,
                slvr.wavelength, slvr.antenna1, slvr.antenna2,
                slvr.jones_scalar, slvr.flag, slvr.weight_vector,
                slvr.model_vis, slvr.observed_vis, slvr.chi_sqrd_result,
                **self.get_kernel_params(slvr))

    # Call the pycuda reduction kernel.
    # Divide by the single sigma squared value if a weight vector
    # is not required. Otherwise the kernel will incorporate the
    # individual sigma squared values into the sum
    gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get()

    if not self.weight_vector:
        slvr.set_X2(gpu_sum / slvr.sigma_sqrd)
    else:
        slvr.set_X2(gpu_sum)

def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False):
    global _global_cublas_allocator

    if axis is None:
        if calc_mean == False:
            return gpuarray.sum(x_gpu).get()
        else:
            return gpuarray.sum(x_gpu).get() / x_gpu.dtype.type(x_gpu.size)

    if axis < 0:
        axis += 2
    if axis > 1:
        raise ValueError('invalid axis')

    if x_gpu.flags.c_contiguous:
        n, m = x_gpu.shape[1], x_gpu.shape[0]
        lda = x_gpu.shape[1]
        trans = "n" if axis == 0 else "t"
        sum_axis, out_axis = (m, n) if axis == 0 else (n, m)
    else:
        n, m = x_gpu.shape[0], x_gpu.shape[1]
        lda = x_gpu.shape[0]
        trans = "t" if axis == 0 else "n"
        sum_axis, out_axis = (n, m) if axis == 0 else (m, n)

    alpha = (1.0 / sum_axis) if calc_mean else 1.0
    if x_gpu.dtype == np.complex64:
        gemv = cublas.cublasCgemv
    elif x_gpu.dtype == np.float32:
        gemv = cublas.cublasSgemv
    elif x_gpu.dtype == np.complex128:
        gemv = cublas.cublasZgemv
    elif x_gpu.dtype == np.float64:
        gemv = cublas.cublasDgemv

    alloc = _global_cublas_allocator
    ons = ones((sum_axis, ), x_gpu.dtype, alloc)

    if out is None:
        out = gpuarray.empty((out_axis, ), x_gpu.dtype, alloc)
    else:
        assert out.dtype == x_gpu.dtype
        assert out.size >= out_axis

    gemv(_global_cublas_handle, trans, n, m,
         alpha, x_gpu.gpudata, lda,
         ons.gpudata, 1, 0.0, out.gpudata, 1)
    return out

def integrate(self, t, dt, nacptsteps, d_ucoeff):
    sm, explicit = self._sm, self._explicit
    moment, updateMoment = sm.moment, sm.updateMomentBDF
    updateDist, consMaxwellian = sm.updateDistBDF, sm.constructMaxwellian

    L0, M = self.scratch
    U0, LU0, U = self.scratch_moms
    a1, a2, g1, b = [*self.A, *self.G, self.B]

    # debugging helpers: print a quantity and abort
    pex = lambda *v: print(*v) + exit(-1)
    psum = lambda v: pex(gpuarray.sum(v))
    pMom = lambda v: pex(v.get().reshape(-1, 5))
    pmin = lambda v: pex(gpuarray.min(v))
    pmax = lambda v: pex(gpuarray.max(v))

    # Compute the moment of the initial distribution
    moment(t, d_ucoeff, U0)

    # Compute the explicit part; L0 = -∇·f(d_ucoeff);
    explicit(t, d_ucoeff, L0)

    # Compute the moment of the explicit part
    moment(t, L0, LU0)

    # update the moments
    updateMoment(dt, a1, U0, -g1, LU0, a2, U, b)
    # pex(U.get().reshape(-1, 5))

    # implicitly construct the Maxwellian (or Gaussian, etc.) given moments
    consMaxwellian(t, U, M)
    # pex(gpuarray.sum(L0))

    if nacptsteps == -1:
        # pex(LU0.get().reshape(-1, 5))
        # pex(gpuarray.sum(d_ucoeff))
        pass

    # update the distribution
    updateDist(dt, a1, d_ucoeff, -g1, L0, b, M, a2, U, d_ucoeff)
    # pex(gpuarray.sum(d_ucoeff))

    if nacptsteps == -1:
        # print("\n>> BDF-111\n")
        # pMom(U0)
        # psum(U0)
        # psum(L0)
        # pmax(L0)
        # psum(LU0)
        # pMom(LU0)
        # psum(U)
        # psum(M)
        # psum(d_ucoeff)
        # pmin(d_ucoeff)
        # exit(-1)
        pass

def calibrate_learning_rate(self, data_provider):
    lr_multiplier = []
    for data, targets in data_provider:
        _, gradients = self.training_pass(data, targets)
        lr_multiplier.append([
            float((grad.size / gpuarray.sum(grad.__abs__())).get())
            for grad in gradients
        ])

    lr_multiplier = np.array(lr_multiplier).mean(0)
    lr_multiplier /= lr_multiplier.max()
    self.lr_multiplier = lr_multiplier.tolist()

def gSTDEV(data1):
    dA = gpuarray.to_gpu(data1.astype(np.float32))
    dM = gpuarray.sum(dA) / len(data1)
    hR = kn.kSTDEV(dA, dM).get()
    stdev = np.float64((hR / (len(data1) - 1)) ** .5)
    return stdev

def test_sum_allocator(self):
    import pycuda.tools
    pool = pycuda.tools.DeviceMemoryPool()

    rng = np.random.randint(low=512, high=1024)

    a = gpuarray.arange(rng, dtype=np.int32)
    b = gpuarray.sum(a)
    c = gpuarray.sum(a, allocator=pool.allocate)

    # Test that we get the correct results
    assert b.get() == rng * (rng - 1) // 2
    assert c.get() == rng * (rng - 1) // 2

    # Test that result arrays were allocated with the appropriate allocator
    assert b.allocator == a.allocator
    assert c.allocator == pool.allocate

def test_sum_allocator(self):
    import pycuda.tools
    pool = pycuda.tools.DeviceMemoryPool()

    rng = np.random.randint(low=512, high=1024)

    a = gpuarray.arange(rng, dtype=np.int32)
    b = gpuarray.sum(a)
    c = gpuarray.sum(a, allocator=pool.allocate)

    # Test that we get the correct results
    assert b.get() == rng * (rng - 1) // 2
    assert c.get() == rng * (rng - 1) // 2

    # Test that result arrays were allocated with the appropriate allocator
    assert b.allocator == a.allocator
    assert c.allocator == pool.allocate

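# -- Why allocator=pool.allocate matters (a sketch, assuming a PyCUDA version
# in which gpuarray.sum accepts the allocator keyword): each reduction
# otherwise performs a fresh cudaMalloc for its tiny scalar result, whereas a
# DeviceMemoryPool recycles those buffers across repeated calls.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
import pycuda.tools

pool = pycuda.tools.DeviceMemoryPool()
a = gpuarray.arange(1024, dtype=np.int32)

total = 0
for _ in range(100):  # the 100 scalar result buffers all come from the pool
    total += int(gpuarray.sum(a, allocator=pool.allocate).get())
assert total == 100 * (1024 * 1023 // 2)
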
def calibrate_learning_rate(self, data_provider, mini_batches=None):
    lr_multiplier = []
    for i, (data, targets) in enumerate(data_provider):
        if mini_batches is not None and i > mini_batches:
            break
        _, gradients = self.training_pass(data, targets)
        lr_multiplier.append([
            float((grad.size / gpuarray.sum(grad.__abs__())).get())
            for grad in gradients
        ])

    lr_multiplier = np.array(lr_multiplier).mean(0)
    lr_multiplier /= lr_multiplier.max()
    self.lr_multiplier = lr_multiplier.tolist()

def initialise(self, solver, stream=None):
    slvr = solver

    # Run the kernel once so that it's cached for use
    tmp_X2 = gpuarray.sum(slvr.chi_sqrd_result, stream=stream,
                          allocator=slvr.dev_mem_pool.allocate)

    # Return the result's memory to the pool
    tmp_X2.gpudata.free()

def test_sum(self):
    from pycuda.curandom import rand as curand

    a_gpu = curand((200000,))
    a = a_gpu.get()

    sum_a = np.sum(a)
    sum_a_gpu = gpuarray.sum(a_gpu).get()

    assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4

def test_sum(self):
    from pycuda.curandom import rand as curand

    a_gpu = curand((200000,))
    a = a_gpu.get()

    sum_a = np.sum(a)
    sum_a_gpu = gpuarray.sum(a_gpu).get()

    assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4

def wiener_like_gpu(value, v, V, a, z, t, out, err=1e-4):
    """Log-likelihood for the simple DDM including contaminants"""
    # Check if parameters are in allowed range
    if z < 0 or z > 1 or t < 0 or a <= 0 or V <= 0:
        return -np.inf

    wfpt_gpu.pdf_gpu(value, float(v), float(V), float(a), float(z), float(t),
                     err, out)
    logp = gpuarray.sum(out).get()  # cumath.log(out)).get()

    return float(logp)  # np.asscalar() was removed in recent NumPy

def create_quantiles(data, params):
    global quantiles, q_lb, q_ub, mask
    sort_gpu(data)
    if mask.shape != data.shape:
        mask = gpuarray.zeros_like(data)
    n_lb = gpuarray.sum(data < mask)
    n_ub = gpuarray.sum(data > mask)
    fill_lb_quantiles(data, quantiles, n_lb, n_ub, q_lb,
                      block=(quantiles.shape[0], 1, 1))
    fill_ub_quantiles(data, quantiles, n_lb, n_ub, q_ub,
                      block=(quantiles.shape[0], 1, 1))
    q_lb = q_lb.reverse()
    p_ub = n_ub / (n_ub + n_lb)
    del n_lb, n_ub
    return (data, q_lb.get(), q_ub.get(),
            probs * (1 - p_ub.get()), probs * p_ub.get())

def test_sum(self):
    from pycuda.curandom import rand as curand

    a_gpu = curand((200000,))
    a = a_gpu.get()

    sum_a = numpy.sum(a)

    from pycuda.reduction import get_sum_kernel
    sum_a_gpu = gpuarray.sum(a_gpu).get()

    assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4

def thunk():
    x, truth = inputs[0], inputs[1]
    context = None
    if hasattr(x[0], 'context'):
        context = x[0].context
    z = outputs[0]
    z_shape = x[0].shape
    if z[0] is None or z[0].shape != z_shape:
        z[0] = pygpu.zeros(z_shape, dtype=theano.config.floatX,
                           context=context)
    x_ptr, _ = get_tens_ptr(x[0])
    truth_ptr, _ = get_tens_ptr(truth[0])
    z_ptr, z_obj = get_tens_ptr(z[0])

    # store as gpuarray
    best_idx_ptr = gpuarray.GPUArray(shape=(np.prod(truth[0].shape[:2]),),
                                     dtype=np.int32)
    best_iou_ptr = gpuarray.GPUArray(shape=(np.prod(truth[0].shape[:2]),),
                                     dtype=np.float32)
    yolo_ptr, _ = get_yolo_info(n_classes, n_anchors, l_obj, l_noobj, anchors)

    # get best index
    index_fn(best_idx_ptr, best_iou_ptr, x_ptr, truth_ptr, yolo_ptr,
             block=(1, 1, 1), grid=(x[0].shape[0], 1, 1))

    n_total = np.int32(x[0].shape[0] * n_anchors * np.prod(x[0].shape[-2:]))
    n_matched = np.int32(gpuarray.sum(best_idx_ptr != -1).get())

    grad_fn(z_ptr, best_idx_ptr, best_iou_ptr, x_ptr, truth_ptr, yolo_ptr,
            n_matched, n_total, block=(n_anchors, 1, 1),
            grid=(x[0].shape[0], x[0].shape[2], x[0].shape[3]))

    # free all memory
    del best_idx_ptr
    del best_iou_ptr
    yolo_ptr.free()

def Thibault_probe(self, iters=1):
    exits2_gpu = self.thr.empty_like(self.exits_gpu)
    print('i \t\t eConv \t\t eSup')
    for i in range(iters):
        exits = self.exits_gpu.get()
        self.Psup_probe(exits)

        self.thr.to_device(makeExits2(self.sample, self.probe, self.coords, exits),
                           dest=exits2_gpu)

        self.error_sup.append(gpuarray.sum(abs(self.exits_gpu - exits2_gpu) ** 2).get() / self.diffNorm)

        exits2_gpu = self.exits_gpu + self.Pmod(2 * exits2_gpu - self.exits_gpu) - exits2_gpu

        self.error_conv.append(gpuarray.sum(abs(self.exits_gpu - exits2_gpu) ** 2).get() / self.diffNorm)
        self.error_mod.append(None)

        self.exits_gpu = exits2_gpu.copy()

        update_progress(i / max(1.0, float(iters - 1)), 'Thibault probe', i,
                        self.error_conv[-1], self.error_sup[-1])

def test_sum(self):
    from pycuda.curandom import rand as curand

    a_gpu = curand((200000,))
    a = a_gpu.get()

    sum_a = numpy.sum(a)

    from pycuda.reduction import get_sum_kernel
    sum_a_gpu = gpuarray.sum(a_gpu).get()

    assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4

def squared_loss(self, input_data, targets, average=True,
                 cache=None, prediction=False):
    if cache is not None:
        activations = cache
    else:
        activations = self.feed_forward(input_data, prediction=prediction)

    loss = gpuarray.sum(matrix_sum_out_axis((targets - activations) ** 2, 1))

    if average:
        loss = loss.mean()
    return float(loss.get())

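# -- An equivalent loss using only stock gpuarray calls (a sketch;
# matrix_sum_out_axis is a library helper, so the per-example sum and the
# batch average are folded into a single full reduction here):
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

targets = gpuarray.to_gpu(np.random.rand(64, 10).astype(np.float32))
activations = gpuarray.to_gpu(np.random.rand(64, 10).astype(np.float32))

diff = targets - activations
loss = float(gpuarray.sum(diff * diff).get()) / targets.shape[0]  # batch mean
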
def Average_KEnergy(self, temp_GPU, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    energy = gpuarray.sum(Psi1_GPU * Psi1_GPU.conj()).get()
    energy += gpuarray.sum(Psi2_GPU * Psi2_GPU.conj()).get()
    energy -= gpuarray.sum(Psi3_GPU * Psi3_GPU.conj()).get()
    energy -= gpuarray.sum(Psi4_GPU * Psi4_GPU.conj()).get()
    energy *= self.mass * self.c * self.c * self.dPx * self.dPy

    temp_GPU *= 0.
    temp_GPU += Psi4_GPU * Psi1_GPU.conj()
    temp_GPU += Psi1_GPU * Psi4_GPU.conj()
    temp_GPU += Psi3_GPU * Psi2_GPU.conj()
    temp_GPU += Psi2_GPU * Psi3_GPU.conj()
    temp_GPU *= self.Px_GPU
    # temp_GPU *= self.c
    energy += gpuarray.sum(temp_GPU).get() * self.dPx * self.dPy * self.c

    temp_GPU *= 0.
    temp_GPU += Psi4_GPU * Psi1_GPU.conj()
    temp_GPU -= Psi1_GPU * Psi4_GPU.conj()
    temp_GPU -= Psi3_GPU * Psi2_GPU.conj()
    temp_GPU += Psi2_GPU * Psi3_GPU.conj()
    temp_GPU *= self.Py_GPU
    # temp_GPU *= -1j
    energy += gpuarray.sum(temp_GPU).get() * self.dPx * self.dPy * self.c * (-1j)

    return energy

def test_sliceset_macroparticles(self):
    '''Tests whether the sum of all particles per slice
    is equal to the specified number of macroparticles when specifying
    z_cuts which lie outside of the bunch
    '''
    # create a bunch and a slice set encompassing the whole bunch
    z_min, z_max = -2., 2.
    bunch = self.create_bunch(zmin=z_min, zmax=z_max)
    z_cuts = (z_min - 1, z_max + 1)
    mesh = self.create_mesh(z_cuts=z_cuts)
    slice_set = MeshSlicer(mesh, context).slice(bunch)
    n_particles = gpuarray.sum(slice_set.n_macroparticles_per_slice).get()
    self.assertEqual(self.macroparticlenumber, n_particles,
                     'the SliceSet lost/added some particles')

def stepFunction():
    global animIter
    cuda.memcpy_dtod(plotDataFloat_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = (gpuarray.max(plotDataFloat_d)).get()
    multiplyByScalarReal(cudaPre(0.5 / maxVal), plotDataFloat_d)
    floatToUchar(plotDataFloat_d, plotDataChars_d)
    copyToScreenArray()
    if cudaP == "float":
        [oneIteration_tex() for i in range(nIterationsPerPlot)]
    # else: [oneIteration_sh() for i in range(nIterationsPerPlot//2)]
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1

def computeEnergy(D_v, T, _gamma, Alpha, Beta):
    l, m, n = T.shape
    sum_alpha_beta = gpuarray.zeros_like(D_v)
    sk_linalg.dot(Beta, Alpha, out=sum_alpha_beta)
    GR = grad(T)
    square_matrix(GR, GR)
    G_norm = gpuarray.zeros_like(T)
    sum_three_matrix(GR[0, :, :, :], GR[1, :, :, :], GR[2, :, :, :],
                     G_norm, 1.0, 1.0, 1.0)
    sqrt_matrix(G_norm, G_norm)
    ET = _gamma * gpuarray.sum(G_norm)
    sparse = D_v - T.reshape(l * m * n, 1) - sum_alpha_beta
    square_matrix(sparse, sparse)
    EL = gpuarray.sum(sparse)
    E = 1 / 2 * EL.get() + ET.get()
    return EL.get(), ET.get(), E

def fitness(self):
    # calculate new particle scores
    i = 0
    for p in self.particles:
        sensor_values = np.zeros((self.number_of_sensors, 3),
                                 dtype=np.float32, order='C')
        magnets = self.ball_joint.gen_magnets_angle(p['magnet_angles'])
        # the inner index is named j so it does not clobber the particle
        # counter i (the flattened original reused i for both loops)
        for sens, j in zip(self.sensors, range(0, self.number_of_sensors)):
            value = sens.getB(magnets)
            if self.normalize_magnetic_field:
                sensor_values[j] = value / np.linalg.norm(value)
            else:
                sensor_values[j] = value
        p1_gpu = gpuarray.to_gpu(sensor_values)
        out_gpu = gpuarray.empty(self.number_of_sensors ** 2, np.float32)
        number_of_samples = np.int32(self.number_of_sensors)
        bdim = (16, 16, 1)
        dx, mx = divmod(number_of_samples, bdim[0])
        dy, my = divmod(number_of_samples, bdim[1])
        gdim = (int(dx + (mx > 0)), int(dy + (my > 0)))
        # print(bdim)
        # print(gdim)
        self.distance(number_of_samples, p1_gpu, out_gpu, block=bdim, grid=gdim)
        out = np.reshape(out_gpu.get(), (number_of_samples, number_of_samples))
        # sum = 0
        # for val in out_gpu.get():
        #     sum += val
        # print(out)
        # print(sum)
        # print(gpuarray.sum(out_gpu))
        score = gpuarray.sum(out_gpu).get()
        if score > p['personal_best_score']:
            # print('score of particle %d improved from %d to %d'
            #       % (i, p['personal_best_score'], score))
            p['personal_best_score'] = score
            p['personal_best'] = p['magnet_angles']
        i += 1
        fig = plt.figure(figsize=(9, 9))
        ax1 = fig.add_subplot(111, projection='3d')
        displaySystem(magnets, subplotAx=ax1, suppress=True, direc=True)
        fig.savefig('pics/' + self.target_folder + '/' + p['name'] + '/'
                    + '%03d.png' % self.iteration)
        plt.close(fig)
        self.status_bar.update(1)

    # calculate global best score
    i = 0
    for p in self.particles:
        if p['personal_best_score'] > self.global_best_score:
            self.global_best_score = p['personal_best_score']
            self.global_best_particle = i
            print('new global best score %d of %s'
                  % (self.global_best_score, p['name']))
        i += 1

def test_sliceset_macroparticles(self): '''Tests whether the sum of all particles per slice is equal to the specified number of macroparticles when specifying z_cuts which lie outside of the bunch ''' #create a bunch and a slice set encompassing the whole bunch z_min, z_max = -2., 2. bunch = self.create_bunch(zmin=z_min, zmax=z_max) z_cuts = (z_min-1, z_max+1) mesh = self.create_mesh(z_cuts=z_cuts) slice_set = MeshSlicer(mesh, context).slice(bunch) n_particles = gpuarray.sum(slice_set.n_macroparticles_per_slice).get() self.assertEqual(self.macroparticlenumber, n_particles, 'the SliceSet lost/added some particles')
def run(self, data, get=False):
    """Function to perform drift rate conversion"""
    self.spectr_d = gpuarray.to_gpu(data)
    self.sweep_kernel(self.spectr_d, self.output_d, self.delay_table_d,
                      self.nfreqs, self.ntimes, self.ndelays,
                      block=self.block_size, grid=self.grid_size)
    if get:
        out = self.output_d.get()
        return out
    else:
        operand = self.output_d[self.ndelays // 2]
        mean = gpuarray.sum(operand / np.float32(self.nfreqs))
        var = gpuarray.sum((operand - mean) * (operand - mean)
                           / np.float32(self.nfreqs))
        std = np.sqrt(var.get())
        self.output_d = self.output_d - mean
        thresholded = self.output_d > 3 * std
        return thresholded.get()

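# -- The mean/std/threshold tail of run() restated with host-side scalars (a
# sketch with local names; pulling the mean back to the host sidesteps any
# question of broadcasting a 0-d GPUArray result against a vector):
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.random.randn(4096).astype(np.float32))
mean = float(gpuarray.sum(x).get()) / x.size
var = float(gpuarray.sum((x - mean) * (x - mean)).get()) / x.size
std = np.sqrt(var)

hits = (x - mean) > 3 * std  # elementwise comparison yields a 0/1 array
n_hits = int(gpuarray.sum(hits, dtype=np.int32).get())
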
def computeCorrespondence(self):
    """
    Compute point correspondence from result PointCloud to dst.
    The CUDA function and summation reduction are called here.
    :return: total distance and matrix with point correspondence
    """
    super(ICPParallel, self).computeCorrespondence()
    target = np.zeros([self.src.num, 3], dtype=np.float32)
    self.computeCorrespondenceCuda(cuda.In(self.result.points),
                                   cuda.Out(target),
                                   self.distances_gpu,
                                   block=(self.numCore, 1, 1))
    return gpuarray.sum(self.distances_gpu).get(), PointCloud(target)
