def minmax_pycuda(Dx_gpu, Dy_gpu):
    """Return the extrema of two GPU arrays as host values.

    Each reduction is performed through ``gpuarray.min`` / ``gpuarray.max``
    and only the reduced result is fetched with ``.get()``.

    Parameters
    ----------
    Dx_gpu, Dy_gpu
        Arrays accepted by ``gpuarray.min`` / ``gpuarray.max``.

    Returns
    -------
    tuple
        ``(min(Dx), max(Dx), min(Dy), max(Dy))`` in that order.
    """
    x_lo = gpuarray.min(Dx_gpu).get()
    x_hi = gpuarray.max(Dx_gpu).get()
    y_lo = gpuarray.min(Dy_gpu).get()
    y_hi = gpuarray.max(Dy_gpu).get()
    return (x_lo, x_hi, y_lo, y_hi)
def pixel_similarity_cuda(self, image):
    """Turn pairwise differences of ``image`` into a similarity measure.

    The pairwise differences produced by ``self.pairwise_difference`` are
    divided by their value range (max - min) via ``self.gdivide`` and the
    result is subtracted from one.

    Parameters
    ----------
    image
        Input whose first dimension (``image.shape[0]``) is forwarded to
        ``self.pairwise_difference`` as the element count.

    Returns
    -------
    The value ``1 - pairwise_difference / (max - min)``.
    """
    count = image.shape[0]
    pair_diff = self.pairwise_difference(image, count)
    # Range of the differences, used as the normalisation factor.
    value_range = gpuarray.max(pair_diff) - gpuarray.min(pair_diff)
    scaled = self.gdivide(pair_diff, value_range)
    return 1 - scaled
def _minmax_impl(a_gpu, axis, min_or_max, out, idxout, stream=None):
    """Compute a min or max reduction, writing axis results into given buffers.

    Adapted from the scikits.cuda implementation, which has no "out"
    argument.

    Parameters
    ----------
    a_gpu
        Input array with fewer than three dimensions.
    axis
        ``None`` for a reduction over the whole array, otherwise 0 or 1
        (negative values are shifted by 2).
    min_or_max
        ``'max'`` selects the maximum; any other value selects the minimum
        in the ``axis is None`` case. It is forwarded unchanged to
        ``scm._get_minmax_kernel`` otherwise.
    out, idxout
        Buffers handed to the reduction kernel as value and index targets.
        They are not touched when ``axis`` is ``None``.
    stream
        Forwarded to the kernel launch.

    Returns
    -------
    The host value of the overall extremum when ``axis`` is ``None``;
    otherwise ``None`` (the kernel is given ``out`` and ``idxout`` as its
    targets).
    """
    assert len(a_gpu.shape) < 3

    if axis is None:
        # PyCUDA has no overall argmax/argmin, so only the value is produced.
        reduce_all = gpuarray.max if min_or_max == 'max' else gpuarray.min
        return reduce_all(a_gpu).get()

    if axis < 0:
        axis += 2
    assert axis in (0, 1)

    c_order = a_gpu.flags.c_contiguous
    if c_order:
        n, m = a_gpu.shape
    else:
        n, m = a_gpu.shape[1], a_gpu.shape[0]

    col_kernel, row_kernel = scm._get_minmax_kernel(a_gpu.dtype, min_or_max)

    if (axis == 0 and c_order) or (axis == 1 and a_gpu.flags.f_contiguous):
        kernel, n_blocks = col_kernel, m
    else:
        kernel, n_blocks = row_kernel, n

    kernel(a_gpu, out, idxout, np.uint32(m), np.uint32(n),
           block=(32, 1, 1), grid=(n_blocks, 1, 1), stream=stream)
def timeStepHydro():
    """Advance the hydro state by one step, sweeping coordinates 1, 2 and 3.

    For each coordinate the conserved-variable arrays are bound to the
    textures and the flux arrays to the surfaces before
    ``setInterFlux_hll_kernel`` is launched; the flux arrays are then bound
    to the textures before ``getInterFlux_hll_kernel`` is launched.

    The time step ``dt`` is computed only during the first sweep
    (``coord == 1``) and reused for the remaining sweeps.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; in particular the placement of ``print dt``
    and of the ``copy3D_cnsv*`` calls inside the loop should be confirmed.
    """
    for coord in [ 1, 2, 3]:
        # Bind the conserved-variable arrays to the textures.
        tex_1.set_array( cnsv1_array )
        tex_2.set_array( cnsv2_array )
        tex_3.set_array( cnsv3_array )
        tex_4.set_array( cnsv4_array )
        tex_5.set_array( cnsv5_array )
        # Bind the flux arrays to the surfaces (inter-cell fluxes are written here).
        surf_1.set_array( flx1_array )
        surf_2.set_array( flx2_array )
        surf_3.set_array( flx3_array )
        surf_4.set_array( flx4_array )
        surf_5.set_array( flx5_array )
        setInterFlux_hll_kernel( np.int32( coord ), cudaPre( gamma ), cudaPre(dx), cudaPre(dy), cudaPre(dz), cnsv1_d, cnsv2_d, cnsv3_d, cnsv4_d, cnsv5_d, times_d, grid=grid3D, block=block3D )
        if coord == 1:
            # dt is c0 times the smallest entry of times_d; it is only
            # evaluated on the first sweep.
            dt = c0 * gpuarray.min( times_d ).get()
            print dt
        # Bind the flux arrays to the textures so the fluxes can be read back.
        tex_1.set_array( flx1_array )
        tex_2.set_array( flx2_array )
        tex_3.set_array( flx3_array )
        tex_4.set_array( flx4_array )
        tex_5.set_array( flx5_array )
        getInterFlux_hll_kernel( np.int32( coord ), cudaPre( dt ), cudaPre( gamma ), cudaPre(dx), cudaPre(dy), cudaPre(dz), cnsv1_d, cnsv2_d, cnsv3_d, cnsv4_d, cnsv5_d, gForceX_d, gForceY_d, gForceZ_d, gravWork_d, grid=grid3D, block=block3D )
        # presumably refreshes cnsv*_array from cnsv*_d for the next sweep -- TODO confirm
        copy3D_cnsv1()
        copy3D_cnsv2()
        copy3D_cnsv3()
        copy3D_cnsv4()
        copy3D_cnsv5()
def minimum_cuda(a, b=None):
    """Minimum of one GPUArray, or element-wise minimum of two.

    Parameters
    ----------
    a : gpuarray
        First GPUArray.
    b : gpuarray, optional
        Second GPUArray. When omitted, the reduction is taken over ``a``
        alone.

    Returns
    -------
    gpuarray
        ``cuda_array.minimum(a, b)`` when ``b`` is given, otherwise
        ``cuda_array.min(a)``.

    Examples
    --------
    >>> a = minimum_cuda(give_cuda([1, 2, 3]), give_cuda([3, 2, 1]))
    [1, 2, 1]

    >>> type(a)
    <class 'pycuda.gpuarray.GPUArray'>
    """
    if b is None:
        # Single-array case: reduce over all of ``a``.
        return cuda_array.min(a)
    return cuda_array.minimum(a, b)
def _minmax_impl(a_gpu, axis, min_or_max, stream=None):
    """Compute the min or max of a GPU array, optionally with its indices.

    Parameters
    ----------
    a_gpu
        Input array with fewer than three dimensions and a non-complex dtype.
    axis
        ``None`` for a reduction over the whole array, otherwise 0 or 1
        (negative values are shifted by 2).
    min_or_max
        ``'max'`` selects the maximum; any other value selects the minimum
        in the ``axis is None`` case. It is forwarded unchanged to
        ``_get_minmax_kernel`` otherwise.
    stream
        Forwarded to the kernel launch.

    Returns
    -------
    With ``axis is None``: the host value of the overall extremum (no index
    is available). Otherwise a tuple ``(target, idx)`` of newly allocated
    GPU arrays that were passed to the kernel as value and index outputs.

    Raises
    ------
    ValueError
        If ``a_gpu`` has a complex dtype.
    """
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None:
        # Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        return gpuarray.min(a_gpu).get()

    if axis < 0:
        axis += 2
    assert axis in (0, 1)

    alloc = _global_cublas_allocator

    if a_gpu.flags.c_contiguous:
        n, m = a_gpu.shape
    else:
        n, m = a_gpu.shape[1], a_gpu.shape[0]
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)

    use_col_kernel = ((axis == 0 and a_gpu.flags.c_contiguous) or
                      (axis == 1 and a_gpu.flags.f_contiguous))
    if use_col_kernel:
        kernel, length = col_kernel, m
    else:
        kernel, length = row_kernel, n

    # Both branches allocate with the input's dtype. The row branch used to
    # pass the array itself as ``dtype``, which only worked if NumPy fell
    # back to the object's ``.dtype`` attribute.
    target = gpuarray.empty(length, dtype=a_gpu.dtype, allocator=alloc)
    idx = gpuarray.empty(length, dtype=np.uint32, allocator=alloc)
    kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
           block=(32, 1, 1), grid=(length, 1, 1), stream=stream)
    return target, idx
def integrate(self, t, dt, nacptsteps, d_ucoeff):
    """Perform one step of the scheme, updating ``d_ucoeff`` in place.

    The sequence is: take the moments of the current distribution, evaluate
    the explicit operator and its moments, update the moments, construct the
    equilibrium distribution from the updated moments, and finally update
    the distribution.

    Parameters
    ----------
    t
        Current time, forwarded to the moment/explicit/Maxwellian routines.
    dt
        Time-step size, forwarded to the two update routines.
    nacptsteps
        Accepted-step counter. It is not used by the computation; it is
        kept for interface compatibility (it previously only gated disabled
        debug output).
    d_ucoeff
        Distribution coefficients; passed to ``updateDistBDF`` both as an
        input and as the final (output) argument.

    Returns
    -------
    None
    """
    sm, explicit = self._sm, self._explicit
    moment, updateMoment = sm.moment, sm.updateMomentBDF
    updateDist, consMaxwellian = sm.updateDistBDF, sm.constructMaxwellian

    L0, M = self.scratch
    U0, LU0, U = self.scratch_moms
    a1, a2, g1, b = [*self.A, *self.G, self.B]

    # Compute the moment of the initial distribution
    moment(t, d_ucoeff, U0)

    # Compute the explicit part; L0 = -div f(d_ucoeff)
    explicit(t, d_ucoeff, L0)

    # Compute the moment of the explicit part
    moment(t, L0, LU0)

    # Update the moments
    updateMoment(dt, a1, U0, -g1, LU0, a2, U, b)

    # Implicitly construct the Maxwellian (or Gaussian, etc.) given moments
    consMaxwellian(t, U, M)

    # Update the distribution
    updateDist(dt, a1, d_ucoeff, -g1, L0, b, M, a2, U, d_ucoeff)
def _minmax_impl(a_gpu, axis, min_or_max, stream=None, keepdims=False):
    """Compute the min or max of a GPU array together with its indices.

    Parameters
    ----------
    a_gpu
        Input array with fewer than three dimensions and a non-complex dtype.
    axis
        ``None`` for a reduction over the whole array, otherwise 0 or 1
        (negative values are shifted by 2). Ignored for inputs with at most
        one dimension, which are always reduced as a whole.
    min_or_max
        ``'max'`` selects the maximum; any other value selects the minimum
        in the whole-array case. It is forwarded unchanged to
        ``_get_minmax_kernel`` otherwise.
    stream
        Forwarded to the kernel launch.
    keepdims
        When true, the reduced axis is kept with length one in the
        per-axis outputs.

    Returns
    -------
    tuple
        ``(values, indices)``. For the whole-array reduction ``values`` is
        the reduced result reshaped to ``(1,) * ndim`` and ``indices`` is
        ``None``. Otherwise both are newly allocated GPU arrays that were
        passed to the kernel as value and index outputs.

    Raises
    ------
    ValueError
        If ``a_gpu`` has a complex dtype.
    """
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None or len(a_gpu.shape) <= 1:
        # Note: PyCUDA doesn't have an overall argmax/argmin!
        out_shape = (1,) * len(a_gpu.shape)
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).reshape(out_shape), None
        return gpuarray.min(a_gpu).reshape(out_shape), None

    if axis < 0:
        axis += 2
    assert axis in (0, 1)

    alloc = _global_cublas_allocator

    if a_gpu.flags.c_contiguous:
        n, m = a_gpu.shape
    else:
        n, m = a_gpu.shape[1], a_gpu.shape[0]
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)

    use_col_kernel = ((axis == 0 and a_gpu.flags.c_contiguous) or
                      (axis == 1 and a_gpu.flags.f_contiguous))
    if use_col_kernel:
        kernel, length = col_kernel, m
    else:
        kernel, length = row_kernel, n

    if keepdims:
        out_shape = (1, length) if axis == 0 else (length, 1)
    else:
        out_shape = (length,)

    # Both branches allocate with the input's dtype. The row branch used to
    # pass the array itself as ``dtype``, which only worked if NumPy fell
    # back to the object's ``.dtype`` attribute.
    target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
    idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
    kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
           block=(32, 1, 1), grid=(length, 1, 1), stream=stream)
    return target, idx
def get_K_min(self):
    """
    Return the minimum of the kinetic energy.

    The CUDA source template ``self.fill_V_K`` is formatted and compiled,
    its ``fill_K`` kernel is launched on ``self.weighted`` (which is used as
    the work buffer), and the smallest entry is fetched to the host.
    """
    kernel_source = self.fill_V_K.format(
        cuda_consts=self.cuda_consts, K=self.K, V=self.V
    )
    module = SourceModule(kernel_source)

    # Evaluate the kinetic energy into the work buffer.
    fill_kinetic = module.get_function("fill_K")
    fill_kinetic(self.weighted, **self.wigner_mapper_params)

    smallest = gpuarray.min(self.weighted)
    return smallest.get()
def get_K_min(self):
    """
    Return the minimum of the kinetic energy.

    A zero-initialised float64 buffer of shape ``(P.size, P.size)`` is
    allocated, the ``fill_K`` kernel compiled from ``self.fill_V_K`` is
    launched on it, and the smallest entry is fetched to the host.
    """
    # Work buffer for the kinetic energy values.
    n_p = self.P.size
    kinetic = gpuarray.zeros((n_p, n_p), np.float64)

    kernel_source = self.fill_V_K.format(
        cuda_consts=self.cuda_consts, K=self.K, V=self.V
    )
    module = SourceModule(kernel_source)

    fill_kinetic = module.get_function("fill_K")
    fill_kinetic(kinetic, **self.rho_mapper_params)

    smallest = gpuarray.min(kinetic)
    return smallest.get()
def get_K_min(self):
    """
    Return the minimum of the kinetic energy in the lambda p space.

    A zero-initialised float64 buffer of shape ``(P.size, Lambda.size)`` is
    allocated, then the ``fill_K_bulk`` and ``fill_K_boundary`` kernels
    compiled from ``self.fill_V_K`` are launched on it in that order, and
    the smallest entry is fetched to the host.
    """
    # Work buffer for the kinetic energy values.
    kinetic = gpuarray.zeros((self.P.size, self.Lambda.size), np.float64)

    kernel_source = self.fill_V_K.format(
        cuda_consts=self.cuda_consts, K=self.K, V=self.V
    )
    module = SourceModule(kernel_source)

    # Bulk first, boundary second -- same order as the launches always had.
    fill_bulk = module.get_function("fill_K_bulk")
    fill_bulk(kinetic, **self.K_bulk_mapper_params)
    fill_boundary = module.get_function("fill_K_boundary")
    fill_boundary(kinetic, **self.K_boundary_mapper_params)

    smallest = gpuarray.min(kinetic)
    return smallest.get()
def get_V_min(self):
    """
    Return the minimum of the potential energy in the x theta space.

    A zero-initialised float64 buffer of shape ``(Theta.size, X.size)`` is
    allocated, then the ``fill_V_bulk`` and ``fill_V_boundary`` kernels
    compiled from ``self.fill_V_K`` are launched on it in that order, and
    the smallest entry is fetched to the host.
    """
    # Work buffer for the potential energy values.
    potential = gpuarray.zeros((self.Theta.size, self.X.size), np.float64)

    kernel_source = self.fill_V_K.format(
        cuda_consts=self.cuda_consts, K=self.K, V=self.V
    )
    module = SourceModule(kernel_source)

    # Bulk first, boundary second -- same order as the launches always had.
    fill_bulk = module.get_function("fill_V_bulk")
    fill_bulk(potential, **self.V_bulk_mapper_params)
    fill_boundary = module.get_function("fill_V_boundary")
    fill_boundary(potential, **self.V_boundary_mapper_params)

    smallest = gpuarray.min(potential)
    return smallest.get()
def implicit_iteration():
    """Run one implicit iteration on ``psi_d`` and refresh the global ``alpha``.

    The steps are: forward FFT of ``psi_d``; partial derivatives in X and Y
    (computed from the transform and transformed back); ``implicitStep1``;
    forward FFT of ``G_d``; ``implicitStep2``; inverse FFT back into
    ``psi_d``; normalisation; and finally ``alpha`` is set to the midpoint
    between the largest and smallest entries of ``alphas_d``.
    """
    global alpha

    # Forward transform: psi_d -> psiFFT_d
    fftPlan.execute(psi_d, psiFFT_d)

    # Partial derivatives, then transform them back.
    getPartialsXY(
        Lx, Ly, psiFFT_d, partialX_d, fftKx_d, partialY_d, fftKy_d,
        block=block3D, grid=grid3D,
    )
    fftPlan.execute(partialX_d, inverse=True)
    fftPlan.execute(partialY_d, inverse=True)

    implicitStep1(
        xMin, yMin, zMin, dx, dy, dz, alpha, omega,
        gammaX, gammaY, gammaZ,
        partialX_d, partialY_d, psi_d, G_d, x0, y0,
        grid=grid3D, block=block3D,
    )
    fftPlan.execute(G_d)

    implicitStep2(
        dtImag, fftKx_d, fftKy_d, fftKz_d, alpha, psiFFT_d, G_d,
        block=block3D, grid=grid3D,
    )
    fftPlan.execute(psiFFT_d, psi_d, inverse=True)

    # The boundary-condition kernel (setBoundryConditionsKernel) is
    # intentionally not launched here.
    normalize(dx, dy, dz, psi_d)

    # Recompute the alphas and take the midpoint of their range.
    getAlphas(
        dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ,
        psi_d, alphas_d,
        block=block3D, grid=grid3D,
    )
    alpha_hi = gpuarray.max(alphas_d)
    alpha_lo = gpuarray.min(alphas_d)
    midpoint = 0.5 * (alpha_hi + alpha_lo)
    alpha = cudaPre(midpoint.get())  # candidate for optimisation
def timeTransition():
    """Toggle between real and imaginary dynamics.

    Flips the global ``realDynamics`` flag and clears ``applyTransition``.
    When switching to real dynamics, ``psi_d`` is copied device-to-device
    into ``psiK2_d`` and ``psiRunge_d``. When switching to imaginary
    dynamics, the global ``alpha`` is recomputed as the midpoint between the
    largest and smallest entries of ``alphas_d``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm that all four ``copy3Dpsi*`` calls
    (and only those) belong to the ``if realTEXTURE:`` body.
    """
    global realDynamics, alpha, applyTransition
    realDynamics = not realDynamics
    applyTransition = False
    if realDynamics:
        # Seed the auxiliary buffers with the current psi.
        cuda.memcpy_dtod(psiK2_d.ptr, psi_d.ptr, psi_d.nbytes)
        cuda.memcpy_dtod(psiRunge_d.ptr, psi_d.ptr, psi_d.nbytes)
        if realTEXTURE:
            # presumably refreshes the texture-backed copies of psiK1/psiK2 -- TODO confirm
            copy3DpsiK1Real()
            copy3DpsiK1Imag()
            copy3DpsiK2Real()
            copy3DpsiK2Imag()
        print "Real Dynamics"
    else:
        # Recompute the alphas and take the midpoint of their range.
        getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D)
        alpha= cudaPre( ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() ) # candidate for optimisation
        print "Imaginary Dynamics"
def __call__(self, theta):
    """Evaluate the summed log-likelihood for ``theta``.

    Increments ``self.nCalls``, validates the length of ``theta`` against
    ``self.cpa_space.d``, runs ``self.calc_ll(theta)`` and returns the sum
    of ``self.ll.gpu`` as computed by ``sum_krnl`` and fetched with
    ``.get()``.

    Parameters
    ----------
    theta
        Parameter vector; ``len(theta)`` must equal ``self.cpa_space.d``.

    Raises
    ------
    ValueError
        If the length of ``theta`` does not match. The exception arguments
        are ``(shape, expected_d)``.
    """
    self.nCalls += 1
    expected = self.cpa_space.d
    if len(theta) != expected:
        # Plain sequences have no ``.shape``; fall back to their length so
        # that the intended ValueError is raised, not an AttributeError.
        shape = getattr(theta, "shape", (len(theta),))
        raise ValueError(shape, expected)

    self.calc_ll(theta)
    # The sum over all points is the objective. The former alternative
    # (min * len) sat in an unreachable branch and has been removed.
    return sum_krnl(self.ll.gpu, stream=None).get()
def _minmax_impl(a_gpu, axis, min_or_max, stream=None, keepdims=False):
    """Compute the min or max of a GPU array together with its indices.

    Parameters
    ----------
    a_gpu
        Input array with fewer than three dimensions and a non-complex dtype.
    axis
        ``None`` for a reduction over the whole array, otherwise 0 or 1
        (negative values are shifted by 2). Ignored for inputs with at most
        one dimension, which are always reduced as a whole.
    min_or_max
        ``"max"`` selects the maximum; any other value selects the minimum
        in the whole-array case. It is forwarded unchanged to
        ``_get_minmax_kernel`` otherwise.
    stream
        Forwarded to the kernel launch.
    keepdims
        When true, the reduced axis is kept with length one in the
        per-axis outputs.

    Returns
    -------
    tuple
        ``(values, indices)``. For the whole-array reduction ``values`` is
        the reduced result reshaped to ``(1,) * ndim`` and ``indices`` is
        ``None``. Otherwise both are newly allocated GPU arrays that were
        passed to the kernel as value and index outputs.

    Raises
    ------
    ValueError
        If ``a_gpu`` has a complex dtype.
    """
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    ndim = len(a_gpu.shape)
    if axis is None or ndim <= 1:
        # Note: PyCUDA doesn't have an overall argmax/argmin!
        out_shape = (1,) * ndim
        if min_or_max == "max":
            return gpuarray.max(a_gpu).reshape(out_shape), None
        return gpuarray.min(a_gpu).reshape(out_shape), None

    if axis < 0:
        axis += 2
    assert axis in (0, 1)

    alloc = _global_cublas_allocator

    if a_gpu.flags.c_contiguous:
        n, m = a_gpu.shape
    else:
        n, m = a_gpu.shape[1], a_gpu.shape[0]
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)

    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        kernel, length = col_kernel, m
    else:
        kernel, length = row_kernel, n

    if keepdims:
        out_shape = (1, length) if axis == 0 else (length, 1)
    else:
        out_shape = (length,)

    # Both branches allocate with the input's dtype. The row branch used to
    # pass the array itself as ``dtype``, which only worked if NumPy fell
    # back to the object's ``.dtype`` attribute.
    target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
    idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
    kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
           block=(32, 1, 1), grid=(length, 1, 1), stream=stream)
    return target, idx
def __call__(self, theta):
    """Evaluate the summed log-likelihood for ``theta``.

    Increments ``self.nCalls``, validates the length of ``theta`` against
    ``self.cpa_space.d``, runs ``self.calc_ll(theta)`` and returns the sum
    of ``self.ll.gpu`` as computed by ``sum_krnl`` and fetched with
    ``.get()``.

    Parameters
    ----------
    theta
        Parameter vector; ``len(theta)`` must equal ``self.cpa_space.d``.

    Raises
    ------
    ValueError
        If the length of ``theta`` does not match. The exception arguments
        are ``(shape, expected_d)``.
    """
    self.nCalls += 1
    expected = self.cpa_space.d
    if len(theta) != expected:
        # Plain sequences have no ``.shape``; fall back to their length so
        # that the intended ValueError is raised, not an AttributeError.
        shape = getattr(theta, "shape", (len(theta),))
        raise ValueError(shape, expected)

    self.calc_ll(theta)
    # The sum over all points is the objective. The former alternative
    # (min * len) sat in an unreachable branch and has been removed.
    return sum_krnl(self.ll.gpu, stream=None).get()
def computeDt(self):
    """Return half of the smallest entry of ``self.cfl_data``.

    The reduction is issued on ``self.stream`` and its result is fetched
    to the host before scaling.
    """
    smallest = gpuarray.min(self.cfl_data, stream=self.stream)
    host_value = smallest.get()
    return host_value * 0.5
def min(self):
    """Return the minimum of ``self.arr``.

    The reduction is done with ``gpuarray.min``; the fetched result is
    passed through its own ``.min()`` before being returned.
    """
    reduced = gpuarray.min(self.arr)
    fetched = reduced.get()
    return fetched.min()
def normalize(array):
    """Shift and scale an array using its minimum and maximum.

    The input is converted with ``asgpuarray``; the minimum is then
    subtracted in place and the result is divided in place by its
    (post-shift) maximum.

    Returns
    -------
    The converted array after both in-place operations.
    """
    data = asgpuarray(array)
    # In-place operations: the converted array itself is modified.
    data -= gpuarray.min(data)
    data /= gpuarray.max(data)
    return data
# Demonstration of gpuarray constructors, element-wise selection and
# reductions. ``a`` and ``dtype`` are defined outside this excerpt.

# Uninitialised 100x100 array.
c = gpuarray.empty((100, 100), dtype=dtype)
print('c:\n{0}\nshape={1}\n'.format(c, c.shape))

# Zero-filled 100x100 array.
d = gpuarray.zeros((100, 100), dtype=dtype)
print('d:\n{0}\nshape={1}\n'.format(d, d.shape))

# Range from 0.0 up to 100.0 in steps of 1.0.
e = gpuarray.arange(0.0, 100.0, 1.0, dtype=dtype)
print('e:\n{0}\nshape={1}\n'.format(e, e.shape))

# Element-wise selection: takes (e - 100) where the criterion (e < 50) is
# positive and (e + 100) elsewhere.
f = gpuarray.if_positive(e < 50, e - 100, e + 100)
print('f:\n{0}\nshape={1}\n'.format(f, f.shape))

# Same selection with constant branches: ones where e < 50, zeros elsewhere.
g = gpuarray.if_positive(e < 50, gpuarray.ones_like(e), gpuarray.zeros_like(e))
print('g:\n{0}\nshape={1}\n'.format(g, g.shape))

# Element-wise maximum / minimum of two arrays.
h = gpuarray.maximum(e, f)
print('h:\n{0}\nshape={1}\n'.format(h, h.shape))
i = gpuarray.minimum(e, f)
print('i:\n{0}\nshape={1}\n'.format(i, i.shape))

# Reductions over ``a``; the value and the type of each result are printed.
# NOTE(review): this rebinds ``g`` (the mask computed above); the
# alphabetical naming suggests ``j`` may have been intended -- confirm.
g = gpuarray.sum(a)
print(g, type(g))
k = gpuarray.max(a)
print(k, type(k))
l = gpuarray.min(a)
print(l, type(l))
fftKx_h[i] = i*2*np.pi/Lx for i in range(nWidth/2, nWidth): fftKx_h[i] = (i-nWidth)*2*np.pi/Lx for i in range(nHeight/2): fftKy_h[i] = i*2*np.pi/Ly for i in range(nHeight/2, nHeight): fftKy_h[i] = (i-nHeight)*2*np.pi/Ly for i in range(nDepth/2): fftKz_h[i] = i*2*np.pi/Lz for i in range(nDepth/2, nDepth): fftKz_h[i] = (i-nDepth)*2*np.pi/Lz psi_d = gpuarray.to_gpu(psi_h) alphas_d = gpuarray.to_gpu( np.zeros_like(psi_h.real) ) normalize( dx, dy, dz, psi_d ) getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D) alpha=( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() psiMod_d = gpuarray.to_gpu( np.zeros_like(psi_h.real) ) psiFFT_d = gpuarray.to_gpu( np.zeros_like(psi_h) ) partialX_d = gpuarray.to_gpu( np.zeros_like(psi_h) ) partialY_d = gpuarray.to_gpu( np.zeros_like(psi_h) ) G_d = gpuarray.to_gpu( np.zeros_like(psi_h) ) fftKx_d = gpuarray.to_gpu( fftKx_h ) #OPTIMIZATION fftKy_d = gpuarray.to_gpu( fftKy_h ) fftKz_d = gpuarray.to_gpu( fftKz_h ) activity_d = gpuarray.to_gpu( np.ones( nBlocks3D, dtype=np.uint8 ) ) psiOther_d = gpuarray.to_gpu( np.zeros_like(psi_h.real) ) psiK1_d = gpuarray.to_gpu( psi_h ) psiK2_d = gpuarray.to_gpu( psi_h ) psiRunge_d = gpuarray.to_gpu( psi_h ) #For FFT version laplacian_d = gpuarray.to_gpu( np.zeros_like(psi_h) )