def stepFuntion():
    getModulo( psi_d, psiMod_d )
    maxVal = (gpuarray.max(psiMod_d)).get()
    multiplyByScalarReal( cudaPre(0.95/(maxVal)), psiMod_d )
    sendModuloToUCHAR( psiMod_d, plotData_d)
    copyToScreenArray()
    if volumeRender.nTextures == 2:
        if not realDynamics:
            cuda.memset_d8(activity_d.ptr, 0, nBlocks3D )
            findActivityKernel( cudaPre(0.001), psi_d, activity_d, grid=grid3D, block=block3D )
        if plotVar == 1:
            getActivityKernel( psiOther_d, activity_d, grid=grid3D, block=block3D )
        if plotVar == 0:
            if realTEXTURE:
                tex_psiReal.set_array( psiK2Real_array )
                tex_psiImag.set_array( psiK2Imag_array )
                getVelocity_texKernel( dx, dy, dz, psi_d, activity_d, psiOther_d, grid=grid3D, block=block3D )
            else:
                getVelocityKernel( np.int32(neighbors), dx, dy, dz, psi_d, activity_d, psiOther_d, grid=grid3D, block=block3D )
        maxVal = (gpuarray.max(psiOther_d)).get()
        if maxVal > 0:
            multiplyByScalarReal( cudaPre(1./maxVal), psiOther_d )
        sendModuloToUCHAR( psiOther_d, plotData_d_1)
        copyToScreenArray_1()
    if applyTransition:
        timeTransition()
    if realDynamics:
        realStep()
    else:
        imaginaryStep()
def minmax_pycuda(Dx_gpu, Dy_gpu):
    """
    Given two GPUArrays, finds and returns their mins and maxes.
    This is all done in the GPU; only the mins/maxes are sent back to the host.
    """
    return (gpuarray.min(Dx_gpu).get(), gpuarray.max(Dx_gpu).get(),
            gpuarray.min(Dy_gpu).get(), gpuarray.max(Dy_gpu).get())
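# --- Added usage sketch (not from the original source): exercises minmax_pycuda
# on two throwaway GPUArrays. Assumes a CUDA context from pycuda.autoinit and
# the same gpuarray/np imports the snippet above relies on.
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates the context
import pycuda.gpuarray as gpuarray

Dx_gpu = gpuarray.to_gpu(np.random.rand(1 << 20).astype(np.float32))
Dy_gpu = gpuarray.to_gpu(np.random.rand(1 << 20).astype(np.float32))
xmin, xmax, ymin, ymax = minmax_pycuda(Dx_gpu, Dy_gpu)
print(xmin, xmax, ymin, ymax)  # four host-side scalars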
def gpu_getmax(map):
    """
    Use pycuda to get the maximum absolute deviation of the residual map,
    with the correct sign.
    """
    imax = gpu.max(cumath.fabs(map)).get()
    if gpu.max(map).get() != imax:
        imax *= -1
    return np.float32(imax)
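# --- Added usage sketch (assumption: the imports below match the aliases used
# by gpu_getmax above -- pycuda.gpuarray as gpu, pycuda.cumath as cumath).
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpu
import pycuda.cumath as cumath

residual_map = gpu.to_gpu(np.random.randn(512, 512).astype(np.float32))
# Returns the largest |value| in the map, negated if that extremum is negative.
print(gpu_getmax(residual_map))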
def maximum_cuda(a, b=None):
    """Maximum values of two GPUArrays.

    Parameters
    ----------
    a : gpuarray
        First GPUArray.
    b : gpuarray
        Second GPUArray.

    Returns
    -------
    gpuarray
        Element-wise maxima of both GPUArrays, or the single maximum value if
        only one GPUArray is given.

    Examples
    --------
    >>> a = maximum_cuda(give_cuda([1, 2, 3]), give_cuda([3, 2, 1]))
    [3, 2, 3]

    >>> type(a)
    <class 'pycuda.gpuarray.GPUArray'>
    """
    if b is not None:
        return cuda_array.maximum(a, b)
    return cuda_array.max(a)
def stepFunction():
    global animIter
    if showActivity:
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10), concentrationIn_d, activeBlocks_d, grid=grid2D, block=block2D)
        getActivityKernel(activeBlocks_d, activeThreads_d, grid=grid2D, block=block2D)
    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr, concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100. / maxVal, plotData_d, np.uint8(showActivity), activeThreads_d)
    if cudaP == "float":
        [oneIteration_tex() for i in range(nIterationsPerPlot)]
    else:
        [oneIteration_sh() for i in range(nIterationsPerPlot // 2)]
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
def _minmax_impl(a_gpu, axis, min_or_max, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.'''
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None:
        ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
        assert axis in (0, 1)

        global _global_cublas_allocator
        alloc = _global_cublas_allocator

        n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
        col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
        if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
            target = gpuarray.empty(m, dtype=a_gpu.dtype, allocator=alloc)
            idx = gpuarray.empty(m, dtype=np.uint32, allocator=alloc)
            col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                       block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
        else:
            # dtype was `a_gpu` in the original, which np.dtype() rejects; use the array's dtype
            target = gpuarray.empty(n, dtype=a_gpu.dtype, allocator=alloc)
            idx = gpuarray.empty(n, dtype=np.uint32, allocator=alloc)
            row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                       block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
    return target, idx
def pixel_similarity_cuda(self, image):
    N = image.shape[0]
    nd = self.pairwise_difference(image, N)
    diff = gpuarray.max(nd) - gpuarray.min(nd)
    norm = self.gdivide(nd, diff)
    C = 1 - norm
    return C
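# --- Added illustrative sketch (not part of the class above): the same min/max
# range normalisation on a plain GPUArray, with the range reduced on the GPU
# and only two scalars copied back. `nd` is a stand-in for the pairwise
# difference array.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

nd = gpuarray.to_gpu(np.random.rand(64, 64).astype(np.float32))
lo = gpuarray.min(nd).get()
hi = gpuarray.max(nd).get()
norm = (nd - float(lo)) / float(hi - lo)  # values scaled into [0, 1]
C = 1.0 - norm                            # similarity, as in the method above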
def stepFuntion():
    maxVal = ( gpuarray.max( cnsv1_d ) ).get()
    convertToUCHAR( cudaPre( 0.95/maxVal ), cnsv1_d, plotData_d)
    copyToScreenArray()
    timeStepHydro()
    if usingGravity:
        getGravForce()
def _minmax_impl(a_gpu, axis, min_or_max, out, idxout, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.
    Hacked together from scikits.cuda code, since that doesn't have an
    "out" argument.'''
    assert len(a_gpu.shape) < 3
    if axis is None:
        ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
        assert axis in (0, 1)

        n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
        col_kernel, row_kernel = scm._get_minmax_kernel(a_gpu.dtype, min_or_max)
        target = out
        idx = idxout
        if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
            col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                       block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
        else:
            row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                       block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
def integrate(self, t, dt, nacptsteps, d_ucoeff):
    sm, explicit = self._sm, self._explicit
    moment, updateMoment = sm.moment, sm.updateMomentBDF
    updateDist, consMaxwellian = sm.updateDistBDF, sm.constructMaxwellian
    L0, M = self.scratch
    U0, LU0, U = self.scratch_moms
    a1, a2, g1, b = [*self.A, *self.G, self.B]

    pex = lambda *v: print(*v) + exit(-1)
    psum = lambda v: pex(gpuarray.sum(v))
    pMom = lambda v: pex(v.get().reshape(-1, 5))
    pmin = lambda v: pex(gpuarray.min(v))
    pmax = lambda v: pex(gpuarray.max(v))

    # Compute the moment of the initial distribution
    moment(t, d_ucoeff, U0)

    # Compute the explicit part; L0 = -∇·f(d_ucoeff)
    explicit(t, d_ucoeff, L0)

    # Compute the moment of the explicit part
    moment(t, L0, LU0)

    # update the moments
    updateMoment(dt, a1, U0, -g1, LU0, a2, U, b)
    #pex(U.get().reshape(-1,5))

    # implicitly construct the Maxwellian (or Gaussian, etc.) given moments
    consMaxwellian(t, U, M)
    #pex(gpuarray.sum(L0))

    if nacptsteps == -1:
        #pex(LU0.get().reshape(-1,5))
        #pex(gpuarray.sum(d_ucoeff))
        pass

    # update the distribution
    updateDist(dt, a1, d_ucoeff, -g1, L0, b, M, a2, U, d_ucoeff)
    #pex(gpuarray.sum(d_ucoeff))

    if (nacptsteps == -1):
        #print("\n>> BDF-111\n")
        #pMom(U0)
        #psum(U0)
        #psum(L0)
        #pmax(L0)
        #psum(LU0)
        #pMom(LU0)
        #psum(U)
        #psum(M)
        #psum(d_ucoeff)
        #pmin(d_ucoeff)
        #exit(-1)
        pass
def _minmax_impl(a_gpu, axis, min_or_max, stream=None, keepdims=False):
    ''' Returns both max and argmax (min/argmin) along an axis.'''
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None or len(a_gpu.shape) <= 1:
        ## Note: PyCUDA doesn't have an overall argmax/argmin!
        out_shape = (1, ) * len(a_gpu.shape)
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).reshape(out_shape), None
        else:
            return gpuarray.min(a_gpu).reshape(out_shape), None
    else:
        if axis < 0:
            axis += 2
        assert axis in (0, 1)

        global _global_cublas_allocator
        alloc = _global_cublas_allocator

        n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
        col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
        if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
            if keepdims:
                out_shape = (1, m) if axis == 0 else (m, 1)
            else:
                out_shape = (m, )
            target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
            idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
            col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                       block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
        else:
            if keepdims:
                out_shape = (1, n) if axis == 0 else (n, 1)
            else:
                out_shape = (n, )
            # dtype was `a_gpu` in the original, which np.dtype() rejects; use the array's dtype
            target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
            idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
            row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                       block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
    return target, idx
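# --- Added note/sketch: in scikit-cuda this helper sits behind the public
# skcuda.misc.max/min/argmax/argmin wrappers; the hedged example below goes
# through that public API rather than calling _minmax_impl directly (assumes
# scikit-cuda is installed).
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
import skcuda.misc as misc

misc.init()
a_gpu = gpuarray.to_gpu(np.random.rand(4, 5).astype(np.float32))
col_max = misc.max(a_gpu, axis=0)     # per-column maxima, shape (5,)
col_arg = misc.argmax(a_gpu, axis=0)  # row index of each maximum
print(col_max.get(), col_arg.get())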
def check_termination(self):
    """
    Check various termination criteria
    """

    # First check if we are doing termination based on running time
    if (self.options.time_limit):
        self.time = time.clock() - self.time_start  # time.clock was missing its call parentheses
        if (self.time >= self.options.maxtime):
            self.term_reason = 'Exceeded time limit'
            return

    # Now check if we are doing break by tolx
    if (self.options.use_tolx):
        if (np.sqrt(cua.dot(self.dx, self.dx).get()) /
                np.sqrt(cua.dot(self.oldx, self.oldx).get()) < self.options.tolx):
            self.term_reason = 'Relative change in x small enough'
            return

    # Are we doing break by tolo (tol obj val)
    if (self.options.use_tolo and self.iter > 2):
        delta = abs(self.obj - self.oldobj)
        if (delta < self.options.tolo):
            self.term_reason = 'Relative change in objvalue small enough'
            return

    # Check if change in x and gradient are small enough
    # we don't want that for now
    # if (np.sqrt((cua.dot(self.dx,self.dx).get())) < self.options.tolx) \
    #    or (np.sqrt(cua.dot(self.dg,self.dg).get()) < self.options.tolg):
    #     self.term_reason = '|x_t+1 - x_t|=0 or |grad_t+1 - grad_t| < 1e-9'
    #     return

    # Finally the plain old check if max iter has been achieved
    if (self.iter >= self.options.maxiter):
        self.term_reason = 'Maximum number of iterations reached'
        return

    # KKT violation
    if (self.options.use_kkt):
        if np.abs(np.sqrt(cua.dot(self.x, self.grad).get())) <= self.options.tolk:  # was a bare `options.tolk`
            self.term_reason = '|x^T * grad| < opt.pbb_gradient_norm'
            return

    # Gradient check
    if (self.options.use_tolg):
        nr = cua.max(cua.fabs(self.grad)).get()
        if (nr < self.options.tolg):
            self.term_reason = '|| grad ||_inf < opt.tolg'
            return

    # No condition met, so return false
    self.term_reason = 0
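# --- Added standalone sketch of the gradient stopping test above: the infinity
# norm is reduced on the GPU and only one scalar crosses the bus. `grad_gpu`
# and `tolg` are hypothetical stand-ins for self.grad / self.options.tolg, and
# cumath.fabs supplies the element-wise absolute value.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as cua
import pycuda.cumath as cumath

grad_gpu = cua.to_gpu(np.random.randn(100000).astype(np.float32))
tolg = 1e-6
if cua.max(cumath.fabs(grad_gpu)).get() < tolg:
    print('|| grad ||_inf < tolg -> converged')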
def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10, use_weights=False, scatter_first=0): """Propagate photons on GPU to termination or max_steps, whichever comes first. May be called repeatedly without reloading photon information if single-stepping through photon history. ..warning:: `rng_states` must have at least `nthreads_per_block`*`max_blocks` number of curandStates. """ nphotons = self.pos.size step = 0 input_queue = np.empty(shape=nphotons+1, dtype=np.uint32) input_queue[0] = 0 # Order photons initially in the queue to put the clones next to each other for copy in xrange(self.ncopies): input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons input_queue_gpu = ga.to_gpu(input_queue) output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) output_queue[0] = 1 output_queue_gpu = ga.to_gpu(output_queue) while step < max_steps: # Just finish the rest of the steps if the # of photons is low if nphotons < nthreads_per_block * 16 * 8 or use_weights: nsteps = max_steps - step else: nsteps = 1 for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, np.int32(nsteps), np.int32(use_weights), np.int32(scatter_first), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1)) step += nsteps scatter_first = 0 # Only allow non-zero in first pass if step < max_steps: temp = input_queue_gpu input_queue_gpu = output_queue_gpu output_queue_gpu = temp # Assign with a numpy array of length 1 to silence # warning from PyCUDA about setting array with different strides/storage orders. output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32)) nphotons = input_queue_gpu[:1].get()[0] - 1 if ga.max(self.flags).get() & (1 << 31): print >>sys.stderr, "WARNING: ABORTED PHOTONS" cuda.Context.get_current().synchronize()
def stepFunction():
    global animIter
    cuda.memcpy_dtod( plotDataFloat_d.ptr, concentrationOut_d.ptr, concentrationOut_d.nbytes )
    maxVal = (gpuarray.max(plotDataFloat_d)).get()
    multiplyByScalarReal( cudaPre(0.5/(maxVal)), plotDataFloat_d )
    floatToUchar( plotDataFloat_d, plotDataChars_d)
    copyToScreenArray()
    if cudaP == "float":
        [ oneIteration_tex() for i in range(nIterationsPerPlot) ]
    #else: [ oneIteration_sh() for i in range(nIterationsPerPlot//2) ]
    if plotting and animIter%25 == 0:
        maxVals.append( maxVal )
        sumConc.append( gpuarray.sum(concentrationIn_d).get() )
        plotData( maxVals, sumConc )
    animIter += 1
def timeTransition():
    global realDynamics, alpha, applyTransition
    realDynamics = not realDynamics
    applyTransition = False
    if realDynamics:
        cuda.memcpy_dtod(psiK2_d.ptr, psi_d.ptr, psi_d.nbytes)
        cuda.memcpy_dtod(psiRunge_d.ptr, psi_d.ptr, psi_d.nbytes)
        if realTEXTURE:
            copy3DpsiK1Real()
            copy3DpsiK1Imag()
            copy3DpsiK2Real()
            copy3DpsiK2Imag()
        print "Real Dynamics"
    else:
        #GetAlphas
        getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block=block3D, grid=grid3D)
        alpha = cudaPre( ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() )  # OPTIMIZATION
        print "Imaginary Dynamics"
def implicit_iteration():
    global alpha
    #Make FFT
    fftPlan.execute( psi_d, psiFFT_d )
    #get Derivatives
    getPartialsXY( Lx, Ly, psiFFT_d, partialX_d, fftKx_d, partialY_d, fftKy_d, block=block3D, grid=grid3D)
    fftPlan.execute( partialX_d, inverse=True )
    fftPlan.execute( partialY_d, inverse=True )
    implicitStep1( xMin, yMin, zMin, dx, dy, dz, alpha, omega, gammaX, gammaY, gammaZ,
                   partialX_d, partialY_d, psi_d, G_d, x0, y0, grid=grid3D, block=block3D)
    fftPlan.execute( G_d )
    implicitStep2( dtImag, fftKx_d, fftKy_d, fftKz_d, alpha, psiFFT_d, G_d, block=block3D, grid=grid3D)
    fftPlan.execute( psiFFT_d, psi_d, inverse=True)
    #setBoundryConditionsKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), psi_d, block=block3D, grid=grid3D)
    normalize(dx, dy, dz, psi_d)
    #GetAlphas
    getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block=block3D, grid=grid3D)
    alpha = cudaPre( ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() )  # OPTIMIZATION
def pinv(a_gpu, dev, rcond=1e-15): """ Moore-Penrose pseudoinverse. Compute the Moore-Penrose pseudoinverse of the specified matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)`. dev : pycuda.driver.Device Device object to be used. rcond : float Singular values smaller than `rcond`*max(singular_values)` are set to zero. Returns ------- a_inv_gpu : pycuda.gpuarray.GPUArray Pseudoinverse of input matrix. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(8, 4), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> a_inv_gpu = pinv(a_gpu, pycuda.autoinit.device) >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4) True >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64) >>> b_gpu = gpuarray.to_gpu(b) >>> b_inv_gpu = pinv(b_gpu, pycuda.autoinit.device) >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4) True """ # Check input dtype because the SVD can only be computed in single # precision: if a_gpu.dtype not in [np.float32, np.complex64]: raise ValueError('unsupported type') # Compute SVD: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 0) uh_gpu = transpose(u_gpu, dev) # Get block/grid sizes: max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev) block_dim, grid_dim = select_block_grid_sizes(dev, s_gpu.shape) max_blocks_per_grid = max(max_grid_dim) # Suppress very small singular values: cutoff_invert_s_mod = \ SourceModule(cutoff_invert_s_mod_template.substitute( max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid)) cutoff_invert_s = \ cutoff_invert_s_mod.get_function('cutoff_invert_s') cutoff_gpu = gpuarray.max(s_gpu)*rcond cutoff_invert_s(s_gpu.gpudata, cutoff_gpu.gpudata, np.uint32(s_gpu.size), block=block_dim, grid=grid_dim) # The singular values must data type is in uh_gpu: if s_gpu.dtype == uh_gpu.dtype: s_diag_gpu = diag(s_gpu, dev) else: s_diag_gpu = diag(s_gpu.astype(uh_gpu.dtype), dev) # Finish pinv computation: v_gpu = transpose(vh_gpu, dev) suh_gpu = dot(s_diag_gpu, uh_gpu) return dot(v_gpu, suh_gpu)
def L1Norm(X):
    return gpuarray.max(X * (gpuarray.zeros((X.shape[1],), dtype=int) + 1)).get()
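# --- Added hedged alternative (not the author's helper): for a 1-D GPUArray the
# usual element-wise norms can be obtained directly from PyCUDA reductions.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

x = gpuarray.to_gpu(np.random.randn(10000).astype(np.float32))
l1 = gpuarray.sum(cumath.fabs(x)).get()    # sum of absolute values
linf = gpuarray.max(cumath.fabs(x)).get()  # largest absolute value
print(l1, linf)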
for i in range(nWidth/2):  # loop header restored; the flattened source began mid-loop
    fftKx_h[i] = i*2*np.pi/Lx
for i in range(nWidth/2, nWidth):
    fftKx_h[i] = (i-nWidth)*2*np.pi/Lx
for i in range(nHeight/2):
    fftKy_h[i] = i*2*np.pi/Ly
for i in range(nHeight/2, nHeight):
    fftKy_h[i] = (i-nHeight)*2*np.pi/Ly
for i in range(nDepth/2):
    fftKz_h[i] = i*2*np.pi/Lz
for i in range(nDepth/2, nDepth):
    fftKz_h[i] = (i-nDepth)*2*np.pi/Lz

psi_d = gpuarray.to_gpu(psi_h)
alphas_d = gpuarray.to_gpu( np.zeros_like(psi_h.real) )
normalize( dx, dy, dz, psi_d )
getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block=block3D, grid=grid3D)
alpha = ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get()
psiMod_d = gpuarray.to_gpu( np.zeros_like(psi_h.real) )
psiFFT_d = gpuarray.to_gpu( np.zeros_like(psi_h) )
partialX_d = gpuarray.to_gpu( np.zeros_like(psi_h) )
partialY_d = gpuarray.to_gpu( np.zeros_like(psi_h) )
G_d = gpuarray.to_gpu( np.zeros_like(psi_h) )
fftKx_d = gpuarray.to_gpu( fftKx_h )  #OPTIMIZATION
fftKy_d = gpuarray.to_gpu( fftKy_h )
fftKz_d = gpuarray.to_gpu( fftKz_h )
activity_d = gpuarray.to_gpu( np.ones( nBlocks3D, dtype=np.uint8 ) )
psiOther_d = gpuarray.to_gpu( np.zeros_like(psi_h.real) )
psiK1_d = gpuarray.to_gpu( psi_h )
psiK2_d = gpuarray.to_gpu( psi_h )
psiRunge_d = gpuarray.to_gpu( psi_h )

#For FFT version
laplacian_d = gpuarray.to_gpu( np.zeros_like(psi_h) )
                  d_seed_threshold, block=blocksize, grid=gridsize)

# reset the queue for queue calculation
d_queue.fill(0)
cu.memcpy_dtod(d_scan.gpudata, d_nextFront.gpudata, d_nextFront.nbytes)
scan_kernel(d_scan)  # scan the front for queue index determination

# call queue kernel in order to generate queue
queue_kernel(d_nextFront, d_scan, d_queue, width, height,
             block=blocksize, grid=gridsize)  # generate queue
qLen = np.int32(gpu.max(d_scan).get())  # max value in scan must be the largest index of queue

end_gpu_time.record()
end_gpu_time.synchronize()
gpu_comp_time += start_gpu_time.time_till(end_gpu_time) * 1e-3

contComp = True
if (qLen == 0):
    contComp = False

steps = 0
# While the next front has pixels we need to process
while contComp:
    # Region Growing
    # Run the CUDA kernel with the appropriate inputs
    start_gpu_time.record()
    d_nextFront.fill(0)  # set nextFront to zero
def _max_CUDA(a, stream=None):
    return cu_array.max(a=a, stream=stream)
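# --- Added usage sketch: the helper simply forwards to pycuda.gpuarray.max, so
# the reduction can be queued on a non-default stream (aliases chosen to match
# the snippet above; pycuda.autoinit is an assumption).
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import pycuda.gpuarray as cu_array

stream = cuda.Stream()
a = cu_array.to_gpu(np.random.rand(1 << 20).astype(np.float32))
max_gpu = _max_CUDA(a, stream=stream)  # 0-d GPUArray holding the maximum
stream.synchronize()
print(max_gpu.get())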
r_space_gpu = buffer_r_space - beta * r_space_gpu
sample = gpuif((sample.real < 0).astype(np.bool), r_space_gpu, sample)
r_space_gpu = gpuif(MaskBoolean, sample, r_space_gpu)

#### OSS ####
if (HIOfirst == 0 or iter > np.ceil(iterations/filtercount)) and iter < np.ceil(iterations-iterations/filtercount):
    newsigma = sigma[(iter-1)]
    if lastUsedSigma != newsigma:
        print(str(iter) + ' changing filter to ' + str(newsigma))
        kfilter = np.exp(-(((np.sqrt((yy)**2+(xx)**2)**2))/(2*(newsigma)**2))).astype(np.complex64)
        kfilter_gpu.set(kfilter)
        temp_gpu = kfilter_gpu + 0.0
        cu_fft.fft(kfilter_gpu, temp_gpu, plan_forward)
        temp_gpu = temp_gpu/gpuarray.max(temp_gpu.real).get().astype(np.float32).astype(np.complex64)
        #CuFFTShift(kfilter_gpu,temp_gpu)  # Unnecessary
        lastUsedSigma = newsigma
    cu_fft.fft(r_space_gpu, ktemp_gpu, plan_forward)
    ktemp_gpu = ktemp_gpu*kfilter_gpu
    cu_fft.ifft(ktemp_gpu, r_space_gpu, plan_inverse, True)
    if np.mod(iterations, iter//filtercount) == 0:
        r_space_gpu = R2D[filtnum-1] + 0.0
    else:
        r_space_gpu = gpuif(MaskBoolean, sample, r_space_gpu)
##### ### ####
def pinv(a_gpu, rcond=1e-15): """ Moore-Penrose pseudoinverse. Compute the Moore-Penrose pseudoinverse of the specified matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)`. rcond : float Singular values smaller than `rcond`*max(singular_values)` are set to zero. Returns ------- a_inv_gpu : pycuda.gpuarray.GPUArray Pseudoinverse of input matrix. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. If the input matrix is square, the pseudoinverse uses less memory. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(8, 4), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> a_inv_gpu = linalg.pinv(a_gpu) >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4) True >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64) >>> b_gpu = gpuarray.to_gpu(b) >>> b_inv_gpu = linalg.pinv(b_gpu) >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4) True """ if not _has_cula: raise NotImplementedError('CULA not installed') # Perform in-place SVD if the matrix is square to save memory: if a_gpu.shape[0] == a_gpu.shape[1]: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o') else: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's') # Suppress very small singular values: cutoff_gpu = gpuarray.max(s_gpu)*rcond ctype = tools.dtype_to_ctype(s_gpu.dtype) cutoff_func = el.ElementwiseKernel("{ctype} *s, {ctype} *cutoff".format(ctype=ctype), "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}") cutoff_func(s_gpu, cutoff_gpu) # Compute the pseudoinverse without allocating a new diagonal matrix: return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10, use_weights=False, scatter_first=0, track=False): """Propagate photons on GPU to termination or max_steps, whichever comes first. May be called repeatedly without reloading photon information if single-stepping through photon history. ..warning:: `rng_states` must have at least `nthreads_per_block`*`max_blocks` number of curandStates. """ nphotons = self.pos.size step = 0 input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32) input_queue[0] = 0 # Order photons initially in the queue to put the clones next to each other for copy in range(self.ncopies): input_queue[1 + copy::self.ncopies] = np.arange( self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons input_queue_gpu = ga.to_gpu(input_queue) output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32) output_queue[0] = 1 output_queue_gpu = ga.to_gpu(output_queue) if track: step_photon_ids = [] step_photons = [] #save the first step for all photons in the input queue step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get()) step_photons.append( self.copy_queue(input_queue_gpu[1:], nphotons).get()) while step < max_steps: # Just finish the rest of the steps if the # of photons is low and not tracking if not track and (nphotons < nthreads_per_block * 16 * 8 or use_weights): nsteps = max_steps - step else: nsteps = 1 for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, self.evidx, np.int32(nsteps), np.int32(use_weights), np.int32(scatter_first), gpu_geometry.gpudata, block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) if track: #save the next step for all photons in the input queue step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get()) step_photons.append( self.copy_queue(input_queue_gpu[1:], nphotons).get()) step += nsteps scatter_first = 0 # Only allow non-zero in first pass if step < max_steps: temp = input_queue_gpu input_queue_gpu = output_queue_gpu output_queue_gpu = temp # Assign with a numpy array of length 1 to silence # warning from PyCUDA about setting array with different strides/storage orders. output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32)) nphotons = input_queue_gpu[:1].get()[0] - 1 if nphotons == 0: break if ga.max(self.flags).get() & (1 << 31): print("WARNING: ABORTED PHOTONS", file=sys.stderr) cuda.Context.get_current().synchronize() if track: return step_photon_ids, step_photons
gpuSum = reduct.ReductionKernel(numpy.int32, neutral="0", reduce_expr="a+b",
                                map_expr="1 << x[i]", arguments="long* x")
a_gpu = gpuarray.to_gpu(numpy.asarray(range(n), dtype=numpy.int64))

t0 = time.time()
krnl = reduct.ReductionKernel(numpy.int64, neutral="0", reduce_expr="x[i] + x[i+1]",
                              map_expr="x[i]", arguments="long *x")
t1 = time.time()
#res = krnl(a_gpu).get()

t5 = time.time()
print gpuarray.max(a_gpu).get()
t6 = time.time()

print '%0.6f' % ((t6 - t5)*1000)
print '%0.6f' % ((t1 - t0)*1000)

t2 = time.time()
sum = sum(range(n))
#maxcpu = max(range(n))
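# --- Added hedged timing sketch: the same reduction timed with CUDA events
# instead of time.time(), which isolates the device-side cost. Array size and
# dtype mirror the benchmark above; pycuda.autoinit is an assumption.
import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

a_gpu = gpuarray.to_gpu(numpy.arange(1 << 20, dtype=numpy.int64))
start, end = cuda.Event(), cuda.Event()
start.record()
m = gpuarray.max(a_gpu).get()
end.record()
end.synchronize()
print(m, '%0.6f ms' % start.time_till(end))  # time_till returns milliseconds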
def propagate_hit(self, gpu_geometry, rng_states, parameters): """Propagate photons on GPU to termination or max_steps, whichever comes first. May be called repeatedly without reloading photon information if single-stepping through photon history. ..warning:: `rng_states` must have at least `nthreads_per_block`*`max_blocks` number of curandStates. got one abort:: In [1]: a = ph("hhMOCK") In [9]: f = a[:,3,2].view(np.uint32) In [12]: np.where( f & 1<<31 ) Out[12]: (array([279]),) failed to just mock that one:: RANGE=279:280 MockNuWa MOCK """ nphotons = self.pos.size nwork = nphotons nthreads_per_block = parameters['threads_per_block'] max_blocks = parameters['max_blocks'] max_steps = parameters['max_steps'] use_weights = False scatter_first = 0 self.upload_queues(nwork) solid_id_map_gpu = gpu_geometry.solid_id_map solid_id_to_channel_id_gpu = gpu_geometry.solid_id_to_channel_id_gpu small_remainder = nthreads_per_block * 16 * 8 block = (nthreads_per_block, 1, 1) results = {} results['name'] = "propagate_hit" results['nphotons'] = nphotons results['nwork'] = nwork results['nsmall'] = small_remainder results['COLUMNS'] = "name:s,nphotons:i,nwork:i,nsmall:i" step = 0 times = [] npass = 0 nabort = 0 while step < max_steps: npass += 1 if nwork < small_remainder or use_weights: nsteps = max_steps - step # Just finish the rest of the steps if the # of photons is low log.debug( "increase nsteps for stragglers: small_remainder %s nwork %s nsteps %s max_steps %s " % (small_remainder, nwork, nsteps, max_steps)) else: nsteps = 1 pass log.info("nphotons %s nwork %s step %s max_steps %s nsteps %s " % (nphotons, nwork, step, max_steps, nsteps)) abort = False for first_photon, photons_this_round, blocks in chunk_iterator( nwork, nthreads_per_block, max_blocks): if abort: nabort += 1 else: grid = (blocks, 1) args = ( np.int32(first_photon), np.int32(photons_this_round), self.input_queue_gpu[1:].gpudata, self.output_queue_gpu.gpudata, rng_states, self.pos.gpudata, self.dir.gpudata, self.wavelengths.gpudata, self.pol.gpudata, self.t.gpudata, self.flags.gpudata, self.last_hit_triangles.gpudata, self.weights.gpudata, np.int32(nsteps), np.int32(use_weights), np.int32(scatter_first), gpu_geometry.gpudata, solid_id_map_gpu.gpudata, solid_id_to_channel_id_gpu.gpudata, ) log.info( "propagate_hit_kernel.prepared_timed_call grid %s block %s first_photon %s photons_this_round %s " % (repr(grid), repr(block), first_photon, photons_this_round)) get_time = self.propagate_hit_kernel.prepared_timed_call( grid, block, *args) t = get_time() times.append(t) if t > self.max_time: abort = True log.warn( "kernel launch time %s > max_time %s : ABORTING " % (t, self.max_time)) pass pass pass log.info("step %s propagate_hit_kernel times %s " % (step, repr(times))) pass step += nsteps scatter_first = 0 # Only allow non-zero in first pass if step < max_steps: nwork = self.swap_queues() pass pass log.info("calling max ") if ga.max(self.flags).get() & (1 << 31): log.warn("ABORTED PHOTONS") log.info("done calling max ") cuda.Context.get_current().synchronize() results['npass'] = npass results['nabort'] = nabort results['nlaunch'] = len(times) results['tottime'] = sum(times) results['maxtime'] = max(times) results['mintime'] = min(times) results[ 'COLUMNS'] += ",npass:i,nabort:i,nlaunch:i,tottime:f,maxtime:f,mintime:f" return results
c = gpuarray.empty((100, 100), dtype=dtype)
print('c:\n{0}\nshape={1}\n'.format(c, c.shape))

d = gpuarray.zeros((100, 100), dtype=dtype)
print('d:\n{0}\nshape={1}\n'.format(d, d.shape))

e = gpuarray.arange(0.0, 100.0, 1.0, dtype=dtype)
print('e:\n{0}\nshape={1}\n'.format(e, e.shape))

f = gpuarray.if_positive(e < 50, e - 100, e + 100)
print('f:\n{0}\nshape={1}\n'.format(f, f.shape))

g = gpuarray.if_positive(e < 50, gpuarray.ones_like(e), gpuarray.zeros_like(e))
print('g:\n{0}\nshape={1}\n'.format(g, g.shape))

h = gpuarray.maximum(e, f)
print('h:\n{0}\nshape={1}\n'.format(h, h.shape))

i = gpuarray.minimum(e, f)
print('i:\n{0}\nshape={1}\n'.format(i, i.shape))

g = gpuarray.sum(a)
print(g, type(g))

k = gpuarray.max(a)
print(k, type(k))

l = gpuarray.min(a)
print(l, type(l))
def pinv(a_gpu, rcond=1e-15): """ Moore-Penrose pseudoinverse. Compute the Moore-Penrose pseudoinverse of the specified matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)`. rcond : float Singular values smaller than `rcond`*max(singular_values)` are set to zero. Returns ------- a_inv_gpu : pycuda.gpuarray.GPUArray Pseudoinverse of input matrix. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. If the input matrix is square, the pseudoinverse uses less memory. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(8, 4), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> a_inv_gpu = linalg.pinv(a_gpu) >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4) True >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64) >>> b_gpu = gpuarray.to_gpu(b) >>> b_inv_gpu = linalg.pinv(b_gpu) >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4) True """ if not _has_cula: raise NotImplementedError('CULA not installed') # Perform in-place SVD if the matrix is square to save memory: if a_gpu.shape[0] == a_gpu.shape[1]: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o') else: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's') # Get block/grid sizes; the number of threads per block is limited # to 512 because the cutoff_invert_s kernel defined above uses too # many registers to be invoked in 1024 threads per block (i.e., on # GPUs with compute capability >= 2.x): dev = misc.get_current_device() max_threads_per_block = 512 block_dim, grid_dim = misc.select_block_grid_sizes(dev, s_gpu.shape, max_threads_per_block) # Suppress very small singular values: use_double = 1 if s_gpu.dtype == np.float64 else 0 cutoff_invert_s_mod = \ SourceModule(cutoff_invert_s_template.substitute(use_double=use_double)) cutoff_invert_s = \ cutoff_invert_s_mod.get_function('cutoff_invert_s') cutoff_gpu = gpuarray.max(s_gpu)*rcond cutoff_invert_s(s_gpu, cutoff_gpu, np.uint32(s_gpu.size), block=block_dim, grid=grid_dim) # Compute the pseudoinverse without allocating a new diagonal matrix: return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
def calculateTimestep(meshPropSpeedsGPU, cellDim):
    maxPropSpeed = gpuarray.max(cumath.fabs(meshPropSpeedsGPU)).get()
    return cellDim / (4.0 * maxPropSpeed)
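# --- Added usage sketch (illustrative values; assumes pycuda.autoinit): the
# CFL-style timestep above follows from one GPU reduction over the
# propagation-speed mesh, with only the scalar copied back to the host.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

meshPropSpeedsGPU = gpuarray.to_gpu(
    np.random.uniform(-3.0, 3.0, (256, 256)).astype(np.float32))
dt = calculateTimestep(meshPropSpeedsGPU, cellDim=0.01)
print(dt)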
def cuda_gridvis(csrh_sun, csrh_satellite, settings, plan, chan): """ Grid the visibilities parallelized by pixel. References: - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy" by Thompson, Moran, & Swenson - Daniel Brigg's PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/ """ print "Gridding the visibilities" t_start = time.time() #f = pyfits.open(settings['vfile']) # unpack parameters vfile = settings['vfile'] briggs = settings['briggs'] imsize = settings['imsize'] cell = settings['cell'] nx = np.int32(2 * imsize) noff = np.int32((nx - imsize) / 2) ## constants arc2rad = np.float32(np.pi / 180. / 3600.) du = np.float32(1. / (arc2rad * cell * nx)) ## grab data #f = pyfits.open(settings['vfile']) Data = np.ndarray(shape=(44, 44, 16), dtype=complex) UVW = np.ndarray(shape=(780, 1), dtype='float64') Data, UVW = visibility(csrh_sun, csrh_satellite, chan) print "UVW*****\n", UVW # determin the file type (uvfits or fitsidi) h_uu = np.ndarray(shape=(780), dtype='float64') h_vv = np.ndarray(shape=(780), dtype='float64') h_rere = np.ndarray(shape=(780), dtype='float32') h_imim = np.ndarray(shape=(780), dtype='float32') freq = 1702500000. light_speed = 299792458. # Speed of light ## quickly figure out what data is not flagged #np.float32(f[7].header['CRVAL3']) 299792458vvvv #good = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0) #h_u = np.float32(freq*f[0].data.par('uu')[good]) #h_v = np.float32(freq*f[0].data.par('vv')[good]) blen = 0 for antenna1 in range(0, 39): for antenna2 in range(antenna1 + 1, 40): h_rere[blen] = Data[antenna1][antenna2][chan].real h_imim[blen] = Data[antenna1][antenna2][chan].imag h_uu[blen] = freq * UVW[blen][0] h_vv[blen] = freq * UVW[blen][1] blen += 1 print "h_u", h_uu #h_u = np.float32(h_u.ravel()) #h_v = np.float32(h_v.ravel()) gcount = np.int32(np.size(h_uu)) #gcount = len(gcount.ravel()) #h_re = np.float32(h_re.ravel()) #h_im = np.float32(h_im.ravel()) #freq = 3.45E11 #np.float32(f[0].header['CRVAL4']) blen = 0 bl_order = np.ndarray(shape=(780, 2), dtype=int) good = [] for border1 in range(0, 39): for border2 in range(border1 + 1, 40): bl_order[blen][0] = border1 bl_order[blen][1] = border2 blen = blen + 1 blen = 0 h_u = [] h_v = [] h_re = [] h_im = [] Flag_Ant = [ 0, 4, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 28, 29, 37, 38, 39 ] for blen in range(0, 780): if (bl_order[blen][0] not in Flag_Ant) and (bl_order[blen][1] not in Flag_Ant): good.append(blen) h_u.append(h_uu[blen]) h_v.append(h_vv[blen]) h_re.append(h_rere[blen]) h_im.append(h_imim[blen]) #print "Good:",good gcount = np.int32(np.size(h_u)) ## assume data is unpolarized #print chan print 'GCOUNT', gcount #print "H_U", h_u #print "H_V", h_v #print h_re #print h_im # h_ : host, d_ : device h_grd = np.zeros((nx, nx), dtype=np.complex64) h_cnt = np.zeros((nx, nx), dtype=np.int32) d_u = gpu.to_gpu(np.array(h_uu, dtype='float32')) d_v = gpu.to_gpu(np.array(h_vv, dtype='float32')) d_re = gpu.to_gpu(np.array(h_rere, dtype='float32')) d_im = gpu.to_gpu(np.array(h_imim, dtype='float32')) d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32) d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64) d_ngrd = gpu.zeros_like(d_grd) d_bm = gpu.zeros_like(d_grd) d_nbm = gpu.zeros_like(d_grd) d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32) ## define kernel parameters if imsize == 1024: blocksize2D = (8, 16, 1) gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. 
* nx / blocksize2D[1]))) blocksizeF2D = (16, 16, 1) gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1]))) blocksize1D = (256, 1, 1) else: blocksize2D = (16, 32, 1) gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1]))) blocksizeF2D = (32, 32, 1) gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1]))) blocksize1D = (512, 1, 1) gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1) # ------------------------ # make gridding kernels # ------------------------ ## make spheroidal convolution kernel (don't mess with these!) width = 6. ngcf = 24. h_cgf = gcf(ngcf, width) ## make grid correction h_corr = corrfun(nx, width) d_cgf = module.get_global('cgf')[0] d_corr = gpu.to_gpu(h_corr) cu.memcpy_htod(d_cgf, h_cgf) # ------------------------ # grid it up # ------------------------ d_umax = gpu.max(cumath.fabs(d_u)) d_vmax = gpu.max(cumath.fabs(d_v)) umax = np.int32(np.ceil(d_umax.get() / du)) vmax = np.int32(np.ceil(d_vmax.get() / du)) ## grid ($$) # This should be improvable via: # - shared memory solution? I tried... # - better coalesced memory access? I tried... # - reorganzing and indexing UV data beforehand? # (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf) # - storing V(u,v) in texture memory? gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \ block=blocksize2D, grid=gridsize2D) ## apply weights wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D) hfac = np.int32(1) dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D) shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D) ## normalize wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D) ## Reflect grid about v axis hfac = np.int32(-1) dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D) ## Shift both shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D) # ------------------------ # Make the beam # ------------------------ ## Transform to image plane fft.fft(d_nbm, d_bm, plan) ## Shift shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D) ## Correct for C corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D) # Trim trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Normalize d_bmax = gpu.max(d_fim) bmax = d_bmax.get() bmax = np.float32(1. / bmax) nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Pull onto CPU dpsf = d_fim.get() # ------------------------ # Make the map # ------------------------ ## Transform to image plane fft.fft(d_ngrd, d_grd, plan) ## Shift shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D) ## Correct for C corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D) ## Trim trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Normalize (Jy/beam) nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Finish timers t_end = time.time() t_full = t_end - t_start print "Gridding execution time %0.5f" % t_full + ' s' print "\t%0.5f" % (t_full / gcount) + ' s per visibility' ## Return dirty psf (CPU) and dirty image (GPU) return dpsf, d_fim
def bicgstabMemory(cublasHandle, x_gpu, b_gpu, Xprime_gpu, X_gpu, XX_gpu, Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero, freq, FREQ_gpu, c, Deltaxprime, Deltayprime, Deltazprime, sizePartitionr, sizePartitionc, M, max_it, tol): # --- flag: 0 = solution found to tolerance # 1 = no convergence given max_it # -1 = breakdown: rho = 0 # -2 = breakdown: omega = 0 N = xcg_gpu.size # --- Initializations iter = np.float32(0) flag = np.float32(0) alpha = np.float32(0) rho_1 = np.float32(0) v_gpu = gpuarray.zeros(N, dtype=np.float32) p_gpu = gpuarray.zeros(N, dtype=np.float32) # d_p_hat = gpuarray.zeros(N, dtype = np.float32) # d_s_hat = gpuarray.zeros(N, dtype = np.float32) # d_t = gpuarray.zeros(N, dtype = np.float32) #bnrm2 = np.sqrt((culinalg.dot(b_gpu, b_gpu.conj(), 'T', 'N').real).get()) bnrm2 = cublas.cublasScnrm2(cublasHandle, N, b_gpu.gpudata, 1) if bnrm2 == np.float32(0.0): bnrm2 = np.float32(1.0) yprime_gpu = computeAx(x_gpu, Xprime_gpu, X_gpu, XX_gpu, Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero, freq, FREQ_gpu, c, Deltaxprime, Deltayprime, Deltazprime, sizePartitionc, XX_gpu.size) xprime_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu, Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c, Deltaxprime * Deltayprime * Deltazprime, sizePartitionr, b_gpu.size) r_gpu = b_gpu - xprime_gpu error = cublas.cublasScnrm2(cublasHandle, N, r_gpu.gpudata, 1) / bnrm2 if (error < tol): return x_gpu, error, iter, flag omega = np.float32(1.0) r_tld_gpu = r_gpu.copy() for iter in range(max_it): rho = cublas.cublasCdotc(cublasHandle, N, r_tld_gpu.gpudata, 1, r_gpu.gpudata, 1) # direction vector if (rho == np.float32(0.0)): break if (iter > 0): beta = (rho / rho_1) * (alpha / omega) cublas.cublasCaxpy(cublasHandle, N, -omega, v_gpu.gpudata, 1, p_gpu.gpudata, 1) cublas.cublasCscal(cublasHandle, N, beta, p_gpu.gpudata, 1) cublas.cublasCaxpy(cublasHandle, N, np.float32(1.0), r_gpu.gpudata, 1, p_gpu.gpudata, 1) else: p_gpu = r_gpu.copy() p_hat_gpu = p_gpu.copy() yprime_gpu = computeAx(p_hat_gpu, Xprime_gpu, X_gpu, XX_gpu, Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero, freq, FREQ_gpu, c, Deltaxprime, Deltayprime, Deltazprime, sizePartitionc, XX_gpu.size) v_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu, Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c, Deltaxprime * Deltayprime * Deltazprime, sizePartitionr, b_gpu.size) alpha = rho / cublas.cublasCdotc(cublasHandle, N, r_tld_gpu.gpudata, 1, v_gpu.gpudata, 1) s_gpu = r_gpu.copy() cublas.cublasCaxpy(cublasHandle, N, -alpha, v_gpu.gpudata, 1, s_gpu.gpudata, 1) norms = cublas.cublasScnrm2(cublasHandle, N, s_gpu.gpudata, 1) if (norms < tol): # --- early convergence check cublas.cublasCaxpy(cublasHandle, N, np.float32(alpha), p_hat_gpu.gpudata, 1, x_gpu.gpudata, 1) break # --- stabilizer s_hat_gpu = s_gpu.copy() yprime_gpu = computeAx(s_hat_gpu, Xprime_gpu, X_gpu, XX_gpu, Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero, freq, FREQ_gpu, c, Deltaxprime, Deltayprime, Deltazprime, sizePartitionc, XX_gpu.size) t_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu, Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c, Deltaxprime * Deltayprime * Deltazprime, sizePartitionr, b_gpu.size) omega = cublas.cublasCdotc(cublasHandle, N, t_gpu.gpudata, 1, s_gpu.gpudata, 1) / cublas.cublasCdotc( cublasHandle, N, t_gpu.gpudata, 1, t_gpu.gpudata, 1) # --- update approximation cublas.cublasCaxpy(cublasHandle, N, alpha, p_hat_gpu.gpudata, 1, x_gpu.gpudata, 1) cublas.cublasCaxpy(cublasHandle, N, omega, s_hat_gpu.gpudata, 1, x_gpu.gpudata, 1) r_gpu = 
s_gpu.copy() cublas.cublasCaxpy(cublasHandle, N, -omega, t_gpu.gpudata, 1, r_gpu.gpudata, 1) error = cublas.cublasScnrm2(cublasHandle, N, r_gpu.gpudata, 1) / bnrm2 # --- check convergence if (error <= tol): break if (omega == np.float32(0.0)): break rho_1 = rho print("iteration") temp = np.sqrt( gpuarray.max(s_gpu.real * s_gpu.real + s_gpu.imag * s_gpu.imag).get()) if ((error <= np.float32(tol)) or temp <= tol): # --- converged if (temp <= tol): error = cublas.cublasScnrm2(cublasHandle, N, s_gpu.gpudata, 1) / bnrm2 flag = 0 elif (omega == np.float32(0.0)): # --- breakdown flag = -2 elif (rho == np.float32(0.0)): flag = -1 else: # --- no convergence flag = 1 p_hat_gpu.gpudata.free() s_hat_gpu.gpudata.free() v_gpu.gpudata.free() t_gpu.gpudata.free() return xcg_gpu, 0, 0, 0
Theil(nps, indices, data_gpu, pt_ax_gpu, pts_gpu, Nevts,
      block=(BLOCKSIZE, 1, 1), grid=(nps / BLOCKSIZE, 1, 1))
theils = indices.get()
pion = pion_gpu.get()
kaon = kaon_gpu.get()
all_shit = []
best_th = gpuarray.max(indices).get()
mask = indices == best_th
S = gpuarray.sum(mask).get()
mpi = gpuarray.sum(pion_gpu * mask) / S
mk = gpuarray.sum(kaon_gpu * mask) / S
if S > 1:
    print "interesting, ", S, " points out of ", len(mask), "have all the maximum Theil index, will average them"
print "Best Theil:", best_th, "Mpi: ", mpi, "MK: ", mk, "Scale: ", scale
print "##############################################"
result = [best_th, mpi.get(), mk.get()]

# plot the result in this space:
def toy(x, Mt, mt, M=MPDG, m=mPDG):
    a = np.sqrt(M**2 - 4. * m**2) / M
def cuda_gridvis(sub_array, f, settings, plan, chan): """ Grid the visibilities parallelized by pixel. References: - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy" by Thompson, Moran, & Swenson - Daniel Brigg's PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/ """ print "Gridding the visibilities" t_start = time.time() if sub_array==1: Antennas = 40 else: Antennas = 60 # unpack parameters vfile = settings['vfile'] briggs = settings['briggs'] imsize = settings['imsize'] cell = settings['cell'] nx = np.int32(2 * imsize) noff = np.int32((nx - imsize) / 2) ## constants arc2rad = np.float32(np.pi / 180. / 3600.) du = np.float32(1. / (arc2rad * cell * nx)) # determin the file type (uvfits or fitsidi) h_u = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64') h_v = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64') h_re = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32') h_im = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32') #Get Visibility Data and values of UVW if settings['vfile'].find('.uvfits') != -1: freq = 3.45E11 #np.float32(f[0].header['CRVAL4']) light_speed = 299792458. good = np.where(f[0].data.data[:, 0, 0, chan, 0, 0] != 0) h_u = np.float32(light_speed * f[0].data.par('uu')[good]) print "h_u", h_u.shape h_v = np.float32(light_speed * f[0].data.par('vv')[good]) gcount = np.int32(np.size(h_u)) ## assume data is unpolarized h_re = np.float32(f[0].data.data[good, 0, 0, chan, 0, 0]) h_im = np.float32(f[0].data.data[good, 0, 0, chan, 0, 1]) freq = 1702500000. light_speed = 299792458. # Speed of light ## assume data is unpolarized #print chan print 'GCOUNT', gcount # h_ : host, d_ : device h_grd = np.zeros((nx, nx), dtype=np.complex64) h_cnt = np.zeros((nx, nx), dtype=np.int32) d_u = gpu.to_gpu(np.array(h_u,dtype='float32')) d_v = gpu.to_gpu(np.array(h_v,dtype='float32')) d_re = gpu.to_gpu(np.array(h_re,dtype='float32')) d_im = gpu.to_gpu(np.array(h_im,dtype='float32')) d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32) d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64) d_ngrd = gpu.zeros_like(d_grd) d_bm = gpu.zeros_like(d_grd) d_nbm = gpu.zeros_like(d_grd) d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32) ## define kernel parameters if imsize == 1024: blocksize2D = (8, 16, 1) gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1]))) blocksizeF2D = (16, 16, 1) gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1]))) blocksize1D = (256, 1, 1) else: blocksize2D = (16, 32, 1) gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1]))) blocksizeF2D = (32, 32, 1) gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1]))) blocksize1D = (512, 1, 1) gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1) # ------------------------ # make gridding kernels # ------------------------ ## make spheroidal convolution kernel (don't mess with these!) width = 6. ngcf = 24. 
h_cgf = gcf(ngcf, width) ## make grid correction h_corr = corrfun(nx, width) d_cgf = module.get_global('cgf')[0] d_corr = gpu.to_gpu(h_corr) cu.memcpy_htod(d_cgf, h_cgf) # ------------------------ # grid it up # ------------------------ d_umax = gpu.max(cumath.fabs(d_u)) d_vmax = gpu.max(cumath.fabs(d_v)) umax = np.int32(np.ceil(d_umax.get() / du)) vmax = np.int32(np.ceil(d_vmax.get() / du)) ## grid ($$) # This should be improvable via: # - shared memory solution? I tried... # - better coalesced memory access? I tried... # - reorganzing and indexing UV data beforehand? # (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf) # - storing V(u,v) in texture memory? # Each pixel in the uv plane goes through the data and check to see whether the pixel is included in the convolution. # This kernel also calculates the point spread function and the local sampling # from the data (for applying the weights later). gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \ block=blocksize2D, grid=gridsize2D) ## apply weights wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D) hfac = np.int32(1) dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D) shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D) ## normalize wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D) ## Reflect grid about v axis hfac = np.int32(-1) dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D) ## Shift both shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D) # ------------------------ # Make the beam # ------------------------ ## Transform to image plane fft.fft(d_nbm, d_bm, plan) ## Shift shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D) ## Correct for C corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D) # Trim trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Normalize d_bmax = gpu.max(d_fim) bmax = d_bmax.get() bmax = np.float32(1. / bmax) nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Pull onto CPU dpsf = d_fim.get() # ------------------------ # Make the map # ------------------------ ## Transform to image plane fft.fft(d_ngrd, d_grd, plan) ## Shift shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D) ## Correct for C corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D) ## Trim trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Normalize (Jy/beam) nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Finish timers t_end = time.time() t_full = t_end - t_start print "Gridding execution time %0.5f" % t_full + ' s' print "\t%0.5f" % (t_full / gcount) + ' s per visibility' ## Return dirty psf (CPU) and dirty image (GPU) return dpsf, d_fim
d_thisFront, d_nextFront = d_nextFront, d_thisFront
d_nextFront.fill(0)
#d_nextFront = gpu.zeros_like(d_thisFront)

# Run the CUDA kernel with the appropriate inputs
start_gpu_time.record()
regionGrow_kernel(d_image, d_region, d_thisFront, d_nextFront, width, height,
                  d_threshold, block=blocksize, grid=gridsize)
# nextFront should have all zeroes if there are no more fronts
moreFronts = gpu.max(d_nextFront).get()
end_gpu_time.record()
end_gpu_time.synchronize()
gpu_comp_time += start_gpu_time.time_till(end_gpu_time) * 1e-3

# check if the max element in nextFront is a zero
if (moreFronts == 0):
    contComp = False  # terminate loop

# Increment counter
i += 1

# Copy from device to host
start_gpu_time.record()
h_region = d_region.get()
h_region = h_region.reshape([height, width])
end_gpu_time.record()
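# --- Added minimal sketch of the emptiness test used above: a frontier kept as
# an integer mask on the GPU is exhausted exactly when its maximum is zero, so
# one reduction per iteration decides whether to keep growing (names are
# illustrative, not from the original code).
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpu

d_nextFront = gpu.zeros(512 * 512, dtype=np.int32)
if gpu.max(d_nextFront).get() == 0:
    print('front is empty -- stop region growing')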
def cuda_gridvis(settings, plan):
    """
    Grid the visibilities parallelized by pixel.
    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    # unpack parameters
    vfile  = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell   = settings['cell']
    nx     = np.int32(2 * imsize)
    noff   = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180 / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    ## grab data
    f = pyfits.open(settings['vfile'])

    ## quickly figure out what data is not flagged
    freq = np.float32(f[0].header['CRVAL4'])
    good = np.where(f[0].data.data[:, 0, 0, 0, 0, 0, 0] != 0)
    h_u = np.float32(freq * f[0].data.par('uu')[good])
    h_v = np.float32(freq * f[0].data.par('vv')[good])
    gcount = np.int32(np.size(h_u))

    ## assume data is unpolarized
    h_re = np.float32(0.5 * (f[0].data.data[good, 0, 0, 0, 0, 0, 0] + f[0].data.data[good, 0, 0, 0, 0, 1, 0]))
    h_im = np.float32(0.5 * (f[0].data.data[good, 0, 0, 0, 0, 0, 1] + f[0].data.data[good, 0, 0, 0, 0, 1, 1]))

    ## make GPU arrays
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(h_u)
    d_v = gpu.to_gpu(h_v)
    d_re = gpu.to_gpu(h_re)
    d_im = gpu.to_gpu(h_im)
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

    ## define kernel parameters
    blocksize2D = (8, 16, 1)
    gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
    blocksizeF2D = (16, 16, 1)
    gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
    blocksize1D = (256, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganizing and indexing UV data beforehand?
    #      (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax,
                       block=blocksize2D, grid=gridsize2D)
    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    # Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
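A hedged usage sketch for the function above. The settings keys mirror those unpacked at the top of cuda_gridvis; the plan is assumed to be a scikit-cuda FFT plan sized to the padded grid (2 * imsize). All values are placeholders, not taken from the original code.

# Hypothetical driver for cuda_gridvis (illustrative values only).
import numpy as np
from skcuda import fft   # or scikits.cuda.fft in older installs

settings = {'vfile': 'example.uvfits',    # placeholder path
            'briggs': np.float32(1e7),    # robustness weighting parameter (placeholder)
            'imsize': np.int32(512),
            'cell': np.float32(1.0)}      # arcsec per pixel (placeholder)
nx = int(2 * settings['imsize'])
plan = fft.Plan((nx, nx), np.complex64, np.complex64)
dpsf, d_fim = cuda_gridvis(settings, plan)   # dirty beam (CPU), dirty image (GPU)
dirty_image = d_fim.get()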
def cuda_gridvis(self, plan, x_offset, y_offset): """ Grid the visibilities parallelized by pixel. References: - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy" by Thompson, Moran, & Swenson - Daniel Brigg's PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/ If the size of the image is 1024x1024, the plan should be at least 1024*1.414 (about 25 degrees' rotation) And to satisfy the requirements of CLEAN, the dirty image should be 1024* 2.828 """ logger.debug("Gridding the visibilities") t_start = time.time() nx = np.int32(2 * self.imsize) noff = np.int32((nx - self.imsize) / 2) arc2rad = np.float32(np.pi / 180. / 3600.) du = np.float32(1. / (arc2rad * self.cell)) / (self.imsize * 2.) logger.debug("1 Pixel DU = %f" % du) h_uu = np.float32(self.h_uu.ravel()) h_vv = np.float32(self.h_vv.ravel()) h_rere = np.float32(self.h_rere.ravel()) h_imim = np.float32(self.h_imim.ravel()) blen = 0 bl_order = np.ndarray(shape=(self.baseline_number, 2), dtype=int) good = [] if self.baseline_number == 780: # MUSER-I antennas = 40 else: antennas = 60 # print antennas for border1 in range(0, antennas - 1): for border2 in range(border1 + 1, antennas): bl_order[blen][0] = border1 bl_order[blen][1] = border2 blen = blen + 1 h_u = [] h_v = [] h_re = [] h_im = [] for blen in range(0, self.baseline_number): if (bl_order[blen][0] not in self.Flag_Ant) and (bl_order[blen][1] not in self.Flag_Ant): good.append(blen) h_u.append(h_uu[blen]) h_v.append(h_vv[blen]) h_re.append(h_rere[blen]) h_im.append(h_imim[blen]) gcount = np.int32(np.size(h_u)) # h_ : host, d_ : device # h_grd = np.zeros((nx, nx), dtype=np.complex64) # h_cnt = np.zeros((nx, nx), dtype=np.int32) d_u = gpu.to_gpu(np.array(h_u, dtype='float32')) d_v = gpu.to_gpu(np.array(h_v, dtype='float32')) d_re = gpu.to_gpu(np.array(h_re, dtype='float32')) d_im = gpu.to_gpu(np.array(h_im, dtype='float32')) d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32) d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64) d_ngrd = gpu.zeros_like(d_grd) d_bm = gpu.zeros_like(d_grd) d_nbm = gpu.zeros_like(d_grd) d_cbm = gpu.zeros_like(d_grd) d_fbm = gpu.zeros((np.int(nx), np.int(nx)), np.float32) d_fim = gpu.zeros((np.int(self.imsize), np.int(self.imsize)), np.float32) d_dim = gpu.zeros((np.int(self.imsize), np.int(self.imsize)), np.float32) d_sun_disk = gpu.zeros_like(d_grd) d_fdisk = gpu.zeros((np.int(self.imsize), np.int(self.imsize)), np.float32) ## define kernel parameters self.calc_gpu_thread(nx, self.imsize, gcount) width = 6. ngcf = 24. 
h_cgf = self.gcf(ngcf, width) ## make grid correction h_corr = self.corrfun(nx, width) d_cgf = self.module.get_global('cgf')[0] d_corr = gpu.to_gpu(h_corr) cu.memcpy_htod(d_cgf, h_cgf) # ------------------------ # grid it up # ------------------------ d_umax = gpu.max(cumath.fabs(d_u)) d_vmax = gpu.max(cumath.fabs(d_v)) umax = np.int32(np.ceil(d_umax.get() / du)) vmax = np.int32(np.ceil(d_vmax.get() / du)) self.gridVis_wBM_kernel(d_grd, d_bm, d_cbm, d_cnt, d_u, d_v, d_re, d_im, np.int32(nx), np.float32(du), np.int32(gcount), np.int32(umax), np.int32(vmax), np.int32(1 if self.correct_p_angle else 0), block=self.blocksize_2D, grid=self.gridsize_2D) ## apply weights self.wgtGrid_kernel(d_bm, d_cnt, self.briggs, nx, 0, block=self.blocksize_2D, grid=self.gridsize_2D) hfac = np.int32(1) self.dblGrid_kernel(d_bm, nx, hfac, block=self.blocksize_2D, grid=self.gridsize_2D) self.dblGrid_kernel(d_cbm, nx, hfac, block=self.blocksize_2D, grid=self.gridsize_2D) self.shiftGrid_kernel(d_bm, d_nbm, nx, block=self.blocksize_2D, grid=self.gridsize_2D) self.shiftGrid_kernel(d_cbm, d_bm, nx, block=self.blocksize_2D, grid=self.gridsize_2D) ## normalize self.wgtGrid_kernel(d_grd, d_cnt, self.briggs, nx, 0, block=self.blocksize_2D, grid=self.gridsize_2D) ## Reflect grid about v axis hfac = np.int32(-1) self.dblGrid_kernel(d_grd, nx, hfac, block=self.blocksize_2D, grid=self.gridsize_2D) ## Shift both self.shiftGrid_kernel(d_grd, d_ngrd, nx, block=self.blocksize_2D, grid=self.gridsize_2D) fft.fft(d_ngrd, d_grd, plan) ## Shift self.shiftGrid_kernel(d_grd, d_ngrd, nx, block=self.blocksize_2D, grid=self.gridsize_2D) ## Correct for C self.corrGrid_kernel(d_ngrd, d_corr, nx, block=self.blocksize_2D, grid=self.gridsize_2D) ## Trim self.trimIm_kernel(d_ngrd, d_dim, nx, self.imsize, block=self.blocksize_F2D, grid=self.gridsize_F2D) self.copyIm_kernel(d_ngrd, d_fbm, nx, block=self.blocksize_2D, grid=self.gridsize_2D) ## Normalize (Jy/beam)i # self.nrmGrid_kernel(d_dim, bmax1, self.imsize, block=self.blocksize_F2D, grid=self.gridsize_F2D) # self.nrmGrid_kernel(d_fbm, bmax2, nx, block=self.blocksize_2D, grid=self.gridsize_2D) ## Finish timers t_end = time.time() t_full = t_end - t_start logger.debug("Gridding execution time %0.5f" % t_full + ' s') logger.debug("\t%0.5f" % (t_full / gcount) + ' s per visibility') # ---------------------- ## Return dirty psf (CPU), dirty image (GPU) and sun disk return d_dim
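The baseline bookkeeping in the method above enumerates antenna pairs with nested border1/border2 loops. An equivalent construction, shown here only as a cross-check, uses itertools.combinations, which yields pairs in the same (0,1), (0,2), ..., (n-2, n-1) order; n*(n-1)/2 equals 780 for the 40-antenna case.

# Cross-check for the bl_order construction above (sketch, not the original code).
from itertools import combinations
import numpy as np

def baseline_order(n_antennas):
    # Pairs are generated in the same order as the nested loops above.
    return np.array(list(combinations(range(n_antennas), 2)), dtype=int)

assert baseline_order(40).shape == (780, 2)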
def normalize(array):
    array = asgpuarray(array)
    array -= gpuarray.min(array)
    array /= gpuarray.max(array)
    return array
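normalize above keeps the reduction results on the device. An alternative sketch, assuming a plain float-typed GPUArray input, pulls the two scalars back to the host first, at the cost of two small transfers; the degenerate case max == min is not handled.

# Min-max normalization with the reductions pulled back to the host (sketch).
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

def normalize_host_scalars(a_gpu):
    lo = float(gpuarray.min(a_gpu).get())
    hi = float(gpuarray.max(a_gpu).get())
    return (a_gpu - lo) / (hi - lo)

a = gpuarray.to_gpu(np.array([1., 2., 5.], dtype=np.float32))
print(normalize_host_scalars(a).get())   # [0.   0.25 1.  ]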
def max(self):
    t = gpuarray.max(self._dat[0:self.npart_local:, :])
    return t.get()
def pinv(a_gpu, rcond=1e-15): """ Moore-Penrose pseudoinverse. Compute the Moore-Penrose pseudoinverse of the specified matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)`. rcond : float Singular values smaller than `rcond`*max(singular_values)` are set to zero. Returns ------- a_inv_gpu : pycuda.gpuarray.GPUArray Pseudoinverse of input matrix. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. If the input matrix is square, the pseudoinverse uses less memory. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(8, 4), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> a_inv_gpu = linalg.pinv(a_gpu) >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4) True >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64) >>> b_gpu = gpuarray.to_gpu(b) >>> b_inv_gpu = linalg.pinv(b_gpu) >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4) True """ if not _has_cula: raise NotImplementedError('CULA not installed') # Perform in-place SVD if the matrix is square to save memory: if a_gpu.shape[0] == a_gpu.shape[1]: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o') else: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's') # Suppress very small singular values: cutoff_gpu = gpuarray.max(s_gpu) * rcond ctype = tools.dtype_to_ctype(s_gpu.dtype) cutoff_func = el.ElementwiseKernel( "{ctype} *s, {ctype} *cutoff".format(ctype=ctype), "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}") cutoff_func(s_gpu, cutoff_gpu) # Compute the pseudoinverse without allocating a new diagonal matrix: return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
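The cutoff step in pinv above maps each singular value s to 1/s when s exceeds rcond * max(s) and to 0 otherwise. A host-side NumPy sketch of the same rule, useful for checking the elementwise kernel (names here are hypothetical, not part of the original module):

# Host-side reference for the singular-value cutoff used in pinv above.
import numpy as np

def invert_with_cutoff(s, rcond=1e-15):
    cutoff = rcond * s.max()
    out = np.zeros_like(s)
    keep = s > cutoff
    out[keep] = 1.0 / s[keep]
    return out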
def normalize(data):
    maxVal = gpuarray.max(data).get()
    linearDouble(1. / maxVal, np.float64(0.), data, data)
def cuda_gridvis(settings, plan): """ Grid the visibilities parallelized by pixel. References: - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy" by Thompson, Moran, & Swenson - Daniel Brigg's PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/ """ print "Gridding the visibilities" t_start = time.time() # unpack parameters vfile = settings['vfile'] briggs = settings['briggs'] imsize = settings['imsize'] cell = settings['cell'] nx = np.int32(2 * imsize) noff = np.int32((nx - imsize) / 2) ## constants arc2rad = np.float32(np.pi / 180 / 3600.) du = np.float32(1. / (arc2rad * cell * nx)) ## grab data f = pyfits.open(settings['vfile']) # determin the file type (uvfits or fitsidi) if settings['vfile'].find('.fitsidi') != -1: ## quickly figure out what data is not flagged freq = 3.45E11 #np.float32(f[7].header['CRVAL3']) 299792458vvvv #good = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0) #h_u = np.float32(freq*f[0].data.par('uu')[good]) #h_v = np.float32(freq*f[0].data.par('vv')[good]) light_speed = 299792458. # Speed of light h_u = np.ndarray(shape=(780, 1),dtype='float64') h_v = np.ndarray(shape=(780, 1),dtype='float64') h_re = np.ndarray(shape=(780, 1),dtype='float32') h_im = np.ndarray(shape=(780, 1),dtype='float32') h_u = np.float64(light_speed * f[0].data[:].UU) h_v = np.float64(light_speed * f[0].data[:].VV) for bl in range(0, 780): #gcount += np.int32(np.size(h_u[bl])) ## assume data is unpolarized #h_re = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,0]+f[0].data.data[good,0,0,0,0,1,0])) #h_im = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,1]+f[0].data.data[good,0,0,0,0,1,1])) h_re[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][0]) h_im[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][1]) ## make GPU arrays h_u = np.float32(h_u.ravel()) h_v = np.float32(h_v.ravel()) gcount = np.int32(np.size(h_u)) #gcount = len(gcount.ravel()) h_re = np.float32(h_re.ravel()) h_im = np.float32(h_im.ravel()) print len(h_re),len(h_im) elif settings['vfile'].find('.uvfits') != -1: freq = 3.45E11 #np.float32(f[0].header['CRVAL4']) light_speed = 299792458. good = np.where(f[0].data.data[:, 0, 0, 0, 0, 0] != 0) h_u = np.float32(light_speed * f[0].data.par('uu')[good]) h_v = np.float32(light_speed * f[0].data.par('vv')[good]) gcount = np.int32(np.size(h_u)) ## assume data is unpolarized h_re = np.float32(f[0].data.data[good, 0, 0, 0, 0, 0]) h_im = np.float32(f[0].data.data[good, 0, 0, 0, 0, 1]) print h_u # h_ : host, d_ : device h_grd = np.zeros((nx, nx), dtype=np.complex64) h_cnt = np.zeros((nx, nx), dtype=np.int32) d_u = gpu.to_gpu(h_u) d_v = gpu.to_gpu(h_v) d_re = gpu.to_gpu(h_re) d_im = gpu.to_gpu(h_im) d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32) d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64) d_ngrd = gpu.zeros_like(d_grd) d_bm = gpu.zeros_like(d_grd) d_nbm = gpu.zeros_like(d_grd) d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32) ## define kernel parameters blocksize2D = (8, 16, 1) gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1]))) blocksizeF2D = (16, 16, 1) gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1]))) blocksize1D = (256, 1, 1) gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1) # ------------------------ # make gridding kernels # ------------------------ ## make spheroidal convolution kernel (don't mess with these!) width = 6. ngcf = 24. 
h_cgf = gcf(ngcf, width) ## make grid correction h_corr = corrfun(nx, width) d_cgf = module.get_global('cgf')[0] d_corr = gpu.to_gpu(h_corr) cu.memcpy_htod(d_cgf, h_cgf) # ------------------------ # grid it up # ------------------------ d_umax = gpu.max(cumath.fabs(d_u)) d_vmax = gpu.max(cumath.fabs(d_v)) umax = np.int32(np.ceil(d_umax.get() / du)) vmax = np.int32(np.ceil(d_vmax.get() / du)) ## grid ($$) # This should be improvable via: # - shared memory solution? I tried... # - better coalesced memory access? I tried... # - reorganzing and indexing UV data beforehand? # (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf) # - storing V(u,v) in texture memory? gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \ block=blocksize2D, grid=gridsize2D) ## apply weights wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D) hfac = np.int32(1) dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D) shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D) ## normalize wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D) ## Reflect grid about v axis hfac = np.int32(-1) dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D) ## Shift both shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D) # ------------------------ # Make the beam # ------------------------ ## Transform to image plane fft.fft(d_nbm, d_bm, plan) ## Shift shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D) ## Correct for C corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D) # Trim trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Normalize d_bmax = gpu.max(d_fim) bmax = d_bmax.get() bmax = np.float32(1. / bmax) nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Pull onto CPU dpsf = d_fim.get() # ------------------------ # Make the map # ------------------------ ## Transform to image plane fft.fft(d_ngrd, d_grd, plan) ## Shift shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D) ## Correct for C corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D) ## Trim trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Normalize (Jy/beam) nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D) ## Finish timers t_end = time.time() t_full = t_end - t_start print "Gridding execution time %0.5f" % t_full + ' s' print "\t%0.5f" % (t_full / gcount) + ' s per visibility' ## Return dirty psf (CPU) and dirty image (GPU) return dpsf, d_fim
def pinv(a_gpu, rcond=1e-15): """ Moore-Penrose pseudoinverse. Compute the Moore-Penrose pseudoinverse of the specified matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)`. rcond : float Singular values smaller than `rcond`*max(singular_values)` are set to zero. Returns ------- a_inv_gpu : pycuda.gpuarray.GPUArray Pseudoinverse of input matrix. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. If the input matrix is square, the pseudoinverse uses less memory. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(8, 4), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> a_inv_gpu = linalg.pinv(a_gpu) >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4) True >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64) >>> b_gpu = gpuarray.to_gpu(b) >>> b_inv_gpu = linalg.pinv(b_gpu) >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4) True """ if not _has_cula: raise NotImplementedError('CULA not installed') # Perform in-place SVD if the matrix is square to save memory: if a_gpu.shape[0] == a_gpu.shape[1]: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o') else: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's') # Get block/grid sizes; the number of threads per block is limited # to 512 because the cutoff_invert_s kernel defined above uses too # many registers to be invoked in 1024 threads per block (i.e., on # GPUs with compute capability >= 2.x): dev = misc.get_current_device() max_threads_per_block = 512 block_dim, grid_dim = misc.select_block_grid_sizes(dev, s_gpu.shape, max_threads_per_block) # Suppress very small singular values: use_double = 1 if s_gpu.dtype == np.float64 else 0 cutoff_invert_s_mod = \ SourceModule(cutoff_invert_s_template.substitute(use_double=use_double)) cutoff_invert_s = \ cutoff_invert_s_mod.get_function('cutoff_invert_s') cutoff_gpu = gpuarray.max(s_gpu) * rcond cutoff_invert_s(s_gpu, cutoff_gpu, np.uint32(s_gpu.size), block=block_dim, grid=grid_dim) # Compute the pseudoinverse without allocating a new diagonal matrix: return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
def calculateTimestep(PropSpeedsGPU, cellDim):
    maxPropSpeed = gpuarray.max(cumath.fabs(PropSpeedsGPU)).get()
    return cellDim / (4.0 * maxPropSpeed), maxPropSpeed
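calculateTimestep above is a CFL-style restriction: dt = cellDim / (4 * max|propagation speed|), with the maximum taken over the absolute speeds entirely on the GPU. A minimal sketch with synthetic speeds (illustrative values only):

# CFL-style timestep from the largest absolute propagation speed (sketch).
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

speeds = gpuarray.to_gpu(np.array([-3.0, 1.5, 2.0], dtype=np.float32))
cell_dim = 0.1
max_speed = float(gpuarray.max(cumath.fabs(speeds)).get())
dt = cell_dim / (4.0 * max_speed)   # 0.1 / 12.0, about 0.00833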
def max(self):
    # gpuarray.max reduces on the device; .get() copies the result to the host
    return gpuarray.max(self.arr).get().max()
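For all of the wrappers above, gpuarray.max reduces on the device and .get() returns a zero-dimensional (or length-1, depending on PyCUDA version) NumPy array, so float(...) or the trailing .max() used in the last snippet yields a plain scalar. A quick consistency check against NumPy:

# Sanity check: the device-side reduction agrees with numpy.max.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

x = np.random.rand(1000).astype(np.float32)
x_gpu = gpuarray.to_gpu(x)
assert np.isclose(float(gpuarray.max(x_gpu).get()), x.max())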