def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{ """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*, and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude vector; else, the magnitude vector will be computed (on the GPU) from the count matrix. Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles. """ # Set up lingo and count matrices on device #{{{ if self.usePycudaArray: # Set up using PyCUDA CUDAArray support self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C') self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C') self.gpu.tex2lr.set_array(self.gpu.rsmiles) self.gpu.tex2cr.set_array(self.gpu.rcounts) else: # Manually handle setup temprlmat = self._padded_array(refsmilesmat) if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768: raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape) self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes) cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream) temprcmat = self._padded_array(refcountsmat) self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes) cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream) descriptor = cuda.ArrayDescriptor() descriptor.width = temprcmat.shape[1] descriptor.height = temprcmat.shape[0] descriptor.format = cuda.array_format.UNSIGNED_INT32 descriptor.num_channels = 1 self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0]) self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0]) self.gpu.stream.synchronize() del temprlmat del temprcmat #}}} self.rlengths = reflengths self.rshape = refsmilesmat.shape self.nref = refsmilesmat.shape[0] # Copy reference lengths to GPU self.gpu.rl_gpu = cuda.to_device(reflengths) # Allocate buffers for query set magnitudes self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes) if refmags is not None: cuda.memcpy_htod(self.gpu.rmag_gpu,refmags) else: # Calculate query set magnitudes on GPU magthreads = 256 self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr]) return
def initData(fn=None): global pixels, array, pbo_buffer, cuda_pbo_resource, imWidth, imHeight, texid # Cuda array initialization array = cuda_driver.matrix_to_array(pixels, "C") # C-style instead of Fortran-style: row-major pixels.fill(0) # Resetting the array to 0 pbo_buffer = glGenBuffers(1) # generate 1 buffer reference glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth*imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size assert(bsize == imWidth*imHeight) glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind if ver2011: cuda_pbo_resource = pycuda.gl.RegisteredBuffer(int(pbo_buffer), cuda_gl.graphics_map_flags.WRITE_DISCARD) else: cuda_pbo_resource = cuda_gl.BufferObject(int(pbo_buffer)) # Mapping GLBuffer to cuda_resource glGenTextures(1, texid); # generate 1 texture reference glBindTexture(GL_TEXTURE_2D, texid); # binding to this texture glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); # Allocate the texture glBindTexture(GL_TEXTURE_2D, 0) # Unbind glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment
def prepare_matrix(self, matrix): plan = self.plan given = plan.given assert matrix.shape == ( plan.image_dofs_per_el, plan.preimage_dofs_per_el) return cuda.matrix_to_array(matrix.astype(given.float_type), "F", allow_double_hack=True)
def __init__(self, lens_file=None, lens_psf_size=None, lens_grid_size=None): if lens_file: self.lens = True grid = scipy.misc.imread(lens_file, flatten=True) if np.max(grid) > 255: grid /= 2**(16-1) else: grid /= 255 self.grid_gpu = cu.matrix_to_array(grid, 'C') self.lens_psf_size = lens_psf_size self.lens_grid_size = lens_grid_size else: self.lens = False
def set_mask(self, mask): self.debug(3, "Setting the mask") assert mask.shape == (self.h, self.w), \ "Got a {} mask in a {} routine.".format(mask.shape, (self.h, self.w)) if not mask.dtype == np.float32: self.debug(2, "Converting the mask to float32") mask = mask.astype(np.float32) if isinstance(mask, np.ndarray): self.maskArray = cuda.matrix_to_array(mask, 'C') elif isinstance(mask, gpuarray.GPUArray): self.maskArray = cuda.gpuarray_to_array(mask, 'C') else: self.debug(0, "Error! Mask data type not understood") raise ValueError self.texMask.set_array(self.maskArray)
def setImage(self,img_d): """ Set the image to compare with the original """ assert img_d.shape == (self.w,self.h),"Got a {} image in a {} correlation routine!".format(img_d.shape,(self.w,self.h)) if isinstance(img_d,np.ndarray): self.debug(3,"Creating texture from numpy array") self.array_d = cuda.matrix_to_array(img_d,"C") elif isinstance(img_d,gpuarray.GPUArray): self.debug(3,"Creating texture from gpuarray") self.array_d = cuda.gpuarray_to_array(img_d,"C") else: print("[Error] Unknown type of data given to CorrelStage.setImage()") raise ValueError self.tex_d.set_array(self.array_d) self.devX.set(np.zeros(self.Nfields,dtype=np.float32))
def set_image(self, img_d): """ Set the image to compare with the original Note that calling this method is not necessary: you can do .compute(image) This will automatically call this method first """ assert img_d.shape == (self.h, self.w), \ "Got a {} image in a {} correlation routine!".format( img_d.shape, (self.h, self.w)) if isinstance(img_d, np.ndarray): self.debug(3, "Creating texture from numpy array") self.array_d = cuda.matrix_to_array(img_d, "C") elif isinstance(img_d, gpuarray.GPUArray): self.debug(3, "Creating texture from gpuarray") self.array_d = cuda.gpuarray_to_array(img_d, "C") else: self.debug(0, "Error ! Unknown type of data given to .set_image()") raise ValueError self.tex_d.set_array(self.array_d) self.devX.set(np.zeros(self.fields_count, dtype=np.float32))
def initData(fn=None): global pixels, array, pbo_buffer, cuda_pbo_resource, imWidth, imHeight, texid # Cuda array initialization array = cuda_driver.matrix_to_array( pixels, "C") # C-style instead of Fortran-style: row-major pixels.fill(0) # Resetting the array to 0 pbo_buffer = glGenBuffers(1) # generate 1 buffer reference glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth * imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer bsize = glGetBufferParameteriv( GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size assert (bsize == imWidth * imHeight) glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind if ver2011: cuda_pbo_resource = pycuda.gl.RegisteredBuffer( int(pbo_buffer), cuda_gl.graphics_map_flags.WRITE_DISCARD) else: cuda_pbo_resource = cuda_gl.BufferObject( int(pbo_buffer)) # Mapping GLBuffer to cuda_resource glGenTextures(1, texid) # generate 1 texture reference glBindTexture(GL_TEXTURE_2D, texid) # binding to this texture glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None) # Allocate the texture glBindTexture(GL_TEXTURE_2D, 0) # Unbind glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment
def set_params(self, psf_size, grid_size, im_size, params=None): # generate grid psf_size = np.array(psf_size) grid_size = np.array(grid_size) im_size = np.array(im_size) self.psf_size = psf_size + (1 - np.mod(psf_size, 2)) self.grid_size = grid_size self.im_size = im_size if params != None: self.params = params self.shape = (params.size / 3,) else: self._psf2params() if not self.lens: grid = np.zeros(self.psf_size, dtype=np.float32) grid[(self.psf_size[0]-1)/2, (self.psf_size[1]-1)/2] = 1. grid = np.tile(grid, self.grid_size) self.lens_psf_size = self.psf_size #lens_grid_size = (1,1) self.lens_grid_size = self.grid_size self.grid_gpu = cu.matrix_to_array(grid, 'C') params_count = np.uint32(self.params.size / 3) params_gpu = cu.matrix_to_array(self.params.astype(np.float32), 'C') #self.output_size = np.array(self.grid_size)*np.array(self.psf_size) output_size = np.array((np.prod(self.grid_size), self.psf_size[0], self.psf_size[1])) preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype) mod = SourceModule(preproc + basis_code, keep=True) in_tex = mod.get_texref('in_tex') in_tex.set_array(self.grid_gpu) in_tex.set_filter_mode(cu.filter_mode.LINEAR) #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES) params_tex = mod.get_texref('params_tex') params_tex.set_array(params_gpu) offset = ((np.array(self.im_size) - np.array(grid.shape)) / np.array(self.grid_size).astype(np.float32)) offset = np.float32(offset) grid_scale = ((np.array(self.lens_grid_size) - 1) / (np.array(self.grid_size) - 1).astype(np.float32)) grid_scale = np.float32(grid_scale) block_size = (16,16,1) gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])), int(np.ceil(float(params_count)/block_size[1]))) basis_gpu = cua.empty((int(params_count), int(output_size[0]), int(output_size[1]), int(output_size[2])), np.float32) #self.basis_host = cu.pagelocked_empty((int(params_count), # int(output_size[0]), int(output_size[1]), int(output_size[2])), # np.float32, mem_flags=cu.host_alloc_flags.DEVICEMAP) basis_fun_gpu = mod.get_function("basis") basis_fun_gpu(basis_gpu.gpudata, np.uint32(np.prod(output_size)), np.uint32(self.grid_size[1]), np.uint32(self.psf_size[0]), np.uint32(self.psf_size[1]), np.uint32(self.im_size[0]), np.uint32(self.im_size[1]), offset[0], offset[1], grid_scale[0], grid_scale[1], np.uint32(self.lens_psf_size[0]), np.uint32(self.lens_psf_size[1]), params_count, block=block_size, grid=gpu_grid_size) self.basis_host = basis_gpu.get() self._intern_shape = self.basis_host.shape self.basis_host = self.basis_host.reshape((self._intern_shape[0], self._intern_shape[1]*self._intern_shape[2]*self._intern_shape[3])) self.basis_host = scipy.sparse.csr_matrix(self.basis_host)
def weighted_basis_gpu(psf_size, grid_size, im_size, params, lens_file=None, lens_psf_size=None, lens_grid_size=None): # generate grid psf_size = psf_size + (1 - np.mod(psf_size, 2)) if lens_file: grid = scipy.misc.imread(lens_file, flatten=True) if np.max(grid) > 255: grid /= 2**(16-1) else: grid /= 255 else: grid = np.zeros(psf_size, dtype=np.float32) grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1. grid = np.tile(grid, grid_size) lens_psf_size = psf_size #lens_grid_size = (1,1) lens_grid_size = grid_size grid_gpu = cu.matrix_to_array(grid, 'C') params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C') block_size = (16,16,1) output_size = np.array(grid_size)*np.array(psf_size) gpu_grid_size = (int(np.ceil(float(output_size[1])/block_size[0])), int(np.ceil(float(output_size[0])/block_size[1]))) weighted_basis_gpu = cua.empty((int(output_size[0]), int(output_size[1])), np.float32) preproc = '' #_generate_preproc(basis_gpu.dtype) mod = SourceModule(preproc + basis_code, keep=True) basis_fun_gpu = mod.get_function("weighted_basis") in_tex = mod.get_texref('in_tex') in_tex.set_array(grid_gpu) in_tex.set_filter_mode(cu.filter_mode.LINEAR) #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES) params_tex = mod.get_texref('params_tex') params_tex.set_array(params_gpu) offset = ((np.array(im_size) - np.array(grid.shape)) / np.array(grid_size).astype(np.float32)) offset = np.float32(offset) grid_scale = ((np.array(lens_grid_size) - 1) / (np.array(grid_size) - 1).astype(np.float32)) grid_scale = np.float32(grid_scale) #psf_scale = ((np.array(lens_psf_size) - 1) / # (np.array(psf_size) - 1).astype(np.float32)) #psf_scale = np.float32(psf_scale) basis_fun_gpu(weighted_basis_gpu.gpudata, np.uint32(output_size[0]), np.uint32(output_size[1]), np.uint32(psf_size[0]), np.uint32(psf_size[1]), np.uint32(im_size[0]), np.uint32(im_size[1]), offset[0], offset[1], grid_scale[0], grid_scale[1], np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]), np.uint32(params.size/3), block=block_size, grid=gpu_grid_size) return weighted_basis_gpu
def init_stitch(N): """outputs the high resolution k-box, and the smoothed r box Input ----------- N: int32 size of box to load onto the GPU, should be related to DIM by powers of 2 """ if N is None: N = np.int32(HII_DIM) #prepare for stitching META_GRID_SIZE = DIM/N M = np.int32(HII_DIM/META_GRID_SIZE) #HII_DIM = np.int32(HII_DIM) f_pixel_factor = DIM/HII_DIM; scale = np.float32(BOX_LEN/DIM) print 'scale', scale HII_scale = np.float32(BOX_LEN/HII_DIM) shape = (DIM,DIM,N) stitch_grid_size = (DIM/(block_size[0]), DIM/(block_size[0]), N/(block_size[0])) HII_stitch_grid_size = (HII_DIM/(block_size[0]), HII_DIM/(block_size[0]), M/(block_size[0])) #ratio of large box to small size kernel_source = open(cmd_folder+"/initialize_stitch.cu").read() kernel_code = kernel_source % { 'DELTAK': DELTA_K, 'DIM': DIM, 'VOLUME': VOLUME, 'META_BLOCKDIM': N } main_module = nvcc.SourceModule(kernel_code) init_stitch = main_module.get_function("init_kernel") HII_filter = main_module.get_function("HII_filter") subsample_kernel = main_module.get_function("subsample") velocity_kernel = main_module.get_function("set_velocity") pspec_texture = main_module.get_texref("pspec") MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0) plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64) plan1d = Plan((np.int64(DIM)), dtype=np.complex64) print "init pspec" interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array interp_cu = cuda.matrix_to_array(interpPspec, order='F') cuda.bind_array_to_texref(interp_cu, pspec_texture) #hbox_large = pyfftw.empty_aligned((DIM, DIM, DIM), dtype='complex64') hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64) #hbox_small = np.zeros(HII_shape, dtype=np.float32) #hbox_large = n smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) # Set up pinned memory for transfer #largebox_hs = cuda.aligned_empty(shape=shape, dtype=np.float32, alignment=resource.getpagesize()) largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32) largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64) largebox_d = gpuarray.zeros(shape, dtype=np.float32) largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32) print "init boxes" for meta_z in xrange(META_GRID_SIZE): # MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=meta_x*N**3) init_stitch(largebox_d, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size) init_stitch(largebox_d_imag, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size) largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag cuda.memcpy_dtoh_async(largecbox_pin, largebox_d) hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin.copy() #if want to get velocity need to use this if True: print "saving kbox" np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large) print "Executing FFT on device" #hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real print hbox_large.dtype print "Finished FFT on device" np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large) if True: print "loading kbox" hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN)) for meta_z in xrange(META_GRID_SIZE): largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy() #cuda.memcpy_htod_async(largebox_d, largebox_pin) largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size); hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async() #import IPython; IPython.embed() print "Executing FFT on host" #hbox_large = hifft(hbox_large).astype(np.complex64).real #hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real print "Finished FFT on host" #import IPython; IPython.embed() # for meta_x in xrange(META_GRID_SIZE): # for meta_y in xrange(META_GRID_SIZE): # for meta_z in xrange(META_GRID_SIZE): # largebox_d = gpuarray.to_gpu(hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N]) # HII_filter(largebox_d, N, np.int32(meta_x), np.int32(meta_y), np.int32(meta_z), ZERO, smoothR, block=block_size, grid=grid_size); # hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N] = largebox_d.get() #plan = Plan(shape, dtype=np.complex64) #plan.execute(largebox_d, inverse=True) #FFT to real space of smoothed box #largebox_d /= VOLUME #divide by VOLUME if using fft (vs ifft) # This saves a large resolution deltax print "downsampling" smallbox_d = gpuarray.zeros((HII_DIM,HII_DIM,M), dtype=np.float32) for meta_z in xrange(META_GRID_SIZE): largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy() cuda.memcpy_dtoh_async(largecbox_pin, largebox_d) #largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) largebox_d /= scale**3 # subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) #subsample in real space hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async() np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), hbox_small) #import IPython; IPython.embed() # To get velocities: reload the k-space box hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN)) hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.float32) hvbox_small = np.zeros(HII_shape, dtype=np.float32) smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) largevbox_d = gpuarray.zeros((DIM,DIM,N), dtype=np.complex64) smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32) for num, mode in enumerate(['x', 'y', 'z']): for meta_z in xrange(META_GRID_SIZE): largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) #largebox_d /= VOLUME #divide by VOLUME if using fft (vs ifft) velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num), block=block_size, grid=stitch_grid_size) HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=stitch_grid_size) print hvbox_large.shape, largevbox_d.shape hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async() hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real for meta_z in xrange(META_GRID_SIZE): largevbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) subsample_kernel(largevbox_d.real, smallvbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async() np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallvbox_d.get()) return
def generate_basis_gpu(psf_size, grid_size, im_size, params, lens_file=None, lens_psf_size=None, lens_grid_size=None): # generate grid psf_size = psf_size + (1 - np.mod(psf_size, 2)) if lens_file: grid = scipy.misc.imread(lens_file, flatten=True) if np.max(grid) > 255: grid /= 2**(16-1) else: grid /= 255 else: grid = np.zeros(psf_size, dtype=np.float32) grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1. grid = np.tile(grid, grid_size) lens_psf_size = psf_size #lens_grid_size = (1,1) lens_grid_size = grid_size grid_gpu = cu.matrix_to_array(grid, 'C') # generate parameters of basis functions #p = max(1, np.floor(psf_size[0] / 2)) #p = min(8, p) #dp = min(45. / np.floor(psf_size * grid_size / 2)) #dp = min(0.8, dp) #dp = np.radians(dp) #p = np.radians(p) #l = max(1, np.floor(psf_size[0] / 2)) #l = np.ceil(l / 2) #params = np.mgrid[-l:l+1, -l:l+1, -p:p+dp/10:dp].astype(np.float32).T #params = params.reshape(params.size / 3, 3) params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C') block_size = (16,16,1) output_size = np.array((np.prod(np.array(grid_size)),psf_size[0],psf_size[1])) gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])), int(np.ceil(float(params.size/3)/block_size[1]))) basis_gpu = cua.empty((params.size/3, int(output_size[0]), int(output_size[1]), int(output_size[2])), np.float32) preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype) mod = SourceModule(preproc + basis_code, keep=True) basis_fun_gpu = mod.get_function("basis") in_tex = mod.get_texref('in_tex') in_tex.set_array(grid_gpu) in_tex.set_filter_mode(cu.filter_mode.LINEAR) #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES) params_tex = mod.get_texref('params_tex') params_tex.set_array(params_gpu) offset = ((np.array(im_size) - np.array(grid.shape)) / np.array(grid_size).astype(np.float32)) offset = np.float32(offset) grid_scale = ((np.array(lens_grid_size) - 1) / (np.array(grid_size) - 1).astype(np.float32)) grid_scale = np.float32(grid_scale) #psf_scale = ((np.array(lens_psf_size) - 1) / # (np.array(psf_size) - 1).astype(np.float32)) #psf_scale = np.float32(psf_scale) basis_fun_gpu(basis_gpu.gpudata, np.uint32(np.prod(output_size)), np.uint32(grid_size[1]), np.uint32(psf_size[0]), np.uint32(psf_size[1]), np.uint32(im_size[0]), np.uint32(im_size[1]), offset[0], offset[1], grid_scale[0], grid_scale[1], np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]), np.uint32(params.size/3), block=block_size, grid=gpu_grid_size) return basis_gpu
def watershed(I): # Get contiguous image + shape. height, width = I.shape I = np.float32(I.copy()) # Get block/grid size for steps 1-3. block_size = (6,6,1) grid_size = (width/(block_size[0]-2), height/(block_size[0]-2)) # Get block/grid size for step 4. block_size2 = (16,16,1) grid_size2 = (width/(block_size2[0]-2), height/(block_size2[0]-2)) # Initialize variables. labeled = np.zeros([height,width]) labeled = np.float32(labeled) width = np.int32(width) height = np.int32(height) count = np.int32([0]) # Transfer labels asynchronously. labeled_d = gpu.to_gpu_async(labeled) counter_d = gpu.to_gpu_async(count) # Bind CUDA textures. I_cu = cu.matrix_to_array(I, order='C') cu.bind_array_to_texref(I_cu, image_texture) # Step 1. descent_kernel(labeled_d, width, height, block=block_size, grid=grid_size) start_time = cu.Event() end_time = cu.Event() start_time.record() # Step 2. increment_kernel(labeled_d,width,height, block=block_size2,grid=grid_size2) counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new minima_kernel(labeled_d, counters_d, width, height, block=block_size, grid=grid_size) new = counters_d.get()[0] # Step 3. counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new plateau_kernel(labeled_d, counters_d, width, height, block=block_size, grid=grid_size) new = counters_d.get()[0] # Step 4 counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new flood_kernel(labeled_d, counters_d, width, height, block=block_size2, grid=grid_size2) new = counters_d.get()[0] result = labeled_d.get() # End GPU timers. end_time.record() end_time.synchronize() gpu_time = start_time.\ time_till(end_time) * 1e-3 # print str(gpu_time) return result
def watershed(I): # Get contiguous image + shape. height, width = I.shape I = np.float32(I.copy()) # Get block/grid size for steps 1-3. block_size = (6, 6, 1) grid_size = (width / (block_size[0] - 2), height / (block_size[0] - 2)) # Get block/grid size for step 4. block_size2 = (16, 16, 1) grid_size2 = (width / (block_size2[0] - 2), height / (block_size2[0] - 2)) # Initialize variables. labeled = np.zeros([height, width]) labeled = np.float32(labeled) width = np.int32(width) height = np.int32(height) count = np.int32([0]) # Transfer labels asynchronously. labeled_d = gpu.to_gpu_async(labeled) counter_d = gpu.to_gpu_async(count) # Bind CUDA textures. I_cu = cu.matrix_to_array(I, order='C') cu.bind_array_to_texref(I_cu, image_texture) # Step 1. descent_kernel(labeled_d, width, height, block=block_size, grid=grid_size) start_time = cu.Event() end_time = cu.Event() start_time.record() # Step 2. increment_kernel(labeled_d, width, height, block=block_size2, grid=grid_size2) counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new minima_kernel(labeled_d, counters_d, width, height, block=block_size, grid=grid_size) new = counters_d.get()[0] # Step 3. counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new plateau_kernel(labeled_d, counters_d, width, height, block=block_size, grid=grid_size) new = counters_d.get()[0] # Step 4 counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new flood_kernel(labeled_d, counters_d, width, height, block=block_size2, grid=grid_size2) new = counters_d.get()[0] result = labeled_d.get() # End GPU timers. end_time.record() end_time.synchronize() gpu_time = start_time.\ time_till(end_time) * 1e-3 # print str(gpu_time) return result
__global__ void interp(float* out, float ox, float oy, int w, int h) { int idx = threadIdx.x+blockIdx.x*blockDim.x; int idy = threadIdx.y+blockIdx.y*blockDim.y; out[idy*w+idx] = tex2D(tex,(idx-ox)/w,(idy-oy)/h); } """ mod = SourceModule(src_mod) interp=mod.get_function("interp") h,w = img.shape print("w={}, h={}".format(w,h)) devArray = cuda.matrix_to_array(img,"C") tex=mod.get_texref('tex') tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) tex.set_filter_mode(cuda.filter_mode.LINEAR) tex.set_address_mode(0,cuda.address_mode.CLAMP) tex.set_address_mode(1,cuda.address_mode.CLAMP) tex.set_array(devArray) arg_types = "Pffii" devOut = gpuarray.GPUArray(img.shape,np.float32)
def init(): """outputs the high resolution k-box, and the smoothed r box""" N = np.int32(DIM) #prepare for stitching #HII_DIM = np.int32(HII_DIM) f_pixel_factor = DIM/HII_DIM; scale = np.float32(BOX_LEN)/DIM HII_scale = np.float32(BOX_LEN)/HII_DIM shape = (N,N,N) MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0) kernel_source = open(cmd_folder+"/initialize.cu").read() kernel_code = kernel_source % { 'DELTAK': DELTA_K, 'VOLUME': VOLUME, 'DIM': DIM } main_module = nvcc.SourceModule(kernel_code) init_kernel = main_module.get_function("init_kernel") HII_filter = main_module.get_function("HII_filter") adj_complex_conj = main_module.get_function("adj_complex_conj") subsample_kernel = main_module.get_function("subsample") velocity_kernel = main_module.get_function("set_velocity") pspec_texture = main_module.get_texref("pspec") interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array interp_cu = cuda.matrix_to_array(interpPspec, order='F') cuda.bind_array_to_texref(interp_cu, pspec_texture) largebox_d = gpuarray.zeros(shape, dtype=np.float32) init_kernel(largebox_d, np.int32(DIM), block=block_size, grid=grid_size) #import IPython; IPython.embed() largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32) init_kernel(largebox_d_imag, np.int32(DIM), block=block_size, grid=grid_size) largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag #adj_complex_conj(largebox_d, DIM, block=block_size, grid=grid_size) largebox = largebox_d.get() #np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox) #save real space box before smoothing plan = Plan(shape, dtype=np.complex64) plan.execute(largebox_d, inverse=True) #FFT to real space of smoothed box largebox_d /= scale**3 np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox_d.real.get_async()) #save real space box after smoothing and subsampling # host largebox is still in k space, no need to reload from disk largebox_d = gpuarray.to_gpu(largebox) smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) HII_filter(largebox_d, N, ZERO, smoothR, block=block_size, grid=grid_size); plan.execute(largebox_d, inverse=True) #FFT to real space of smoothed box largebox_d /= scale**3 smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32) subsample_kernel(largebox_d.real, smallbox_d, N, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_grid_size) #subsample in real space np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), smallbox_d.get_async()) # reload the k-space box for velocity boxes largebox_d = gpuarray.to_gpu(largebox) #largebox_d /= VOLUME #divide by VOLUME if using fft (vs ifft) smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) largevbox_d = gpuarray.zeros((DIM,DIM,DIM), dtype=np.complex64) smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32) for num, mode in enumerate(['x', 'y', 'z']): velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(num), block=block_size, grid=grid_size) HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=grid_size) plan.execute(largevbox_d, inverse=True) largevbox_d /= scale**3 #import IPython; IPython.embed() subsample_kernel(largevbox_d.real, smallbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_grid_size) np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallbox_d.get()) return
def set_qsmiles(self,qsmilesmat,qcountsmat,querylengths,querymags=None): #{{{ """Sets the reference SMILES set to use Lingo matrix *qsmilesmat*, count matrix *qcountsmat*, and length vector *querylengths*. If *querymags* is provided, it will be used as the magnitude vector; else, the magnitude vector will be computed (on the GPU) from the count matrix. Because of hardware limitations, the query matrices (*qsmilesmat* and *qcountsmat*) must have no more than 65,536 rows (molecules) and 32,768 columns (Lingos). Larger computations must be performed in tiles. """ # Set up lingo and count matrices on device #{{{ if self.usePycudaArray: # Create CUDAarrays for lingo and count matrices print "Strides qsmilesmat:",numpy.ascontiguousarray(qsmilesmat.T).strides self.gpu.qsmiles = cuda.matrix_to_array(numpy.ascontiguousarray(qsmilesmat.T),order='C') self.gpu.qcounts= cuda.matrix_to_array(numpy.ascontiguousarray(qcountsmat.T),order='C') print "qsmiles descriptor",dtos(self.gpu.qsmiles.get_descriptor()) print "qcounts descriptor",dtos(self.gpu.qcounts.get_descriptor()) self.gpu.tex2lq.set_array(self.gpu.qsmiles) self.gpu.tex2cq.set_array(self.gpu.qcounts) else: # Manually handle texture setup # padded_array will handle making matrix contiguous tempqlmat = self._padded_array(qsmilesmat.T) if tempqlmat.shape[1] > 65536 or tempqlmat.shape[0] > 32768: raise ValueError("Error: query matrix is not allowed to have more than 65536 rows (molecules) or 32768 columns (LINGOs) (both padded to multiple of 16). Dimensions = (%d,%d)"%tempqlmat.shape) if self.gpu.qsmiles is None or self.gpu.qsmiles.nbytes < tempqlmat.nbytes: self.gpu.qsmiles = cuda.mem_alloc(tempqlmat.nbytes) self.gpu.qsmiles.nbytes = tempqlmat.nbytes cuda.memcpy_htod_async(self.gpu.qsmiles,tempqlmat,stream=self.gpu.stream) tempqcmat = self._padded_array(qcountsmat.T) if self.gpu.qcounts is None or self.gpu.qcounts.nbytes < tempqcmat.nbytes: self.gpu.qcounts = cuda.mem_alloc(tempqcmat.nbytes) self.gpu.qcounts.nbytes = tempqcmat.nbytes cuda.memcpy_htod_async(self.gpu.qcounts,tempqcmat,stream=self.gpu.stream) descriptor = cuda.ArrayDescriptor() descriptor.width = tempqcmat.shape[1] descriptor.height = tempqcmat.shape[0] descriptor.format = cuda.array_format.UNSIGNED_INT32 descriptor.num_channels = 1 self.gpu.tex2lq.set_address_2d(self.gpu.qsmiles,descriptor,tempqlmat.strides[0]) self.gpu.tex2cq.set_address_2d(self.gpu.qcounts,descriptor,tempqcmat.strides[0]) #print "Set up query textures with stride=",tempqmat.strides[0] self.gpu.stream.synchronize() del tempqlmat del tempqcmat #}}} self.qshape = qsmilesmat.shape self.nquery = qsmilesmat.shape[0] #print "Query shape=",self.qshape,", nquery=",self.nquery # Transfer query lengths array to GPU self.gpu.ql_gpu = cuda.to_device(querylengths) # Allocate buffers for query set magnitudes self.gpu.qmag_gpu = cuda.mem_alloc(querylengths.nbytes) if querymags is not None: cuda.memcpy_htod(self.gpu.qmag_gpu,querymags) else: # Calculate query set magnitudes on GPU magthreads = 256 self.gpu.qMagKernel(self.gpu.qmag_gpu,self.gpu.ql_gpu,numpy.int32(self.nquery),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cq]) #self.qmag_gpu = cuda.to_device(qcountsmat.sum(1).astype(numpy.int32)) return