Beispiel #1
0
    def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
        and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
        no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
        """

        # Set up lingo and count matrices on device #{{{
        if self.usePycudaArray:
            # Set up using PyCUDA CUDAArray support
            self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C')
            self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C')
            self.gpu.tex2lr.set_array(self.gpu.rsmiles)
            self.gpu.tex2cr.set_array(self.gpu.rcounts)
        else:
            # Manually handle setup
            temprlmat = self._padded_array(refsmilesmat)
            if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
                raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape)
            self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream)

            temprcmat = self._padded_array(refcountsmat)
            self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream)

            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = temprcmat.shape[1]
            descriptor.height = temprcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0])
            self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0])
            self.gpu.stream.synchronize()
            del temprlmat
            del temprcmat
        #}}}

        self.rlengths = reflengths
        self.rshape = refsmilesmat.shape
        self.nref = refsmilesmat.shape[0]

        # Copy reference lengths to GPU
        self.gpu.rl_gpu = cuda.to_device(reflengths)

        # Allocate buffers for query set magnitudes
        self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
        if refmags is not None:
            cuda.memcpy_htod(self.gpu.rmag_gpu,refmags)
        else:
            # Calculate query set magnitudes on GPU
            magthreads = 256
            self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr])
        return
Beispiel #2
0
def initData(fn=None):
    global pixels, array, pbo_buffer, cuda_pbo_resource, imWidth, imHeight, texid

    # Cuda array initialization
    array = cuda_driver.matrix_to_array(pixels, "C") # C-style instead of Fortran-style: row-major

    pixels.fill(0) # Resetting the array to 0

    pbo_buffer = glGenBuffers(1) # generate 1 buffer reference
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer
    glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth*imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer
    bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size
    assert(bsize == imWidth*imHeight)
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind

    if ver2011:
        cuda_pbo_resource = pycuda.gl.RegisteredBuffer(int(pbo_buffer), cuda_gl.graphics_map_flags.WRITE_DISCARD)
    else:
        cuda_pbo_resource = cuda_gl.BufferObject(int(pbo_buffer)) # Mapping GLBuffer to cuda_resource


    glGenTextures(1, texid); # generate 1 texture reference
    glBindTexture(GL_TEXTURE_2D, texid); # binding to this texture
    glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight,  0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); # Allocate the texture
    glBindTexture(GL_TEXTURE_2D, 0) # Unbind

    glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment
    glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment
    def prepare_matrix(self, matrix):
        plan = self.plan
        given = plan.given

        assert matrix.shape == (
                plan.image_dofs_per_el, plan.preimage_dofs_per_el)

        return cuda.matrix_to_array(matrix.astype(given.float_type), "F",
                allow_double_hack=True)
Beispiel #4
0
  def __init__(self, lens_file=None, lens_psf_size=None, lens_grid_size=None):

    if lens_file:
      self.lens = True
      grid = scipy.misc.imread(lens_file, flatten=True)
      if np.max(grid) > 255:
        grid /= 2**(16-1)
      else:
        grid /= 255
      self.grid_gpu = cu.matrix_to_array(grid, 'C')
      self.lens_psf_size = lens_psf_size
      self.lens_grid_size = lens_grid_size
    else:
      self.lens = False
Beispiel #5
0
  def __init__(self, lens_file=None, lens_psf_size=None, lens_grid_size=None):

    if lens_file:
      self.lens = True
      grid = scipy.misc.imread(lens_file, flatten=True)
      if np.max(grid) > 255:
        grid /= 2**(16-1)
      else:
        grid /= 255
      self.grid_gpu = cu.matrix_to_array(grid, 'C')
      self.lens_psf_size = lens_psf_size
      self.lens_grid_size = lens_grid_size
    else:
      self.lens = False
 def set_mask(self, mask):
     self.debug(3, "Setting the mask")
     assert mask.shape == (self.h, self.w), \
       "Got a {} mask in a {} routine.".format(mask.shape, (self.h, self.w))
     if not mask.dtype == np.float32:
         self.debug(2, "Converting the mask to float32")
         mask = mask.astype(np.float32)
     if isinstance(mask, np.ndarray):
         self.maskArray = cuda.matrix_to_array(mask, 'C')
     elif isinstance(mask, gpuarray.GPUArray):
         self.maskArray = cuda.gpuarray_to_array(mask, 'C')
     else:
         self.debug(0, "Error! Mask data type not understood")
         raise ValueError
     self.texMask.set_array(self.maskArray)
 def setImage(self,img_d):
   """
   Set the image to compare with the original
   """
   assert img_d.shape == (self.w,self.h),"Got a {} image in a {} correlation routine!".format(img_d.shape,(self.w,self.h))
   if isinstance(img_d,np.ndarray):
     self.debug(3,"Creating texture from numpy array")
     self.array_d = cuda.matrix_to_array(img_d,"C")
   elif isinstance(img_d,gpuarray.GPUArray):
     self.debug(3,"Creating texture from gpuarray")
     self.array_d = cuda.gpuarray_to_array(img_d,"C")
   else:
     print("[Error] Unknown type of data given to CorrelStage.setImage()")
     raise ValueError
   self.tex_d.set_array(self.array_d)
   self.devX.set(np.zeros(self.Nfields,dtype=np.float32))
    def set_image(self, img_d):
        """
    Set the image to compare with the original

    Note that calling this method is not necessary: you can do .compute(image)
    This will automatically call this method first
    """
        assert img_d.shape == (self.h, self.w), \
          "Got a {} image in a {} correlation routine!".format(
            img_d.shape, (self.h, self.w))
        if isinstance(img_d, np.ndarray):
            self.debug(3, "Creating texture from numpy array")
            self.array_d = cuda.matrix_to_array(img_d, "C")
        elif isinstance(img_d, gpuarray.GPUArray):
            self.debug(3, "Creating texture from gpuarray")
            self.array_d = cuda.gpuarray_to_array(img_d, "C")
        else:
            self.debug(0, "Error ! Unknown type of data given to .set_image()")
            raise ValueError
        self.tex_d.set_array(self.array_d)
        self.devX.set(np.zeros(self.fields_count, dtype=np.float32))
Beispiel #9
0
def initData(fn=None):
    global pixels, array, pbo_buffer, cuda_pbo_resource, imWidth, imHeight, texid

    # Cuda array initialization
    array = cuda_driver.matrix_to_array(
        pixels, "C")  # C-style instead of Fortran-style: row-major

    pixels.fill(0)  # Resetting the array to 0

    pbo_buffer = glGenBuffers(1)  # generate 1 buffer reference
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer)  # binding to this buffer
    glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth * imHeight, pixels,
                 GL_STREAM_DRAW)  # Allocate the buffer
    bsize = glGetBufferParameteriv(
        GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE)  # Check allocated buffer size
    assert (bsize == imWidth * imHeight)
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)  # Unbind

    if ver2011:
        cuda_pbo_resource = pycuda.gl.RegisteredBuffer(
            int(pbo_buffer), cuda_gl.graphics_map_flags.WRITE_DISCARD)
    else:
        cuda_pbo_resource = cuda_gl.BufferObject(
            int(pbo_buffer))  # Mapping GLBuffer to cuda_resource

    glGenTextures(1, texid)
    # generate 1 texture reference
    glBindTexture(GL_TEXTURE_2D, texid)
    # binding to this texture
    glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0,
                 GL_LUMINANCE, GL_UNSIGNED_BYTE, None)
    # Allocate the texture
    glBindTexture(GL_TEXTURE_2D, 0)  # Unbind

    glPixelStorei(GL_UNPACK_ALIGNMENT, 1)  # 1-byte row alignment
    glPixelStorei(GL_PACK_ALIGNMENT, 1)  # 1-byte row alignment
Beispiel #10
0
  def set_params(self, psf_size, grid_size, im_size, params=None):

    # generate grid
    psf_size = np.array(psf_size)
    grid_size = np.array(grid_size)
    im_size = np.array(im_size)
    self.psf_size = psf_size + (1 - np.mod(psf_size, 2))
    self.grid_size = grid_size
    self.im_size = im_size

    if params != None:
      self.params = params
      self.shape = (params.size / 3,)
    else:
      self._psf2params()
    
    if not self.lens:
      grid = np.zeros(self.psf_size, dtype=np.float32)
      grid[(self.psf_size[0]-1)/2, (self.psf_size[1]-1)/2] = 1.
      grid = np.tile(grid, self.grid_size)
      self.lens_psf_size = self.psf_size
      #lens_grid_size = (1,1)
      self.lens_grid_size = self.grid_size
      self.grid_gpu = cu.matrix_to_array(grid, 'C')

    params_count = np.uint32(self.params.size / 3)
    params_gpu = cu.matrix_to_array(self.params.astype(np.float32), 'C')

    #self.output_size = np.array(self.grid_size)*np.array(self.psf_size)
    output_size = np.array((np.prod(self.grid_size),
                            self.psf_size[0], self.psf_size[1]))

    preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype)
    mod = SourceModule(preproc + basis_code, keep=True)

    in_tex = mod.get_texref('in_tex')
    in_tex.set_array(self.grid_gpu)
    in_tex.set_filter_mode(cu.filter_mode.LINEAR)
    #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

    params_tex = mod.get_texref('params_tex')
    params_tex.set_array(params_gpu)
    offset = ((np.array(self.im_size) - np.array(grid.shape)) /
                   np.array(self.grid_size).astype(np.float32))
    offset = np.float32(offset)
    grid_scale = ((np.array(self.lens_grid_size) - 1) /
                       (np.array(self.grid_size) - 1).astype(np.float32))
    grid_scale = np.float32(grid_scale)

    block_size = (16,16,1)
    gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])),
                     int(np.ceil(float(params_count)/block_size[1])))

    basis_gpu = cua.empty((int(params_count),
                           int(output_size[0]), int(output_size[1]),
                           int(output_size[2])), np.float32)
    #self.basis_host = cu.pagelocked_empty((int(params_count),
    #    int(output_size[0]), int(output_size[1]), int(output_size[2])),
    #    np.float32, mem_flags=cu.host_alloc_flags.DEVICEMAP)

    basis_fun_gpu = mod.get_function("basis")

    basis_fun_gpu(basis_gpu.gpudata,
                  np.uint32(np.prod(output_size)),
                  np.uint32(self.grid_size[1]),
                  np.uint32(self.psf_size[0]), np.uint32(self.psf_size[1]),
                  np.uint32(self.im_size[0]), np.uint32(self.im_size[1]),
                  offset[0], offset[1],
                  grid_scale[0], grid_scale[1],
                  np.uint32(self.lens_psf_size[0]),
                  np.uint32(self.lens_psf_size[1]),
                  params_count,
                  block=block_size, grid=gpu_grid_size)

    self.basis_host = basis_gpu.get()
    self._intern_shape = self.basis_host.shape
    self.basis_host = self.basis_host.reshape((self._intern_shape[0],
        self._intern_shape[1]*self._intern_shape[2]*self._intern_shape[3]))
    self.basis_host = scipy.sparse.csr_matrix(self.basis_host)
Beispiel #11
0
def weighted_basis_gpu(psf_size, grid_size, im_size, params,
                       lens_file=None, lens_psf_size=None, lens_grid_size=None):

  # generate grid
  psf_size = psf_size + (1 - np.mod(psf_size, 2))
  
  if lens_file:
    grid = scipy.misc.imread(lens_file, flatten=True)
    if np.max(grid) > 255:
      grid /= 2**(16-1)
    else:
      grid /= 255
  else:
    grid = np.zeros(psf_size, dtype=np.float32)
    grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1.
    grid = np.tile(grid, grid_size)
    lens_psf_size = psf_size
    #lens_grid_size = (1,1)
    lens_grid_size = grid_size

  grid_gpu = cu.matrix_to_array(grid, 'C')

  params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C')

  block_size = (16,16,1)
  output_size = np.array(grid_size)*np.array(psf_size)
  gpu_grid_size = (int(np.ceil(float(output_size[1])/block_size[0])),
                   int(np.ceil(float(output_size[0])/block_size[1])))

  weighted_basis_gpu = cua.empty((int(output_size[0]),
                                  int(output_size[1])), np.float32)

  preproc = '' #_generate_preproc(basis_gpu.dtype)
  mod = SourceModule(preproc + basis_code, keep=True)
  basis_fun_gpu = mod.get_function("weighted_basis")

  in_tex = mod.get_texref('in_tex')
  in_tex.set_array(grid_gpu)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

  params_tex = mod.get_texref('params_tex')
  params_tex.set_array(params_gpu)
  offset = ((np.array(im_size) - np.array(grid.shape)) /
            np.array(grid_size).astype(np.float32))
  offset = np.float32(offset)
  grid_scale = ((np.array(lens_grid_size) - 1) /
                (np.array(grid_size) - 1).astype(np.float32))
  grid_scale = np.float32(grid_scale)
  #psf_scale = ((np.array(lens_psf_size) - 1) /
  #             (np.array(psf_size) - 1).astype(np.float32))
  #psf_scale = np.float32(psf_scale)

  basis_fun_gpu(weighted_basis_gpu.gpudata,
                np.uint32(output_size[0]), np.uint32(output_size[1]),
                np.uint32(psf_size[0]), np.uint32(psf_size[1]),
                np.uint32(im_size[0]), np.uint32(im_size[1]),
                offset[0], offset[1],
                grid_scale[0], grid_scale[1],
                np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]),
                np.uint32(params.size/3),
                block=block_size, grid=gpu_grid_size)

  return weighted_basis_gpu
Beispiel #12
0
def init_stitch(N):
	"""outputs the high resolution k-box, and the smoothed r box

	Input
	-----------
	N:  int32
		size of box to load onto the GPU, should be related to DIM by powers of 2

	"""
	if N is None:
		N = np.int32(HII_DIM) #prepare for stitching
	META_GRID_SIZE = DIM/N
	M = np.int32(HII_DIM/META_GRID_SIZE)
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM;
	scale = np.float32(BOX_LEN/DIM)
	print 'scale', scale
	HII_scale = np.float32(BOX_LEN/HII_DIM)
	shape = (DIM,DIM,N)
	stitch_grid_size = (DIM/(block_size[0]),
						DIM/(block_size[0]),
						N/(block_size[0]))
	HII_stitch_grid_size = (HII_DIM/(block_size[0]),
						HII_DIM/(block_size[0]),
						M/(block_size[0]))
	#ratio of large box to small size
	kernel_source = open(cmd_folder+"/initialize_stitch.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'DIM': DIM, 
		'VOLUME': VOLUME,
		'META_BLOCKDIM': N
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_stitch = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)
	plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64)
	plan1d = Plan((np.int64(DIM)), dtype=np.complex64)
	print "init pspec"
	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)
	#hbox_large = pyfftw.empty_aligned((DIM, DIM, DIM), dtype='complex64')
	hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64)
	#hbox_small = np.zeros(HII_shape, dtype=np.float32)
	#hbox_large = n
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)

	# Set up pinned memory for transfer
	#largebox_hs = cuda.aligned_empty(shape=shape, dtype=np.float32, alignment=resource.getpagesize())
	largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32)
	largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	print "init boxes"
	for meta_z in xrange(META_GRID_SIZE):
		# MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=meta_x*N**3)
		init_stitch(largebox_d, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		init_stitch(largebox_d_imag, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag
		cuda.memcpy_dtoh_async(largecbox_pin, largebox_d)
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin.copy()
	#if want to get velocity need to use this
	if True:
		print "saving kbox"
		np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)

	print "Executing FFT on device"
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print hbox_large.dtype
	print "Finished FFT on device"
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)
	
	if True:
		print "loading kbox"
		hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	for meta_z in xrange(META_GRID_SIZE):
		largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()
		#cuda.memcpy_htod_async(largebox_d, largebox_pin)
		largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size);
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async()
	#import IPython; IPython.embed()
	print "Executing FFT on host"
	#hbox_large = hifft(hbox_large).astype(np.complex64).real
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print "Finished FFT on host"
	#import IPython; IPython.embed()

	# for meta_x in xrange(META_GRID_SIZE):
	# 	for meta_y in xrange(META_GRID_SIZE):
	# 		for meta_z in xrange(META_GRID_SIZE):
	# 			largebox_d = gpuarray.to_gpu(hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N])
	# 			HII_filter(largebox_d, N, np.int32(meta_x), np.int32(meta_y), np.int32(meta_z), ZERO, smoothR, block=block_size, grid=grid_size);
	# 			hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N] = largebox_d.get()
	#plan = Plan(shape, dtype=np.complex64)
	#plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)


	# This saves a large resolution deltax

	
	print "downsampling"
	smallbox_d = gpuarray.zeros((HII_DIM,HII_DIM,M), dtype=np.float32)
	for meta_z in xrange(META_GRID_SIZE):
		largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()
		cuda.memcpy_dtoh_async(largecbox_pin, largebox_d)
		#largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		largebox_d /= scale**3 #
		subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) #subsample in real space
		hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async()
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), hbox_small)
	#import IPython; IPython.embed()


	# To get velocities: reload the k-space box
	hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.float32)
	hvbox_small = np.zeros(HII_shape, dtype=np.float32)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,N), dtype=np.complex64)
	smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		for meta_z in xrange(META_GRID_SIZE):
			largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
			velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num), block=block_size, grid=stitch_grid_size)
			HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=stitch_grid_size)
			print hvbox_large.shape, largevbox_d.shape
			hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async()
		hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real
		for meta_z in xrange(META_GRID_SIZE):
			largevbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			subsample_kernel(largevbox_d.real, smallvbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size)
			hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async()
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallvbox_d.get())

	return
Beispiel #13
0
def generate_basis_gpu(psf_size, grid_size, im_size, params,
                       lens_file=None, lens_psf_size=None, lens_grid_size=None):

  # generate grid
  psf_size = psf_size + (1 - np.mod(psf_size, 2))
  
  if lens_file:
    grid = scipy.misc.imread(lens_file, flatten=True)
    if np.max(grid) > 255:
      grid /= 2**(16-1)
    else:
      grid /= 255
  else:
    grid = np.zeros(psf_size, dtype=np.float32)
    grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1.
    grid = np.tile(grid, grid_size)
    lens_psf_size = psf_size
    #lens_grid_size = (1,1)
    lens_grid_size = grid_size

  grid_gpu = cu.matrix_to_array(grid, 'C')

  # generate parameters of basis functions
  #p = max(1, np.floor(psf_size[0] / 2))
  #p = min(8, p)
  #dp = min(45. / np.floor(psf_size * grid_size / 2))
  #dp = min(0.8, dp)
  #dp = np.radians(dp)
  #p = np.radians(p)
  #l = max(1, np.floor(psf_size[0] / 2))
  #l = np.ceil(l / 2)
  #params = np.mgrid[-l:l+1, -l:l+1, -p:p+dp/10:dp].astype(np.float32).T
  #params = params.reshape(params.size / 3, 3)
  params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C')

  block_size = (16,16,1)
  output_size = np.array((np.prod(np.array(grid_size)),psf_size[0],psf_size[1]))
  gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])),
                   int(np.ceil(float(params.size/3)/block_size[1])))

  basis_gpu = cua.empty((params.size/3,
                         int(output_size[0]), int(output_size[1]),
                         int(output_size[2])), np.float32)

  preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype)
  mod = SourceModule(preproc + basis_code, keep=True)
  basis_fun_gpu = mod.get_function("basis")

  in_tex = mod.get_texref('in_tex')
  in_tex.set_array(grid_gpu)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

  params_tex = mod.get_texref('params_tex')
  params_tex.set_array(params_gpu)
  offset = ((np.array(im_size) - np.array(grid.shape)) /
            np.array(grid_size).astype(np.float32))
  offset = np.float32(offset)
  grid_scale = ((np.array(lens_grid_size) - 1) /
                (np.array(grid_size) - 1).astype(np.float32))
  grid_scale = np.float32(grid_scale)
  #psf_scale = ((np.array(lens_psf_size) - 1) /
  #             (np.array(psf_size) - 1).astype(np.float32))
  #psf_scale = np.float32(psf_scale)

  basis_fun_gpu(basis_gpu.gpudata,
                np.uint32(np.prod(output_size)), np.uint32(grid_size[1]),
                np.uint32(psf_size[0]), np.uint32(psf_size[1]),
                np.uint32(im_size[0]), np.uint32(im_size[1]),
                offset[0], offset[1],
                grid_scale[0], grid_scale[1],
                np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]),
                np.uint32(params.size/3),
                block=block_size, grid=gpu_grid_size)

  return basis_gpu
Beispiel #14
0
  def set_params(self, psf_size, grid_size, im_size, params=None):

    # generate grid
    psf_size = np.array(psf_size)
    grid_size = np.array(grid_size)
    im_size = np.array(im_size)
    self.psf_size = psf_size + (1 - np.mod(psf_size, 2))
    self.grid_size = grid_size
    self.im_size = im_size

    if params != None:
      self.params = params
      self.shape = (params.size / 3,)
    else:
      self._psf2params()
    
    if not self.lens:
      grid = np.zeros(self.psf_size, dtype=np.float32)
      grid[(self.psf_size[0]-1)/2, (self.psf_size[1]-1)/2] = 1.
      grid = np.tile(grid, self.grid_size)
      self.lens_psf_size = self.psf_size
      #lens_grid_size = (1,1)
      self.lens_grid_size = self.grid_size
      self.grid_gpu = cu.matrix_to_array(grid, 'C')

    params_count = np.uint32(self.params.size / 3)
    params_gpu = cu.matrix_to_array(self.params.astype(np.float32), 'C')

    #self.output_size = np.array(self.grid_size)*np.array(self.psf_size)
    output_size = np.array((np.prod(self.grid_size),
                            self.psf_size[0], self.psf_size[1]))

    preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype)
    mod = SourceModule(preproc + basis_code, keep=True)

    in_tex = mod.get_texref('in_tex')
    in_tex.set_array(self.grid_gpu)
    in_tex.set_filter_mode(cu.filter_mode.LINEAR)
    #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

    params_tex = mod.get_texref('params_tex')
    params_tex.set_array(params_gpu)
    offset = ((np.array(self.im_size) - np.array(grid.shape)) /
                   np.array(self.grid_size).astype(np.float32))
    offset = np.float32(offset)
    grid_scale = ((np.array(self.lens_grid_size) - 1) /
                       (np.array(self.grid_size) - 1).astype(np.float32))
    grid_scale = np.float32(grid_scale)

    block_size = (16,16,1)
    gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])),
                     int(np.ceil(float(params_count)/block_size[1])))

    basis_gpu = cua.empty((int(params_count),
                           int(output_size[0]), int(output_size[1]),
                           int(output_size[2])), np.float32)
    #self.basis_host = cu.pagelocked_empty((int(params_count),
    #    int(output_size[0]), int(output_size[1]), int(output_size[2])),
    #    np.float32, mem_flags=cu.host_alloc_flags.DEVICEMAP)

    basis_fun_gpu = mod.get_function("basis")

    basis_fun_gpu(basis_gpu.gpudata,
                  np.uint32(np.prod(output_size)),
                  np.uint32(self.grid_size[1]),
                  np.uint32(self.psf_size[0]), np.uint32(self.psf_size[1]),
                  np.uint32(self.im_size[0]), np.uint32(self.im_size[1]),
                  offset[0], offset[1],
                  grid_scale[0], grid_scale[1],
                  np.uint32(self.lens_psf_size[0]),
                  np.uint32(self.lens_psf_size[1]),
                  params_count,
                  block=block_size, grid=gpu_grid_size)

    self.basis_host = basis_gpu.get()
    self._intern_shape = self.basis_host.shape
    self.basis_host = self.basis_host.reshape((self._intern_shape[0],
        self._intern_shape[1]*self._intern_shape[2]*self._intern_shape[3]))
    self.basis_host = scipy.sparse.csr_matrix(self.basis_host)
Beispiel #15
0
def weighted_basis_gpu(psf_size, grid_size, im_size, params,
                       lens_file=None, lens_psf_size=None, lens_grid_size=None):

  # generate grid
  psf_size = psf_size + (1 - np.mod(psf_size, 2))
  
  if lens_file:
    grid = scipy.misc.imread(lens_file, flatten=True)
    if np.max(grid) > 255:
      grid /= 2**(16-1)
    else:
      grid /= 255
  else:
    grid = np.zeros(psf_size, dtype=np.float32)
    grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1.
    grid = np.tile(grid, grid_size)
    lens_psf_size = psf_size
    #lens_grid_size = (1,1)
    lens_grid_size = grid_size

  grid_gpu = cu.matrix_to_array(grid, 'C')

  params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C')

  block_size = (16,16,1)
  output_size = np.array(grid_size)*np.array(psf_size)
  gpu_grid_size = (int(np.ceil(float(output_size[1])/block_size[0])),
                   int(np.ceil(float(output_size[0])/block_size[1])))

  weighted_basis_gpu = cua.empty((int(output_size[0]),
                                  int(output_size[1])), np.float32)

  preproc = '' #_generate_preproc(basis_gpu.dtype)
  mod = SourceModule(preproc + basis_code, keep=True)
  basis_fun_gpu = mod.get_function("weighted_basis")

  in_tex = mod.get_texref('in_tex')
  in_tex.set_array(grid_gpu)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

  params_tex = mod.get_texref('params_tex')
  params_tex.set_array(params_gpu)
  offset = ((np.array(im_size) - np.array(grid.shape)) /
            np.array(grid_size).astype(np.float32))
  offset = np.float32(offset)
  grid_scale = ((np.array(lens_grid_size) - 1) /
                (np.array(grid_size) - 1).astype(np.float32))
  grid_scale = np.float32(grid_scale)
  #psf_scale = ((np.array(lens_psf_size) - 1) /
  #             (np.array(psf_size) - 1).astype(np.float32))
  #psf_scale = np.float32(psf_scale)

  basis_fun_gpu(weighted_basis_gpu.gpudata,
                np.uint32(output_size[0]), np.uint32(output_size[1]),
                np.uint32(psf_size[0]), np.uint32(psf_size[1]),
                np.uint32(im_size[0]), np.uint32(im_size[1]),
                offset[0], offset[1],
                grid_scale[0], grid_scale[1],
                np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]),
                np.uint32(params.size/3),
                block=block_size, grid=gpu_grid_size)

  return weighted_basis_gpu
Beispiel #16
0
def watershed(I):

  # Get contiguous image + shape.
  height, width = I.shape
  I = np.float32(I.copy())

  # Get block/grid size for steps 1-3.
  block_size =  (6,6,1)
  grid_size =   (width/(block_size[0]-2),
                height/(block_size[0]-2))

  # Get block/grid size for step 4.
  block_size2 = (16,16,1)
  grid_size2  = (width/(block_size2[0]-2),
                height/(block_size2[0]-2))

  # Initialize variables.
  labeled       = np.zeros([height,width]) 
  labeled       = np.float32(labeled)
  width         = np.int32(width)
  height        = np.int32(height)
  count         = np.int32([0])

  # Transfer labels asynchronously.
  labeled_d = gpu.to_gpu_async(labeled)
  counter_d = gpu.to_gpu_async(count)

  # Bind CUDA textures.
  I_cu = cu.matrix_to_array(I, order='C')
  cu.bind_array_to_texref(I_cu, image_texture)

  # Step 1.
  descent_kernel(labeled_d, width, 
  height, block=block_size, grid=grid_size)
  
  start_time = cu.Event()
  end_time = cu.Event()
  start_time.record()

  # Step 2.
  increment_kernel(labeled_d,width,height,
  block=block_size2,grid=grid_size2)
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2

  while old != new:
    old = new
    minima_kernel(labeled_d, counters_d,
    width, height, block=block_size, grid=grid_size)
    new = counters_d.get()[0]

  # Step 3.
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2
  while old != new:
    old = new
    plateau_kernel(labeled_d, counters_d, width,
    height, block=block_size, grid=grid_size)
    new = counters_d.get()[0]
  
  # Step 4
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2
  while old != new:
    old = new
    flood_kernel(labeled_d, counters_d, width,
    height, block=block_size2, grid=grid_size2)
    new = counters_d.get()[0]

  result = labeled_d.get()
  
  # End GPU timers.
  end_time.record()
  end_time.synchronize()
  gpu_time = start_time.\
  time_till(end_time) * 1e-3

  # print str(gpu_time)

  return result
Beispiel #17
0
def watershed(I):

    # Get contiguous image + shape.
    height, width = I.shape
    I = np.float32(I.copy())

    # Get block/grid size for steps 1-3.
    block_size = (6, 6, 1)
    grid_size = (width / (block_size[0] - 2), height / (block_size[0] - 2))

    # Get block/grid size for step 4.
    block_size2 = (16, 16, 1)
    grid_size2 = (width / (block_size2[0] - 2), height / (block_size2[0] - 2))

    # Initialize variables.
    labeled = np.zeros([height, width])
    labeled = np.float32(labeled)
    width = np.int32(width)
    height = np.int32(height)
    count = np.int32([0])

    # Transfer labels asynchronously.
    labeled_d = gpu.to_gpu_async(labeled)
    counter_d = gpu.to_gpu_async(count)

    # Bind CUDA textures.
    I_cu = cu.matrix_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    # Step 1.
    descent_kernel(labeled_d, width, height, block=block_size, grid=grid_size)

    start_time = cu.Event()
    end_time = cu.Event()
    start_time.record()

    # Step 2.
    increment_kernel(labeled_d,
                     width,
                     height,
                     block=block_size2,
                     grid=grid_size2)
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2

    while old != new:
        old = new
        minima_kernel(labeled_d,
                      counters_d,
                      width,
                      height,
                      block=block_size,
                      grid=grid_size)
        new = counters_d.get()[0]

    # Step 3.
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    while old != new:
        old = new
        plateau_kernel(labeled_d,
                       counters_d,
                       width,
                       height,
                       block=block_size,
                       grid=grid_size)
        new = counters_d.get()[0]

    # Step 4
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    while old != new:
        old = new
        flood_kernel(labeled_d,
                     counters_d,
                     width,
                     height,
                     block=block_size2,
                     grid=grid_size2)
        new = counters_d.get()[0]

    result = labeled_d.get()

    # End GPU timers.
    end_time.record()
    end_time.synchronize()
    gpu_time = start_time.\
    time_till(end_time) * 1e-3

    # print str(gpu_time)

    return result
Beispiel #18
0
def generate_basis_gpu(psf_size, grid_size, im_size, params,
                       lens_file=None, lens_psf_size=None, lens_grid_size=None):

  # generate grid
  psf_size = psf_size + (1 - np.mod(psf_size, 2))
  
  if lens_file:
    grid = scipy.misc.imread(lens_file, flatten=True)
    if np.max(grid) > 255:
      grid /= 2**(16-1)
    else:
      grid /= 255
  else:
    grid = np.zeros(psf_size, dtype=np.float32)
    grid[(psf_size[0]-1)/2, (psf_size[1]-1)/2] = 1.
    grid = np.tile(grid, grid_size)
    lens_psf_size = psf_size
    #lens_grid_size = (1,1)
    lens_grid_size = grid_size

  grid_gpu = cu.matrix_to_array(grid, 'C')

  # generate parameters of basis functions
  #p = max(1, np.floor(psf_size[0] / 2))
  #p = min(8, p)
  #dp = min(45. / np.floor(psf_size * grid_size / 2))
  #dp = min(0.8, dp)
  #dp = np.radians(dp)
  #p = np.radians(p)
  #l = max(1, np.floor(psf_size[0] / 2))
  #l = np.ceil(l / 2)
  #params = np.mgrid[-l:l+1, -l:l+1, -p:p+dp/10:dp].astype(np.float32).T
  #params = params.reshape(params.size / 3, 3)
  params_gpu = cu.matrix_to_array(params.astype(np.float32), 'C')

  block_size = (16,16,1)
  output_size = np.array((np.prod(np.array(grid_size)),psf_size[0],psf_size[1]))
  gpu_grid_size = (int(np.ceil(float(np.prod(output_size))/block_size[0])),
                   int(np.ceil(float(params.size/3)/block_size[1])))

  basis_gpu = cua.empty((params.size/3,
                         int(output_size[0]), int(output_size[1]),
                         int(output_size[2])), np.float32)

  preproc = '#define BLOCK_SIZE 0\n' #_generate_preproc(basis_gpu.dtype)
  mod = SourceModule(preproc + basis_code, keep=True)
  basis_fun_gpu = mod.get_function("basis")

  in_tex = mod.get_texref('in_tex')
  in_tex.set_array(grid_gpu)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  #in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

  params_tex = mod.get_texref('params_tex')
  params_tex.set_array(params_gpu)
  offset = ((np.array(im_size) - np.array(grid.shape)) /
            np.array(grid_size).astype(np.float32))
  offset = np.float32(offset)
  grid_scale = ((np.array(lens_grid_size) - 1) /
                (np.array(grid_size) - 1).astype(np.float32))
  grid_scale = np.float32(grid_scale)
  #psf_scale = ((np.array(lens_psf_size) - 1) /
  #             (np.array(psf_size) - 1).astype(np.float32))
  #psf_scale = np.float32(psf_scale)

  basis_fun_gpu(basis_gpu.gpudata,
                np.uint32(np.prod(output_size)), np.uint32(grid_size[1]),
                np.uint32(psf_size[0]), np.uint32(psf_size[1]),
                np.uint32(im_size[0]), np.uint32(im_size[1]),
                offset[0], offset[1],
                grid_scale[0], grid_scale[1],
                np.uint32(lens_psf_size[0]), np.uint32(lens_psf_size[1]),
                np.uint32(params.size/3),
                block=block_size, grid=gpu_grid_size)

  return basis_gpu
Beispiel #19
0
__global__ void interp(float* out, float ox, float oy, int w, int h)
{
  int idx = threadIdx.x+blockIdx.x*blockDim.x;
  int idy = threadIdx.y+blockIdx.y*blockDim.y;
  
  out[idy*w+idx] = tex2D(tex,(idx-ox)/w,(idy-oy)/h);
}
"""

mod = SourceModule(src_mod)
interp=mod.get_function("interp")

h,w = img.shape
print("w={}, h={}".format(w,h))

devArray = cuda.matrix_to_array(img,"C")


tex=mod.get_texref('tex')

tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tex.set_filter_mode(cuda.filter_mode.LINEAR)
tex.set_address_mode(0,cuda.address_mode.CLAMP)
tex.set_address_mode(1,cuda.address_mode.CLAMP)


tex.set_array(devArray)


arg_types = "Pffii"
devOut = gpuarray.GPUArray(img.shape,np.float32)
Beispiel #20
0
def init():
	"""outputs the high resolution k-box, and the smoothed r box"""
	N = np.int32(DIM) #prepare for stitching
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM;
	scale = np.float32(BOX_LEN)/DIM
	HII_scale = np.float32(BOX_LEN)/HII_DIM
	shape = (N,N,N)
	
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)

	kernel_source = open(cmd_folder+"/initialize.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'VOLUME': VOLUME,
		'DIM': DIM
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_kernel = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	adj_complex_conj = main_module.get_function("adj_complex_conj")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")

	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	init_kernel(largebox_d, np.int32(DIM), block=block_size, grid=grid_size)

	#import IPython; IPython.embed()
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	init_kernel(largebox_d_imag, np.int32(DIM), block=block_size, grid=grid_size)

	largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
	largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
	largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag

	#adj_complex_conj(largebox_d, DIM, block=block_size, grid=grid_size)
	largebox = largebox_d.get()
	#np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox)

	#save real space box before smoothing
	plan = Plan(shape, dtype=np.complex64)
	plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	largebox_d /= scale**3
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox_d.real.get_async())

	#save real space box after smoothing and subsampling
	# host largebox is still in k space, no need to reload from disk
	largebox_d = gpuarray.to_gpu(largebox)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	HII_filter(largebox_d, N, ZERO, smoothR, block=block_size, grid=grid_size);
	plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	largebox_d /= scale**3
	smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
	subsample_kernel(largebox_d.real, smallbox_d, N, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_grid_size) #subsample in real space
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), smallbox_d.get_async())

	# reload the k-space box for velocity boxes
	largebox_d = gpuarray.to_gpu(largebox)
	
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,DIM), dtype=np.complex64)
	smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(num), block=block_size, grid=grid_size)
		HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=grid_size)
		plan.execute(largevbox_d, inverse=True)
		largevbox_d /= scale**3
		#import IPython; IPython.embed()
		subsample_kernel(largevbox_d.real, smallbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_grid_size)
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallbox_d.get())

	return
Beispiel #21
0
    def set_qsmiles(self,qsmilesmat,qcountsmat,querylengths,querymags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *qsmilesmat*, count matrix *qcountsmat*,
        and length vector *querylengths*. If *querymags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the query matrices (*qsmilesmat* and *qcountsmat*) must have
        no more than 65,536 rows (molecules) and 32,768 columns (Lingos). Larger computations must be performed in tiles.
        """
        # Set up lingo and count matrices on device #{{{

        if self.usePycudaArray:
            # Create CUDAarrays for lingo and count matrices
            print "Strides qsmilesmat:",numpy.ascontiguousarray(qsmilesmat.T).strides
            self.gpu.qsmiles = cuda.matrix_to_array(numpy.ascontiguousarray(qsmilesmat.T),order='C')
            self.gpu.qcounts= cuda.matrix_to_array(numpy.ascontiguousarray(qcountsmat.T),order='C')
            print "qsmiles descriptor",dtos(self.gpu.qsmiles.get_descriptor())
            print "qcounts descriptor",dtos(self.gpu.qcounts.get_descriptor())
            self.gpu.tex2lq.set_array(self.gpu.qsmiles)
            self.gpu.tex2cq.set_array(self.gpu.qcounts)
        else:
            # Manually handle texture setup
            # padded_array will handle making matrix contiguous
            tempqlmat = self._padded_array(qsmilesmat.T)
            if tempqlmat.shape[1] > 65536 or tempqlmat.shape[0] > 32768:
                raise ValueError("Error: query matrix is not allowed to have more than 65536 rows (molecules) or 32768 columns (LINGOs) (both padded to multiple of 16). Dimensions = (%d,%d)"%tempqlmat.shape)
            if self.gpu.qsmiles is None or self.gpu.qsmiles.nbytes < tempqlmat.nbytes:
                self.gpu.qsmiles = cuda.mem_alloc(tempqlmat.nbytes)
                self.gpu.qsmiles.nbytes = tempqlmat.nbytes
            cuda.memcpy_htod_async(self.gpu.qsmiles,tempqlmat,stream=self.gpu.stream)

            tempqcmat = self._padded_array(qcountsmat.T)
            if self.gpu.qcounts is None or self.gpu.qcounts.nbytes < tempqcmat.nbytes:
                self.gpu.qcounts = cuda.mem_alloc(tempqcmat.nbytes)
                self.gpu.qcounts.nbytes = tempqcmat.nbytes
            cuda.memcpy_htod_async(self.gpu.qcounts,tempqcmat,stream=self.gpu.stream)
            
            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = tempqcmat.shape[1]
            descriptor.height = tempqcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lq.set_address_2d(self.gpu.qsmiles,descriptor,tempqlmat.strides[0])
            self.gpu.tex2cq.set_address_2d(self.gpu.qcounts,descriptor,tempqcmat.strides[0])
            #print "Set up query textures with stride=",tempqmat.strides[0]
            self.gpu.stream.synchronize()
            del tempqlmat
            del tempqcmat
        #}}}

        self.qshape = qsmilesmat.shape
        self.nquery = qsmilesmat.shape[0]
        #print "Query shape=",self.qshape,", nquery=",self.nquery

        # Transfer query lengths array to GPU
        self.gpu.ql_gpu = cuda.to_device(querylengths)

        # Allocate buffers for query set magnitudes
        self.gpu.qmag_gpu = cuda.mem_alloc(querylengths.nbytes)
        if querymags is not None:
            cuda.memcpy_htod(self.gpu.qmag_gpu,querymags)
        else:
            # Calculate query set magnitudes on GPU
            magthreads = 256
            self.gpu.qMagKernel(self.gpu.qmag_gpu,self.gpu.ql_gpu,numpy.int32(self.nquery),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cq])
            #self.qmag_gpu = cuda.to_device(qcountsmat.sum(1).astype(numpy.int32))


        return