Example #1
def stepFuntion():
  getModulo( psi_d, psiMod_d )
  maxVal = (gpuarray.max(psiMod_d)).get()
  multiplyByScalarReal( cudaPre(0.95/(maxVal)), psiMod_d )
  sendModuloToUCHAR( psiMod_d, plotData_d)
  copyToScreenArray()

  if volumeRender.nTextures == 2:
    if not realDynamics:
      cuda.memset_d8(activity_d.ptr, 0, nBlocks3D )
      findActivityKernel( cudaPre(0.001), psi_d, activity_d, grid=grid3D, block=block3D )
    if plotVar == 1: getActivityKernel( psiOther_d, activity_d, grid=grid3D, block=block3D )
    if plotVar == 0:
      if realTEXTURE:
        tex_psiReal.set_array( psiK2Real_array )
        tex_psiImag.set_array( psiK2Imag_array )
        getVelocity_texKernel( dx, dy, dz, psi_d, activity_d, psiOther_d, grid=grid3D, block=block3D )
      else: getVelocityKernel( np.int32(neighbors), dx, dy, dz, psi_d, activity_d, psiOther_d, grid=grid3D, block=block3D )
      maxVal = (gpuarray.max(psiOther_d)).get()
      if maxVal > 0: multiplyByScalarReal( cudaPre(1./maxVal), psiOther_d )
    sendModuloToUCHAR( psiOther_d, plotData_d_1)
    copyToScreenArray_1()
  if applyTransition: timeTransition()
  if realDynamics: realStep()
  else: imaginaryStep()
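The 0.95/maxVal rescale above is a normalize-to-unit-range step. A minimal sketch of the same pattern with plain pycuda calls (multiplyByScalarReal is a project-specific kernel, so the sketch uses in-place GPUArray arithmetic instead):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
import pycuda.gpuarray as gpuarray

psiMod_d = gpuarray.to_gpu(np.random.rand(32).astype(np.float32))
maxVal = gpuarray.max(psiMod_d).get()  # only the scalar comes back to the host
psiMod_d *= np.float32(0.95 / maxVal)  # in-place rescale on the GPU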
Example #2
def minmax_pycuda(Dx_gpu, Dy_gpu):
    """
    Given two GPUArrays, finds and returns their mins and maxes.
    This is all done in the GPU; only the mins/maxes are sent back to the host.
    """
    return (gpuarray.min(Dx_gpu).get(), gpuarray.max(Dx_gpu).get(),
            gpuarray.min(Dy_gpu).get(), gpuarray.max(Dy_gpu).get())
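A minimal usage sketch for the helper above, assuming pycuda.autoinit provides the context:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

Dx_gpu = gpuarray.to_gpu(np.random.rand(1024).astype(np.float32))
Dy_gpu = gpuarray.to_gpu(np.random.rand(1024).astype(np.float32))
# Four GPU reductions; .get() copies back only the four scalars.
xmin, xmax, ymin, ymax = minmax_pycuda(Dx_gpu, Dy_gpu)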
Example #3
def gpu_getmax(map):
  """
  Use pycuda to get the maximum absolute deviation of the residual map,
  with the correct sign
  """
  imax=gpu.max(cumath.fabs(map)).get()
  if gpu.max(map).get()!=imax: imax*=-1
  return np.float32(imax)
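A hedged usage sketch, assuming `gpu` is pycuda.gpuarray and `cumath` is pycuda.cumath as the snippet implies:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpu
import pycuda.cumath as cumath

residual = gpu.to_gpu(np.array([0.5, -2.0, 1.5], dtype=np.float32))
print(gpu_getmax(residual))  # -2.0: largest absolute deviation, sign preserved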
Example #5
def maximum_cuda(a, b=None):
    """Maximum values of two GPUArrays.

    Parameters
    ----------
    a : gpuarray
        First GPUArray.
    b : gpuarray
        Second GPUArray.

    Returns
    -------
    gpuarray
        Maximum values from both GPUArrays, or a single value if only one GPUArray is given.

    Examples
    --------
    >>> a = maximum_cuda(give_cuda([1, 2, 3]), give_cuda([3, 2, 1]))
    [3, 2, 3]

    >>> type(a)
    <class 'pycuda.gpuarray.GPUArray'>
    """
    if b is not None:
        return cuda_array.maximum(a, b)
    return cuda_array.max(a)
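A short usage sketch; it assumes `cuda_array` aliases pycuda.gpuarray and that `give_cuda` in the docstring simply copies a list to the GPU:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as cuda_array

a = cuda_array.to_gpu(np.array([1, 2, 3], dtype=np.float32))
b = cuda_array.to_gpu(np.array([3, 2, 1], dtype=np.float32))
print(maximum_cuda(a, b).get())  # elementwise maximum: [3. 2. 3.]
print(maximum_cuda(a).get())     # full reduction: 3.0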
Example #6
def stepFunction():
    global animIter
    if showActivity:
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10),
                           concentrationIn_d,
                           activeBlocks_d,
                           grid=grid2D,
                           block=block2D)
        getActivityKernel(activeBlocks_d,
                          activeThreads_d,
                          grid=grid2D,
                          block=block2D)
    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100. / maxVal, plotData_d, np.uint8(showActivity),
                  activeThreads_d)
    if cudaP == "float":
        [oneIteration_tex() for i in range(nIterationsPerPlot)]
    else:
        [oneIteration_sh() for i in range(nIterationsPerPlot // 2)]
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
Example #7
def _minmax_impl(a_gpu, axis, min_or_max, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.'''
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    global _global_cublas_allocator
    alloc = _global_cublas_allocator

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        target = gpuarray.empty(m, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(m, dtype=np.uint32, allocator=alloc)
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        target = gpuarray.empty(n, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(n, dtype=np.uint32, allocator=alloc)
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
    return target, idx
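This helper mirrors scikit-cuda's misc module (note _get_minmax_kernel and _global_cublas_allocator). A hedged sketch of the public calls built on it, assuming skcuda is installed:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
import skcuda.misc as misc

misc.init()
a_gpu = gpuarray.to_gpu(np.random.rand(4, 5).astype(np.float32))
col_max = misc.max(a_gpu, axis=0)     # per-column maxima, kept on the GPU
col_arg = misc.argmax(a_gpu, axis=0)  # matching row indices as uint32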
Example #8
    def pixel_similarity_cuda(self, image):
        N = image.shape[0]
        nd = self.pairwise_difference(image, N)
        diff = gpuarray.max(nd) - gpuarray.min(nd)
        norm = self.gdivide(nd, diff)
        C = 1 - norm
        return C
Example #9
def stepFuntion():
  maxVal = ( gpuarray.max( cnsv1_d ) ).get()
  convertToUCHAR( cudaPre( 0.95/maxVal ), cnsv1_d, plotData_d)
  copyToScreenArray()

  timeStepHydro()
  if usingGravity: getGravForce()
Example #10
def _minmax_impl(a_gpu, axis, min_or_max, out, idxout, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.
    Hacked together from scikits.cuda code, since that doesn't have an "out"
    argument'''
    assert len(a_gpu.shape) < 3
    if axis is None:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = scm._get_minmax_kernel(a_gpu.dtype, min_or_max)
    target = out
    idx = idxout
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
Example #11
    def integrate(self, t, dt, nacptsteps, d_ucoeff):
        sm, explicit = self._sm, self._explicit
        moment, updateMoment = sm.moment, sm.updateMomentBDF
        updateDist, consMaxwellian = sm.updateDistBDF, sm.constructMaxwellian

        L0, M = self.scratch
        U0, LU0, U = self.scratch_moms
        a1, a2, g1, b = [*self.A, *self.G, self.B]  

        pex = lambda *v: print(*v) + exit(-1)
        psum = lambda v: pex(gpuarray.sum(v))
        pMom = lambda v: pex(v.get().reshape(-1,5))
        pmin = lambda v: pex(gpuarray.min(v))
        pmax = lambda v: pex(gpuarray.max(v))

        # Compute the moment of the initial distribution
        moment(t, d_ucoeff, U0)

        # Compute the explicit part; L0 = -∇·f(d_ucoeff);
        explicit(t, d_ucoeff, L0)

        # Compute the moment of the explicit part
        moment(t, L0, LU0) 

        # update the moments
        updateMoment(dt, a1, U0, -g1, LU0, a2, U, b)
        #pex(U.get().reshape(-1,5))

        # implicitly construct the Maxwellian (or Gaussian, etc.) given moments
        consMaxwellian(t, U, M)
        #pex(gpuarray.sum(L0))

        if nacptsteps==-1: 
            #pex(LU0.get().reshape(-1,5))
            #pex(gpuarray.sum(d_ucoeff))
            pass


        # update the distribution
        updateDist(dt, a1, d_ucoeff, -g1, L0, b, M, a2, U, d_ucoeff)
        #pex(gpuarray.sum(d_ucoeff))


        if(nacptsteps==-1):
            #print("\n>> BDF-111\n")

            #pMom(U0)
            #psum(U0)
            #psum(L0)
            #pmax(L0)
            #psum(LU0)
            #pMom(LU0)
            #psum(U)
            #psum(M)
            #psum(d_ucoeff)
            #pmin(d_ucoeff)

            #exit(-1)
            pass
Example #12
def _minmax_impl(a_gpu, axis, min_or_max, stream=None, keepdims=False):
    ''' Returns both max and argmax (min/argmin) along an axis.'''
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None or len(
            a_gpu.shape
    ) <= 1:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        out_shape = (1, ) * len(a_gpu.shape)
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).reshape(out_shape), None
        else:
            return gpuarray.min(a_gpu).reshape(out_shape), None
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    global _global_cublas_allocator
    alloc = _global_cublas_allocator

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1],
                                                         a_gpu.shape[0])
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
    if (axis == 0
            and a_gpu.flags.c_contiguous) or (axis == 1
                                              and a_gpu.flags.f_contiguous):
        if keepdims:
            out_shape = (1, m) if axis == 0 else (m, 1)
        else:
            out_shape = (m, )
        target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
        col_kernel(a_gpu,
                   target,
                   idx,
                   np.uint32(m),
                   np.uint32(n),
                   block=(32, 1, 1),
                   grid=(m, 1, 1),
                   stream=stream)
    else:
        if keepdims:
            out_shape = (1, n) if axis == 0 else (n, 1)
        else:
            out_shape = (n, )
        target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
        row_kernel(a_gpu,
                   target,
                   idx,
                   np.uint32(m),
                   np.uint32(n),
                   block=(32, 1, 1),
                   grid=(n, 1, 1),
                   stream=stream)
    return target, idx
Example #13
    def check_termination(self):
        """
        Check various termination criteria
        """

        # First check if we are doing termination based on running time
        if (self.options.time_limit):
            self.time = time.clock() - self.time_start
            if (self.time >= self.options.maxtime):
                self.term_reason = 'Exceeded time limit'
                return

        # Now check if we are doing break by tolx
        if (self.options.use_tolx):
            if (np.sqrt(cua.dot(self.dx, self.dx).get()) /
                    np.sqrt(cua.dot(self.oldx, self.oldx).get()) <
                    self.options.tolx):
                self.term_reason = 'Relative change in x small enough'
                return

        # Are we doing break by tolo (tol obj val)
        if (self.options.use_tolo and self.iter > 2):
            delta = abs(self.obj - self.oldobj)
            if (delta < self.options.tolo):
                self.term_reason = 'Relative change in objvalue small enough'
                return

        # Check if change in x and gradient are small enough
        # we don't want that for now


#        if (np.sqrt((cua.dot(self.dx,self.dx).get())) < self.options.tolx) \
#               or (np.sqrt(cua.dot(self.dg,self.dg).get()) < self.options.tolg):
#            self.term_reason = '|x_t+1 - x_t|=0 or |grad_t+1 - grad_t| < 1e-9'
#            return

        # Finally, the plain old check if max iter has been achieved
        if (self.iter >= self.options.maxiter):
            self.term_reason = 'Maximum number of iterations reached'
            return

        # KKT violation
        if (self.options.use_kkt):
            if np.abs(np.sqrt(cua.dot(self.x,
                                      self.grad).get())) <= self.options.tolk:
                self.term_reason = '|x^T * grad| < opt.pbb_gradient_norm'
                return

        # Gradient check
        if (self.options.use_tolg):
            nr = cua.max(cua.fabs(self.grad)).get()
            if (nr < self.options.tolg):
                self.term_reason = '|| grad ||_inf < opt.tolg'
                return

        # No condition met, so return false
        self.term_reason = 0
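The gradient check above is an infinity norm computed entirely on the GPU. A hedged sketch of that pattern, assuming `cua` is pycuda.gpuarray and `cua.fabs` corresponds to pycuda.cumath.fabs:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as cua
import pycuda.cumath as cumath

grad = cua.to_gpu(np.array([0.1, -0.7, 0.3], dtype=np.float32))
inf_norm = cua.max(cumath.fabs(grad)).get()  # || grad ||_inf == 0.7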
Example #15
    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                  max_blocks=1024, max_steps=10, use_weights=False,
                  scatter_first=0):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0
        input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in xrange(self.ncopies):
            input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)
        output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            if nphotons < nthreads_per_block * 16 * 8 or use_weights:
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, np.int32(nsteps), np.int32(use_weights), np.int32(scatter_first), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))

            step += nsteps
            scatter_first = 0 # Only allow non-zero in first pass

            if step < max_steps:
                temp = input_queue_gpu
                input_queue_gpu = output_queue_gpu
                output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                nphotons = input_queue_gpu[:1].get()[0] - 1

        if ga.max(self.flags).get() & (1 << 31):
            print >>sys.stderr, "WARNING: ABORTED PHOTONS"
        cuda.Context.get_current().synchronize()
Example #16
def stepFunction():
  global animIter
  cuda.memcpy_dtod( plotDataFloat_d.ptr, concentrationOut_d.ptr, concentrationOut_d.nbytes )
  maxVal = (gpuarray.max(plotDataFloat_d)).get()
  multiplyByScalarReal( cudaPre(0.5/(maxVal)), plotDataFloat_d )
  floatToUchar( plotDataFloat_d, plotDataChars_d)
  copyToScreenArray()
  if cudaP == "float": [ oneIteration_tex() for i in range(nIterationsPerPlot) ]
  #else: [ oneIteration_sh() for i in range(nIterationsPerPlot//2) ]
  if plotting and animIter%25 == 0: 
    maxVals.append( maxVal )
    sumConc.append( gpuarray.sum(concentrationIn_d).get() )
    plotData( maxVals, sumConc )
  animIter += 1
Example #18
def timeTransition():
  global realDynamics, alpha, applyTransition
  realDynamics = not realDynamics
  applyTransition = False
  if realDynamics:
    cuda.memcpy_dtod(psiK2_d.ptr, psi_d.ptr, psi_d.nbytes)
    cuda.memcpy_dtod(psiRunge_d.ptr, psi_d.ptr, psi_d.nbytes)
    if realTEXTURE:
      copy3DpsiK1Real()
      copy3DpsiK1Imag()
      copy3DpsiK2Real()
      copy3DpsiK2Imag()
    print "Real Dynamics"
  else:    
    #GetAlphas
    getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D)
    alpha = cudaPre( ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() )  #OPTIMIZATION
    print "Imaginary Dynamics"
Example #19
def implicit_iteration():
  global alpha
  #Make FFT
  fftPlan.execute( psi_d, psiFFT_d )
  #get Derivatives
  getPartialsXY( Lx, Ly, psiFFT_d, partialX_d, fftKx_d, partialY_d, fftKy_d, block=block3D, grid=grid3D) 
  fftPlan.execute( partialX_d, inverse=True )
  fftPlan.execute( partialY_d, inverse=True )   
  implicitStep1( xMin, yMin, zMin, dx, dy, dz, alpha, omega, gammaX, gammaY, gammaZ,
                 partialX_d, partialY_d, psi_d, G_d, x0, y0, grid=grid3D, block=block3D)
  fftPlan.execute( G_d )
  implicitStep2( dtImag, fftKx_d , fftKy_d, fftKz_d, alpha, psiFFT_d, G_d, block=block3D, grid=grid3D) 
  fftPlan.execute( psiFFT_d, psi_d, inverse=True)  
  #setBoundryConditionsKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), psi_d, block=block3D, grid=grid3D)  
  normalize(dx, dy, dz, psi_d)
  #GetAlphas
  getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D)
  alpha = cudaPre( ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() )  #OPTIMIZATION
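The alpha update above takes the midpoint of the range of alphas_d with two GPU reductions. A minimal sketch of that pattern on synthetic data:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

alphas_d = gpuarray.to_gpu(np.random.rand(64).astype(np.float32))
# max/min return 0-d GPUArrays, so the midpoint is formed on the device;
# .get() transfers only the final scalar to the host.
alpha = (0.5 * (gpuarray.max(alphas_d) + gpuarray.min(alphas_d))).get()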
Example #22
def pinv(a_gpu, dev, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.
    rcond : float
        Singular values smaller than `rcond * max(singular_values)`
        are set to zero.
        
    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.
        
    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = pinv(a_gpu, pycuda.autoinit.device)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = pinv(b_gpu, pycuda.autoinit.device)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True

    """

    # Check input dtype because the SVD can only be computed in single
    # precision:
    if a_gpu.dtype not in [np.float32, np.complex64]:
        raise ValueError('unsupported type')

    # Compute SVD:
    u_gpu, s_gpu, vh_gpu = svd(a_gpu, 0)
    uh_gpu = transpose(u_gpu, dev)

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, s_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Suppress very small singular values:
    cutoff_invert_s_mod = SourceModule(
        cutoff_invert_s_mod_template.substitute(
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid))
    cutoff_invert_s = cutoff_invert_s_mod.get_function('cutoff_invert_s')
    cutoff_gpu = gpuarray.max(s_gpu)*rcond
    cutoff_invert_s(s_gpu.gpudata, cutoff_gpu.gpudata,
                    np.uint32(s_gpu.size),
                    block=block_dim, grid=grid_dim)
    
    # The singular values must have the same data type as uh_gpu:
    if s_gpu.dtype == uh_gpu.dtype:
        s_diag_gpu = diag(s_gpu, dev)
    else:
        s_diag_gpu = diag(s_gpu.astype(uh_gpu.dtype), dev)

    # Finish pinv computation:
    v_gpu = transpose(vh_gpu, dev)
    suh_gpu = dot(s_diag_gpu, uh_gpu)
    return dot(v_gpu, suh_gpu)
Example #23
def L1Norm(X):
    return gpuarray.max(X * (gpuarray.zeros((X.shape[1],), dtype=int) + 1)).get()
Example #24
for i in range(nWidth/2):
  fftKx_h[i] = i*2*np.pi/Lx
for i in range(nWidth/2, nWidth):
  fftKx_h[i] = (i-nWidth)*2*np.pi/Lx  
for i in range(nHeight/2):
  fftKy_h[i] = i*2*np.pi/Ly
for i in range(nHeight/2, nHeight):
  fftKy_h[i] = (i-nHeight)*2*np.pi/Ly
for i in range(nDepth/2):
  fftKz_h[i] = i*2*np.pi/Lz
for i in range(nDepth/2, nDepth):
  fftKz_h[i] = (i-nDepth)*2*np.pi/Lz
psi_d = gpuarray.to_gpu(psi_h)
alphas_d = gpuarray.to_gpu(  np.zeros_like(psi_h.real) )
normalize( dx, dy, dz, psi_d )
getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D)
alpha=( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get()  
psiMod_d = gpuarray.to_gpu(  np.zeros_like(psi_h.real) )
psiFFT_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
partialX_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
partialY_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
G_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
fftKx_d = gpuarray.to_gpu( fftKx_h )         #OPTIMIZATION
fftKy_d = gpuarray.to_gpu( fftKy_h )
fftKz_d = gpuarray.to_gpu( fftKz_h )
activity_d = gpuarray.to_gpu( np.ones( nBlocks3D, dtype=np.uint8 ) )
psiOther_d = gpuarray.to_gpu(  np.zeros_like(psi_h.real) )
psiK1_d = gpuarray.to_gpu( psi_h )
psiK2_d = gpuarray.to_gpu( psi_h )
psiRunge_d = gpuarray.to_gpu( psi_h )
#For FFT version
laplacian_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
Example #25
                      d_seed_threshold,
                      block=blocksize,
                      grid=gridsize)
    # reset the queue for queue calculation
    d_queue.fill(0)
    cu.memcpy_dtod(d_scan.gpudata, d_nextFront.gpudata, d_nextFront.nbytes)
    scan_kernel(d_scan)  # scan the front for queue index determination
    # call queue kernel in order to generate queue
    queue_kernel(d_nextFront,
                 d_scan,
                 d_queue,
                 width,
                 height,
                 block=blocksize,
                 grid=gridsize)  # generate queue
    qLen = np.int32(gpu.max(
        d_scan).get())  # max value in scan must be the largest index of queue
    end_gpu_time.record()
    end_gpu_time.synchronize()
    gpu_comp_time += start_gpu_time.time_till(end_gpu_time) * 1e-3

    contComp = True
    if (qLen == 0):
        contComp = False

    steps = 0
    # While the next front has pixels we need to process
    while contComp:
        # Region Growing
        # Run the CUDA kernel with the appropriate inputs
        start_gpu_time.record()
        d_nextFront.fill(0)  # set nextFront to zero
Example #26
def _max_CUDA(a, stream=None):
    return cu_array.max(a=a, stream=stream)
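A usage sketch for the wrapper above, assuming `cu_array` is pycuda.gpuarray; the optional stream argument enqueues the reduction asynchronously:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as cu_array

stream = drv.Stream()
a = cu_array.to_gpu(np.random.rand(4096).astype(np.float32))
m = _max_CUDA(a, stream=stream)  # 0-d GPUArray; reduction runs on `stream`
stream.synchronize()
print(m.get())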
Example #27
    r_space_gpu = buffer_r_space - beta * r_space_gpu

    sample = gpuif((sample.real < 0).astype(np.bool), r_space_gpu, sample)
    r_space_gpu = gpuif(MaskBoolean, sample, r_space_gpu)

    #### OSS ####

    if (HIOfirst == 0 or iter > np.ceil(iterations/filtercount)) and iter < np.ceil(iterations-iterations/filtercount):
        newsigma = sigma[(iter-1)]
        if lastUsedSigma != newsigma:
            print(str(iter) + ' changing filter to ' + str(newsigma))
            kfilter = np.exp(-(((np.sqrt((yy)**2+(xx)**2)**2))/(2*(newsigma)**2))).astype(np.complex64)
            kfilter_gpu.set(kfilter)
            temp_gpu = kfilter_gpu + 0.0
            cu_fft.fft(kfilter_gpu, temp_gpu, plan_forward)
            temp_gpu = temp_gpu/gpuarray.max(temp_gpu.real).get().astype(np.float32).astype(np.complex64)

            #CuFFTShift(kfilter_gpu, temp_gpu)  # Unnecessary
            lastUsedSigma = newsigma

        cu_fft.fft(r_space_gpu, ktemp_gpu, plan_forward)
        ktemp_gpu = ktemp_gpu*kfilter_gpu
        cu_fft.ifft(ktemp_gpu, r_space_gpu, plan_inverse, True)

        if np.mod(iterations, iter//filtercount) == 0:
            r_space_gpu = R2D[filtnum-1] + 0.0
        else:
            r_space_gpu = gpuif(MaskBoolean, sample, r_space_gpu)

    ##### ### ####
Example #28
def pinv(a_gpu, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    rcond : float
        Singular values smaller than `rcond * max(singular_values)`
        are set to zero.

    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    If the input matrix is square, the pseudoinverse uses less memory.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = linalg.pinv(a_gpu)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = linalg.pinv(b_gpu)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True

    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # Perform in-place SVD if the matrix is square to save memory:
    if a_gpu.shape[0] == a_gpu.shape[1]:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o')
    else:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's')

    # Suppress very small singular values:
    cutoff_gpu = gpuarray.max(s_gpu)*rcond
    ctype = tools.dtype_to_ctype(s_gpu.dtype)
    cutoff_func = el.ElementwiseKernel("{ctype} *s, {ctype} *cutoff".format(ctype=ctype),
        "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}")
    cutoff_func(s_gpu, cutoff_gpu)

    # Compute the pseudoinverse without allocating a new diagonal matrix:
    return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
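The rcond cutoff above never leaves the device: gpuarray.max returns a 0-d GPUArray, and scaling it by rcond keeps the threshold on the GPU. A standalone sketch of the same trick with the pycuda APIs used above:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda import tools
from pycuda.elementwise import ElementwiseKernel

s_gpu = gpuarray.to_gpu(np.array([3.0, 1e-9, 0.5], dtype=np.float32))
cutoff_gpu = gpuarray.max(s_gpu) * 1e-7  # 0-d GPUArray; stays on the GPU
ctype = tools.dtype_to_ctype(s_gpu.dtype)
invert = ElementwiseKernel(
    "%s *s, %s *cutoff" % (ctype, ctype),
    "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}")
invert(s_gpu, cutoff_gpu)
print(s_gpu.get())  # values below the cutoff are zeroed, the rest inverted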
Example #29
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  track=False):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0
        input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in range(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)
        output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        if track:
            step_photon_ids = []
            step_photons = []
            #save the first step for all photons in the input queue
            step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
            step_photons.append(
                self.copy_queue(input_queue_gpu[1:], nphotons).get())

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low and not tracking
            if not track and (nphotons < nthreads_per_block * 16 * 8
                              or use_weights):
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(np.int32(first_photon),
                                         np.int32(photons_this_round),
                                         input_queue_gpu[1:],
                                         output_queue_gpu,
                                         rng_states,
                                         self.pos,
                                         self.dir,
                                         self.wavelengths,
                                         self.pol,
                                         self.t,
                                         self.flags,
                                         self.last_hit_triangles,
                                         self.weights,
                                         self.evidx,
                                         np.int32(nsteps),
                                         np.int32(use_weights),
                                         np.int32(scatter_first),
                                         gpu_geometry.gpudata,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))

            if track:  #save the next step for all photons in the input queue
                step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
                step_photons.append(
                    self.copy_queue(input_queue_gpu[1:], nphotons).get())

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass

            if step < max_steps:
                temp = input_queue_gpu
                input_queue_gpu = output_queue_gpu
                output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                nphotons = input_queue_gpu[:1].get()[0] - 1
                if nphotons == 0:
                    break

        if ga.max(self.flags).get() & (1 << 31):
            print("WARNING: ABORTED PHOTONS", file=sys.stderr)
        cuda.Context.get_current().synchronize()

        if track:
            return step_photon_ids, step_photons
Example #30
gpuSum = reduct.ReductionKernel(numpy.int32, neutral="0", reduce_expr="a+b",map_expr="1 << x[i]",  arguments="long* x")
a_gpu = gpuarray.to_gpu(numpy.asarray(range(n), dtype = numpy.int64))

t0 = time.time()
krnl = reduct.ReductionKernel(numpy.int64, neutral="0",
                              reduce_expr="a+b", map_expr="x[i]",
                              arguments="long *x")



t1 = time.time()
#res = krnl(a_gpu).get()
t5 = time.time()

print gpuarray.max(a_gpu).get()

t6 = time.time()


print '%0.6f' % ((t6 - t5)*1000)



print '%0.6f' % ((t1 - t0)*1000)

t2 = time.time()

total = sum(range(n))
#maxcpu = max(range(n))
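For comparison with the gpuarray.max call timed above, a hedged sketch of a working max reduction with ReductionKernel; neutral="0" is only valid here because the input is non-negative:

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel

max_krnl = ReductionKernel(numpy.int64, neutral="0",
                           reduce_expr="a > b ? a : b", map_expr="x[i]",
                           arguments="long *x")
a_gpu = gpuarray.to_gpu(numpy.arange(1024, dtype=numpy.int64))
print(max_krnl(a_gpu).get())  # 1023, same as gpuarray.max(a_gpu).get()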
Example #31
    def propagate_hit(self, gpu_geometry, rng_states, parameters):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.


        got one abort::

             In [1]: a = ph("hhMOCK")

             In [9]: f = a[:,3,2].view(np.uint32)

             In [12]: np.where( f & 1<<31 )
             Out[12]: (array([279]),)

        failed to just mock that one::

              RANGE=279:280 MockNuWa MOCK 


        """
        nphotons = self.pos.size
        nwork = nphotons

        nthreads_per_block = parameters['threads_per_block']
        max_blocks = parameters['max_blocks']
        max_steps = parameters['max_steps']
        use_weights = False
        scatter_first = 0

        self.upload_queues(nwork)

        solid_id_map_gpu = gpu_geometry.solid_id_map
        solid_id_to_channel_id_gpu = gpu_geometry.solid_id_to_channel_id_gpu

        small_remainder = nthreads_per_block * 16 * 8
        block = (nthreads_per_block, 1, 1)

        results = {}
        results['name'] = "propagate_hit"
        results['nphotons'] = nphotons
        results['nwork'] = nwork
        results['nsmall'] = small_remainder
        results['COLUMNS'] = "name:s,nphotons:i,nwork:i,nsmall:i"

        step = 0
        times = []

        npass = 0
        nabort = 0

        while step < max_steps:
            npass += 1
            if nwork < small_remainder or use_weights:
                nsteps = max_steps - step  # Just finish the rest of the steps if the # of photons is low
                log.debug(
                    "increase nsteps for stragglers: small_remainder %s nwork %s nsteps %s max_steps %s "
                    % (small_remainder, nwork, nsteps, max_steps))
            else:
                nsteps = 1
            pass
            log.info("nphotons %s nwork %s step %s max_steps %s nsteps %s " %
                     (nphotons, nwork, step, max_steps, nsteps))

            abort = False
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nwork, nthreads_per_block, max_blocks):
                if abort:
                    nabort += 1
                else:
                    grid = (blocks, 1)
                    args = (
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        self.input_queue_gpu[1:].gpudata,
                        self.output_queue_gpu.gpudata,
                        rng_states,
                        self.pos.gpudata,
                        self.dir.gpudata,
                        self.wavelengths.gpudata,
                        self.pol.gpudata,
                        self.t.gpudata,
                        self.flags.gpudata,
                        self.last_hit_triangles.gpudata,
                        self.weights.gpudata,
                        np.int32(nsteps),
                        np.int32(use_weights),
                        np.int32(scatter_first),
                        gpu_geometry.gpudata,
                        solid_id_map_gpu.gpudata,
                        solid_id_to_channel_id_gpu.gpudata,
                    )

                    log.info(
                        "propagate_hit_kernel.prepared_timed_call grid %s block %s first_photon %s photons_this_round %s "
                        % (repr(grid), repr(block), first_photon,
                           photons_this_round))
                    get_time = self.propagate_hit_kernel.prepared_timed_call(
                        grid, block, *args)
                    t = get_time()
                    times.append(t)
                    if t > self.max_time:
                        abort = True
                        log.warn(
                            "kernel launch time %s > max_time %s : ABORTING " %
                            (t, self.max_time))
                    pass
                pass
            pass
            log.info("step %s propagate_hit_kernel times  %s " %
                     (step, repr(times)))
            pass
            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            if step < max_steps:
                nwork = self.swap_queues()
            pass
        pass

        log.info("calling max ")
        if ga.max(self.flags).get() & (1 << 31):
            log.warn("ABORTED PHOTONS")
        log.info("done calling max ")

        cuda.Context.get_current().synchronize()

        results['npass'] = npass
        results['nabort'] = nabort
        results['nlaunch'] = len(times)
        results['tottime'] = sum(times)
        results['maxtime'] = max(times)
        results['mintime'] = min(times)
        results['COLUMNS'] += ",npass:i,nabort:i,nlaunch:i,tottime:f,maxtime:f,mintime:f"
        return results
Example #32
c = gpuarray.empty((100, 100), dtype=dtype)
print('c:\n{0}\nshape={1}\n'.format(c, c.shape))

d = gpuarray.zeros((100, 100), dtype=dtype)
print('d:\n{0}\nshape={1}\n'.format(d, d.shape))

e = gpuarray.arange(0.0, 100.0, 1.0, dtype=dtype)
print('e:\n{0}\nshape={1}\n'.format(e, e.shape))

f = gpuarray.if_positive(e < 50, e - 100, e + 100)
print('f:\n{0}\nshape={1}\n'.format(f, f.shape))

g = gpuarray.if_positive(e < 50, gpuarray.ones_like(e), gpuarray.zeros_like(e))
print('g:\n{0}\nshape={1}\n'.format(g, g.shape))

h = gpuarray.maximum(e, f)
print('h:\n{0}\nshape={1}\n'.format(h, h.shape))

i = gpuarray.minimum(e, f)
print('i:\n{0}\nshape={1}\n'.format(i, i.shape))

g = gpuarray.sum(a)
print(g, type(g))

k = gpuarray.max(a)
print(k, type(k))

l = gpuarray.min(a)
print(l, type(l))
Example #33
def pinv(a_gpu, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    rcond : float
        Singular values smaller than `rcond * max(singular_values)`
        are set to zero.

    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    If the input matrix is square, the pseudoinverse uses less memory.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = linalg.pinv(a_gpu)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = linalg.pinv(b_gpu)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True

    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # Perform in-place SVD if the matrix is square to save memory:
    if a_gpu.shape[0] == a_gpu.shape[1]:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o')
    else:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's')

    # Get block/grid sizes; the number of threads per block is limited
    # to 512 because the cutoff_invert_s kernel defined above uses too
    # many registers to be invoked in 1024 threads per block (i.e., on
    # GPUs with compute capability >= 2.x):
    dev = misc.get_current_device()
    max_threads_per_block = 512
    block_dim, grid_dim = misc.select_block_grid_sizes(dev, s_gpu.shape, max_threads_per_block)

    # Suppress very small singular values:
    use_double = 1 if s_gpu.dtype == np.float64 else 0
    cutoff_invert_s_mod = SourceModule(
        cutoff_invert_s_template.substitute(use_double=use_double))
    cutoff_invert_s = cutoff_invert_s_mod.get_function('cutoff_invert_s')
    cutoff_gpu = gpuarray.max(s_gpu)*rcond
    cutoff_invert_s(s_gpu, cutoff_gpu,
                    np.uint32(s_gpu.size),
                    block=block_dim, grid=grid_dim)

    # Compute the pseudoinverse without allocating a new diagonal matrix:
    return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
Example #34
def calculateTimestep(meshPropSpeedsGPU, cellDim):
    maxPropSpeed = gpuarray.max(cumath.fabs(meshPropSpeedsGPU)).get()
    return cellDim / (4.0 * maxPropSpeed)
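A small usage sketch for the CFL-style timestep above, assuming gpuarray and cumath are imported as in the other examples:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath

speeds = gpuarray.to_gpu(np.array([-3.0, 1.0, 2.5], dtype=np.float32))
print(calculateTimestep(speeds, 0.1))  # 0.1 / (4.0 * 3.0) ~= 0.00833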
Example #35
def cuda_gridvis(csrh_sun, csrh_satellite, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.
    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
          by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    #f = pyfits.open(settings['vfile'])

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))
    ## grab data
    #f = pyfits.open(settings['vfile'])

    Data = np.ndarray(shape=(44, 44, 16), dtype=complex)
    UVW = np.ndarray(shape=(780, 1), dtype='float64')
    Data, UVW = visibility(csrh_sun, csrh_satellite, chan)
    print "UVW*****\n", UVW

    # determine the file type (uvfits or fitsidi)
    h_uu = np.ndarray(shape=(780), dtype='float64')
    h_vv = np.ndarray(shape=(780), dtype='float64')
    h_rere = np.ndarray(shape=(780), dtype='float32')
    h_imim = np.ndarray(shape=(780), dtype='float32')

    freq = 1702500000.
    light_speed = 299792458.  # Speed of light

    ## quickly figure out what data is not flagged
    #np.float32(f[7].header['CRVAL3'])
    #good  = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)

    #h_u   = np.float32(freq*f[0].data.par('uu')[good])
    #h_v   = np.float32(freq*f[0].data.par('vv')[good])

    blen = 0

    for antenna1 in range(0, 39):
        for antenna2 in range(antenna1 + 1, 40):
            h_rere[blen] = Data[antenna1][antenna2][chan].real
            h_imim[blen] = Data[antenna1][antenna2][chan].imag
            h_uu[blen] = freq * UVW[blen][0]
            h_vv[blen] = freq * UVW[blen][1]
            blen += 1

    print "h_u", h_uu
    #h_u = np.float32(h_u.ravel())
    #h_v = np.float32(h_v.ravel())
    gcount = np.int32(np.size(h_uu))
    #gcount = len(gcount.ravel())
    #h_re = np.float32(h_re.ravel())
    #h_im = np.float32(h_im.ravel())
    #freq = 3.45E11  #np.float32(f[0].header['CRVAL4'])

    blen = 0
    bl_order = np.ndarray(shape=(780, 2), dtype=int)
    good = []

    for border1 in range(0, 39):
        for border2 in range(border1 + 1, 40):
            bl_order[blen][0] = border1
            bl_order[blen][1] = border2
            blen = blen + 1

    blen = 0

    h_u = []
    h_v = []
    h_re = []
    h_im = []
    Flag_Ant = [
        0, 4, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26,
        28, 29, 37, 38, 39
    ]
    for blen in range(0, 780):
        if (bl_order[blen][0] not in Flag_Ant) and (bl_order[blen][1]
                                                    not in Flag_Ant):
            good.append(blen)
            h_u.append(h_uu[blen])
            h_v.append(h_vv[blen])
            h_re.append(h_rere[blen])
            h_im.append(h_imim[blen])

    #print "Good:",good

    gcount = np.int32(np.size(h_u))
    ## assume data is unpolarized
    #print chan
    print 'GCOUNT', gcount
    #print "H_U", h_u
    #print "H_V", h_v
    #print h_re
    #print h_im

    # h_ : host,  d_ : device
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(np.array(h_uu, dtype='float32'))
    d_v = gpu.to_gpu(np.array(h_vv, dtype='float32'))
    d_re = gpu.to_gpu(np.array(h_rere, dtype='float32'))
    d_im = gpu.to_gpu(np.array(h_imim, dtype='float32'))
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)
    ## define kernel parameters
    if imsize == 1024:
        blocksize2D = (8, 16, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (16, 16, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (256, 1, 1)
    else:
        blocksize2D = (16, 32, 1)
        gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])),
                      np.int(np.ceil(1. * nx / blocksize2D[1])))
        blocksizeF2D = (32, 32, 1)
        gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])),
                       np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
        blocksize1D = (512, 1, 1)

    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganzing and indexing UV data beforehand?
    #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \
                       block=blocksize2D, grid=gridsize2D)

    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize

    wgtGrid_kernel(d_grd,
                   d_cnt,
                   briggs,
                   nx,
                   block=blocksize2D,
                   grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    # Trim
    trimIm_kernel(d_nbm,
                  d_fim,
                  noff,
                  nx,
                  imsize,
                  block=blocksizeF2D,
                  grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd,
                  d_fim,
                  noff,
                  nx,
                  imsize,
                  block=blocksizeF2D,
                  grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
Example #36
def bicgstabMemory(cublasHandle, x_gpu, b_gpu, Xprime_gpu, X_gpu, XX_gpu,
                   Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero, freq,
                   FREQ_gpu, c, Deltaxprime, Deltayprime, Deltazprime,
                   sizePartitionr, sizePartitionc, M, max_it, tol):

    # --- flag:     0 = solution found to tolerance
    #               1 = no convergence given max_it
    #              -1 = breakdown: rho = 0
    #              -2 = breakdown: omega = 0

    N = x_gpu.size

    # --- Initializations
    iter = np.float32(0)
    flag = np.float32(0)
    alpha = np.float32(0)
    rho_1 = np.float32(0)
    v_gpu = gpuarray.zeros(N, dtype=np.float32)
    p_gpu = gpuarray.zeros(N, dtype=np.float32)
    #    d_p_hat = gpuarray.zeros(N, dtype = np.float32)
    #    d_s_hat = gpuarray.zeros(N, dtype = np.float32)
    #    d_t     = gpuarray.zeros(N, dtype = np.float32)

    #bnrm2   = np.sqrt((culinalg.dot(b_gpu, b_gpu.conj(), 'T', 'N').real).get())
    bnrm2 = cublas.cublasScnrm2(cublasHandle, N, b_gpu.gpudata, 1)
    if bnrm2 == np.float32(0.0):
        bnrm2 = np.float32(1.0)

    yprime_gpu = computeAx(x_gpu, Xprime_gpu, X_gpu, XX_gpu, Yprime_gpu, Y_gpu,
                           YY_gpu, Zprime_gpu, zzero, freq, FREQ_gpu, c,
                           Deltaxprime, Deltayprime, Deltazprime,
                           sizePartitionc, XX_gpu.size)
    xprime_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu,
                            Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c,
                            Deltaxprime * Deltayprime * Deltazprime,
                            sizePartitionr, b_gpu.size)
    r_gpu = b_gpu - xprime_gpu
    error = cublas.cublasScnrm2(cublasHandle, N, r_gpu.gpudata, 1) / bnrm2
    if (error < tol):
        return x_gpu, error, iter, flag

    omega = np.float32(1.0)
    r_tld_gpu = r_gpu.copy()

    for iter in range(max_it):

        rho = cublas.cublasCdotc(cublasHandle, N, r_tld_gpu.gpudata, 1,
                                 r_gpu.gpudata, 1)  # direction vector
        if (rho == np.float32(0.0)):
            break

        if (iter > 0):
            beta = (rho / rho_1) * (alpha / omega)
            cublas.cublasCaxpy(cublasHandle, N, -omega, v_gpu.gpudata, 1,
                               p_gpu.gpudata, 1)
            cublas.cublasCscal(cublasHandle, N, beta, p_gpu.gpudata, 1)
            cublas.cublasCaxpy(cublasHandle, N, np.float32(1.0), r_gpu.gpudata,
                               1, p_gpu.gpudata, 1)
        else:
            p_gpu = r_gpu.copy()

        p_hat_gpu = p_gpu.copy()
        yprime_gpu = computeAx(p_hat_gpu, Xprime_gpu, X_gpu, XX_gpu,
                               Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero,
                               freq, FREQ_gpu, c, Deltaxprime, Deltayprime,
                               Deltazprime, sizePartitionc, XX_gpu.size)
        v_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu,
                           Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c,
                           Deltaxprime * Deltayprime * Deltazprime,
                           sizePartitionr, b_gpu.size)

        alpha = rho / cublas.cublasCdotc(cublasHandle, N, r_tld_gpu.gpudata, 1,
                                         v_gpu.gpudata, 1)
        s_gpu = r_gpu.copy()
        cublas.cublasCaxpy(cublasHandle, N, -alpha, v_gpu.gpudata, 1,
                           s_gpu.gpudata, 1)
        norms = cublas.cublasScnrm2(cublasHandle, N, s_gpu.gpudata, 1)
        if (norms < tol):  # --- early convergence check
            cublas.cublasCaxpy(cublasHandle, N, alpha,
                               p_hat_gpu.gpudata, 1, x_gpu.gpudata, 1)
            break

        # --- stabilizer
        s_hat_gpu = s_gpu.copy()
        yprime_gpu = computeAx(s_hat_gpu, Xprime_gpu, X_gpu, XX_gpu,
                               Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero,
                               freq, FREQ_gpu, c, Deltaxprime, Deltayprime,
                               Deltazprime, sizePartitionc, XX_gpu.size)
        t_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu,
                           Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c,
                           Deltaxprime * Deltayprime * Deltazprime,
                           sizePartitionr, b_gpu.size)
        omega = cublas.cublasCdotc(cublasHandle, N, t_gpu.gpudata, 1,
                                   s_gpu.gpudata, 1) / cublas.cublasCdotc(
                                       cublasHandle, N, t_gpu.gpudata, 1,
                                       t_gpu.gpudata, 1)

        # --- update approximation
        cublas.cublasCaxpy(cublasHandle, N, alpha, p_hat_gpu.gpudata, 1,
                           x_gpu.gpudata, 1)
        cublas.cublasCaxpy(cublasHandle, N, omega, s_hat_gpu.gpudata, 1,
                           x_gpu.gpudata, 1)

        r_gpu = s_gpu.copy()
        cublas.cublasCaxpy(cublasHandle, N, -omega, t_gpu.gpudata, 1,
                           r_gpu.gpudata, 1)

        error = cublas.cublasScnrm2(cublasHandle, N, r_gpu.gpudata, 1) / bnrm2
        # --- check convergence
        if (error <= tol):
            break

        if (omega == np.float32(0.0)):
            break

        rho_1 = rho

        print("iteration")

    temp = np.sqrt(
        gpuarray.max(s_gpu.real * s_gpu.real + s_gpu.imag * s_gpu.imag).get())
    if ((error <= np.float32(tol)) or temp <= tol):  # --- converged
        if (temp <= tol):
            error = cublas.cublasScnrm2(cublasHandle, N, s_gpu.gpudata,
                                        1) / bnrm2
        flag = 0
    elif (omega == np.float32(0.0)):  # --- breakdown
        flag = -2
    elif (rho == np.float32(0.0)):
        flag = -1
    else:  # --- no convergence
        flag = 1

    p_hat_gpu.gpudata.free()
    s_hat_gpu.gpudata.free()
    v_gpu.gpudata.free()
    t_gpu.gpudata.free()

    return x_gpu, error, iter, flag
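# For reference, the same BiCGSTAB recurrence in plain NumPy; the callable A
# stands in for the computeAdy(computeAx(.)) operator pair above. This is a
# sketch for cross-checking the GPU loop on small problems, not a drop-in
# replacement:
import numpy as np

def bicgstab_cpu(A, b, x0, max_it=100, tol=1e-6):
    # Flags follow the GPU version: 0 converged, 1 no convergence,
    # -1 breakdown (rho = 0), -2 breakdown (omega = 0).
    x = x0.copy()
    r = b - A(x)
    r_tld = r.copy()
    bnrm2 = np.linalg.norm(b) or 1.0
    rho_1 = alpha = omega = 1.0
    v = np.zeros_like(b)
    p = np.zeros_like(b)
    for it in range(max_it):
        rho = np.vdot(r_tld, r)                 # direction vector
        if rho == 0.0:
            return x, -1
        if it > 0:
            beta = (rho / rho_1) * (alpha / omega)
            p = r + beta * (p - omega * v)
        else:
            p = r.copy()
        v = A(p)
        alpha = rho / np.vdot(r_tld, v)
        s = r - alpha * v
        if np.linalg.norm(s) < tol:             # early convergence
            return x + alpha * p, 0
        t = A(s)
        omega = np.vdot(t, s) / np.vdot(t, t)   # stabilizer
        x = x + alpha * p + omega * s
        r = s - omega * t
        if np.linalg.norm(r) / bnrm2 <= tol:
            return x, 0
        if omega == 0.0:
            return x, -2
        rho_1 = rho
    return x, 1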
    Theil(nps,
          indices,
          data_gpu,
          pt_ax_gpu,
          pts_gpu,
          Nevts,
          block=(BLOCKSIZE, 1, 1),
          grid=(nps / BLOCKSIZE, 1, 1))

    theils = indices.get()
    pion = pion_gpu.get()
    kaon = kaon_gpu.get()
    all_shit = []

    best_th = gpuarray.max(indices).get()
    mask = indices == best_th
    S = gpuarray.sum(mask).get()
    mpi = gpuarray.sum(pion_gpu * mask) / S
    mk = gpuarray.sum(kaon_gpu * mask) / S

    if S > 1:
        print "interesting, ", S, " points out of ", len(
            mask), "have all the maximum Theil index, will average them"
    print "Best Theil:", best_th, "Mpi: ", mpi, "MK: ", mk, "Scale: ", scale
    print "##############################################"
    result = [best_th, mpi.get(), mk.get()]

    #ploteo o resultado neste espazo:
    def toy(x, Mt, mt, M=MPDG, m=mPDG):
        a = np.sqrt(M**2 - 4. * m**2) / M
Example #38
0
def cuda_gridvis(sub_array, f, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.
    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
          by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()
    if sub_array==1:
        Antennas = 40
    else:
        Antennas = 60

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    # determine the file type (uvfits or fitsidi)
    h_u = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64')
    h_v = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64')
    h_re = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32')
    h_im = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32')

    #Get Visibility Data and values of UVW
    if settings['vfile'].find('.uvfits') != -1:
        freq = 3.45E11 #np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        good = np.where(f[0].data.data[:, 0, 0, chan, 0, 0] != 0)

        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        print "h_u", h_u.shape
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, chan, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, chan, 0, 1])

        freq = 1702500000.
        light_speed = 299792458.  # Speed of light


        ## assume data is unpolarized
        #print chan
        print 'GCOUNT', gcount

        # h_ : host,  d_ : device
        h_grd = np.zeros((nx, nx), dtype=np.complex64)
        h_cnt = np.zeros((nx, nx), dtype=np.int32)
        d_u = gpu.to_gpu(np.array(h_u,dtype='float32'))
        d_v = gpu.to_gpu(np.array(h_v,dtype='float32'))
        d_re = gpu.to_gpu(np.array(h_re,dtype='float32'))
        d_im = gpu.to_gpu(np.array(h_im,dtype='float32'))
        d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
        d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
        d_ngrd = gpu.zeros_like(d_grd)
        d_bm = gpu.zeros_like(d_grd)
        d_nbm = gpu.zeros_like(d_grd)
        d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

        ## define kernel parameters
        if imsize == 1024:
            blocksize2D = (8, 16, 1)
            gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
            blocksizeF2D = (16, 16, 1)
            gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
            blocksize1D = (256, 1, 1)
        else:
            blocksize2D = (16, 32, 1)
            gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
            blocksizeF2D = (32, 32, 1)
            gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
            blocksize1D = (512, 1, 1)

        gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

        # ------------------------
        # make gridding kernels
        # ------------------------
        ## make spheroidal convolution kernel (don't mess with these!)
        width = 6.
        ngcf = 24.
        h_cgf = gcf(ngcf, width)
        ## make grid correction
        h_corr = corrfun(nx, width)
        d_cgf = module.get_global('cgf')[0]
        d_corr = gpu.to_gpu(h_corr)
        cu.memcpy_htod(d_cgf, h_cgf)

        # ------------------------
        # grid it up
        # ------------------------
        d_umax = gpu.max(cumath.fabs(d_u))
        d_vmax = gpu.max(cumath.fabs(d_v))
        umax = np.int32(np.ceil(d_umax.get() / du))
        vmax = np.int32(np.ceil(d_vmax.get() / du))

        ## grid ($$)
        #  This should be improvable via:
        #    - shared memory solution? I tried...
        #    - better coalesced memory access? I tried...
        #    - reorganizing and indexing UV data beforehand?
        #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
        #    - storing V(u,v) in texture memory?

        # Each pixel in the uv plane goes through the data and checks whether the pixel is included in the convolution.
        # This kernel also calculates the point spread function and the local sampling
        # from the data (for applying the weights later).
        gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \
                           block=blocksize2D, grid=gridsize2D)

        ## apply weights
        wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
        hfac = np.int32(1)
        dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
        shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
        ## normalize

        wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
        ## Reflect grid about v axis
        hfac = np.int32(-1)
        dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
        ## Shift both
        shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

        # ------------------------
        # Make the beam
        # ------------------------
        ## Transform to image plane
        fft.fft(d_nbm, d_bm, plan)
        ## Shift
        shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
        ## Correct for C
        corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
        # Trim
        trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Normalize
        d_bmax = gpu.max(d_fim)
        bmax = d_bmax.get()
        bmax = np.float32(1. / bmax)
        nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Pull onto CPU
        dpsf = d_fim.get()

        # ------------------------
        # Make the map
        # ------------------------
        ## Transform to image plane
        fft.fft(d_ngrd, d_grd, plan)
        ## Shift
        shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
        ## Correct for C
        corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
        ## Trim
        trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Normalize (Jy/beam)
        nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

        ## Finish timers
        t_end = time.time()
        t_full = t_end - t_start
        print "Gridding execution time %0.5f" % t_full + ' s'
        print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

        ## Return dirty psf (CPU) and dirty image (GPU)
        return dpsf, d_fim
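# The beam normalization above reduces with gpu.max once, then scales by the
# reciprocal inside a kernel. A standalone sketch of the same pattern using
# gpuarray arithmetic instead of the custom nrmBeam_kernel (names and the
# random test image are illustrative):
import numpy as np
import pycuda.autoinit  # noqa: F401  (initializes a CUDA context)
import pycuda.gpuarray as gpu

d_beam = gpu.to_gpu(np.random.rand(64, 64).astype(np.float32))
bmax = gpu.max(d_beam).get()        # one scalar comes back to the host
d_beam *= np.float32(1.0 / bmax)    # peak of the dirty beam becomes 1.0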
Example #39
0
        d_thisFront, d_nextFront = d_nextFront, d_thisFront
        d_nextFront.fill(0)
        #d_nextFront = gpu.zeros_like(d_thisFront)
        # Run the CUDA kernel with the appropriate inputs
        start_gpu_time.record()
        regionGrow_kernel(d_image,
                          d_region,
                          d_thisFront,
                          d_nextFront,
                          width,
                          height,
                          d_threshold,
                          block=blocksize,
                          grid=gridsize)
        # nextFront should have all zeroes if there are no more fronts
        moreFronts = gpu.max(d_nextFront).get()
        end_gpu_time.record()
        end_gpu_time.synchronize()
        gpu_comp_time += start_gpu_time.time_till(end_gpu_time) * 1e-3

        # check if the max element in nextFront is a zero
        if (moreFronts == 0):
            contComp = False  # terminate loop
        # Increment counter
        i += 1

    # Copy from device to host
    start_gpu_time.record()
    h_region = d_region.get()
    h_region = h_region.reshape([height, width])
    end_gpu_time.record()
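# gpu.max serves above as a cheap "any pixel still active?" reduction over the
# next front. A self-contained sketch of just that termination test (array
# size and names are illustrative):
import numpy as np
import pycuda.autoinit  # noqa: F401  (initializes a CUDA context)
import pycuda.gpuarray as gpu

d_nextFront = gpu.to_gpu(np.zeros(512 * 512, dtype=np.int32))
# A zero maximum means the growth kernel marked no new front pixels,
# so the host loop can stop iterating.
if gpu.max(d_nextFront).get() == 0:
    contComp = False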
Example #40
0
def cuda_gridvis(settings,plan):
  """
  Grid the visibilities parallelized by pixel.
  References:
    - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
        by Thompson, Moran, & Swenson
    - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
  """
  print "Gridding the visibilities"
  t_start=time.time()

  # unpack parameters
  vfile   = settings['vfile']
  briggs  = settings['briggs']
  imsize  = settings['imsize']
  cell    = settings['cell']
  nx      = np.int32(2*imsize)
  noff    = np.int32((nx-imsize)/2)

  ## constants
  arc2rad = np.float32(np.pi/180/3600.)
  du      = np.float32(1./(arc2rad*cell*nx))
  ## grab data
  f  = pyfits.open(settings['vfile'])
  ## quickly figure out what data is not flagged
  freq  = np.float32(f[0].header['CRVAL4'])
  good  = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)
  h_u   = np.float32(freq*f[0].data.par('uu')[good])
  h_v   = np.float32(freq*f[0].data.par('vv')[good])
  gcount = np.int32(np.size(h_u))
  ## assume data is unpolarized
  h_re   = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,0]+f[0].data.data[good,0,0,0,0,1,0]))
  h_im   = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,1]+f[0].data.data[good,0,0,0,0,1,1]))
  ## make GPU arrays
  h_grd  = np.zeros((nx,nx),dtype=np.complex64)
  h_cnt  = np.zeros((nx,nx),dtype=np.int32)
  d_u    = gpu.to_gpu(h_u)
  d_v    = gpu.to_gpu(h_v)
  d_re   = gpu.to_gpu(h_re)
  d_im   = gpu.to_gpu(h_im)
  d_cnt  = gpu.zeros((np.int(nx),np.int(nx)),np.int32)
  d_grd  = gpu.zeros((np.int(nx),np.int(nx)),np.complex64)
  d_ngrd = gpu.zeros_like(d_grd)
  d_bm   = gpu.zeros_like(d_grd)
  d_nbm  = gpu.zeros_like(d_grd)
  d_fim  = gpu.zeros((np.int(imsize),np.int(imsize)),np.float32)
  ## define kernel parameters
  blocksize2D  = (8,16,1)
  gridsize2D   = (np.int(np.ceil(1.*nx/blocksize2D[0])),np.int(np.ceil(1.*nx/blocksize2D[1])))
  blocksizeF2D = (16,16,1)
  gridsizeF2D  = (np.int(np.ceil(1.*imsize/blocksizeF2D[0])),np.int(np.ceil(1.*imsize/blocksizeF2D[1])))
  blocksize1D  = (256,1,1)
  gridsize1D   = (np.int(np.ceil(1.*gcount/blocksize1D[0])),1)

  # ------------------------
  # make gridding kernels
  # ------------------------
  ## make spheroidal convolution kernel (don't mess with these!)
  width = 6.
  ngcf  = 24.
  h_cgf = gcf(ngcf,width)
  ## make grid correction
  h_corr = corrfun(nx,width)
  d_cgf  = module.get_global('cgf')[0]
  d_corr = gpu.to_gpu(h_corr)
  cu.memcpy_htod(d_cgf,h_cgf)

  # ------------------------
  # grid it up
  # ------------------------
  d_umax = gpu.max(cumath.fabs(d_u))
  d_vmax = gpu.max(cumath.fabs(d_v))
  umax   = np.int32(np.ceil(d_umax.get()/du))
  vmax   = np.int32(np.ceil(d_vmax.get()/du))

  ## grid ($$)
  #  This should be improvable via:
  #    - shared memory solution? I tried...
  #    - better coalesced memory access? I tried...
  #    - reorganizing and indexing UV data beforehand?
  #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
  #    - storing V(u,v) in texture memory?
  gridVis_wBM_kernel(d_grd,d_bm,d_cnt,d_u,d_v,d_re,d_im,nx,du,gcount,umax,vmax,\
			block=blocksize2D,grid=gridsize2D)
  ## apply weights
  wgtGrid_kernel(d_bm,d_cnt,briggs,nx,block=blocksize2D,grid=gridsize2D)
  hfac = np.int32(1)
  dblGrid_kernel(d_bm,nx,hfac,block=blocksize2D,grid=gridsize2D)
  shiftGrid_kernel(d_bm,d_nbm,nx,block=blocksize2D,grid=gridsize2D)
  ## normalize
  wgtGrid_kernel(d_grd,d_cnt,briggs,nx,block=blocksize2D,grid=gridsize2D)
  ## Reflect grid about v axis
  hfac = np.int32(-1)
  dblGrid_kernel(d_grd,nx,hfac,block=blocksize2D,grid=gridsize2D)
  ## Shift both
  shiftGrid_kernel(d_grd,d_ngrd,nx,block=blocksize2D,grid=gridsize2D)

  # ------------------------
  # Make the beam
  # ------------------------
  ## Transform to image plane
  fft.fft(d_nbm,d_bm,plan)
  ## Shift
  shiftGrid_kernel(d_bm,d_nbm,nx,block=blocksize2D,grid=gridsize2D)
  ## Correct for C
  corrGrid_kernel(d_nbm,d_corr,nx,block=blocksize2D,grid=gridsize2D)
  # Trim
  trimIm_kernel(d_nbm,d_fim,noff,nx,imsize,block=blocksizeF2D,grid=gridsizeF2D)
  ## Normalize
  d_bmax = gpu.max(d_fim)
  bmax = d_bmax.get()
  bmax = np.float32(1./bmax)
  nrmBeam_kernel(d_fim,bmax,imsize,block=blocksizeF2D,grid=gridsizeF2D)
  ## Pull onto CPU
  dpsf  = d_fim.get()

  # ------------------------
  # Make the map
  # ------------------------
  ## Transform to image plane
  fft.fft(d_ngrd,d_grd,plan)
  ## Shift
  shiftGrid_kernel(d_grd,d_ngrd,nx,block=blocksize2D,grid=gridsize2D)
  ## Correct for C
  corrGrid_kernel(d_ngrd,d_corr,nx,block=blocksize2D,grid=gridsize2D)
  ## Trim
  trimIm_kernel(d_ngrd,d_fim,noff,nx,imsize,block=blocksizeF2D,grid=gridsizeF2D)
  ## Normalize (Jy/beam)
  nrmGrid_kernel(d_fim,bmax,imsize,block=blocksizeF2D,grid=gridsizeF2D)

  ## Finish timers
  t_end=time.time()
  t_full=t_end-t_start
  print "Gridding execution time %0.5f"%t_full+' s'
  print "\t%0.5f"%(t_full/gcount)+' s per visibility'

  ## Return dirty psf (CPU) and dirty image (GPU)
  return dpsf,d_fim
Example #41
0
    def cuda_gridvis(self, plan, x_offset, y_offset):
        """
        Grid the visibilities parallelized by pixel.
        References:
          - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
              by Thompson, Moran, & Swenson
          - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/

        If the size of the image is 1024x1024, the plan should be at least
        1024*1.414 (about 25 degrees of rotation), and to satisfy the
        requirements of CLEAN the dirty image should be 1024*2.828.
        """
        logger.debug("Gridding the visibilities")
        t_start = time.time()

        nx = np.int32(2 * self.imsize)
        noff = np.int32((nx - self.imsize) / 2)
        arc2rad = np.float32(np.pi / 180. / 3600.)
        du = np.float32(1. / (arc2rad * self.cell)) / (self.imsize * 2.)
        logger.debug("1 Pixel DU  = %f" % du)

        h_uu = np.float32(self.h_uu.ravel())
        h_vv = np.float32(self.h_vv.ravel())
        h_rere = np.float32(self.h_rere.ravel())
        h_imim = np.float32(self.h_imim.ravel())

        blen = 0
        bl_order = np.ndarray(shape=(self.baseline_number, 2), dtype=int)
        good = []

        if self.baseline_number == 780:  # MUSER-I
            antennas = 40
        else:
            antennas = 60
        # print antennas
        for border1 in range(0, antennas - 1):
            for border2 in range(border1 + 1, antennas):
                bl_order[blen][0] = border1
                bl_order[blen][1] = border2
                blen = blen + 1

        h_u = []
        h_v = []
        h_re = []
        h_im = []
        for blen in range(0, self.baseline_number):
            if (bl_order[blen][0]
                    not in self.Flag_Ant) and (bl_order[blen][1]
                                               not in self.Flag_Ant):
                good.append(blen)

                h_u.append(h_uu[blen])
                h_v.append(h_vv[blen])
                h_re.append(h_rere[blen])
                h_im.append(h_imim[blen])

        gcount = np.int32(np.size(h_u))

        # h_ : host,  d_ : device
        # h_grd = np.zeros((nx, nx), dtype=np.complex64)
        # h_cnt = np.zeros((nx, nx), dtype=np.int32)
        d_u = gpu.to_gpu(np.array(h_u, dtype='float32'))
        d_v = gpu.to_gpu(np.array(h_v, dtype='float32'))
        d_re = gpu.to_gpu(np.array(h_re, dtype='float32'))
        d_im = gpu.to_gpu(np.array(h_im, dtype='float32'))
        d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
        d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
        d_ngrd = gpu.zeros_like(d_grd)
        d_bm = gpu.zeros_like(d_grd)
        d_nbm = gpu.zeros_like(d_grd)
        d_cbm = gpu.zeros_like(d_grd)

        d_fbm = gpu.zeros((np.int(nx), np.int(nx)), np.float32)
        d_fim = gpu.zeros((np.int(self.imsize), np.int(self.imsize)),
                          np.float32)
        d_dim = gpu.zeros((np.int(self.imsize), np.int(self.imsize)),
                          np.float32)

        d_sun_disk = gpu.zeros_like(d_grd)
        d_fdisk = gpu.zeros((np.int(self.imsize), np.int(self.imsize)),
                            np.float32)

        ## define kernel parameters
        self.calc_gpu_thread(nx, self.imsize, gcount)

        width = 6.
        ngcf = 24.
        h_cgf = self.gcf(ngcf, width)

        ## make grid correction
        h_corr = self.corrfun(nx, width)
        d_cgf = self.module.get_global('cgf')[0]
        d_corr = gpu.to_gpu(h_corr)
        cu.memcpy_htod(d_cgf, h_cgf)

        # ------------------------
        # grid it up
        # ------------------------
        d_umax = gpu.max(cumath.fabs(d_u))
        d_vmax = gpu.max(cumath.fabs(d_v))
        umax = np.int32(np.ceil(d_umax.get() / du))
        vmax = np.int32(np.ceil(d_vmax.get() / du))

        self.gridVis_wBM_kernel(d_grd,
                                d_bm,
                                d_cbm,
                                d_cnt,
                                d_u,
                                d_v,
                                d_re,
                                d_im,
                                np.int32(nx),
                                np.float32(du),
                                np.int32(gcount),
                                np.int32(umax),
                                np.int32(vmax),
                                np.int32(1 if self.correct_p_angle else 0),
                                block=self.blocksize_2D,
                                grid=self.gridsize_2D)

        ## apply weights
        self.wgtGrid_kernel(d_bm,
                            d_cnt,
                            self.briggs,
                            nx,
                            0,
                            block=self.blocksize_2D,
                            grid=self.gridsize_2D)
        hfac = np.int32(1)
        self.dblGrid_kernel(d_bm,
                            nx,
                            hfac,
                            block=self.blocksize_2D,
                            grid=self.gridsize_2D)
        self.dblGrid_kernel(d_cbm,
                            nx,
                            hfac,
                            block=self.blocksize_2D,
                            grid=self.gridsize_2D)

        self.shiftGrid_kernel(d_bm,
                              d_nbm,
                              nx,
                              block=self.blocksize_2D,
                              grid=self.gridsize_2D)

        self.shiftGrid_kernel(d_cbm,
                              d_bm,
                              nx,
                              block=self.blocksize_2D,
                              grid=self.gridsize_2D)

        ## normalize
        self.wgtGrid_kernel(d_grd,
                            d_cnt,
                            self.briggs,
                            nx,
                            0,
                            block=self.blocksize_2D,
                            grid=self.gridsize_2D)
        ## Reflect grid about v axis
        hfac = np.int32(-1)
        self.dblGrid_kernel(d_grd,
                            nx,
                            hfac,
                            block=self.blocksize_2D,
                            grid=self.gridsize_2D)
        ## Shift both
        self.shiftGrid_kernel(d_grd,
                              d_ngrd,
                              nx,
                              block=self.blocksize_2D,
                              grid=self.gridsize_2D)

        fft.fft(d_ngrd, d_grd, plan)
        ## Shift
        self.shiftGrid_kernel(d_grd,
                              d_ngrd,
                              nx,
                              block=self.blocksize_2D,
                              grid=self.gridsize_2D)
        ## Correct for C
        self.corrGrid_kernel(d_ngrd,
                             d_corr,
                             nx,
                             block=self.blocksize_2D,
                             grid=self.gridsize_2D)
        ## Trim
        self.trimIm_kernel(d_ngrd,
                           d_dim,
                           nx,
                           self.imsize,
                           block=self.blocksize_F2D,
                           grid=self.gridsize_F2D)
        self.copyIm_kernel(d_ngrd,
                           d_fbm,
                           nx,
                           block=self.blocksize_2D,
                           grid=self.gridsize_2D)
        ## Normalize (Jy/beam)
        # self.nrmGrid_kernel(d_dim, bmax1, self.imsize, block=self.blocksize_F2D, grid=self.gridsize_F2D)
        # self.nrmGrid_kernel(d_fbm, bmax2, nx, block=self.blocksize_2D, grid=self.gridsize_2D)

        ## Finish timers
        t_end = time.time()
        t_full = t_end - t_start
        logger.debug("Gridding execution time %0.5f" % t_full + ' s')
        logger.debug("\t%0.5f" % (t_full / gcount) + ' s per visibility')

        # ----------------------

        ## Return dirty psf (CPU), dirty image (GPU) and sun disk

        return d_dim
Example #42
0
def normalize(array):
    array = asgpuarray(array)
    array -= gpuarray.min(array)
    array /= gpuarray.max(array)
    return array
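# A usage sketch for normalize, assuming asgpuarray accepts host arrays and
# that the in-place broadcasts against the 0-d min/max results behave as the
# snippet expects:
import numpy as np

a = normalize(np.array([2.0, 4.0, 6.0], dtype=np.float32))
# a is a GPUArray rescaled so the minimum maps to 0 and the maximum to 1,
# i.e. approximately [0.0, 0.5, 1.0] for this input.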
Example #43
0
    def max(self):
        t = gpuarray.max(self._dat[:self.npart_local, :])
        return t.get()
Example #44
0
def pinv(a_gpu, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    rcond : float
        Singular values smaller than `rcond`*max(singular_values)`
        are set to zero.

    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    If the input matrix is square, the pseudoinverse uses less memory.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = linalg.pinv(a_gpu)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = linalg.pinv(b_gpu)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True

    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # Perform in-place SVD if the matrix is square to save memory:
    if a_gpu.shape[0] == a_gpu.shape[1]:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o')
    else:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's')

    # Suppress very small singular values:
    cutoff_gpu = gpuarray.max(s_gpu) * rcond
    ctype = tools.dtype_to_ctype(s_gpu.dtype)
    cutoff_func = el.ElementwiseKernel(
        "{ctype} *s, {ctype} *cutoff".format(ctype=ctype),
        "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}")
    cutoff_func(s_gpu, cutoff_gpu)

    # Compute the pseudoinverse without allocating a new diagonal matrix:
    return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
Example #45
0
def normalize( data ):
  maxVal = gpuarray.max(data).get()
  linearDouble(1./maxVal, np.float64(0.), data, data )
Example #46
0
def cuda_gridvis(settings, plan):
    """
    Grid the visibilities parallelized by pixel.
    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
          by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180 / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))
    ## grab data
    f = pyfits.open(settings['vfile'])

    # determine the file type (uvfits or fitsidi)

    if settings['vfile'].find('.fitsidi') != -1:

        ## quickly figure out what data is not flagged
        freq = 3.45E11  # np.float32(f[7].header['CRVAL3'])
        #good  = np.where(f[0].data.data[:,0,0,0,0,0,0] != 0)

        #h_u   = np.float32(freq*f[0].data.par('uu')[good])
        #h_v   = np.float32(freq*f[0].data.par('vv')[good])
        light_speed = 299792458.         # Speed of light

        h_u = np.ndarray(shape=(780, 1),dtype='float64')
        h_v = np.ndarray(shape=(780, 1),dtype='float64')
        h_re = np.ndarray(shape=(780, 1),dtype='float32')
        h_im = np.ndarray(shape=(780, 1),dtype='float32')

        h_u = np.float64(light_speed * f[0].data[:].UU)
        h_v = np.float64(light_speed * f[0].data[:].VV)

        for bl in range(0, 780):

            #gcount += np.int32(np.size(h_u[bl]))
            ## assume data is unpolarized
            #h_re   = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,0]+f[0].data.data[good,0,0,0,0,1,0]))
            #h_im   = np.float32(0.5*(f[0].data.data[good,0,0,0,0,0,1]+f[0].data.data[good,0,0,0,0,1,1]))
            h_re[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][0])
            h_im[bl] = np.float32(f[0].data[:].data[bl][0][0][0][0][0][1])
            ## make GPU arrays

        h_u = np.float32(h_u.ravel())
        h_v = np.float32(h_v.ravel())
        gcount = np.int32(np.size(h_u))
        #gcount = len(gcount.ravel())
        h_re = np.float32(h_re.ravel())
        h_im = np.float32(h_im.ravel())
        print len(h_re),len(h_im)
    elif settings['vfile'].find('.uvfits') != -1:
        freq = 3.45E11 #np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        good = np.where(f[0].data.data[:, 0, 0, 0, 0, 0] != 0)
        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, 0, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, 0, 0, 1])

    print h_u

    # h_ : host,  d_ : device
    h_grd = np.zeros((nx, nx), dtype=np.complex64)
    h_cnt = np.zeros((nx, nx), dtype=np.int32)
    d_u = gpu.to_gpu(h_u)
    d_v = gpu.to_gpu(h_v)
    d_re = gpu.to_gpu(h_re)
    d_im = gpu.to_gpu(h_im)
    d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
    d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
    d_ngrd = gpu.zeros_like(d_grd)
    d_bm = gpu.zeros_like(d_grd)
    d_nbm = gpu.zeros_like(d_grd)
    d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)
    ## define kernel parameters
    blocksize2D = (8, 16, 1)
    gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
    blocksizeF2D = (16, 16, 1)
    gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
    blocksize1D = (256, 1, 1)
    gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

    # ------------------------
    # make gridding kernels
    # ------------------------
    ## make spheroidal convolution kernel (don't mess with these!)
    width = 6.
    ngcf = 24.
    h_cgf = gcf(ngcf, width)
    ## make grid correction
    h_corr = corrfun(nx, width)
    d_cgf = module.get_global('cgf')[0]
    d_corr = gpu.to_gpu(h_corr)
    cu.memcpy_htod(d_cgf, h_cgf)

    # ------------------------
    # grid it up
    # ------------------------
    d_umax = gpu.max(cumath.fabs(d_u))
    d_vmax = gpu.max(cumath.fabs(d_v))
    umax = np.int32(np.ceil(d_umax.get() / du))
    vmax = np.int32(np.ceil(d_vmax.get() / du))

    ## grid ($$)
    #  This should be improvable via:
    #    - shared memory solution? I tried...
    #    - better coalesced memory access? I tried...
    #    - reorganizing and indexing UV data beforehand?
    #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
    #    - storing V(u,v) in texture memory?
    gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \
                       block=blocksize2D, grid=gridsize2D)
    ## apply weights
    wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    hfac = np.int32(1)
    dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## normalize
    wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
    ## Reflect grid about v axis
    hfac = np.int32(-1)
    dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
    ## Shift both
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

    # ------------------------
    # Make the beam
    # ------------------------
    ## Transform to image plane
    fft.fft(d_nbm, d_bm, plan)
    ## Shift
    shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    # Trim
    trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize
    d_bmax = gpu.max(d_fim)
    bmax = d_bmax.get()
    bmax = np.float32(1. / bmax)
    nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Pull onto CPU
    dpsf = d_fim.get()

    # ------------------------
    # Make the map
    # ------------------------
    ## Transform to image plane
    fft.fft(d_ngrd, d_grd, plan)
    ## Shift
    shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
    ## Correct for C
    corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
    ## Trim
    trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
    ## Normalize (Jy/beam)
    nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

    ## Finish timers
    t_end = time.time()
    t_full = t_end - t_start
    print "Gridding execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

    ## Return dirty psf (CPU) and dirty image (GPU)
    return dpsf, d_fim
Example #47
0
def pinv(a_gpu, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    rcond : float
        Singular values smaller than `rcond`*max(singular_values)`
        are set to zero.

    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    If the input matrix is square, the pseudoinverse uses less memory.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = linalg.pinv(a_gpu)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = linalg.pinv(b_gpu)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True

    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # Perform in-place SVD if the matrix is square to save memory:
    if a_gpu.shape[0] == a_gpu.shape[1]:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o')
    else:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's')

    # Get block/grid sizes; the number of threads per block is limited
    # to 512 because the cutoff_invert_s kernel defined above uses too
    # many registers to be invoked in 1024 threads per block (i.e., on
    # GPUs with compute capability >= 2.x):
    dev = misc.get_current_device()
    max_threads_per_block = 512
    block_dim, grid_dim = misc.select_block_grid_sizes(dev, s_gpu.shape,
                                                       max_threads_per_block)

    # Suppress very small singular values:
    use_double = 1 if s_gpu.dtype == np.float64 else 0
    cutoff_invert_s_mod = \
        SourceModule(cutoff_invert_s_template.substitute(use_double=use_double))
    cutoff_invert_s = \
                    cutoff_invert_s_mod.get_function('cutoff_invert_s')
    cutoff_gpu = gpuarray.max(s_gpu) * rcond
    cutoff_invert_s(s_gpu,
                    cutoff_gpu,
                    np.uint32(s_gpu.size),
                    block=block_dim,
                    grid=grid_dim)

    # Compute the pseudoinverse without allocating a new diagonal matrix:
    return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
Example #48
0
def calculateTimestep(PropSpeedsGPU, cellDim):

    maxPropSpeed = gpuarray.max(cumath.fabs(PropSpeedsGPU)).get()
    return cellDim / (4.0 * maxPropSpeed), maxPropSpeed
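# This is a CFL-style restriction: the fastest wave may cross at most a
# quarter of a cell per step (the 4.0 is the snippet's own safety factor).
# A usage sketch with illustrative numbers:
import numpy as np
import pycuda.autoinit  # noqa: F401  (initializes a CUDA context)
import pycuda.cumath as cumath
import pycuda.gpuarray as gpuarray

speeds = gpuarray.to_gpu(np.array([-3.0, 1.5, 2.0], dtype=np.float32))
dt, vmax = calculateTimestep(speeds, 0.1)
# vmax == 3.0 (the largest |speed|), dt == 0.1 / (4.0 * 3.0)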
    def max(self):
        return gpuarray.max(self.arr).get().max()