Example 1
def minmax_pycuda(Dx_gpu, Dy_gpu):
    """
    Given two GPUArrays, finds and returns their mins and maxes.
    This is all done in the GPU; only the mins/maxes are sent back to the host.
    """
    return (gpuarray.min(Dx_gpu).get(), gpuarray.max(Dx_gpu).get(),
            gpuarray.min(Dy_gpu).get(), gpuarray.max(Dy_gpu).get())
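A minimal usage sketch (an assumption, not part of the original source): with pycuda.autoinit providing the context, only the four reduced scalars cross back to the host.

import numpy as np
import pycuda.autoinit  # creates a CUDA context
from pycuda import gpuarray

# hypothetical inputs; any two GPUArrays work
Dx_gpu = gpuarray.to_gpu(np.random.randn(1024).astype(np.float32))
Dy_gpu = gpuarray.to_gpu(np.random.randn(1024).astype(np.float32))

xmin, xmax, ymin, ymax = minmax_pycuda(Dx_gpu, Dy_gpu)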
Example 2
def pixel_similarity_cuda(self, image):
    N = image.shape[0]
    nd = self.pairwise_difference(image, N)
    diff = gpuarray.max(nd) - gpuarray.min(nd)
    norm = self.gdivide(nd, diff)
    C = 1 - norm
    return C
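The method is a device-side min-max rescale turned into a similarity (the smallest differences map to values near 1). A standalone sketch of just that normalization with plain gpuarray ops; pairwise_difference and gdivide are class helpers not shown here, so a random matrix stands in:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

nd = gpuarray.to_gpu(np.random.rand(64, 64).astype(np.float32))  # stand-in for pairwise differences
rng = (gpuarray.max(nd) - gpuarray.min(nd)).get()  # scalar range, fetched once
C = 1 - nd / float(rng)  # similarity matrix, still on the device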
Example 3
def _minmax_impl(a_gpu, axis, min_or_max, out, idxout, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.
    Hacked together from scikits.cuda code, since that doesn't have an "out"
    argument'''
    assert len(a_gpu.shape) < 3
    if axis is None:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = scm._get_minmax_kernel(a_gpu.dtype, min_or_max)
    target = out
    idx = idxout
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
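A hedged call sketch for this out-parameter variant, assuming a C-contiguous 2-D array and that the surrounding module provides scm and its kernels; shapes are illustrative only:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

a_gpu = gpuarray.to_gpu(np.random.rand(64, 128).astype(np.float32))
out = gpuarray.empty(128, dtype=a_gpu.dtype)   # one result slot per column
idxout = gpuarray.empty(128, dtype=np.uint32)  # row index of each column max
_minmax_impl(a_gpu, 0, 'max', out, idxout)     # fills out/idxout in place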
Example 4
def timeStepHydro():
  for coord in [ 1, 2, 3]:
    #Bind textures to read conserved
    tex_1.set_array( cnsv1_array )
    tex_2.set_array( cnsv2_array )
    tex_3.set_array( cnsv3_array )
    tex_4.set_array( cnsv4_array )
    tex_5.set_array( cnsv5_array )
    #Bind surfaces to write inter-cell fluxes
    surf_1.set_array( flx1_array )
    surf_2.set_array( flx2_array )
    surf_3.set_array( flx3_array )
    surf_4.set_array( flx4_array )
    surf_5.set_array( flx5_array )
    setInterFlux_hll_kernel( np.int32( coord ), cudaPre( gamma ), cudaPre(dx), cudaPre(dy), cudaPre(dz), cnsv1_d, cnsv2_d, cnsv3_d, cnsv4_d, cnsv5_d, times_d,  grid=grid3D, block=block3D )
    if coord == 1:
      dt = c0 * gpuarray.min( times_d ).get()
      print(dt)
    #Bind textures to read inter-cell fluxes
    tex_1.set_array( flx1_array )
    tex_2.set_array( flx2_array )
    tex_3.set_array( flx3_array )
    tex_4.set_array( flx4_array )
    tex_5.set_array( flx5_array )
    getInterFlux_hll_kernel( np.int32( coord ), cudaPre( dt ), cudaPre( gamma ), cudaPre(dx), cudaPre(dy), cudaPre(dz),
                          cnsv1_d, cnsv2_d, cnsv3_d, cnsv4_d, cnsv5_d,
                          gForceX_d, gForceY_d, gForceZ_d, gravWork_d, grid=grid3D, block=block3D )
    copy3D_cnsv1()
    copy3D_cnsv2()
    copy3D_cnsv3()
    copy3D_cnsv4()
    copy3D_cnsv5()
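The only device-to-host transfer inside the loop is the CFL reduction that sets dt; isolated, that step looks like the sketch below (c0 and the signal-time array are hypothetical stand-ins):

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

c0 = 0.5  # hypothetical CFL safety factor
times_d = gpuarray.to_gpu(np.random.rand(256**2).astype(np.float32) + 1e-3)
dt = c0 * gpuarray.min(times_d).get()  # one scalar back to the host per step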
Example 5
def minimum_cuda(a, b=None):
    """Minimum values of two GPUArrays.

    Parameters
    ----------
    a : gpuarray
        First GPUArray.
    b : gpuarray
        Second GPUArray.

    Returns
    -------
    gpuarray
        Minimum values from both GPUArrays, or a single value if only one GPUArray is given.

    Examples
    --------
    >>> a = minimum_cuda(give_cuda([1, 2, 3]), give_cuda([3, 2, 1]))
    >>> a
    [1, 2, 1]

    >>> type(a)
    <class 'pycuda.gpuarray.GPUArray'>
    """
    if b is not None:
        return cuda_array.minimum(a, b)
    return cuda_array.min(a)
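The two branches differ in kind: with b the call is an elementwise minimum, without b it is a full reduction (cuda_array and give_cuda above appear to be the host library's wrappers). The same contrast with PyCUDA's own gpuarray functions:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

a = gpuarray.to_gpu(np.array([1, 2, 3], dtype=np.float32))
b = gpuarray.to_gpu(np.array([3, 2, 1], dtype=np.float32))
print(gpuarray.minimum(a, b).get())  # elementwise: [1. 2. 1.]
print(gpuarray.min(a).get())         # full reduction: 1.0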
Example 6
def _minmax_impl(a_gpu, axis, min_or_max, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.'''
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    global _global_cublas_allocator
    alloc = _global_cublas_allocator

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        target = gpuarray.empty(m, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(m, dtype=np.uint32, allocator=alloc)
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        target = gpuarray.empty(n, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(n, dtype=np.uint32, allocator=alloc)
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
    return target, idx
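This mirrors scikit-cuda's internal routine; in practice the public wrappers reach it. A hedged sketch, assuming skcuda is installed (skcuda.misc.max/argmax exist in released versions, though exact signatures may vary):

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
import skcuda.misc as misc

misc.init()  # initializes CUBLAS and the global allocator used above
a_gpu = gpuarray.to_gpu(np.random.rand(32, 16).astype(np.float32))
col_max = misc.max(a_gpu, axis=0)     # per-column maxima (GPUArray)
col_idx = misc.argmax(a_gpu, axis=0)  # row index of each maximum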
Example 7
    def integrate(self, t, dt, nacptsteps, d_ucoeff):
        sm, explicit = self._sm, self._explicit
        moment, updateMoment = sm.moment, sm.updateMomentBDF
        updateDist, consMaxwellian = sm.updateDistBDF, sm.constructMaxwellian

        L0, M = self.scratch
        U0, LU0, U = self.scratch_moms
        a1, a2, g1, b = [*self.A, *self.G, self.B]  

        # debug helpers: print a reduced quantity, then abort the run
        pex = lambda *v: (print(*v), exit(-1))
        psum = lambda v: pex(gpuarray.sum(v))
        pMom = lambda v: pex(v.get().reshape(-1, 5))
        pmin = lambda v: pex(gpuarray.min(v))
        pmax = lambda v: pex(gpuarray.max(v))

        # Compute the moment of the initial distribution
        moment(t, d_ucoeff, U0)

        # Compute the explicit part; L0 = -∇·f(d_ucoeff);
        explicit(t, d_ucoeff, L0)

        # Compute the moment of the explicit part
        moment(t, L0, LU0) 

        # update the moments
        updateMoment(dt, a1, U0, -g1, LU0, a2, U, b)
        #pex(U.get().reshape(-1,5))

        # implicitly construct the Maxwellian (or Gaussian, etc.) given moments
        consMaxwellian(t, U, M)
        #pex(gpuarray.sum(L0))

        if nacptsteps == -1:
            #pex(LU0.get().reshape(-1,5))
            #pex(gpuarray.sum(d_ucoeff))
            pass


        # update the distribution
        updateDist(dt, a1, d_ucoeff, -g1, L0, b, M, a2, U, d_ucoeff)
        #pex(gpuarray.sum(d_ucoeff))


        if nacptsteps == -1:
            #print("\n>> BDF-111\n")

            #pMom(U0)
            #psum(U0)
            #psum(L0)
            #pmax(L0)
            #psum(LU0)
            #pMom(LU0)
            #psum(U)
            #psum(M)
            #psum(d_ucoeff)
            #pmin(d_ucoeff)

            #exit(-1)
            pass
Example 8
def _minmax_impl(a_gpu, axis, min_or_max, stream=None, keepdims=False):
    ''' Returns both max and argmax (min/argmin) along an axis.'''
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None or len(a_gpu.shape) <= 1:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        out_shape = (1, ) * len(a_gpu.shape)
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).reshape(out_shape), None
        else:
            return gpuarray.min(a_gpu).reshape(out_shape), None
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    global _global_cublas_allocator
    alloc = _global_cublas_allocator

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        if keepdims:
            out_shape = (1, m) if axis == 0 else (m, 1)
        else:
            out_shape = (m, )
        target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        if keepdims:
            out_shape = (1, n) if axis == 0 else (n, 1)
        else:
            out_shape = (n, )
        target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
    return target, idx
Example 9
    def get_K_min(self):
        """
        Return the kinetic energy minimum
        """
        # fill array with values of the kinetic energy
        fill_compiled = SourceModule(
            self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
        )
        fill_compiled.get_function("fill_K")(self.weighted, **self.wigner_mapper_params)

        return gpuarray.min(self.weighted).get()
Example 10
    def get_K_min(self):
        """
        Return the kinetic energy minimum
        """
        # allocate memory
        k_p_p_prime = gpuarray.zeros((self.P.size, self.P.size), np.float64)

        # fill array with values of the kinetic energy
        fill_compiled = SourceModule(
            self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
        )
        fill_compiled.get_function("fill_K")(k_p_p_prime, **self.rho_mapper_params)

        return gpuarray.min(k_p_p_prime).get()
Example 11
    def get_K_min(self):
        """
        Return the kinetic energy minimum in the lambda p space
        """
        # allocate memory
        k_p_lambda = gpuarray.zeros((self.P.size, self.Lambda.size), np.float64)

        # fill array with values of the kinetic energy
        fill_compiled = SourceModule(
            self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
        )
        fill_compiled.get_function("fill_K_bulk")(k_p_lambda, **self.K_bulk_mapper_params)
        fill_compiled.get_function("fill_K_boundary")(k_p_lambda, **self.K_boundary_mapper_params)

        return gpuarray.min(k_p_lambda).get()
Example 12
    def get_V_min(self):
        """
        Return the potential energy minimum in the x theta space
        """
        # allocate memory
        v_theta_x = gpuarray.zeros((self.Theta.size, self.X.size), np.float64)

        # fill array with values of the potential energy
        fill_compiled = SourceModule(
            self.fill_V_K.format(cuda_consts=self.cuda_consts, K=self.K, V=self.V)
        )
        fill_compiled.get_function("fill_V_bulk")(v_theta_x, **self.V_bulk_mapper_params)
        fill_compiled.get_function("fill_V_boundary")(v_theta_x, **self.V_boundary_mapper_params)

        return gpuarray.min(v_theta_x).get()
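All of these getters share one pattern: a SourceModule kernel fills a device buffer, then gpuarray.min reduces it and only the final scalar is fetched. A self-contained toy version of that pattern (the kernel body is a stand-in, not the fill_K source):

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void fill(double *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { double x = i * 0.01 - 5.0; out[i] = x * x; }
}
""")
n = 1024
buf = gpuarray.zeros(n, np.float64)
mod.get_function("fill")(buf, np.int32(n),
                         block=(256, 1, 1), grid=((n + 255) // 256, 1))
print(gpuarray.min(buf).get())  # minimum of x**2 on the grid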
Example 13
def implicit_iteration( ):
  global alpha
  #Make FFT
  fftPlan.execute( psi_d, psiFFT_d )
  #get Derivatives
  getPartialsXY( Lx, Ly, psiFFT_d, partialX_d, fftKx_d, partialY_d, fftKy_d, block=block3D, grid=grid3D) 
  fftPlan.execute( partialX_d, inverse=True )
  fftPlan.execute( partialY_d, inverse=True )   
  implicitStep1( xMin, yMin, zMin, dx, dy, dz, alpha, omega, gammaX, gammaY, gammaZ,
                 partialX_d, partialY_d, psi_d, G_d, x0, y0, grid=grid3D, block=block3D)
  fftPlan.execute( G_d )
  implicitStep2( dtImag, fftKx_d , fftKy_d, fftKz_d, alpha, psiFFT_d, G_d, block=block3D, grid=grid3D) 
  fftPlan.execute( psiFFT_d, psi_d, inverse=True)  
  #setBoundryConditionsKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), psi_d, block=block3D, grid=grid3D)  
  normalize(dx, dy, dz, psi_d)
  #GetAlphas
  getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D)
  alpha = cudaPre( ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() )  # OPTIMIZATION
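The alpha update is the midpoint of the device-side range of alphas_d; both reductions and their sum stay on the GPU until the final .get(). Minimal standalone form:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

alphas_d = gpuarray.to_gpu(np.random.rand(64**3))
alpha = (0.5 * (gpuarray.max(alphas_d) + gpuarray.min(alphas_d))).get()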
Example 14
def timeTransition():
  global realDynamics, alpha, applyTransition
  realDynamics = not realDynamics
  applyTransition = False
  if realDynamics:
    cuda.memcpy_dtod(psiK2_d.ptr, psi_d.ptr, psi_d.nbytes)
    cuda.memcpy_dtod(psiRunge_d.ptr, psi_d.ptr, psi_d.nbytes)
    if realTEXTURE:
      copy3DpsiK1Real()
      copy3DpsiK1Imag()
      copy3DpsiK2Real()
      copy3DpsiK2Imag()
    print "Real Dynamics"
  else:    
    #GetAlphas
    getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D)
    alpha = cudaPre( ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() )  # OPTIMIZATION
    print("Imaginary Dynamics")
Example 15
def _minmax_impl(a_gpu, axis, min_or_max, out, idxout, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.
    Hacked together from scikits.cuda code, since that doesn't have an "out"
    argument'''
    assert len(a_gpu.shape) < 3
    if axis is None:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = scm._get_minmax_kernel(a_gpu.dtype, min_or_max)
    target = out
    idx = idxout
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
Example 16
    def __call__(self, theta):
        self.nCalls += 1
        if len(theta) != self.cpa_space.d:
            raise ValueError(theta.shape, self.cpa_space.d)
        self.calc_ll(theta)
        ll = self.ll         
#        ret = gpuarray.sum(ll.gpu).get()
        if 1:
            ret = sum_krnl(ll.gpu, stream=None).get()
        else:
            raise ValueError("BAD IDEA")
            ret = gpuarray.min(ll.gpu).get() * len(ll)
            
        
#        tmp =  calc_sum_abs_double_prime(self.transformed.gpu,self.nPts).get() 
#         
#        tmp /= (0.02)**2
##        print 'tmp',tmp
#        ret -= tmp / 10
#        ipshell('hi')
#        1/0            
        return ret
Example 17
def _minmax_impl(a_gpu, axis, min_or_max, stream=None, keepdims=False):
    """ Returns both max and argmax (min/argmin) along an axis."""
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None or len(a_gpu.shape) <= 1:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        out_shape = (1,) * len(a_gpu.shape)
        if min_or_max == "max":
            return gpuarray.max(a_gpu).reshape(out_shape), None
        else:
            return gpuarray.min(a_gpu).reshape(out_shape), None
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    global _global_cublas_allocator
    alloc = _global_cublas_allocator

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        if keepdims:
            out_shape = (1, m) if axis == 0 else (m, 1)
        else:
            out_shape = (m,)
        target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n), block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        if keepdims:
            out_shape = (1, n) if axis == 0 else (n, 1)
        else:
            out_shape = (n,)
        target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc)
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n), block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
    return target, idx
Example 18
    def __call__(self, theta):
        self.nCalls += 1
        if len(theta) != self.cpa_space.d:
            raise ValueError(theta.shape, self.cpa_space.d)
        self.calc_ll(theta)
        ll = self.ll
        #        ret = gpuarray.sum(ll.gpu).get()
        if 1:
            ret = sum_krnl(ll.gpu, stream=None).get()
        else:
            raise ValueError("BAD IDEA")
            ret = gpuarray.min(ll.gpu).get() * len(ll)


#        tmp =  calc_sum_abs_double_prime(self.transformed.gpu,self.nPts).get()
#
#        tmp /= (0.02)**2
##        print 'tmp',tmp
#        ret -= tmp / 10
#        ipshell('hi')
#        1/0
        return ret
Example 19
def computeDt(self):
    max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
    return max_dt * 0.5
Example 20
def min(self):
    return gpuarray.min(self.arr).get().min()
Example 21
def normalize(array):
    array = asgpuarray(array)
    array -= gpuarray.min(array)
    array /= gpuarray.max(array)
    return array
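A usage sketch, assuming asgpuarray passes GPUArrays through unchanged; the in-place -= and /= rely on PyCUDA broadcasting the 0-d results of gpuarray.min and gpuarray.max:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

arr = gpuarray.to_gpu(np.array([2.0, 4.0, 6.0], dtype=np.float32))
print(normalize(arr).get())  # -> [0., 0.5, 1.]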
Example 22
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

# assumed setup for this truncated snippet: a dtype and a sample device array
dtype = np.float32
a = gpuarray.to_gpu(np.random.rand(100).astype(dtype))

c = gpuarray.empty((100, 100), dtype=dtype)
print('c:\n{0}\nshape={1}\n'.format(c, c.shape))

d = gpuarray.zeros((100, 100), dtype=dtype)
print('d:\n{0}\nshape={1}\n'.format(d, d.shape))

e = gpuarray.arange(0.0, 100.0, 1.0, dtype=dtype)
print('e:\n{0}\nshape={1}\n'.format(e, e.shape))

f = gpuarray.if_positive(e < 50, e - 100, e + 100)
print('f:\n{0}\nshape={1}\n'.format(f, f.shape))

g = gpuarray.if_positive(e < 50, gpuarray.ones_like(e), gpuarray.zeros_like(e))
print('g:\n{0}\nshape={1}\n'.format(g, g.shape))

h = gpuarray.maximum(e, f)
print('h:\n{0}\nshape={1}\n'.format(h, h.shape))

i = gpuarray.minimum(e, f)
print('i:\n{0}\nshape={1}\n'.format(i, i.shape))

g = gpuarray.sum(a)
print(g, type(g))

k = gpuarray.max(a)
print(k, type(k))

l = gpuarray.min(a)
print(l, type(l))
Example 23
# build FFT wavenumbers: positive frequencies first, then negative
for i in range(nWidth // 2):
  fftKx_h[i] = i*2*np.pi/Lx
for i in range(nWidth // 2, nWidth):
  fftKx_h[i] = (i-nWidth)*2*np.pi/Lx
for i in range(nHeight // 2):
  fftKy_h[i] = i*2*np.pi/Ly
for i in range(nHeight // 2, nHeight):
  fftKy_h[i] = (i-nHeight)*2*np.pi/Ly
for i in range(nDepth // 2):
  fftKz_h[i] = i*2*np.pi/Lz
for i in range(nDepth // 2, nDepth):
  fftKz_h[i] = (i-nDepth)*2*np.pi/Lz
psi_d = gpuarray.to_gpu(psi_h)
alphas_d = gpuarray.to_gpu(  np.zeros_like(psi_h.real) )
normalize( dx, dy, dz, psi_d )
getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D)
alpha = ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get()
psiMod_d = gpuarray.to_gpu(  np.zeros_like(psi_h.real) )
psiFFT_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
partialX_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
partialY_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
G_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )
fftKx_d = gpuarray.to_gpu( fftKx_h )         #OPTIMIZATION
fftKy_d = gpuarray.to_gpu( fftKy_h )
fftKz_d = gpuarray.to_gpu( fftKz_h )
activity_d = gpuarray.to_gpu( np.ones( nBlocks3D, dtype=np.uint8 ) )
psiOther_d = gpuarray.to_gpu(  np.zeros_like(psi_h.real) )
psiK1_d = gpuarray.to_gpu( psi_h )
psiK2_d = gpuarray.to_gpu( psi_h )
psiRunge_d = gpuarray.to_gpu( psi_h )
#For FFT version
laplacian_d = gpuarray.to_gpu(  np.zeros_like(psi_h) )