コード例 #1
0
def create_quantiles(data, params):
    """Fill the lower/upper quantile buffers from ``data`` on the GPU.

    Operates on the module-level buffers ``quantiles``, ``q_lb``, ``q_ub`` and
    ``mask``. ``mask`` is replaced by a zero array shaped like ``data`` when
    the shapes differ, and ``q_lb`` is rebound to the result of
    ``q_lb.reverse()``.

    :param data: GPU array; handed to ``sort_gpu`` first (its return value is
        ignored, so the sort is presumably in place -- TODO confirm).
    :param params: not used by this function.
    :return: ``(data, q_lb, q_ub, probs * (1 - p_ub), probs * p_ub)`` where
        ``q_lb``, ``q_ub`` and ``p_ub`` are fetched to the host with ``.get()``
        and ``p_ub = n_ub / (n_ub + n_lb)``.
    """
    global quantiles, q_lb, q_ub, mask
    sort_gpu(data)

    # The comparison mask must match the input's shape.
    if mask.shape != data.shape:
        mask = gpuarray.zeros_like(data)

    # Sums of the element-wise comparisons against the mask.
    below_count = gpuarray.sum(data < mask)
    above_count = gpuarray.sum(data > mask)

    # Both kernels are launched with one thread per entry of ``quantiles``.
    fill_lb_quantiles(data, quantiles, below_count, above_count, q_lb,
                      block=(quantiles.shape[0], 1, 1))
    fill_ub_quantiles(data, quantiles, below_count, above_count, q_ub,
                      block=(quantiles.shape[0], 1, 1))
    q_lb = q_lb.reverse()

    upper_fraction = above_count / (above_count + below_count)

    del below_count, above_count

    lower_quantiles = q_lb.get()
    upper_quantiles = q_ub.get()
    lower_probs = probs * (1 - upper_fraction.get())
    upper_probs = probs * upper_fraction.get()
    return data, lower_quantiles, upper_quantiles, lower_probs, upper_probs
コード例 #2
0
ファイル: algo.py プロジェクト: uncbiag/PStrip
def computeEnergy(D_v, S, T, _Lambda, _gamma_c, Alpha, Beta):
    """Evaluate the three energy terms and their weighted total.

    The helper kernels (``square_matrix``, ``sqrt_matrix``, ``absolute_matrix``,
    ``multiply_matrix``, ``sum_three_matrix``) are defined elsewhere; they are
    assumed to write their result into their output argument -- TODO confirm.

    :param D_v: GPU array; used as a column of ``l * m * n`` rows.
    :param S: 3-D GPU array (its shape is unpacked into ``l, m, n``).
    :param T: GPU array reshaped to ``(l * m * n, 1)`` for the residual.
    :param _Lambda: weight multiplied into ``|S|``.
    :param _gamma_c: scalar weight of the gradient-norm term.
    :param Alpha: right factor of ``dot(Beta, Alpha)``.
    :param Beta: left factor of ``dot(Beta, Alpha)``.
    :return: ``(EL, ES, ET, E)`` as host values, with
        ``E = 0.5 * EL + ES + ET``.
    """
    l, m, n = S.shape

    sum_alpha_beta = gpuarray.zeros_like(D_v)
    sk_linalg.dot(Beta, Alpha, out=sum_alpha_beta)

    # ET: _gamma_c * sum of the per-element gradient magnitude of T.
    GR = grad(T)
    square_matrix(GR, GR)
    G_norm = gpuarray.zeros_like(T)
    sum_three_matrix(GR[0, :, :, :], GR[1, :, :, :], GR[2, :, :, :], G_norm,
                     1.0, 1.0, 1.0)
    sqrt_matrix(G_norm, G_norm)
    ET = _gamma_c * gpuarray.sum(G_norm)

    # ES: sum of |S| weighted by _Lambda.
    SP = gpuarray.zeros_like(S)
    absolute_matrix(S, SP)
    multiply_matrix(SP, _Lambda, SP)
    ES = gpuarray.sum(SP)

    # EL: sum of squares of the residual D_v - S - T - Beta.Alpha.
    sparse = D_v - S.reshape(l * m * n, 1) - T.reshape(l * m * n,
                                                       1) - sum_alpha_beta
    square_matrix(sparse, sparse)
    EL = gpuarray.sum(sparse)

    # Copy each scalar to the host exactly once.
    el_host, es_host, et_host = EL.get(), ES.get(), ET.get()

    # 0.5 instead of ``1 / 2``: the latter is integer division (== 0) under
    # Python 2 and silently removed the EL contribution from E.
    E = 0.5 * el_host + es_host + et_host

    return el_host, es_host, et_host, E
コード例 #3
0
ファイル: ssrbf_psi_gpucomp.py プロジェクト: Arthurkorn/GPy
    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
        """Compute gradients w.r.t. the kernel and variational parameters on the GPU.

        Launches the two prepared kernels ``g_psi1compDer`` and
        ``g_psi2compDer`` on buffers taken from ``self.gpuCache`` and then
        reduces the resulting buffers to the returned gradients.

        :param dL_dpsi0: summed as a whole; a GPU array when ``self.GPU_direct``
            is set, otherwise an object with a ``.sum()`` method.
        :param dL_dpsi1: used directly as a GPU array when ``self.GPU_direct``
            is set, otherwise uploaded in Fortran order.
        :param dL_dpsi2: handled like ``dL_dpsi1``.
        :param variance: passed to the kernels as ``np.float64``.
        :param lengthscale: only its length is inspected here (ARD when != 1).
        :param Z: forwarded to ``self.get_dimensions``.
        :param variational_posterior: forwarded to ``self.get_dimensions``.
        :return: ``(dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma)``
            as host values.
        """
        # More than one lengthscale -> keep a per-dimension gradient.
        ARD = (len(lengthscale)!=1)
        
        N,M,Q = self.get_dimensions(Z, variational_posterior)
        # Device buffers cached on the instance.
        psi1_gpu = self.gpuCache['psi1_gpu']
        psi2n_gpu = self.gpuCache['psi2n_gpu']
        l_gpu = self.gpuCache['l_gpu']
        Z_gpu = self.gpuCache['Z_gpu']
        mu_gpu = self.gpuCache['mu_gpu']
        S_gpu = self.gpuCache['S_gpu']
        gamma_gpu = self.gpuCache['gamma_gpu']
        dvar_gpu = self.gpuCache['dvar_gpu']
        dl_gpu = self.gpuCache['dl_gpu']
        dZ_gpu = self.gpuCache['dZ_gpu']
        dmu_gpu = self.gpuCache['dmu_gpu']
        dS_gpu = self.gpuCache['dS_gpu']
        dgamma_gpu = self.gpuCache['dgamma_gpu']
        grad_l_gpu = self.gpuCache['grad_l_gpu']
        grad_mu_gpu = self.gpuCache['grad_mu_gpu']
        grad_S_gpu = self.gpuCache['grad_S_gpu']
        grad_gamma_gpu = self.gpuCache['grad_gamma_gpu']
        log_denom1_gpu = self.gpuCache['log_denom1_gpu']
        log_denom2_gpu = self.gpuCache['log_denom2_gpu']
        log_gamma_gpu = self.gpuCache['log_gamma_gpu']
        log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
        
        if self.GPU_direct:
            # Inputs are used as GPU arrays without any upload.
            dL_dpsi1_gpu = dL_dpsi1
            dL_dpsi2_gpu = dL_dpsi2
            dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
        else:
            # Upload the host inputs into the cached device buffers.
            dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
            dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
            dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
            dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
            dL_dpsi0_sum = dL_dpsi0.sum()

        # Presumably clears the derivative buffers before the kernels write
        # into them -- TODO confirm against reset_derivative.
        self.reset_derivative()
        # t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
        # print 'g_psi1compDer '+str(t)
        # t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
        # print 'g_psi2compDer '+str(t)
        # Both kernels receive the same output buffers; they differ in the
        # dL_dpsi / psi inputs and in the log_denom buffer.
        self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
        self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))

        # Reduce the kernel outputs and copy the results to the host.
        # NOTE(review): sum_axis presumably collapses the per-block partial
        # results (self.blocknum of them) -- verify against its definition.
        dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
        sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
        dL_dmu = grad_mu_gpu.get()
        sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
        dL_dS = grad_S_gpu.get()
        sum_axis(grad_gamma_gpu,dgamma_gpu,N*Q,self.blocknum)
        dL_dgamma = grad_gamma_gpu.get()
        dL_dZ = dZ_gpu.get()
        if ARD:
            # One gradient entry per lengthscale.
            sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
            dL_dlengscale = grad_l_gpu.get()
        else:
            # Single shared lengthscale: collapse everything to one value.
            dL_dlengscale = gpuarray.sum(dl_gpu).get()
            
        return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
コード例 #4
0
ファイル: test_gpuarray.py プロジェクト: inducer/pycuda
    def test_sum_allocator(self):
        """Check that ``gpuarray.sum`` honours an explicit allocator.

        Currently skipped unconditionally; everything after the ``skip`` call
        is kept for when the linked issue is resolved.
        """
        # FIXME
        from pytest import skip

        # crashes with  terminate called after throwing an instance of 'pycuda::error'
        # what():  explicit_context_dependent failed: invalid device context - no currently active context?
        skip("https://github.com/inducer/pycuda/issues/163")

        import pycuda.tools

        pool = pycuda.tools.DeviceMemoryPool()

        rng = np.random.randint(low=512, high=1024)
        expected_total = rng * (rng - 1) // 2

        a = gpuarray.arange(rng, dtype=np.int32)
        default_sum = gpuarray.sum(a)
        pooled_sum = gpuarray.sum(a, allocator=pool.allocate)

        # Both reductions must produce 0 + 1 + ... + (rng - 1).
        assert default_sum.get() == expected_total
        assert pooled_sum.get() == expected_total

        # Each result must carry the allocator it was created with.
        assert default_sum.allocator == a.allocator
        assert pooled_sum.allocator == pool.allocate
コード例 #5
0
    def cuda_run(self, prefix, supportK):
        """Recursively record every itemset whose support reaches ``self.min_support``.

        :param prefix: list of items already fixed on this recursion branch.
        :param supportK: list of ``(itemset, bitvector)`` pairs; consumed from
            the front (the list is emptied by this call).
        :return: None. Results are written to ``self.support_list`` keyed by
            ``frozenset`` of the sorted items.
        """
        print('Running Eclat in recursive: number of itemsets found:',
              len(self.support_list),
              end='\r')

        while supportK:
            itemset, bitvector = supportK.pop(0)
            support = gpuarray.sum(bitvector).get()

            if not (support >= self.min_support):
                continue

            extended_prefix = prefix + [itemset]
            self.support_list[frozenset(sorted(extended_prefix))] = int(support)

            # Combine the current bit vector with each remaining candidate.
            suffix = []
            for itemset_sub, bitvector_sub in supportK:
                if not (gpuarray.sum(bitvector_sub).get() >= self.min_support):
                    continue

                if self.use_optimal:
                    union_bitvector = bitvector * bitvector_sub
                else:
                    union_bitvector = gpuarray.zeros_like(bitvector)
                    self.multiply(union_bitvector,
                                  bitvector,
                                  bitvector_sub,
                                  block=self.block,
                                  grid=self.grid)

                combined_support = gpuarray.sum(union_bitvector).get()
                if combined_support >= self.min_support:
                    suffix.append((itemset_sub, union_bitvector))

            # Recurse with candidates ordered by descending integer item id.
            ordered_suffix = sorted(suffix, key=lambda x: int(x[0]), reverse=True)
            self.cuda_run(extended_prefix, ordered_suffix)
コード例 #6
0
	def apply_mds_parallel2(self):
		"""Iterate the ``mds2_kernel`` CUDA kernel until the stress ratio is at most 0.001.

		Uploads ``self.graph`` and several length-``self.N`` work buffers as
		float32, then repeatedly launches the kernel. After each launch the
		stress is ``sum(sigma_d) / sum(delta_d)``. The last score vector is
		stored on the host in ``self.outdata``.

		:return: None.
		"""
		print("Applying parallel MDS via SMACOF...")
		# NOTE(review): time.clock() no longer exists on Python >= 3.8; it is
		# kept because the rest of this file targets the older runtime.
		current_time = time.clock()
		graph_d = gpu.to_gpu(np.float32(self.graph))
		row_sum_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
		# Random initial scores in [0, 10).
		score_current_d = gpu.to_gpu(np.float32(np.random.uniform(0, 10, size=self.N)))
		score_next_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
		sigma_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
		delta_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
		mds2_kernel = cuda_compile(_kernel_source, 'mds2_kernel')
		stress  = 1
		while (stress > 0.001):
			mds2_kernel(
				graph_d,
				row_sum_d,
				score_current_d,
				score_next_d,
				sigma_d,
				delta_d,
				np.int32(self.N),
				block=(1024, 1, 1), 
				grid=(int(self.N / 1024 + 1), int(1))
			)
			# The freshly written scores become the input of the next pass;
			# the output buffer is replaced by a new zero-filled one.
			score_current_d = score_next_d
			score_next_d = gpu.to_gpu(np.float32(np.zeros(self.N)))
			stress = gpu.sum(sigma_d).get() / gpu.sum(delta_d).get()
		self.outdata = score_current_d.get()
		# Parenthesised single-argument form: valid as a Python 2 statement
		# and as a Python 3 call, matching the print() call at the top.
		print("Time to apply parallel MDS: %6.2f s" % (time.clock() - current_time))
コード例 #7
0
ファイル: misc.py プロジェクト: Eric89GXL/scikits.cuda
def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False, ddof=0,
              keepdims=False):
    global _global_cublas_allocator
    assert isinstance(ddof, numbers.Integral)

    if axis is None or len(x_gpu.shape) <= 1:
        out_shape = (1,)*len(x_gpu.shape) if keepdims else ()
        if calc_mean == False:
            return gpuarray.sum(x_gpu).reshape(out_shape)
        else:
            return gpuarray.sum(x_gpu).reshape(out_shape) / (x_gpu.dtype.type(x_gpu.size-ddof))

    if axis < 0:
        axis += 2
    if axis > 1:
        raise ValueError('invalid axis')

    if x_gpu.flags.c_contiguous:
        n, m = x_gpu.shape[1], x_gpu.shape[0]
        lda = x_gpu.shape[1]
        trans = "n" if axis == 0 else "t"
        sum_axis, out_axis = (m, n) if axis == 0 else (n, m)
    else:
        n, m = x_gpu.shape[0], x_gpu.shape[1]
        lda = x_gpu.shape[0]
        trans = "t" if axis == 0 else "n"
        sum_axis, out_axis = (n, m) if axis == 0 else (m, n)

    if calc_mean:
        alpha = (1.0 / (sum_axis-ddof))
    else:
        alpha = 1.0
    if (x_gpu.dtype == np.complex64):
        gemv = cublas.cublasCgemv
    elif (x_gpu.dtype == np.float32):
        gemv = cublas.cublasSgemv
    elif (x_gpu.dtype == np.complex128):
        gemv = cublas.cublasZgemv
    elif (x_gpu.dtype == np.float64):
        gemv = cublas.cublasDgemv

    alloc = _global_cublas_allocator
    ons = ones((sum_axis, ), x_gpu.dtype, allocator=alloc)

    if keepdims:
        out_shape = (1, out_axis) if axis == 0 else (out_axis, 1)
    else:
        out_shape = (out_axis,)

    if out is None:
        out = gpuarray.empty(out_shape, x_gpu.dtype, alloc)
    else:
        assert out.dtype == x_gpu.dtype
        assert out.size >= out_axis

    gemv(_global_cublas_handle, trans, n, m,
         alpha, x_gpu.gpudata, lda,
         ons.gpudata, 1, 0.0, out.gpudata, 1)
    return out
コード例 #8
0
ファイル: misc.py プロジェクト: anGie44/scikit-cuda
def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False, ddof=0,
              keepdims=False):
    global _global_cublas_allocator
    assert isinstance(ddof, numbers.Integral)

    if axis is None or len(x_gpu.shape) <= 1:
        out_shape = (1,)*len(x_gpu.shape) if keepdims else ()
        if calc_mean == False:
            return gpuarray.sum(x_gpu).reshape(out_shape)
        else:
            return gpuarray.sum(x_gpu).reshape(out_shape) / (x_gpu.dtype.type(x_gpu.size-ddof))

    if axis < 0:
        axis += 2
    if axis > 1:
        raise ValueError('invalid axis')

    if x_gpu.flags.c_contiguous:
        n, m = x_gpu.shape[1], x_gpu.shape[0]
        lda = x_gpu.shape[1]
        trans = "n" if axis == 0 else "t"
        sum_axis, out_axis = (m, n) if axis == 0 else (n, m)
    else:
        n, m = x_gpu.shape[0], x_gpu.shape[1]
        lda = x_gpu.shape[0]
        trans = "t" if axis == 0 else "n"
        sum_axis, out_axis = (n, m) if axis == 0 else (m, n)

    if calc_mean:
        alpha = (1.0 / (sum_axis-ddof))
    else:
        alpha = 1.0
    if (x_gpu.dtype == np.complex64):
        gemv = cublas.cublasCgemv
    elif (x_gpu.dtype == np.float32):
        gemv = cublas.cublasSgemv
    elif (x_gpu.dtype == np.complex128):
        gemv = cublas.cublasZgemv
    elif (x_gpu.dtype == np.float64):
        gemv = cublas.cublasDgemv

    alloc = _global_cublas_allocator
    ons = ones((sum_axis, ), x_gpu.dtype, alloc)

    if keepdims:
        out_shape = (1, out_axis) if axis == 0 else (out_axis, 1)
    else:
        out_shape = (out_axis,)

    if out is None:
        out = gpuarray.empty(out_shape, x_gpu.dtype, alloc)
    else:
        assert out.dtype == x_gpu.dtype
        assert out.size >= out_axis

    gemv(_global_cublas_handle, trans, n, m,
         alpha, x_gpu.gpudata, lda,
         ons.gpudata, 1, 0.0, out.gpudata, 1)
    return out
コード例 #9
0
 def updateBC(ul, t):
     """Run the boundary kernel and refresh ``self._wall_nden[p]``.

     Relies on ``self``, ``p``, ``updateBCFunc``, ``grid_Nv`` and ``block``
     from the enclosing scope. After the launch, the stored value is the
     negated ratio of the summed numerator and denominator buffers.
     """
     kernel_args = (
         ul.ptr,
         self._vm.d_cvx().ptr,
         self._d_bnd_f0[p].ptr,
         self._bc_vals_num[p].ptr,
         self._bc_vals_den[p].ptr,
         t,
     )
     updateBCFunc.prepared_call(grid_Nv, block, *kernel_args)

     numerator_total = gpuarray.sum(self._bc_vals_num[p])
     denominator_total = gpuarray.sum(self._bc_vals_den[p])
     self._wall_nden[p] = -(numerator_total / denominator_total)
コード例 #10
0
	def Average_TotalProbabilityP( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
		"""Return ``dPx * dPy`` times the summed ``Psi * conj(Psi)`` of all four components."""
		components = (Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU)

		# The first component seeds the host-side total; the rest are added to it.
		temp = gpuarray.sum(components[0]*components[0].conj()).get()
		for psi in components[1:]:
			temp += gpuarray.sum(psi*psi.conj()).get()

		return temp * self.dPx*self.dPy
コード例 #11
0
ファイル: sharpen.py プロジェクト: ealehman/image_video_edit
def gpu_sharpen(kernel, orig_image):
    """Apply ``kernel`` repeatedly until the image variance grows by 20 %.

    The image is uploaded twice (current and next buffer). After every kernel
    launch the two buffers are swapped and the mean and variance of the
    current buffer are recomputed on the GPU. The loop ends once the variance
    reaches 1.2 times the initial variance.

    :param kernel: CUDA function called as
        ``kernel(d_curr, EPSILON, d_next, height, width, block=..., grid=...)``.
    :param orig_image: 2-D array-like (its shape is unpacked into
        ``height, width``).
    :return: the final image copied back to the host.
    """
    # Two independent float64 host copies: input and output of the first pass.
    curr_im = np.array(orig_image, dtype=np.float64)
    next_im = np.array(orig_image, dtype=np.float64)

    # Image dimensions as int32, the type handed to the kernel.
    height, width = np.int32(orig_image.shape)
    N = height * width
    print("Processing %d x %d image" % (width, height))

    # Allocate device memory and copy host to device.
    start_transfer = time.time()
    d_curr = gpu.to_gpu(curr_im)
    d_next = gpu.to_gpu(next_im)
    stop_transfer = time.time()
    host_to_device = stop_transfer - start_transfer
    print("host to device tranfer time: " + str(host_to_device))

    # Block size (threads per block).
    b_size = (32, 32, 1)
    # Grid size (blocks per grid), rounded up so the whole image is covered.
    g_size = (int(np.ceil(float(width)/float(b_size[0]))),
              int(np.ceil(float(height)/float(b_size[1]))))

    # GPU events used to time everything from here to the end of the loop.
    start_gpu_time = cu.Event()
    end_gpu_time = cu.Event()

    start_gpu_time.record()

    # Initial mean and variance of the image.
    init_mean = np.float64(gpu.sum(d_curr).get())/N

    var = ReductionKernel(dtype_out=np.float64, neutral= "0", reduce_expr= "a+b", map_expr="(x[i]-mu)*(x[i]-mu)/size", arguments="double* x, double mu, double size")
    init_variance = var(d_curr, np.float64(init_mean), np.float64(N)).get()

    variance = 0
    # Keep sharpening while the variance is below 120 % of its initial value.
    while variance < 1.2 * init_variance:

        kernel(d_curr, EPSILON, d_next, height, width, block=b_size, grid=g_size)

        # The buffer just written becomes the current image.
        d_curr, d_next = d_next, d_curr

        mean = np.float64(gpu.sum(d_curr).get())/N

        variance = var(d_curr, np.float64(mean), np.float64(N)).get()

        print("Mean = %f,  Variance = %f" % (mean, variance))
    end_gpu_time.record()
    end_gpu_time.synchronize()
    # time_till() is scaled by 1e-3 before being reported.
    gpu_time = start_gpu_time.time_till(end_gpu_time)*1*1e-3

    print("GPU Time: %f" % gpu_time)

    return d_curr.get()
コード例 #12
0
ファイル: bc.py プロジェクト: jaisw7/dgfs1D_gpu
 def updateBC(ul, t):
     """Run the boundary kernel and refresh ``self._wall_nden``.

     Uses ``self``, ``updateBCFunc``, ``grid_Nv`` and ``block`` from the
     enclosing scope. The stored value is the negated ratio of the summed
     numerator and denominator buffers after the launch.
     """
     velocity_mesh = self._vm
     kernel_args = (
         ul.ptr,
         velocity_mesh.d_cvx().ptr,
         velocity_mesh.d_cvy().ptr,
         velocity_mesh.d_cvz().ptr,
         self._bc_vals_num.ptr,
         self._bc_vals_den.ptr,
         t,
     )
     updateBCFunc.prepared_call(grid_Nv, block, *kernel_args)

     numerator_total = gpuarray.sum(self._bc_vals_num)
     denominator_total = gpuarray.sum(self._bc_vals_den)
     self._wall_nden = -(numerator_total / denominator_total)
コード例 #13
0
ファイル: GPU_Dirac2D.py プロジェクト: sborah53/PyWignerCUDA
	def Norm_P_GPU( self, Psi1, Psi2, Psi3, Psi4):
		"""Return ``sqrt(dPx * dPy * sum_k sum(|Psi_k|**2))`` over the four components."""
		components = (Psi1, Psi2, Psi3, Psi4)

		# The first component seeds the host-side total; the others are added to it.
		norm = gpuarray.sum(abs(components[0])**2).get()
		for psi in components[1:]:
			norm += gpuarray.sum(abs(psi)**2).get()

		return np.sqrt(norm*self.dPx * self.dPy)
コード例 #14
0
ファイル: __init__.py プロジェクト: darien/fatsheet
def gCOVAR(data1, data2):
    """Return ``kn.kCOVAR`` of the two inputs divided by ``len(data1)``, as float64.

    Both inputs are uploaded as float32. Both means are divided by
    ``len(data1)``, so the inputs are presumably expected to have equal
    length -- TODO confirm.
    """
    dev_first = gpuarray.to_gpu(data1.astype(np.float32))
    dev_second = gpuarray.to_gpu(data2.astype(np.float32))
    count = len(data1)

    mean_first = gpuarray.sum(dev_first)/count
    mean_second = gpuarray.sum(dev_second)/count

    kernel_result = kn.kCOVAR(dev_first, dev_second, mean_first, mean_second).get()
    return np.float64(kernel_result/count)
コード例 #15
0
	def Norm_P_GPU( self, Psi1, Psi2, Psi3, Psi4):
		"""Return the square root of ``dPx * dPy`` times the summed ``|Psi|**2`` of all four components."""
		first, rest = Psi1, (Psi2, Psi3, Psi4)

		# Host-side running total, seeded by the first component.
		norm = gpuarray.sum(abs(first)**2).get()
		for psi in rest:
			norm += gpuarray.sum(abs(psi)**2).get()

		scaled = norm*self.dPx * self.dPy
		return np.sqrt(scaled)
コード例 #16
0
def softmax(x, deriv=False):
    """Normalise ``exp(x)`` by its total, or return ``x * (1 - x)``.

    :param x: GPU array for the forward path. With ``deriv=True`` any object
        supporting ``*`` and ``-`` works, and ``x`` is used as given
        (presumably an already-activated output -- TODO confirm).
    :param deriv: when true, return ``x * (1.0 - x)`` instead.
    :return: ``exp(x) / sum(exp(x))`` on the forward path.
    """
    if deriv:
        return x * (1.0 - x)

    # Evaluate the exponential once and reuse it for the sum and the result;
    # previously the same GPU expression was computed twice.
    exp_x = cm.exp(x)

    # NOTE(review): the max-subtraction that would guard against overflow is
    # disabled in the original code and stays disabled here.
    np_t = np.array([[0.0]])
    gpu.sum(exp_x).get(np_t)

    return exp_x / np_t.ravel()[0]
コード例 #17
0
ファイル: __init__.py プロジェクト: darien/fatsheet
def gCORREL(data1, data2):
    """Return ``kCOVAR / sqrt(kSTDEV(a) * kSTDEV(b))`` of the two inputs as float64.

    Both inputs are uploaded as float32. Both means are divided by
    ``len(data1)``, so the inputs are presumably expected to have equal
    length -- TODO confirm.
    """
    dev_first = gpuarray.to_gpu(data1.astype(np.float32))
    dev_second = gpuarray.to_gpu(data2.astype(np.float32))
    count = len(data1)

    mean_first = gpuarray.sum(dev_first)/count
    mean_second = gpuarray.sum(dev_second)/count

    # Kernel results are fetched in the order covariance, spread 1, spread 2.
    covariance_term = kn.kCOVAR(dev_first, dev_second, mean_first, mean_second).get()
    spread_first = kn.kSTDEV(dev_first, mean_first).get()
    spread_second = kn.kSTDEV(dev_second, mean_second).get()

    return np.float64(covariance_term / (spread_first * spread_second)**.5)
コード例 #18
0
ファイル: GPU_Dirac2D.py プロジェクト: sborah53/PyWignerCUDA
	def Norm_GPU( self, Psi1, Psi2, Psi3, Psi4):
		"""Return ``sqrt(dX * dY * sum_k sum(|Psi_k|**2))`` over the four components."""
		components = (Psi1, Psi2, Psi3, Psi4)

		# The first component seeds the host-side total; the others are added to it.
		norm = gpuarray.sum(abs(components[0])**2).get()
		for psi in components[1:]:
			norm += gpuarray.sum(abs(psi)**2).get()

		return np.sqrt(norm*self.dX * self.dY)
コード例 #19
0
def gen_summary_stats(data):
    """Split ``data`` around the global ``mask`` and compute per-side statistics.

    NOTE(review): this function looks unfinished -- every value it computes
    is a local and nothing is returned or stored.
    """
    # Elements strictly below / strictly above the module-level ``mask``.
    lb = data[data < mask]
    ub = data[data > mask]

    # NOTE(review): this divides the two selections element-wise; a
    # probability would presumably use the counts instead -- confirm intent.
    prob_ub = ub / lb
    n_ub = ub.size
    n_lb = lb.size
    mean_ub = gpuarray.sum(ub) / n_ub
    mean_lb = gpuarray.sum(lb) / n_lb
    # NOTE(review): these are per-element squared deviations divided by the
    # count; they are not summed into a single variance here.
    var_ub = (ub - mean_ub)**2 / n_ub
    var_lb = (lb - mean_lb)**2 / n_lb
コード例 #20
0
def gen_summary_stats(data):
    """Compute statistics of the parts of ``data`` below and above the global ``mask``.

    NOTE(review): nothing is returned or stored; all results are discarded
    when the function exits, so this appears to be work in progress.
    """
    # Selections strictly below / strictly above the module-level ``mask``.
    lb = data[data < mask]
    ub = data[data > mask]

    # NOTE(review): element-wise division of the two selections; verify that
    # a ratio of counts was not intended.
    prob_ub = ub / lb
    n_ub = ub.size
    n_lb = lb.size
    mean_ub = gpuarray.sum(ub) / n_ub
    mean_lb = gpuarray.sum(lb) / n_lb
    # NOTE(review): squared deviations scaled by the count, left unsummed.
    var_ub = (ub - mean_ub)**2 / n_ub
    var_lb = (lb - mean_lb)**2 / n_lb
コード例 #21
0
ファイル: GPU_Dirac3D.py プロジェクト: cabrer7/PyWignerCUDA
	def Norm_GPU( self, Psi1, Psi2, Psi3, Psi4):
		"""Return ``sqrt(dX * dY * dZ * sum_k sum(|Psi_k|**2))`` over the four components."""
		components = (Psi1, Psi2, Psi3, Psi4)

		# The first component seeds the host-side total; the others are added to it.
		norm = gpuarray.sum(abs(components[0])**2).get()
		for psi in components[1:]:
			norm += gpuarray.sum(abs(psi)**2).get()

		return np.sqrt(norm*self.dX * self.dY * self.dZ)
 def ERA_probe(self, iters=1):
     """Run ``iters`` iterations that update the exit waves and log two errors.

     Each iteration applies ``self.Pmod`` to the exit waves, passes the host
     copy of the result to ``self.Psup_probe``, rebuilds the exit waves with
     ``makeExits2`` and uploads them into ``self.exits_gpu``. Before and
     after the upload, the summed squared difference between
     ``self.exits_gpu`` and the ``Pmod`` output, divided by ``self.diffNorm``,
     is appended to ``self.error_mod`` and ``self.error_sup``.

     :param iters: number of iterations to run.
     :return: None.
     """
     # No scratch buffer is pre-allocated here: ``Pmod`` returns a new array
     # every iteration, so the former ``empty_like`` buffer was never used.
     #
     # Parenthesised single-argument print works on Python 2 and Python 3.
     print('i, eMod, eSup')
     for i in range(iters):
         exits2_gpu = self.Pmod(self.exits_gpu)

         # Error before the exit waves are replaced.
         self.error_mod.append(gpuarray.sum(abs(self.exits_gpu - exits2_gpu)**2).get()/self.diffNorm)

         exits = exits2_gpu.get()
         self.Psup_probe(exits)

         # Rebuild the exit waves on the host and upload them in place.
         self.thr.to_device(makeExits2(self.sample, self.probe, self.coords, exits), dest=self.exits_gpu)

         # Error after the exit waves were replaced.
         self.error_sup.append(gpuarray.sum(abs(self.exits_gpu - exits2_gpu)**2).get()/self.diffNorm)

         update_progress(i / max(1.0, float(iters-1)), 'ERA probe', i, self.error_mod[-1], self.error_sup[-1])
コード例 #23
0
    def bloch_single_step_propagation(self, dbeta):
        """
        Advance the Wigner function by one inverse-temperature step via the Bloch equation.
        The step applies the V factors, then the K factors, then the V factors
        again, and finally rescales ``self.wignerfunction`` in place by
        ``sum(wignerfunction) * dXdP``.
        :param dbeta: (float) the inverse temperature step size
        :return: self.wignerfunction
        """
        def apply_potential_factors():
            # V factors are applied between the p -> theta transform and its inverse.
            self.p2theta_transform()
            self.bloch_expV_bulk(self.wigner_theta_x, dbeta, **self.V_bulk_mapper_params)
            self.bloch_expV_boundary(self.wigner_theta_x, dbeta, **self.V_boundary_mapper_params)
            self.theta2p_transform()

        apply_potential_factors()

        # K factors are applied between the x -> lambda transform and its inverse.
        self.x2lambda_transform()
        self.bloch_expK_bulk(self.wigner_p_lambda, dbeta, **self.K_bulk_mapper_params)
        self.bloch_expK_boundary(self.wigner_p_lambda, dbeta, **self.K_boundary_mapper_params)
        self.lambda2x_transform()

        apply_potential_factors()

        # normalize
        normalization = gpuarray.sum(self.wignerfunction).get() * self.dXdP
        self.wignerfunction /= normalization

        return self.wignerfunction
コード例 #24
0
    def get_wigner_time(self, wigner_current, wigner_init, t):
        """
        Calculate the integral:

            int_{H(x, p, t) > -Ip} [wigner_current(x,p) - wigner_init(x,p)] dxdp

        The CUDA kernel and its scratch buffer are created on the first call
        and cached on the instance.

        :param wigner_current: gpuarray containing current Wigner function
        :param wigner_init: gpuarray containing initial Wigner function
        :param t: current time
        :return: float
        """
        if not hasattr(self, "_wigner_time_mapper"):
            # First use: allocate the scratch buffer and compile the kernel.
            self._tmp_wigner_time = gpuarray.empty(self.rho.shape, np.float64)

            kernel_source = self.wigner_time_mapper_cuda_code.format(
                cuda_consts=self.cuda_consts, K=self.K, V=self.V
            )
            self._wigner_time_mapper = SourceModule(kernel_source).get_function("Kernel")

        wigner_time_mapper = self._wigner_time_mapper
        wigner_time_mapper(self._tmp_wigner_time, wigner_current, wigner_init, t, **self.rho_mapper_params)

        # Sum the scratch buffer and scale by the phase-space cell size.
        total = gpuarray.sum(self._tmp_wigner_time).get()
        return total * self.wigner_dxdp
コード例 #25
0
def stepFunction():
    """Prepare the plot buffer, run one batch of iterations and record history.

    Uses module-level state only. When ``showActivity`` is set, the activity
    kernels run first. The output concentration is then copied into the plot
    buffer and scaled by ``100 / max``, after which the iteration kernels run.
    Every 25th call while ``plotting`` is set, the maximum and the summed
    concentration are appended to the history lists and plotted.

    :return: None.
    """
    global animIter
    if showActivity:
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10),
                           concentrationIn_d,
                           activeBlocks_d,
                           grid=grid2D,
                           block=block2D)
        getActivityKernel(activeBlocks_d,
                          activeThreads_d,
                          grid=grid2D,
                          block=block2D)
    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100. / maxVal, plotData_d, np.uint8(showActivity),
                  activeThreads_d)
    # Plain loops: the calls are made for their side effects only, so no
    # throw-away result list is built.
    if cudaP == "float":
        for _ in range(nIterationsPerPlot):
            oneIteration_tex()
    else:
        # This variant is launched half as many times per plot.
        for _ in range(nIterationsPerPlot // 2):
            oneIteration_sh()
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
コード例 #26
0
    def get_wigner(self):
        """
        Build the Wigner function from ``self.wavefunction`` in ``self.wignerfunction``.
        The result is rescaled in place by the real part of its sum times
        ``self.wigner_dXdP`` before being returned.
        :return: self.wignerfunction
        """
        def shear_x():
            # Phase multiplication between a forward and an inverse FFT (ax1 plan).
            cufft.fft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax1)
            self.phase_shearX(self.wignerfunction, **self.wigner_mapper_params)
            cufft.ifft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax1)

        def flip_sign():
            self.sign_flip(self.wignerfunction, **self.wigner_mapper_params)

        # Create the density matrix out of the wavefunction
        self.psi2rho(self.wavefunction, self.wignerfunction, **self.wigner_mapper_params)

        # Step 1: shear X, shear Y, shear X (a rotation by +45 degrees
        # according to the original author's note)
        shear_x()

        cufft.fft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax0)
        self.phase_shearY(self.wignerfunction, **self.wigner_mapper_params)
        cufft.ifft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax0)

        shear_x()

        # Step 2: inverse FFT with the ax0 plan, wrapped in two sign flips
        flip_sign()
        cufft.ifft_Z2Z(self.wignerfunction, self.wignerfunction, self.plan_Z2Z_ax0)
        flip_sign()

        # normalize
        normalization = gpuarray.sum(self.wignerfunction).get().real * self.wigner_dXdP
        self.wignerfunction /= normalization

        return self.wignerfunction
コード例 #27
0
    def execute(self, solver, stream=None):
        """Launch the kernel and store the reduced chi-squared value on the solver.

        :param solver: object providing the GPU arrays passed to the kernel
            plus ``sigma_sqrd`` and ``set_X2``.
        :param stream: not used by this method.
        :return: None. The result is handed to ``solver.set_X2``.
        """
        slvr = solver

        # The gaussian shape array can be empty if
        # no gaussian sources were specified.
        # A null pointer is passed instead of an empty array.
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        gauss = np.intp(0) if np.prod(slvr.gauss_shape.shape) == 0 \
            else slvr.gauss_shape

        sersic = np.intp(0) if np.prod(slvr.sersic_shape.shape) == 0 \
            else slvr.sersic_shape

        self.kernel(slvr.uvw, slvr.brightness, gauss, sersic,
            slvr.wavelength, slvr.antenna1, slvr.antenna2,
            slvr.jones_scalar,
            slvr.flag, slvr.weight_vector,
            slvr.model_vis, slvr.observed_vis, slvr.chi_sqrd_result,
            **self.get_kernel_params(slvr))

        # Call the pycuda reduction kernel.
        # Divide by the single sigma squared value if a weight vector
        # is not required. Otherwise the kernel will incorporate the
        # individual sigma squared values into the sum
        gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get()

        if not self.weight_vector:
            slvr.set_X2(gpu_sum/slvr.sigma_sqrd)
        else:
            slvr.set_X2(gpu_sum)
コード例 #28
0
def sum_cuda(a, axis=None):
    """Sum of GPUArray elements in a given axis direction or all elements.

    Parameters
    ----------
    a : gpuarray
        GPUArray with elements to be operated on. Must be two-dimensional
        when `axis` is given.
    axis : int
        Axis direction to sum through (0 or 1), all if None.

    Returns
    -------
    gpuarray
        GPUArray sum. For ``axis=0`` the shape is ``(1, n)``, for ``axis=1``
        it is ``(m, 1)``; both are allocated as float32.

    Raises
    ------
    ValueError
        If `axis` is neither None, 0 nor 1.

    Notes
    -----
    - This is temporary and not an efficient implementation.
    """
    if axis is None:
        return cuda_array.sum(a)

    # Any other axis used to fall through both branches and fail with an
    # UnboundLocalError on the result; reject it explicitly instead.
    if axis not in (0, 1):
        raise ValueError('axis must be None, 0 or 1, got %r' % (axis,))

    m, n = a.shape
    if axis == 0:
        # One block per column, one thread per row.
        func = mod.get_function('sum0_cuda')
        b = pycuda.gpuarray.empty((1, n), dtype=float32)
        func(a, b, int32(m), int32(n), block=(1, m, 1), grid=(n, 1, 1))
    else:
        # One block per row, one thread per column.
        func = mod.get_function('sum1_cuda')
        b = pycuda.gpuarray.empty((m, 1), dtype=float32)
        func(a, b, int32(m), int32(n), block=(n, 1, 1), grid=(1, m, 1))
    return b
コード例 #29
0
def mean_variance(red_kernel, d_data, size):
    '''Return ``(mean, variance)`` of a GPU array.

    The mean is the float32 sum divided by ``size``. The variance is the
    result of ``red_kernel(d_data, mean)`` divided by ``size``.
    '''
    total = gpu.sum(d_data, dtype=np.float32).get()
    count = np.float32(size)
    mean = total / count

    reduced = red_kernel(d_data, mean).get()
    variance = reduced / count
    return mean, variance
コード例 #30
0
ファイル: RimeGaussBSum.py プロジェクト: andferrari/montblanc
    def execute(self, solver, stream=None):
        """Run the kernel and store the reduced chi-squared value on the solver.

        :param solver: object providing the GPU arrays passed to the kernel
            plus ``sigma_sqrd`` and ``set_X2``.
        :param stream: not used by this method.
        :return: None. The result is handed to ``solver.set_X2``.
        """
        slvr = solver

        # The gaussian shape array can be empty if
        # no gaussian sources were specified.
        # A null pointer is passed instead of an empty array.
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        gauss = np.intp(0) if np.prod(slvr.gauss_shape.shape) == 0 \
            else slvr.gauss_shape

        sersic = np.intp(0) if np.prod(slvr.sersic_shape.shape) == 0 \
            else slvr.sersic_shape

        self.kernel(slvr.uvw, slvr.brightness, gauss, sersic, slvr.wavelength,
                    slvr.antenna1, slvr.antenna2, slvr.jones_scalar, slvr.flag,
                    slvr.weight_vector, slvr.model_vis, slvr.observed_vis,
                    slvr.chi_sqrd_result, **self.get_kernel_params(slvr))

        # Call the pycuda reduction kernel.
        # Divide by the single sigma squared value if a weight vector
        # is not required. Otherwise the kernel will incorporate the
        # individual sigma squared values into the sum
        gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get()

        if not self.weight_vector:
            slvr.set_X2(gpu_sum / slvr.sigma_sqrd)
        else:
            slvr.set_X2(gpu_sum)
コード例 #31
0
ファイル: misc.py プロジェクト: oursland/scikits.cuda
def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False):
    global _global_cublas_allocator

    if axis is None:
        if calc_mean == False:
            return gpuarray.sum(x_gpu).get()
        else:
            return gpuarray.sum(x_gpu).get() / x_gpu.dtype.type(x_gpu.size)

    if axis < 0:
        axis += 2
    if axis > 1:
        raise ValueError('invalid axis')

    if x_gpu.flags.c_contiguous:
        n, m = x_gpu.shape[1], x_gpu.shape[0]
        lda = x_gpu.shape[1]
        trans = "n" if axis == 0 else "t"
        sum_axis, out_axis = (m, n) if axis == 0 else (n, m)
    else:
        n, m = x_gpu.shape[0], x_gpu.shape[1]
        lda = x_gpu.shape[0]
        trans = "t" if axis == 0 else "n"
        sum_axis, out_axis = (n, m) if axis == 0 else (m, n)

    alpha = (1.0 / sum_axis) if calc_mean else 1.0
    if (x_gpu.dtype == np.complex64):
        gemv = cublas.cublasCgemv
    elif (x_gpu.dtype == np.float32):
        gemv = cublas.cublasSgemv
    elif (x_gpu.dtype == np.complex128):
        gemv = cublas.cublasZgemv
    elif (x_gpu.dtype == np.float64):
        gemv = cublas.cublasDgemv

    alloc = _global_cublas_allocator
    ons = ones((sum_axis, ), x_gpu.dtype, alloc)
    if out is None:
        out = gpuarray.empty((out_axis, ), x_gpu.dtype, alloc)
    else:
        assert out.dtype == x_gpu.dtype
        assert out.size >= out_axis

    gemv(_global_cublas_handle, trans, n, m,
         alpha, x_gpu.gpudata, lda,
         ons.gpudata, 1, 0.0, out.gpudata, 1)
    return out
コード例 #32
0
ファイル: integrator.py プロジェクト: jaisw7/dgfs1D_gpu
    def integrate(self, t, dt, nacptsteps, d_ucoeff):
        """Advance ``d_ucoeff`` by one step of the scheme configured on the instance.

        The sequence is: moments of the current distribution, the explicit
        term and its moments, the moment update, construction of the
        equilibrium distribution from the updated moments, and finally the
        in-place update of ``d_ucoeff``.

        :param t: current time, forwarded to every operator.
        :param dt: time-step size.
        :param nacptsteps: not used by this method; kept for the call signature.
        :param d_ucoeff: distribution coefficients, read and overwritten.
        :return: None.
        """
        sm, explicit = self._sm, self._explicit
        moment, updateMoment = sm.moment, sm.updateMomentBDF
        updateDist, consMaxwellian = sm.updateDistBDF, sm.constructMaxwellian

        # Scratch storage and scheme coefficients held on the instance.
        L0, M = self.scratch
        U0, LU0, U = self.scratch_moms
        a1, a2, g1, b = [*self.A, *self.G, self.B]

        # Compute the moment of the initial distribution
        moment(t, d_ucoeff, U0)

        # Compute the explicit part; L0 = -∇·f(d_ucoeff);
        explicit(t, d_ucoeff, L0)

        # Compute the moment of the explicit part
        moment(t, L0, LU0)

        # update the moments
        updateMoment(dt, a1, U0, -g1, LU0, a2, U, b)

        # implictly construct the Maxwellian (or Gaussian, etc.) given moments
        consMaxwellian(t, U, M)

        # update the distribution
        updateDist(dt, a1, d_ucoeff, -g1, L0, b, M, a2, U, d_ucoeff)
コード例 #33
0
ファイル: neural_net.py プロジェクト: liyangdal/hebel
 def calibrate_learning_rate(self, data_provider):
     """Derive per-gradient learning-rate multipliers and store them on ``self``.

     For every batch, each gradient contributes ``size / sum(|grad|)``. The
     values are averaged over the batches, divided by their maximum and
     stored in ``self.lr_multiplier`` as a list.
     """
     per_batch = []
     for data, targets in data_provider:
         _, gradients = self.training_pass(data, targets)
         batch_scales = []
         for grad in gradients:
             inverse_mean_abs = grad.size / gpuarray.sum(abs(grad))
             batch_scales.append(float(inverse_mean_abs.get()))
         per_batch.append(batch_scales)

     # Average over batches, then scale so the largest multiplier is 1.
     lr_multiplier = np.array(per_batch).mean(0)
     lr_multiplier /= lr_multiplier.max()
     self.lr_multiplier = lr_multiplier.tolist()
コード例 #34
0
ファイル: __init__.py プロジェクト: darien/fatsheet
def gSTDEV(data1):
	"""Compute a standard deviation of ``data1`` with GPU helpers.

	``data1`` is converted to float32 and uploaded; its mean is formed on
	the device and handed to ``kn.kSTDEV``.  The value read back is
	divided by ``len(data1) - 1`` and square-rooted, and the result is
	returned as ``np.float64``.
	"""
	count = len(data1)
	device_data = gpuarray.to_gpu(data1.astype(np.float32))
	device_mean = gpuarray.sum(device_data) / count

	# NOTE(review): kn.kSTDEV is defined elsewhere; presumably it returns
	# the sum of squared deviations from the mean -- confirm.
	reduced = kn.kSTDEV(device_data, device_mean).get()

	return np.float64((reduced / (count - 1)) ** .5)
コード例 #35
0
def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False):
    global _global_cublas_allocator

    if axis is None:
        if calc_mean == False:
            return gpuarray.sum(x_gpu).get()
        else:
            return gpuarray.sum(x_gpu).get() / x_gpu.dtype.type(x_gpu.size)

    if axis < 0:
        axis += 2
    if axis > 1:
        raise ValueError('invalid axis')

    if x_gpu.flags.c_contiguous:
        n, m = x_gpu.shape[1], x_gpu.shape[0]
        lda = x_gpu.shape[1]
        trans = "n" if axis == 0 else "t"
        sum_axis, out_axis = (m, n) if axis == 0 else (n, m)
    else:
        n, m = x_gpu.shape[0], x_gpu.shape[1]
        lda = x_gpu.shape[0]
        trans = "t" if axis == 0 else "n"
        sum_axis, out_axis = (n, m) if axis == 0 else (m, n)

    alpha = (1.0 / sum_axis) if calc_mean else 1.0
    if (x_gpu.dtype == np.complex64):
        gemv = cublas.cublasCgemv
    elif (x_gpu.dtype == np.float32):
        gemv = cublas.cublasSgemv
    elif (x_gpu.dtype == np.complex128):
        gemv = cublas.cublasZgemv
    elif (x_gpu.dtype == np.float64):
        gemv = cublas.cublasDgemv

    alloc = _global_cublas_allocator
    ons = ones((sum_axis, ), x_gpu.dtype, alloc)
    if out is None:
        out = gpuarray.empty((out_axis, ), x_gpu.dtype, alloc)
    else:
        assert out.dtype == x_gpu.dtype
        assert out.size >= out_axis

    gemv(_global_cublas_handle, trans, n, m, alpha, x_gpu.gpudata, lda,
         ons.gpudata, 1, 0.0, out.gpudata, 1)
    return out
コード例 #36
0
ファイル: test_gpuarray.py プロジェクト: rutsky/pycuda
    def test_sum_allocator(self):
        """gpuarray.sum must return the right value and honour ``allocator``."""
        import pycuda.tools

        mem_pool = pycuda.tools.DeviceMemoryPool()
        length = np.random.randint(low=512,high=1024)

        values = gpuarray.arange(length,dtype=np.int32)
        default_sum = gpuarray.sum(values)
        pooled_sum = gpuarray.sum(values, allocator=mem_pool.allocate)

        # 0 + 1 + ... + (length - 1)
        expected = length*(length-1)//2
        assert default_sum.get() == expected
        assert pooled_sum.get() == expected

        # Each result must carry the allocator it was created with
        assert default_sum.allocator == values.allocator
        assert pooled_sum.allocator == mem_pool.allocate
コード例 #37
0
ファイル: test_gpuarray.py プロジェクト: anair13/mujoco-torch
    def test_sum_allocator(self):
        """Check gpuarray.sum results and allocator propagation."""
        import pycuda.tools

        pool = pycuda.tools.DeviceMemoryPool()
        count = np.random.randint(low=512, high=1024)

        source = gpuarray.arange(count, dtype=np.int32)
        plain = gpuarray.sum(source)
        pooled = gpuarray.sum(source, allocator=pool.allocate)

        # Sum of 0 .. count-1 in closed form
        triangular = count * (count - 1) // 2
        assert plain.get() == triangular
        assert pooled.get() == triangular

        # The default result inherits the input's allocator; the other one
        # must use the pool that was passed in
        assert plain.allocator == source.allocator
        assert pooled.allocator == pool.allocate
コード例 #38
0
 def calibrate_learning_rate(self, data_provider, mini_batches=None):
     """Derive per-parameter learning-rate multipliers from gradient sizes.

     For each batch a training pass is run and, per gradient, the ratio
     ``size / sum(|gradient|)`` is recorded.  The ratios are averaged over
     the batches seen, scaled so the largest equals 1, and stored as a list
     in ``self.lr_multiplier``.

     When ``mini_batches`` is not ``None``, iteration stops at the first
     batch whose zero-based index exceeds it, so batches
     ``0 .. mini_batches`` inclusive are used.
     """
     collected = []
     for batch_index, (data, targets) in enumerate(data_provider):
         if mini_batches is not None and batch_index > mini_batches:
             break
         _loss, gradients = self.training_pass(data, targets)
         batch_ratios = []
         for grad in gradients:
             scaled = grad.size / gpuarray.sum(abs(grad))
             batch_ratios.append(float(scaled.get()))
         collected.append(batch_ratios)

     averaged = np.array(collected).mean(0)
     averaged /= averaged.max()
     self.lr_multiplier = averaged.tolist()
コード例 #39
0
    def initialise(self, solver, stream=None):
        """Warm up the sum reduction over ``solver.chi_sqrd_result``.

        The reduction is run once on ``stream`` using the solver's device
        memory pool, and the temporary result is released immediately.
        """
        # Launch the reduction once so that its kernel is cached for later use
        warmup = gpuarray.sum(
            solver.chi_sqrd_result,
            stream=stream,
            allocator=solver.dev_mem_pool.allocate)

        # Hand the temporary result's memory back to the pool
        warmup.gpudata.free()
コード例 #40
0
ファイル: RimeReduction.py プロジェクト: andferrari/montblanc
    def initialise(self, solver, stream=None):
        """Run the chi-squared sum reduction once and discard the result.

        ``solver.chi_sqrd_result`` is reduced on ``stream`` with memory
        taken from ``solver.dev_mem_pool``; the result buffer is freed
        right away.
        """
        pool_allocate = solver.dev_mem_pool.allocate

        # A first launch leaves the reduction kernel cached for later calls
        scratch = gpuarray.sum(solver.chi_sqrd_result,
                               stream=stream,
                               allocator=pool_allocate)

        # Give the scratch buffer back to the memory pool
        scratch.gpudata.free()
コード例 #41
0
ファイル: test_gpuarray.py プロジェクト: anair13/mujoco-torch
    def test_sum(self):
        """gpuarray.sum must agree with np.sum to a relative error of 1e-4."""
        from pycuda.curandom import rand as curand

        device_values = curand((200000, ))
        host_values = device_values.get()

        expected = np.sum(host_values)
        actual = gpuarray.sum(device_values).get()

        relative_error = abs(actual - expected) / abs(expected)
        assert relative_error < 1e-4
コード例 #42
0
ファイル: test_gpuarray.py プロジェクト: rutsky/pycuda
    def test_sum(self):
        """Compare the GPU sum of 200000 random values with the NumPy sum."""
        from pycuda.curandom import rand as curand

        samples_gpu = curand((200000,))
        samples = samples_gpu.get()

        reference = np.sum(samples)
        gpu_result = gpuarray.sum(samples_gpu).get()

        # Relative tolerance rather than exact equality
        deviation = abs(gpu_result-reference)/abs(reference)
        assert deviation < 1e-4
コード例 #43
0
ファイル: likelihoods.py プロジェクト: MicHan87/hddm
def wiener_like_gpu(value, v, V, a, z, t, out, err=1e-4):
    """Evaluate the DDM density on the GPU and return the summed result.

    ``wfpt_gpu.pdf_gpu`` is called with ``value``, the parameters converted
    to ``float``, the tolerance ``err`` and the buffer ``out``.  The
    contents of ``out`` are then summed on the device and returned as a
    plain Python scalar.

    Returns ``-np.inf`` without touching the GPU when ``z`` is outside
    ``[0, 1]``, ``t`` is negative, or ``a`` or ``V`` is not strictly
    positive.
    """
    # Check if parameters are in allowed range
    if z<0 or z>1 or t<0 or a <= 0 or V<=0:
        return -np.inf

    wfpt_gpu.pdf_gpu(value, float(v), float(V), float(a), float(z), float(t), err, out)
    # NOTE(review): ``out`` is summed as-is; an earlier variant applied
    # cumath.log first -- confirm pdf_gpu already writes log-densities.
    logp = gpuarray.sum(out).get()

    # ndarray.item() is the replacement for np.asscalar, which has been
    # removed from NumPy.
    return logp.item()
コード例 #44
0
def create_quantiles(data, params):
    """Sort ``data`` on the GPU and fill the lower/upper quantile buffers.

    Elements below and above zero are counted by comparing against a zero
    array of matching shape, and the two fill kernels are launched with
    one thread per entry of the module-level ``quantiles``.  ``params`` is
    not used.

    Returns ``(data, q_lb, q_ub, probs * (1 - p_ub), probs * p_ub)`` with
    the quantile buffers copied to the host, where ``p_ub`` is the share
    of counted elements that are above zero.  The module-level ``mask``
    and ``q_lb`` are rebound as a side effect.
    """
    global quantiles, q_lb, q_ub, mask
    sort_gpu(data)

    # (Re)allocate the zero reference array whenever the input shape changes
    if mask.shape != data.shape:
        mask = gpuarray.zeros_like(data)

    count_below = gpuarray.sum(data < mask)
    count_above = gpuarray.sum(data > mask)

    launch_block = (quantiles.shape[0], 1, 1)
    fill_lb_quantiles(data, quantiles, count_below, count_above, q_lb,
                      block=launch_block)
    fill_ub_quantiles(data, quantiles, count_below, count_above, q_ub,
                      block=launch_block)
    q_lb = q_lb.reverse()

    p_ub = count_above / (count_above + count_below)

    # Drop the device-side counters before copying results to the host
    del count_below, count_above

    lower_quantiles = q_lb.get()
    upper_quantiles = q_ub.get()
    lower_probs = probs * (1 - p_ub.get())
    upper_probs = probs * p_ub.get()
    return data, lower_quantiles, upper_quantiles, lower_probs, upper_probs
コード例 #45
0
ファイル: test_gpuarray.py プロジェクト: minrk/PyCUDA
    def test_sum(self):
        """gpuarray.sum must agree with numpy.sum to a relative error of 1e-4.

        200000 random values are generated on the device, copied to the
        host for the reference sum, and reduced on the device for the value
        under test.
        """
        from pycuda.curandom import rand as curand
        a_gpu = curand((200000,))
        a = a_gpu.get()

        sum_a = numpy.sum(a)

        # gpuarray.sum is the public entry point under test; the unused
        # import of pycuda.reduction.get_sum_kernel was dropped.
        sum_a_gpu = gpuarray.sum(a_gpu).get()

        assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
コード例 #46
0
        def thunk():
            """Run the index and gradient kernels for one evaluation.

            Reads ``inputs[0]`` (predictions) and ``inputs[1]`` (truth) from
            the enclosing scope, (re)allocates ``outputs[0][0]`` to the
            shape of the predictions when needed, and launches ``index_fn``
            followed by ``grad_fn``.  Temporary GPU buffers are released at
            the end.
            """
            x, truth = inputs[0], inputs[1]
            # Reuse the input's GPU context when it exposes one
            context = None
            if hasattr(x[0], 'context'):
                context = x[0].context
            z = outputs[0]
            z_shape = x[0].shape
            # Allocate the output only when missing or of the wrong shape
            if z[0] is None or z[0].shape != z_shape:
                z[0] = pygpu.zeros(z_shape,
                                   dtype=theano.config.floatX,
                                   context=context)
            x_ptr, _ = get_tens_ptr(x[0])
            truth_ptr, _ = get_tens_ptr(truth[0])
            # NOTE(review): z_obj is never read below; presumably it is held
            # only to keep the object behind z_ptr alive -- confirm.
            z_ptr, z_obj = get_tens_ptr(z[0])

            # Scratch buffers with one entry per element of the first two
            # truth dimensions: an int32 index and a float32 value
            best_idx_ptr = gpuarray.GPUArray(shape=(np.prod(
                truth[0].shape[:2]), ),
                                             dtype=np.int32)
            best_iou_ptr = gpuarray.GPUArray(shape=(np.prod(
                truth[0].shape[:2]), ),
                                             dtype=np.float32)

            yolo_ptr, _ = get_yolo_info(n_classes, n_anchors, l_obj, l_noobj,
                                        anchors)

            # get best index: one single-thread block per entry along the
            # first dimension of x
            index_fn(best_idx_ptr,
                     best_iou_ptr,
                     x_ptr,
                     truth_ptr,
                     yolo_ptr,
                     block=(1, 1, 1),
                     grid=(x[0].shape[0], 1, 1))

            # n_total: first dimension * n_anchors * product of the last
            # two dimensions of x
            n_total = np.int32(x[0].shape[0] * n_anchors *
                               np.prod(x[0].shape[-2:]))
            # n_matched: number of index entries that are not -1
            n_matched = np.int32(gpuarray.sum(best_idx_ptr != -1).get())

            # Gradient kernel: n_anchors threads per block, grid spanning
            # dimensions 0, 2 and 3 of x
            grad_fn(z_ptr,
                    best_idx_ptr,
                    best_iou_ptr,
                    x_ptr,
                    truth_ptr,
                    yolo_ptr,
                    n_matched,
                    n_total,
                    block=(n_anchors, 1, 1),
                    grid=(x[0].shape[0], x[0].shape[2], x[0].shape[3]))

            # free all memory
            del best_idx_ptr
            del best_iou_ptr
            yolo_ptr.free()
 def Thibault_probe(self, iters=1):
     """Run ``iters`` probe-update iterations and record error metrics.

     Each iteration copies the current exit waves to the host, calls
     ``self.Psup_probe`` on them, uploads the result of ``makeExits2`` into
     a scratch device buffer, and combines it with ``self.Pmod`` to form
     the next exit waves.  One entry is appended per iteration to
     ``self.error_sup``, ``self.error_conv`` and ``self.error_mod`` (the
     last one is always ``None``), and progress is reported through
     ``update_progress``.
     """
     exits2_gpu = self.thr.empty_like(self.exits_gpu)
     # A parenthesised single string prints identically on Python 2 and 3;
     # the bare print statement is a syntax error on Python 3.
     print('i \t\t eConv \t\t eSup')
     for i in range(iters):
         exits = self.exits_gpu.get()
         self.Psup_probe(exits)

         # Upload the makeExits2 result into the scratch device buffer
         self.thr.to_device(makeExits2(self.sample, self.probe, self.coords, exits), dest=exits2_gpu)

         # Squared difference between current and rebuilt exits, normalised
         self.error_sup.append(gpuarray.sum(abs(self.exits_gpu - exits2_gpu)**2).get()/self.diffNorm)

         exits2_gpu = self.exits_gpu + self.Pmod(2*exits2_gpu - self.exits_gpu) - exits2_gpu

         # Squared change of the exits in this iteration, normalised
         self.error_conv.append(gpuarray.sum(abs(self.exits_gpu - exits2_gpu)**2).get()/self.diffNorm)

         # No value is computed for this list here; a placeholder keeps it
         # the same length as the other two
         self.error_mod.append(None)

         self.exits_gpu = exits2_gpu.copy()

         update_progress(i / max(1.0, float(iters-1)), 'Thibault probe', i, self.error_conv[-1], self.error_sup[-1])
コード例 #48
0
ファイル: test_gpuarray.py プロジェクト: thecobb/PyCUDA
    def test_sum(self):
        """Check gpuarray.sum against numpy.sum on 200000 random values.

        The two sums must agree to a relative error below 1e-4.
        """
        from pycuda.curandom import rand as curand
        a_gpu = curand((200000, ))
        a = a_gpu.get()

        sum_a = numpy.sum(a)

        # Only gpuarray.sum is exercised; the unused import of
        # pycuda.reduction.get_sum_kernel was removed.
        sum_a_gpu = gpuarray.sum(a_gpu).get()

        assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
コード例 #49
0
    def squared_loss(self, input_data, targets, average=True, cache=None, prediction=False):
        """Return the squared-error loss between ``targets`` and the activations.

        ``cache`` is used as the activations when it is not ``None``;
        otherwise ``self.feed_forward(input_data, prediction=prediction)``
        is called.  The squared differences are reduced with
        ``matrix_sum_out_axis(..., 1)`` and then summed on the device.  The
        result is returned as a Python ``float``.
        """
        if cache is None:
            activations = self.feed_forward(input_data, prediction=prediction)
        else:
            activations = cache

        squared_error = (targets - activations) ** 2
        loss = gpuarray.sum(matrix_sum_out_axis(squared_error, 1))

        # NOTE(review): ``loss`` is already a single reduced value here, so
        # ``.mean()`` would not divide by the number of samples -- confirm
        # that this is the intended meaning of ``average``.
        if average:
            loss = loss.mean()
        return float(loss.get())
コード例 #50
0
ファイル: GPU_Dirac2D.py プロジェクト: sborah53/PyWignerCUDA
	def Average_KEnergy( self, temp_GPU, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
		"""Accumulate the energy expectation from the four spinor components.

		Three contributions are added:
		the summed densities of components 1 and 2 minus those of 3 and 4,
		scaled by ``mass*c*c*dPx*dPy``; a cross-term sum weighted by
		``self.Px_GPU``; and a cross-term sum weighted by ``self.Py_GPU``
		and multiplied by ``-1j``.

		``temp_GPU`` is used as scratch space and is overwritten.
		"""
		def summed_density(psi):
			# Sum of psi * conj(psi), copied to the host
			return gpuarray.sum( psi*psi.conj() ).get()

		energy  = summed_density(Psi1_GPU)
		energy += summed_density(Psi2_GPU)
		energy -= summed_density(Psi3_GPU)
		energy -= summed_density(Psi4_GPU)

		energy *= self.mass*self.c*self.c*self.dPx*self.dPy

		# Px-weighted cross terms: every pairing enters with a plus sign
		temp_GPU *= 0.
		plus_pairs = (
			(Psi4_GPU, Psi1_GPU),
			(Psi1_GPU, Psi4_GPU),
			(Psi3_GPU, Psi2_GPU),
			(Psi2_GPU, Psi3_GPU),
		)
		for left, right in plus_pairs:
			temp_GPU += left * right.conj()
		temp_GPU *= self.Px_GPU

		energy += gpuarray.sum( temp_GPU ).get()*self.dPx*self.dPy*self.c

		# Py-weighted cross terms: the two middle pairings are subtracted
		temp_GPU *= 0.
		temp_GPU += Psi4_GPU * Psi1_GPU.conj()
		temp_GPU -= Psi1_GPU * Psi4_GPU.conj()
		temp_GPU -= Psi3_GPU * Psi2_GPU.conj()
		temp_GPU += Psi2_GPU * Psi3_GPU.conj()
		temp_GPU *= self.Py_GPU

		energy += gpuarray.sum( temp_GPU ).get()*self.dPx*self.dPy*self.c*(-1j)

		return energy
コード例 #51
0
	def Average_KEnergy( self, temp_GPU, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
		"""Return the energy built from the four ``Psi*_GPU`` components.

		The result is the sum of a mass term (densities of components 1 and
		2 minus those of 3 and 4, times ``mass*c*c*dPx*dPy``), a term
		weighted by ``self.Px_GPU`` and a term weighted by ``self.Py_GPU``
		that carries a factor ``-1j``.

		The contents of ``temp_GPU`` are destroyed; it serves as a work
		buffer.
		"""
		def norm_sum(component):
			# Host copy of sum(component * conj(component))
			return gpuarray.sum( component*component.conj() ).get()

		# Mass term
		energy  = norm_sum(Psi1_GPU)
		energy += norm_sum(Psi2_GPU)
		energy -= norm_sum(Psi3_GPU)
		energy -= norm_sum(Psi4_GPU)
		energy *= self.mass*self.c*self.c*self.dPx*self.dPy

		# Px term: all four cross products are added
		temp_GPU *= 0.
		for first, second in (
				(Psi4_GPU, Psi1_GPU),
				(Psi1_GPU, Psi4_GPU),
				(Psi3_GPU, Psi2_GPU),
				(Psi2_GPU, Psi3_GPU)):
			temp_GPU += first * second.conj()
		temp_GPU *= self.Px_GPU
		px_sum = gpuarray.sum( temp_GPU ).get()
		energy += px_sum*self.dPx*self.dPy*self.c

		# Py term: signs alternate as +, -, -, +
		temp_GPU *= 0.
		temp_GPU += Psi4_GPU * Psi1_GPU.conj()
		temp_GPU -= Psi1_GPU * Psi4_GPU.conj()
		temp_GPU -= Psi3_GPU * Psi2_GPU.conj()
		temp_GPU += Psi2_GPU * Psi3_GPU.conj()
		temp_GPU *= self.Py_GPU
		py_sum = gpuarray.sum( temp_GPU ).get()
		energy += py_sum*self.dPx*self.dPy*self.c*(-1j)

		return energy
コード例 #52
0
 def test_sliceset_macroparticles(self):
     '''Check that no macroparticles are lost or added by slicing.

     The z_cuts are placed outside of the bunch, so the per-slice counts
     must add up to the specified number of macroparticles.
     '''
     # The bunch spans [z_min, z_max]; the cuts sit one unit further out
     z_min, z_max = -2., 2.
     z_cuts = (z_min - 1, z_max + 1)

     bunch = self.create_bunch(zmin=z_min, zmax=z_max)
     mesh = self.create_mesh(z_cuts=z_cuts)
     slices = MeshSlicer(mesh, context).slice(bunch)

     counted = gpuarray.sum(slices.n_macroparticles_per_slice).get()
     self.assertEqual(self.macroparticlenumber, counted,
                      'the SliceSet lost/added some particles')
コード例 #53
0
def stepFunction():
  """Render the current concentration field and advance the simulation.

  The output concentration is copied into the plotting buffer, scaled by
  ``0.5 / max``, converted to unsigned chars and pushed to the screen
  array.  In float mode ``nIterationsPerPlot`` iterations are then run.
  On every 25th call, when plotting is enabled, the maximum and the sum
  of the input concentration are recorded and plotted.  The global
  ``animIter`` is incremented on every call.
  """
  global animIter

  # Refresh the display from the latest output field
  cuda.memcpy_dtod( plotDataFloat_d.ptr, concentrationOut_d.ptr, concentrationOut_d.nbytes )
  maxVal = (gpuarray.max(plotDataFloat_d)).get()
  multiplyByScalarReal( cudaPre(0.5/(maxVal)), plotDataFloat_d )
  floatToUchar( plotDataFloat_d, plotDataChars_d)
  copyToScreenArray()

  # Only the texture-based float path is active; a shared-memory variant
  # (oneIteration_sh) exists but is disabled in the original
  if cudaP == "float":
    for _ in range(nIterationsPerPlot):
      oneIteration_tex()

  record_now = plotting and animIter%25 == 0
  if record_now:
    maxVals.append( maxVal )
    sumConc.append( gpuarray.sum(concentrationIn_d).get() )
    plotData( maxVals, sumConc )

  animIter += 1
コード例 #54
0
ファイル: algo.py プロジェクト: uncbiag/pregis
def computeEnergy(D_v, T, _gamma, Alpha, Beta):
    """Compute the data term, the gradient term and the total energy.

    Parameters
    ----------
    D_v : GPU array combined with ``T.reshape(l*m*n, 1)``
    T : GPU array with a three-element shape ``(l, m, n)``
    _gamma : weight applied to the gradient term
    Alpha, Beta : factors whose product ``dot(Beta, Alpha)`` is subtracted
        from ``D_v``

    Returns
    -------
    ``(EL, ET, E)`` as host values, where ``E = 0.5 * EL + ET``.

    The helpers ``grad``, ``square_matrix``, ``sum_three_matrix`` and
    ``sqrt_matrix`` are defined elsewhere; going by their names the
    gradient term is presumably ``_gamma * sum(|grad T|)`` and the data
    term the sum of squared residuals -- confirm.
    """
    l, m, n = T.shape

    sum_alpha_beta = gpuarray.zeros_like(D_v)
    sk_linalg.dot(Beta, Alpha, out=sum_alpha_beta)

    # Gradient term
    GR = grad(T)
    square_matrix(GR, GR)
    G_norm = gpuarray.zeros_like(T)

    sum_three_matrix(GR[0, :, :, :], GR[1, :, :, :], GR[2, :, :, :], G_norm,
                     1.0, 1.0, 1.0)
    sqrt_matrix(G_norm, G_norm)

    ET = _gamma * gpuarray.sum(G_norm)

    # Data term
    sparse = D_v - T.reshape(l * m * n, 1) - sum_alpha_beta
    square_matrix(sparse, sparse)
    EL = gpuarray.sum(sparse)

    # Copy each scalar to the host once instead of twice
    el_host = EL.get()
    et_host = ET.get()

    # 0.5 rather than ``1 / 2``: under Python 2 integer division the latter
    # is 0 and would silently drop the data term.
    E = 0.5 * el_host + et_host

    return el_host, et_host, E
コード例 #55
0
    def fitness(self):
        """Score every particle on the GPU and update the best-so-far records.

        For each particle the magnets are generated from its
        ``'magnet_angles'``, the field at each sensor is collected
        (normalised to unit length when ``self.normalize_magnetic_field``
        is set), and ``self.distance`` is launched over the sensor values.
        The sum of the kernel output is the particle's score; a score above
        ``'personal_best_score'`` replaces the particle's personal best.
        A picture of the magnet system is saved for every particle and the
        status bar is advanced by one.

        Afterwards ``self.global_best_score`` and
        ``self.global_best_particle`` are updated from the personal bests.
        """
        # The launch configuration depends only on the sensor count, so it
        # is built once instead of once per particle.
        number_of_samples = np.int32(self.number_of_sensors)
        bdim = (16, 16, 1)
        dx, mx = divmod(number_of_samples, bdim[0])
        dy, my = divmod(number_of_samples, bdim[1])
        # Round up so a partially filled block is still launched
        gdim = ( int((dx + (mx>0))), int((dy + (my>0))))

        # calculate new particle scores
        for p in self.particles:
            sensor_values = np.zeros((self.number_of_sensors,3),dtype=np.float32,order='C')
            magnets = self.ball_joint.gen_magnets_angle(p['magnet_angles'])
            # A distinct index name avoids clobbering any outer counter
            for sens, row in zip(self.sensors, range(0, self.number_of_sensors)):
                value = sens.getB(magnets)
                if self.normalize_magnetic_field:
                    sensor_values[row] = value/np.linalg.norm(value)
                else:
                    sensor_values[row] = value

            p1_gpu = gpuarray.to_gpu(sensor_values)
            out_gpu = gpuarray.empty(self.number_of_sensors**2, np.float32)
            self.distance(number_of_samples, p1_gpu, out_gpu, block=bdim, grid=gdim)

            # Only the reduced value is needed on the host; the full output
            # matrix is no longer copied back.
            score = gpuarray.sum(out_gpu).get()
            if score > p['personal_best_score']:
                p['personal_best_score'] = score
                p['personal_best'] = p['magnet_angles']

            fig = plt.figure(figsize=(9,9))
            ax1 = fig.add_subplot(111, projection='3d')
            displaySystem(magnets, subplotAx=ax1, suppress=True, direc=True)
            fig.savefig('pics/'+self.target_folder+'/'+p['name']+'/'+'%03d.png'%self.iteration)
            plt.close(fig)
            self.status_bar.update(1)

        # calculate global best score
        for index, p in enumerate(self.particles):
            if p['personal_best_score']>self.global_best_score:
                self.global_best_score = p['personal_best_score']
                self.global_best_particle = index
                print('new global best score %d of %s'%(self.global_best_score,p['name']))
コード例 #56
0
 def test_sliceset_macroparticles(self):
     '''Verify that slicing with z_cuts outside of the bunch keeps all
     macroparticles: the per-slice counts must sum to the specified
     number of macroparticles.
     '''
     # Build a bunch on [-2, 2] and a mesh whose cuts enclose it fully
     z_min, z_max = -2., 2.
     outer_cuts = (z_min-1, z_max+1)
     bunch = self.create_bunch(zmin=z_min, zmax=z_max)
     mesh = self.create_mesh(z_cuts=outer_cuts)

     slicer = MeshSlicer(mesh, context)
     slice_set = slicer.slice(bunch)

     total = gpuarray.sum(slice_set.n_macroparticles_per_slice).get()
     self.assertEqual(self.macroparticlenumber, total,
                      'the SliceSet lost/added some particles')
コード例 #57
0
    def run(self, data, get=False):
        """Upload ``data``, launch the sweep kernel and return the result.

        With ``get=True`` the raw kernel output is copied to the host and
        returned.  Otherwise the row at index ``ndelays // 2`` of the
        output is used to estimate a mean and a standard deviation, the
        mean is subtracted from ``self.output_d`` (which is rebound), and
        a host copy of the mask ``output > 3 * std`` is returned.
        """
        self.spectr_d = gpuarray.to_gpu(data)
        self.sweep_kernel(self.spectr_d,
                          self.output_d,
                          self.delay_table_d,
                          self.nfreqs,
                          self.ntimes,
                          self.ndelays,
                          block=self.block_size,
                          grid=self.grid_size)

        if get:
            return self.output_d.get()

        # Statistics are taken from the middle row of the output
        n_freqs = np.float32(self.nfreqs)
        centre_row = self.output_d[self.ndelays // 2]
        mean = gpuarray.sum(centre_row / n_freqs)
        deviation = centre_row - mean
        var = gpuarray.sum(deviation * deviation / n_freqs)
        std = np.sqrt(var.get())

        # Remove the mean and flag everything above three standard deviations
        self.output_d = self.output_d - mean
        above_threshold = self.output_d > 3 * std
        return above_threshold.get()
コード例 #58
0
    def computeCorrespondence(self):
        """
        Compute the point correspondence from the result PointCloud to dst.
        The CUDA kernel is launched here and the per-point distances are
        reduced with a GPU sum.

        :return: total distance and a PointCloud built from the matched points
        """
        super(ICPParallel, self).computeCorrespondence()

        # Host buffer the kernel fills with one 3-vector per source point
        matched_points = np.zeros([self.src.num, 3], dtype=np.float32)
        launch_block = (self.numCore, 1, 1)
        self.computeCorrespondenceCuda(cuda.In(self.result.points),
                                       cuda.Out(matched_points),
                                       self.distances_gpu,
                                       block=launch_block)

        total_distance = gpuarray.sum(self.distances_gpu).get()
        return total_distance, PointCloud(matched_points)