Example #1
    def cnvinv_objfun(self, z, sz, y_gpu, alpha=0., beta=0.):
        """
        Computes objective function value of 'lbfgsb' mode of deconv method.
        See deconv for details.
        """
         
        if z.__class__ == np.ndarray:
            z = np.array(np.reshape(z,sz)).astype(np.float32)
            z_gpu = cua.to_gpu(z)            
                
        self.res_gpu = y_gpu - self.cnv(z_gpu)        
 
        obj = 0.5*(cua.dot(self.res_gpu,self.res_gpu,dtype=np.float64))

        # Tikhonov regularization; distinguish between the 'X' and 'F' cases,
        # as the size of the corresponding z is different.
        # alpha > 0: Tikhonov on the gradient of z
        if alpha > 0:
            if self.__id__ == 'X':
                self.lz_gpu = shock.laplace_stack_gpu(z_gpu, mode='same')

            elif self.__id__ == 'F':        
                self.lz_gpu = gputools.laplace_gpu(z_gpu, mode='same')

            obj += 0.5*alpha*(cua.dot(z_gpu, self.lz_gpu, dtype=np.float64))

        # beta > 0: Tikhonov on z
        if beta > 0:
            obj += 0.5*beta*(cua.dot(z_gpu, z_gpu,dtype=np.float64))
                
        return obj.get()
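A minimal, self-contained sketch of the data term above, assuming `cua` is `pycuda.gpuarray` and that `pycuda.autoinit` provides the CUDA context (the array values and names below are illustrative, not from the original code):

import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.gpuarray as gpuarray

y = np.random.rand(1024).astype(np.float32)
cz = np.random.rand(1024).astype(np.float32)   # stand-in for self.cnv(z)

y_gpu = gpuarray.to_gpu(y)
cz_gpu = gpuarray.to_gpu(cz)

res_gpu = y_gpu - cz_gpu
# Accumulate the reduction in float64, as the example above does.
obj = 0.5 * gpuarray.dot(res_gpu, res_gpu, dtype=np.float64).get()

assert np.allclose(obj, 0.5 * np.sum((y - cz) ** 2), rtol=1e-4)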
Example #2
	def Average_Alpha1( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
		average  =  gpuarray.dot( Psi2_GPU, Psi3_GPU.conj() ).get().real
		average +=  gpuarray.dot( Psi1_GPU, Psi4_GPU.conj() ).get().real

		average *= 2.*self.dX*self.dY

		return average
Example #3
    def test_dot_allocator(self):
        from pytest import skip
        skip("https://github.com/inducer/pycuda/issues/163")

        import pycuda.tools
        pool = pycuda.tools.DeviceMemoryPool()

        a_cpu = np.random.randint(low=512,high=1024,size=1024)
        b_cpu = np.random.randint(low=512,high=1024,size=1024)

        # Compute the result on the CPU
        dot_cpu_1 = np.dot(a_cpu, b_cpu)

        a_gpu = gpuarray.to_gpu(a_cpu)
        b_gpu = gpuarray.to_gpu(b_cpu)

        # Compute the result on the GPU using different allocators
        dot_gpu_1 = gpuarray.dot(a_gpu, b_gpu)
        dot_gpu_2 = gpuarray.dot(a_gpu, b_gpu, allocator=pool.allocate)

        # Test that we get the correct results
        assert dot_cpu_1 == dot_gpu_1.get()
        assert dot_cpu_1 == dot_gpu_2.get()

        # Test that result arrays were allocated with the appropriate allocator
        assert dot_gpu_1.allocator == a_gpu.allocator
        assert dot_gpu_2.allocator == pool.allocate
Example #4
    def cnvinv_objfun(self, z, sz, y_gpu, alpha=0., beta=0.):
        """
        Computes objective function value of 'lbfgsb' mode of deconv method.
        See deconv for details.
        """

        if z.__class__ == np.ndarray:
            z = np.array(np.reshape(z, sz)).astype(np.float32)
            z_gpu = cua.to_gpu(z)

        self.res_gpu = y_gpu - self.cnv(z_gpu)

        obj = 0.5 * (cua.dot(self.res_gpu, self.res_gpu).astype(np.float64))

        # Tikhonov regularization; distinguish between the 'X' and 'F' cases,
        # as the size of the corresponding z is different.
        # alpha > 0: Tikhonov on the gradient of z
        if alpha > 0:
            if self.__id__ == 'X':
                self.lz_gpu = shock.laplace_stack_gpu(z_gpu, mode='same')

            elif self.__id__ == 'F':
                self.lz_gpu = gputools.laplace_gpu(z_gpu, mode='same')

            obj += 0.5 * alpha * (cua.dot(z_gpu, self.lz_gpu).astype(
                np.float64))

        # beta > 0: Tikhonov on z
        if beta > 0:
            obj += 0.5 * beta * (cua.dot(z_gpu, z_gpu).astype(np.float64))

        return obj.get()
	def Average_Alpha2( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
		average  =   gpuarray.dot( Psi3_GPU, Psi2_GPU.conj() ).get().imag
		average +=   gpuarray.dot( Psi1_GPU, Psi4_GPU.conj() ).get().imag

		average *= -2.*self.dX*self.dY

		return average
Example #6
    def test_dot_allocator(self):
        # FIXME
        from pytest import skip

        skip("https://github.com/inducer/pycuda/issues/163")

        import pycuda.tools

        pool = pycuda.tools.DeviceMemoryPool()

        a_cpu = np.random.randint(low=512, high=1024, size=1024)
        b_cpu = np.random.randint(low=512, high=1024, size=1024)

        # Compute the result on the CPU
        dot_cpu_1 = np.dot(a_cpu, b_cpu)

        a_gpu = gpuarray.to_gpu(a_cpu)
        b_gpu = gpuarray.to_gpu(b_cpu)

        # Compute the result on the GPU using different allocators
        dot_gpu_1 = gpuarray.dot(a_gpu, b_gpu)
        dot_gpu_2 = gpuarray.dot(a_gpu, b_gpu, allocator=pool.allocate)

        # Test that we get the correct results
        assert dot_cpu_1 == dot_gpu_1.get()
        assert dot_cpu_1 == dot_gpu_2.get()

        # Test that result arrays were allocated with the appropriate allocator
        assert dot_gpu_1.allocator == a_gpu.allocator
        assert dot_gpu_2.allocator == pool.allocate
Example #7
	def Average_Beta( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
		average =     gpuarray.dot(Psi1_GPU,Psi1_GPU.conj()).get()		
		average +=    gpuarray.dot(Psi2_GPU,Psi2_GPU.conj()).get()
		average +=  - gpuarray.dot(Psi3_GPU,Psi3_GPU.conj()).get()
		average +=  - gpuarray.dot(Psi4_GPU,Psi4_GPU.conj()).get()
		
		average *= self.dX*self.dY

		return average
Example #8
	def Average_Alpha2( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
		average  = - gpuarray.dot(Psi4_GPU,Psi1_GPU.conj()).get()
		average +=   gpuarray.dot(Psi3_GPU,Psi2_GPU.conj()).get()
		average += - gpuarray.dot(Psi2_GPU,Psi3_GPU.conj()).get()
		average +=   gpuarray.dot(Psi1_GPU,Psi4_GPU.conj()).get()

		average *= 1j*self.dX*self.dY*self.dZ

		return average
Example #9
    def check_termination(self):
        """
        Check various termination criteria
        """

        # First check if we are doing termination based on running time
        if (self.options.time_limit):
            self.time = time.clock() - self.time_start
            if (self.time >= self.options.maxtime):
                self.term_reason = 'Exceeded time limit'
                return

        # Now check if we are doing break by tolx
        if (self.options.use_tolx):
            if (np.sqrt(cua.dot(self.dx, self.dx).get()) /
                    np.sqrt(cua.dot(self.oldx, self.oldx).get()) <
                    self.options.tolx):
                self.term_reason = 'Relative change in x small enough'
                return

        # Are we doing break by tolo (tol obj val)
        if (self.options.use_tolo and self.iter > 2):
            delta = abs(self.obj - self.oldobj)
            if (delta < self.options.tolo):
                self.term_reason = 'Relative change in objvalue small enough'
                return

        # Check if change in x and gradient are small enough
        # we don't want that for now


#        if (np.sqrt((cua.dot(self.dx,self.dx).get())) < self.options.tolx) \
#               or (np.sqrt(cua.dot(self.dg,self.dg).get()) < self.options.tolg):
#            self.term_reason = '|x_t+1 - x_t|=0 or |grad_t+1 - grad_t| < 1e-9'
#            return

        # Finally the plain old check if max iter has been achieved
        if (self.iter >= self.options.maxiter):
            self.term_reason = 'Maximum number of iterations reached'
            return

        # KKT violation
        if (self.options.use_kkt):
            if np.abs(np.sqrt(cua.dot(self.x,
                                      self.grad).get())) <= self.options.tolk:
                self.term_reason = '|x^T * grad| < opt.pbb_gradient_norm'
                return

        # Gradient check
        if (self.options.use_tolg):
            nr = cua.max(cua.fabs(self.grad)).get()
            if (nr < self.options.tolg):
                self.term_reason = '|| grad ||_inf < opt.tolg'
                return

        # No condition met, so return false
        self.term_reason = 0
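For reference, the `use_tolx` branch above reduces to comparing two GPU-side norms computed with `gpuarray.dot`; a small standalone sketch of that check (the vectors `dx` and `oldx` below are hypothetical stand-ins, not attributes of the original class):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.curandom import rand as curand

oldx = curand((4096,))           # previous iterate (float32)
dx = 1e-6 * curand((4096,))      # hypothetical small update

tolx = 1e-4
rel_change = (np.sqrt(gpuarray.dot(dx, dx).get()) /
              np.sqrt(gpuarray.dot(oldx, oldx).get()))
if rel_change < tolx:
    print('Relative change in x small enough')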
Example #10
	def Average_Beta( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
		average =    gpuarray.dot( Psi1_GPU, Psi1_GPU.conj() ).get()		
		average +=   gpuarray.dot( Psi2_GPU, Psi2_GPU.conj() ).get()
		average -=   gpuarray.dot( Psi3_GPU, Psi3_GPU.conj() ).get()
		average -=   gpuarray.dot( Psi4_GPU, Psi4_GPU.conj() ).get()
		
		average *= self.dX*self.dY

		return average
Example #11
	def Average_Py( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):

		average  = gpuarray.dot(Psi1_GPU.__abs__()**2,self.Py_GPU).get()
		average += gpuarray.dot(Psi2_GPU.__abs__()**2,self.Py_GPU).get()
		average += gpuarray.dot(Psi3_GPU.__abs__()**2,self.Py_GPU).get()
		average += gpuarray.dot(Psi4_GPU.__abs__()**2,self.Py_GPU).get()

		average *= self.dX*self.dY

		return average		
Example #12
	def Average_Y( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):

		average  = gpuarray.dot(Psi1_GPU.__abs__()**2,self.Y_GPU).get()
		average += gpuarray.dot(Psi2_GPU.__abs__()**2,self.Y_GPU).get()
		average += gpuarray.dot(Psi3_GPU.__abs__()**2,self.Y_GPU).get()
		average += gpuarray.dot(Psi4_GPU.__abs__()**2,self.Y_GPU).get()

		average *= self.dX*self.dY

		return average		
Example #13
	def Average_Py( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):

		average  = gpuarray.dot(Psi1_GPU.__abs__()**2,self.Py_GPU).get()
		average += gpuarray.dot(Psi2_GPU.__abs__()**2,self.Py_GPU).get()
		average += gpuarray.dot(Psi3_GPU.__abs__()**2,self.Py_GPU).get()
		average += gpuarray.dot(Psi4_GPU.__abs__()**2,self.Py_GPU).get()

		average *= self.dPx*self.dPy

		return average		
Example #14
	def _Average_Px( self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):

		average  = gpuarray.dot(Psi1_GPU.__abs__()**2,self.Px_GPU).get()
		average += gpuarray.dot(Psi2_GPU.__abs__()**2,self.Px_GPU).get()
		average += gpuarray.dot(Psi3_GPU.__abs__()**2,self.Px_GPU).get()
		average += gpuarray.dot(Psi4_GPU.__abs__()**2,self.Px_GPU).get()

		average *= self.dX*self.dY*self.dZ

		return average	
Example #15
  def compute_obj(self, w_gpu):

    self.dfs_gpu = 1. * (self.weight(w_gpu) - self.data_gpu)
    res =    0.5 * self.lamda * cua.dot(self.dfs_gpu, self.dfs_gpu) 
    reg = (  0.5 * self.beta  * cua.dot(w_gpu - self.u_gpu,
                                        w_gpu - self.u_gpu))

    if self.eta:
      reg += 0.5 * self.eta * cua.dot(w_gpu, laplace3d_gpu(w_gpu))

    return res + reg
Example #16
  def compute_obj(self, w_gpu):

    self.dfs_gpu = 1. * (self.weight(w_gpu) - self.data_gpu)
    res =    0.5 * self.lamda * cua.dot(self.dfs_gpu, self.dfs_gpu) 
    reg = (  0.5 * self.beta  * cua.dot(w_gpu - self.u_gpu,
                                        w_gpu - self.u_gpu))

    if self.eta:
      reg += 0.5 * self.eta * cua.dot(w_gpu, laplace3d_gpu(w_gpu))

    return res + reg
Example #17
    def check_termination(self):
        """
        Check various termination criteria
        """
        
        # First check if we are doing termination based on running time
        if (self.options.time_limit):
            self.time = time.clock() - self.time_start
            if (self.time >= self.options.maxtime):
                self.term_reason = 'Exceeded time limit'
                return
         
        # Now check if we are doing break by tolx
        if (self.options.use_tolx):
            if (np.sqrt(cua.dot(self.dx,self.dx).get())/
                np.sqrt(cua.dot(self.oldx,self.oldx).get()) < self.options.tolx):
                self.term_reason = 'Relative change in x small enough'
                return
         
        # Are we doing break by tolo (tol obj val)
        if (self.options.use_tolo and self.iter > 2):
            delta = abs(self.obj-self.oldobj)
            if (delta < self.options.tolo):
                self.term_reason ='Relative change in objvalue small enough'
                return

        # Check if change in x and gradient are small enough
        # we don't want that for now
#        if (np.sqrt((cua.dot(self.dx,self.dx).get())) < self.options.tolx) \
#               or (np.sqrt(cua.dot(self.dg,self.dg).get()) < self.options.tolg):
#            self.term_reason = '|x_t+1 - x_t|=0 or |grad_t+1 - grad_t| < 1e-9'
#            return
         
        # Finally the plain old check if max iter has been achieved
        if (self.iter >= self.options.maxiter):
            self.term_reason = 'Maximum number of iterations reached'
            return
         
        # KKT violation
        if (self.options.use_kkt):
            if np.abs(np.sqrt(cua.dot(self.x, self.grad).get())) <= self.options.tolk:
                self.term_reason = '|x^T * grad| < opt.pbb_gradient_norm'
                return
         
        # Gradient check
        if (self.options.use_tolg):
            nr = cua.max(cua.fabs(self.grad)).get()
            if (nr < self.options.tolg):
                self.term_reason = '|| grad ||_inf < opt.tolg'
                return
         
        # No condition met, so return false
        self.term_reason = 0
Example #18
    def one_iteration(self, compute_real_residual=False):
        # typed up from J.R. Shewchuk,
        # An Introduction to the Conjugate Gradient Method
        # Without the Agonizing Pain, Edition 1 1/4 [8/1994]
        # Appendix B3

        q = self.operator(self.d)
        myip = gpuarray.dot(self.d, q)
        alpha = self.guarded_div(self.delta, myip)

        self.lc2(1, self.x, alpha, self.d, out=self.x)

        if compute_real_residual:
            self.residual = self.lc2(
                    1, self.rhs, -1, self.operator(self.x))
        else:
            self.lc2(1, self.residual, -alpha, q, out=self.residual)

        s = self.precon(self.residual)
        delta_old = self.delta
        delta = AsyncInnerProduct(self.residual, s,
                self.pagelocked_allocator)
        self.delta = delta.gpu_result
        beta = self.guarded_div(self.delta, delta_old)

        self.lc2(1, s, beta, self.d, out=self.d)

        if compute_real_residual:
            self.real_delta_queue.append(delta)
Example #19
    def __init__(self, a, b, pagelocked_allocator):
        self.gpu_result = gpuarray.dot(a, b)
        self.gpu_finished_evt = drv.Event()
        self.gpu_finished_evt.record()
        self.gpu_finished = False

        self.pagelocked_allocator = pagelocked_allocator
def gpuErrorEvaluate(actual, expected):
    context = make_default_context()
    device = context.get_device()
    p = gpuarray.to_gpu(numpy.array(actual)) - gpuarray.to_gpu(numpy.array(expected))
    res = 1.0 - gpuarray.dot(p, p)
    context.pop()
    return res
Example #21
    def test_dot(self):
        from pycuda.curandom import rand as curand

        for sz in [
                2,
                3,
                4,
                5,
                6,
                7,
                31,
                32,
                33,
                127,
                128,
                129,
                255,
                256,
                257,
                16384 - 993,
                20000,
        ]:
            a_gpu = curand((sz, ))
            a = a_gpu.get()
            b_gpu = curand((sz, ))
            b = b_gpu.get()

            dot_ab = np.dot(a, b)

            dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
Example #22
    def one_iteration(self, compute_real_residual=False):
        # typed up from J.R. Shewchuk,
        # An Introduction to the Conjugate Gradient Method
        # Without the Agonizing Pain, Edition 1 1/4 [8/1994]
        # Appendix B3

        q = self.operator(self.d)
        myip = gpuarray.dot(self.d, q)
        alpha = self.guarded_div(self.delta, myip)

        self.lc2(1, self.x, alpha, self.d, out=self.x)

        if compute_real_residual:
            self.residual = self.lc2(1, self.rhs, -1, self.operator(self.x))
        else:
            self.lc2(1, self.residual, -alpha, q, out=self.residual)

        s = self.precon(self.residual)
        delta_old = self.delta
        delta = AsyncInnerProduct(self.residual, s, self.pagelocked_allocator)
        self.delta = delta.gpu_result
        beta = self.guarded_div(self.delta, delta_old)

        self.lc2(1, s, beta, self.d, out=self.d)

        if compute_real_residual:
            self.real_delta_queue.append(delta)
Example #23
 def norm(self):
     """The L2-norm on the flattened vector."""
     if self.state is DeviceDataMixin.DEVICE:
         return np.sqrt(gpuarray.dot(self.array, self.array).get())
     elif self.state in [DeviceDataMixin.DEVICE_UNALLOCATED,
                         DeviceDataMixin.HOST, DeviceDataMixin.BOTH]:
         return np.sqrt(np.dot(self.data_ro, self.data_ro))
     else:
         raise RuntimeError('Data neither on host nor device, oops!')
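The same flattened L2-norm can be reproduced directly with `gpuarray.dot`; a standalone sketch (the array names are illustrative):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

a = np.random.randn(1000).astype(np.float32)
a_gpu = gpuarray.to_gpu(a)

norm_gpu = np.sqrt(gpuarray.dot(a_gpu, a_gpu).get())
assert abs(norm_gpu - np.linalg.norm(a)) / np.linalg.norm(a) < 1e-4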
Example #24
def magnitude(vec, vec2):
    #, fn = mod.get_function('magnitude')):
    #gpu_vec = drv.mem_alloc(vec.nbytes)
    #drv.memcpy_htod(gpu_vec, vec)

    #fn(gpu_vec, block=(512, 1, 1))

    #dest = drv.from_device_like(gpu_vec, vec)

    #print 'Dot product: ', dest[0]
    
    gpu_arry = gpuarr.to_gpu_async(vec)
    gpu_arry2 = gpuarr.to_gpu_async(vec2)
    mag = cumath.sqrt(gpuarr.dot(gpu_arry, gpu_arry, dtype=np.float32))
    mag2 = cumath.sqrt(gpuarr.dot(gpu_arry2, gpu_arry2, dtype=np.float32))

    product = gpuarr.dot(gpu_arry, gpu_arry2, dtype=np.float32) / mag + mag2
    print product
    return product.get()
Example #25
    def __call__(self, tcurr, nsteps, solprev, solcurr):
        if ((self.nsteps > 0
             and ((nsteps % self.nsteps == 0) or nsteps == 1))):
            #    or (self.dt_out>0 and abs(tcurr % self.dt_out) < 1e-8)):

            comm, rank, root = get_comm_rank_root()
            diff = solcurr - solprev
            res = np.array([
                gpuarray.dot(diff, diff).get(),
                gpuarray.dot(solprev, solprev).get()
            ])

            if rank != root:
                comm.Reduce(res, None, op=get_mpi('sum'), root=root)
            else:
                comm.Reduce(get_mpi('in_place'),
                            res,
                            op=get_mpi('sum'),
                            root=root)
                print("residual at t = ", tcurr, np.sqrt(res[0] / res[1]))
Example #26
    def test_dot(self):
        from pycuda.curandom import rand as curand
        a_gpu = curand((200000, ))
        a = a_gpu.get()
        b_gpu = curand((200000, ))
        b = b_gpu.get()

        dot_ab = np.dot(a, b)

        dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get()

        assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
Example #27
    def test_dot(self):
        from pycuda.curandom import rand as curand
        a_gpu = curand((200000,))
        a = a_gpu.get()
        b_gpu = curand((200000,))
        b = b_gpu.get()

        dot_ab = numpy.dot(a, b)

        dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get()

        assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4
Example #28
    def test_dot(self):
        from pycuda.curandom import rand as curand

        for l in [2, 3, 4, 5, 6, 7, 31, 32, 33, 127, 128, 129, 255, 256, 257, 16384 - 993, 20000]:
            a_gpu = curand((l,))
            a = a_gpu.get()
            b_gpu = curand((l,))
            b = b_gpu.get()

            dot_ab = np.dot(a, b)

            dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
Example #29
def dotc_gpu(x, y=None):
    """Calculate complex dot product on GPU.
    If y is not provided, <x, x> is calculated instead.

    Args:
        x (gpuarray.GPUArray): Vector on the GPU.
        y (gpuarray.GPUArray): Vector on the GPU.

    Returns:
        ndarray: Absolute of complex dot product.
    """
    if y is None:
        y = x
    return np.abs(gpuarray.dot(x.ravel(), y.ravel().conj()).get())
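A possible call pattern for `dotc_gpu` (its inputs are `pycuda.gpuarray` vectors, since it calls `gpuarray.dot` on them directly); the complex vectors below are illustrative:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

x = (np.random.randn(256) + 1j * np.random.randn(256)).astype(np.complex64)
y = (np.random.randn(256) + 1j * np.random.randn(256)).astype(np.complex64)

x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

# |<x, y>| on the GPU, the same reduction dotc_gpu(x_gpu, y_gpu) performs
val = np.abs(gpuarray.dot(x_gpu.ravel(), y_gpu.ravel().conj()).get())
assert np.allclose(val, np.abs(np.vdot(y, x)), rtol=1e-4)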
    def test_dot(self):
        """ Test dot-product. """
        dtypes = [numpy.float32, numpy.float64, numpy.complex64, numpy.complex128]
        for dtype in dtypes:
            for shape in self.shapes:
                x = gpuarray.to_gpu(numpy.random.randn(*shape).astype(dtype))
                y = gpuarray.to_gpu(numpy.random.randn(*shape).astype(dtype))

                dot_cpu = numpy.dot(x.get().flatten(), y.get().flatten()) 
                dot_gpu = gpuarray.dot(x, y).get()

                percent_error = abs(dot_cpu-dot_gpu)/abs(dot_cpu)*100
#                 print 'shape:', shape
#                 print 'data type:', dtype 
#                 print 'numpy computed dot product:', dot_cpu
#                 print 'gpuarray computed dot product:', dot_gpu
#                 print 'percent error:', percent_error, '%'
#                 print '\n'

                self.assertTrue(percent_error < 10.0, 'Error above 10%.')
Example #31
    def test_dot(self):
        """ Test dot-product. """
        dtypes = [
            numpy.float32, numpy.float64, numpy.complex64, numpy.complex128
        ]
        for dtype in dtypes:
            for shape in self.shapes:
                x = gpuarray.to_gpu(numpy.random.randn(*shape).astype(dtype))
                y = gpuarray.to_gpu(numpy.random.randn(*shape).astype(dtype))

                dot_cpu = numpy.dot(x.get().flatten(), y.get().flatten())
                dot_gpu = gpuarray.dot(x, y).get()

                percent_error = abs(dot_cpu - dot_gpu) / abs(dot_cpu) * 100
                #                 print 'shape:', shape
                #                 print 'data type:', dtype
                #                 print 'numpy computed dot product:', dot_cpu
                #                 print 'gpuarray computed dot product:', dot_gpu
                #                 print 'percent error:', percent_error, '%'
                #                 print '\n'

                self.assertTrue(percent_error < 10.0, 'Error above 10%.')
def amplitude_compute_gpu(vector, atom_factors, frame):
    f_a_real = 0
    f_a_imag = 0
    f_frame = []
    f_factor = []
    for atom in frame:
        f_frame.append([atom[1], atom[2], atom[3]])
        for factors in atom_factors:
            if factors[0] == atom[4]:
                f_factor.append(factors[1])
    n_vector = numpy.asarray(vector)
    n_frame = numpy.asarray(f_frame)
    for i in range(0, len(n_frame)):
        gpu_vector = gpuarray.to_gpu(n_vector)
        gpu_frame = gpuarray.to_gpu(n_frame[i])
        gpu_result = gpuarray.dot(gpu_vector, gpu_frame)
        gpu_sin = gpumath.sin(gpu_result)
        gpu_cos = gpumath.cos(gpu_result)
        f_q = f_factor[i]
        f_a_real += f_q * gpu_cos
        f_a_imag += f_q * gpu_sin
    return f_a_real, f_a_imag
Example #33
    def compute_obj(self, f):

        res_gpu = self.X.cnv(f) - self.y
        return 0.5 * cua.dot(res_gpu, res_gpu)
Example #34
 def _perform_dot(self, v1, v2):
     return dot(v1, v2)
Example #35
    def compute_grad(self, x):

        return cua.dot(self.A.T, cua.dot(self.A,x) - self.b)
Example #36
    def __init__(self, objective, x_init, options):

        self.objective = objective
        self.options = options
        self.time_start = time.clock()
        self.iter = 0
        self.status = 'Failure'

        # ------------------------------------------
        #  Initialisation
        #  -----------------------------------------
        self.initialisation(x_init)

        # ------------------------------------------
        #  Sanity checks
        #  -----------------------------------------
        if np.sqrt(cua.dot(self.x, self.x).get()) < 1e-12:
            raise IOError('Initial vector close to zero. Cannot proceed')

        # ------------------------------------------
        #  Prime the pump
        #  -----------------------------------------
        if options.verbose:
            print 'Running Projected Barzilai Borwein:\n'

        # ------------------------------------------
        #  Main iterative loop
        #  -----------------------------------------
        for i in range(options.maxiter):
            self.iter += 1
            self.show_status()

            dx = self.x - self.oldx
            dg = self.g - self.oldg

            if not options.unconstrained:
                clip2bound(dx, self.x, self.g)
                clip2bound(dg, self.x, self.g)

                self.dx = dx
                self.dg = dg

            # Check termination criteria
            self.check_termination()
            if self.term_reason:
                break

            # store x & gradient
            self.oldx = self.x
            self.oldg = self.g

            # update x & gradient
            if (np.mod(self.iter, 2) == 0):
                step = (cua.sum(dx * dx) / (0.00001 + cua.sum(dx * dg))).get()
            else:
                step = (cua.sum(dx * dg) / (0.00001 + cua.sum(dg * dg))).get()

            self.x = self.x - self.g * step
            if not options.unconstrained:
                gputools.cliplower_GPU(self.x, 0)  # projection

            if options.compute_both:
                self.oldobj = self.obj
                self.obj, self.g = objective.compute_both(self.x)
            elif options.compute_obj:
                self.g = objective.compute_grad(self.x)
                self.oldobj = self.obj
                self.obj = objective.compute_obj(self.x)
            else:
                self.g = objective.compute_grad(self.x)

        # ------------------------------------------
        #  Final statistics and wrap up
        #  -----------------------------------------
        self.time = time.clock() - self.time_start
        self.status = 'Success'

        if self.options.verbose:
            print self.status
            print self.term_reason
            print 'Done\n'

        self.result = self.x
Example #37
start.record()
dev_expx = cumath.exp(dev_x)
end.record()
end.synchronize()
print "GPU array calc time: %fs" % (start.time_till(end) * 1e-3)

start.record()
exp_x = np.exp(x)
end.record()
end.synchronize()
print "CPU calc time: %fs" % (start.time_till(end) * 1e-3)

print "Timing vectorized dot product/sum of squares:"

start.record()
gpuarray.dot(dev_x_short, dev_x_short)
end.record()
end.synchronize()
print "GPU array calc time (initial): %fs" % (start.time_till(end) * 1e-3)

start.record()
gpuarray.dot(dev_x, dev_x)
end.record()
end.synchronize()
print "GPU array calc time: %fs" % (start.time_till(end) * 1e-3)

start.record()
np.dot(x, x)
end.record()
end.synchronize()
print "CPU calc time: %fs" % (start.time_till(end) * 1e-3)
start.record()
dev_expx = cumath.exp(dev_x)
end.record() 
end.synchronize()
print "GPU array calc time: %fs" %(start.time_till(end)*1e-3)

start.record()
exp_x = np.exp(x)
end.record() 
end.synchronize()
print "CPU calc time: %fs" %(start.time_till(end)*1e-3)

print "Timing vectorized dot product/sum of squares:"

start.record()
gpuarray.dot(dev_x_short,dev_x_short)
end.record() 
end.synchronize()
print "GPU array calc time (initial): %fs" %(start.time_till(end)*1e-3)

start.record()
gpuarray.dot(dev_x,dev_x)
end.record() 
end.synchronize()
print "GPU array calc time: %fs" %(start.time_till(end)*1e-3)

start.record()
np.dot(x, x)
end.record() 
end.synchronize()
print "CPU calc time: %fs" %(start.time_till(end)*1e-3)
print "Transfer to GPU time: %fs" %(start.time_till(end)*1e-3)


print "Timing vectorized exponentiation:"

start.record()
dexpX = cumath.exp(dX)
end.record() 
end.synchronize()
print "GPU array calc time: %fs" %(start.time_till(end)*1e-3)

start.record()
expX = np.exp(x)
end.record() 
end.synchronize()
print "CPU calc time: %fs" %(start.time_till(end)*1e-3)

print "Timing vectorized dot product/sum of squares:"

start.record()
gpuarray.dot(dX,dX)
end.record() 
end.synchronize()
print "GPU array calc time: %fs" %(start.time_till(end)*1e-3)

start.record()
np.dot(x, x)
end.record() 
end.synchronize()
print "CPU calc time: %fs" %(start.time_till(end)*1e-3)
Example #40
    def conjugate_gradient(self, init_delta, grad, iters=250, printing=False):
        """Find minimum of quadratic approximation using conjugate gradient
        algorithm."""

        if self.net.debug:
            self.net.check_grad(grad)

        store_iter = 5
        store_mult = 1.3
        deltas = []
        grad = -grad  # note negative, some CG algorithms are flipped
        vals = np.zeros(iters, dtype=self.net.dtype)

        if self.net.use_GPU:
            from pycuda import gpuarray
            base_grad = gpuarray.to_gpu(grad)
            delta = gpuarray.to_gpu(init_delta)
            G_dir = gpuarray.zeros(grad.shape, dtype=self.net.dtype)
            dot = lambda a, b: gpuarray.dot(a, b).get()
            get = lambda x: x.get(pagelocked=True)
            self.calc_G = self.net.GPU_calc_G
        else:
            base_grad = grad
            delta = init_delta
            G_dir = np.zeros_like(grad)
            dot = np.dot
            get = lambda x: x.copy()
            self.calc_G = self.net.calc_G

        residual = base_grad.copy()
        residual -= self.calc_G(delta, damping=self.damping, out=G_dir)
        res_norm = dot(residual, residual)
        direction = residual.copy()

        for i in range(iters):
            if printing:
                print "-" * 20
                print "CG iteration", i
                print "delta norm", np.linalg.norm(get(delta))
                print "direction norm", np.linalg.norm(get(direction))

            self.calc_G(direction, damping=self.damping, out=G_dir)

            # calculate step size
            step = res_norm / dot(direction, G_dir)

            if not np.isfinite(step):
                warnings.warn("Non-finite step value (%f)" % step)
            step = np.nan_to_num(step)

            if printing:
                print "G_dir norm", np.linalg.norm(get(G_dir))
                print "step", step

            if self.net.debug:
                tmp_G_dir = get(G_dir)
                tmp_dir = get(direction)
                self.net.check_G(tmp_G_dir, tmp_dir, self.damping)

                assert np.isfinite(step)
                assert step >= 0
                assert (np.linalg.norm(np.dot(
                    tmp_dir, tmp_G_dir)) >= np.linalg.norm(
                        np.dot(tmp_dir, self.net.calc_G(tmp_dir, damping=0))))

            # update weight delta
            delta += step * direction

            # update residual
            residual -= step * G_dir
            new_res_norm = dot(residual, residual)

            if new_res_norm < 1e-20:
                # early termination (mainly to prevent numerical errors);
                # the main termination condition is below.
                break

            # update direction
            beta = new_res_norm / res_norm
            direction *= beta
            direction += residual

            res_norm = new_res_norm

            # store deltas for backtracking
            if i == store_iter:
                deltas += [(i, get(delta))]
                store_iter = int(store_iter * store_mult)

            # martens termination conditions
            vals[i] = -0.5 * dot(residual + base_grad, delta)

            gap = max(int(0.1 * i), 10)

            if printing:
                print "termination val", vals[i]

            if (i > gap and vals[i - gap] < 0
                    and (vals[i] - vals[i - gap]) / vals[i] < 5e-6 * gap):
                break

        deltas += [(i, get(delta))]

        return deltas
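The CG loop above abstracts its inner product as `dot = lambda a, b: gpuarray.dot(a, b).get()`. A compact, self-contained sketch of the same update pattern on a toy diagonal system (the operator `apply_A` and all array names here are illustrative, not part of the original code):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

n = 256
diag = 1.0 + np.arange(n, dtype=np.float32) / n      # well-conditioned SPD operator
diag_gpu = gpuarray.to_gpu(diag)
b_gpu = gpuarray.to_gpu(np.ones(n, dtype=np.float32))

def apply_A(v):
    # stands in for the curvature-vector product (self.calc_G) above
    return diag_gpu * v

dot = lambda a, b: gpuarray.dot(a, b).get()          # GPU inner product, as above

x = gpuarray.zeros(n, dtype=np.float32)
r = b_gpu - apply_A(x)                               # residual
d = r.copy()                                         # search direction
res_norm = dot(r, r)

for _ in range(100):
    q = apply_A(d)
    alpha = res_norm / dot(d, q)                     # step size
    x = x + alpha * d
    r = r - alpha * q
    new_res_norm = dot(r, r)
    if new_res_norm < 1e-10:
        break
    d = r + (new_res_norm / res_norm) * d            # update direction
    res_norm = new_res_norm

assert np.allclose(x.get() * diag, 1.0, atol=1e-4)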
end.synchronize()
print "Transfer to GPU time: %fs" % (start.time_till(end) * 1e-3)

print "Timing vectorized exponentiation:"

start.record()
dexpX = cumath.exp(dX)
end.record()
end.synchronize()
print "GPU array calc time: %fs" % (start.time_till(end) * 1e-3)

start.record()
expX = np.exp(x)
end.record()
end.synchronize()
print "CPU calc time: %fs" % (start.time_till(end) * 1e-3)

print "Timing vectorized dot product/sum of squares:"

start.record()
gpuarray.dot(dX, dX)
end.record()
end.synchronize()
print "GPU array calc time: %fs" % (start.time_till(end) * 1e-3)

start.record()
np.dot(x, x)
end.record()
end.synchronize()
print "CPU calc time: %fs" % (start.time_till(end) * 1e-3)
Example #42
    def compute_obj(self, f):

        res_gpu = self.X.cnv(f) - self.y
        return 0.5 * cua.dot(res_gpu, res_gpu)
Example #43
 def kernel(a, b):
     from pycuda.gpuarray import dot
     return dot(a, b).get()
Example #44
 def kernel(a, b):
     from pycuda.gpuarray import dot
     return dot(a, b).get()
Example #45
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
import time

n = 4
a = numpy.float32(numpy.random.randn(n,n))
b = numpy.float32(numpy.random.randn(n,n))
for i in range(n):
	for j in range(n):
		a[i,j] = i+j
		b[i,j] = i+j
tic = time.time()
axb = a*b
print a
print b
print "===="
print numpy.dot(a,b)
toc = time.time() - tic
print toc, "s for CPU"

tic = time.time()
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
axbGPU = gpuarray.dot(a_gpu,b_gpu)

print "===="
print axbGPU
toc=time.time()-tic
print toc,"s for GPU"
Example #46
 def dot(a, b):
     return gpuarray.dot(a, b).get()
Example #47
 def norm(self):
     """The L2-norm on the flattened vector."""
     return np.sqrt(gpuarray.dot(self.array, self.array).get())
Example #48
        if (self.options.use_tolg):
            nr = cua.max(cua.fabs(self.grad)).get()
            if (nr < self.options.tolg):
                self.term_reason = '|| grad ||_inf < opt.tolg'
                return
         
        # No condition met, so return false
        self.term_reason = 0

if __name__ == '__main__':

    case = 2
    if case == 1:
        A  = curand.rand((10000,1000))
        xt = curand.rand((1000,1))
        b  = cua.dot(A, xt)
         
        x_init = cua.empty_like(xt)
        x_init.fill(0.1)
         
        # Set up objective
        objective = MVM_Objective(A,b)
         
        # Default optimization options
        opt = Solopt()
         
        pbb = PBB(objective, x_init, opt)

    elif case == 2:
        
        x  = pylab.imread('lena.png')
Example #49
    def __init__(self, objective, x_init, options):

        self.objective  = objective
        self.options    = options
        self.time_start = time.clock()
        self.iter       = 0
        self.status     = 'Failure'
        
        # ------------------------------------------
        #  Initialisation
        #  -----------------------------------------        
        self.initialisation(x_init)
        
        # ------------------------------------------
        #  Sanity checks
        #  -----------------------------------------
        if np.sqrt(cua.dot(self.x, self.x).get()) < 1e-12:
            raise IOError('Initial vector close to zero. Cannot proceed')

        # ------------------------------------------
        #  Prime the pump
        #  -----------------------------------------
        if options.verbose:
            print 'Running Projected Barzilai Borwein:\n'

 
        # ------------------------------------------
        #  Main iterative loop
        #  -----------------------------------------        
        for i in range(options.maxiter):
            self.iter += 1
            self.show_status()        
            
            dx = self.x - self.oldx
            dg = self.g - self.oldg

            if not options.unconstrained:
                clip2bound(dx, self.x, self.g)
                clip2bound(dg, self.x, self.g)                
    
                self.dx = dx
                self.dg = dg

            # Check termination criteria
            self.check_termination()            
            if self.term_reason:
                break                

            # store x & gradient
            self.oldx = self.x
            self.oldg = self.g

            # update x & gradient
            if (np.mod(self.iter, 2) == 0):
                step = (cua.sum(dx*dx) / (0.00001+cua.sum(dx*dg))).get()
            else:
                step = (cua.sum(dx*dg) / (0.00001+cua.sum(dg*dg))).get()
        
            self.x = self.x - self.g * step
            if not options.unconstrained:
                gputools.cliplower_GPU(self.x, 0)      # projection
            
    
            if options.compute_both:
                self.oldobj = self.obj
                self.obj, self.g = objective.compute_both(self.x)
            elif options.compute_obj:
                self.g = objective.compute_grad(self.x)
                self.oldobj = self.obj
                self.obj = objective.compute_obj(self.x)
            else:
                self.g = objective.compute_grad(self.x)
                

        # ------------------------------------------
        #  Final statistics and wrap up
        #  -----------------------------------------        
        self.time   = time.clock() - self.time_start
        self.status = 'Success'

        if self.options.verbose:
            print self.status
            print self.term_reason
            print 'Done\n'

        self.result = self.x
Example #50
    def compute_obj(self, x):

        # 0.5 * ||A x - b||^2; cua.dot needs both arguments, so form the residual first
        res = cua.dot(self.A, x) - self.b
        return 0.5 * cua.dot(res, res)
Example #51
    def compute_grad(self, x):

        return cua.dot(self.A.T, cua.dot(self.A, x) - self.b)
Example #52
    def conjugate_gradient(self, init_delta, grad, iters=250, printing=False):
        """Find minimum of quadratic approximation using conjugate gradient
        algorithm."""

        if self.net.debug:
            self.net.check_grad(grad)

        store_iter = 5
        store_mult = 1.3
        deltas = []
        grad = -grad  # note negative, some CG algorithms are flipped
        vals = np.zeros(iters, dtype=self.net.dtype)

        if self.net.use_GPU:
            from pycuda import gpuarray
            base_grad = gpuarray.to_gpu(grad)
            delta = gpuarray.to_gpu(init_delta)
            G_dir = gpuarray.zeros(grad.shape, dtype=self.net.dtype)
            dot = lambda a, b: gpuarray.dot(a, b).get()
            get = lambda x: x.get(pagelocked=True)
            self.calc_G = self.net.GPU_calc_G
        else:
            base_grad = grad
            delta = init_delta
            G_dir = np.zeros_like(grad)
            dot = np.dot
            get = lambda x: x.copy()
            self.calc_G = self.net.calc_G

        residual = base_grad.copy()
        residual -= self.calc_G(delta, damping=self.damping, out=G_dir)
        res_norm = dot(residual, residual)
        direction = residual.copy()

        for i in range(iters):
            if printing:
                print "-" * 20
                print "CG iteration", i
                print "delta norm", np.linalg.norm(get(delta))
                print "direction norm", np.linalg.norm(get(direction))

            self.calc_G(direction, damping=self.damping, out=G_dir)

            # calculate step size
            step = res_norm / dot(direction, G_dir)

            if not np.isfinite(step):
                warnings.warn("Non-finite step value (%f)" % step)
            step = np.nan_to_num(step)

            if printing:
                print "G_dir norm", np.linalg.norm(get(G_dir))
                print "step", step

            if self.net.debug:
                tmp_G_dir = get(G_dir)
                tmp_dir = get(direction)
                self.net.check_G(tmp_G_dir, tmp_dir, self.damping)

                assert np.isfinite(step)
                assert step >= 0
                assert (np.linalg.norm(np.dot(tmp_dir, tmp_G_dir)) >=
                        np.linalg.norm(np.dot(tmp_dir,
                                              self.net.calc_G(tmp_dir,
                                                              damping=0))))

            # update weight delta
            delta += step * direction

            # update residual
            residual -= step * G_dir
            new_res_norm = dot(residual, residual)

            if new_res_norm < 1e-20:
                # early termination (mainly to prevent numerical errors);
                # the main termination condition is below.
                break

            # update direction
            beta = new_res_norm / res_norm
            direction *= beta
            direction += residual

            res_norm = new_res_norm

            # store deltas for backtracking
            if i == store_iter:
                deltas += [(i, get(delta))]
                store_iter = int(store_iter * store_mult)

            # martens termination conditions
            vals[i] = -0.5 * dot(residual + base_grad, delta)

            gap = max(int(0.1 * i), 10)

            if printing:
                print "termination val", vals[i]

            if (i > gap and vals[i - gap] < 0 and
                    (vals[i] - vals[i - gap]) / vals[i] < 5e-6 * gap):
                break

        deltas += [(i, get(delta))]

        return deltas
Example #53
    def compute_obj(self, x):

        # 0.5 * ||A x - b||^2; cua.dot needs both arguments, so form the residual first
        res = cua.dot(self.A, x) - self.b
        return 0.5 * cua.dot(res, res)
    def minimize_batch(self, batch_size, eta, opt_method):
        #using a batch_gradient descent method
        cost = 0.0
        eps = 1e-8  # for use in adagrad
        for lower_index in range(0, len(self.nonzeros), batch_size):
            upper_index = min(lower_index + batch_size, len(self.nonzeros))  # index of first element after the end of this batch
            batch = [self.nonzeros[k] for k in range(lower_index, upper_index)]
            batch_i = [index[0] for index in batch]
            batch_j = [index[1] for index in batch]
            cur_batch_len = np.int32(upper_index - lower_index)
            
            batch_i_gpu = gpuarray.to_gpu(np.array(batch_i, dtype=np.int32))
            batch_j_gpu = gpuarray.to_gpu(np.array(batch_j, dtype=np.int32))
            cost_inner = gpuarray.zeros(batch_size, dtype=np.float32)
            weighted_cost_inner = gpuarray.zeros_like(cost_inner)
            
            # calculate intermediate values
            # cost_inner =  + self.b[batch_i] + \
            #     self.b_tilde[batch_j] - np.log(np.array([self.cooccurrence_mat[k] for k in range(lower_index, upper_index)]))
            batchMatColDot(cur_batch_len, self.v_dim, self.W, self.W_tilde, batch_i_gpu, batch_j_gpu, cost_inner, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
            context.synchronize()
            if lower_index == 0:
                print cost_inner.get()
            batchCostInner(np.int32(lower_index), np.int32(upper_index), cost_inner, self.b, self.b_tilde, \
                self.cooccurrence_mat, batch_i_gpu, batch_j_gpu, block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            if lower_index == 0:
                print cost_inner.get()
            context.synchronize()
            # weighted_cost_inner = np.array([self.f_x[k] for k in range(lower_index_upper_index)]) * cost_inner
            batchWeightedInnerCost(np.int32(lower_index), np.int32(upper_index), self.f_x, cost_inner, weighted_cost_inner, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            if lower_index == 0:
                print weighted_cost_inner.get()
            context.synchronize()
            
            # calculate the gradients of each parameter
            # self.gradW[batch_i] = (self.W_tilde[batch_j].T * weighted_cost_inner).T
            batchMatVecRowMult(cur_batch_len, self.v_dim, self.W_tilde, weighted_cost_inner, self.gradW, batch_j_gpu, batch_i_gpu, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
            # self.gradW_tilde[batch_j] = (self.W[batch_i].T * weighted_cost_inner).T
            batchMatVecRowMult(cur_batch_len, self.v_dim, self.W, weighted_cost_inner, self.gradW_tilde, batch_i_gpu, batch_j_gpu, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
                        
            # self.gradb[batch_i] = self.gradb_tilde[batch_j] = weighted_cost_inner
            batchCopyVector(cur_batch_len, weighted_cost_inner, self.b, batch_i_gpu, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            batchCopyVector(cur_batch_len, weighted_cost_inner, self.b_tilde, batch_j_gpu, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            context.synchronize()

            # perform the main parameter updates
            # self.W[batch_i] -= eta * self.gradW[batch_i]
            batchMatSubtractInplace(cur_batch_len, self.v_dim, eta, self.W, self.gradW, batch_i_gpu, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
            # self.W_tilde[batch_j] -= eta * self.gradW_tilde[batch_j]
            batchMatSubtractInplace(cur_batch_len, self.v_dim, eta, self.W_tilde, self.gradW_tilde, batch_j_gpu, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
            # self.b[batch_i] -= eta * self.gradb[batch_i]
            batchVecSubtractInplace(cur_batch_len, eta, self.b, self.gradb, batch_i_gpu, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            # self.b_tilde[batch_j] -= eta * self.gradb_tilde[batch_j]      
            batchVecSubtractInplace(cur_batch_len, eta, self.b_tilde, self.gradb_tilde, batch_j_gpu, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))      
            context.synchronize()

            cost += gpuarray.dot(weighted_cost_inner, cost_inner).get()
        return cost
 def get_purity(self):
     """
     Return the purity of the current Wigner function, 2*np.pi*np.sum(W**2)*dXdP
     :return: float
     """
     return 2. * np.pi * gpuarray.dot(self.wignerfunction, self.wignerfunction).get().real * self.dXdP
Example #56
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import time

r = [2, 5000]
for n in r:
    a = numpy.float32(numpy.random.randn(n, n))
    b = numpy.float32(numpy.random.randn(n, n))

    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)

    tic = time.time()
    axbGPU = gpuarray.dot(a_gpu, b_gpu)
    toc = time.time() - tic
    print(toc)
Example #57
import sys
import time

import pycuda.driver as cuda
import pycuda.gpuarray as gpuArray
import pycuda.autoinit
import numpy



if len(sys.argv) != 4:
	print("Usage: python3 dot_cuda.py <n_workers> <work_size> <repetitions>")
	exit(1)

n_workers = int(sys.argv[1])
work_size = int(sys.argv[2])
repetitions = int(sys.argv[3])


t1 = time.perf_counter()

vec_a = numpy.float32(numpy.array([0.01 for i in range(work_size*n_workers)]))
vec_b = numpy.float32(numpy.array([1.00 for i in range(work_size*n_workers)]))
gpu_a = gpuArray.to_gpu(vec_a)
gpu_b = gpuArray.to_gpu(vec_b)
t_aloc = time.perf_counter() - t1

t2 = time.perf_counter()
dot = gpuArray.dot(gpu_a, gpu_b)
t_proc = time.perf_counter() - t2

print(dot)
print("Tempo Alocacao: " + str(t_aloc))
print("Tempo Calculos: " + str(t_calc))
Example #58
        if (self.options.use_tolg):
            nr = cua.max(cua.fabs(self.grad)).get()
            if (nr < self.options.tolg):
                self.term_reason = '|| grad ||_inf < opt.tolg'
                return

        # No condition met, so return false
        self.term_reason = 0

if __name__ == '__main__':

    case = 2
    if case == 1:
        A = curand.rand((10000, 1000))
        xt = curand.rand((1000, 1))
        b = cua.dot(A, xt)

        x_init = cua.empty_like(xt)
        x_init.fill(0.1)

        # Set up objective
        objective = MVM_Objective(A, b)

        # Default optimization options
        opt = Solopt()

        pbb = PBB(objective, x_init, opt)

    elif case == 2:

        x = pylab.imread('lena.png')