Example 1
    def reduce_obtained_updates(self, iter):
        # Reduce the per-GPU gradient blobs inside the node, then (every
        # _master_sync_interval iterations) average parameters and loss
        # across the master ranks.
        for (data_blob, data_blob_temp) in zip(self.data_blobs_gpu, self.data_blobs_gpu_initial):
            self.reduce_log_local(0, data_blob, data_blob_temp)
            #if self.is_local_master:
            #    for j in range(1, self.chunk_size):
            #        self.comm_splitted.Recv([self.to_buffer(data_blob_temp), MPI.FLOAT], source=MPI.ANY_SOURCE)
            #        cublas.cublasSaxpy(self.cublas_handle, data_blob_temp.size, 1.0, data_blob_temp.gpudata, 1, data_blob.gpudata, 1)
            #else:
            #    self.comm_splitted.Send([self.to_buffer(data_blob), MPI.FLOAT], dest=0)

            #self.comm_splitted.Barrier()
        self.ctx.synchronize()

        if self.is_local_master:
            for data_blob in self.data_blobs_gpu:
                cublas.cublasSscal(self.cublas_handle, data_blob.size, 1.0 / self.chunk_size, data_blob.gpudata, 1)
            self.ctx.synchronize()

        if self.is_local_master and (iter % self._master_sync_interval == 0):
            for i in xrange(len(self.data_blobs_gpu)):
                for other_rank in self.other_master_ranks:
                    # Exchange blobs with every other master and accumulate them.
                    self.comm_masters.Sendrecv([self.to_buffer(self.data_blobs_gpu[i]), MPI.FLOAT], dest=other_rank, recvbuf=[self.to_buffer(self.temp_buffer_tosync[i]), MPI.FLOAT], source=other_rank)
                    cublas.cublasSaxpy(self.cublas_handle, self.temp_buffer_tosync[i].size, 1.0, self.temp_buffer_tosync[i].gpudata, 1, self.data_blobs_gpu[i].gpudata, 1)
                cublas.cublasSscal(self.cublas_handle, self.data_blobs_gpu[i].size, 1.0 / self.num_masters, self.data_blobs_gpu[i].gpudata, 1)
            # Average the training loss across masters (float32 buffers to match MPI.FLOAT).
            loss = np.zeros(1, dtype=np.float32)
            for other_rank in self.other_master_ranks:
                temp = np.zeros(1, dtype=np.float32)
                to_send = np.array([self.local_solver.train_loss], dtype=np.float32)
                self.comm_masters.Sendrecv([to_send, MPI.FLOAT], dest=other_rank, recvbuf=[temp, MPI.FLOAT], source=other_rank)
                loss[0] += temp[0]
            self.local_solver.train_loss = (self.local_solver.train_loss + loss[0]) / self.num_masters
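The accumulate-then-average pattern used above (cublasSaxpy to sum buffers, cublasSscal to divide by the count) can be reproduced standalone. The sketch below is illustrative only; it assumes scikits.cuda and pycuda are installed and uses pycuda.autoinit to provide the CUDA context.

# Minimal sketch of GPU averaging with cuBLAS (illustrative names only).
import numpy as np
import pycuda.autoinit                      # creates a CUDA context
import pycuda.gpuarray as gpuarray
import scikits.cuda.cublas as cublas

handle = cublas.cublasCreate()
updates = [np.random.rand(16).astype(np.float32) for _ in range(4)]

acc_gpu = gpuarray.to_gpu(updates[0].copy())
for update in updates[1:]:
    update_gpu = gpuarray.to_gpu(update)
    # acc += 1.0 * update
    cublas.cublasSaxpy(handle, acc_gpu.size, 1.0, update_gpu.gpudata, 1,
                       acc_gpu.gpudata, 1)
# acc *= 1 / len(updates)  ->  element-wise average
cublas.cublasSscal(handle, acc_gpu.size, 1.0 / len(updates), acc_gpu.gpudata, 1)

assert np.allclose(acc_gpu.get(), np.mean(updates, axis=0), atol=1e-6)
cublas.cublasDestroy(handle)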
Example 2
    def reduce_obtained_updates(self):
        if self.rank == 0:
            for data_blob in self.data_blobs_gpu:
                # Zero the accumulator blobs on the root before summing.
                # FIXME: data_blob.fill(0)
                cublas.cublasSscal(self.cublas_handle, data_blob.size, 0,
                                   data_blob.gpudata, 1)
            self.ctx.synchronize()

        for i in xrange(len(self.data_blobs_gpu)):
            if self.rank == 0:
                for j in range(1, self.comm_size):
                    # Receive a worker's contribution into a host staging
                    # buffer, upload it, and accumulate it on the GPU.
                    host_buffer = self.data_blobs_gpu_initial[i].get()
                    self.comm.Recv(host_buffer, source=MPI.ANY_SOURCE)
                    self.data_blobs_gpu_initial[i].set(host_buffer)
                    cublas.cublasSaxpy(self.cublas_handle,
                                       self.data_blobs_gpu_initial[i].size,
                                       1.0,
                                       self.data_blobs_gpu_initial[i].gpudata,
                                       1, self.data_blobs_gpu[i].gpudata, 1)
                #self.comm.Reduce(MPI.IN_PLACE, [ self.to_buffer( self.data_blobs_gpu[i]), MPI.FLOAT], op=MPI.SUM, root=0)
            else:
                self.comm.Send(self.data_blobs_gpu[i].get(), dest=0)
                #self.comm.Reduce([ self.to_buffer( self.data_blobs_gpu[i]), MPI.FLOAT], [ self.to_buffer( self.data_blobs_gpu[i]), MPI.FLOAT], op=MPI.SUM, root=0)
            self.comm.Barrier()
        self.ctx.synchronize()

        if self.rank == 0:
            for data_blob in self.data_blobs_gpu:
                cublas.cublasSscal(self.cublas_handle, data_blob.size,
                                   1.0 / (self.comm_size - 1),
                                   data_blob.gpudata, 1)
            self.ctx.synchronize()
Example 3
    def test_update_diff(self):
        """
        Test update diff for blobs
        """

        import scikits.cuda.cublas as cublas
        import pycuda.gpuarray as gpuarray
        import copy
        import caffe_facade
        from caffe_facade import pycuda_util
        import numpy as np

        blobs = list()
        for (blob_name, blob) in self.net.params.items():
            blobs.append(blob[0])
            blobs.append(blob[1])

        mult = 0.0001
        blobs_update_cpu = [np.random.rand(*blob.diff.shape).astype(np.float32) * mult for blob in blobs]
        initial_params_diff = copy.deepcopy(self.net.params_diff)

        with pycuda_util.caffe_cuda_context():
            h = caffe_facade.cublas_handle()
            blobs_gpu = [blob.diff_as_pycuda_gpuarray() for blob in blobs]
            blobs_update_gpu = [gpuarray.to_gpu(blob_update_cpu) for blob_update_cpu in blobs_update_cpu]

            for (blob_gpu, blob_update_gpu) in zip(blobs_gpu, blobs_update_gpu):
                cublas.cublasSaxpy(h, blob_gpu.size, 1.0, blob_update_gpu.gpudata, 1, blob_gpu.gpudata, 1)

            for (blob_gpu, initial_param_diff, blob_update_cpu) in zip(blobs_gpu, initial_params_diff, blobs_update_cpu):
                assert np.allclose(blob_gpu.get(), initial_param_diff.reshape(blob_gpu.shape) + blob_update_cpu)

            params_diff = self.net.params_diff
            for (blob_gpu, param_diff) in zip(blobs_gpu, params_diff):
                assert np.allclose(blob_gpu.get(), param_diff.reshape(blob_gpu.shape))
Example 4
    def test_cublasSaxpy(self):
        alpha = np.float32(np.random.rand())
        x = np.random.rand(5).astype(np.float32)
        x_gpu = gpuarray.to_gpu(x)
        y = np.random.rand(5).astype(np.float32)
        y_gpu = gpuarray.to_gpu(y)
        # Legacy scikits.cuda wrapper: no cuBLAS handle argument here
        # (compare with the handle-based call in the next example).
        cublas.cublasSaxpy(x_gpu.size, alpha, x_gpu.gpudata, 1,
                           y_gpu.gpudata, 1)
        assert np.allclose(y_gpu.get(), alpha * x + y)
Example 5
    def test_cublasSaxpy(self):
        alpha = np.float32(np.random.rand())
        x = np.random.rand(5).astype(np.float32)
        x_gpu = gpuarray.to_gpu(x)
        y = np.random.rand(5).astype(np.float32)
        y_gpu = gpuarray.to_gpu(y)
        cublas.cublasSaxpy(self.cublas_handle, x_gpu.size, alpha,
                           x_gpu.gpudata, 1, y_gpu.gpudata, 1)
        assert np.allclose(y_gpu.get(), alpha * x + y)
    def x_add_y(self, alpha, x, beta, y, result=None):
        '''
        Compute alpha*x[] + beta*y[].

        If result is None the sum is accumulated in place into y using cuBLAS;
        otherwise it is written to result by the custom kernel
        x_add_y(float alpha, float* x, float beta, float* y, float* result, ulong x_size).
        '''
        if result is None:
            if beta != 1.0:
                # y += (beta - 1) * y, which is equivalent to y *= beta
                cublas.cublasSaxpy(self.handle, y.size, beta - 1.0, y.gpudata, 1, y.gpudata, 1)
            # y += alpha * x
            cublas.cublasSaxpy(self.handle, x.size, alpha, x.gpudata, 1, y.gpudata, 1)
        else:
            self.x_add_y_kernel(np.float32(alpha), x.gpudata,
                                np.float32(beta), y.gpudata,
                                result.gpudata, np.int64(x.size),
                                block=(1024, 1, 1),
                                grid=(int(x.size / 1024) + 1, 1))
Example 7
    def reduce_log_local(self, root, blob, temp_blob):
        # Tree-structured (log2 rounds) reduction of `blob` across comm_splitted;
        # receivers accumulate the incoming buffer into their own blob with Saxpy.
        send_ranks = range(1, self.splitted_size, 2)
        #FIX this will not work properly for all configs
        receive_ranks = range(0, self.splitted_size, 2)
        num_iter = int(math.ceil(math.log(self.splitted_size, 2)))
        for i in range(num_iter):
            for (s_rank, r_rank) in zip(send_ranks, receive_ranks):
                if self.splitted_rank == s_rank:
                    master_req = self.comm_splitted.Isend([self.to_buffer(blob), MPI.FLOAT], dest=r_rank)
                    master_req.Wait()
                elif self.splitted_rank == r_rank:
                    slave_req = self.comm_splitted.Irecv([self.to_buffer(temp_blob), MPI.FLOAT], source=s_rank)
                    slave_req.Wait()
                    # blob += temp_blob
                    cublas.cublasSaxpy(self.cublas_handle, temp_blob.size, 1.0, temp_blob.gpudata, 1, blob.gpudata, 1)
            # Halve the set of active ranks for the next round.
            send_ranks = receive_ranks[1::2]
            receive_ranks = receive_ranks[::2]
            self.comm_splitted.Barrier()
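The same log2 tree-reduction schedule can be illustrated without the GPU side. The sketch below is a hypothetical, CPU-only rewrite using mpi4py and numpy buffers (run under mpiexec); the in-place numpy add plays the role of the cublasSaxpy accumulate, and the result lands on rank 0.

# Illustrative tree reduction with mpi4py; all names here are hypothetical.
from mpi4py import MPI
import numpy as np
import math

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

value = np.full(4, float(rank), dtype=np.float32)   # this rank's contribution
recv = np.empty_like(value)

send_ranks = range(1, size, 2)
receive_ranks = range(0, size, 2)
for _ in range(int(math.ceil(math.log(size, 2)))):
    for (s_rank, r_rank) in zip(send_ranks, receive_ranks):
        if rank == s_rank:
            comm.Send([value, MPI.FLOAT], dest=r_rank)
        elif rank == r_rank:
            comm.Recv([recv, MPI.FLOAT], source=s_rank)
            value += recv                            # the Saxpy-style accumulate
    send_ranks = receive_ranks[1::2]
    receive_ranks = receive_ranks[::2]
    comm.Barrier()

if rank == 0:
    # Rank 0 now holds the sum over all ranks.
    assert np.allclose(value, sum(range(size)))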
    def X_add_Y(self, alpha, X, beta, Y, result=None):
        '''
        Compute alpha*X[,] + beta*Y[,].

        If result is None the sum is accumulated in place into Y using cuBLAS;
        otherwise it is written to result by the custom kernel
        X_add_Y(float alpha, float* X, float beta, float* Y, float* result, uint Y_col, uint Y_row).
        '''
        if result is None:
            if beta != 1.0:
                # Y += (beta - 1) * Y, which is equivalent to Y *= beta
                cublas.cublasSaxpy(self.handle, Y.size, beta - 1.0, Y.gpudata, 1, Y.gpudata, 1)
            # Y += alpha * X
            cublas.cublasSaxpy(self.handle, X.size, alpha, X.gpudata, 1, Y.gpudata, 1)
        else:
            Y_col = Y.shape[0]
            Y_row = Y.shape[1]
            self.X_add_Y_kernel(np.float32(alpha), X.gpudata,
                                np.float32(beta), Y.gpudata,
                                result.gpudata,
                                np.uint32(Y_col), np.uint32(Y_row),
                                block=(32, 32, 1),
                                grid=(int(Y_row / 32) + 1, int(Y_col / 32) + 1))
Example 9
def matrix_addition(d_a, d_b):
    # Overwrites d_a
    assert d_a.shape == d_b.shape
    if len(d_a.shape) == 1:
        # Vector addition
        cublas.cublasSaxpy(handle, d_a.size, 1.0, d_b.gpudata, 1, d_a.gpudata, 1)
    elif len(d_a.shape) == 2:
        # Matrix addition via SGEAM (C = alpha*op(A) + beta*op(B)), here d_a = d_a + d_b
        m, n = d_a.shape
        cublas.cublasSgeam(handle,
                'N', 'N',
                m, n,
                1.0,
                d_a.gpudata, m,
                1.0,
                d_b.gpudata, m,
                d_a.gpudata, m)
    else:
        tmp = (d_a.ravel() + d_b.ravel()).reshape(d_a.shape)
        cuda.memcpy_dtod(d_a.gpudata, tmp.gpudata, d_a.nbytes)
    return d_a
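A possible way to exercise matrix_addition is sketched below; it assumes the function lives in a module together with these imports and a module-level cuBLAS handle (the surrounding names are assumptions added for illustration, not part of the original example).

# Hypothetical usage of matrix_addition with pycuda gpuarrays.
import numpy as np
import pycuda.autoinit              # creates a CUDA context
import pycuda.driver as cuda        # used by matrix_addition for the >2-D case
import pycuda.gpuarray as gpuarray
import scikits.cuda.cublas as cublas

handle = cublas.cublasCreate()      # module-level handle read by matrix_addition

a = np.random.rand(3, 4).astype(np.float32)
b = np.random.rand(3, 4).astype(np.float32)
d_a = gpuarray.to_gpu(a)
d_b = gpuarray.to_gpu(b)
matrix_addition(d_a, d_b)           # 2-D path: in-place SGEAM, overwrites d_a
assert np.allclose(d_a.get(), a + b)

v = np.random.rand(8).astype(np.float32)
w = np.random.rand(8).astype(np.float32)
d_v = gpuarray.to_gpu(v)
d_w = gpuarray.to_gpu(w)
matrix_addition(d_v, d_w)           # 1-D path: cublasSaxpy, overwrites d_v
assert np.allclose(d_v.get(), v + w)

cublas.cublasDestroy(handle)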
Example 10
    def test_axpy(self):
        """
        Test axpy function from scikits.cuda.cublas
        """
        import caffe_facade
        import scikits.cuda.cublas as cublas
        import numpy as np
        import pycuda.gpuarray as gpuarray
        from caffe_facade import pycuda_util

        caffe_facade.set_mode_gpu()
        caffe_facade.set_device(0)
        x = np.random.randn(5, 4, 3, 2).astype(np.float32)
        y = np.random.randn(5, 4, 3, 2).astype(np.float32)
        with pycuda_util.caffe_cuda_context():
            h = caffe_facade.cublas_handle()
            x_gpu = gpuarray.to_gpu(x)
            y_gpu = gpuarray.to_gpu(y)
            cublas.cublasSaxpy(h, x.size, 1.0, x_gpu.gpudata, 1, y_gpu.gpudata,
                               1)
            y = x + y
            assert np.allclose(y_gpu.get(), y)
Example 11
    def test_average(self):
        """
        Test average function on GPU
        """
        import caffe_facade
        import scikits.cuda.cublas as cublas
        import numpy as np
        import pycuda.gpuarray as gpuarray
        from caffe_facade import pycuda_util

        shape = (64, 32, 5, 5)
        num_elements = np.prod(shape)
        num_samples = 10

        data_cpu = np.zeros(shape, np.float32)
        data_cpu_received = [
            np.random.rand(*shape).astype(np.float32)
            for i in range(num_samples)
        ]
        with pycuda_util.caffe_cuda_context():
            #GPU average
            data_gpu = gpuarray.to_gpu(np.zeros(shape, np.float32))
            h = caffe_facade.cublas_handle()
            data_gpu_temp = gpuarray.to_gpu(data_cpu_received[0])
            cublas.cublasScopy(h, num_elements, data_gpu_temp.gpudata, 1,
                               data_gpu.gpudata, 1)
            for i in range(1, len(data_cpu_received)):
                data_gpu_temp = gpuarray.to_gpu(data_cpu_received[i])
                cublas.cublasSaxpy(h, num_elements, 1.0, data_gpu_temp.gpudata,
                                   1, data_gpu.gpudata, 1)
            cublas.cublasSscal(h, num_elements, 1.0 / num_samples,
                               data_gpu.gpudata, 1)

            #CPU average
            data_cpu = data_cpu_received[0] / num_samples
            for i in range(1, len(data_cpu_received)):
                data_cpu += data_cpu_received[i] / num_samples

            assert np.allclose(data_cpu, data_gpu.get())
Example 12
    def solve(self):
        # This must be replaced in the future by the rank with the highest possible IB speed.
        self.is_local_master = self.splitted_rank == 0
        self.is_global_master = self.is_local_master and (self.rank == 0)
        if self.is_local_master:
            print "I am master %d with padavans %d" % (self.rank, len(self.gpu_ids))
        self.splitted_size = self.comm_splitted.Get_size()
        self.chunk_size = len(self.gpu_ids)
        self.comm_masters = self.comm.Split(color=self.splitted_rank == 0, key=self.rank)
        if self.is_local_master:
            self.other_master_ranks = [r for r in range(self.num_masters) if r != self.comm_masters.Get_rank()]

        if self.is_global_master:
            self.logger.info("MM_SDSolverMPI started at submaster #%d..." % self.rank)
            self.logger.info('Current Datetime = {0}'.format(str(datetime.datetime.now())))
        self._solve_start = time.time()

        iter = self.local_solver.iter
        max_iter = self.local_solver.max_iter
        if self.is_local_master:
            # Initial synchronization of parameters between the master ranks.
            for i in xrange(len(self.data_blobs_gpu)):
                for other_rank in self.other_master_ranks:
                    self.comm_masters.Sendrecv([self.to_buffer(self.data_blobs_gpu[i]), MPI.FLOAT], dest=other_rank, recvbuf=[self.to_buffer(self.temp_buffer_tosync[i]), MPI.FLOAT], source=other_rank)
                    cublas.cublasSaxpy(self.cublas_handle, self.temp_buffer_tosync[i].size, 1.0, self.temp_buffer_tosync[i].gpudata, 1, self.data_blobs_gpu[i].gpudata, 1)
                cublas.cublasSscal(self.cublas_handle, self.data_blobs_gpu[i].size, 1.0 / self.num_masters, self.data_blobs_gpu[i].gpudata, 1)
            self.ctx.synchronize()
        self.comm.Barrier()

        while iter < max_iter:
            if self.is_global_master:
                print 'Iter {0:d} from {1:d}...'.format(iter, max_iter)
                self.logger.info('Iter {0:d} from {1:d}...'.format(iter, max_iter))

            self.compute_weights_updates(iter)

            self.reduce_obtained_updates(iter)
            _ = gc.collect()

            iter += 1
            if self.is_global_master and (iter % self.local_solver.snapshot_interval == 0):
                print "Snapshotting..."
                for (data, data_blob_gpu) in zip(self.local_solver.net.params_data, self.data_blobs_gpu):
                    cuda.memcpy_dtoh(data, data_blob_gpu.ptr)
                    #self.local_solver.snapshot()
                    #self.local_solver.output_finish()
            # Logging and snapshots
            if self.is_global_master and (iter % self._master_sync_interval == 0):
                print 'Loss: ' + str(self.local_solver.train_loss)
                self._log_training()
            if isfile("KILL_DSOLVER"):
                #os.remove("KILL_DSOLVER")
                break

        # Make a snapshot on master
        if self.is_global_master:
            print "Snapshotting..."
            for (data, data_blob_gpu) in zip(self.local_solver.net.params_data, self.data_blobs_gpu):
                cuda.memcpy_dtoh(data, data_blob_gpu.ptr)
                self.local_solver.snapshot()
                self.local_solver.output_finish()