Example #1
0
 def test_cublasSscal(self):
     """Scale a random device vector in place and compare with host math."""
     host_vec = np.random.rand(5).astype(np.float32)
     dev_vec = gpuarray.to_gpu(host_vec)
     scale = np.float32(np.random.rand())
     # In-place SSCAL on the device vector with unit stride.
     # NOTE(review): this call form passes no cuBLAS handle — presumably an
     # older scikits.cuda API; confirm against the installed version.
     cublas.cublasSscal(dev_vec.size, scale, dev_vec.gpudata, 1)
     assert np.allclose(dev_vec.get(), scale * host_vec)
Example #2
0
	def reduce_obtained_updates( self, iter ):
	    """Reduce locally computed weight updates, then periodically sync masters.

	    First reduces each GPU data blob from the local workers into this
	    rank's blobs, then (on the local master) averages over the local
	    chunk.  Every ``self._master_sync_interval`` iterations the masters
	    exchange and average both their parameter blobs and the training
	    loss via pairwise MPI Sendrecv.

	    NOTE(review): ``&`` is used on bool operands where ``and`` is
	    intended — the result is the same for bools, but it does not
	    short-circuit.  The parameter ``iter`` also shadows the builtin.
	    """
	   
	    # Reduce each blob from local workers into this rank's blobs.
	    for (data_blob, data_blob_temp) in zip(self.data_blobs_gpu, self.data_blobs_gpu_initial):
		self.reduce_log_local(0, data_blob, data_blob_temp) 
		#if self.is_local_master:
		#    for j in range( 1, self.chunk_size ):
		#        self.comm_splitted.Recv([ self.to_buffer( data_blob_temp), MPI.FLOAT], source=MPI.ANY_SOURCE)
		#        cublas.cublasSaxpy(self.cublas_handle, data_blob_temp.size, 1.0, data_blob_temp.gpudata, 1, data_blob.gpudata, 1) 
		#else:
		#    self.comm_splitted.Send([ self.to_buffer( data_blob), MPI.FLOAT], dest=0)

		#self.comm_splitted.Barrier()
	    self.ctx.synchronize()
	    
	    # Local master averages the accumulated blobs over its chunk size.
	    if self.is_local_master:
		for data_blob in self.data_blobs_gpu:
		    cublas.cublasSscal(self.cublas_handle, data_blob.size, 1.0 / self.chunk_size, data_blob.gpudata, 1)
		self.ctx.synchronize()
	    
	    # Master-to-master sync: exchange blobs pairwise, sum them with
	    # SAXPY, then average by the number of masters; the scalar
	    # training loss is averaged the same way.
	    if  self.is_local_master & (iter % self._master_sync_interval == 0):
		for i in xrange(len( self.data_blobs_gpu)):
                    for other_rank in self.other_master_ranks:
                        self.comm_masters.Sendrecv( [ self.to_buffer( self.data_blobs_gpu[i]), MPI.FLOAT], dest=other_rank, recvbuf=[ self.to_buffer( self.temp_buffer_tosync[i]), MPI.FLOAT], source=other_rank )
                        cublas.cublasSaxpy(self.cublas_handle, self.temp_buffer_tosync[i].size, 1.0, self.temp_buffer_tosync[i].gpudata, 1, self.data_blobs_gpu[i].gpudata, 1)
                    cublas.cublasSscal(self.cublas_handle, self.data_blobs_gpu[i].size, 1.0 / self.num_masters, self.data_blobs_gpu[i].gpudata, 1)
		loss = np.array([0.])
	        for other_rank in self.other_master_ranks:
		    temp = np.array([0.])
		    to_send = np.array([self.local_solver.train_loss])
		    self.comm_masters.Sendrecv([to_send, MPI.FLOAT], dest=other_rank, recvbuf=[temp, MPI.FLOAT], source=other_rank)
		    loss[0] += temp[0]
	        self.local_solver.train_loss = (self.local_solver.train_loss + loss[0])/self.num_masters
Example #3
0
 def test_cublasSscal(self):
     """Scale a device vector via cuBLAS SSCAL and check against NumPy."""
     host = np.random.rand(5).astype(np.float32)
     on_gpu = gpuarray.to_gpu(host)
     factor = np.float32(np.random.rand())
     # In-place scale of all elements with unit stride.
     cublas.cublasSscal(self.cublas_handle, on_gpu.size, factor,
                        on_gpu.gpudata, 1)
     # Device result must match the host-side scalar multiply.
     assert np.allclose(on_gpu.get(), factor * host)
Example #4
0
    def reduce_obtained_updates(self):
        """Sum weight updates from all worker ranks on rank 0 and average them.

        Rank 0 zeroes its accumulator blobs, receives one update per worker
        per blob, adds each into the accumulator with SAXPY, and finally
        scales by ``1 / (comm_size - 1)`` — the root's own (zeroed) blob
        contributes nothing, so the divisor is the worker count.
        Non-root ranks send their blobs to rank 0.
        """
        if self.rank == 0:
            # Zero the accumulators by scaling with 0 (equivalent to fill(0)).
            for data_blob in self.data_blobs_gpu:
                cublas.cublasSscal(self.cublas_handle, data_blob.size, 0,
                                   data_blob.gpudata, 1)
            self.ctx.synchronize()

        for i in xrange(len(self.data_blobs_gpu)):
            if self.rank == 0:
                for j in range(1, self.comm_size):
                    # BUG FIX: the original passed `.get()` directly to Recv;
                    # `.get()` returns a fresh host copy, so the received data
                    # was never uploaded to the device and the SAXPY below
                    # read stale GPU memory.  Receive into a host buffer,
                    # then copy it back to the device blob.
                    host_buf = self.data_blobs_gpu_initial[i].get()
                    self.comm.Recv(host_buf, source=MPI.ANY_SOURCE)
                    self.data_blobs_gpu_initial[i].set(host_buf)
                    cublas.cublasSaxpy(self.cublas_handle,
                                       self.data_blobs_gpu_initial[i].size,
                                       1.0,
                                       self.data_blobs_gpu_initial[i].gpudata,
                                       1, self.data_blobs_gpu[i].gpudata, 1)
            else:
                self.comm.Send(self.data_blobs_gpu[i].get(), dest=0)
            self.comm.Barrier()
        self.ctx.synchronize()

        if self.rank == 0:
            # Average over the comm_size - 1 contributing workers.
            for data_blob in self.data_blobs_gpu:
                cublas.cublasSscal(self.cublas_handle, data_blob.size,
                                   1.0 / (self.comm_size - 1),
                                   data_blob.gpudata, 1)
            self.ctx.synchronize()
Example #5
0
    def test_average(self):
        """
        Verify that a GPU-side running-sum average equals the CPU average.
        """
        import caffe_facade
        import scikits.cuda.cublas as cublas
        import numpy as np
        import pycuda.gpuarray as gpuarray
        from caffe_facade import pycuda_util

        blob_shape = (64, 32, 5, 5)
        element_count = np.prod(blob_shape)
        sample_count = 10

        samples = [
            np.random.rand(*blob_shape).astype(np.float32)
            for _ in range(sample_count)
        ]
        with pycuda_util.caffe_cuda_context():
            # GPU average: copy the first sample into the accumulator,
            # add the remaining ones with SAXPY, then scale by 1/N.
            accum_gpu = gpuarray.to_gpu(np.zeros(blob_shape, np.float32))
            handle = caffe_facade.cublas_handle()
            staging = gpuarray.to_gpu(samples[0])
            cublas.cublasScopy(handle, element_count, staging.gpudata, 1,
                               accum_gpu.gpudata, 1)
            for idx in range(1, len(samples)):
                staging = gpuarray.to_gpu(samples[idx])
                cublas.cublasSaxpy(handle, element_count, 1.0,
                                   staging.gpudata, 1, accum_gpu.gpudata, 1)
            cublas.cublasSscal(handle, element_count, 1.0 / sample_count,
                               accum_gpu.gpudata, 1)

            # CPU average, dividing each term separately exactly as the
            # original did.
            expected = samples[0] / sample_count
            for idx in range(1, len(samples)):
                expected += samples[idx] / sample_count

            assert np.allclose(expected, accum_gpu.get())
Example #6
0
	def solve(self):
	   """Main distributed training loop (Python 2, mpi4py + PyCUDA).

	   Elects a local master per node-split communicator and a single
	   global master, builds a masters-only communicator, performs an
	   initial pairwise average of all parameter blobs across masters,
	   then iterates: compute local weight updates, reduce them, and
	   periodically snapshot/log on the global master.  A file named
	   ``KILL_DSOLVER`` in the working directory stops the loop early.

	   NOTE(review): ``&`` is used on bool operands where ``and`` is
	   intended — same result for bools, but no short-circuiting.  The
	   local name ``iter`` shadows the builtin.
	   """
	   #This must be replaced in future by rank with highest possible IB speed
	   self.is_local_master = self.splitted_rank == 0
	   self.is_global_master = self.is_local_master & (self.rank == 0)
	   if self.is_local_master:
		print "I am master %d with padavans %d" % (self.rank, len(self.gpu_ids))
	   self.splitted_size = self.comm_splitted.Get_size()
           self.chunk_size = len(self.gpu_ids)
	   self.comm_masters = self.comm.Split(color=self.splitted_rank == 0, key=self.rank)
	   if self.is_local_master:
	       self.other_master_ranks = [r for r in  range(self.num_masters) if r != self.comm_masters.Get_rank()]

	   if self.is_global_master:
               self.logger.info("MM_SDSolverMPI started at submaster #%d..." % self.rank)
	       self.logger.info('Current Datetime = {0}'.format(str(datetime.datetime.now())))
	   self._solve_start = time.time()

	   iter = self.local_solver.iter 
	   max_iter = self.local_solver.max_iter
	   # Initial sync: masters exchange blobs pairwise, sum with SAXPY,
	   # then average by the number of masters.
	   if self.is_local_master:
	       for i in xrange(len( self.data_blobs_gpu)):
	           for other_rank in self.other_master_ranks:
                       self.comm_masters.Sendrecv( [ self.to_buffer( self.data_blobs_gpu[i]), MPI.FLOAT], dest=other_rank, recvbuf=[ self.to_buffer( self.temp_buffer_tosync[i]), MPI.FLOAT], source=other_rank )
                       cublas.cublasSaxpy(self.cublas_handle, self.temp_buffer_tosync[i].size, 1.0, self.temp_buffer_tosync[i].gpudata, 1, self.data_blobs_gpu[i].gpudata, 1)
                   cublas.cublasSscal(self.cublas_handle, self.data_blobs_gpu[i].size, 1.0 / (self.num_masters), self.data_blobs_gpu[i].gpudata, 1)
               self.ctx.synchronize()
           self.comm.Barrier()
	   
	   # One training iteration per pass; early exit on KILL_DSOLVER.
	   while iter < max_iter:
	      if self.is_global_master:
		  print 'Iter {0:d} from {1:d}...'.format(iter, max_iter)
	          self.logger.info('Iter {0:d} from {1:d}...'.format(iter, max_iter))

	      self.compute_weights_updates( iter )
	      
	      self.reduce_obtained_updates(iter)
	      _ = gc.collect()

	      iter += 1
	      if self.is_global_master & (iter % self.local_solver.snapshot_interval == 0):
                  print "Snapshotting..."
                  for (data, data_blob_gpu) in zip( self.local_solver.net.params_data, self.data_blobs_gpu):
                      cuda.memcpy_dtoh(data, data_blob_gpu.ptr)
                      #self.local_solver.snapshot()
                      #self.local_solver.output_finish()
	      # Logging and snapshots
              if self.is_global_master &  (iter % self._master_sync_interval == 0):
                  print 'Loss: ' + str(self.local_solver.train_loss)
                  self._log_training()
	      if isfile("KILL_DSOLVER"):
	          #os.remove("KILL_DSOLVER")
		  break
		  

	   # Make a snapshot on master
	   if self.is_global_master:
               print "Snapshotting..."
               for (data, data_blob_gpu) in zip( self.local_solver.net.params_data, self.data_blobs_gpu): 
	           cuda.memcpy_dtoh(data, data_blob_gpu.ptr)
	           self.local_solver.snapshot()
	           self.local_solver.output_finish()