def test_cublasSscal(self):
    x = np.random.rand(5).astype(np.float32)
    x_gpu = gpuarray.to_gpu(x)
    alpha = np.float32(np.random.rand())
    # Note: no cuBLAS handle is passed here; compare the variant further below,
    # which calls cublasSscal with self.cublas_handle as its first argument.
    cublas.cublasSscal(x_gpu.size, alpha, x_gpu.gpudata, 1)
    assert np.allclose(x_gpu.get(), alpha * x)
def reduce_obtained_updates(self, iter):
    # Accumulate the updates computed by this node's workers into the local blobs.
    for (data_blob, data_blob_temp) in zip(self.data_blobs_gpu, self.data_blobs_gpu_initial):
        self.reduce_log_local(0, data_blob, data_blob_temp)
        #if self.is_local_master:
        #    for j in range(1, self.chunk_size):
        #        self.comm_splitted.Recv([self.to_buffer(data_blob_temp), MPI.FLOAT], source=MPI.ANY_SOURCE)
        #        cublas.cublasSaxpy(self.cublas_handle, data_blob_temp.size, 1.0,
        #                           data_blob_temp.gpudata, 1, data_blob.gpudata, 1)
        #else:
        #    self.comm_splitted.Send([self.to_buffer(data_blob), MPI.FLOAT], dest=0)
    #self.comm_splitted.Barrier()
    self.ctx.synchronize()

    # The local master averages the accumulated updates over its workers.
    if self.is_local_master:
        for data_blob in self.data_blobs_gpu:
            cublas.cublasSscal(self.cublas_handle, data_blob.size,
                               1.0 / self.chunk_size, data_blob.gpudata, 1)
    self.ctx.synchronize()

    # Every _master_sync_interval iterations the masters exchange their blobs
    # pairwise, average them, and then do the same for the training loss.
    if self.is_local_master and (iter % self._master_sync_interval == 0):
        for i in xrange(len(self.data_blobs_gpu)):
            for other_rank in self.other_master_ranks:
                self.comm_masters.Sendrecv(
                    [self.to_buffer(self.data_blobs_gpu[i]), MPI.FLOAT], dest=other_rank,
                    recvbuf=[self.to_buffer(self.temp_buffer_tosync[i]), MPI.FLOAT],
                    source=other_rank)
                cublas.cublasSaxpy(self.cublas_handle, self.temp_buffer_tosync[i].size, 1.0,
                                   self.temp_buffer_tosync[i].gpudata, 1,
                                   self.data_blobs_gpu[i].gpudata, 1)
            cublas.cublasSscal(self.cublas_handle, self.data_blobs_gpu[i].size,
                               1.0 / self.num_masters, self.data_blobs_gpu[i].gpudata, 1)

        # Float32 host buffers to match the MPI.FLOAT datatype declared below.
        loss = np.array([0.], dtype=np.float32)
        for other_rank in self.other_master_ranks:
            temp = np.array([0.], dtype=np.float32)
            to_send = np.array([self.local_solver.train_loss], dtype=np.float32)
            self.comm_masters.Sendrecv([to_send, MPI.FLOAT], dest=other_rank,
                                       recvbuf=[temp, MPI.FLOAT], source=other_rank)
            loss[0] += temp[0]
        self.local_solver.train_loss = (self.local_solver.train_loss + loss[0]) / self.num_masters
def test_cublasSscal(self):
    x = np.random.rand(5).astype(np.float32)
    x_gpu = gpuarray.to_gpu(x)
    alpha = np.float32(np.random.rand())
    cublas.cublasSscal(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1)
    assert np.allclose(x_gpu.get(), alpha * x)
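# Both scaling tests above rely on an already created cuBLAS handle. A minimal,
# self-contained sketch of the same check that creates and destroys the handle
# itself with scikits.cuda is shown below; the function name is illustrative and
# not part of the original test class.
def _sscal_roundtrip_sketch():
    import numpy as np
    import pycuda.autoinit  # creates a CUDA context on import
    import pycuda.gpuarray as gpuarray
    import scikits.cuda.cublas as cublas

    handle = cublas.cublasCreate()
    try:
        x = np.random.rand(5).astype(np.float32)
        x_gpu = gpuarray.to_gpu(x)
        alpha = np.float32(0.5)
        # x_gpu <- alpha * x_gpu, in place on the device
        cublas.cublasSscal(handle, x_gpu.size, alpha, x_gpu.gpudata, 1)
        assert np.allclose(x_gpu.get(), alpha * x)
    finally:
        cublas.cublasDestroy(handle)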
def reduce_obtained_updates(self):
    # The master (rank 0) zeroes its accumulation blobs before gathering the workers' updates.
    if self.rank == 0:
        for data_blob in self.data_blobs_gpu:
            # FIXME data_blob.fill(0)
            cublas.cublasSscal(self.cublas_handle, data_blob.size, 0, data_blob.gpudata, 1)
    self.ctx.synchronize()

    for i in xrange(len(self.data_blobs_gpu)):
        if self.rank == 0:
            for j in range(1, self.comm_size):
                # Receive a worker's blob into a host buffer, upload it to the
                # staging gpuarray, then accumulate it on the device.
                recv_buf = self.data_blobs_gpu_initial[i].get()
                self.comm.Recv(recv_buf, source=MPI.ANY_SOURCE)
                self.data_blobs_gpu_initial[i].set(recv_buf)
                cublas.cublasSaxpy(self.cublas_handle, self.data_blobs_gpu_initial[i].size, 1.0,
                                   self.data_blobs_gpu_initial[i].gpudata, 1,
                                   self.data_blobs_gpu[i].gpudata, 1)
            #self.comm.Reduce(MPI.IN_PLACE, [self.to_buffer(self.data_blobs_gpu[i]), MPI.FLOAT], op=MPI.SUM, root=0)
        else:
            self.comm.Send(self.data_blobs_gpu[i].get(), dest=0)
            #self.comm.Reduce([self.to_buffer(self.data_blobs_gpu[i]), MPI.FLOAT],
            #                 [self.to_buffer(self.data_blobs_gpu[i]), MPI.FLOAT], op=MPI.SUM, root=0)
    self.comm.Barrier()
    self.ctx.synchronize()

    # Average the summed updates over the number of contributing workers.
    if self.rank == 0:
        for data_blob in self.data_blobs_gpu:
            cublas.cublasSscal(self.cublas_handle, data_blob.size,
                               1.0 / (self.comm_size - 1), data_blob.gpudata, 1)
    self.ctx.synchronize()
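# The commented-out Reduce calls above hint at a collective alternative to the
# explicit Send/Recv loop. A rough sketch of that idea, staging each blob through
# a host numpy buffer and letting MPI sum it (the function and buffer names are
# illustrative, not the original to_buffer() helper), might look like this. Note
# that here every rank, including the root, contributes its own blob, so the
# divisor is the full communicator size rather than comm_size - 1.
def _reduce_average_sketch(comm, data_blobs_gpu):
    import numpy as np
    from mpi4py import MPI

    rank = comm.Get_rank()
    size = comm.Get_size()
    for blob_gpu in data_blobs_gpu:
        host = blob_gpu.get()                       # device -> host copy
        if rank == 0:
            comm.Reduce(MPI.IN_PLACE, host, op=MPI.SUM, root=0)
            host /= np.float32(size)                # average over all ranks
            blob_gpu.set(host)                      # host -> device copy
        else:
            comm.Reduce(host, None, op=MPI.SUM, root=0)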
def test_average(self):
    """ Test average function on GPU """
    import caffe_facade
    import scikits.cuda.cublas as cublas
    import numpy as np
    import pycuda.gpuarray as gpuarray
    from caffe_facade import pycuda_util

    shape = (64, 32, 5, 5)
    num_elements = np.prod(shape)
    num_samples = 10
    data_cpu = np.zeros(shape, np.float32)
    data_cpu_received = [np.random.rand(*shape).astype(np.float32)
                         for i in range(num_samples)]

    with pycuda_util.caffe_cuda_context():
        # GPU average
        data_gpu = gpuarray.to_gpu(np.zeros(shape, np.float32))
        h = caffe_facade.cublas_handle()
        data_gpu_temp = gpuarray.to_gpu(data_cpu_received[0])
        cublas.cublasScopy(h, num_elements, data_gpu_temp.gpudata, 1, data_gpu.gpudata, 1)
        for i in range(1, len(data_cpu_received)):
            data_gpu_temp = gpuarray.to_gpu(data_cpu_received[i])
            cublas.cublasSaxpy(h, num_elements, 1.0, data_gpu_temp.gpudata, 1, data_gpu.gpudata, 1)
        cublas.cublasSscal(h, num_elements, 1.0 / num_samples, data_gpu.gpudata, 1)

        # CPU average
        data_cpu = data_cpu_received[0] / num_samples
        for i in range(1, len(data_cpu_received)):
            data_cpu += data_cpu_received[i] / num_samples

        assert np.allclose(data_cpu, data_gpu.get())
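# The copy/axpy/scal sequence above is one way to average on the GPU with cuBLAS;
# pycuda.gpuarray's elementwise operators can express the same reduction more
# compactly. A small sketch of that equivalent (the function name is illustrative;
# it assumes the same list of float32 host arrays and an active CUDA context):
def _gpuarray_average_sketch(data_cpu_received):
    import numpy as np
    import pycuda.gpuarray as gpuarray

    num_samples = len(data_cpu_received)
    acc_gpu = gpuarray.to_gpu(np.zeros_like(data_cpu_received[0]))
    for sample in data_cpu_received:
        acc_gpu += gpuarray.to_gpu(sample)   # elementwise in-place add on the device
    acc_gpu /= np.float32(num_samples)       # in-place scale by 1/num_samples
    return acc_gpu.get()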
def solve(self):
    # This must be replaced in the future by the rank with the highest possible IB speed.
    self.is_local_master = self.splitted_rank == 0
    self.is_global_master = self.is_local_master and (self.rank == 0)
    if self.is_local_master:
        print "I am master %d with padavans %d" % (self.rank, len(self.gpu_ids))
    self.splitted_size = self.comm_splitted.Get_size()
    self.chunk_size = len(self.gpu_ids)
    # Separate communicator containing only the local masters.
    self.comm_masters = self.comm.Split(color=self.splitted_rank == 0, key=self.rank)
    if self.is_local_master:
        self.other_master_ranks = [r for r in range(self.num_masters)
                                   if r != self.comm_masters.Get_rank()]
    if self.is_global_master:
        self.logger.info("MM_SDSolverMPI started at submaster #%d..." % self.rank)
        self.logger.info('Current Datetime = {0}'.format(str(datetime.datetime.now())))
    self._solve_start = time.time()
    iter = self.local_solver.iter
    max_iter = self.local_solver.max_iter

    # Synchronize (average) the initial weights across all masters before training starts.
    if self.is_local_master:
        for i in xrange(len(self.data_blobs_gpu)):
            for other_rank in self.other_master_ranks:
                self.comm_masters.Sendrecv(
                    [self.to_buffer(self.data_blobs_gpu[i]), MPI.FLOAT], dest=other_rank,
                    recvbuf=[self.to_buffer(self.temp_buffer_tosync[i]), MPI.FLOAT],
                    source=other_rank)
                cublas.cublasSaxpy(self.cublas_handle, self.temp_buffer_tosync[i].size, 1.0,
                                   self.temp_buffer_tosync[i].gpudata, 1,
                                   self.data_blobs_gpu[i].gpudata, 1)
            cublas.cublasSscal(self.cublas_handle, self.data_blobs_gpu[i].size,
                               1.0 / self.num_masters, self.data_blobs_gpu[i].gpudata, 1)
    self.ctx.synchronize()
    self.comm.Barrier()

    while iter < max_iter:
        if self.is_global_master:
            print 'Iter {0:d} from {1:d}...'.format(iter, max_iter)
            self.logger.info('Iter {0:d} from {1:d}...'.format(iter, max_iter))
        self.compute_weights_updates(iter)
        self.reduce_obtained_updates(iter)
        _ = gc.collect()
        iter += 1

        if self.is_global_master and (iter % self.local_solver.snapshot_interval == 0):
            print "Snapshotting..."
            for (data, data_blob_gpu) in zip(self.local_solver.net.params_data, self.data_blobs_gpu):
                cuda.memcpy_dtoh(data, data_blob_gpu.ptr)
            #self.local_solver.snapshot()
            #self.local_solver.output_finish()

        # Logging and snapshots
        if self.is_global_master and (iter % self._master_sync_interval == 0):
            print 'Loss: ' + str(self.local_solver.train_loss)
            self._log_training()

        if isfile("KILL_DSOLVER"):
            #os.remove("KILL_DSOLVER")
            break

    # Make a snapshot on master
    if self.is_global_master:
        print "Snapshotting..."
        for (data, data_blob_gpu) in zip(self.local_solver.net.params_data, self.data_blobs_gpu):
            cuda.memcpy_dtoh(data, data_blob_gpu.ptr)
        self.local_solver.snapshot()
        self.local_solver.output_finish()