def reduce_obtained_updates( self, iter ):
    """Average the gradient updates within the local node group, then
    periodically synchronize the averaged blobs and the training loss
    across all node masters.

    Steps:
      1. Tree-reduce every data blob within the local (splitted) communicator
         via reduce_log_local; the result lands on local rank 0.
      2. The local master divides each blob by chunk_size to get the mean.
      3. Every _master_sync_interval iterations, each master exchanges every
         blob pairwise with every other master (Sendrecv), accumulates the
         received copies, and divides by num_masters; train_loss is averaged
         the same way.

    Parameters:
        iter: current iteration number (shadows the builtin `iter`; kept for
              interface compatibility), used only to gate the master sync.
    """
    # Step 1: intra-node tree reduction; data_blobs_gpu_initial serve as
    # scratch receive buffers.
    for (data_blob, data_blob_temp) in zip(self.data_blobs_gpu, self.data_blobs_gpu_initial):
        self.reduce_log_local(0, data_blob, data_blob_temp)
    # Previous flat (non-tree) reduction, kept for reference:
    #if self.is_local_master:
    #    for j in range( 1, self.chunk_size ):
    #        self.comm_splitted.Recv([ self.to_buffer( data_blob_temp), MPI.FLOAT], source=MPI.ANY_SOURCE)
    #        cublas.cublasSaxpy(self.cublas_handle, data_blob_temp.size, 1.0, data_blob_temp.gpudata, 1, data_blob.gpudata, 1)
    #else:
    #    self.comm_splitted.Send([ self.to_buffer( data_blob), MPI.FLOAT], dest=0)
    #self.comm_splitted.Barrier()
    self.ctx.synchronize()
    # Step 2: local master turns the sum into a mean over its workers.
    if self.is_local_master:
        for data_blob in self.data_blobs_gpu:
            cublas.cublasSscal(self.cublas_handle, data_blob.size, 1.0 / self.chunk_size, data_blob.gpudata, 1)
        self.ctx.synchronize()
    # Step 3: cross-master sync. NOTE(review): '&' is bitwise-and; it works
    # on bools but does not short-circuit like 'and'.
    if self.is_local_master & (iter % self._master_sync_interval == 0):
        for i in xrange(len( self.data_blobs_gpu)):
            # Pairwise exchange with every other master; accumulate each
            # received buffer into our blob, then divide by num_masters.
            for other_rank in self.other_master_ranks:
                self.comm_masters.Sendrecv(
                    [ self.to_buffer( self.data_blobs_gpu[i]), MPI.FLOAT],
                    dest=other_rank,
                    recvbuf=[ self.to_buffer( self.temp_buffer_tosync[i]), MPI.FLOAT],
                    source=other_rank )
                cublas.cublasSaxpy(self.cublas_handle, self.temp_buffer_tosync[i].size, 1.0, self.temp_buffer_tosync[i].gpudata, 1, self.data_blobs_gpu[i].gpudata, 1)
            cublas.cublasSscal(self.cublas_handle, self.data_blobs_gpu[i].size, 1.0 / self.num_masters, self.data_blobs_gpu[i].gpudata, 1)
        # Average the scalar training loss across masters the same way.
        # NOTE(review): np.array([0.]) is float64 but the MPI datatype tag is
        # MPI.FLOAT (32-bit) — confirm the buffers/tags agree.
        loss = np.array([0.])
        for other_rank in self.other_master_ranks:
            temp = np.array([0.])
            to_send = np.array([self.local_solver.train_loss])
            self.comm_masters.Sendrecv([to_send, MPI.FLOAT], dest=other_rank, recvbuf=[temp, MPI.FLOAT], source=other_rank)
            loss[0] += temp[0]
        self.local_solver.train_loss = (self.local_solver.train_loss + loss[0])/self.num_masters
def reduce_obtained_updates(self):
    """Gather weight updates from all worker ranks onto rank 0 and average.

    Rank 0 zeroes its accumulator blobs, receives one buffer per blob from
    each of the other (comm_size - 1) ranks, adds each contribution on the
    GPU, and finally divides by the number of contributors. Non-zero ranks
    send their blobs and participate only in the barriers.
    """
    if self.rank == 0:
        for data_blob in self.data_blobs_gpu:
            # Resolves the old FIXME: fill(0) writes exact zeros, whereas
            # Sscal by 0 would leave NaN/Inf entries in place (0 * NaN = NaN).
            data_blob.fill(0)
        self.ctx.synchronize()
    for i in xrange(len(self.data_blobs_gpu)):
        if self.rank == 0:
            for j in range(1, self.comm_size):
                # BUG FIX: the original passed gpuarray.get() directly to
                # Recv, so the message landed in a throwaway host copy and
                # never reached device memory; the Saxpy below then summed
                # stale GPU data. Receive into a host buffer and upload it
                # explicitly before accumulating.
                host_buf = self.data_blobs_gpu_initial[i].get()
                self.comm.Recv(host_buf, source=MPI.ANY_SOURCE)
                self.data_blobs_gpu_initial[i].set(host_buf)
                cublas.cublasSaxpy(self.cublas_handle, self.data_blobs_gpu_initial[i].size, 1.0,
                                   self.data_blobs_gpu_initial[i].gpudata, 1,
                                   self.data_blobs_gpu[i].gpudata, 1)
            # An MPI Reduce(SUM) into the device buffer would replace this
            # receive loop if a CUDA-aware MPI is available.
        else:
            # .get() copies the device blob to a host array for sending.
            self.comm.Send(self.data_blobs_gpu[i].get(), dest=0)
        self.comm.Barrier()
    self.ctx.synchronize()
    if self.rank == 0:
        # Turn the accumulated sum into a mean over the contributing ranks
        # (rank 0 itself did not contribute, hence comm_size - 1).
        for data_blob in self.data_blobs_gpu:
            cublas.cublasSscal(self.cublas_handle, data_blob.size, 1.0 / (self.comm_size - 1), data_blob.gpudata, 1)
        self.ctx.synchronize()
def test_update_diff(self):
    """In-place axpy on blob diffs must match the CPU-side reference sum."""
    import scikits.cuda.cublas as cublas
    import pycuda.gpuarray as gpuarray
    import copy
    import caffe_facade
    from caffe_facade import pycuda_util
    import numpy as np
    # Collect the weight and bias blobs of every parameterized layer.
    param_blobs = list()
    for (layer_name, layer_params) in self.net.params.items():
        param_blobs.append(layer_params[0])
        param_blobs.append(layer_params[1])
    scale = 0.0001
    host_updates = [np.random.rand(*b.diff.shape).astype(np.float32) * scale
                    for b in param_blobs]
    diff_before = copy.deepcopy(self.net.params_diff)
    with pycuda_util.caffe_cuda_context():
        handle = caffe_facade.cublas_handle()
        device_diffs = [b.diff_as_pycuda_gpuarray() for b in param_blobs]
        device_updates = [gpuarray.to_gpu(u) for u in host_updates]
        # Apply diff += update directly on the GPU.
        for (dev_diff, dev_upd) in zip(device_diffs, device_updates):
            cublas.cublasSaxpy(handle, dev_diff.size, 1.0, dev_upd.gpudata, 1, dev_diff.gpudata, 1)
        # The GPU result must equal the pre-update diff plus the host update.
        for (dev_diff, before, upd) in zip(device_diffs, diff_before, host_updates):
            assert np.allclose(dev_diff.get(), before.reshape(dev_diff.shape) + upd)
        # The net must also observe the updated diffs through params_diff.
        diff_after = self.net.params_diff
        for (dev_diff, after) in zip(device_diffs, diff_after):
            assert np.allclose(dev_diff.get(), after.reshape(dev_diff.shape))
def test_cublasSaxpy(self):
    """Saxpy on the GPU (y <- alpha*x + y) must match the NumPy result."""
    alpha = np.float32(np.random.rand())
    x = np.random.rand(5).astype(np.float32)
    x_gpu = gpuarray.to_gpu(x)
    y = np.random.rand(5).astype(np.float32)
    y_gpu = gpuarray.to_gpu(y)
    # NOTE(review): no cuBLAS handle is passed as the first argument here,
    # unlike the otherwise-identical test elsewhere in this codebase that
    # calls cublasSaxpy(self.cublas_handle, ...). This form only matches the
    # legacy (pre-CUBLAS-v2) scikits.cuda wrapper — confirm which API
    # version this test targets.
    cublas.cublasSaxpy(x_gpu.size, alpha, x_gpu.gpudata, 1, y_gpu.gpudata, 1)
    assert np.allclose(y_gpu.get(), alpha*x+y)
def test_cublasSaxpy(self):
    """cuBLAS Saxpy (y <- alpha*x + y) must agree with the NumPy reference."""
    coeff = np.float32(np.random.rand())
    host_x = np.random.rand(5).astype(np.float32)
    dev_x = gpuarray.to_gpu(host_x)
    host_y = np.random.rand(5).astype(np.float32)
    dev_y = gpuarray.to_gpu(host_y)
    # Accumulate coeff * x into y on the device.
    cublas.cublasSaxpy(self.cublas_handle, dev_x.size, coeff,
                       dev_x.gpudata, 1, dev_y.gpudata, 1)
    assert np.allclose(dev_y.get(), coeff * host_x + host_y)
def x_add_y(self, alpha, x, beta, y, result = None):
    '''
    return alpha*x[] + beta*y[]
    x_add_y(float alpha, float* x, float beta, float* y, float* result, ulong x_size)

    If result is None the sum is accumulated in place into y (nothing is
    returned); otherwise the sum is written to result by the custom kernel
    and x/y are left untouched.
    '''
    if result is None:
        if beta != 1.0:
            # BUG FIX: the original scaled y with a Saxpy whose input and
            # output aliased the same buffer (y = (beta-1)*y + y). Aliased
            # operands are undefined behavior in BLAS; Sscal is the
            # dedicated in-place scaling routine.
            cublas.cublasSscal(self.handle, y.size, beta, y.gpudata, 1)
        # y += alpha * x
        cublas.cublasSaxpy(self.handle, x.size, alpha, x.gpudata, 1, y.gpudata, 1)
    else:
        # Elementwise kernel; grid rounds up so every element is covered
        # (the kernel is expected to bounds-check against x.size).
        self.x_add_y_kernel(np.float32(alpha), x.gpudata, \
                            np.float32(beta), y.gpudata, \
                            result.gpudata, np.int64(x.size), \
                            block = (1024, 1, 1), \
                            grid = (int(x.size / 1024) + 1, 1) \
                            )
def reduce_log_local(self, root, blob, temp_blob): send_ranks = range(1, self.splitted_size, 2) #FIX this will not work properly for all configs receive_ranks = range(0, self.splitted_size, 2) num_iter = int(math.ceil(math.log(self.splitted_size,2))) for i in range(num_iter): for (s_rank, r_rank) in zip(send_ranks, receive_ranks): if self.splitted_rank == s_rank: master_req = self.comm_splitted.Isend([ self.to_buffer(blob) ,MPI.FLOAT], dest=r_rank) re=MPI.Request.Wait( master_req ) elif self.splitted_rank == r_rank: slave_req = self.comm_splitted.Irecv([ self.to_buffer(temp_blob) ,MPI.FLOAT], source=s_rank) re=MPI.Request.Wait( slave_req ) cublas.cublasSaxpy(self.cublas_handle, temp_blob.size, 1.0, temp_blob.gpudata, 1, blob.gpudata, 1) send_ranks = receive_ranks[1::2] receive_ranks = receive_ranks[::2] self.comm_splitted.Barrier()
def X_add_Y(self, alpha, X, beta, Y, result = None):
    '''
    return alpha*X[,] + beta*Y[,]
    X_add_Y(float alpha, float* X, float beta, float* Y, float* result, uint Y_col, uint Y_row)

    If result is None the sum is accumulated in place into Y (nothing is
    returned); otherwise the sum is written to result by the custom kernel
    and X/Y are left untouched.
    '''
    if result is None:
        if beta != 1.0:
            # BUG FIX: the original scaled Y with a Saxpy whose input and
            # output aliased the same buffer (Y = (beta-1)*Y + Y). Aliased
            # operands are undefined behavior in BLAS; Sscal is the
            # dedicated in-place scaling routine.
            cublas.cublasSscal(self.handle, Y.size, beta, Y.gpudata, 1)
        # Y += alpha * X
        cublas.cublasSaxpy(self.handle, X.size, alpha, X.gpudata, 1, Y.gpudata, 1)
    else:
        Y_col = Y.shape[0]
        Y_row = Y.shape[1]
        # 2-D elementwise kernel; grid rounds up in both dimensions (the
        # kernel is expected to bounds-check against Y_col/Y_row).
        self.X_add_Y_kernel(np.float32(alpha), X.gpudata, \
                            np.float32(beta), Y.gpudata, \
                            result.gpudata, \
                            np.uint32(Y_col), np.uint32(Y_row), \
                            block = (32, 32, 1), \
                            grid = (int(Y_row / 32) + 1, int(Y_col / 32) + 1) \
                            )
def matrix_addition(d_a, d_b):
    """Elementwise d_a += d_b on the GPU; overwrites and returns d_a."""
    assert d_a.shape == d_b.shape
    rank = len(d_a.shape)
    if rank == 1:
        # Vector case: one axpy with unit scale.
        cublas.cublasSaxpy(handle, d_a.size, 1.0, d_b.gpudata, 1, d_a.gpudata, 1)
    elif rank == 2:
        # Matrix case: geam computes 1*A + 1*B directly into A.
        rows, cols = d_a.shape
        cublas.cublasSgeam(handle, 'N', 'N', rows, cols,
                           1.0, d_a.gpudata, rows,
                           1.0, d_b.gpudata, rows,
                           d_a.gpudata, rows)
    else:
        # Higher-rank tensors: add the flattened views, then copy the
        # raw bytes back over d_a.
        summed = (d_a.ravel() + d_b.ravel()).reshape(d_a.shape)
        cuda.memcpy_dtod(d_a.gpudata, summed.gpudata, d_a.nbytes)
    return d_a
def test_axpy(self):
    """Test axpy function from scikits.cuda.cublas."""
    import caffe_facade
    import scikits.cuda.cublas as cublas
    import numpy as np
    import pycuda.gpuarray as gpuarray
    from caffe_facade import pycuda_util
    caffe_facade.set_mode_gpu()
    caffe_facade.set_device(0)
    host_x = np.random.randn(5, 4, 3, 2).astype(np.float32)
    host_y = np.random.randn(5, 4, 3, 2).astype(np.float32)
    with pycuda_util.caffe_cuda_context():
        handle = caffe_facade.cublas_handle()
        dev_x = gpuarray.to_gpu(host_x)
        dev_y = gpuarray.to_gpu(host_y)
        # Device-side y <- 1*x + y.
        cublas.cublasSaxpy(handle, host_x.size, 1.0, dev_x.gpudata, 1, dev_y.gpudata, 1)
        # Must match the host-side sum exactly (within float tolerance).
        expected = host_x + host_y
        assert np.allclose(dev_y.get(), expected)
def test_average(self):
    """Test average function on GPU."""
    import caffe_facade
    import scikits.cuda.cublas as cublas
    import numpy as np
    import pycuda.gpuarray as gpuarray
    from caffe_facade import pycuda_util
    shape = (64, 32, 5, 5)
    count = np.prod(shape)
    n_samples = 10
    samples = [np.random.rand(*shape).astype(np.float32)
               for i in range(n_samples)]
    with pycuda_util.caffe_cuda_context():
        # GPU average: copy the first sample into the accumulator,
        # axpy-in the rest, then scale by 1/n.
        handle = caffe_facade.cublas_handle()
        acc_gpu = gpuarray.to_gpu(np.zeros(shape, np.float32))
        staged = gpuarray.to_gpu(samples[0])
        cublas.cublasScopy(handle, count, staged.gpudata, 1, acc_gpu.gpudata, 1)
        for idx in range(1, len(samples)):
            staged = gpuarray.to_gpu(samples[idx])
            cublas.cublasSaxpy(handle, count, 1.0, staged.gpudata, 1, acc_gpu.gpudata, 1)
        cublas.cublasSscal(handle, count, 1.0 / n_samples, acc_gpu.gpudata, 1)
        # CPU reference average, accumulated in the same order.
        expected = samples[0] / n_samples
        for idx in range(1, len(samples)):
            expected += samples[idx] / n_samples
        assert np.allclose(expected, acc_gpu.get())
def solve(self):
    """Main distributed-training driver; runs on every MPI rank.

    Establishes local/global master roles, splits a masters-only
    communicator, synchronizes the initial weights across masters, then
    iterates: compute local weight updates, reduce/average them, and (on the
    global master) periodically snapshot and log. A file named KILL_DSOLVER
    in the working directory stops the loop on every rank that sees it.
    """
    #This must be replaced in future by rank with highest possible IB speed
    self.is_local_master = self.splitted_rank == 0
    # NOTE(review): '&' on bools is bitwise-and; it works but does not
    # short-circuit like 'and' (same pattern appears on the checks below).
    self.is_global_master = self.is_local_master & (self.rank == 0)
    if self.is_local_master:
        print "I am master %d with padavans %d" % (self.rank, len(self.gpu_ids))
    self.splitted_size = self.comm_splitted.Get_size()
    # Number of GPUs (workers) this master is responsible for.
    self.chunk_size = len(self.gpu_ids)
    # Communicator containing only the local masters (splitted_rank == 0).
    self.comm_masters = self.comm.Split(color=self.splitted_rank == 0, key=self.rank)
    if self.is_local_master:
        self.other_master_ranks = [r for r in range(self.num_masters) if r != self.comm_masters.Get_rank()]
    if self.is_global_master:
        self.logger.info("MM_SDSolverMPI started at submaster #%d..." % self.rank)
        self.logger.info('Current Datetime = {0}'.format(str(datetime.datetime.now())))
    self._solve_start = time.time()
    iter = self.local_solver.iter
    max_iter = self.local_solver.max_iter
    # Initial weight sync: every master exchanges each blob pairwise with
    # every other master, accumulates, and divides by num_masters so all
    # masters start from the same averaged weights.
    if self.is_local_master:
        for i in xrange(len( self.data_blobs_gpu)):
            for other_rank in self.other_master_ranks:
                self.comm_masters.Sendrecv(
                    [ self.to_buffer( self.data_blobs_gpu[i]), MPI.FLOAT],
                    dest=other_rank,
                    recvbuf=[ self.to_buffer( self.temp_buffer_tosync[i]), MPI.FLOAT],
                    source=other_rank )
                cublas.cublasSaxpy(self.cublas_handle, self.temp_buffer_tosync[i].size, 1.0, self.temp_buffer_tosync[i].gpudata, 1, self.data_blobs_gpu[i].gpudata, 1)
            cublas.cublasSscal(self.cublas_handle, self.data_blobs_gpu[i].size, 1.0 / (self.num_masters), self.data_blobs_gpu[i].gpudata, 1)
        self.ctx.synchronize()
    self.comm.Barrier()
    # Main training loop.
    while iter < max_iter:
        if self.is_global_master:
            print 'Iter {0:d} from {1:d}...'.format(iter, max_iter)
            self.logger.info('Iter {0:d} from {1:d}...'.format(iter, max_iter))
        self.compute_weights_updates( iter )
        self.reduce_obtained_updates(iter)
        # Explicit collection to bound host-memory growth per iteration.
        _ = gc.collect()
        iter += 1
        # Periodic snapshot: copy averaged GPU blobs back to the host-side
        # solver parameters (the actual snapshot calls are disabled here).
        if self.is_global_master & (iter % self.local_solver.snapshot_interval == 0):
            print "Snapshotting..."
            for (data, data_blob_gpu) in zip( self.local_solver.net.params_data, self.data_blobs_gpu):
                cuda.memcpy_dtoh(data, data_blob_gpu.ptr)
            #self.local_solver.snapshot()
            #self.local_solver.output_finish()
        # Logging and snapshots
        if self.is_global_master & (iter % self._master_sync_interval == 0):
            print 'Loss: ' + str(self.local_solver.train_loss)
            self._log_training()
        # Cooperative shutdown: presence of this file stops the loop.
        if isfile("KILL_DSOLVER"):
            #os.remove("KILL_DSOLVER")
            break
    # Make a snapshot on master
    if self.is_global_master:
        print "Snapshotting..."
        for (data, data_blob_gpu) in zip( self.local_solver.net.params_data, self.data_blobs_gpu):
            cuda.memcpy_dtoh(data, data_blob_gpu.ptr)
        self.local_solver.snapshot()
        self.local_solver.output_finish()