Example #1
    def copy_to_local(self):
        
        assert self.comm is not None
        
        if self.etype == 'server':
            # copy weight from g_param to g_param_ga
            for g_param, g_param_ga in \
                            zip(self.g_param_list, self.g_param_ga_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)

                self.drv.memcpy_dtod(g_param_ga.ptr,
                                      param_ga.ptr,
                                      param_ga.dtype.itemsize *
                                      param_ga.size)
                                      
            # Send(self, buf, int dest=0, int tag=0)
            
            
            mpitp = dtype_to_mpi(self.g_param_ga_list[0].dtype)
            
            for g_param_ga in self.g_param_ga_list:
                
                self.comm.Send(buf=[bufint(g_param_ga), mpitp], dest=self.dest)
            
        elif self.etype == 'worker':
            
            mpitp = dtype_to_mpi(self.w_param_ga_list[0].dtype)
            
            for w_param_ga in self.w_param_ga_list:
                
                self.comm.Recv(buf=[bufint(w_param_ga), mpitp], source=self.dest)
                
            # copy weight from w_param_ga to w_param
            for w_param_ga, w_param in \
                            zip(self.w_param_ga_list, self.w_param_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)

                self.drv.memcpy_dtod(w_param_ga.ptr,
                                      param_ga.ptr,
                                      param_ga.dtype.itemsize *
                                      param_ga.size)
        self.comm = None
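
Both versions of copy_to_local above rely on two helpers, bufint and dtype_to_mpi, that are defined elsewhere in the project. A minimal sketch of what they typically look like in PyCUDA + mpi4py code follows; the exact definitions used here may differ, so treat this as an assumption rather than the project's actual implementation.

import numpy as np
from mpi4py import MPI


def bufint(gpu_array):
    # Expose the device allocation behind a pycuda.gpuarray.GPUArray as a
    # buffer object, so mpi4py can hand the device pointer to a CUDA-aware MPI.
    return gpu_array.gpudata.as_buffer(gpu_array.nbytes)


def dtype_to_mpi(dtype):
    # Map a numpy dtype to the matching mpi4py datatype handle; the lookup
    # table is exposed under different names across mpi4py versions.
    if hasattr(MPI, '_typedict'):
        return MPI._typedict[np.dtype(dtype).char]
    if hasattr(MPI, '__TypeDict__'):
        return MPI.__TypeDict__[np.dtype(dtype).char]
    raise ValueError('cannot map dtype %s to an MPI datatype' % dtype)
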
Example #2
    def copy_to_local(self):

        assert self.comm is not None

        if self.etype == 'server':
            # copy weight from g_param to g_param_ga
            for g_param, g_param_ga in \
                            zip(self.g_param_list, self.g_param_ga_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)

                self.drv.memcpy_dtod(g_param_ga.ptr, param_ga.ptr,
                                     param_ga.dtype.itemsize * param_ga.size)

            # Send(self, buf, int dest=0, int tag=0)

            mpitp = dtype_to_mpi(self.g_param_ga_list[0].dtype)

            for g_param_ga in self.g_param_ga_list:

                self.comm.Send(buf=[bufint(g_param_ga), mpitp], dest=self.dest)

        elif self.etype == 'worker':

            mpitp = dtype_to_mpi(self.w_param_ga_list[0].dtype)

            for w_param_ga in self.w_param_ga_list:

                self.comm.Recv(buf=[bufint(w_param_ga), mpitp],
                               source=self.dest)

            # copy weight from w_param_ga to w_param
            for w_param_ga, w_param in \
                            zip(self.w_param_ga_list, self.w_param_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)

                self.drv.memcpy_dtod(w_param_ga.ptr, param_ga.ptr,
                                     param_ga.dtype.itemsize * param_ga.size)
        self.comm = None
Example #3
    def exchange(self):
        
        # server and worker send param to each other
        
        # this function needs the worker to send an 'exchange' message 
        # to the server through REQ-REP socket first.
        
        assert self.comm is not None
        
        if self.etype == 'server':
            
            # copy weight from g_param to g_param_ga
            for g_param, g_param_ga in \
                            zip(self.g_param_list, self.g_param_ga_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)

                self.drv.memcpy_dtod(g_param_ga.ptr,
                                      param_ga.ptr,
                                      param_ga.dtype.itemsize *
                                      param_ga.size)
                                      
            # Sendrecv(self, sendbuf, int dest=0, int sendtag=0, recvbuf=None, int source=0, int recvtag=0, Status status=None)
            
            for g_param_ga, w_param_ga in zip(self.g_param_ga_list,
                                              self.w_param_ga_list):
                self.comm.Sendrecv(sendbuf=[bufint(g_param_ga), MPI.FLOAT],
                                   dest=self.dest,
                                   recvbuf=[bufint(w_param_ga), MPI.FLOAT],
                                   source=self.dest)
                                   
            # copy weight from w_param_ga to w_param
            for w_param, w_param_ga in \
                            zip(self.w_param_list, self.w_param_ga_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)

                self.drv.memcpy_dtod(param_ga.ptr,
                                     w_param_ga.ptr,
                                     w_param_ga.dtype.itemsize *
                                     w_param_ga.size)
                               
                               
        elif self.etype == 'worker':
            
            # copy weight from w_param to w_param_ga
            for w_param, w_param_ga in \
                            zip(self.w_param_list, self.w_param_ga_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)

                self.drv.memcpy_dtod(w_param_ga.ptr,
                                      param_ga.ptr,
                                      param_ga.dtype.itemsize *
                                      param_ga.size)
                                      
            # Sendrecv(self, sendbuf, int dest=0, int sendtag=0, recvbuf=None, int source=0, int recvtag=0, Status status=None)
            
            for w_param_ga, g_param_ga in zip(self.w_param_ga_list,
                                              self.g_param_ga_list):
                self.comm.Sendrecv(sendbuf=[bufint(w_param_ga), MPI.FLOAT],
                                   dest=self.dest,
                                   recvbuf=[bufint(g_param_ga), MPI.FLOAT],
                                   source=self.dest)
                                   
            # copy weight from g_param_ga to g_param
            for g_param, g_param_ga in \
                            zip(self.g_param_list, self.g_param_ga_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)

                self.drv.memcpy_dtod(param_ga.ptr,
                                     g_param_ga.ptr,
                                     g_param_ga.dtype.itemsize *
                                     g_param_ga.size)
                                      
        self.update_func()
            
        self.comm = None
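
The comment at the top of exchange() says the worker first sends an 'exchange' message to the server over a REQ-REP socket, so both sides enter the MPI exchange at the same time. A minimal sketch of that handshake with pyzmq is shown below; the address, message strings, and the exchanger object are placeholders, not the project's actual protocol.

import zmq

# worker side: ask the server to enter exchange() before we do
context = zmq.Context()
request = context.socket(zmq.REQ)
request.connect('tcp://server-host:5555')   # placeholder address
request.send_string('exchange')
request.recv_string()                       # wait for the server's ack
# ... both processes now call exchange(), so the Sendrecv calls pair up

# server side, in its own process:
#     reply = context.socket(zmq.REP)
#     reply.bind('tcp://*:5555')
#     reply.recv_string()                   # blocks until a worker asks
#     reply.send_string('ack')
#     exchanger.exchange()                  # hypothetical server-side object
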
Example #4
    def exchange(self):
        
        mpidtype = self.mpidtype
        
        if self.avg:
            self.avg_func()
        
        # copy weight from param_ga to param_update_ga
        for param, param_update_ga in \
                        zip(self.source_param_list, self.param_update_ga_list):

            param_ga = \
             theano.misc.pycuda_utils.to_gpuarray(param.container.value)

            self.drv.memcpy_dtod(param_update_ga.ptr,
                                  param_ga.ptr,
                                  param_ga.dtype.itemsize *
                                  param_ga.size)
                                  
            self.ctx.synchronize() 
                                  
        
        if (self.size == 2):
            
            for param_update_ga,d_param_tmp,numElements,grid_size in \
                    zip(self.param_update_ga_list, \
                        self.d_param_32_tmp_list, \
                        self.numElements_list, \
                        self.grid_size_list):

                '''
                Summing and Sharing GPU Data
                Sendrecv Pairing: 0 and 1
                '''
    
                if (self.rank == 1):
                    self.comm.Sendrecv([bufint(param_update_ga), mpidtype], \
                        dest=0, recvbuf=[bufint(d_param_tmp), mpidtype], source=0)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                        block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
                    # synchronize the context after a kernel call
                    # to make sure the kernel has finished

                elif (self.rank == 0):
                    self.comm.Sendrecv([bufint(param_update_ga), mpidtype], \
                        dest=1, recvbuf=[bufint(d_param_tmp), mpidtype], source=1)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                        block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
                    # synchronize the context after a kernel call
                    # to make sure the kernel has finished

                self.comm.Barrier()



        elif (self.size == 4):
            
            for param_update_ga,d_param_tmp,numElements,grid_size in \
                    zip(self.param_update_ga_list, \
                        self.d_param_32_tmp_list, \
                        self.numElements_list, \
                        self.grid_size_list):
    
                '''
                Summing GPU Data
                Step 1
                Source GPU -> Destination GPU
                1 -> 0, 3 -> 2
                '''
    
                if (self.rank % 2 == 1):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank-1)

                elif (self.rank % 2 == 0):
                    self.comm.Recv([bufint(d_param_tmp), mpidtype], source=self.rank+1)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                                block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
    
                '''
                Step 2
                Sendrecv Pairing: 0 and 2
                '''
                if (self.rank == 2):
                    self.comm.Sendrecv([bufint(param_update_ga), mpidtype], \
                        dest=0, recvbuf=[bufint(d_param_tmp), mpidtype], source=0)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                        block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()

                elif (self.rank == 0):
                    self.comm.Sendrecv([bufint(param_update_ga), mpidtype], \
                        dest=2, recvbuf=[bufint(d_param_tmp), mpidtype], source=2)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                        block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
    
    
                '''
                Broadcasting Result
                Source GPU -> Destination GPU
                0 -> 1, 2 -> 3
                '''
    
                if (self.rank % 2 == 0):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank+1)

                elif (self.rank % 2 == 1):
                    self.comm.Recv([bufint(param_update_ga), mpidtype], source=self.rank-1)

                self.comm.Barrier()



        elif (self.size == 8):
    
            # Use this for parameter size < 16MB
            # Use Fei's implementation for parameter size > 16MB
            
            for param_update_ga,d_param_tmp,numElements,grid_size in \
                    zip(self.param_update_ga_list, \
                        self.d_param_32_tmp_list, \
                        self.numElements_list, \
                        self.grid_size_list):
    
                '''
                Summing GPU Data
                Step 1
                Source GPU -> Destination GPU
                1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6
                '''
    
                if (self.rank % 2 == 1):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank-1)

                elif (self.rank % 2 == 0):
                    self.comm.Recv([bufint(d_param_tmp), mpidtype], source=self.rank+1)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                                block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
    
    
                '''
                Step 2
                Source GPU -> Destination GPU
                0 -> 2, 4 -> 6
                '''
                if (self.rank % 4 == 0):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank+2)

                elif (self.rank == 2) or (self.rank == 6):
                    self.comm.Recv([bufint(d_param_tmp), mpidtype], source=self.rank-2)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                                block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
    
    
                '''
                Step 3
                Sendrecv Pairing: 2 and 6
                '''
                if (self.rank == 2):
                    self.comm.Sendrecv([bufint(param_update_ga), mpidtype], \
                        dest=6, recvbuf=[bufint(d_param_tmp), mpidtype], source=6)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                        block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()

                elif (self.rank == 6):
                    self.comm.Sendrecv([bufint(param_update_ga), mpidtype], \
                        dest=2, recvbuf=[bufint(d_param_tmp), mpidtype], source=2)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                        block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
    
    
                '''
                Broadcasting Results
                Step 1
                Source GPU -> Destination GPU
                2 -> 0, 6 -> 4
                '''
                if (self.rank == 2) or (self.rank == 6):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank-2)

                elif (self.rank % 4 == 0):
                    self.comm.Recv([bufint(param_update_ga), mpidtype], source=self.rank+2)
    
    
                '''
                Step 2
                Source GPU -> Destination GPU
                0 -> 1, 2 -> 3, 4 -> 5, 6 -> 7
                '''
    
                if (self.rank % 2 == 0):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank+1)

                elif (self.rank % 2 == 1):
                    self.comm.Recv([bufint(param_update_ga), mpidtype], source=self.rank-1)

                self.comm.Barrier()



        elif (self.size == 16):
            
            for param_update_ga,d_param_tmp,numElements,grid_size in \
                    zip(self.param_update_ga_list, \
                        self.d_param_32_tmp_list, \
                        self.numElements_list, \
                        self.grid_size_list):
    
                '''
                Summing GPU Data
                Step 1
                Source GPU -> Destination GPU
                1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6, 9 -> 8, 11 -> 10, 13 -> 12, 15 -> 14
                '''
    
                if (self.rank % 2 == 1):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank-1)

                elif (self.rank % 2 == 0):
                    self.comm.Recv([bufint(d_param_tmp), mpidtype], source=self.rank+1)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                                block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
    
    
                '''
                Step 2
                Source GPU -> Destination GPU
                0 -> 2, 4 -> 6, 8 -> 10, 12 -> 14
                '''
                if (self.rank % 4 == 0):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank+2)

                elif (self.rank == 2) or (self.rank == 6) or \
                     (self.rank == 10) or (self.rank == 14):
                    self.comm.Recv([bufint(d_param_tmp), mpidtype], source=self.rank-2)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                                block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
    
    
                '''
                Step 3
                Source GPU -> Destination GPU
                2 -> 6, 10 -> 14
                '''
                if (self.rank == 2) or (self.rank == 10):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank+4)

                elif (self.rank == 6) or (self.rank == 14):
                    self.comm.Recv([bufint(d_param_tmp), mpidtype], source=self.rank-4)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                                block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()
    
    
                '''
                Step 4
                Sendrecv Pairing: 6 and 14
                '''
                if (self.rank == 6):
                    self.comm.Sendrecv([bufint(param_update_ga), mpidtype], \
                        dest=14, recvbuf=[bufint(d_param_tmp), mpidtype], source=14)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                        block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()

                elif (self.rank == 14):
                    self.comm.Sendrecv([bufint(param_update_ga), mpidtype], \
                        dest=6, recvbuf=[bufint(d_param_tmp), mpidtype], source=6)
                    self.vecadd(param_update_ga, d_param_tmp, numElements, \
                        block=(256, 1, 1), grid=grid_size)
                    self.ctx.synchronize()

    
                '''
                Broadcasting Result
                Step 1
                Source GPU -> Destination GPU
                6 -> 2, 14 -> 10
                '''
                if (self.rank == 6) or (self.rank == 14):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank-4)

                elif (self.rank == 2) or (self.rank == 10):
                    self.comm.Recv([bufint(param_update_ga), mpidtype], source=self.rank+4)
    
    
                '''
                Step 2
                Source GPU -> Destination GPU
                2 -> 0, 6 -> 4, 10 -> 8, 14 -> 12
                '''
                if (self.rank == 2) or (self.rank == 6) or \
                   (self.rank == 10) or (self.rank == 14):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank-2)

                elif (self.rank % 4 == 0):
                    self.comm.Recv([bufint(param_update_ga), mpidtype], source=self.rank+2)
    
    
                '''
                Step 3
                Source GPU -> Destination GPU
                0 -> 1, 2 -> 3, 4 -> 5, 6 -> 7, 8 -> 9, 10 -> 11, 12 -> 13, 14 -> 15
                '''
    
                if (self.rank % 2 == 0):
                    self.comm.Send([bufint(param_update_ga), mpidtype], dest=self.rank+1)

                elif (self.rank % 2 == 1):
                    self.comm.Recv([bufint(param_update_ga), mpidtype], source=self.rank-1)

                self.comm.Barrier()
                
                
                
        # copy weight from param_update_ga back to param_ga
        for param, param_update_ga in \
                        zip(self.dest_param_list, self.param_update_ga_list):

            param_ga = \
             theano.misc.pycuda_utils.to_gpuarray(param.container.value)

            self.drv.memcpy_dtod(param_ga.ptr,
                                  param_update_ga.ptr,
                                  param_update_ga.dtype.itemsize *
                                  param_ga.size)
                      
            self.ctx.synchronize() 
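
Example #4 accumulates the received buffer into the local one with self.vecadd, a kernel that is not shown in this snippet. A plausible stand-alone version compiled with PyCUDA's SourceModule is sketched below; the project's actual kernel may be defined differently.

import numpy as np
import pycuda.autoinit                      # creates a context for this sketch
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void vecadd(float *a, const float *b, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
        a[i] += b[i];   // accumulate the received chunk into the local buffer
}
""")
vecadd = mod.get_function("vecadd")

# toy usage mirroring the calls above
n = 1024
a = gpuarray.to_gpu(np.ones(n, dtype=np.float32))
b = gpuarray.to_gpu(np.full(n, 2.0, dtype=np.float32))
grid_size = ((n + 255) // 256, 1)
vecadd(a, b, np.int32(n), block=(256, 1, 1), grid=grid_size)
print(a.get()[:4])                          # -> [3. 3. 3. 3.]
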
Example #5
    def exchange(self):
        
        mpidtype = self.mpidtype
        
        # divide the source param before exchanging
        if self.avg:
            self.avg_func()
        
        # copy weight from param_ga to param_update_ga
        for param, param_update_ga in \
                        zip(self.source_param_list, self.param_update_ga_list):

            param_ga = \
             theano.misc.pycuda_utils.to_gpuarray(param.container.value)

            self.drv.memcpy_dtod(param_update_ga.ptr,
                                  param_ga.ptr,
                                  param_ga.dtype.itemsize *
                                  param_ga.size)
                                  
            self.ctx.synchronize() 
                                  
        # allreduce weight from param_update_ga to itself
            
        wcount=0
        for param_update_ga in self.param_update_ga_list:

            self.float2half(param_update_ga, self.d_param_16_list[wcount], \
                                self.numElements_list[wcount], self.offset_list[wcount], \
                                block=(256,1,1),grid=self.grid_size_list[wcount])
                                
            self.ctx.synchronize()

            self.comm.Alltoall(
                            [bufint(self.d_param_16_list[wcount]), mpidtype],\
                            [bufint(self.d_param_16_tmp_list[wcount]),mpidtype])
            self.sumhalfs(self.d_param_16_tmp_list[wcount], \
                     self.d_param_16_sum_list[wcount], \
                     self.reduce_size_list[wcount],self.ranksize,\
                     self.reduce_size_list[wcount], \
                     block=(256,1,1),grid=self.grid_sum_size_list[wcount])
                     
            self.ctx.synchronize()

            self.comm.Allgather(
                        [bufint(self.d_param_16_sum_list[wcount]),mpidtype],\
                        [bufint(self.d_param_16_update_list[wcount]),mpidtype]) # d_param_16_update_list redundant

            self.half2float(self.d_param_16_update_list[wcount], param_update_ga, \
                                self.numElements_list[wcount],self.offset_list[wcount], \
                                block=(256,1,1),grid=self.grid_size_list[wcount]) # d_param_16_update_list redundant
                                
            self.ctx.synchronize()

            wcount+=1
            
        # copy weight from param_update_ga back to param_ga
        for param, param_update_ga in \
                        zip(self.dest_param_list, self.param_update_ga_list):

            param_ga = \
             theano.misc.pycuda_utils.to_gpuarray(param.container.value)

            self.drv.memcpy_dtod(param_ga.ptr,
                                  param_update_ga.ptr,
                                  param_update_ga.dtype.itemsize *
                                  param_ga.size)
            self.ctx.synchronize() 
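
The half-precision path above implements an allreduce in three moves: Alltoall scatters equal chunks to every rank, a local kernel (sumhalfs) adds the chunks that landed on each rank, and Allgather redistributes the reduced chunks; float2half/half2float only shrink the traffic. The same communication pattern is easier to see on host NumPy arrays. The sketch below is illustrative only and assumes the vector length is divisible by the number of ranks (run it with mpiexec -n 2 or -n 4).

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

# each rank owns a full-length parameter vector (length divisible by size)
full = np.arange(8, dtype=np.float32) + rank
chunk = full.size // size

# 1) Alltoall: rank r receives everyone's r-th chunk, back to back
recv = np.empty_like(full)
comm.Alltoall([full, MPI.FLOAT], [recv, MPI.FLOAT])

# 2) local reduction (the sumhalfs step): add the `size` received chunks
partial = recv.reshape(size, chunk).sum(axis=0)

# 3) Allgather: every rank collects all reduced chunks
result = np.empty_like(full)
comm.Allgather([partial, MPI.FLOAT], [result, MPI.FLOAT])
# `result` now equals the element-wise sum of `full` over all ranks
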
Example #6
    def exchange(self):
        
        mpidtype = self.mpidtype
        
        # divide the source param before exchanging
        if self.avg:
            self.avg_func()
        
        # copy weight from param_ga to param_update_ga
        for param, param_update_ga in \
                        zip(self.source_param_list, self.param_update_ga_list):

            param_ga = \
             theano.misc.pycuda_utils.to_gpuarray(param.container.value)

            self.drv.memcpy_dtod(param_update_ga.ptr,
                                  param_ga.ptr,
                                  param_ga.dtype.itemsize *
                                  param_ga.size)
                                  
            self.ctx.synchronize() 
                                  
        # allreduce weight from param_update_ga to itself
                                  
        wcount=0
        for param_update_ga in self.param_update_ga_list:

            self.comm.Alltoall(
                [bufint(param_update_ga), mpidtype],
                [bufint(self.d_param_32_tmp_list[wcount]), mpidtype])

            # sumfloats(float* f1, float* f2, int numElements, int ranksize, int reducesize)
            self.d_f32_sumfloats(self.d_param_32_tmp_list[wcount],
                                 self.d_param_32_sum_list[wcount],
                                 self.reduce_size_list[wcount], self.ranksize,
                                 self.reduce_size_list[wcount],
                                 block=(256, 1, 1),
                                 grid=self.grid_sum_size_list[wcount])

            self.ctx.synchronize()

            self.comm.Allgather(
                [bufint(self.d_param_32_sum_list[wcount]), mpidtype],
                [bufint(param_update_ga), mpidtype])
            #param.container.value.release_buffer(param_buf)

            wcount = wcount + 1
            
        # copy weight from param_update_ga back to param_ga
        for param, param_update_ga in \
                        zip(self.dest_param_list, self.param_update_ga_list):

            param_ga = \
             theano.misc.pycuda_utils.to_gpuarray(param.container.value)

            self.drv.memcpy_dtod(param_ga.ptr,
                                  param_update_ga.ptr,
                                  param_update_ga.dtype.itemsize *
                                  param_ga.size)

            self.ctx.synchronize() 
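
The commented signature above, sumfloats(float* f1, float* f2, int numElements, int ranksize, int reducesize), suggests a kernel that sums the ranksize chunks delivered by Alltoall into a single reduced chunk. A minimal guess at that kernel is sketched below; it is an assumption about the unseen source, not a copy of it.

import pycuda.autoinit                      # context for this standalone sketch
from pycuda.compiler import SourceModule

mod = SourceModule("""
// f1: ranksize consecutive chunks of reducesize floats (Alltoall output)
// f2: element-wise sum of those chunks
__global__ void sumfloats(const float *f1, float *f2,
                          int numElements, int ranksize, int reducesize)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        float acc = 0.0f;
        for (int r = 0; r < ranksize; ++r)
            acc += f1[r * reducesize + i];
        f2[i] = acc;
    }
}
""")
sumfloats = mod.get_function("sumfloats")
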
Example #7
    def exchange(self):

        # server and worker send param to each other

        # this function needs the worker to send an 'exchange' message
        # to the server through REQ-REP socket first.

        assert self.comm is not None

        if self.etype == 'server':

            # copy weight from g_param to g_param_ga
            for g_param, g_param_ga in \
                            zip(self.g_param_list, self.g_param_ga_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)

                self.drv.memcpy_dtod(g_param_ga.ptr, param_ga.ptr,
                                     param_ga.dtype.itemsize * param_ga.size)

            # Sendrecv(self, sendbuf, int dest=0, int sendtag=0, recvbuf=None, int source=0, int recvtag=0, Status status=None)

            for g_param_ga, w_param_ga in zip(self.g_param_ga_list,
                                              self.w_param_ga_list):
                self.comm.Sendrecv(
                    sendbuf=[bufint(g_param_ga), MPI.FLOAT],
                    dest=self.dest,
                    recvbuf=[bufint(w_param_ga), MPI.FLOAT],
                    source=self.dest,
                )

            # copy weight from w_param_ga to w_param
            for w_param, w_param_ga in \
                            zip(self.w_param_list, self.w_param_ga_list):

                param_ga = \
                    theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)

                self.drv.memcpy_dtod(
                    param_ga.ptr, w_param_ga.ptr,
                    w_param_ga.dtype.itemsize * w_param_ga.size)

        elif self.etype == 'worker':

            # copy weight from w_param to w_param_ga
            for w_param, w_param_ga in \
                            zip(self.w_param_list, self.w_param_ga_list):

                param_ga = \
                 theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)

                self.drv.memcpy_dtod(w_param_ga.ptr, param_ga.ptr,
                                     param_ga.dtype.itemsize * param_ga.size)

            # Sendrecv(self, sendbuf, int dest=0, int sendtag=0, recvbuf=None, int source=0, int recvtag=0, Status status=None)

            for w_param_ga, g_param_ga in zip(self.w_param_ga_list,
                                              self.g_param_ga_list):
                self.comm.Sendrecv(
                    sendbuf=[bufint(w_param_ga), MPI.FLOAT],
                    dest=self.dest,
                    recvbuf=[bufint(g_param_ga), MPI.FLOAT],
                    source=self.dest,
                )

            # copy weight from g_param_ga to g_param
            for g_param, g_param_ga in \
                            zip(self.g_param_list, self.g_param_ga_list):

                param_ga = \
                    theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)

                self.drv.memcpy_dtod(
                    param_ga.ptr, g_param_ga.ptr,
                    g_param_ga.dtype.itemsize * g_param_ga.size)

        self.update_func()

        self.comm = None
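
For reference, the matched Sendrecv used in both branches above is what keeps the server and worker from deadlocking: each side posts its send and its receive in a single call, so neither has to wait for the other to send first. A minimal two-rank, host-side sketch of the same pattern with NumPy buffers:

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
peer = 1 - rank                    # assumes exactly two ranks: 0 and 1

mine = np.full(4, float(rank), dtype=np.float32)
theirs = np.empty(4, dtype=np.float32)

# both ranks issue the same call; MPI pairs the send with the receive
comm.Sendrecv(sendbuf=[mine, MPI.FLOAT], dest=peer,
              recvbuf=[theirs, MPI.FLOAT], source=peer)
print(rank, theirs)                # each rank now holds the peer's values
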