    # tail of get_intranode_comm(): wrap the local clique id, size and rank
    # in a pygpu GpuComm for intra-node NCCL collectives
    gpucomm = collectives.GpuComm(_local_id, _local_size, _local_rank)

    return gpucomm


if __name__ == '__main__':

    # one MPI process per GPU; the MPI rank picks the CUDA device
    comm = get_internode_comm()
    rank = comm.rank
    device = 'cuda' + str(rank)
    size = comm.size

    from test_exchanger import init_device, clean_device
    _, ctx, arr, shared_x, shared_xx = init_device(device=device)

    gpucomm = get_intranode_comm(rank, size, ctx)

    if rank == 0:
        print 'original array %s' % arr

    # prepare nccl32 exchanger
    from exchanger_strategy import Exch_nccl32
    exch = Exch_nccl32(intercomm=comm, intracomm=gpucomm, avg=False)
    exch.prepare(ctx, [shared_x])
    exch.exchange()
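
    # Hypothetical check (not in the original fragment), mirroring the copper
    # test below: with avg=False the exchange is expected to leave the
    # elementwise sum across workers in shared_x.
    if rank == 0:
        print 'nccl32 summation: %s' % shared_x.get_value()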
import sys
sys.path.append('../../lib/base/')

device = sys.argv[1]

from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
# device = 'gpu' + str(rank)

from test_exchanger import init_device, clean_device
drv, ctx, arr, shared_x, shared_xx = init_device(device=device)

if rank == 0:
    print 'original array %s' % arr

# prepare copper exchanger
from exchanger_strategy import Exch_copper
exch = Exch_copper(comm, avg=False)
exch.prepare(ctx, drv, [shared_x])
exch.exchange()

if rank == 0:
    print 'copper summation: %s' % shared_x.get_value()

# prepare ar exchanger
from exchanger_strategy import Exch_allreduce
exch = Exch_allreduce(comm, avg=False)
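
# --- Illustration only, not part of the original test ----------------------
# Exch_allreduce presumably reduces host copies of the parameters with MPI
# allreduce; the equivalent hand-written pattern with mpi4py looks like this
# (it reuses comm, rank and shared_x defined above):
import numpy as np

send = shared_x.get_value()             # pull the shared variable to host memory
recv = np.empty_like(send)
comm.Allreduce(send, recv, op=MPI.SUM)  # every rank receives the elementwise sum
shared_x.set_value(recv)                # push the reduced values back to the GPU

if rank == 0:
    print 'manual mpi4py allreduce: %s' % recv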