Example #1
 import ray.util.collective as col
 from ray.util.collective.types import Backend

 def init_group(self,
                world_size,
                rank,
                backend=Backend.NCCL,
                group_name="default"):
     # Join the named collective group; every participant must pass the
     # same world_size and group_name, each with a unique rank.
     col.init_collective_group(world_size, rank, backend, group_name)
     return True
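For context, a minimal driver sketch showing how a method like `init_group` is typically invoked: every participating Ray actor joins the same group with a unique rank before any collective op runs. The `Worker` actor wrapper, the group size of 4, and the one-GPU-per-actor setup are assumptions for illustration, not part of the original example.

 import ray
 import ray.util.collective as col
 from ray.util.collective.types import Backend

 @ray.remote(num_gpus=1)
 class Worker:
     def init_group(self, world_size, rank,
                    backend=Backend.NCCL, group_name="default"):
         col.init_collective_group(world_size, rank, backend, group_name)
         return True

 ray.init()
 workers = [Worker.remote() for _ in range(4)]
 # All actors join the same group with the same world_size and distinct ranks.
 ray.get([w.init_group.remote(4, rank) for rank, w in enumerate(workers)])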
Example #2
 import torch
 import ray.util.collective as collective

 def __init__(self, workers, world_size, rank):
     self.params = dict()
     self.optimizer = None
     self.workers = workers
     self.world_size = world_size
     self.rank = rank
     self.grad_counts = 0
     collective.init_collective_group(self.world_size, self.rank, "nccl",
                                      "default")
     # Startup handshake: receive one tensor from every worker, then send
     # one back. Workers must perform the mirrored send-then-recv order
     # (see Example #3), otherwise both sides block.
     for i in range(len(self.workers)):
         recv = torch.zeros(1, ).cuda()
         collective.recv(recv, i, "default")
     for i in range(len(self.workers)):
         send = torch.zeros(1, ).cuda()
         collective.send(send, i, "default")
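The recv-then-send loops above are one half of a startup handshake. For completeness, a minimal sketch of the worker-side counterpart; the function name and the single `server_rank` argument are assumptions, and Example #3 shows the same pattern in context:

 import torch
 import ray.util.collective as collective

 def handshake_with_server(server_rank):
     # Send first, then receive: the exact mirror of the server's
     # recv-then-send order above, so neither side deadlocks.
     send = torch.ones(1, ).cuda()
     collective.send(send, server_rank, "default")
     recv = torch.zeros(1, ).cuda()
     collective.recv(recv, server_rank, "default")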
Example #3
 import torch
 import torch.nn as nn
 import torchvision.models as torchmodels
 import ray.util.collective as collective

 def __init__(self, model, batch_size, world_size, rank, num_ps):
     self.model_type = model
     print("=> creating model '{}'".format(model))
     self.model = torchmodels.__dict__[model]().cuda()
     self.criterion = nn.CrossEntropyLoss().cuda()
     self.batch_size = batch_size
     self.train_loader = self.get_data_loader(self.batch_size)
     self.world_size = world_size
     self.rank = rank
     self.num_ps = num_ps
     self.num_workers = self.world_size - self.num_ps
     self.assignments = None
     # Index i of this list stores the names of the params on the ith server.
     self.name_list = [[] for i in range(num_ps)]
     collective.init_collective_group(world_size, rank, "nccl", "default")
     # Startup handshake with each parameter server (servers occupy the
     # ranks after the workers): send first, then receive, mirroring the
     # server-side recv-then-send order in Example #2.
     for i in range(num_ps):
         send = torch.ones(1, ).cuda()
         collective.send(send, self.num_workers + i, "default")
     for i in range(num_ps):
         recv = torch.ones(1, ).cuda()
         collective.recv(recv, self.num_workers + i, "default")
Example #4
 import ray.util.collective as collective

 def setup(self, world_size, rank):
     collective.init_collective_group(world_size, rank, "nccl", "177")
     return True
Example #5
 import ray.util.collective as collective

 def setup(self, world_size, rank):
     self.rank = rank
     collective.init_collective_group(world_size, rank, "nccl", "8")
     return True
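Examples #4 and #5 differ only in the group name they register ("177" and "8"). Whatever name is chosen at init time must be passed to every later collective call on that group. A minimal sketch, assuming the group "8" from Example #5 and an illustrative tensor shape:

 import torch
 import ray.util.collective as collective

 # Runs an allreduce on the communicator registered under the name "8";
 # omitting group_name would target the "default" group instead.
 tensor = torch.ones(10, ).cuda()
 collective.allreduce(tensor, group_name="8")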
Example #6
 import ray.util.collective as collective

 def setup(self, world_size, rank):
     # Argument order is (world_size, rank, backend, group_name).
     collective.init_collective_group(world_size, rank, "nccl", "default")
     return True
Example #7
 import ray.util.collective as col
 from ray.util.collective.types import Backend

 def init_gloo_group(rank: int, world_size: int, group_name: str):
     col.init_collective_group(world_size, rank, Backend.GLOO, group_name)
     return True
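Unlike the NCCL examples above, a GLOO group communicates over CPU, so it works with plain CPU tensors rather than `.cuda()` ones and needs no GPU per actor. A minimal sketch under that assumption; the group name, tensor shape, and function name are illustrative:

 import torch
 import ray.util.collective as col
 from ray.util.collective.types import Backend

 def cpu_allreduce(world_size: int, rank: int):
     # GLOO needs no GPU: join the group, then allreduce a CPU tensor.
     col.init_collective_group(world_size, rank, Backend.GLOO, "cpu_group")
     tensor = torch.ones(4, )
     col.allreduce(tensor, group_name="cpu_group")
     return tensor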