def dist_sgd(model, rank):
    group = dist.new_group([0, 1, 3])
    for param in model.parameters():
        sending_right = copy.deepcopy(param.data)
        sending_left = copy.deepcopy(sending_right)
        recving_left_1 = copy.deepcopy(sending_right)
        recving_right_1 = copy.deepcopy(sending_right)
        size = dist.get_world_size()
        left = ((rank - 1) + size) % size
        right = (rank + 1) % size
        if rank % 2 == 0:
            req = dist.isend(sending_right, dst=right)
            req.wait()
            req = dist.irecv(recving_left_1, src=left)
            req.wait()
        else:
            req = dist.irecv(recving_left_1, src=left)
            req.wait()
            req = dist.isend(sending_right, dst=right)
            req.wait()
        dist.barrier()
        if rank % 2 == 0:
            req = dist.isend(sending_left, dst=left)
            req.wait()
            req = dist.irecv(recving_right_1, src=right)
            req.wait()
        else:
            req = dist.irecv(recving_right_1, src=right)
            req.wait()
            req = dist.isend(sending_left, dst=left)
            req.wait()
        param.data = (sending_left + recving_left_1 + recving_right_1) / 3
def recv_obj(src, group):
    size = torch.tensor(1, dtype=torch.int32)
    dist.irecv(size, src=src, group=group).wait()
    data = torch.zeros(size=(size,), dtype=torch.int8)
    dist.irecv(data, src=src, group=group).wait()
    buf = data.numpy().tobytes()
    return pickle.loads(buf)
def _setup_generators(self):
    seed = torch.empty(size=(), dtype=torch.long)
    dist.irecv(tensor=seed, src=comm.get().get_ttp_rank(), group=self.group).wait()
    dist.barrier(group=self.group)
    self.generator = torch.Generator()
    self.generator.manual_seed(seed.item())
def recv_obj(self, src, group=None):
    if group is None:
        group = self.main_group
    size = torch.tensor(1, dtype=torch.int32)
    dist.irecv(size, src=src, group=group).wait()
    data = torch.zeros(size=(size,), dtype=torch.int8)
    dist.irecv(data, src=src, group=group).wait()
    buf = data.numpy().tobytes()
    return pickle.loads(buf)
def broadcast(data, rank, world_size, recv_buff_l, recv_buff_r):
    left = ((rank - 1) + world_size) % world_size
    right = (rank + 1) % world_size
    send_req_l = dist.isend(data, dst=left)
    recv_req_r = dist.irecv(recv_buff_r, src=right)
    recv_req_r.wait()
    send_req_l.wait()
    send_req_r = dist.isend(data, dst=right)
    recv_req_l = dist.irecv(recv_buff_l, src=left)
    recv_req_l.wait()
    send_req_r.wait()
def recv_obj(self, src, group=None): """Receives a tensor from a source `src`.""" if group is None: group = self.main_group size = torch.tensor(1, dtype=torch.int32) dist.irecv(size, src=src, group=group).wait() data = torch.empty(size=(size, ), dtype=torch.int8) dist.irecv(data, src=src, group=group).wait() buf = data.numpy().tobytes() return pickle.loads(buf)
def timed_pt2pt(input, args):
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    sync_all()
    # Warmups, establish connections, etc.
    for i in range(args.warmups):
        if dist.get_rank() == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        if dist.get_rank() == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)
    sync_all()

    # time the actual comm op trials times and average it
    pre = time.perf_counter()
    for i in range(args.trials):
        if dist.get_rank() == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        if dist.get_rank() == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)
    sync_all()
    duration = time.perf_counter() - pre

    # maintain and clean performance data
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    n = dist.get_world_size()
    tput, busbw = get_bw('pt2pt', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    if not args.raw:
        size = convert_size(size)

    print_rank_0(
        f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def _setup_generators(self):
    seed = torch.empty(size=(), dtype=torch.long)
    dist.irecv(
        tensor=seed, src=comm.get().get_ttp_rank(), group=self.ttp_group
    ).wait()
    dist.barrier(group=self.ttp_group)

    self.generator = torch.Generator(device="cpu")
    self.generator.manual_seed(seed.item())

    if torch.cuda.is_available():
        self.generator_cuda = torch.Generator(device="cuda")
        self.generator_cuda.manual_seed(seed.item())
    else:
        self.generator_cuda = None
def train_model(model, train_loader, optimizer, criterion, epoch, rank):
    """
    model (torch.nn.Module): The model to train
    train_loader (pytorch data loader): Training data loader
    optimizer (optimizer.*): An instance of some sort of optimizer, usually SGD
    criterion (nn.CrossEntropyLoss): Loss function used to train the network
    epoch (int): Current epoch number
    rank (int): Rank of this process in the process group
    """
    # remember to exit the train loop at end of the epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        train_loss = criterion(output, target)
        train_loss.backward()
        # Exchange gradients with rank 0: push local gradients, then pull
        # the (aggregated) gradients back before taking the optimizer step.
        for p in model.parameters():
            req = dist.isend(tensor=p.grad, dst=0)
            req.wait()
            req = dist.irecv(tensor=p.grad, src=0)
            req.wait()
        optimizer.step()
        if batch_idx % 20 == 0:
            print(batch_idx, "loss: ", train_loss.item())
            now = datetime.now()
        if batch_idx == 10:
            later = datetime.now()
            print("average time: ", (later - now).total_seconds() / 9)
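In `train_model`, every worker pushes each parameter's gradient to rank 0 and then blocks until a gradient comes back, so rank 0 has to run a matching per-batch loop. Below is a hedged sketch of what that rank-0 side could look like; the name `serve_gradients` and the assumption that rank 0 simply averages the workers' gradients are illustrative, not taken from the snippet above.

def serve_gradients(model, world_size):
    # Hypothetical rank-0 loop for a single batch, iterating parameters in the
    # same order as the workers: receive each parameter's gradient from every
    # worker, average them, and send the average back to each worker.
    for p in model.parameters():
        grads = [torch.zeros_like(p.data) for _ in range(world_size - 1)]
        for i, src in enumerate(range(1, world_size)):
            dist.irecv(tensor=grads[i], src=src).wait()
        avg = torch.stack(grads).mean(dim=0)
        for dst in range(1, world_size):
            dist.isend(tensor=avg, dst=dst).wait()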
def _receive_models_from_selected_clients(self, selected_client_ids):
    self.conf.logger.log(f"Master waits to receive the local models.")
    dist.barrier()

    # init the placeholders to recv the local models from workers.
    flatten_local_models = dict()
    for selected_client_id in selected_client_ids:
        arch = self.clientid2arch[selected_client_id]
        client_tb = TensorBuffer(
            list(self.client_models[arch].state_dict().values()))
        client_tb.buffer = torch.zeros_like(client_tb.buffer)
        flatten_local_models[selected_client_id] = client_tb

    # async to receive model from clients.
    reqs = []
    for client_id, world_id in zip(selected_client_ids, self.world_ids):
        req = dist.irecv(tensor=flatten_local_models[client_id].buffer,
                         src=world_id)
        reqs.append(req)
    for req in reqs:
        req.wait()

    dist.barrier()
    self.conf.logger.log(f"Master received all local models.")
    return flatten_local_models
def _pull(self):
    # Receive the central variable from the master.
    for group in self.param_groups:
        for p in group["params"]:
            request = dist.irecv(p.data, src=0)
    # Wait for the last receive call.
    request.wait()
def print_loss_file(file_name, loss_iteration, rank, size):
    if rank != 0:
        data = torch.tensor(loss_iteration)
        dist.barrier()
        req = dist.isend(data, dst=0)
        req.wait()
    else:
        loss_list_tensor = []
        loss_iter = []
        data = torch.tensor(loss_iteration)
        for i in range(size):
            data = copy.deepcopy(data)
            loss_list_tensor.append(data)
        dist.barrier()
        for i in range(size - 1):
            req = dist.irecv(loss_list_tensor[i + 1], src=i + 1)
            req.wait()
        for j in range(len(loss_list_tensor[0])):
            element = 0
            for i in range(size):
                element += loss_list_tensor[i][j].item()
            loss_iter.append(element / size)
    if rank == 0:
        file_object = open(file_name, 'w')
        for loss in loss_iter:
            file_object.write(str(loss))
            file_object.write('\t')
        file_object.close()
def _setup_generators(self): """Setup RNG generator shared between each party (client) and the TTPServer""" seed = torch.empty(size=(), dtype=torch.long) dist.irecv( tensor=seed, src=comm.get().get_ttp_rank(), group=self.ttp_group ).wait() dist.barrier(group=self.ttp_group) self.generator = torch.Generator(device="cpu") self.generator.manual_seed(seed.item()) if torch.cuda.is_available(): self.generator_cuda = torch.Generator(device="cuda") self.generator_cuda.manual_seed(seed.item()) else: self.generator_cuda = None
def _agg(self, data, op):
    """Aggregate data using `op` operation.

    Args:
        data (:obj:`torch.Tensor`): A Tensor to be aggregated.
        op (str): Aggregation methods like `avg`, `sum`, `min`, `max`, etc.

    Returns:
        :obj:`torch.Tensor`: An aggregated tensor.
    """
    # Create some tensors to host the values from neighborhood.
    local_data = {i: torch.zeros_like(data) for i in self.neighbors}
    local_data[self.rank] = data

    reqs = []
    for node in self.neighbors:
        reqs.append(dist.isend(tensor=local_data[self.rank], dst=node))
        reqs.append(dist.irecv(tensor=local_data[node], src=node))
    for req in reqs:
        req.wait()

    # Aggregate local_data
    if op == "avg":
        output = sum(local_data.values()) / (len(self.neighbors) + 1)
    else:
        raise NotImplementedError("op {} is not supported yet.".format(op))
    return output
def irecv(self, shape=None, dtype=None):
    if shape is None:
        shape = self.defaultShape
    if dtype is None:
        dtype = self.defaultDType
    self.tensor = torch.zeros(shape, dtype=dtype)
    self.tensor[0] = SEMAPHORE
    self.req = dist.irecv(self.tensor, src=self.rank)
def get(self,
        key: str,
        dst: Optional[torch.Tensor] = None,
        shared: bool = False) -> Optional[torch.Tensor]:
    """Get a tensor from the server."""
    self._validate_get(key, dst=dst, shared=shared)
    cmd_rpc = torch.tensor(
        [GET_CMD, len(key), dst is None, 0, 0, 0], dtype=torch.long)
    metadata_pg = self._metadata_pg()
    td.send(cmd_rpc, self.server_rank, group=metadata_pg)
    td.send(_fromstring(key), self.server_rank, group=metadata_pg)
    if dst is None:
        meta = torch.full((2,), -1, dtype=torch.long)
        td.recv(meta, src=self.server_rank, group=metadata_pg)
        ndim, ttype = meta
        if ndim.item() == -1:
            return None
        size = torch.full((ndim.item(),), -1, dtype=torch.long)
        td.recv(size, src=self.server_rank, group=metadata_pg)
        dtype = _dtypes[ttype.item()]
        if shared:
            dst = allocate_shared_tensor(size.tolist(), dtype=dtype)
        else:
            dst = torch.empty(size.tolist(), dtype=dtype)

    start_t = time.monotonic()
    data_pgs = self._data_pgs()
    if data_pgs is None:
        td.recv(dst, src=self.server_rank)
    else:
        outstanding_work = []
        flattened_dst = dst.flatten()
        flattened_size = flattened_dst.shape[0]
        for idx, (pg, slice_) in enumerate(
                zip(
                    data_pgs,
                    split_almost_equally(flattened_size, num_parts=len(data_pgs)),
                )):
            outstanding_work.append(
                td.irecv(
                    tensor=flattened_dst[slice_],
                    src=self.server_rank,
                    group=pg,
                    tag=idx,
                ))
        for w in outstanding_work:
            w.wait()
    end_t = time.monotonic()
    if self.log_stats:
        stats_size = dst.numel() * dst.element_size()
        stats_time = end_t - start_t
        logger.debug(
            f"Received tensor {key} from server {self.server_rank}: "
            f"{stats_size:,} bytes "
            f"in {stats_time:,g} seconds "
            f"=> {stats_size / stats_time:,.0f} B/s")
    return dst
def handle_store(
    self,
    rank: int,
    key: str,
    ndim: int,
    accum: int,
    overwrite: int,
    ttype: int,
) -> None:
    if ndim == -1:
        assert key in self.parameters
        size = self.parameters[key].size()
    else:
        size = torch.empty((ndim,), dtype=torch.long)
        td.recv(size, src=rank)
        size = size.tolist()
    dtype = _dtypes[ttype]
    if not accum and overwrite and key in self.parameters:
        # avoid holding onto 2x the memory
        del self.parameters[key]
    data = torch.empty(size, dtype=dtype)

    start_t = time.monotonic()
    if self.groups is None:
        td.recv(tensor=data, src=rank)
    else:
        outstanding_work = []
        flattened_data = data.flatten()
        flattened_size = flattened_data.shape[0]
        for idx, (pg, slice_) in enumerate(
                zip(
                    self.groups,
                    split_almost_equally(flattened_size, num_parts=len(self.groups)))):
            outstanding_work.append(
                td.irecv(tensor=flattened_data[slice_], src=rank, group=pg, tag=idx))
        for w in outstanding_work:
            w.wait()
    end_t = time.monotonic()
    if self.log_stats:
        stats_size = data.numel() * data.element_size()
        stats_time = end_t - start_t
        logger.debug(f"Received tensor {key} from client {rank}: "
                     f"{stats_size:,} bytes "
                     f"in {stats_time:,g} seconds "
                     f"=> {stats_size / stats_time:,.0f} B/s")

    if accum:
        self.parameters[key] += data
    elif (key not in self.parameters) or overwrite:
        self.parameters[key] = data
def my_igather(self, rank, size, group, sendbuf, recvbuf, root):
    req = []
    if rank == root:
        for idx in range(size):
            if idx != rank:
                req.append(dist.irecv(recvbuf[idx], src=idx, group=group))
            else:
                recvbuf[rank] = sendbuf
    else:
        req.append(dist.isend(sendbuf, group=group, dst=root))
    return req
def dist_data(data, src=0, TType=torch.DoubleTensor):
    '''
    distribute a row-major matrix
    '''
    rank = dist.get_rank()
    size = dist.get_world_size()
    if rank == src:
        p = data.shape[0]
        q = data.shape[1]
        shape = torch.LongTensor([p, q])
        if p % size != 0:
            sizes = [p // size + 1 for i in range(size - 1)
                     ] + [p // size + 1 - (size - p % size)]
        else:
            sizes = [p // size for i in range(size)]
        p, q = shape[0], shape[1]
        sizes = torch.LongTensor(sizes)
        dist.broadcast(shape, src)
        dist.broadcast(sizes, src)
    else:
        shape = torch.LongTensor(2)
        sizes = torch.LongTensor(size)
        dist.broadcast(shape, src)
        dist.broadcast(sizes, src)

    shape = list(shape)
    p, q = shape[0], shape[1]
    p_chunk = sizes[rank].item()
    q_chunk = q.item()
    # print(rank, p_chunk, q_chunk)
    chunk = TType(p_chunk, q_chunk)

    reqs = []
    if rank == src:
        data_ = TType(data.shape).copy_(data)
        sizes_int = tuple(x.item() for x in tuple(sizes))
        data_split = torch.split(data_, sizes_int)
        chunk.copy_(data_split[src])
        for i in range(size):
            if i == src:
                continue
            synchronize()
            reqs.append(dist.isend(data_split[i], i))
    else:
        synchronize()
        reqs.append(dist.irecv(chunk, src))
    for req in reqs:
        req.wait()
    dist.barrier()
    return THDistMat(shape, sizes, chunk, True)
def _agg(self, data, op, force_wait=True):
    """Aggregate data using `op` operation.

    Args:
        data (:obj:`torch.Tensor`): A Tensor to be aggregated.
        op (str): Aggregation methods like `avg`, `sum`, `min`, `max`, `weighted`, etc.

    Returns:
        :obj:`torch.Tensor`: An aggregated tensor.
    """
    with self.timer('com/preparation'):
        # Create some tensors to host the values from neighborhood.
        local_data = {
            i: torch.empty_like(data) for i in self.neighbor_ranks
        }
        local_data[self.rank] = data

    with self.timer('com/data_exchange'):
        # async send data.
        reqs = []
        for node_rank in self.neighbor_ranks:
            with self.timer('com/isend'):
                reqs.append(
                    dist.isend(tensor=local_data[self.rank], dst=node_rank))
            with self.timer('com/irecv'):
                reqs.append(
                    dist.irecv(tensor=local_data[node_rank], src=node_rank))

    # wait until finish.
    if force_wait:
        with self.timer('com/force_wait'):
            self.complete_wait(reqs)

        # Aggregate local_data
        if op == "avg":
            output = sum(local_data.values()) / (self.world_size + 1)
        elif op == "weighted":
            output = sum([
                tensor * self.neighbors_info[rank]
                for rank, tensor in local_data.items()
            ])
        elif op == "get_raw_sync_data":
            output = local_data
        else:
            raise NotImplementedError(
                "op {} is not supported yet.".format(op))
        return output
    else:
        if op == "get_raw_sync_data":
            return reqs, local_data
        else:
            raise NotImplementedError(
                "op {} is not supported yet.".format(op))
def irecv(self, collectiveArgs, src_rank, retFlag=False, tag=0):
    retObj = dist.irecv(
        tensor=collectiveArgs.opTensor, src=src_rank, group=collectiveArgs.group, tag=tag
    )
    collectiveArgs.waitObj.append(retObj)
    if retFlag:
        return retObj
def run(rank, size):
    tensor = torch.zeros(1)
    req = None
    if rank == 0:
        tensor += 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()
    print('Rank ', rank, ' has data ', tensor[0])
def p2p_test_non_block(rank, size):
    tensor = torch.tensor(rank, dtype=torch.int32)
    req = None
    if rank % 2 == 0:
        dst = rank + 1
        req = dist.isend(tensor=tensor, dst=dst)
    else:
        src = rank - 1
        req = dist.irecv(tensor=tensor, src=src)
    req.wait()
    print('Rank %d value %d' % (rank, tensor.item()))
def run(tensor):
    req = None
    for nodes in range(num_nodes):
        if rank == 0:
            # Send the tensor to process 1
            req = dist.isend(tensor=tensor, dst=1)
            print('Rank 0 started sending to Rank 1')
        else:
            # Receive tensor from process 0
            req = dist.irecv(tensor=tensor, src=0)
            print('Rank 1 started receiving')
    # Wait on the request so the transfer has actually completed before
    # reporting that the data was received.
    req.wait()
    print('Rank ', rank, ' has received data ')
    return
def run1(rank, size):
    """Non-blocking point-to-point communication."""
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()
    print('Rank ', rank, ' has data ', tensor[0])
def run(rank, size):
    tensor = torch.zeros(1)
    req = None
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()  # wait here for communication to finish
    print('Rank ', rank, ' has data ', tensor[0])
def run_send_recv_nonblocking(rank, size):
    tensor = torch.zeros(1)
    req = None
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()
    print('Rank ', rank, ' has data ', tensor[0])
def p2p_run(rank, size):
    """ Distributed function to be implemented later. """
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()
    print('Rank ', rank, ' has data ', tensor[0])
def run_nonblocking(rank, size):
    """Non-blocking point-to-point communication."""
    tensor = torch.zeros(1)
    req = None
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print("Rank 0 started sending")
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
    req.wait()
    print("Rank ", rank, ' has data ', tensor[0])
def run(rank, size): """Non-Blocking point to point communication.""" tensor = torch.zeros(1) req = None if rank == 0: # The master node is going to send the tensor with data `1` to node 1. tensor += 1 req = dist.isend(tensor=tensor, dst=1) print("Rank 0 started sending") else: # The other nodes will receive the tensor with data `1`. req = dist.irecv(tensor=tensor, src=0) print("Rank 1 started receiving") req.wait() # Guarantee that the communication took place print("RANK ", rank, " has data ", tensor[0])
def test_irecv(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
        requests = [
            dist.irecv(expected_tensors[src - 1], src) for src in range(1, world_size)
        ]

        for src in range(1, world_size):
            requests[src - 1].wait()
            self.assertTrue(requests[src - 1].is_completed())
            self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
    else:
        tensor = _build_tensor(rank, 10)
        dist.send(tensor, 0)

    self._barrier()