def _next_worker(self):
    data, address = self._socket.recvfrom(10)
    data = data.decode().split('.')[0]
    rank = int(data)
    dist.isend(self.rank.data, rank)
    return rank
def allreduce(send, recv):
    rank = dist.get_rank()
    size = dist.get_world_size()
    device = torch.device("cuda:" + str(rank) if torch.cuda.is_available() else "cpu")
    # send_buff must start out holding the local data, otherwise the ring
    # would circulate zeros; accum collects the running sum.
    send_buff = send.clone().to(device)
    recv_buff = torch.zeros(send.size()).to(device)
    accum = torch.zeros(send.size()).to(device)
    accum[:] = send[:]

    left = ((rank - 1) + size) % size
    right = (rank + 1) % size

    for i in range(size - 1):
        if i % 2 == 0:
            # Send send_buff, accumulate what arrives from the left.
            send_req = dist.isend(send_buff, right)
            dist.recv(recv_buff, left)
            accum[:] += recv_buff[:]
        else:
            # Send recv_buff, accumulate what arrives from the left.
            send_req = dist.isend(recv_buff, right)
            dist.recv(send_buff, left)
            accum[:] += send_buff[:]
        send_req.wait()
    recv[:] = accum[:]
def main_func(numProcesses, group, queue, queue_exit):
    print('main func running')
    starting_time2 = time.time()
    exited_processes = []
    while True:
        # Drain the queue for up to 10 ms, or until two tensors have arrived.
        recv_tensors = []
        start_time = time.time()
        while True:
            if not queue.empty():
                rec_tens = queue.get()
                recv_tensors.append(rec_tens)
            if len(recv_tensors) > 1 or time.time() - start_time > 0.01:
                break
        for r in recv_tensors:
            # The last element of each tensor is the rank it came from.
            print('sending it back: ', r[:-1].clone() + 3)
            print('destination: ', int(r[-1].item()))
            dist.isend(tensor=r[:-1].clone() + 3, dst=int(r[-1].item()))
        if not queue_exit.empty():
            exited_processes.append(queue_exit.get())
        if len(exited_processes) == numProcesses - 1:
            print('everyones exited')
            print(time.time() - starting_time2)
            exit()
def allreduce(send, recv): """ Implementation of a ring-reduce. """ rank = dist.get_rank() size = dist.get_world_size() send_buff = th.zeros(send.size()) recv_buff = th.zeros(send.size()) accum = th.zeros(send.size()) accum[:] = send[:] # th.cuda.synchronize() left = ((rank - 1) + size) % size right = (rank + 1) % size for i in range(size - 1): if i % 2 == 0: # Send send_buff send_req = dist.isend(send_buff, right) dist.recv(recv_buff, left) accum[:] += recv[:] else: # Send recv_buff send_req = dist.isend(recv_buff, right) dist.recv(send_buff, left) accum[:] += send[:] send_req.wait() # th.cuda.synchronize() recv[:] = accum[:]
def dist_sgd(model, rank):
    group = dist.new_group([0, 1, 3])
    for param in model.parameters():
        sending_right = copy.deepcopy(param.data)
        sending_left = copy.deepcopy(sending_right)
        recving_left_1 = copy.deepcopy(sending_right)
        recving_right_1 = copy.deepcopy(sending_right)
        size = dist.get_world_size()
        left = ((rank - 1) + size) % size
        right = (rank + 1) % size
        # Exchange with the right neighbour; even ranks send first to avoid deadlock.
        if rank % 2 == 0:
            req = dist.isend(sending_right, dst=right)
            req.wait()
            req = dist.irecv(recving_left_1, src=left)
            req.wait()
        else:
            req = dist.irecv(recving_left_1, src=left)
            req.wait()
            req = dist.isend(sending_right, dst=right)
            req.wait()
        dist.barrier()
        # Exchange with the left neighbour.
        if rank % 2 == 0:
            req = dist.isend(sending_left, dst=left)
            req.wait()
            req = dist.irecv(recving_right_1, src=right)
            req.wait()
        else:
            req = dist.irecv(recving_right_1, src=right)
            req.wait()
            req = dist.isend(sending_left, dst=left)
            req.wait()
        # Average the local copy with both neighbours' parameters.
        param.data = (sending_left + recving_left_1 + recving_right_1) / 3
def run(i):
    global rank
    rank = i
    dist.init_process_group(backend="gloo",
                            init_method="env://",
                            timeout=timedelta(seconds=2.5),
                            world_size=world_size,
                            rank=rank)
    if i == 0:
        time.sleep(2000)
        print("Process 0 exit.")
        exit(-1)
    t = time.time()
    for j in range(messages):
        for r in range(world_size):
            print("round {}: {}, {} begin".format(j, rank, r))
            try:
                dist.isend(storage, r)
                # dist.recv(storage1)
            except RuntimeError:
                print("round {}: {}, {} error".format(j, rank, r))
            else:
                print("round {}: {}, {}, {:.5f}".format(j, rank, r, time.time() - t))
    print(time.time() - t)
    dist.destroy_process_group()
def transfer(tag, send_buf, shape):
    left, right = get_left_right(tag)
    if shape is None:
        # Send only.
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None
    elif not torch.is_tensor(send_buf):
        # Receive only.
        try:
            recv_buf = torch.zeros(shape)  # , dtype=torch.int8
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError:
            print("runtime error..")
            return None
        return recv_buf
    else:
        # Send and receive concurrently.
        send_opt = dist.isend(tensor=send_buf, dst=right)
        try:
            recv_buf = torch.zeros(shape)
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
def allreduce(r, world, peers, tensor):
    r = dist.get_rank()
    world = dist.get_world_size()
    peers = list(filter(lambda i: i != r, list(range(world))))
    sizeOfTensor = list(tensor.size())[0]
    chunksize = sizeOfTensor // world
    # Reduce-scatter: send every peer the chunk it is responsible for.
    reqs = [
        dist.isend(tensor=tensor[i * chunksize:(i + 1) * chunksize], dst=i)
        for i in peers
    ]  # K concurrent transfers
    recv = torch.zeros(chunksize)
    for i in peers:  # K steps
        dist.recv(tensor=recv, src=i)
        tensor[r * chunksize:(r + 1) * chunksize] += recv[:]
    for req in reqs:
        req.wait()
    # All-gather: broadcast the fully reduced chunk we own back to every peer
    # (the chunks we are not responsible for are overwritten on the way back).
    reqs = [
        dist.isend(tensor=tensor[r * chunksize:(r + 1) * chunksize], dst=i)
        for i in peers
    ]
    for i in peers:
        dist.recv(tensor=recv, src=i)
        tensor[i * chunksize:(i + 1) * chunksize] = recv
    for req in reqs:
        req.wait()
def transfer4backend1(tag, send_buf, flag=False):
    if not flag:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1), dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None
    else:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1), dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        try:
            shape_buf = torch.zeros([1], dtype=torch.short)
            dist.recv(tensor=shape_buf, src=left)
            recv_buf = torch.zeros(torch.Size(shape_buf.tolist()))
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
def _commit(self):
    # Initiate parameter sharing with the master.
    self._enqueue_master()
    dist.recv(self.rank, src=0)
    # Send the update to the master.
    for group in self.param_groups:
        for p in group["params"]:
            dist.isend(p.grad.data, dst=0)
def send_message(message_code, payload, dst=0):
    """Sends a message to a destination.

    Concatenates the sender rank and the message code with the payload
    into a single tensor, then sends that tensor.
    """
    _LOGGER.info("SENDING MESSAGE: {} RANK: {}".format(message_code, dist.get_rank()))
    m_parameter = torch.Tensor([dist.get_rank(), message_code.value])
    m_parameter = torch.cat((m_parameter, payload))
    dist.isend(tensor=m_parameter, dst=dst)
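# A hedged sketch of the matching receive side; recv_message is a hypothetical
# name, and it assumes the payload length is known in advance, since dist.recv
# needs a pre-allocated buffer sized for [rank, code, *payload].
def recv_message(payload_size, src=None):
    m_parameter = torch.zeros(2 + payload_size)
    dist.recv(tensor=m_parameter, src=src)
    sender_rank = int(m_parameter[0].item())
    code_value = int(m_parameter[1].item())  # map back to the message-code enum if needed
    payload = m_parameter[2:]
    return sender_rank, code_value, payload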
def test_send_recv_obj(self):
    TEST_OBJECTS = [
        {"a": 1, "b": 2, "c": 3},
        torch.tensor(1),
        torch.nn.Linear(10, 5),
        CNN(),
    ]
    for param in TEST_OBJECTS[2].parameters():
        param.data.fill_(1.0)
    for param in TEST_OBJECTS[3].parameters():
        param.data.fill_(1.0)
    serial.register_safe_class(CNN)

    for reference in TEST_OBJECTS:
        for src in range(self.world_size):
            if self.rank == src:
                test_obj = reference
                comm.get().send_obj(test_obj, 1 - self.rank)
            else:
                test_obj = comm.get().recv_obj(1 - self.rank)

            if isinstance(reference, torch.nn.Module):
                test_obj_params = list(test_obj.parameters())
                reference_params = list(reference.parameters())
                for i, param in enumerate(reference_params):
                    self.assertTrue(
                        test_obj_params[i].eq(param).all(), "broadcast_obj failed"
                    )
            else:
                self.assertEqual(test_obj, reference, "broadcast_obj failed")

    # Test that the restricted loader will raise an error for code injection
    for invalid_obj in INVALID_SERIALIZED_OBJECTS:
        for src in range(self.world_size):
            if self.rank == src:
                # Mimic send_obj without pickling invalid bytestream
                size = torch.tensor(len(invalid_obj), dtype=torch.int32)
                arr = torch.from_numpy(
                    numpy.frombuffer(invalid_obj, dtype=numpy.int8)
                )
                r0 = dist.isend(size, dst=(1 - self.rank), group=comm.get().main_group)
                r1 = dist.isend(arr, dst=(1 - self.rank), group=comm.get().main_group)
                r0.wait()
                r1.wait()
            else:
                with self.assertRaises(ValueError):
                    comm.get().recv_obj(1 - self.rank)
def eval(layer, logger, args, targets_queue, e, save_event, data_size, testloader):
    dist.init_process_group(backend='tcp', init_method=args.path,
                            world_size=args.size, rank=args.rank)
    criterion = nn.CrossEntropyLoss()
    criterion.cuda()
    layer.eval()
    with torch.no_grad():
        if dist.get_rank() == 0:
            for batch_idx, (inputs, targets) in enumerate(testloader):
                inputs, targets = inputs.cuda(0), targets
                outputs = layer(inputs)
                targets_queue.put(targets.numpy())
                print(outputs.size())
                send_opt = dist.isend(tensor=outputs.cpu(), dst=1)
                send_opt.wait()
            # A zero-sized tensor signals the end of the test set.
            send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
            send_opt.wait()
            e.wait()
        elif dist.get_rank() == 1:
            batch_idx = 0
            test_loss = 0
            correct = 0
            total = 0
            save_event.clear()
            global best_acc
            while True:
                try:
                    rec_val = torch.zeros([100, 128, 16, 16])
                    dist.recv(tensor=rec_val, src=0)
                except RuntimeError:
                    print("done....")
                    acc = 100. * correct / total
                    if acc > best_acc:
                        best_acc = acc
                        save_event.set()
                    e.set()
                    break
                outputs = layer(rec_val.cuda(1))
                targets = targets_queue.get(block=True, timeout=2)
                targets = torch.from_numpy(targets).cuda(1)
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(batch_idx, data_size,
                             'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                             % (test_loss / (batch_idx + 1),
                                100. * correct / total, correct, total))
                if batch_idx % 10 == 0:
                    logger.error("eval:" + str(test_loss / (batch_idx + 1)))
                batch_idx += 1
def send_obj(obj, dst, group):
    buf = pickle.dumps(obj)
    size = torch.tensor(len(buf), dtype=torch.int32)
    arr = torch.from_numpy(numpy.frombuffer(buf, dtype=numpy.int8))
    r0 = dist.isend(size, dst=dst, group=group)
    r1 = dist.isend(arr, dst=dst, group=group)
    r0.wait()
    r1.wait()
def broadcast(data, rank, world_size, recv_buff_l, recv_buff_r):
    left = ((rank - 1) + world_size) % world_size
    right = (rank + 1) % world_size

    send_req_l = dist.isend(data, dst=left)
    recv_req_r = dist.irecv(recv_buff_r, src=right)
    recv_req_r.wait()
    send_req_l.wait()

    send_req_r = dist.isend(data, dst=right)
    recv_req_l = dist.irecv(recv_buff_l, src=left)
    recv_req_l.wait()
    send_req_r.wait()
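# A small sketch exercising the neighbour exchange above, assuming the default
# process group is already initialized (e.g. by a spawn driver like the one
# after the ring all-reduce); demo_broadcast is a hypothetical name.
def demo_broadcast():
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    data = torch.full((4,), float(rank))
    recv_buff_l = torch.zeros(4)
    recv_buff_r = torch.zeros(4)
    broadcast(data, rank, world_size, recv_buff_l, recv_buff_r)
    # recv_buff_l now holds the left neighbour's data, recv_buff_r the right's.
    print(rank, recv_buff_l, recv_buff_r)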
def run(qs, layer):
    if dist.get_rank() == 0:
        input_x = torch.ones(20, requires_grad=True)
        output_x = layer(input_x)
        input_x.share_memory_()
        qs[0].put(input_x)
        send_opt = dist.isend(tensor=output_x, dst=1)
        send_opt.wait()
        time.sleep(20)
    elif dist.get_rank() == 1:
        rec_val = torch.zeros(10, requires_grad=True)
        dist.recv(tensor=rec_val, src=0)
        rec_val.share_memory_()
        qs[1].put(rec_val)
        send_opt = dist.isend(tensor=torch.ones(1), dst=2)
        send_opt.wait()
        time.sleep(20)
    elif dist.get_rank() == 2:
        rec_val = torch.randn(1)
        dist.recv(tensor=rec_val, src=1)
        input_x = qs[1].get()
        input_x.requires_grad_()
        output_x = layer(input_x)
        optimizer = optim.SGD(layer.parameters(), lr=0.01)
        optimizer.zero_grad()
        criterion = nn.MSELoss()
        target_v = torch.randn(1)
        loss = criterion(output_x, target_v)
        loss.backward()
        optimizer.step()
        send_opt = dist.isend(tensor=input_x.grad, dst=3)
        send_opt.wait()
        time.sleep(10)
    elif dist.get_rank() == 3:
        back_grad = torch.zeros(10, requires_grad=True)
        dist.recv(tensor=back_grad, src=2)
        input_x = qs[0].get()
        input_x.requires_grad_()
        output_x = layer(input_x)
        optimizer = optim.SGD(layer.parameters(), lr=0.01)
        optimizer.zero_grad()
        output_x.backward(back_grad)
        optimizer.step()
        time.sleep(10)
def eval(layer, logger, args, targets_queue, e, save_event, data_size, testloader):
    criterion = nn.CrossEntropyLoss()
    criterion.cuda(1)
    layer.eval()
    with torch.no_grad():
        if dist.get_rank() == 0:
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print('batch_idx: ' + str(batch_idx))
                inputs, targets = inputs.cuda(0), targets
                outputs = layer(inputs)
                targets_queue.put(targets.numpy())
                send_opt = dist.isend(tensor=q_act(outputs, char=True).cpu(), dst=1)
                send_opt.wait()
            send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
            send_opt.wait()
            e.wait()
        elif dist.get_rank() == 1:
            batch_idx = 0
            test_loss = 0
            correct = 0
            total = 0
            save_event.clear()
            global best_acc
            while True:
                try:
                    rec_val = torch.zeros([100, 256, 4, 4], dtype=torch.int8)
                    dist.recv(tensor=rec_val, src=0)
                except RuntimeError:
                    print("done....")
                    acc = 100. * correct / total
                    if acc > best_acc:
                        best_acc = acc
                        save_event.set()
                    e.set()
                    break
                rec_val = dq_act(rec_val)
                outputs = layer(rec_val.cuda(1))
                targets = targets_queue.get(block=True, timeout=2)
                targets = torch.from_numpy(targets).cuda(1)
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(batch_idx, data_size,
                             'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                             % (test_loss / (batch_idx + 1),
                                100. * correct / total, correct, total))
                # if batch_idx % 10 == 0:
                logger.error("eval:" + str(test_loss / (batch_idx + 1)))
                batch_idx += 1
            acc_str = "eacc: %.3f" % (100. * correct / total,)
            logger.error(acc_str)
def send_obj(self, obj, dst, group=None):
    if group is None:
        group = self.main_group
    buf = pickle.dumps(obj)
    size = torch.tensor(len(buf), dtype=torch.int32)
    arr = torch.from_numpy(numpy.frombuffer(buf, dtype=numpy.int8))
    r0 = dist.isend(size, dst=dst, group=group)
    r1 = dist.isend(arr, dst=dst, group=group)
    r0.wait()
    r1.wait()
def timed_pt2pt(input, args):
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    sync_all()
    # Warmups, establish connections, etc.
    for i in range(args.warmups):
        if dist.get_rank() == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        if dist.get_rank() == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)
    sync_all()

    # Time the actual comm op `trials` times and average it.
    pre = time.perf_counter()
    for i in range(args.trials):
        if dist.get_rank() == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        if dist.get_rank() == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)
    sync_all()
    duration = time.perf_counter() - pre

    # Maintain and clean performance data.
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    n = dist.get_world_size()
    tput, busbw = get_bw('pt2pt', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    if not args.raw:
        size = convert_size(size)

    print_rank_0(f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def send_obj(self, obj, dst, group=None):
    """Sends the specified object to the destination `dst`."""
    if group is None:
        group = self.main_group
    buf = pickle.dumps(obj)
    size = torch.tensor(len(buf), dtype=torch.int32)
    arr = torch.from_numpy(numpy.frombuffer(buf, dtype=numpy.int8))
    r0 = dist.isend(size, dst=dst, group=group)
    r1 = dist.isend(arr, dst=dst, group=group)
    r0.wait()
    r1.wait()
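# A hedged sketch of the matching recv_obj: it mirrors the two isends above by
# receiving the size first, then the pickled byte array. Assumes `src` and
# `group` identify the sender; note that pickle.loads on untrusted bytes is
# unsafe, which is what the restricted loader tested earlier guards against.
import pickle
import torch
import torch.distributed as dist

def recv_obj(self, src, group=None):
    if group is None:
        group = self.main_group
    size = torch.tensor(0, dtype=torch.int32)
    dist.recv(size, src=src, group=group)
    arr = torch.zeros(int(size.item()), dtype=torch.int8)
    dist.recv(arr, src=src, group=group)
    return pickle.loads(arr.numpy().tobytes())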
def ms_allreduce(r, world, peers, tensor, quantize, unquantize, numberOfThreads=1):
    r = dist.get_rank()
    arraySize = list(tensor.size())[0]
    acc = torch.zeros(arraySize)
    world = dist.get_world_size()
    chunksize = arraySize // world
    assert chunksize % dataSz == 0
    acc[r * chunksize:(r + 1) * chunksize] = tensor[r * chunksize:(r + 1) * chunksize]
    reqs = []
    # Reduce-scatter: send each peer its (quantized) chunk.
    for i in range(world):  # K steps
        if i != r:
            chunk = tensor[i * chunksize:(i + 1) * chunksize]
            qchunk = quantize(chunk, numberOfThreads)
            reqs += [dist.isend(tensor=qchunk, dst=i)]  # K concurrent transfers
    recv = torch.zeros(arraySize // (dataSz * world), dtype=torch.int32)
    for i in range(world):  # K steps
        if i != r:
            dist.recv(tensor=recv, src=i)
            chunk = unquantize(recv, numberOfThreads)
            acc[r * chunksize:(r + 1) * chunksize] += chunk
    for req in reqs:
        req.wait()
    reqs = []
    # Naive all-gather: send the reduced chunk we own to every peer.
    for i in range(world):
        if i != r:
            chunk = acc[r * chunksize:(r + 1) * chunksize]
            qchunk = quantize(chunk, numberOfThreads)
            reqs += [dist.isend(tensor=qchunk, dst=i)]
    for i in range(world):
        if i != r:
            dist.recv(tensor=recv, src=i)
            chunk = unquantize(recv, numberOfThreads)
            acc[i * chunksize:(i + 1) * chunksize] += chunk
    for req in reqs:
        req.wait()
    tensor[:] = acc[:]
def _agg(self, data, op):
    """Aggregate data using `op` operation.

    Args:
        data (:obj:`torch.Tensor`): A Tensor to be aggregated.
        op (str): Aggregation methods like `avg`, `sum`, `min`, `max`, etc.

    Returns:
        :obj:`torch.Tensor`: An aggregated tensor.
    """
    # Create some tensors to host the values from the neighborhood.
    local_data = {i: torch.zeros_like(data) for i in self.neighbors}
    local_data[self.rank] = data

    reqs = []
    for node in self.neighbors:
        reqs.append(dist.isend(tensor=local_data[self.rank], dst=node))
        reqs.append(dist.irecv(tensor=local_data[node], src=node))
    for req in reqs:
        req.wait()

    # Aggregate local_data
    if op == "avg":
        output = sum(local_data.values()) / (len(self.neighbors) + 1)
    else:
        raise NotImplementedError("op {} is not supported yet.".format(op))
    return output
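# A usage sketch for _agg above: in decentralized SGD each node typically
# replaces its parameters by the neighborhood average after a local step.
# _average_parameters is a hypothetical helper, assuming `self` is the same
# object that defines _agg, self.neighbors, and self.rank.
def _average_parameters(self, model):
    for param in model.parameters():
        param.data = self._agg(param.data, op="avg")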
def alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list):
    """Each process scatters a list of input tensors to all processes in the
    cluster and returns the gathered tensors in the output list.

    Parameters
    ----------
    rank : int
        The rank of the current worker
    world_size : int
        The size of the entire cluster
    output_tensor_list : list of tensor
        The received tensors
    input_tensor_list : list of tensor
        The tensors to exchange
    """
    # Send a tensor to each target trainer using torch.distributed.isend
    # (isend is async).
    senders = []
    for i in range(world_size):
        if i == rank:
            output_tensor_list[i] = input_tensor_list[i].to(th.device('cpu'))
        else:
            sender = dist.isend(input_tensor_list[i].to(th.device('cpu')), dst=i)
            senders.append(sender)
    for i in range(world_size):
        if i != rank:
            dist.recv(output_tensor_list[i], src=i)
    th.distributed.barrier()
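# A usage sketch for alltoallv_cpu, assuming the default process group is
# initialized with the gloo backend; demo_alltoallv is a hypothetical name.
# Rank r sends a length-(i + 1) tensor filled with r to each rank i, so each
# receive buffer must be pre-sized to match what the sender will transmit.
def demo_alltoallv(rank, world_size):
    input_tensor_list = [th.full((i + 1,), float(rank)) for i in range(world_size)]
    output_tensor_list = [th.zeros(rank + 1) for _ in range(world_size)]
    alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list)
    # output_tensor_list[i] is now a length-(rank + 1) tensor filled with i.
    print(rank, output_tensor_list)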
def backward_rank1(semaphore, start_event, start_event2):
    start_event.wait()
    batch_idx = 0
    while True:
        try:
            # semaphore.release()
            print("before grad recv...")
            grad_recv1 = torch.zeros([args.batch_size, 512, 2, 2], dtype=torch.int8)
            dist.recv(tensor=grad_recv1, src=2)
            print("after grad recv.....")
        except RuntimeError:
            print("backward runtime error")
            send_opt = dist.isend(tensor=torch.zeros(0), dst=0)
            send_opt.wait()
            break
        grad_recv1 = dequantize(grad_recv1.cuda(0).float())
        inputs, outputs = outputs_queue.get(block=False)
        inputs.requires_grad_()
        outputs.backward(grad_recv1)
        if batch_idx % args.buffer_size == 0:
            optimizer.step()
            optimizer.zero_grad()
        inputs_grad = quantize(inputs.grad, char=True).cpu()
        print(inputs_grad.size())
        if batch_idx == 0:
            start_event2.set()
        # send_opt = dist.isend(tensor=inputs_grad, dst=0)
        # send_opt.wait()
        dist.send(tensor=inputs_grad, dst=0)
        batch_idx += 1
def train_model(model, train_loader, optimizer, criterion, epoch, rank):
    """
    model (torch.nn.module): The model created to train
    train_loader (pytorch data loader): Training data loader
    optimizer (optimizer.*): An instance of some sort of optimizer, usually SGD
    criterion (nn.CrossEntropyLoss): Loss function used to train the network
    epoch (int): Current epoch number
    rank (int): Rank of this worker process
    """
    # Remember to exit the train loop at the end of the epoch.
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        train_loss = criterion(output, target)
        train_loss.backward()
        # Ship each gradient to the server (rank 0) and wait for the
        # averaged gradient to come back before stepping.
        for p in model.parameters():
            req = dist.isend(tensor=p.grad, dst=0)
            req.wait()
            req = dist.irecv(tensor=p.grad, src=0)
            req.wait()
        optimizer.step()
        if batch_idx % 20 == 0:
            print(batch_idx, "loss: ", train_loss.item())
            now = datetime.now()
        if batch_idx == 10:
            later = datetime.now()
            print("average time: ", (later - now).total_seconds() / 9)
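# A hedged sketch of the rank-0 counterpart that train_model above implies:
# receive one gradient per worker for each parameter, average them, and send
# the result back. The name server_loop and the num_batches argument are
# hypothetical; it assumes workers occupy ranks 1..num_workers.
import torch
import torch.distributed as dist

def server_loop(model, num_workers, num_batches):
    for _ in range(num_batches):
        for p in model.parameters():
            acc = torch.zeros_like(p.data)
            buf = torch.zeros_like(p.data)
            # Gather one gradient per worker and sum them.
            for src in range(1, num_workers + 1):
                dist.recv(tensor=buf, src=src)
                acc += buf
            acc /= num_workers
            # Return the averaged gradient to every worker.
            reqs = [dist.isend(tensor=acc, dst=dst)
                    for dst in range(1, num_workers + 1)]
            for req in reqs:
                req.wait()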
def print_loss_file(file_name, loss_iteration, rank, size):
    if rank != 0:
        data = torch.tensor(loss_iteration)
        dist.barrier()
        req = dist.isend(data, dst=0)
        req.wait()
    else:
        loss_list_tensor = []
        loss_iter = []
        data = torch.tensor(loss_iteration)
        for i in range(size):
            data = copy.deepcopy(data)
            loss_list_tensor.append(data)
        dist.barrier()
        for i in range(size - 1):
            req = dist.irecv(loss_list_tensor[i + 1], src=i + 1)
            req.wait()
        # Average the per-rank losses element-wise.
        for j in range(len(loss_list_tensor[0])):
            element = 0
            for i in range(size):
                element += loss_list_tensor[i][j].item()
            loss_iter.append(element / size)
        file_object = open(file_name, 'w')
        for loss in loss_iter:
            file_object.write(str(loss))
            file_object.write('\t')
        file_object.close()
def send_torch(self, torch_tensor: torch.Tensor, name: str):
    tag = self.get_dist_tag(name)
    send_tensor = torch_tensor.cpu()
    self.all_wait[tag] = dist.isend(tensor=send_tensor, dst=self.dst, tag=tag)
    self.traffic_record.send_byte(get_num_byte(send_tensor))
def _master_procedure(self):
    # Receive the rank of the next worker in the queue.
    worker_rank = self._next_worker()
    # Receive the update and store it in the parameter buffer.
    requests = []
    for group in self.param_groups:
        for p in group["params"]:
            parameter_buffer = self.state[p]["buffer"]
            requests.append(dist.irecv(parameter_buffer.data, src=worker_rank))
    # Update the central variable and send it back to the worker. A running
    # index is used so the request list lines up across parameter groups.
    index = 0
    for group in self.param_groups:
        for p in group["params"]:
            parameter_buffer = self.state[p]["buffer"]
            requests[index].wait()
            index += 1
            p.data.add_(parameter_buffer.data)
            dist.isend(p.data, dst=worker_rank)
def store(self,
          key: str,
          src: torch.Tensor,
          accum: bool = False,
          overwrite: bool = True) -> None:
    """Store or accumulate a tensor on the server."""
    self._validate_store(key, src, accum=accum, overwrite=overwrite)
    cmd_rpc = torch.tensor(
        [
            STORE_CMD,
            len(key),
            -1 if accum else src.ndimension(),
            int(accum),
            int(overwrite),
            _dtypes.index(src.dtype),
        ],
        dtype=torch.long,
    )
    metadata_pg = self._metadata_pg()
    td.send(cmd_rpc, self.server_rank, group=metadata_pg)
    td.send(_fromstring(key), self.server_rank, group=metadata_pg)
    if not accum:
        td.send(
            torch.tensor(list(src.size()), dtype=torch.long),
            self.server_rank,
            group=metadata_pg,
        )
    start_t = time.monotonic()
    data_pgs = self._data_pgs()
    if data_pgs is None:
        td.send(src, dst=self.server_rank)
    else:
        outstanding_work = []
        flattened_src = src.flatten()
        flattened_size = flattened_src.shape[0]
        for idx, (pg, slice_) in enumerate(
                zip(
                    data_pgs,
                    split_almost_equally(flattened_size, num_parts=len(data_pgs)),
                )):
            outstanding_work.append(
                td.isend(
                    tensor=flattened_src[slice_],
                    dst=self.server_rank,
                    group=pg,
                    tag=idx,
                ))
        for w in outstanding_work:
            w.wait()
    end_t = time.monotonic()
    if self.log_stats:
        stats_size = src.numel() * src.element_size()
        stats_time = end_t - start_t
        logger.debug(f"Sent tensor {key} to server {self.server_rank}: "
                     f"{stats_size:,} bytes "
                     f"in {stats_time:,g} seconds "
                     f"=> {stats_size / stats_time:,.0f} B/s")
def message_pass(rank, ws):
    if rank == 0:
        my_tensors = [torch.zeros(ws, requires_grad=True) for _ in range(ws)]
        # Can't pass messages to self, so do any work within proc
        my_tensors[0][0] = 1
        # Wait for messages from other procs
        msgs = [dist.irecv(my_tensors[i], src=i) for i in range(1, ws)]
        for msg in msgs:
            msg.wait()
        print(my_tensors)
    else:
        # Send master onehot tensor of rank
        t = torch.zeros(ws, requires_grad=True)
        t[rank] = 1
        t = torch.sigmoid(t)
        req = dist.isend(t, 0)
        req.wait()  # keep the request alive until the send completes
def test_isend(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank == 0:
        requests = [
            dist.isend(_build_tensor(dest, 10), dest)
            for dest in range(1, world_size)
        ]
        for request in requests:
            request.wait()
            self.assertTrue(request.is_completed())
    else:
        tensor = _build_tensor(rank, -1)
        dist.recv(tensor, 0)
        self.assertEqual(tensor, _build_tensor(rank, 10))
    self._barrier()