def fp_recv_proc(conv_wid, conv_wn, fc_wid, fc_wn, wid, wn, pred_wid, succ_wid,
                 comm_rank, world_sz, bs, subbs, pd, input_shp, output_shp,
                 fp_head_list, shared_cnters, global_step, sta_lidx, end_lidx):
    # proc roles: fp_send:0; fp_recv:1; bp_send:2; bp_recv:3
    iter_thresh = int(bs / subbs)
    allreduce_group, fp_gather_group, bp_scatter_group = init_processes(comm_rank, world_sz)
    #print("fp_recv_proc comm_rank=", comm_rank)
    if wid == 0 or wid == 1:
        shared_cnters[0] = iter_thresh
        return
    src_rank = pred_wid * 4
    place_tensor = torch.zeros(1)
    while True:
        if shared_cnters[0] < iter_thresh:
            #print("fp recv ", comm_rank, " <- ", src_rank, " ", shared_cnters[0], " ", bs)
            if wid == 3:
                dist.recv(tensor=fp_head_list[shared_cnters[0]], src=src_rank)
            elif wid == 2:
                glist = list(fp_head_list[shared_cnters[0]].chunk(chunks=2, dim=0))
                place_tensor = glist[0]
                #print("place_tensor sz ", place_tensor.size())
                glist.append(place_tensor)
                dist.gather(tensor=place_tensor, gather_list=glist, dst=comm_rank,
                            group=fp_gather_group, async_op=False)
            shared_cnters[0] += 1
            #print("wid=", wid, " fp recv ")
        else:
            time.sleep(0.001)
def alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list):
    """Each process scatters its list of input tensors to all processes in the
    cluster and returns the gathered tensors in the output list.

    Parameters
    ----------
    rank : int
        The rank of the current worker
    world_size : int
        The size of the entire cluster
    output_tensor_list : List of tensor
        The received tensors
    input_tensor_list : List of tensor
        The tensors to exchange
    """
    # Send a tensor to each target trainer using torch.distributed.isend (async).
    senders = []
    for i in range(world_size):
        if i == rank:
            output_tensor_list[i] = input_tensor_list[i].to(th.device('cpu'))
        else:
            sender = dist.isend(input_tensor_list[i].to(th.device('cpu')), dst=i)
            senders.append(sender)
    for i in range(world_size):
        if i != rank:
            dist.recv(output_tensor_list[i], src=i)
    th.distributed.barrier()
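# A minimal sketch of how alltoallv_cpu above might be driven, assuming a
# "gloo" process group and two spawned workers. The address/port and tensor
# sizes are assumptions for illustration; every recv buffer is sized to match
# the corresponding send buffer, since the routine does not negotiate shapes.
import torch as th
import torch.distributed as dist
import torch.multiprocessing as mp

def _alltoallv_demo(rank, world_size):
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500",
                            rank=rank, world_size=world_size)
    inputs = [th.full((4,), float(rank)) for _ in range(world_size)]
    outputs = [th.zeros(4) for _ in range(world_size)]
    alltoallv_cpu(rank, world_size, outputs, inputs)
    # After the exchange, outputs[i] holds the tensor sent by rank i.
    print(rank, [t[0].item() for t in outputs])

if __name__ == "__main__":
    mp.spawn(_alltoallv_demo, args=(2,), nprocs=2)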
def bp_recv_proc(conv_wid, conv_wn, fc_wid, fc_wn, wid, wn, pred_wid, succ_wid,
                 comm_rank, world_sz, bs, subbs, pd, input_shp, output_shp,
                 bp_tail_list, shared_cnters, global_step, sta_lidx, end_lidx):
    # proc roles: fp_send:0; fp_recv:1; bp_send:2; bp_recv:3
    iter_thresh = int(bs / subbs)
    allreduce_group, fp_gather_group, bp_scatter_group = init_processes(comm_rank, world_sz)
    print("bp_recv_proc comm_rank=", comm_rank)
    if wid == wn - 1:
        shared_cnters[3] = iter_thresh
        return
    src_rank = succ_wid * 4 + 2
    while True:
        if shared_cnters[3] < iter_thresh:
            if wid == 2:
                dist.recv(tensor=bp_tail_list[shared_cnters[3]], src=src_rank)
            elif wid == 0 or wid == 1:
                dist.scatter(tensor=bp_tail_list[shared_cnters[3]], scatter_list=[],
                             src=src_rank, group=bp_scatter_group, async_op=False)
            shared_cnters[3] += 1
            #print("wid=", wid, " bp_recv")
        else:
            time.sleep(0.001)
def runServer(model):
    # model = Net()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    optimizer.zero_grad()
    numberOfTimes = dist.get_world_size() - 1
    # Touch every parameter once so that the param.grad buffers are allocated.
    for param in model.parameters():
        param.sum().backward()
    tag = torch.zeros(1)
    while True:
        optimizer.zero_grad()
        src = dist.recv(tensor=tag)
        # print("Reached ", src)
        if tag[0] == 0:
            # Worker asks for the current parameters.
            for param in model.parameters():
                dist.send(tensor=param.data, dst=src)
        elif tag[0] == -1:
            # Worker signals that it has finished.
            numberOfTimes -= 1
            if numberOfTimes == 0:
                # print("------------- Breaking ----------------")
                break
        else:
            # Worker pushes gradients; apply them and send back the new parameters.
            for param in model.parameters():
                dist.recv(tensor=param.grad.data, src=src)
            optimizer.step()
            optimizer.zero_grad()
            for param in model.parameters():
                dist.send(tensor=param.data, dst=src)
def handle_store(
    self,
    rank: int,
    key: str,
    ndim: int,
    accum: int,
    overwrite: int,
    ttype: int,
) -> None:
    if ndim == -1:
        # Accumulation into an existing tensor: reuse its stored size.
        assert key in self.parameters
        size = self.parameters[key].size()
    else:
        # Otherwise the client sends the size vector first.
        size = torch.empty((ndim,), dtype=torch.long)
        td.recv(size, src=rank)
        size = size.tolist()
    tensor_type = _tensor_types[ttype]
    if not accum and overwrite and key in self.parameters:
        # avoid holding onto 2x the memory
        del self.parameters[key]
    data = tensor_type(*size)
    td.recv(data, src=rank)
    if accum:
        self.parameters[key] += data
    elif (key not in self.parameters) or overwrite:
        self.parameters[key] = data
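# A minimal client-side sketch of the wire order handle_store() expects,
# assuming the command header (key, ndim, accum, overwrite, ttype) has already
# been delivered by the dispatch loop (as the swap() client further below does).
# The helper name send_store_payload is hypothetical and only illustrates the
# sequence: the int64 size vector first (when ndim != -1), then the tensor data.
import torch
import torch.distributed as td

def send_store_payload(server_rank: int, data: torch.Tensor) -> None:
    # 1) the tensor's size, as an int64 vector of length ndim
    td.send(torch.tensor(list(data.size()), dtype=torch.long), dst=server_rank)
    # 2) the tensor contents, whose dtype must match the ttype sent in the header
    td.send(data, dst=server_rank)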
def backward_rank0(semaphore):
    batch_idx = 0
    grad_recv = torch.zeros(shapes[0])
    dist.recv(tensor=grad_recv, src=1)
    while True:
        grad_recv = grad_recv.cuda(0)
        print(" backward batch_idx:" + str(batch_idx))
        try:
            loss = outputs_queue.get(block=True, timeout=4)
        except Empty:
            print("empty........")
            break
        loss.backward(grad_recv)
        if batch_idx % 3 == 0:
            # print("step: " + str(batch_idx))
            optimizer.step()
            optimizer.zero_grad()
        batch_idx += 1
        if data_size == batch_idx:
            print("eq...")
            break
        grad_recv = transfer(6, None, shapes[0])  # shapes[0]
        print("backward send.....")
    print("backward end..")
def backward(ctx, grad_output):
    tensor, = ctx.saved_tensors
    # TODO: Add ctx.needs_input_grad check
    grad_tensor = torch.zeros_like(tensor)
    dist.recv(grad_tensor, ctx.dst, ctx.group, ctx.tag)
    return grad_tensor, None, None, None
def backward_rank1():
    residual = None
    batch_idx = 0
    grad_recv1 = torch.zeros(shapes[1], dtype=torch.int8)
    #grad_recv1 = torch.HalfTensor(torch.Size(shapes[1]))
    dist.recv(tensor=grad_recv1, src=2)
    while True:
        print(" backward batch_idx:" + str(batch_idx))
        #grad_recv1 = unpack(grad_recv1.cuda(), shapes[1])
        grad_recv1 = dequantize(grad_recv1.cuda().float())
        #grad_recv1 = grad_recv1.cuda()
        try:
            inputs, outputs = outputs_queue.get(block=True, timeout=4)
        except Empty:
            print("empty........")
            break
        inputs.requires_grad_()
        outputs.backward(grad_recv1)
        #inputs_grad = quantize(inputs.grad, char=True).cpu()
        inputs_grad, residual = compress(inputs.grad, residual=residual)
        inputs_grad = inputs_grad.cpu()
        #inputs_grad = inputs.grad.cpu()
        if batch_idx % 2 == 0:
            optimizer.step()
            optimizer.zero_grad()
        batch_idx += 1
        if data_size == batch_idx:
            transfer(3, inputs_grad, None)
            print("backend In send..")
            break
        grad_recv1 = transfer(3, inputs_grad, shapes[1])
        print("backward send.......")
    print("backward end....")
def allreduce(r, world, peers, tensor):
    r = dist.get_rank()
    world = dist.get_world_size()
    peers = list(filter(lambda i: i != r, list(range(world))))
    sizeOfTensor = list(tensor.size())[0]
    chunksize = sizeOfTensor // world
    # Reduce-scatter phase: send each peer its chunk and accumulate the chunk
    # this rank is responsible for.
    reqs = [
        dist.isend(tensor=tensor[i * chunksize:(i + 1) * chunksize], dst=i)
        for i in peers
    ]  # world-1 concurrent transfers
    recv = torch.zeros(sizeOfTensor // world)
    for i in peers:
        dist.recv(tensor=recv, src=i)
        tensor[r * chunksize:(r + 1) * chunksize] += recv[:]
    for req in reqs:
        req.wait()
    # All-gather phase: broadcast the fully reduced chunk owned by this rank
    # and collect the reduced chunks owned by the other ranks.
    reqs = [
        dist.isend(tensor=tensor[r * chunksize:(r + 1) * chunksize], dst=i)
        for i in peers
    ]
    for i in peers:
        dist.recv(tensor=recv, src=i)
        tensor[i * chunksize:(i + 1) * chunksize] = recv
    for req in reqs:
        req.wait()
def backward_rank1(semaphore, start_event, start_event2):
    start_event.wait()
    batch_idx = 0
    while True:
        try:
            #semaphore.release()
            print("before grad recv...")
            grad_recv1 = torch.zeros([args.batch_size, 512, 2, 2], dtype=torch.int8)
            dist.recv(tensor=grad_recv1, src=2)
            print("after grad recv.....")
        except RuntimeError as error:
            print("backward runtime error")
            send_opt = dist.isend(tensor=torch.zeros(0), dst=0)
            send_opt.wait()
            break
        grad_recv1 = dequantize(grad_recv1.cuda(0).float())
        inputs, outputs = outputs_queue.get(block=False)
        inputs.requires_grad_()
        outputs.backward(grad_recv1)
        if batch_idx % args.buffer_size == 0:
            optimizer.step()
            optimizer.zero_grad()
        inputs_grad = quantize(inputs.grad, char=True).cpu()
        print(inputs_grad.size())
        if batch_idx == 0:
            start_event2.set()
        #send_opt = dist.isend(tensor=inputs_grad, dst=0)
        #send_opt.wait()
        dist.send(tensor=inputs_grad, dst=0)
        batch_idx += 1
def recv(self, collectiveArgs, src_rank, retFlag=False, tag=0):
    dist.recv(
        tensor=collectiveArgs.opTensor,
        src=src_rank,
        group=collectiveArgs.group,
        tag=tag,
    )
def transfer4backend1(tag, send_buf, flag=False):
    if not flag:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1), dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None
    else:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1), dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        try:
            shape_buf = torch.zeros([1], dtype=torch.short)
            dist.recv(tensor=shape_buf, src=left)
            recv_buf = torch.zeros(torch.Size(shape_buf.tolist()))
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
def cumsum(self, dim):
    new_chunk = self.chunk.cumsum(dim)
    if self.byrow and dim == 0:
        buf = torch.zeros_like(new_chunk[-1, :])
        for i in range(self.size - 1):
            if self.rank == i:
                synchronize()
                dist.send(new_chunk[-1, :], i + 1)
            elif self.rank == i + 1:
                synchronize()
                dist.recv(buf, i)
                new_chunk += buf
            dist.barrier()
    elif not self.byrow and dim == 1:
        buf = torch.zeros_like(new_chunk[:, -1])
        for i in range(self.size - 1):
            if self.rank == i:
                synchronize()
                dist.send(new_chunk[:, -1], i + 1)
            elif self.rank == i + 1:
                synchronize()
                dist.recv(buf, i)
                new_chunk += buf
            dist.barrier()
    return THDistMat(self.shape, self.sizes, new_chunk, self.byrow)
def worker():
    """ Initialize the distributed environment. """
    import torch
    import torch.distributed as dist
    from torch.multiprocessing import Process

    print("Initializing distributed pytorch")
    os.environ['MASTER_ADDR'] = str(args.master_addr)
    os.environ['MASTER_PORT'] = str(args.master_port)
    dist.init_process_group(args.backend, rank=args.rank, world_size=args.size)
    for i in range(100):
        # data_size_mb MB of float32 values (250,000 floats * 4 bytes = 1 MB)
        tensor = torch.ones(args.data_size_mb * 250 * 1000) * (args.rank + 1)
        # print('before: rank ', args.rank, ' has data ', tensor[0])
        start_time = time.perf_counter()
        if args.rank == 0:
            dist.send(tensor=tensor, dst=1)
        else:
            dist.recv(tensor=tensor, src=0)
        elapsed_time = time.perf_counter() - start_time
        # print('after: rank ', args.rank, ' has data ', tensor[0])
        rate = args.data_size_mb / elapsed_time
        print("Process %d transferred %d MB in %.1f ms (%.1f MB/sec)" %
              (args.rank, args.data_size_mb, elapsed_time * 1000, rate))
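# A minimal sketch of the command-line arguments worker() reads from `args`.
# The flag names mirror the attributes used above (master_addr, master_port,
# backend, rank, size, data_size_mb), but the defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(description="point-to-point bandwidth benchmark")
parser.add_argument("--master_addr", default="127.0.0.1")
parser.add_argument("--master_port", default=29502, type=int)
parser.add_argument("--backend", default="gloo")
parser.add_argument("--rank", type=int, required=True)
parser.add_argument("--size", type=int, default=2)            # world size
parser.add_argument("--data_size_mb", type=int, default=100)  # payload per transfer
args = parser.parse_args()

# Launch with --rank 0 on one host and --rank 1 on another, then call worker().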
def transfer(tag, send_buf, shape):
    if shape is None:
        left, right = get_left_right(tag)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None
    elif not torch.is_tensor(send_buf):
        left, right = get_left_right(tag)
        try:
            recv_buf = torch.zeros(shape)  # , dtype=torch.int8
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error..")
            return None
        return recv_buf
    else:
        left, right = get_left_right(tag)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        try:
            recv_buf = torch.zeros(shape)
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
def eval(layer, logger, e, save_event, data_size, testloader):
    criterion = nn.CrossEntropyLoss()
    criterion.cuda()
    layer.eval()
    with torch.no_grad():
        if dist.get_rank() == 0:
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print('batch_idx: ' + str(batch_idx))
                inputs = inputs.cuda(0)
                outputs = layer(inputs)
                dist.send(tensor=outputs.cpu(), dst=1)
                print("send.....")
            e.wait()
        elif dist.get_rank() == 1:
            batch_idx = 0
            while data_size > batch_idx:
                print("batch_idx:" + str(batch_idx))
                rec_val = torch.zeros([100, 256, 4, 4])  # different models have different shapes
                dist.recv(tensor=rec_val, src=0)
                print("after recv....")
                outputs = layer(rec_val.cuda())
                dist.send(tensor=outputs.cpu(), dst=2)
                batch_idx += 1
                print("send...")
            e.wait()
        elif dist.get_rank() == 2:
            test_loss = 0
            correct = 0
            total = 0
            save_event.clear()
            global best_acc
            for batch_idx, (inputs, targets) in enumerate(testloader):
                rec_val = torch.zeros([100, 512, 2, 2])
                dist.recv(tensor=rec_val, src=1)
                outputs = layer(rec_val.cuda(0))
                targets = targets.cuda()
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(batch_idx, data_size,
                             'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                             (test_loss / (batch_idx + 1), 100. * correct / total,
                              correct, total))
            logger.error("eval:" + str(test_loss / (batch_idx + 1)))
            acc_str = "eacc: %.3f" % (100. * correct / total,)
            logger.error(acc_str)
            time.sleep(1)
            acc = 100. * correct / total
            if acc > best_acc:
                best_acc = acc
                save_event.set()
            time.sleep(1)
            e.set()
def backward(layer, atom, outputs_queue, args):
    optimizer = optim.SGD(layer.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    optimizer.zero_grad()
    dist.init_process_group(backend='gloo', init_method=args.path, world_size=3, rank=2)
    batch_idx = 0
    while True:
        try:
            grad = torch.zeros([args.batch_size, 128, 16, 16]).half()
            dist.recv(tensor=grad, src=1)
            #grad = grad_queue.get(block=True, timeout=1)
            #grad = torch.from_numpy(grad)
            #grad = dense(grad, [args.batch_size, 128, 16, 16]).cuda(0)
            grad = grad.cuda(0).float()
        except Empty as empty:
            print("backward empty.....")
            break
        loss = get_tensor(outputs_queue, atom, 1)
        loss.backward(grad)
        if batch_idx % 2 == 0:
            optimizer.step()
            optimizer.zero_grad()
        batch_idx += 1
def receive(theModel, src, tag):
    weights = model.save(theModel)  # will be overwritten
    for i in range(len(weights)):
        dist.recv(tensor=weights[i], src=src, tag=tag * 100 + i)
    theModel = model.load(weights, theModel)
    print("Model received from", src)
    return theModel
def backward_rank0(semaphore):
    batch_idx = 0
    ten_len = tensor_len(shapes[0])
    grad_recv = torch.zeros(ten_len + 2)
    #grad_recv = torch.zeros(shapes[0], dtype=torch.int8)
    #grad_recv = torch.HalfTensor(torch.Size(shapes[0]))
    dist.recv(tensor=grad_recv, src=1)
    while True:
        #semaphore.release()
        #grad_recv = dequantize(grad_recv.cuda().float())
        grad_recv = de_piecewise_quantize(grad_recv.cuda(), shapes[0])
        #grad_recv = unpack(grad_recv.cuda(), shapes[0])
        print(" backward batch_idx:" + str(batch_idx))
        # grad_recv = grad_recv.cuda()
        try:
            loss = outputs_queue.get(block=True, timeout=4)
        except Empty:
            print("empty........")
            break
        loss.backward(grad_recv)
        if batch_idx % 2 == 0:
            # print("step: " + str(batch_idx))
            optimizer.step()
            optimizer.zero_grad()
        batch_idx += 1
        if data_size == batch_idx:
            print("eq...")
            break
        grad_recv = transfer2(4, None, ten_len + 2)  # shapes[0]
        print("backward send.....")
    print("backward end..")
def receive_tensor_helper(dist, tensor, src_rank, group, tag, num_iterations,
                          intra_server_broadcast):
    for i in range(num_iterations):
        if intra_server_broadcast:
            dist.broadcast(tensor=tensor, group=group, src=src_rank)
        else:
            dist.recv(tensor=tensor, src=src_rank, tag=tag)
def backward_rank1():
    residual = None
    batch_idx = 0
    grad_recv1 = torch.zeros(shapes[1])
    dist.recv(tensor=grad_recv1, src=2)
    while True:
        print(" backward batch_idx:" + str(batch_idx))
        grad_recv1 = grad_recv1.cuda(1)
        try:
            inputs, outputs = outputs_queue.get(block=True, timeout=4)
        except Empty:
            print("empty........")
            break
        inputs.requires_grad_()
        outputs.backward(grad_recv1)
        if batch_idx % 3 == 0:
            optimizer.step()
            optimizer.zero_grad()
        batch_idx += 1
        if data_size == batch_idx:
            transfer(5, inputs.grad.cpu(), None)
            print("backend In send..")
            break
        grad_recv1 = transfer(5, inputs.grad.cpu(), shapes[1])  # shapes[1]
        print("backward send.......")
    print("backward end....")
def _recv_key(rank: int, keylen: int, group: Optional["td.ProcessGroup"] = None) -> str:
    """Receive a string tensor key from a client node."""
    key_buffer = torch.zeros((keylen,), dtype=torch.int8)
    td.recv(key_buffer, src=rank, group=group)
    return _tostring(key_buffer)
def allreduce(send, recv):
    rank = dist.get_rank()
    size = dist.get_world_size()
    device = torch.device("cuda:" + str(rank) if torch.cuda.is_available() else "cpu")
    # send_buff must start with this rank's own data for the first hop.
    send_buff = send.clone().to(device)
    recv_buff = torch.zeros(send.size()).to(device)
    accum = torch.zeros(send.size()).to(device)
    accum[:] = send[:]
    left = ((rank - 1) + size) % size
    right = (rank + 1) % size
    for i in range(size - 1):
        if i % 2 == 0:
            # Send send_buff, receive into recv_buff
            send_req = dist.isend(send_buff, right)
            dist.recv(recv_buff, left)
            accum[:] += recv_buff[:]
        else:
            # Send recv_buff, receive into send_buff
            send_req = dist.isend(recv_buff, right)
            dist.recv(send_buff, left)
            accum[:] += send_buff[:]
        send_req.wait()
    recv[:] = accum[:]
def backward_rank1():
    residual = None
    batch_idx = 0
    ten_len = tensor_len(shapes[1])
    grad_recv1 = torch.zeros(ten_len + 2)
    dist.recv(tensor=grad_recv1, src=2)
    while True:
        print(" backward batch_idx:" + str(batch_idx))
        grad_recv1 = de_piecewise_quantize(grad_recv1.cuda(), shapes[1])
        try:
            inputs, outputs = outputs_queue.get(block=True, timeout=4)
        except Empty:
            print("empty........")
            break
        inputs.requires_grad_()
        outputs.backward(grad_recv1)
        inputs_grad, residual = piecewise_quantize(inputs.grad, logger=logger, residual=residual)
        if batch_idx % 3 == 0:
            optimizer.step()
            optimizer.zero_grad()
        batch_idx += 1
        if data_size == batch_idx:
            transfer2(5, inputs_grad, None)
            print("backend In send..")
            break
        grad_recv1 = transfer2(5, inputs_grad, ten_len + 2)  # shapes[1]
        print("backward send.......")
    print("backward end....")
def swap(
    self,
    key: str,
    src: torch.Tensor,
    dst: Optional[torch.Tensor] = None,
    accum: bool = False,
    overwrite: bool = False,
) -> None:
    """Store or accumulate a tensor on the server, and then get its current value."""
    if dst is None:
        dst = torch.zeros_like(src)
    # tic = time.time()
    cmd_rpc = torch.tensor([
        SWAP_CMD,
        len(key),
        -1 if accum else src.ndimension(),
        int(accum),
        int(overwrite),
        _tensor_type_idx[src.type()],
    ], dtype=torch.long)
    td.send(cmd_rpc, self.server_rank)
    td.send(_fromstring(key), self.server_rank)
    if not accum:
        td.send(torch.tensor(list(src.size()), dtype=torch.long), self.server_rank)
    td.send(src, self.server_rank)
    td.recv(dst, src=self.server_rank)
def backward_rank0(semaphore):
    batch_idx = 0
    shape_buf = torch.zeros([1], dtype=torch.short)
    dist.recv(tensor=shape_buf, src=1)
    grad_recv = torch.zeros(torch.Size(shape_buf.tolist()))
    dist.recv(tensor=grad_recv, src=1)
    while True:
        #semaphore.release()
        grad_recv = unpack(grad_recv.cuda(), shapes[0])
        print(" backward batch_idx:" + str(batch_idx))
        # grad_recv = grad_recv.cuda()
        try:
            loss = outputs_queue.get(block=True, timeout=4)
        except Empty:
            print("empty........")
            break
        loss.backward(grad_recv)
        if batch_idx % 2 == 0:
            # print("step: " + str(batch_idx))
            optimizer.step()
            optimizer.zero_grad()
        batch_idx += 1
        if data_size == batch_idx:
            print("eq...")
            break
        grad_recv = transfer4backend0(4)
        print("backward send.....")
    print("backward end..")
def allreduce(send, recv):
    """ Implementation of a ring-reduce. """
    rank = dist.get_rank()
    size = dist.get_world_size()
    # send_buff must start with this rank's own data for the first hop.
    send_buff = send.clone()
    recv_buff = th.zeros(send.size())
    accum = th.zeros(send.size())
    accum[:] = send[:]
    # th.cuda.synchronize()
    left = ((rank - 1) + size) % size
    right = (rank + 1) % size
    for i in range(size - 1):
        if i % 2 == 0:
            # Send send_buff, receive into recv_buff
            send_req = dist.isend(send_buff, right)
            dist.recv(recv_buff, left)
            accum[:] += recv_buff[:]
        else:
            # Send recv_buff, receive into send_buff
            send_req = dist.isend(recv_buff, right)
            dist.recv(send_buff, left)
            accum[:] += send_buff[:]
        send_req.wait()
    # th.cuda.synchronize()
    recv[:] = accum[:]
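# A minimal sketch of driving the ring-reduce above with the "gloo" backend,
# assuming `import torch as th` and the allreduce(send, recv) defined above.
# The address/port are assumptions; every rank contributes a vector of ones,
# so the reduced result should equal the world size in every slot.
import os
import torch as th
import torch.distributed as dist
import torch.multiprocessing as mp

def _ring_reduce_demo(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29501"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    send = th.ones(8)
    recv = th.zeros(8)
    allreduce(send, recv)
    assert th.allclose(recv, th.full((8,), float(world_size)))
    print(f"rank {rank}: ring allreduce ok")

if __name__ == "__main__":
    mp.spawn(_ring_reduce_demo, args=(3,), nprocs=3)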
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    # send parameter request every N iterations
    if self.idx % self.param_groups[0]['tau'] == 0:
        self.idx = 1
        self.send_message(MessageCode.PullTilde,
                          torch.randn(self.squash_model(self.model).numel()))
        # pull x tilde
        m_parameter = torch.zeros(self.squash_model(self.model).numel() + 7).to(torch.int16)
        dist.recv(tensor=m_parameter)
        # build alpha term
        m_parameter = m_parameter[2:]
        m_parameter = dequantize_tensor(m_parameter)
        current_index = 0  # keep track of where to read from parameter_update
        delta = copy.deepcopy(self.model)
        alpha = self.param_groups[0]['rho'] * self.param_groups[0]['lr']
        for parameter in delta.parameters():
            numel = parameter.data.numel()
            size = parameter.data.size()
            parameter.data.add_(-1, m_parameter[current_index:current_index + numel].view(size))
            parameter.data.mul_(alpha)
            current_index += numel
        # delta = delta * self.param_groups[0]['rho'] * self.param_groups[0]['lr']
        # update x
        for cur_parameter, cur_delta in zip(self.model.parameters(), delta.parameters()):
            cur_parameter.data.add_(-1, cur_delta.data)
        # push delta to update x tilde
        self.send_message(MessageCode.UpdateTilde, self.squash_model(delta))
    else:
        self.idx += 1
        # internal sgd update
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                p.data.add_(-group['lr'], d_p)
    return loss
def get_lr(self, group_name):
    cmd = torch.LongTensor([
        getattr(TORCH_PARAMETER_SERVER_CMDS, f"GET_{group_name.upper()}_LR_CMD"),
        0,
    ])
    dist.send(cmd, dst=self.server_rank)
    dist.recv(self.lr_buffer, src=self.server_rank)
    return self.lr_buffer[0].item()
def sync_buffers(self):
    for p in self.model._all_buffers():
        p.data.zero_()
        recv_buff = torch.FloatTensor(p.data.size()).cuda()
        for w in self.workers:
            dist.recv(recv_buff, src=w)
            p.data.add_(recv_buff)
        p.data.div_(self.num_workers)
def test_isend(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank == 0:
        requests = [
            dist.isend(_build_tensor(dest, 10), dest)
            for dest in range(1, world_size)
        ]
        for request in requests:
            request.wait()
            self.assertTrue(request.is_completed())
    else:
        tensor = _build_tensor(rank, -1)
        dist.recv(tensor, 0)
        self.assertEqual(tensor, _build_tensor(rank, 10))
    self._barrier()
def test_send_recv(self):
    rank = dist.get_rank()
    tensor = _build_tensor(rank + 1)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)
    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(src + 1, value=-1)
        expected_tensor = _build_tensor(src + 1)
        dist.recv(tensor, src)
        self.assertEqual(tensor, expected_tensor)
    self._barrier()
def test_send_recv_any_source(self):
    rank = dist.get_rank()
    tensor = _build_tensor(10, rank)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)
    recv_ranks = set()
    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(10, value=-1)
        dist.recv(tensor)
        recv_ranks.add(tensor.resize_(1)[0])
    self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
    self._barrier()
def recv(self, var):
    dist.recv(tensor=var, src=self.other)
    return var
print_header("send from 0 to 1") for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: tensor = torch.ByteTensor(bytes).fill_(42) for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]: start = timer() for i in range(0, num_tensors): dist.send(tensor, 1) end = timer() print_stats(bytes, num_tensors, end - start) print() elif rank == 1: for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: tensor = torch.ByteTensor(bytes).fill_(42) for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]: for i in range(0, num_tensors): dist.recv(tensor, 0) dist.barrier() if rank == 0: print_header("reduce") for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: tensor = torch.ByteTensor(bytes).fill_(42) for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]: start = timer() for i in range(0, num_tensors): dist.reduce(tensor, 0) end = timer() print_stats(bytes, num_tensors, end - start) print() else: for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: