Example #1
def worker():
    """ Initialize the distributed environment. """

    import os
    import time

    import torch
    import torch.distributed as dist

    print("Initializing distributed pytorch")
    os.environ['MASTER_ADDR'] = str(args.master_addr)
    os.environ['MASTER_PORT'] = str(args.master_port)
    dist.init_process_group(args.backend, rank=args.rank, world_size=args.size)

    for i in range(100):
        tensor = torch.ones(args.data_size_mb * 250 * 1000) * (args.rank + 1)
        # print('before: rank ', args.rank, ' has data ', tensor[0])

        start_time = time.perf_counter()
        if args.rank == 0:
            dist.send(tensor=tensor, dst=1)
        else:
            dist.recv(tensor=tensor, src=0)

        elapsed_time = time.perf_counter() - start_time
        # print('after: rank ', args.rank, ' has data ', tensor[0])
        rate = args.data_size_mb / elapsed_time

        print("Process %d transferred %d MB in %.1f ms (%.1f MB/sec)" %
              (args.rank, args.data_size_mb, elapsed_time * 1000, rate))
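A worker like this is typically launched once per rank with the flags it reads from `args`; a minimal launcher sketch, assuming hypothetical flag names that match the attributes used above:

import argparse

# Hypothetical driver: run the script once per rank, e.g.
#   python bench.py --rank 0 --size 2    and    python bench.py --rank 1 --size 2
parser = argparse.ArgumentParser()
parser.add_argument('--rank', type=int, default=0)
parser.add_argument('--size', type=int, default=2)
parser.add_argument('--master_addr', default='127.0.0.1')
parser.add_argument('--master_port', default='29500')
parser.add_argument('--backend', default='gloo')
parser.add_argument('--data_size_mb', type=int, default=100)
args = parser.parse_args()

if __name__ == "__main__":
    worker()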
Example #2
def bp_send_proc(rank, bs, subbs, wid, wn, wrank, nproc, gn, gsz, bp_head_list,
                 shared_cnters):
    #world_sz =  nproc * wn *4  #+1
    #fp_send:0; fp_recv:1; bp_send:2; bp_recv:3
    comm_rank = wid * nproc * 4 + rank * 4 + 2
    iter_thresh = int(bs / subbs)
    #init_processes(comm_rank, world_sz)
    init_processes(comm_rank, wid, wn, nproc, gn, gsz, backend='gloo')
    print("bp_send_proc comm_rank=", comm_rank)
    #if wid == 0:
    if wrank == 0:
        shared_cnters[2] = 0
        return
    local_bp_sent_counter = 0
    dst_rank = (wid - 1) * nproc * 4 + rank * 4 + 3
    while True:
        if local_bp_sent_counter < shared_cnters[2]:
            dist.send(tensor=bp_head_list[local_bp_sent_counter], dst=dst_rank)
            #print("bp send ", bp_head_list[local_bp_sent_counter].numel())
            local_bp_sent_counter += 1
        else:
            time.sleep(0.001)
        if local_bp_sent_counter == iter_thresh:
            local_bp_sent_counter = 0
            shared_cnters[2].zero_()
Example #3
def transfer4backend1(tag, send_buf, flag=False):

    if not flag:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1),
                  dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None

    else:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1),
                  dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        try:
            shape_buf = torch.zeros([1], dtype=torch.short)
            dist.recv(tensor=shape_buf, src=left)
            recv_buf = torch.zeros(torch.Size(shape_buf.tolist()))
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
Example #4
def send_tensor_helper(dist, tensor, dst_rank, group, tag, num_iterations,
                       intra_server_broadcast):
    for i in range(num_iterations):
        if intra_server_broadcast:
            dist.broadcast(tensor=tensor, group=group, src=1 - dst_rank)
        else:
            dist.send(tensor=tensor, dst=dst_rank, tag=tag)
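Only the sending/broadcasting side is shown above; a plausible receiving counterpart (a sketch with a hypothetical name, mirroring the same argument convention) would be:

def recv_tensor_helper(dist, tensor, src_rank, group, tag, num_iterations,
                       intra_server_broadcast):
    for i in range(num_iterations):
        if intra_server_broadcast:
            # Participate in the broadcast rooted at the sending rank.
            dist.broadcast(tensor=tensor, group=group, src=src_rank)
        else:
            dist.recv(tensor=tensor, src=src_rank, tag=tag)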
Example #5
 def get(
     self,
     key: str,
     dst: Optional[torch.Tensor] = None,
     shared: bool = False,
 ) -> Optional[torch.Tensor]:
     """Get a tensor from the server.
     """
     cmd_rpc = torch.tensor(
         [GET_CMD, len(key), dst is None, 0, 0, 0], dtype=torch.long)
     td.send(cmd_rpc, self.server_rank)
     td.send(_fromstring(key), self.server_rank)
     if dst is None:
         meta = torch.full((2, ), -1, dtype=torch.long)
         td.recv(meta, src=self.server_rank)
         ndim, ttype = meta
         if ndim.item() == -1:
             return None
         size = torch.full((ndim.item(), ), -1, dtype=torch.long)
         td.recv(size, src=self.server_rank)
         tensor_type = _tensor_types[ttype.item()]
         if shared:
             dst_storage = tensor_type().storage_type()._new_shared(
                 size.prod())
             dst = tensor_type(dst_storage).view(*size.tolist())
         else:
             dst = tensor_type(*size.tolist())
     td.recv(dst, src=self.server_rank)
     return dst
Example #6
    def test_send_recv_any_source(self):
        rank = dist.get_rank()
        tensor = _build_tensor(10, value=rank)
        recv_ranks = set()

        for dst in range(0, dist.get_world_size()):
            if dst == rank:
                # Recv mode
                for src in range(0, dist.get_world_size()):
                    if src == rank:
                        continue
                    output_tensor = _build_tensor(10, value=-1)
                    sender = dist.recv(output_tensor)

                    # Assert the scalar value "sender" that should be
                    # equal to the rank of the sender is equal to all
                    # values in the received tensor.
                    self.assertTrue(output_tensor.eq(sender).all())
                    recv_ranks.add(sender)
            else:
                # Send mode
                dist.send(tensor, dst)

        self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
        self._barrier()
Example #7
def fp_send_proc(rank, bs, subbs, wid, wn, nproc, fp_tail_list, shared_cnters):
    world_sz = nproc * wn * 4  #+1
    #fp_send:0; fp_recv:1; bp_send:2; bp_recv:3
    comm_rank = wid * nproc * 4 + rank * 4
    iter_thresh = int(bs / subbs)
    init_processes(comm_rank, world_sz)
    print("fp_send_proc comm_rank=", comm_rank)
    if wid == wn - 1:
        shared_cnters[1] = 4
        return
    local_fp_sent_counter = 0
    dst_rank = (wid + 1) * nproc * 4 + rank * 4 + 1
    while True:
        #print("fp send ", local_fp_sent_counter, " ", shared_cnters[1])
        #fp_tail_tensor
        if local_fp_sent_counter < shared_cnters[1]:
            # is it okay to directly send gpu tensor?
            #print("fp send ", comm_rank, "  -> ", dst_rank)
            dist.send(tensor=fp_tail_list[local_fp_sent_counter], dst=dst_rank)
            #print("fp send ", fp_tail_list[local_fp_sent_counter].numel())
            #print("fin fp send ", comm_rank, "  -> ", dst_rank)
            local_fp_sent_counter += 1
        else:
            time.sleep(0.001)
        if local_fp_sent_counter == iter_thresh:
            #reset
            local_fp_sent_counter = 0
            shared_cnters[1].zero_()
Example #8
 def send(self, collectiveArgs, dst_rank, retFlag=False, tag=0):
     dist.send(
         tensor=collectiveArgs.ipTensor,
         dst=dst_rank,
         group=collectiveArgs.group,
         tag=tag
     )
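The wrapper above only covers point-to-point sends; a matching receive wrapper (hypothetical, assuming the same `collectiveArgs` convention) might look like:

 def recv(self, collectiveArgs, src_rank, retFlag=False, tag=0):
     dist.recv(
         tensor=collectiveArgs.ipTensor,
         src=src_rank,
         group=collectiveArgs.group,
         tag=tag
     )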
Example #9
def _send(tensor, tensor_name, src_rank, dst_rank, tag, sub_process_group=None):
    """
    Sends tensor by calling PyTorch's send() call.

    If tensor is being sent not via broadcast(), it will
    be first copied to the CPU.
    """
    if sub_process_group is not None:
        assert tensor.is_cuda

        # Send tensor shape.
        tensor_shape = torch.tensor(tensor.shape, dtype=torch.int)
        dist.broadcast(tensor=tensor_shape, src=src_rank,
                       group=sub_process_group)

        # Send tensor.
        contiguous_tensor = tensor.detach().clone()
        dist.broadcast(tensor=contiguous_tensor.contiguous(),
                       src=src_rank,
                       group=sub_process_group)
    else:
        assert tensor.is_cuda
        tensor = tensor.cpu()

        # Send tensor shape.
        tensor_shape = torch.tensor(tensor.shape, dtype=torch.int)
        dist.send(tensor=tensor_shape, dst=dst_rank, tag=tag)

        # Send tensor.
        dist.send(tensor=tensor, dst=dst_rank, tag=tag)
Example #10
 def forward(ctx, tensor, dst, group=dist.group.WORLD, tag=0):
     ctx.save_for_backward(tensor)
     ctx.dst = dst
     ctx.group = group
     ctx.tag = tag
     dist.send(tensor, dst, group, tag)
     return tensor.new_tensor([])
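This forward() pushes an activation to another rank inside a custom autograd Function; a plausible backward() counterpart (a sketch, assuming the destination rank sends the corresponding gradient back over the same group and tag) would be:

 def backward(ctx, grad_output):
     # grad_output is the empty placeholder returned by forward and is unused;
     # the real gradient for `tensor` is received from the rank we sent it to.
     tensor, = ctx.saved_tensors
     grad = torch.zeros_like(tensor)
     dist.recv(grad, ctx.dst, ctx.group, ctx.tag)
     return grad, None, None, None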
Example #11
def send_gradient_to_server(model):
    # send the server the msg saying we're not done yet
    dist.send(torch.zeros(1), 0)
    data = flatten_many_tensors([p.grad for p in model.parameters()],
                                get_total_size(model))
    dist.send(data, 0)
    receive_model_from_server(model)
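receive_model_from_server() is not shown in this example; a minimal sketch of what it plausibly does (an assumption, mirroring the flattened layout produced by flatten_many_tensors and get_total_size):

def receive_model_from_server(model):
    # Receive the full flattened parameter vector from the server (rank 0)
    # and copy each slice back into the corresponding parameter.
    flat = torch.zeros(get_total_size(model))
    dist.recv(flat, src=0)
    offset = 0
    for p in model.parameters():
        numel = p.numel()
        p.data.copy_(flat[offset:offset + numel].view_as(p))
        offset += numel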
Example #12
    def backward_rank1(semaphore, start_event, start_event2):

        start_event.wait()

        batch_idx = 0
        while True:
            try:
                #semaphore.release()
                print("before grad recv...")
                grad_recv1 = torch.zeros([args.batch_size, 512, 2, 2], dtype=torch.int8)
                dist.recv(tensor=grad_recv1, src=2)
                print("after grad recv.....")
            except RuntimeError as error:
                print("backward runtime error")
                send_opt = dist.isend(tensor=torch.zeros(0), dst=0)
                send_opt.wait()
                break
            grad_recv1 = dequantize(grad_recv1.cuda(0).float())
            inputs, outputs = outputs_queue.get(block=False)
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                optimizer.zero_grad()

            inputs_grad = quantize(inputs.grad, char=True).cpu()
            print(inputs_grad.size())
            if batch_idx == 0:
                start_event2.set()
            #send_opt = dist.isend(tensor=inputs_grad, dst=0)
            #send_opt.wait()
            dist.send(tensor=inputs_grad, dst=0)
            batch_idx += 1
Example #13
def send_model_to_worker(model, worker_id=None):
    assert worker_id is not None
    # flatten it all so we can send in one go
    data = flatten_many_tensors([p for p in model.parameters()],
                                get_total_size(model))
    # data now contains the entire model flattened
    dist.send(data, worker_id)
Example #14
def eval(layer, logger, e, save_event, data_size, testloader):
    criterion = nn.CrossEntropyLoss()
    criterion.cuda()
    layer.eval()
    with torch.no_grad():
        if dist.get_rank() == 0:
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print('batch_idx: ' + str(batch_idx))
                inputs = inputs.cuda(0)
                outputs = layer(inputs)
                dist.send(tensor=outputs.cpu(), dst=1)
                print("send.....")

            e.wait()
        elif dist.get_rank() == 1:
            batch_idx = 0
            while data_size > batch_idx:
                print("batch_idx:" + str(batch_idx))
                rec_val = torch.zeros(
                    [100, 256, 4, 4])  # different models have different shapes
                dist.recv(tensor=rec_val, src=0)
                print("after recv....")
                outputs = layer(rec_val.cuda())
                dist.send(tensor=outputs.cpu(), dst=2)
                batch_idx += 1
                print("send...")

            e.wait()
        elif dist.get_rank() == 2:
            test_loss = 0
            correct = 0
            total = 0
            save_event.clear()
            global best_acc

            for batch_idx, (inputs, targets) in enumerate(testloader):
                rec_val = torch.zeros([100, 512, 2, 2])
                dist.recv(tensor=rec_val, src=1)
                outputs = layer(rec_val.cuda(0))
                targets = targets.cuda()
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                progress_bar(
                    batch_idx, data_size, 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                    (test_loss /
                     (batch_idx + 1), 100. * correct / total, correct, total))
                logger.error("eval:" + str(test_loss / (batch_idx + 1)))
                acc_str = "eacc: %.3f" % (100. * correct / total, )
                logger.error(acc_str)
            time.sleep(1)
            acc = 100. * correct / total
            if acc > best_acc:
                best_acc = acc
                save_event.set()
            time.sleep(1)
            e.set()
Example #15
def runServer(model):
    # model = Net()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    optimizer.zero_grad()
    numberOfTimes = dist.get_world_size() - 1
    for param in model.parameters():
        param.sum().backward()
    tag = torch.zeros(1)
    while True:
        optimizer.zero_grad()
        src = dist.recv(tensor=tag)
        # print("Reached ", src)
        if tag[0] == 0:
            for param in model.parameters():
                dist.send(tensor=param.data, dst=src)
        elif tag[0] == -1:
            numberOfTimes -= 1
            if numberOfTimes == 0:
                # print("------------- Breaking ----------------")
                break
        else:
            for param in model.parameters():
                dist.recv(tensor=param.grad.data, src=src)
            optimizer.step()
            optimizer.zero_grad()
            for param in model.parameters():
                dist.send(tensor=param.data, dst=src)
Example #16
    def start(self) -> None:
        join_count = 0
        while True:
            # 1. receive the command
            cmd_buffer = torch.full((6, ), -1, dtype=torch.long)
            rank = td.recv(cmd_buffer)
            cmd = cmd_buffer[0].item()

            if cmd == STORE_CMD:
                key = self._recv_key(rank, cmd_buffer[1].item())
                self.handle_store(rank, key, cmd_buffer[2].item(),
                                  cmd_buffer[3].item(), cmd_buffer[4].item(),
                                  cmd_buffer[5].item())
            elif cmd == GET_CMD:
                key = self._recv_key(rank, cmd_buffer[1].item())
                self.handle_get(rank, key, cmd_buffer[2].item())
            elif cmd == SWAP_CMD:
                key = self._recv_key(rank, cmd_buffer[1].item())
                self.handle_store(rank, key, cmd_buffer[2].item(),
                                  cmd_buffer[3].item(), cmd_buffer[4].item(),
                                  cmd_buffer[5].item())
                self.handle_get(rank, key, False)
            elif cmd == JOIN_CMD:
                join_count += 1
                if join_count == self.num_clients:
                    for r in range(self.num_clients):
                        # after sending the join cmd,
                        # each client waits on this ack to know everyone is done
                        # and it's safe to exit
                        td.send(torch.zeros((1, )), dst=r)
                    break
            else:
                raise RuntimeError(
                    "Command is unknown value %d from rank %d." % (cmd, rank))
Example #17
 def cumsum(self, dim):
     new_chunk = self.chunk.cumsum(dim)
     if self.byrow and dim==0:
         buf = torch.zeros_like(new_chunk[-1, :])
         for i in range(self.size-1):
             if self.rank == i: 
                 synchronize()
                 dist.send(new_chunk[-1,:], i+1)
             elif self.rank == i + 1:
                 synchronize()
                 dist.recv(buf, i)
                 new_chunk += buf
             dist.barrier()
     elif not self.byrow and dim==1:
         buf = torch.zeros_like(new_chunk[:, -1])
         for i in range(self.size-1):
             if self.rank==i:
                 synchronize()
                 dist.send(new_chunk[:, -1], i+1)
             elif self.rank == i+1:
                 synchronize()
                 dist.recv(buf, i)
                 new_chunk += buf
             dist.barrier()
     return THDistMat(self.shape, self.sizes, new_chunk, self.byrow)
Example #18
 def set_lr(self, group_name, lr):
     cmd = torch.LongTensor([
         getattr(TORCH_PARAMETER_SERVER_CMDS,
                 f"SET_{group_name.upper()}_LR_CMD"), 0
     ])
     dist.send(cmd, dst=self.server_rank)
     self.lr_buffer[0] = lr
     dist.send(self.lr_buffer, dst=self.server_rank)
Example #19
 def get_lr(self, group_name):
     cmd = torch.LongTensor([
         getattr(TORCH_PARAMETER_SERVER_CMDS,
                 f"GET_{group_name.upper()}_LR_CMD"), 0
     ])
     dist.send(cmd, dst=self.server_rank)
     dist.recv(self.lr_buffer, src=self.server_rank)
     return self.lr_buffer[0].item()
Example #20
 def _send_model_to_master(self):
     dist.barrier()
     self.conf.logger.log(
         f"Worker-{self.conf.graph.worker_id} (client-{self.conf.graph.client_id}) sending the model ({self.arch}) back to Master."
     )
     flatten_model = TensorBuffer(list(self.model.state_dict().values()))
     dist.send(tensor=flatten_model.buffer, dst=0)
     dist.barrier()
Example #21
 def step_optim(self, group_name):
     if group_name == "entity":
         parameter_index = 0
     else:
         parameter_index = 1
     cmd = torch.LongTensor(
         [TORCH_PARAMETER_SERVER_CMDS.STEP_OPTIM_CMD, parameter_index])
     dist.send(cmd, dst=self.server_rank)
Example #22
    def start(self, groups: List["td.ProcessGroup"]) -> None:
        self.groups = ([groups[idx] for idx in self.group_idxs]
                       if self.group_idxs is not None else None)
        join_count = 0
        metadata_pg = self._metadata_pg()
        while True:
            # 1. receive the command
            cmd_buffer = torch.full((6, ), -1, dtype=torch.long)
            rank = td.recv(cmd_buffer, group=metadata_pg)
            cmd = cmd_buffer[0].item()

            if cmd == STORE_CMD:
                key = self._recv_key(rank,
                                     cmd_buffer[1].item(),
                                     group=metadata_pg)
                self.handle_store(
                    rank,
                    key,
                    cmd_buffer[2].item(),
                    cmd_buffer[3].item(),
                    cmd_buffer[4].item(),
                    cmd_buffer[5].item(),
                )
            elif cmd == GET_CMD:
                key = self._recv_key(rank,
                                     cmd_buffer[1].item(),
                                     group=metadata_pg)
                self.handle_get(rank, key, cmd_buffer[2].item())
            elif cmd == SWAP_CMD:
                assert metadata_pg is None, "Swap is not used for partition servers."
                key = self._recv_key(rank, cmd_buffer[1].item())
                self.handle_store(
                    rank,
                    key,
                    cmd_buffer[2].item(),
                    cmd_buffer[3].item(),
                    cmd_buffer[4].item(),
                    cmd_buffer[5].item(),
                )
                self.handle_get(rank, key, False)
            elif cmd == JOIN_CMD:
                join_count += 1
                logger.info(f"ParameterServer join: join_count= {join_count}")
                if join_count == self.num_clients:
                    for r in range(self.num_clients):
                        # after sending the join cmd,
                        # each client waits on this ack to know everyone is done
                        # and it's safe to exit
                        td.send(torch.zeros((1, )), dst=r)
                    do_barrier = cmd_buffer[1].item()
                    if do_barrier:
                        logger.info("ParameterServer barrier begin")
                        td.barrier(self.groups[0])
                        logger.info("ParameterServer barrier end")
                    break
            else:
                raise RuntimeError(
                    "Command is unknown value %d from rank %d." % (cmd, rank))
Example #23
def runWorker(dataset, criterion, group, model):

    torch.manual_seed(1234)
    # model = Net()
    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum = 0.9)
    size = dist.get_world_size()
    rank = dist.get_rank()

    epoch_loss = 0.0
    numberOfSamples = 0

    train_set, bsz = partition_dataset(dataset)

    num_batches = ceil(len(train_set.dataset) / float(bsz))
    # print("started ",rank)
    t0 = time.monotonic()
    dist.send(tensor=torch.Tensor([0]), dst=0)
    for param in model.parameters():
        dist.recv(tensor=param.data, src=0)
    dist.barrier(group)
    for epoch in range(epochs):
        epoch_loss = 0.0
        numberOfSamples = 0
        for batch_idx, (data, target) in enumerate(train_set):
            numberOfSamples += data.size()[0]
            data, target = Variable(data), Variable(target)
            model.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            epoch_loss += loss.item()
            loss.backward()
            dist.send(tensor=torch.Tensor([rank]), dst=0)
            for param in model.parameters():
                dist.send(tensor=param.grad.data, dst=0)
            for param in model.parameters():
                dist.recv(tensor=param.data, src=0)
            # print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx * len(data), len(train_set.dataset), 100. * batch_idx / len(train_set), loss.item()))
        dist.send(tensor=torch.Tensor([0]), dst=0)
        for param in model.parameters():
            dist.recv(tensor=param.data, src=0)
        dist.barrier(group)
        # print('Rank ', dist.get_rank(), ', epoch ', epoch, ': ', epoch_loss / num_batches)
    dist.send(tensor=torch.Tensor([-1]), dst=0)
    t0 = time.monotonic() - t0
    t0 /= epochs
    # if rank == 1:
    #     print(t0)
    # print('Rank ', dist.get_rank(), ', epoch_loss ', epoch_loss/ num_batches, ', number of samples ', numberOfSamples)
    execTime = torch.Tensor([t0])
    loss_w = torch.Tensor([epoch_loss * numberOfSamples / num_batches])
    numberOfSamples = torch.Tensor([numberOfSamples])
    dist.all_reduce(loss_w, op=dist.ReduceOp.SUM, group=group)
    dist.all_reduce(numberOfSamples, op=dist.ReduceOp.SUM, group=group)
    dist.all_reduce(execTime, op=dist.ReduceOp.SUM, group=group)
    if rank == 1:
        print("\n C4 \n")
        print(loss_w / numberOfSamples, ',', execTime / (size - 1), ' s')
Example #24
 def my_gather(self, rank, size, group, sendbuf, recvbuf, root):
     if rank == root:
         for idx in range(size):
             if idx != rank:
                 dist.recv(recvbuf[idx], src=idx, group=group)
             else:
                 recvbuf[rank] = sendbuf
     else:
         dist.send(sendbuf, group=group, dst=root)
Example #25
 def get(self,
         key: str,
         dst: Optional[torch.Tensor] = None,
         shared: bool = False) -> Optional[torch.Tensor]:
     """Get a tensor from the server."""
     self._validate_get(key, dst=dst, shared=shared)
     cmd_rpc = torch.tensor(
         [GET_CMD, len(key), dst is None, 0, 0, 0], dtype=torch.long)
     metadata_pg = self._metadata_pg()
     td.send(cmd_rpc, self.server_rank, group=metadata_pg)
     td.send(_fromstring(key), self.server_rank, group=metadata_pg)
     if dst is None:
         meta = torch.full((2, ), -1, dtype=torch.long)
         td.recv(meta, src=self.server_rank, group=metadata_pg)
         ndim, ttype = meta
         if ndim.item() == -1:
             return None
         size = torch.full((ndim.item(), ), -1, dtype=torch.long)
         td.recv(size, src=self.server_rank, group=metadata_pg)
         dtype = _dtypes[ttype.item()]
         if shared:
             dst = allocate_shared_tensor(size.tolist(), dtype=dtype)
         else:
             dst = torch.empty(size.tolist(), dtype=dtype)
     start_t = time.monotonic()
     data_pgs = self._data_pgs()
     if data_pgs is None:
         td.recv(dst, src=self.server_rank)
     else:
         outstanding_work = []
         flattened_dst = dst.flatten()
         flattened_size = flattened_dst.shape[0]
         for idx, (pg, slice_) in enumerate(
                 zip(
                     data_pgs,
                     split_almost_equally(flattened_size,
                                          num_parts=len(data_pgs)),
                 )):
             outstanding_work.append(
                 td.irecv(
                     tensor=flattened_dst[slice_],
                     src=self.server_rank,
                     group=pg,
                     tag=idx,
                 ))
         for w in outstanding_work:
             w.wait()
     end_t = time.monotonic()
     if self.log_stats:
         stats_size = dst.numel() * dst.element_size()
         stats_time = end_t - start_t
         logger.debug(
             f"Received tensor {key} from server {self.server_rank}: "
             f"{stats_size:,} bytes "
             f"in {stats_time:,g} seconds "
             f"=> {stats_size / stats_time:,.0f} B/s")
     return dst
Example #26
 def sendParameters(network, rank, broadcast=False, workGroup=workGroup):
     global VERBOSE
     if VERBOSE:
         print("Server -> Worker", rank)
     for param in network.parameters():
         if broadcast:
             dist.broadcast(param.data, src=dist.get_rank())
         else:
             dist.send(param.data, dst=rank, tag=0)
Example #27
    def join(self) -> None:
        """All clients should call join at the end, which will allow the server
        to exit.
        """

        cmd_rpc = torch.tensor([JOIN_CMD, 0, 0, 0, 0, 0], dtype=torch.long)
        td.send(cmd_rpc, self.server_rank)
        ack = torch.empty((1, ))
        td.recv(ack, src=self.server_rank)
Example #28
 def send_message(self, message_code, payload, dst=0):
     """Sends a message to a destination
     Concatenates both the message code and destination with the payload into a single tensor and then sends that as a tensor
     """
     _LOGGER.info("SENDING MESSAGE: {} RANK: {}".format(
         message_code, dist.get_rank()))
     m_parameter = quantize_tensor(payload, self.quantize_num_bits)
     meta = torch.Tensor([dist.get_rank(), message_code]).to(torch.int16)
     m_parameter = torch.cat((meta, m_parameter))
     dist.send(tensor=m_parameter, dst=dst)
Example #29
def run(rank, size):
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        for i in range(1, size):
            dist.send(tensor=tensor, dst=i)
    else:
        # Receive tensor from process 0
        dist.recv(tensor=tensor, src=0)
    print('Rank ', rank, ' has data ', tensor[0])
Example #30
def run(rank, size):
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        dist.send(tensor=tensor, dst=1)
    else:
        # Receive tensor from process 0
        dist.recv(tensor=tensor, src=0)
    print('Rank ', rank, ' has data ', tensor[0])
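Examples #29 and #30 follow the point-to-point pattern from the PyTorch distributed tutorial; a run(rank, size) function like the one above is normally driven by a launcher along these lines (the address, port, and world size are placeholders):

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def init_process(rank, size, fn, backend='gloo'):
    """ Initialize the distributed environment, then run fn(rank, size). """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

if __name__ == "__main__":
    size = 2
    processes = []
    mp.set_start_method("spawn")
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()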
Example #31
    def test_send_recv(self):
        rank = dist.get_rank()
        tensor = _build_tensor(rank + 1)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(src + 1, value=-1)
            expected_tensor = _build_tensor(src + 1)
            dist.recv(tensor, src)
            self.assertEqual(tensor, expected_tensor)

        self._barrier()
Example #32
    def test_send_recv_any_source(self):
        rank = dist.get_rank()
        tensor = _build_tensor(10, rank)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        recv_ranks = set()
        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(10, value=-1)
            dist.recv(tensor)
            recv_ranks.add(tensor.resize_(1)[0])

        self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
        self._barrier()
Example #33
    def test_irecv(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
            requests = [
                dist.irecv(expected_tensors[src - 1], src) for src in range(1, world_size)
            ]

            for src in range(1, world_size):
                requests[src - 1].wait()
                self.assertTrue(requests[src - 1].is_completed())
                self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
        else:
            tensor = _build_tensor(rank, 10)
            dist.send(tensor, 0)

        self._barrier()
Example #34
 def send(self, var):
     dist.send(tensor=var, dst=self.other)
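A matching receive wrapper (hypothetical, assuming the same `self.other` peer attribute) would be:

 def recv(self, var):
     dist.recv(tensor=var, src=self.other)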
Example #35
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.broadcast(tensor, 0)
dist.barrier()

if rank == 0:
    print_header("send from 0 to 1")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.send(tensor, 1)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
elif rank == 1:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.recv(tensor, 0)
dist.barrier()

if rank == 0:
    print_header("reduce")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)