Example #1
def dist_sgd(model, rank):
    # Subgroup created in the original example (not used below).
    group = dist.new_group([0, 1, 3])
    for param in model.parameters():
        # Copies of this rank's parameter to send to each neighbor, plus
        # same-shaped buffers for the values received back.
        sending_right = copy.deepcopy(param.data)
        sending_left = copy.deepcopy(sending_right)
        recving_left_1 = copy.deepcopy(sending_right)
        recving_right_1 = copy.deepcopy(sending_right)
        size = dist.get_world_size()
        left = ((rank - 1) + size) % size
        right = (rank + 1) % size
        # Exchange with the right neighbor: even ranks send first and odd
        # ranks receive first, so the paired operations cannot deadlock.
        if rank % 2 == 0:
            req = dist.isend(sending_right, dst=right)
            req.wait()
            req = dist.irecv(recving_left_1, src=left)
            req.wait()
        else:
            req = dist.irecv(recving_left_1, src=left)
            req.wait()
            req = dist.isend(sending_right, dst=right)
            req.wait()
        dist.barrier()
        # Exchange with the left neighbor, again ordered by rank parity.
        if rank % 2 == 0:
            req = dist.isend(sending_left, dst=left)
            req.wait()
            req = dist.irecv(recving_right_1, src=right)
            req.wait()
        else:
            req = dist.irecv(recving_right_1, src=right)
            req.wait()
            req = dist.isend(sending_left, dst=left)
            req.wait()
        # Average the local parameter with the two received neighbor values.
        param.data = (sending_left + recving_left_1 + recving_right_1) / 3
Example #2
def recv_obj(src, group):
    size = torch.tensor(1, dtype=torch.int32)
    dist.irecv(size, src=src, group=group).wait()

    data = torch.zeros(size=(size, ), dtype=torch.int8)
    dist.irecv(data, src=src, group=group).wait()
    buf = data.numpy().tobytes()
    return pickle.loads(buf)
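The matching sender is not shown here; a minimal counterpart sketch (the send_obj name and the use of numpy are assumptions, mirroring the receive logic above) pickles the object, sends the byte count as an int32 tensor, then sends the payload as an int8 tensor:

import numpy as np

def send_obj(obj, dst, group):
    # Serialize the object and reinterpret the bytes as int8 values,
    # matching the int8 buffer that recv_obj allocates.
    buf = pickle.dumps(obj)
    data = torch.from_numpy(np.frombuffer(buf, dtype=np.int8).copy())
    # Send the length first so the receiver can size its buffer, then the payload.
    size = torch.tensor(data.numel(), dtype=torch.int32)
    dist.isend(size, dst=dst, group=group).wait()
    dist.isend(data, dst=dst, group=group).wait()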
Example #3
        def _setup_generators(self):
            seed = torch.empty(size=(), dtype=torch.long)
            dist.irecv(tensor=seed,
                       src=comm.get().get_ttp_rank(),
                       group=self.group).wait()
            dist.barrier(group=self.group)

            self.generator = torch.Generator()
            self.generator.manual_seed(seed.item())
Example #4
    def recv_obj(self, src, group=None):
        if group is None:
            group = self.main_group

        size = torch.tensor(1, dtype=torch.int32)
        dist.irecv(size, src=src, group=group).wait()

        data = torch.zeros(size=(size,), dtype=torch.int8)
        dist.irecv(data, src=src, group=group).wait()
        buf = data.numpy().tobytes()
        return pickle.loads(buf)
Example #5
def broadcast(data, rank, world_size, recv_buff_l, recv_buff_r):
    # Neighbor ranks on a ring.
    left = ((rank - 1) + world_size) % world_size
    right = (rank + 1) % world_size
    # Send to the left neighbor while receiving from the right; posting both
    # non-blocking ops before waiting keeps the ring from deadlocking.
    send_req_l = dist.isend(data, dst=left)
    recv_req_r = dist.irecv(recv_buff_r, src=right)
    recv_req_r.wait()
    send_req_l.wait()
    # Repeat in the opposite direction.
    send_req_r = dist.isend(data, dst=right)
    recv_req_l = dist.irecv(recv_buff_l, src=left)
    recv_req_l.wait()
    send_req_r.wait()
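The receive buffers are allocated by the caller and must match data in shape and dtype; a hypothetical invocation (assuming the default process group is already initialized) might look like:

# Hypothetical usage sketch for the ring exchange above.
data = torch.ones(4)
recv_buff_l = torch.zeros_like(data)
recv_buff_r = torch.zeros_like(data)
broadcast(data, dist.get_rank(), dist.get_world_size(), recv_buff_l, recv_buff_r)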
Example #6
    def recv_obj(self, src, group=None):
        """Receives a tensor from a source `src`."""
        if group is None:
            group = self.main_group

        size = torch.tensor(1, dtype=torch.int32)
        dist.irecv(size, src=src, group=group).wait()

        data = torch.empty(size=(size, ), dtype=torch.int8)
        dist.irecv(data, src=src, group=group).wait()
        buf = data.numpy().tobytes()
        return pickle.loads(buf)
Example #7
def timed_pt2pt(input, args):
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    sync_all()
    # Warmups, establish connections, etc.
    for i in range(args.warmups):
        if dist.get_rank() == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        if dist.get_rank() == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)
    sync_all()

    # time the actual comm op args.trials times and average it
    pre = time.perf_counter()
    for i in range(args.trials):
        if dist.get_rank() == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        if dist.get_rank() == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)

    sync_all()
    duration = time.perf_counter() - pre

    # maintain and clean performance data
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    n = dist.get_world_size()
    tput, busbw = get_bw('pt2pt', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    if not args.raw:
        size = convert_size(size)

    print_rank_0(
        f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
Example #8
        def _setup_generators(self):
            seed = torch.empty(size=(), dtype=torch.long)
            dist.irecv(
                tensor=seed, src=comm.get().get_ttp_rank(), group=self.ttp_group
            ).wait()
            dist.barrier(group=self.ttp_group)

            self.generator = torch.Generator(device="cpu")
            self.generator.manual_seed(seed.item())

            if torch.cuda.is_available():
                self.generator_cuda = torch.Generator(device="cuda")
                self.generator_cuda.manual_seed(seed.item())
            else:
                self.generator_cuda = None
Example #9
def train_model(model, train_loader, optimizer, criterion, epoch, rank):
    """
    model (torch.nn.Module): The model to train
    train_loader (torch.utils.data.DataLoader): Training data loader
    optimizer (torch.optim.Optimizer): An instance of an optimizer, usually SGD
    criterion (nn.CrossEntropyLoss): Loss function used to train the network
    epoch (int): Current epoch number
    rank (int): Rank of the calling process
    """

    # remember to exit the train loop at the end of the epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # Your code goes here!
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        train_loss = criterion(output, target)
        train_loss.backward()
        for p in model.parameters():
            req = dist.isend(tensor=p.grad, dst=0)
            req.wait()
            req = dist.irecv(tensor=p.grad, src=0)
            req.wait()
        optimizer.step()
        if batch_idx % 20 == 0:
            print(batch_idx, "loss: ", train_loss.item())
            now = datetime.now()
        if batch_idx == 10:
            later = datetime.now()
            print("average time: ", (later - now).total_seconds()/9)
Example #10
    def _receive_models_from_selected_clients(self, selected_client_ids):
        self.conf.logger.log(f"Master waits to receive the local models.")
        dist.barrier()

        # init the placeholders to recv the local models from workers.
        flatten_local_models = dict()
        for selected_client_id in selected_client_ids:
            arch = self.clientid2arch[selected_client_id]
            client_tb = TensorBuffer(
                list(self.client_models[arch].state_dict().values()))
            client_tb.buffer = torch.zeros_like(client_tb.buffer)
            flatten_local_models[selected_client_id] = client_tb

        # async to receive model from clients.
        reqs = []
        for client_id, world_id in zip(selected_client_ids, self.world_ids):
            req = dist.irecv(tensor=flatten_local_models[client_id].buffer,
                             src=world_id)
            reqs.append(req)

        for req in reqs:
            req.wait()

        dist.barrier()
        self.conf.logger.log(f"Master received all local models.")
        return flatten_local_models
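The worker side of this exchange is not shown; a heavily hedged sketch of what it could look like (the model variable and the barrier placement are assumptions, and TensorBuffer is the same helper used above) is:

    # Hypothetical worker-side counterpart: flatten the local model into the
    # same TensorBuffer layout the master allocates, then send it to rank 0.
    dist.barrier()
    flatten_local_model = TensorBuffer(list(model.state_dict().values()))
    dist.isend(tensor=flatten_local_model.buffer, dst=0).wait()
    dist.barrier()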
Example #11
 def _pull(self):
     # Receive the central variable from the master.
     for group in self.param_groups:
         for p in group["params"]:
             request = dist.irecv(p.data, src=0)
     # Wait for the last receive call.
     request.wait()
Example #12
def print_loss_file(file_name, loss_iteration, rank, size):
    if rank != 0:
        data = torch.tensor(loss_iteration)
        dist.barrier()
        req = dist.isend(data, dst=0)
        req.wait()
    else:
        loss_list_tensor = []
        loss_iter = []
        data = torch.tensor(loss_iteration)
        for i in range(size):
            data = copy.deepcopy(data)
            loss_list_tensor.append(data)
        dist.barrier()
        for i in range(size - 1):
            req = dist.irecv(loss_list_tensor[i + 1], src=i + 1)
            req.wait()
        for j in range(len(loss_list_tensor[0])):
            element = 0
            for i in range(size):
                element += loss_list_tensor[i][j].item()
            loss_iter.append(element / size)
    if rank == 0:
        file_object = open(file_name, 'w')
        for loss in loss_iter:
            file_object.write(str(loss))
            file_object.write('\t')
        file_object.close()
Example #13
        def _setup_generators(self):
            """Setup RNG generator shared between each party (client) and the TTPServer"""
            seed = torch.empty(size=(), dtype=torch.long)
            dist.irecv(
                tensor=seed, src=comm.get().get_ttp_rank(), group=self.ttp_group
            ).wait()
            dist.barrier(group=self.ttp_group)

            self.generator = torch.Generator(device="cpu")
            self.generator.manual_seed(seed.item())

            if torch.cuda.is_available():
                self.generator_cuda = torch.Generator(device="cuda")
                self.generator_cuda.manual_seed(seed.item())
            else:
                self.generator_cuda = None
Example #14
    def _agg(self, data, op):
        """Aggregate data using `op` operation.

        Args:
            data (:obj:`torch.Tensor`): A Tensor to be aggregated.
            op (str): Aggregation method, such as `avg`, `sum`, `min`, `max`, etc.

        Returns:
            :obj:`torch.Tensor`: An aggregated tensor.
        """
        # Create some tensors to host the values from neighborhood.
        local_data = {i: torch.zeros_like(data) for i in self.neighbors}
        local_data[self.rank] = data

        reqs = []
        for node in self.neighbors:
            reqs.append(dist.isend(tensor=local_data[self.rank], dst=node))
            reqs.append(dist.irecv(tensor=local_data[node], src=node))

        for req in reqs:
            req.wait()

        # Aggregate local_data
        if op == "avg":
            output = sum(local_data.values()) / (len(self.neighbors) + 1)
        else:
            raise NotImplementedError("op {} is not supported yet.".format(op))

        return output
Example #15
 def irecv(self, shape=None, dtype=None):
     if shape is None:
         shape = self.defaultShape
     if dtype is None:
         dtype = self.defaultDType
     self.tensor = torch.zeros(shape, dtype=dtype)
     self.tensor[0] = SEMAPHORE
     self.req = dist.irecv(self.tensor, src=self.rank)
Example #16
 def get(self,
         key: str,
         dst: Optional[torch.Tensor] = None,
         shared: bool = False) -> Optional[torch.Tensor]:
     """Get a tensor from the server."""
     self._validate_get(key, dst=dst, shared=shared)
     cmd_rpc = torch.tensor(
         [GET_CMD, len(key), dst is None, 0, 0, 0], dtype=torch.long)
     metadata_pg = self._metadata_pg()
     td.send(cmd_rpc, self.server_rank, group=metadata_pg)
     td.send(_fromstring(key), self.server_rank, group=metadata_pg)
     if dst is None:
         meta = torch.full((2, ), -1, dtype=torch.long)
         td.recv(meta, src=self.server_rank, group=metadata_pg)
         ndim, ttype = meta
         if ndim.item() == -1:
             return None
         size = torch.full((ndim.item(), ), -1, dtype=torch.long)
         td.recv(size, src=self.server_rank, group=metadata_pg)
         dtype = _dtypes[ttype.item()]
         if shared:
             dst = allocate_shared_tensor(size.tolist(), dtype=dtype)
         else:
             dst = torch.empty(size.tolist(), dtype=dtype)
     start_t = time.monotonic()
     data_pgs = self._data_pgs()
     if data_pgs is None:
         td.recv(dst, src=self.server_rank)
     else:
         outstanding_work = []
         flattened_dst = dst.flatten()
         flattened_size = flattened_dst.shape[0]
         for idx, (pg, slice_) in enumerate(
                 zip(
                     data_pgs,
                     split_almost_equally(flattened_size,
                                          num_parts=len(data_pgs)),
                 )):
             outstanding_work.append(
                 td.irecv(
                     tensor=flattened_dst[slice_],
                     src=self.server_rank,
                     group=pg,
                     tag=idx,
                 ))
         for w in outstanding_work:
             w.wait()
     end_t = time.monotonic()
     if self.log_stats:
         stats_size = dst.numel() * dst.element_size()
         stats_time = end_t - start_t
         logger.debug(
             f"Received tensor {key} from server {self.server_rank}: "
             f"{stats_size:,} bytes "
             f"in {stats_time:,g} seconds "
             f"=> {stats_size / stats_time:,.0f} B/s")
     return dst
Example #17
    def handle_store(
        self,
        rank: int,
        key: str,
        ndim: int,
        accum: int,
        overwrite: int,
        ttype: int,
    ) -> None:
        if ndim == -1:
            assert key in self.parameters
            size = self.parameters[key].size()
        else:
            size = torch.empty((ndim, ), dtype=torch.long)
            td.recv(size, src=rank)
            size = size.tolist()
        dtype = _dtypes[ttype]
        if not accum and overwrite and key in self.parameters:
            # avoid holding onto 2x the memory
            del self.parameters[key]
        data = torch.empty(size, dtype=dtype)

        start_t = time.monotonic()
        if self.groups is None:
            td.recv(tensor=data, src=rank)
        else:
            outstanding_work = []
            flattened_data = data.flatten()
            flattened_size = flattened_data.shape[0]
            for idx, (pg, slice_) in enumerate(
                    zip(
                        self.groups,
                        split_almost_equally(flattened_size,
                                             num_parts=len(self.groups)))):
                outstanding_work.append(
                    td.irecv(tensor=flattened_data[slice_],
                             src=rank,
                             group=pg,
                             tag=idx))
            for w in outstanding_work:
                w.wait()
        end_t = time.monotonic()
        if self.log_stats:
            stats_size = data.numel() * data.element_size()
            stats_time = end_t - start_t
            logger.debug(f"Received tensor {key} from client {rank}: "
                         f"{stats_size:,} bytes "
                         f"in {stats_time:,g} seconds "
                         f"=> {stats_size / stats_time:,.0f} B/s")

        if accum:
            self.parameters[key] += data
        elif (key not in self.parameters) or overwrite:
            self.parameters[key] = data
Example #18
 def my_igather(self, rank, size, group, sendbuf, recvbuf, root):
     req = []
     if rank == root:
         for idx in range(size):
             if idx != rank:
                 req.append(dist.irecv(recvbuf[idx], src=idx, group=group))
             else:
                 recvbuf[rank] = sendbuf
     else:
         req.append(dist.isend(sendbuf, group=group, dst=root))
     return req
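my_igather returns the outstanding requests rather than waiting on them, so the caller completes the collective; a rough usage sketch (the comm instance and buffer shapes are assumptions):

# Hypothetical usage: every rank contributes one tensor, the root gathers
# them into recvbuf, and the caller waits on the returned requests.
rank = dist.get_rank()
world_size = dist.get_world_size()
sendbuf = torch.tensor([float(rank)])
recvbuf = [torch.zeros(1) for _ in range(world_size)]
reqs = comm.my_igather(rank, world_size, dist.group.WORLD, sendbuf, recvbuf, root=0)
for req in reqs:
    req.wait()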
Example #19
def dist_data(data, src=0, TType=torch.DoubleTensor):
    '''
    distribute a row-major matrix
    '''
    rank = dist.get_rank()
    size = dist.get_world_size()

    if rank == src:
        p = data.shape[0]
        q = data.shape[1]
        shape = torch.LongTensor([p, q])
        if p % size != 0:
            sizes = [p // size + 1 for i in range(size - 1)
                     ] + [p // size + 1 - (size - p % size)]
        else:
            sizes = [p // size for i in range(size)]
        p, q = shape[0], shape[1]
        sizes = torch.LongTensor(sizes)
        dist.broadcast(shape, src)
        dist.broadcast(sizes, src)
    else:
        shape = torch.LongTensor(2)
        sizes = torch.LongTensor(size)
        dist.broadcast(shape, src)
        dist.broadcast(sizes, src)
        shape = list(shape)
        p, q = shape[0], shape[1]

    p_chunk = sizes[rank].item()
    q_chunk = q.item()
    # print(rank, p_chunk, q_chunk)
    chunk = TType(p_chunk, q_chunk)

    reqs = []
    if rank == src:
        data_ = TType(data.shape).copy_(data)
        sizes_int = tuple(x.item() for x in tuple(sizes))
        data_split = torch.split(data_, sizes_int)

        chunk.copy_(data_split[src])
        for i in range(size):
            if i == src: continue
            synchronize()
            reqs.append(dist.isend(data_split[i], i))
    else:
        synchronize()
        reqs.append(dist.irecv(chunk, src))

    for req in reqs:
        req.wait()

    dist.barrier()
    return THDistMat(shape, sizes, chunk, True)
Example #20
    def _agg(self, data, op, force_wait=True):
        """Aggregate data using `op` operation.
        Args:
            data (:obj:`torch.Tensor`): A Tensor to be aggregated.
            op (str): Aggregation method, such as `avg`, `sum`, `min`, `max`, `weighted`, etc.
        Returns:
            :obj:`torch.Tensor`: An aggregated tensor.
        """
        with self.timer('com/preparation'):
            # Create some tensors to host the values from neighborhood.
            local_data = {
                i: torch.empty_like(data)
                for i in self.neighbor_ranks
            }
            local_data[self.rank] = data
        with self.timer('com/data_exchange'):
            # async send data.
            reqs = []
            for node_rank in self.neighbor_ranks:
                with self.timer('com/isend'):
                    reqs.append(
                        dist.isend(tensor=local_data[self.rank],
                                   dst=node_rank))
                with self.timer('com/irecv'):
                    reqs.append(
                        dist.irecv(tensor=local_data[node_rank],
                                   src=node_rank))

        # wait until finish.
        if force_wait:
            with self.timer('com/force_wait'):
                self.complete_wait(reqs)

                # Aggregate local_data
                if op == "avg":
                    output = sum(local_data.values()) / (self.world_size + 1)
                elif op == "weighted":
                    output = sum([
                        tensor * self.neighbors_info[rank]
                        for rank, tensor in local_data.items()
                    ])
                elif op == "get_raw_sync_data":
                    output = local_data
                else:
                    raise NotImplementedError(
                        "op {} is not supported yet.".format(op))
            return output
        else:
            if op == "get_raw_sync_data":
                return reqs, local_data
            else:
                raise NotImplementedError(
                    "op {} is not supported yet.".format(op))
Example #21
    def irecv(self, collectiveArgs, src_rank, retFlag=False, tag=0):
        retObj = dist.irecv(
            tensor=collectiveArgs.opTensor,
            src=src_rank,
            group=collectiveArgs.group,
            tag=tag
        )

        collectiveArgs.waitObj.append(retObj)

        if retFlag:
            return retObj
Example #22
 def run(rank, size):
     tensor = torch.zeros(1)
     req = None
     if rank == 0:
         tensor += 1
         req = dist.isend(tensor=tensor, dst=1)
         print('Rank 0 started sending')
     else:
         req = dist.irecv(tensor=tensor, src=0)
         print('Rank 1 started receiving')
     req.wait()
     print('Rank ', rank, ' has data ', tensor[0])
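Tutorial-style examples like this assume the default process group has already been initialized on every rank; a minimal launcher sketch (backend, address, and port here are assumptions) in the spirit of the PyTorch distributed tutorial:

import os
import torch.distributed as dist
import torch.multiprocessing as mp

def init_process(rank, size, fn, backend='gloo'):
    # Rendezvous settings are placeholders; adjust for your environment.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

if __name__ == "__main__":
    size = 2
    mp.set_start_method('spawn')
    processes = []
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()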
Example #23
def p2p_test_non_block(rank, size):
    tensor = torch.tensor(rank, dtype=torch.int32)
    req = None
    if rank % 2 == 0:
        dst = rank + 1
        req = dist.isend(tensor=tensor, dst=dst)
    else:
        src = rank - 1
        req = dist.irecv(tensor=tensor, src=src)

    req.wait()

    print('Rank %d value %d' % (rank, tensor.item()))
Example #24
def run(tensor):
    req = None
    for nodes in range(num_nodes):
        if rank == 0:
            # Send the tensor to process 1
            req = dist.isend(tensor=tensor, dst=1)
            print('Rank 0 started sending to Rank 1')
        else:
            # Receive tensor from process 0
            req = dist.irecv(tensor=tensor, src=0)
            print('Rank 1 started receiving')
        #req.wait()
        print('Rank ', rank, ' has received data ')
    return
Example #25
def run1(rank, size):
    """Non-blocking point-to-point communication."""
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()
    print('Rank ', rank, ' has data ', tensor[0])
Example #26
def run(rank, size):
    tensor = torch.zeros(1)
    req = None
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()  # wait here for communication to finish
    print('Rank ', rank, ' has data ', tensor[0])
Example #27
def run_send_recv_nonblocking(rank, size):
    tensor = torch.zeros(1)
    req = None
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()
    print('Rank ', rank, ' has data ', tensor[0])
Example #28
def p2p_run(rank, size):
    """ Distributed function to be implemented later. """
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print('Rank 0 started sending')
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)
        print('Rank 1 started receiving')
    req.wait()
    print('Rank ', rank, ' has data ', tensor[0])
Example #29
def run_nonblocking(rank, size):
    """ non-Blocking point-2-point communication """
    tensor = torch.zeros(1)
    req = None
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        req = dist.isend(tensor=tensor, dst=1)
        print("Rank 0 started sending")
    else:
        # Receive tensor from process 0
        req = dist.irecv(tensor=tensor, src=0)

    req.wait()
    print("Rank ", rank, ' has data ', tensor[0])
Example #30
def run(rank, size):
    """Non-Blocking point to point communication."""
    tensor = torch.zeros(1)
    req = None
    if rank == 0:
        # The master node is going to send the tensor with data `1` to node 1.
        tensor += 1
        req = dist.isend(tensor=tensor, dst=1)
        print("Rank 0 started sending")
    else:
        # The other nodes will receive the tensor with data `1`.
        req = dist.irecv(tensor=tensor, src=0)
        print("Rank 1 started receiving")
    req.wait()  # Guarantee that the communication took place
    print("RANK ", rank, " has data ", tensor[0])
Example #31
    def test_irecv(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
            requests = [
                dist.irecv(expected_tensors[src - 1], src) for src in range(1, world_size)
            ]

            for src in range(1, world_size):
                requests[src - 1].wait()
                self.assertTrue(requests[src - 1].is_completed())
                self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
        else:
            tensor = _build_tensor(rank, 10)
            dist.send(tensor, 0)

        self._barrier()
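_build_tensor is a helper from the surrounding test suite; roughly (this reconstruction is an assumption), it creates a cube-shaped tensor filled with a given value:

def _build_tensor(size, value=None):
    # Assumed shape of the test helper: a size x size x size tensor
    # filled with `value`, defaulting to `size` itself.
    if value is None:
        value = size
    return torch.empty(size, size, size).fill_(value)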