Example #1
def fp_recv_proc(conv_wid, conv_wn, fc_wid, fc_wn, wid, wn, pred_wid, succ_wid,
                 comm_rank, world_sz, bs, subbs, pd, input_shp, output_shp,
                 fp_head_list, shared_cnters, global_step, sta_lidx, end_lidx):
    #proc fp_send:0; fp_recv:1; bp_send:2; bp_recv:3
    iter_thresh = int(bs / subbs)
    allreduce_group, fp_gather_group, bp_scatter_group = init_processes(
        comm_rank, world_sz)
    #print("fp_recv_proc comm_rank=", comm_rank)
    if wid == 0 or wid == 1:
        shared_cnters[0] = iter_thresh
        return
    src_rank = pred_wid * 4
    place_tensor = torch.zeros(1)
    while True:
        if shared_cnters[0] < iter_thresh:
            #print("fp recv  ", comm_rank, " <- ", src_rank, " ", shared_cnters[0], " ", bs)
            if wid == 3:
                dist.recv(tensor=fp_head_list[shared_cnters[0]], src=src_rank)
            elif wid == 2:
                glist = list(fp_head_list[shared_cnters[0]].chunk(chunks=2,
                                                                  dim=0))
                place_tensor = glist[0]
                #print("place_tensor sz ", place_tensor.size())
                glist.append(place_tensor)
                dist.gather(tensor=place_tensor,
                            gather_list=glist,
                            dst=comm_rank,
                            group=fp_gather_group,
                            async_op=False)
            shared_cnters[0] += 1
            #print("wid=",wid, " fp recv ")
        else:
            time.sleep(0.001)
Example #2
def alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list):
    """Each process scatters list of input tensors to all processes in a cluster
    and return gathered list of tensors in output list.

    Parameters
    ----------
    rank : int
        The rank of the current worker
    world_size : int
        The size of the entire cluster (total number of processes)
    output_tensor_list : list of tensors
        The received tensors
    input_tensor_list : list of tensors
        The tensors to exchange
    """
    # send tensor to each target trainer using torch.distributed.isend
    # isend is async
    senders = []
    for i in range(world_size):
        if i == rank:
            output_tensor_list[i] = input_tensor_list[i].to(th.device('cpu'))
        else:
            sender = dist.isend(input_tensor_list[i].to(th.device('cpu')),
                                dst=i)
            senders.append(sender)

    # blocking receives from every other rank
    for i in range(world_size):
        if i != rank:
            dist.recv(output_tensor_list[i], src=i)

    # make sure the outstanding isend requests have completed before the barrier
    for sender in senders:
        sender.wait()
    dist.barrier()
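For context, a minimal driver sketch for alltoallv_cpu, assuming `th` is torch, `dist` is torch.distributed, and a gloo process group has already been initialized for `world_size` ranks; the tensor shape (4,) and the `demo_alltoallv` name are illustrative only.

import torch as th
import torch.distributed as dist

def demo_alltoallv(rank, world_size):
    # each rank addresses one tensor to every destination rank
    input_tensor_list = [th.full((4,), float(rank)) for _ in range(world_size)]
    # pre-allocated receive buffers with the shapes the peers will send
    output_tensor_list = [th.empty(4) for _ in range(world_size)]
    alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list)
    # output_tensor_list[i] now holds the tensor that rank i sent to this rank
    print(rank, [int(t[0].item()) for t in output_tensor_list])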
Example #3
def bp_recv_proc(conv_wid, conv_wn, fc_wid, fc_wn, wid, wn, pred_wid, succ_wid,
                 comm_rank, world_sz, bs, subbs, pd, input_shp, output_shp,
                 bp_tail_list, shared_cnters, global_step, sta_lidx, end_lidx):
    #fp_send:0; fp_recv:1; bp_send:2; bp_recv:3
    iter_thresh = int(bs / subbs)
    allreduce_group, fp_gather_group, bp_scatter_group = init_processes(
        comm_rank, world_sz)
    print("bp_recv_proc comm_rank=", comm_rank)
    if wid == wn - 1:
        shared_cnters[3] = iter_thresh
        return
    src_rank = succ_wid * 4 + 2
    while True:
        if shared_cnters[3] < iter_thresh:
            if wid == 2:
                dist.recv(tensor=bp_tail_list[shared_cnters[3]], src=src_rank)
            elif wid == 0 or wid == 1:
                dist.scatter(tensor=bp_tail_list[shared_cnters[3]],
                             scatter_list=[],
                             src=src_rank,
                             group=bp_scatter_group,
                             async_op=False)
            shared_cnters[3] += 1
            #print("wid=",wid, " bp_recv")
        else:
            time.sleep(0.001)
Example #4
def runServer(model):
    # model = Net()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    optimizer.zero_grad()
    numberOfTimes = dist.get_world_size() - 1
    for param in model.parameters():
        param.sum().backward()
    tag = torch.zeros(1)
    while True:
        optimizer.zero_grad()
        src = dist.recv(tensor=tag)
        # print("Reached ", src)
        if tag[0] == 0:
            for param in model.parameters():
                dist.send(tensor=param.data, dst=src)
        elif tag[0] == -1:
            numberOfTimes -= 1
            if numberOfTimes == 0:
                # print("------------- Breaking ----------------")
                break
        else:
            for param in model.parameters():
                dist.recv(tensor=param.grad.data, src=src)
            optimizer.step()
            optimizer.zero_grad()
            for param in model.parameters():
                dist.send(tensor=param.data, dst=src)
Example #5
    def handle_store(
        self,
        rank: int,
        key: str,
        ndim: int,
        accum: int,
        overwrite: int,
        ttype: int,
    ) -> None:
        if ndim == -1:
            assert key in self.parameters
            size = self.parameters[key].size()
        else:
            size = torch.empty((ndim, ), dtype=torch.long)
            td.recv(size, src=rank)
            size = size.tolist()
        tensor_type = _tensor_types[ttype]
        if not accum and overwrite and key in self.parameters:
            # avoid holding onto 2x the memory
            del self.parameters[key]
        data = tensor_type(*size)

        td.recv(data, src=rank)

        if accum:
            self.parameters[key] += data
        elif (key not in self.parameters) or overwrite:
            self.parameters[key] = data
Example #6
    def backward_rank0(semaphore):
        batch_idx = 0
        grad_recv = torch.zeros(shapes[0])
        dist.recv(tensor=grad_recv, src=1)
        while True:
            grad_recv = grad_recv.cuda(0)
            print(" backwardbatch_idx:" + str(batch_idx))
            try:
                loss = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break

            loss.backward(grad_recv)
            if batch_idx % 3 == 0:
                # print("step: " + str(batch_idx))
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                print("eq...")
                break
            grad_recv = transfer(6, None, shapes[0])  #shapes[0]
            print("backward send.....")
        print("backward end..")
Example #7
    def backward(ctx, grad_output):
        tensor, = ctx.saved_tensors
        # TODO: Add ctx.needs_input_grad check
        grad_tensor = torch.zeros_like(tensor)
        dist.recv(grad_tensor, ctx.dst, ctx.group, ctx.tag)

        return grad_tensor, None, None, None
Example #8
    def backward_rank1():
        residual = None
        batch_idx = 0
        grad_recv1 = torch.zeros(shapes[1], dtype=torch.int8)
        #grad_recv1 = torch.HalfTensor(torch.Size(shapes[1]))
        dist.recv(tensor=grad_recv1, src=2)
        while True:
            print(" backward batch_idx:" + str(batch_idx))
            #grad_recv1 = unpack(grad_recv1.cuda(), shapes[1])
            grad_recv1 = dequantize(grad_recv1.cuda().float())
            #grad_recv1 = grad_recv1.cuda()
            try:
                inputs, outputs = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break
            inputs.requires_grad_()
            outputs.backward(grad_recv1)

            #inputs_grad = quantize(inputs.grad, char=True).cpu()
            inputs_grad, residual = compress(inputs.grad, residual=residual)
            inputs_grad = inputs_grad.cpu()
            #inputs_grad = inputs.grad.cpu()
            if batch_idx % 2 == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                transfer(3, inputs_grad, None)
                print("backend In send..")
                break
            grad_recv1 = transfer(3, inputs_grad, shapes[1])
            print("backward send.......")
        print("backard end....")
Example #9
def allreduce(r, world, peers, tensor):
    r = dist.get_rank()
    world = dist.get_world_size()
    peers = list(filter(lambda i: i != r, list(range(world))))
    sizeOfTensor = list(tensor.size())[0]
    chunksize = sizeOfTensor // world
    reqs = [
        dist.isend(tensor=tensor[i * chunksize:(i + 1) * chunksize], dst=i)
        for i in peers
    ]  # K concurrent transfers
    recv = torch.zeros(sizeOfTensor // (world))
    for i in peers:  # K steps
        dist.recv(tensor=recv, src=i)  # K / ??? values...
        tensor[r * chunksize:(r + 1) * chunksize] += recv[:]
    for req in reqs:
        req.wait()
    # second phase: broadcast the chunk this rank reduced back to every peer
    # (an all-gather of the reduced chunks)
    reqs = [
        dist.isend(tensor=tensor[r * chunksize:(r + 1) * chunksize], dst=i)
        for i in peers
    ]
    for i in peers:
        dist.recv(tensor=recv, src=i)
        tensor[i * chunksize:(i + 1) * chunksize] = recv
    for req in reqs:
        req.wait()
Example #10
    def backward_rank1(semaphore, start_event, start_event2):

        start_event.wait()

        batch_idx = 0
        while True:
            try:
                #semaphore.release()
                print("before grad recv...")
                grad_recv1 = torch.zeros([args.batch_size, 512, 2, 2], dtype=torch.int8)
                dist.recv(tensor=grad_recv1, src=2)
                print("after grad recv.....")
            except RuntimeError as error:
                print("backward runtime error")
                send_opt = dist.isend(tensor=torch.zeros(0), dst=0)
                send_opt.wait()
                break
            grad_recv1 = dequantize(grad_recv1.cuda(0).float())
            inputs, outputs = outputs_queue.get(block=False)
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                optimizer.zero_grad()

            inputs_grad = quantize(inputs.grad, char=True).cpu()
            print(inputs_grad.size())
            if batch_idx == 0:
                start_event2.set()
            #send_opt = dist.isend(tensor=inputs_grad, dst=0)
            #send_opt.wait()
            dist.send(tensor=inputs_grad, dst=0)
            batch_idx += 1
Example #11
 def recv(self, collectiveArgs, src_rank, retFlag=False, tag=0):
     dist.recv(
         tensor=collectiveArgs.opTensor,
         src=src_rank,
         group=collectiveArgs.group,
         tag=tag
     )
Example #12
def transfer4backend1(tag, send_buf, flag=False):

    if not flag:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1),
                  dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None

    else:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1),
                  dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        try:
            shape_buf = torch.zeros([1], dtype=torch.short)
            dist.recv(tensor=shape_buf, src=left)
            recv_buf = torch.zeros(torch.Size(shape_buf.tolist()))
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
Example #13
 def cumsum(self, dim):
     new_chunk = self.chunk.cumsum(dim)
     if self.byrow and dim==0:
         buf = torch.zeros_like(new_chunk[-1, :])
         for i in range(self.size-1):
             if self.rank == i: 
                 synchronize()
                 dist.send(new_chunk[-1,:], i+1)
             elif self.rank == i + 1:
                 synchronize()
                 dist.recv(buf, i)
                 new_chunk += buf
             dist.barrier()
     elif not self.byrow and dim==1:
         buf = torch.zeros_like(new_chunk[:, -1])
         for i in range(self.size-1):
             if self.rank==i:
                 synchronize()
                 dist.send(new_chunk[:, -1], i+1)
             elif self.rank == i+1:
                 synchronize()
                 dist.recv(buf, i)
                 new_chunk += buf
             dist.barrier()
     return THDistMat(self.shape, self.sizes, new_chunk, self.byrow)
Example #14
def worker():
    """ Initialize the distributed environment. """

    import torch
    import torch.distributed as dist
    from torch.multiprocessing import Process

    print("Initializing distributed pytorch")
    os.environ['MASTER_ADDR'] = str(args.master_addr)
    os.environ['MASTER_PORT'] = str(args.master_port)
    dist.init_process_group(args.backend, rank=args.rank, world_size=args.size)

    for i in range(100):
        tensor = torch.ones(args.data_size_mb * 250 * 1000) * (args.rank + 1)
        # print('before: rank ', args.rank, ' has data ', tensor[0])

        start_time = time.perf_counter()
        if args.rank == 0:
            dist.send(tensor=tensor, dst=1)
        else:
            dist.recv(tensor=tensor, src=0)

        elapsed_time = time.perf_counter() - start_time
        # print('after: rank ', args.rank, ' has data ', tensor[0])
        rate = args.data_size_mb / elapsed_time

        print("Process %d transferred %d MB in %.1f ms (%.1f MB/sec)" %
              (args.rank, args.data_size_mb, elapsed_time * 1000, rate))
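The worker above reads its configuration from a module-level `args` object; a plausible argparse block (attribute names taken from the code, defaults are assumptions) could be:

import argparse

parser = argparse.ArgumentParser(description="point-to-point bandwidth test")
parser.add_argument('--master_addr', default='127.0.0.1')    # rendezvous host
parser.add_argument('--master_port', default=29500, type=int)
parser.add_argument('--rank', default=0, type=int)            # 0 sends, 1 receives
parser.add_argument('--size', default=2, type=int)            # world size
parser.add_argument('--backend', default='gloo')              # or 'nccl'
parser.add_argument('--data_size_mb', default=100, type=int)  # payload per iteration
args = parser.parse_args()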
Example #15
def transfer(tag, send_buf, shape):

    if shape is None:
        left, right = get_left_right(tag)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None
    elif not torch.is_tensor(send_buf):
        left, right = get_left_right(tag)
        try:
            recv_buf = torch.zeros(shape)  # , dtype=torch.int8
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error..")
            return None
        return recv_buf

    else:
        left, right = get_left_right(tag)
        send_opt = dist.isend(tensor=send_buf, dst=right)

        try:
            recv_buf = torch.zeros(shape)
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
Example #16
def eval(layer, logger, e, save_event, data_size, testloader):
    criterion = nn.CrossEntropyLoss()
    criterion.cuda()
    layer.eval()
    with torch.no_grad():
        if dist.get_rank() == 0:
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print('batch_idx: ' + str(batch_idx))
                inputs = inputs.cuda(0)
                outputs = layer(inputs)
                dist.send(tensor=outputs.cpu(), dst=1)
                print("send.....")

            e.wait()
        elif dist.get_rank() == 1:
            batch_idx = 0
            while data_size > batch_idx:
                print("batch_idx:" + str(batch_idx))
                rec_val = torch.zeros(
                    [100, 256, 4, 4])  # different models produce different shapes here
                dist.recv(tensor=rec_val, src=0)
                print("after recv....")
                outputs = layer(rec_val.cuda())
                dist.send(tensor=outputs.cpu(), dst=2)
                batch_idx += 1
                print("send...")

            e.wait()
        elif dist.get_rank() == 2:
            test_loss = 0
            correct = 0
            total = 0
            save_event.clear()
            global best_acc

            for batch_idx, (inputs, targets) in enumerate(testloader):
                rec_val = torch.zeros([100, 512, 2, 2])
                dist.recv(tensor=rec_val, src=1)
                outputs = layer(rec_val.cuda(0))
                targets = targets.cuda()
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                progress_bar(
                    batch_idx, data_size, 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                    (test_loss /
                     (batch_idx + 1), 100. * correct / total, correct, total))
                logger.error("eval:" + str(test_loss / (batch_idx + 1)))
                acc_str = "eacc: %.3f" % (100. * correct / total, )
                logger.error(acc_str)
            time.sleep(1)
            acc = 100. * correct / total
            if acc > best_acc:
                best_acc = acc
                save_event.set()
            time.sleep(1)
            e.set()
Example #17
def backward(layer, atom, outputs_queue, args):
    optimizer = optim.SGD(layer.parameters(),
                          lr=0.01,
                          momentum=0.9,
                          weight_decay=5e-4)
    optimizer.zero_grad()
    dist.init_process_group(backend='gloo',
                            init_method=args.path,
                            world_size=3,
                            rank=2)
    batch_idx = 0
    while True:
        try:
            grad = torch.zeros([args.batch_size, 128, 16, 16]).half()
            dist.recv(tensor=grad, src=1)
            #grad = grad_queue.get(block=True, timeout=1)
            #grad = torch.from_numpy(grad)
            #grad = dense(grad, [args.batch_size, 128, 16, 16]).cuda(0)
            grad = grad.cuda(0).float()
        except Empty as empty:
            print("backward empty.....")
            break
        loss = get_tensor(outputs_queue, atom, 1)
        loss.backward(grad)
        if batch_idx % 2 == 0:
            optimizer.step()
            optimizer.zero_grad()
        batch_idx += 1
Example #18
def receive(theModel, src, tag):
    weights = model.save(theModel)  # will be overwritten
    for i in range(len(weights)):
        dist.recv(tensor=weights[i], src=src, tag=tag * 100 + i)
    theModel = model.load(weights, theModel)
    print("Model received from", src)
    return theModel
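A matching sender for the receive helper above, assuming the same `model.save` helper and the same tag scheme (tag * 100 + i per weight tensor); the `send_model` name is hypothetical.

def send_model(theModel, dst, tag):
    weights = model.save(theModel)  # list of weight tensors, in a fixed order
    for i in range(len(weights)):
        dist.send(tensor=weights[i], dst=dst, tag=tag * 100 + i)
    print("Model sent to", dst)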
Example #19
    def backward_rank0(semaphore):
        batch_idx = 0
        ten_len = tensor_len(shapes[0])
        grad_recv = torch.zeros(ten_len + 2)
        #grad_recv = torch.zeros(shapes[0], dtype=torch.int8)
        #grad_recv = torch.HalfTensor(torch.Size(shapes[0]))
        dist.recv(tensor=grad_recv, src=1)
        while True:
            #semaphore.release()

            #grad_recv = dequantize(grad_recv.cuda().float())
            grad_recv = de_piecewise_quantize(grad_recv.cuda(), shapes[0])
            #grad_recv = unpack(grad_recv.cuda(), shapes[0])
            print(" backwardbatch_idx:" + str(batch_idx))
            # grad_recv = grad_recv.cuda()
            try:
                loss = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break

            loss.backward(grad_recv)
            if batch_idx % 2 == 0:
                # print("step: " + str(batch_idx))
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                print("eq...")
                break
            grad_recv = transfer2(4, None, ten_len + 2)  #shapes[0]
            print("backward send.....")
        print("backward end..")
Example #20
def receive_tensor_helper(dist, tensor, src_rank, group, tag, num_iterations,
                          intra_server_broadcast):
    for i in range(num_iterations):
        if intra_server_broadcast:
            dist.broadcast(tensor=tensor, group=group, src=src_rank)
        else:
            dist.recv(tensor=tensor, src=src_rank, tag=tag)
Example #21
    def backward_rank1():
        residual = None
        batch_idx = 0

        grad_recv1 = torch.zeros(shapes[1])
        dist.recv(tensor=grad_recv1, src=2)
        while True:
            print(" backward batch_idx:" + str(batch_idx))
            grad_recv1 = grad_recv1.cuda(1)
            try:
                inputs, outputs = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            if batch_idx % 3 == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1
            if data_size == batch_idx:
                transfer(5, inputs.grad.cpu(), None)
                print("backend In send..")
                break
            grad_recv1 = transfer(5, inputs.grad.cpu(), shapes[1])  #shapes[1]
            print("backward send.......")
        print("backard end....")
Example #22
 def _recv_key(rank: int,
               keylen: int,
               group: Optional["td.ProcessGroup"] = None) -> str:
     """Receive a string tensor key from a client node."""
     key_buffer = torch.zeros((keylen, ), dtype=torch.int8)
     td.recv(key_buffer, src=rank, group=group)
     return _tostring(key_buffer)
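The sending side of this key exchange shows up later in the swap() method via a `_fromstring` helper; a minimal client-side sketch under that assumption (the `_send_key` name is hypothetical):

def _send_key(key, dst, group=None):
    # encode the string as an int8 tensor (inverse of _tostring) and send it
    td.send(_fromstring(key), dst=dst, group=group)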
Example #23
def allreduce(send, recv):
    rank = dist.get_rank()
    size = dist.get_world_size()
    device = torch.device("cuda:" +
                          str(rank) if torch.cuda.is_available() else "cpu")
    # seed the send buffer and the accumulator with this rank's own data
    send_buff = send.clone().to(device)
    recv_buff = torch.zeros(send.size()).to(device)
    accum = send.clone().to(device)

    left = ((rank - 1) + size) % size
    right = (rank + 1) % size

    for i in range(size - 1):
        if i % 2 == 0:
            # Send send_buff
            send_req = dist.isend(send_buff, right)
            dist.recv(recv_buff, left)
            accum[:] += recv_buff[:]
        else:
            # Send recv_buff
            send_req = dist.isend(recv_buff, right)
            dist.recv(send_buff, left)
            accum[:] += send_buff[:]
        send_req.wait()
    recv[:] = accum[:]
Example #24
 def backward_rank1():
     residual = None
     batch_idx = 0
     ten_len = tensor_len(shapes[1])
     grad_recv1 = torch.zeros(ten_len + 2)
     dist.recv(tensor=grad_recv1, src=2)
     while True:
         print(" backward batch_idx:" + str(batch_idx))
         grad_recv1 = de_piecewise_quantize(grad_recv1.cuda(), shapes[1])
         try:
             inputs, outputs = outputs_queue.get(block=True, timeout=4)
         except Empty:
             print("empty........")
             break
         inputs.requires_grad_()
         outputs.backward(grad_recv1)
         inputs_grad, residual = piecewise_quantize(inputs.grad,
                                                    logger=logger,
                                                    residual=residual)
         if batch_idx % 3 == 0:
             optimizer.step()
             optimizer.zero_grad()
         batch_idx += 1
         if data_size == batch_idx:
             transfer2(5, inputs_grad, None)
             print("backend In send..")
             break
         grad_recv1 = transfer2(5, inputs_grad, ten_len + 2)  #shapes[1]
         print("backward send.......")
     print("backard end....")
Example #25
    def swap(
        self,
        key: str,
        src: torch.Tensor,
        dst: Optional[torch.Tensor] = None,
        accum: bool = False,
        overwrite: bool = False,
    ) -> None:
        """Store or accumulate a tensor on the server,
        and then get its current value.
        """
        if dst is None:
            dst = torch.zeros_like(src)

        # tic = time.time()
        cmd_rpc = torch.tensor([
            SWAP_CMD,
            len(key), -1 if accum else src.ndimension(),
            int(accum),
            int(overwrite), _tensor_type_idx[src.type()]
        ],
                               dtype=torch.long)
        td.send(cmd_rpc, self.server_rank)
        td.send(_fromstring(key), self.server_rank)
        if not accum:
            td.send(torch.tensor(list(src.size()), dtype=torch.long),
                    self.server_rank)
        td.send(src, self.server_rank)
        td.recv(dst, src=self.server_rank)
Example #26
    def backward_rank0(semaphore):
        batch_idx = 0
        shape_buf = torch.zeros([1], dtype=torch.short)
        dist.recv(tensor=shape_buf, src=1)
        grad_recv = torch.zeros(torch.Size(shape_buf.tolist()))
        dist.recv(tensor=grad_recv, src=1)
        while True:
            #semaphore.release()
            grad_recv = unpack(grad_recv.cuda(), shapes[0])
            print(" backwardbatch_idx:" + str(batch_idx))
            # grad_recv = grad_recv.cuda()
            try:
                loss = outputs_queue.get(block=True, timeout=4)
            except Empty:
                print("empty........")
                break

            loss.backward(grad_recv)
            if batch_idx % 2 == 0:
                # print("step: " + str(batch_idx))
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1

            if data_size == batch_idx:
                print("eq...")
                break
            grad_recv = transfer4backend0(4)

            print("backward send.....")
        print("backward end..")
Example #27
def allreduce(send, recv):
    """ Implementation of a ring-reduce. """
    rank = dist.get_rank()
    size = dist.get_world_size()
    # seed the send buffer and the accumulator with this rank's own data
    send_buff = send.clone()
    recv_buff = th.zeros(send.size())
    accum = send.clone()
    # th.cuda.synchronize()

    left = ((rank - 1) + size) % size
    right = (rank + 1) % size

    for i in range(size - 1):
        if i % 2 == 0:
            # Send send_buff
            send_req = dist.isend(send_buff, right)
            dist.recv(recv_buff, left)
            accum[:] += recv_buff[:]
        else:
            # Send recv_buff
            send_req = dist.isend(recv_buff, right)
            dist.recv(send_buff, left)
            accum[:] += send_buff[:]
        send_req.wait()
    # th.cuda.synchronize()
    recv[:] = accum[:]
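A quick sanity check for the ring-reduce above, assuming `th` is torch, `dist` is torch.distributed, and a process group is already initialized on every rank; it compares the ring result against the built-in all_reduce (the `check_ring_allreduce` name is made up for illustration).

def check_ring_allreduce():
    rank = dist.get_rank()
    # each rank contributes distinct data; every rank must end up with the same sum
    send = th.arange(8, dtype=th.float32) + rank
    recv = th.zeros_like(send)
    allreduce(send, recv)                            # ring version above

    expected = send.clone()
    dist.all_reduce(expected, op=dist.ReduceOp.SUM)  # reference result
    assert th.allclose(recv, expected)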
Example #28
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # send parameter request every N iterations
        if self.idx % self.param_groups[0]['tau'] == 0:
            self.idx = 1
            self.send_message(
                MessageCode.PullTilde,
                torch.randn(self.squash_model(self.model).numel()))
            # pull x tilde
            m_parameter = torch.zeros(
                self.squash_model(self.model).numel() + 7).to(torch.int16)
            dist.recv(tensor=m_parameter)

            # build alpha term

            m_parameter = m_parameter[2:]
            m_parameter = dequantize_tensor(m_parameter)
            current_index = 0  # keep track of where to read from parameter_update
            delta = copy.deepcopy(self.model)
            alpha = self.param_groups[0]['rho'] * self.param_groups[0]['lr']
            for parameter in delta.parameters():
                numel = parameter.data.numel()
                size = parameter.data.size()
                parameter.data.add_(
                    -1, m_parameter[current_index:current_index +
                                    numel].view(size))
                parameter.data.mul_(alpha)
                current_index += numel
            # delta = delta * self.param_groups[0]['rho'] * self.param_groups[0]['lr']

            # update x
            for cur_parameter, cur_delta in zip(self.model.parameters(),
                                                delta.parameters()):
                cur_parameter.data.add_(-1, cur_delta.data)

            # push delta to update x tilde
            self.send_message(MessageCode.UpdateTilde,
                              self.squash_model(delta))
        else:
            self.idx += 1

        # internal sgd update
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                p.data.add_(-group['lr'], d_p)

        return loss
Example #29
 def get_lr(self, group_name):
     cmd = torch.LongTensor([
         getattr(TORCH_PARAMETER_SERVER_CMDS,
                 f"GET_{group_name.upper()}_LR_CMD"), 0
     ])
     dist.send(cmd, dst=self.server_rank)
     dist.recv(self.lr_buffer, src=self.server_rank)
     return self.lr_buffer[0].item()
Example #30
 def sync_buffers(self):
     for p in self.model._all_buffers():
         p.data.zero_()
         recv_buff = torch.FloatTensor(p.data.size()).cuda()
         for w in self.workers:
             dist.recv(recv_buff, src=w)
             p.data.add_(recv_buff)
         p.data.div_(self.num_workers)
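sync_buffers above is the master-side half of a push-style average; a worker-side counterpart might simply send its buffers to rank 0 in the same order (a sketch, assuming the same legacy `_all_buffers()` API and that `dist` is torch.distributed; the `push_buffers` name is hypothetical).

def push_buffers(model, master_rank=0):
    # worker-side mirror of sync_buffers: send every buffer to the master,
    # which receives them in the same order and averages them
    for p in model._all_buffers():
        dist.send(p.data, dst=master_rank)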
Example #31
    def test_isend(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            requests = [
                dist.isend(_build_tensor(dest, 10), dest) for dest in range(1, world_size)
            ]
            for request in requests:
                request.wait()
                self.assertTrue(request.is_completed())
        else:
            tensor = _build_tensor(rank, -1)
            dist.recv(tensor, 0)
            self.assertEqual(tensor, _build_tensor(rank, 10))

        self._barrier()
Example #32
    def test_send_recv(self):
        rank = dist.get_rank()
        tensor = _build_tensor(rank + 1)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(src + 1, value=-1)
            expected_tensor = _build_tensor(src + 1)
            dist.recv(tensor, src)
            self.assertEqual(tensor, expected_tensor)

        self._barrier()
Example #33
    def test_send_recv_any_source(self):
        rank = dist.get_rank()
        tensor = _build_tensor(10, rank)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        recv_ranks = set()
        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(10, value=-1)
            dist.recv(tensor)
            recv_ranks.add(tensor.resize_(1)[0])

        self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
        self._barrier()
Example #34
 def recv(self, var):
     dist.recv(tensor=var, src=self.other)
     return var
Example #35
    print_header("send from 0 to 1")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.send(tensor, 1)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
elif rank == 1:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.recv(tensor, 0)
dist.barrier()

if rank == 0:
    print_header("reduce")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.reduce(tensor, 0)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: