    def _next_worker(self):
        # Receive the rank of the next waiting worker over the UDP socket and
        # acknowledge it by sending this process's rank tensor back to it.
        data, address = self._socket.recvfrom(10)
        rank = int(data.decode().split('.')[0])
        dist.isend(self.rank.data, rank)

        return rank
Example #2
def allreduce(send, recv):
    rank = dist.get_rank()
    size = dist.get_world_size()
    device = torch.device("cuda:" +
                          str(rank) if torch.cuda.is_available() else "cpu")
    send_buff = send.clone().to(device)
    recv_buff = torch.zeros(send.size()).to(device)
    accum = torch.zeros(send.size()).to(device)
    accum[:] = send[:]

    left = ((rank - 1) + size) % size
    right = (rank + 1) % size

    for i in range(size - 1):
        if i % 2 == 0:
            # Send send_buff, accumulate what arrives in recv_buff
            send_req = dist.isend(send_buff, right)
            dist.recv(recv_buff, left)
            accum[:] += recv_buff[:]
        else:
            # Send recv_buff, accumulate what arrives in send_buff
            send_req = dist.isend(recv_buff, right)
            dist.recv(send_buff, left)
            accum[:] += send_buff[:]
        send_req.wait()
    recv[:] = accum[:]
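
The ring all-reduce above assumes every rank has already joined a process group. Below is a minimal, illustrative launcher sketch (not from the original source), assuming the gloo backend, a CPU-only run (the allreduce above moves buffers to cuda:rank when CUDA is available, which gloo send/recv does not handle), and a hypothetical demo() helper that calls the allreduce defined above:

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def demo(rank, size):
    # Each rank contributes a vector filled with its own rank id.
    send = torch.full((4,), float(rank))
    recv = torch.zeros(4)
    allreduce(send, recv)   # the ring all-reduce defined above
    print(rank, recv)       # every rank ends up with sum(range(size))


def init_process(rank, size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=size)
    demo(rank, size)


if __name__ == "__main__":
    world = 2
    mp.spawn(init_process, args=(world,), nprocs=world)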
def main_func(numProcesses, group, queue, queue_exit):
    print('main func running')
    starting_time2 = time.time()
    exited_processes = []

    while (True):
        #time.sleep(1)

        recv_tensors = []
        start_time = time.time()
        while (True):
            if not queue.empty():
                rec_tens = queue.get()
                recv_tensors.append(rec_tens)

            if len(recv_tensors) > 1 or time.time() - start_time > 0.01:
                break

        #print('main received tensors',recv_tensors)

        for r in recv_tensors:
            #last element corresponds to rank it came from
            print('sending it back: ', r[:-1].clone() + 3)
            print('destination: ', int(r[-1].item()))
            dist.isend(tensor=r[:-1].clone() + 3, dst=int(r[-1].item()))

        if not queue_exit.empty():
            exited_processes.append(queue_exit.get())

        #print(len(exited_processes))
        if len(exited_processes) == numProcesses - 1:
            print('everyones exited')
            print(time.time() - starting_time2)
            exit()
Example #4
def allreduce(send, recv):
    """ Implementation of a ring-reduce. """
    rank = dist.get_rank()
    size = dist.get_world_size()
    send_buff = send.clone()
    recv_buff = th.zeros(send.size())
    accum = th.zeros(send.size())
    accum[:] = send[:]
    # th.cuda.synchronize()

    left = ((rank - 1) + size) % size
    right = (rank + 1) % size

    for i in range(size - 1):
        if i % 2 == 0:
            # Send send_buff, accumulate what arrives in recv_buff
            send_req = dist.isend(send_buff, right)
            dist.recv(recv_buff, left)
            accum[:] += recv_buff[:]
        else:
            # Send recv_buff, accumulate what arrives in send_buff
            send_req = dist.isend(recv_buff, right)
            dist.recv(send_buff, left)
            accum[:] += send_buff[:]
        send_req.wait()
    # th.cuda.synchronize()
    recv[:] = accum[:]
Example #5
def dist_sgd(model, rank):
    group = dist.new_group([0, 1, 3])
    for param in model.parameters():
        sending_right = copy.deepcopy(param.data)
        sending_left = copy.deepcopy(sending_right)
        recving_left_1 = copy.deepcopy(sending_right)
        recving_right_1 = copy.deepcopy(sending_right)
        size = dist.get_world_size()
        left = ((rank - 1) + size) % size
        right = (rank + 1) % size
        if rank % 2 == 0:
            req = dist.isend(sending_right, dst=right)
            req.wait()
            req = dist.irecv(recving_left_1, src=left)
            req.wait()
        else:
            req = dist.irecv(recving_left_1, src=left)
            req.wait()
            req = dist.isend(sending_right, dst=right)
            req.wait()
        dist.barrier()
        if rank % 2 == 0:
            req = dist.isend(sending_left, dst=left)
            req.wait()
            req = dist.irecv(recving_right_1, src=right)
            req.wait()
        else:
            req = dist.irecv(recving_right_1, src=right)
            req.wait()
            req = dist.isend(sending_left, dst=left)
            req.wait()
        param.data = (sending_left + recving_left_1 + recving_right_1) / 3
Example #6
def run(i):
    global rank
    rank = i
    dist.init_process_group(backend="gloo",
                            init_method="env://",
                            timeout=timedelta(seconds=2.5),
                            world_size=world_size,
                            rank=rank)

    if i == 0:
        time.sleep(2000)
        print("Process 0 exit.")
        exit(-1)
    t = time.time()
    for j in range(messages):
        for r in range(world_size):
            print("round {}: {}, {} begin".format(j, rank, r))
            try:
                dist.isend(storage, r)
                #dist.recv(storage1)
            except RuntimeError:
                print("round {}: {}, {} error".format(j, rank, r))
            else:
                print("round {}: {}, {}, {:.5f}".format(j, rank, r, time.time() - t))
    print(time.time() - t)
    dist.destroy_process_group()
Example #7
def transfer(tag, send_buf, shape):

    if shape is None:
        left, right = get_left_right(tag)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None
    elif not torch.is_tensor(send_buf):
        left, right = get_left_right(tag)
        try:
            recv_buf = torch.zeros(shape)  # , dtype=torch.int8
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error..")
            return None
        return recv_buf

    else:
        left, right = get_left_right(tag)
        send_opt = dist.isend(tensor=send_buf, dst=right)

        try:
            recv_buf = torch.zeros(shape)
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
Example #8
def allreduce(r, world, peers, tensor):
    r = dist.get_rank()
    world = dist.get_world_size()
    peers = list(filter(lambda i: i != r, list(range(world))))
    sizeOfTensor = list(tensor.size())[0]
    chunksize = sizeOfTensor // world
    reqs = [
        dist.isend(tensor=tensor[i * chunksize:(i + 1) * chunksize], dst=i)
        for i in peers
    ]  # K concurrent transfers
    recv = torch.zeros(sizeOfTensor // (world))
    for i in peers:  # K steps
        dist.recv(tensor=recv, src=i)  # K / ??? values...
        tensor[r * chunksize:(r + 1) * chunksize] += recv[:]
    for req in reqs:
        req.wait()
    # we have to set to zero the values that we are not responsible (they will be included on their way back)
    reqs = [
        dist.isend(tensor=tensor[r * chunksize:(r + 1) * chunksize], dst=i)
        for i in peers
    ]
    for i in peers:
        dist.recv(tensor=recv, src=i)
        tensor[i * chunksize:(i + 1) * chunksize] = recv
    for req in reqs:
        req.wait()
Example #9
def transfer4backend1(tag, send_buf, flag=False):

    if not flag:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1),
                  dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        send_opt.wait()
        return None

    else:
        left, right = get_left_right(tag)
        dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1),
                  dst=right)
        send_opt = dist.isend(tensor=send_buf, dst=right)
        try:
            shape_buf = torch.zeros([1], dtype=torch.short)
            dist.recv(tensor=shape_buf, src=left)
            recv_buf = torch.zeros(torch.Size(shape_buf.tolist()))
            dist.recv(tensor=recv_buf, src=left)
        except RuntimeError as error:
            print("runtime error")
            return None
        send_opt.wait()
        return recv_buf
    def _commit(self):
        # Initiate parameter sharing with the master.
        self._enqueue_master()
        dist.recv(self.rank, src=0)
        # Send the update to the master.
        for group in self.param_groups:
            for p in group["params"]:
                dist.isend(p.grad.data, dst=0)
Example #11
def send_message(message_code, payload, dst=0):
    """Sends a message to a destination
    Concatenates both the message code and destination with the payload into a single tensor and then sends that as a tensor
    """
    _LOGGER.info("SENDING MESSAGE: {} RANK: {}".format(message_code, dist.get_rank()))
    m_parameter = torch.Tensor([dist.get_rank(), message_code.value])
    m_parameter = torch.cat((m_parameter, payload))
    dist.isend(tensor=m_parameter, dst=dst)
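
send_message packs the sender's rank, the message code, and the payload into a single tensor, so the receiving side has to strip that two-value header off again. A hypothetical counterpart (not part of the original snippet), assuming the payload length is known to the receiver:

def receive_message(payload_size, src=None):
    # Two header values (sender rank, message code) followed by the payload.
    m_parameter = torch.zeros(payload_size + 2)
    dist.recv(tensor=m_parameter, src=src)
    sender_rank = int(m_parameter[0].item())
    message_code = int(m_parameter[1].item())
    payload = m_parameter[2:]
    return sender_rank, message_code, payload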
Example #12
    def test_send_recv_obj(self):
        TEST_OBJECTS = [
            {
                "a": 1,
                "b": 2,
                "c": 3
            },
            torch.tensor(1),
            torch.nn.Linear(10, 5),
            CNN(),
        ]
        for param in TEST_OBJECTS[2].parameters():
            param.data.fill_(1.0)
        for param in TEST_OBJECTS[3].parameters():
            param.data.fill_(1.0)
        serial.register_safe_class(CNN)

        for reference in TEST_OBJECTS:
            for src in range(self.world_size):
                if self.rank == src:
                    test_obj = reference
                    comm.get().send_obj(test_obj, 1 - self.rank)
                else:
                    test_obj = comm.get().recv_obj(1 - self.rank)

                if isinstance(reference, torch.nn.Module):
                    test_obj_params = list(test_obj.parameters())
                    reference_params = list(reference.parameters())
                    for i, param in enumerate(reference_params):
                        self.assertTrue(test_obj_params[i].eq(param).all(),
                                        "broadcast_obj failed")
                else:
                    self.assertEqual(test_obj, reference,
                                     "broadcast_obj failed")

        # Test that the restricted loader will raise an error for code injection
        for invalid_obj in INVALID_SERIALIZED_OBJECTS:
            for src in range(self.world_size):
                if self.rank == src:
                    # Mimic send_obj without pickling invalid bytestream
                    size = torch.tensor(len(invalid_obj), dtype=torch.int32)
                    arr = torch.from_numpy(
                        numpy.frombuffer(invalid_obj, dtype=numpy.int8))

                    r0 = dist.isend(size,
                                    dst=(1 - self.rank),
                                    group=comm.get().main_group)
                    r1 = dist.isend(arr,
                                    dst=(1 - self.rank),
                                    group=comm.get().main_group)

                    r0.wait()
                    r1.wait()
                else:
                    with self.assertRaises(ValueError):
                        comm.get().recv_obj(1 - self.rank)
def eval(layer, logger, args, targets_queue, e, save_event, data_size,
         testloader):
    dist.init_process_group(backend='tcp',
                            init_method=args.path,
                            world_size=args.size,
                            rank=args.rank)
    criterion = nn.CrossEntropyLoss()
    criterion.cuda()
    layer.eval()

    with torch.no_grad():
        if dist.get_rank() == 0:
            for batch_idx, (inputs, targets) in enumerate(testloader):
                inputs, targets = inputs.cuda(0), targets
                outputs = layer(inputs)
                targets_queue.put(targets.numpy())
                print(outputs.size())
                send_opt = dist.isend(tensor=outputs.cpu(), dst=1)
                send_opt.wait()
            send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
            send_opt.wait()
            e.wait()
        elif dist.get_rank() == 1:
            batch_idx = 0
            test_loss = 0
            correct = 0
            total = 0
            save_event.clear()
            global best_acc
            while True:
                try:
                    rec_val = torch.zeros([100, 128, 16, 16])
                    dist.recv(tensor=rec_val, src=0)
                except RuntimeError as error:
                    print("done....")
                    acc = 100. * correct / total
                    if acc > best_acc:
                        best_acc = acc
                        save_event.set()
                    e.set()
                    break
                outputs = layer(rec_val.cuda(1))
                targets = targets_queue.get(block=True, timeout=2)
                targets = torch.from_numpy(targets).cuda(1)
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                progress_bar(
                    batch_idx, data_size, 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                    (test_loss /
                     (batch_idx + 1), 100. * correct / total, correct, total))
                if batch_idx % 10 == 0:
                    logger.error("eval:" + str(test_loss / (batch_idx + 1)))
                batch_idx += 1
Example #14
def send_obj(obj, dst, group):
    buf = pickle.dumps(obj)
    size = torch.tensor(len(buf), dtype=torch.int32)
    arr = torch.from_numpy(numpy.frombuffer(buf, dtype=numpy.int8))

    r0 = dist.isend(size, dst=dst, group=group)
    r1 = dist.isend(arr, dst=dst, group=group)

    r0.wait()
    r1.wait()
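
send_obj transmits the pickled object as two tensors, an int32 size header followed by the int8 byte buffer, so a receiver has to mirror both transfers. A hypothetical recv_obj sketch (unpickling bytes from an untrusted peer is unsafe, as the restricted-loader test in Example #12 illustrates):

def recv_obj(src, group):
    # First the int32 size header, then the pickled byte buffer.
    size = torch.tensor(0, dtype=torch.int32)
    dist.recv(size, src=src, group=group)

    buf = torch.empty(int(size.item()), dtype=torch.int8)
    dist.recv(buf, src=src, group=group)

    return pickle.loads(buf.numpy().tobytes())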
Example #15
def broadcast(data, rank, world_size, recv_buff_l, recv_buff_r):
    left = ((rank - 1) + world_size) % world_size
    right = (rank + 1) % world_size
    send_req_l = dist.isend(data, dst=left)
    recv_req_r = dist.irecv(recv_buff_r, src=right)
    recv_req_r.wait()
    send_req_l.wait()
    send_req_r = dist.isend(data, dst=right)
    recv_req_l = dist.irecv(recv_buff_l, src=left)
    recv_req_l.wait()
    send_req_r.wait()
Example #16
def run(qs, layer):

    if dist.get_rank() == 0:
        input_x = torch.ones(20, requires_grad=True)
        output_x = layer(input_x)
        input_x.share_memory_()
        qs[0].put(input_x)
        send_opt = dist.isend(tensor=output_x, dst=1)
        send_opt.wait()
        time.sleep(20)
    elif dist.get_rank() == 1:

        rec_val = torch.zeros(10, requires_grad=True)
        dist.recv(tensor=rec_val, src=0)
        rec_val.share_memory_()
        qs[1].put(rec_val)
        send_opt = dist.isend(tensor=torch.ones(1), dst=2)
        send_opt.wait()
        time.sleep(20)

    elif dist.get_rank() == 2:
        rec_val = torch.randn(1)
        dist.recv(tensor=rec_val, src=1)

        input_x = qs[1].get()
        input_x.requires_grad_()

        output_x = layer(input_x)
        optimizer = optim.SGD(layer.parameters(), lr=0.01)
        optimizer.zero_grad()
        criterion = nn.MSELoss()
        target_v = torch.randn(1)
        loss = criterion(output_x, target_v)
        loss.backward()

        optimizer.step()

        send_opt = dist.isend(tensor=input_x.grad, dst=3)
        send_opt.wait()
        time.sleep(10)
    elif dist.get_rank() == 3:
        back_grad = torch.zeros(10, requires_grad=True)
        dist.recv(tensor=back_grad, src=2)

        input_x = qs[0].get()
        input_x.requires_grad_()

        output_x = layer(input_x)
        optimizer = optim.SGD(layer.parameters(), lr=0.01)
        optimizer.zero_grad()
        output_x.backward(back_grad)
        optimizer.step()

        time.sleep(10)
Example #17
def eval(layer, logger, args, targets_queue, e, save_event, data_size, testloader):
    criterion = nn.CrossEntropyLoss()
    criterion.cuda(1)
    layer.eval()

    with torch.no_grad():
        if dist.get_rank() == 0:
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print('batch_idx: ' + str(batch_idx))
                inputs, targets = inputs.cuda(0), targets
                outputs = layer(inputs)
                targets_queue.put(targets.numpy())
                send_opt = dist.isend(tensor=q_act(outputs, char=True).cpu(), dst=1)
                send_opt.wait()
            send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
            send_opt.wait()
            e.wait()
        elif dist.get_rank() == 1:
            batch_idx = 0
            test_loss = 0
            correct = 0
            total = 0
            save_event.clear()
            global best_acc
            while True:
                try:
                    rec_val = torch.zeros([100, 256, 4, 4], dtype=torch.int8)
                    dist.recv(tensor=rec_val, src=0)
                except RuntimeError as error:
                    print("done....")
                    acc = 100. * correct / total
                    if acc > best_acc:
                        best_acc = acc
                        save_event.set()
                    e.set()
                    break
                rec_val = dq_act(rec_val)
                outputs = layer(rec_val.cuda(1))
                targets = targets_queue.get(block=True, timeout=2)
                targets = torch.from_numpy(targets).cuda(1)
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                progress_bar(batch_idx, data_size, 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                             % (test_loss / (batch_idx + 1), 100. * correct / total, correct, total))
                #if batch_idx % 10 == 0:
                logger.error("eval:" + str(test_loss / (batch_idx + 1)))
                batch_idx += 1
            acc_str = "eacc: %.3f" % (100. * correct / total,)
            logger.error(acc_str)
    def send_obj(self, obj, dst, group=None):
        if group is None:
            group = self.main_group

        buf = pickle.dumps(obj)
        size = torch.tensor(len(buf), dtype=torch.int32)
        arr = torch.from_numpy(numpy.frombuffer(buf, dtype=numpy.int8))

        r0 = dist.isend(size, dst=dst, group=group)
        r1 = dist.isend(arr, dst=dst, group=group)

        r0.wait()
        r1.wait()
Example #19
def timed_pt2pt(input, args):
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    sync_all()
    # Warmups, establish connections, etc.
    for i in range(args.warmups):
        if dist.get_rank() == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        if dist.get_rank() == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)
    sync_all()

    # time the actual comm op trials times and average it
    pre = time.perf_counter()
    for i in range(args.trials):
        if dist.get_rank() == 0:
            if args.async_op:
                dist.isend(input, 1)
            else:
                dist.send(input, 1)
        if dist.get_rank() == 1:
            if args.async_op:
                dist.irecv(input, src=0)
            else:
                dist.recv(input, src=0)

    sync_all()
    duration = time.perf_counter() - pre

    # maintain and clean performance data
    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    n = dist.get_world_size()
    tput, busbw = get_bw('pt2pt', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'

    if not args.raw:
        size = convert_size(size)

    print_rank_0(
        f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
Example #20
    def send_obj(self, obj, dst, group=None):
        """Sends the specified object to the destination `dst`."""
        if group is None:
            group = self.main_group

        buf = pickle.dumps(obj)
        size = torch.tensor(len(buf), dtype=torch.int32)
        arr = torch.from_numpy(numpy.frombuffer(buf, dtype=numpy.int8))

        r0 = dist.isend(size, dst=dst, group=group)
        r1 = dist.isend(arr, dst=dst, group=group)

        r0.wait()
        r1.wait()
Example #21
def ms_allreduce(r,
                 world,
                 peers,
                 tensor,
                 quantize,
                 unquantize,
                 numberOfThreads=1):
    r = dist.get_rank()
    arraySize = list(tensor.size())[0]
    acc = torch.zeros(arraySize)
    world = dist.get_world_size()
    chunksize = arraySize // world
    assert chunksize % dataSz == 0
    acc[r * chunksize:(r + 1) * chunksize] = tensor[r * chunksize:(r + 1) *
                                                    chunksize]
    reqs = []
    #"Naive all-reduce"
    #i = 0
    #print('actual: {} vs. expected: {}'.format(torch.zeros(int(arraySize / (chunksize * dataSz))).size(), quantize(tensor[i*chunksize:(i+1)*chunksize]).size()))
    for i in range(world):  # K steps
        if i != r:
            chunk = tensor[i * chunksize:(i + 1) * chunksize]
            qchunk = quantize(chunk, numberOfThreads)
            reqs += [dist.isend(tensor=qchunk,
                                dst=i)]  # K concurrent transfers

    recv = torch.zeros(arraySize // (dataSz * world), dtype=torch.int32)
    for i in range(world):  # K steps
        if i != r:
            dist.recv(tensor=recv, src=i)  # K / ??? values...
            chunk = unquantize(recv, numberOfThreads)
            acc[r * chunksize:(r + 1) * chunksize] += chunk
    for req in reqs:
        req.wait()
    reqs = []
    #"Naive all-gather"
    for i in range(world):
        if i != r:
            chunk = acc[r * chunksize:(r + 1) * chunksize]
            qchunk = quantize(chunk, numberOfThreads)
            reqs += [dist.isend(tensor=qchunk, dst=i)]
    #"Naive all-gather"
    for i in range(world):
        if i != r:
            dist.recv(tensor=recv, src=i)
            chunk = unquantize(recv, numberOfThreads)
            acc[i * chunksize:(i + 1) * chunksize] += chunk
    for req in reqs:
        req.wait()
    tensor[:] = acc[:]
Example #22
    def _agg(self, data, op):
        """Aggregate data using `op` operation.

        Args:
            data (:obj:`torch.Tensor`): A Tensor to be aggregated.
            op (str): Aggregation methods like `avg`, `sum`, `min`, `max`, etc.

        Returns:
            :obj:`torch.Tensor`: An aggregated tensor.
        """
        # Create some tensors to host the values from neighborhood.
        local_data = {i: torch.zeros_like(data) for i in self.neighbors}
        local_data[self.rank] = data

        reqs = []
        for node in self.neighbors:
            reqs.append(dist.isend(tensor=local_data[self.rank], dst=node))
            reqs.append(dist.irecv(tensor=local_data[node], src=node))

        for req in reqs:
            req.wait()

        # Aggregate local_data
        if op == "avg":
            output = sum(local_data.values()) / (len(self.neighbors) + 1)
        else:
            raise NotImplementedError("op {} is not supported yet.".format(op))

        return output
Example #23
def alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list):
    """Each process scatters list of input tensors to all processes in a cluster
    and return gathered list of tensors in output list.

    Parameters
    ----------
    rank : int
        The rank of current worker
    world_size : int
        The size of the entire
    output_tensor_list : List of tensor
        The received tensors
    input_tensor_list : List of tensor
        The tensors to exchange
    """
    # send tensor to each target trainer using torch.distributed.isend
    # isend is async
    senders = []
    for i in range(world_size):
        if i == rank:
            output_tensor_list[i] = input_tensor_list[i].to(th.device('cpu'))
        else:
            sender = dist.isend(input_tensor_list[i].to(th.device('cpu')),
                                dst=i)
            senders.append(sender)

    for i in range(world_size):
        if i != rank:
            dist.recv(output_tensor_list[i], src=i)

    th.distributed.barrier()
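
A usage sketch for alltoallv_cpu (illustrative shapes only, assuming the same th alias for torch used above): every rank sends one 4-element chunk to each peer and collects one chunk back from each of them. Because dist.recv needs pre-sized buffers, the receive shapes must be agreed on beforehand.

rank = dist.get_rank()
world_size = dist.get_world_size()
input_tensor_list = [th.full((4,), float(rank)) for _ in range(world_size)]
output_tensor_list = [th.zeros(4) for _ in range(world_size)]
alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list)
# output_tensor_list[i] now holds the chunk contributed by rank i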
Example #24
    def backward_rank1(semaphore, start_event, start_event2):

        start_event.wait()

        batch_idx = 0
        while True:
            try:
                #semaphore.release()
                print("before grad recv...")
                grad_recv1 = torch.zeros([args.batch_size, 512, 2, 2], dtype=torch.int8)
                dist.recv(tensor=grad_recv1, src=2)
                print("after grad recv.....")
            except RuntimeError as error:
                print("backward runtime error")
                send_opt = dist.isend(tensor=torch.zeros(0), dst=0)
                send_opt.wait()
                break
            grad_recv1 = dequantize(grad_recv1.cuda(0).float())
            inputs, outputs = outputs_queue.get(block=False)
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                optimizer.zero_grad()

            inputs_grad = quantize(inputs.grad, char=True).cpu()
            print(inputs_grad.size())
            if batch_idx == 0:
                start_event2.set()
            #send_opt = dist.isend(tensor=inputs_grad, dst=0)
            #send_opt.wait()
            dist.send(tensor=inputs_grad, dst=0)
            batch_idx += 1
def train_model(model, train_loader, optimizer, criterion, epoch, rank):
    """
    model (torch.nn.module): The model created to train
    train_loader (pytorch data loader): Training data loader
    optimizer (optimizer.*): An instance of some sort of optimizer, usually SGD
    criterion (nn.CrossEntropyLoss): Loss function used to train the network
    epoch (int): Current epoch number
    rank (int): Rank of the current process
    """

    # remember to exit the train loop at end of the epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # Your code goes here!
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        train_loss = criterion(output, target)
        train_loss.backward()
        for p in model.parameters():
            req = dist.isend(tensor=p.grad, dst=0)
            req.wait()
            req = dist.irecv(tensor=p.grad, src=0)
            req.wait()
        optimizer.step()
        if batch_idx % 20 == 0:
            print(batch_idx, "loss: ", train_loss.item())
            now = datetime.now()
        if batch_idx == 10:
            later = datetime.now()
            print("average time: ", (later - now).total_seconds()/9)
Example #26
def print_loss_file(file_name, loss_iteration, rank, size):
    if rank != 0:
        data = torch.tensor(loss_iteration)
        dist.barrier()
        req = dist.isend(data, dst=0)
        req.wait()
    else:
        loss_list_tensor = []
        loss_iter = []
        data = torch.tensor(loss_iteration)
        for i in range(size):
            data = copy.deepcopy(data)
            loss_list_tensor.append(data)
        dist.barrier()
        for i in range(size - 1):
            req = dist.irecv(loss_list_tensor[i + 1], src=i + 1)
            req.wait()
        for j in range(len(loss_list_tensor[0])):
            element = 0
            for i in range(size):
                element += loss_list_tensor[i][j].item()
            loss_iter.append(element / size)
    if rank == 0:
        file_object = open(file_name, 'w')
        for loss in loss_iter:
            file_object.write(str(loss))
            file_object.write('\t')
        file_object.close()
Example #27
    def send_torch(self, torch_tensor: torch.Tensor, name: str):
        tag = self.get_dist_tag(name)
        send_tensor = torch_tensor.cpu()
        self.all_wait[tag] = dist.isend(tensor=send_tensor,
                                        dst=self.dst,
                                        tag=tag)

        self.traffic_record.send_byte(get_num_byte(send_tensor))
    def _master_procedure(self):
        # Receive the rank of the next worker in the queue.
        worker_rank = self._next_worker()
        # Receive the update and store it in the parameter buffer.
        requests = []
        for group in self.param_groups:
            for p in group["params"]:
                parameter_buffer = self.state[p]["buffer"]
                request = dist.irecv(parameter_buffer.data, src=worker_rank)
                requests.append(request)
        # Update central variable.
        for group in self.param_groups:
            for index, p in enumerate(group["params"]):
                parameter_buffer = self.state[p]["buffer"]
                requests[index].wait()
                p.data.add_(parameter_buffer.data)
                dist.isend(p.data, dst=worker_rank)
Example #29
    def store(self,
              key: str,
              src: torch.Tensor,
              accum: bool = False,
              overwrite: bool = True) -> None:
        """Store or accumulate a tensor on the server."""
        self._validate_store(key, src, accum=accum, overwrite=overwrite)
        cmd_rpc = torch.tensor(
            [
                STORE_CMD,
                len(key),
                -1 if accum else src.ndimension(),
                int(accum),
                int(overwrite),
                _dtypes.index(src.dtype),
            ],
            dtype=torch.long,
        )
        metadata_pg = self._metadata_pg()
        td.send(cmd_rpc, self.server_rank, group=metadata_pg)
        td.send(_fromstring(key), self.server_rank, group=metadata_pg)
        if not accum:
            td.send(
                torch.tensor(list(src.size()), dtype=torch.long),
                self.server_rank,
                group=metadata_pg,
            )
        start_t = time.monotonic()
        data_pgs = self._data_pgs()
        if data_pgs is None:
            td.send(src, dst=self.server_rank)
        else:
            outstanding_work = []
            flattened_src = src.flatten()
            flattened_size = flattened_src.shape[0]
            for idx, (pg, slice_) in enumerate(
                    zip(
                        data_pgs,
                        split_almost_equally(flattened_size,
                                             num_parts=len(data_pgs)),
                    )):
                outstanding_work.append(
                    td.isend(
                        tensor=flattened_src[slice_],
                        dst=self.server_rank,
                        group=pg,
                        tag=idx,
                    ))
            for w in outstanding_work:
                w.wait()
        end_t = time.monotonic()
        if self.log_stats:
            stats_size = src.numel() * src.element_size()
            stats_time = end_t - start_t
            logger.debug(f"Sent tensor {key} to server {self.server_rank}: "
                         f"{stats_size:,} bytes "
                         f"in {stats_time:,g} seconds "
                         f"=> {stats_size / stats_time:,.0f} B/s")
Example #30
def message_pass(rank, ws):
    if rank == 0:
        my_tensors = [torch.zeros(ws, requires_grad=True) for _ in range(ws)]

        # Can't pass messages to self, so do any work within proc
        # (in-place writes to a leaf tensor that requires grad must be wrapped
        # in no_grad to avoid a RuntimeError)
        with torch.no_grad():
            my_tensors[0][0] = 1

        # Wait for messages from other procs
        msgs = [dist.irecv(my_tensors[i], src=i) for i in range(1, ws)]
        [msg.wait() for msg in msgs]

        print(my_tensors)
    else:
        # Send the master a one-hot tensor encoding this rank
        t = torch.zeros(ws, requires_grad=True)
        with torch.no_grad():
            t[rank] = 1
        t = torch.sigmoid(t)
        req = dist.isend(t, 0)
        req.wait()
Example #31
    def test_isend(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            requests = [
                dist.isend(_build_tensor(dest, 10), dest) for dest in range(1, world_size)
            ]
            for request in requests:
                request.wait()
                self.assertTrue(request.is_completed())
        else:
            tensor = _build_tensor(rank, -1)
            dist.recv(tensor, 0)
            self.assertEqual(tensor, _build_tensor(rank, 10))

        self._barrier()