Example #1
def broadcast_parameters(params, root_rank):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the ``model.state_dict()``,
    ``model.named_parameters()``, or ``model.parameters()``.

    Arguments:
        params: One of the following:
            - list of parameters to broadcast
            - dict of parameters to broadcast
        root_rank: The rank of the process from which parameters will be
                   broadcasted to all other processes.
    """
    if isinstance(params, dict):
        params = sorted(params.items())
    elif isinstance(params, list):
        # support both named_parameters() and regular parameters()
        params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError("invalid params of type: %s" % type(params))

    # Run asynchronous broadcasts.
    handles = []
    for name, p in params:
        handle = bf.broadcast_nonblocking_(p, root_rank, name)
        handles.append(handle)

    # Wait for completion.
    for handle in handles:
        bf.synchronize(handle)
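A minimal usage sketch for this helper, assuming ``bluefog.torch`` is imported as ``bf`` and initialized; the model and its layer sizes below are made up for illustration:

import torch
import bluefog.torch as bf

bf.init()
model = torch.nn.Linear(16, 4)  # same architecture constructed on every rank
# Make rank 0's initial weights the starting point for every process.
broadcast_parameters(model.state_dict(), root_rank=0)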
Example #2
def allreduce_parameters(params):
    """
    Allreduce the parameters across all processes, i.e., force every
    process to hold the same averaged model.
    Typical usage is to allreduce the ``model.named_parameters()``,
    or ``model.parameters()``.

    Arguments:
        params: One of the following:
            - list of parameters to allreduce
            - dict of parameters to allreduce
    """
    if isinstance(params, dict):
        params = sorted(params.items())
    elif isinstance(params, list):
        # support both named_parameters() and regular parameters()
        params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError("invalid params of type: %s" % type(params))

    # Run asynchronous allreduces.
    handles = []
    for name, p in params:
        handle = bf.allreduce_nonblocking_(p, average=True, name=name)
        handles.append(handle)

    # Wait for completion.
    for handle in handles:
        bf.synchronize(handle)
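A companion usage sketch (same assumptions as in the sketch after Example #1): after some purely local training steps, averaging the parameters pulls every rank back to a common model. Note that ``model.named_parameters()`` returns a generator, so it is materialized into a list here:

# ... each rank runs some local training steps on its own data ...
allreduce_parameters(list(model.named_parameters()))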
Example #3
def gradient_tracking(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):

    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    n = X.shape[1]  # feature dimension, inferred from X so w matches X @ w
    w = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    loss_step(X,
              y,
              w,
              tensor_name='neighbor.allreduce.Grad.Tracking.w',
              loss=loss,
              rho=rho)
    q = w.grad.data.clone()  # q^0 = grad(w^0)
    w.grad.data.zero_()

    grad_prev = q.clone()
    mse = []
    for _ in range(maxite):

        # Algorithm:
        # w^{k+1} = neighbor_allreduce(w^k) - alpha*q^k
        # q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k)

        # Notice the communication of neighbor_allreduce can overlap with gradient computation.
        w_handle = bf.neighbor_allreduce_nonblocking(w.data,
                                                     name='Grad.Tracking.w')
        q_handle = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')
        w.data = bf.synchronize(w_handle) - alpha * q
        # calculate local gradient
        loss_step(X,
                  y,
                  w,
                  tensor_name='neighbor.allreduce.Grad.Tracking.w',
                  loss=loss,
                  rho=rho)
        grad = w.grad.data.clone()
        q = bf.synchronize(q_handle) + grad - grad_prev
        grad_prev = grad
        w.grad.data.zero_()

        # record convergence
        if bf.rank() == 0:
            mse.append(torch.norm(w.data - w_opt.data, p=2))

    return w, mse
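For contrast with the overlapped updates inside the loop above, a blocking sketch of the same two gradient-tracking recursions, assuming the same surrounding variables (``w``, ``q``, ``grad_prev``, ``alpha``, ``loss_step``) and that ``bf.neighbor_allreduce`` is the blocking counterpart of the nonblocking call, would serialize communication and gradient computation:

# Blocking variant: each neighbor_allreduce completes before the next line runs,
# so communication no longer overlaps with the local gradient computation.
w.data = bf.neighbor_allreduce(w.data, name='Grad.Tracking.w') - alpha * q
loss_step(X, y, w, tensor_name='neighbor.allreduce.Grad.Tracking.w',
          loss=loss, rho=rho)
grad = w.grad.data.clone()
q = bf.neighbor_allreduce(q, name='Grad.Tracking.q') + grad - grad_prev
grad_prev = grad
w.grad.data.zero_()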
Example #4
    def synchronize(self):
        with torch.no_grad():
            for p, handle in self._handles.items():
                if handle is not None:
                    output = bf.synchronize(handle)
                    p.set_(output)
                self._reduce_delay[p] = self._num_steps_per_communication
        self._handles.clear()

        self._synchronized = True
Example #5
def benchmark_step():

    global w, q, grad_prev, alpha

    if args.computation_mode == "normal":
        w_handle = bf.neighbor_allreduce_nonblocking(w.data,
                                                     name='Grad.Tracking.w')
        w.data = -alpha * q + bf.synchronize(w_handle)
        q_handle = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')

        # calculate local gradient
        logistic_loss_step(w,
                           rho,
                           X,
                           y,
                           tensor_name='neighbor.allreduce.Grad.Tracking.w',
                           calculate_by_hand=args.no_autograd)
        grad = w.grad.data.clone()
        q = bf.synchronize(q_handle) + grad - grad_prev
        grad_prev = grad
        w.grad.data.zero_()

    elif args.computation_mode == "compute_and_no_communicate":
        w.data = -alpha * q
        # calculate local gradient
        logistic_loss_step(w,
                           rho,
                           X,
                           y,
                           tensor_name='neighbor.allreduce.Grad.Tracking.w',
                           calculate_by_hand=args.no_autograd)
        grad = w.grad.data.clone()
        q = grad - grad_prev
        grad_prev = grad
        w.grad.data.zero_()

    elif args.computation_mode == "sleep_and_communicate":
        w_handle = bf.neighbor_allreduce_nonblocking(w.data,
                                                     name='Grad.Tracking.w')
        q_handle = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')
        w.data = bf.synchronize(w_handle)
        systemtime.sleep(args.sleep_time)
        q = bf.synchronize(q_handle)
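A hedged sketch of how such a step function is typically timed; the loop, ``num_iters``, and the printed format below are illustrative and not part of the original benchmark:

import time

num_iters = 100  # illustrative iteration count
start = time.time()
for _ in range(num_iters):
    benchmark_step()
elapsed = time.time() - start
if bf.rank() == 0:
    print('computation_mode=%s: %.3f ms per iteration'
          % (args.computation_mode, 1000.0 * elapsed / num_iters))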
Example #6
    def synchronize(self):
        missing_p = self._requires_update - set(self._handles.keys())
        for p in missing_p:
            handle = self._allreduce_grad_async(p)
            self._handles[p] = handle

        for p, handle in self._handles.items():
            if handle is None:
                handle = self._allreduce_grad_async(p)
                self._handles[p] = handle

        for p, handle in self._handles.items():
            output = bf.synchronize(handle)
            self._allreduce_delay[p] = self._backward_passes_per_step
            p.grad.set_(output)
        self._handles.clear()

        self._synchronized = True
Example #7
def test_hier_neighbor_allreduce_dynamic_move_dst_weight_fusion(
        hier_setup, dtype, dim):
    rank, size, local_rank, local_size = hier_setup
    machine_rank = (rank - local_rank) // local_size
    machine_size = size // local_size

    expected_value = (machine_rank + 1) % machine_size
    src_machine_weights = {(machine_rank + 1) % machine_size: 0.5}
    dst_machine_weights = {(machine_rank - 1) % machine_size: 2.0}

    K = 50  # number of tensors sent in a short time
    tensor_list, handles, names = [], [], []
    for i in range(K):
        tensor = torch.FloatTensor(
            *([23] * dim)).fill_(i + (rank -
                                      (local_size - 1) / 2.0) / local_size)
        tensor = cast_and_place(tensor, dtype)
        tensor_list.append(tensor)
        names.append("index{}_{}_{}".format(i, dtype, dim))

    for i in range(K):
        handle = bf.hierarchical_neighbor_allreduce_nonblocking(
            tensor_list[i],
            self_weight=0.0,
            src_machine_weights=src_machine_weights,
            dst_machine_weights=dst_machine_weights,
            name=names[i])
        handles.append(handle)

    outputs = []
    for i in range(K):
        output = bf.synchronize(handles[i])
        outputs.append(output)

    for i in range(K):
        assert (
            list(outputs[i].shape) == [23] * dim
        ), f"{names[i]} (hierarchical neighbor allreduce fusion) produces incorrect reduced shape"
        assert (
            (outputs[i] - expected_value - i).abs().max() < EPSILON
        ), f"{names[i]} (hierarchical neighbor allreduce fusion) produces incorrect reduced tensor"
Example #8
def test_hier_neighbor_allreduce_fusion(hier_setup, dtype, dim):
    rank, size, local_rank, local_size = hier_setup
    machine_rank = (rank - local_rank) // local_size
    machine_size = size // local_size

    neighbor_ranks = bf.in_neighbor_machine_ranks()
    expected_value = (machine_rank +
                      sum(neighbor_ranks)) / (len(neighbor_ranks) + 1)

    K = 50  # number of tensors sent in a short time
    tensor_list, handles, names = [], [], []
    for i in range(K):
        tensor = torch.FloatTensor(
            *([23] * dim)).fill_(i + (rank -
                                      (local_size - 1) / 2.0) / local_size)
        tensor = cast_and_place(tensor, dtype)
        tensor_list.append(tensor)
        names.append("index{}_{}_{}".format(i, dtype, dim))

    for i in range(K):
        handle = bf.hierarchical_neighbor_allreduce_nonblocking(tensor_list[i],
                                                                name=names[i])
        handles.append(handle)

    outputs = []
    for i in range(K):
        output = bf.synchronize(handles[i])
        outputs.append(output)

    for i in range(K):
        assert (
            list(outputs[i].shape) == [23] * dim
        ), f"{names[i]} (hierarchical neighbor allreduce fusion) produces incorrect reduced shape"
        assert ((outputs[i] - expected_value - i).abs().max() < EPSILON), (
            f"{names[i]} (hierarchical neighbor allreduce fusion) produces incorrect reduced tensor"
            f" when K = {i}")
Example #9
    def _synchronize(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self._states[p]
                with torch.no_grad():
                    p.set_(bf.synchronize(state['handle']))
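For context, a hedged sketch of how the handles consumed by ``_synchronize`` might be produced earlier in the same optimizer; the method name ``_nonblocking_communicate`` and the tensor naming scheme are illustrative, not part of the original class:

    def _nonblocking_communicate(self):
        # Launch one nonblocking neighbor allreduce per parameter and stash
        # the handle so that _synchronize() can wait on it later.
        for group in self.param_groups:
            for p in group['params']:
                handle = bf.neighbor_allreduce_nonblocking(
                    p.data, name='optimizer.param.' + str(id(p)))
                self._states[p]['handle'] = handle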