def broadcast_parameters(params, root_rank):
    """
    Broadcasts the parameters from root rank to all other processes.

    Typical usage is to broadcast the ``model.state_dict()``,
    ``model.named_parameters()``, or ``model.parameters()``.

    Arguments:
        params: One of the following:
            - dict of parameters to broadcast (entries are processed in
              sorted key order)
            - list of parameters, or of ``(name, parameter)`` tuples
            - iterator/generator yielding parameters or
              ``(name, parameter)`` tuples, e.g. the direct result of
              ``model.named_parameters()`` or ``model.parameters()``
        root_rank: The rank of the process from which parameters will be
            broadcasted to all other processes.

    Raises:
        ValueError: If ``params`` is none of the supported kinds.
    """
    # Imported locally so this function does not rely on the module header.
    from collections.abc import Iterator

    if isinstance(params, dict):
        named_params = sorted(params.items())
    elif isinstance(params, (list, Iterator)):
        # Support both named_parameters() and regular parameters(): entries
        # that are not already (name, parameter) tuples get a None name.
        # Iterators (generators) are consumed once here.
        named_params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError("invalid params of type: %s" % type(params))

    # Issue every non-blocking broadcast first, then wait on all of them.
    handles = [
        bf.broadcast_nonblocking_(p, root_rank, name)
        for name, p in named_params
    ]

    # Wait for completion.
    for handle in handles:
        bf.synchronize(handle)
def allreduce_parameters(params):
    """
    Allreduce (average) the parameters across all processes, i.e., forcing
    all processes to have the same average model.

    Typical usage is to allreduce the ``model.named_parameters()``, or
    ``model.parameters()``.

    Arguments:
        params: One of the following:
            - dict of parameters to allreduce (entries are processed in
              sorted key order)
            - list of parameters, or of ``(name, parameter)`` tuples
            - iterator/generator yielding parameters or
              ``(name, parameter)`` tuples, e.g. the direct result of
              ``model.named_parameters()`` or ``model.parameters()``

    Raises:
        ValueError: If ``params`` is none of the supported kinds.
    """
    # Imported locally so this function does not rely on the module header.
    from collections.abc import Iterator

    if isinstance(params, dict):
        named_params = sorted(params.items())
    elif isinstance(params, (list, Iterator)):
        # Support both named_parameters() and regular parameters(): entries
        # that are not already (name, parameter) tuples get a None name.
        # Iterators (generators) are consumed once here.
        named_params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError("invalid params of type: %s" % type(params))

    # Issue every non-blocking allreduce first, then wait on all of them.
    handles = [
        bf.allreduce_nonblocking_(p, average=True, name=name)
        for name, p in named_params
    ]

    # Wait for completion.
    for handle in handles:
        bf.synchronize(handle)
def gradient_tracking(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):
    """Run the gradient-tracking iteration and return the final iterate.

    Every iteration applies:
        w^{k+1} = neighbor_allreduce(w^k) - alpha * q^k
        q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k)

    Arguments:
        X, y: Data forwarded unchanged to ``loss_step``.
        w_opt: Reference solution; only used to record the distance to ``w``.
        loss: Either ``'logistic_regression'`` or ``'linear_regression'``.
        maxite: Number of iterations to run.
        alpha: Step size applied to the tracking variable ``q``.
        **kwargs: ``rho`` is read for logistic regression (default 1e-1);
            linear regression always uses ``rho = 0``.

    Returns:
        A tuple ``(w, mse)``. ``mse`` holds one 2-norm of ``w - w_opt`` per
        iteration on rank 0 and stays empty on every other rank.

    Raises:
        NotImplementedError: If ``loss`` is not one of the supported tasks.
    """
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports'
            ' linear_regression and logistic_regression')

    # Keyword arguments shared by every call to loss_step below.
    step_kwargs = dict(
        tensor_name='neighbor.allreduce.Grad.Tracking.w', loss=loss, rho=rho)

    # NOTE(review): `n` is not a parameter or local of this function; it is
    # resolved from an outer (module) scope.
    w = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)

    # loss_step is expected to leave the gradient in w.grad, which is read
    # right after each call.
    loss_step(X, y, w, **step_kwargs)
    q = w.grad.data.clone()  # q^0 = grad(w^0)
    w.grad.data.zero_()
    prev_grad = q.clone()

    distance_log = []
    for _ in range(maxite):
        # Both exchanges are started before the local gradient is computed,
        # so the q exchange can overlap with that computation.
        pending_w = bf.neighbor_allreduce_nonblocking(
            w.data, name='Grad.Tracking.w')
        pending_q = bf.neighbor_allreduce_nonblocking(
            q, name='Grad.Tracking.q')
        w.data = bf.synchronize(pending_w) - alpha * q

        # Local gradient at the new iterate.
        loss_step(X, y, w, **step_kwargs)
        local_grad = w.grad.data.clone()

        q = bf.synchronize(pending_q) + local_grad - prev_grad
        prev_grad = local_grad
        w.grad.data.zero_()

        # Convergence is recorded on rank 0 only.
        if bf.rank() == 0:
            distance_log.append(torch.norm(w.data - w_opt.data, p=2))

    return w, distance_log
def synchronize(self):
    """Wait for all outstanding handles and load the results into their parameters.

    For every ``(parameter, handle)`` pair in ``self._handles`` whose handle
    is not ``None``, the handle is waited on with ``bf.synchronize``, the
    parameter is rebound to the returned tensor via ``set_`` and its entry in
    ``self._reduce_delay`` is reset to ``self._num_steps_per_communication``.
    Afterwards the handle table is emptied and ``self._synchronized`` is set.
    """
    with torch.no_grad():
        for param, pending in self._handles.items():
            if pending is None:
                # No handle was recorded for this parameter; nothing to wait on.
                continue
            synced = bf.synchronize(pending)
            param.set_(synced)
            # NOTE(review): the original indentation is not visible here; the
            # delay reset is assumed to apply only to parameters that were
            # actually synchronized - confirm.
            self._reduce_delay[param] = self._num_steps_per_communication

    self._handles.clear()
    self._synchronized = True
def benchmark_step():
    """Run one gradient-tracking step in the mode given by ``args.computation_mode``.

    Modes:
        "normal": exchange ``w`` and ``q`` with ``neighbor_allreduce`` and
            compute the local gradient through ``logistic_loss_step``.
        "compute_and_no_communicate": the same update without communication.
        "sleep_and_communicate": exchange ``w`` and ``q`` only, sleeping for
            ``args.sleep_time`` between the two waits in place of the
            gradient computation.

    Any other mode leaves the state untouched. The module-level ``w``, ``q``
    and ``grad_prev`` are read and updated in place / rebound.
    """
    global w, q, grad_prev, alpha

    mode = args.computation_mode

    if mode == "normal":
        handle_w = bf.neighbor_allreduce_nonblocking(w.data, name='Grad.Tracking.w')
        descent = -alpha * q
        w.data = descent + bf.synchronize(handle_w)
        # The q exchange is started before the gradient computation and only
        # waited on afterwards.
        handle_q = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')

        # Local gradient at the updated w.
        logistic_loss_step(w, rho, X, y,
                           tensor_name='neighbor.allreduce.Grad.Tracking.w',
                           calculate_by_hand=args.no_autograd)
        local_grad = w.grad.data.clone()
        q = bf.synchronize(handle_q) + local_grad - grad_prev
        grad_prev = local_grad
        w.grad.data.zero_()
        return

    if mode == "compute_and_no_communicate":
        w.data = -alpha * q

        # Local gradient at the updated w.
        logistic_loss_step(w, rho, X, y,
                           tensor_name='neighbor.allreduce.Grad.Tracking.w',
                           calculate_by_hand=args.no_autograd)
        local_grad = w.grad.data.clone()
        q = local_grad - grad_prev
        grad_prev = local_grad
        w.grad.data.zero_()
        return

    if mode == "sleep_and_communicate":
        handle_w = bf.neighbor_allreduce_nonblocking(w.data, name='Grad.Tracking.w')
        handle_q = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')
        w.data = bf.synchronize(handle_w)
        # The sleep stands in for the gradient computation of the other modes.
        systemtime.sleep(args.sleep_time)
        q = bf.synchronize(handle_q)
def synchronize(self):
    """Make sure every gradient has been allreduced, then install the results.

    Steps:
        1. Start an allreduce for each parameter in ``self._requires_update``
           that has no entry in ``self._handles`` yet.
        2. Start an allreduce for each entry whose handle is still ``None``.
        3. Wait on every handle, reset the parameter's ``_allreduce_delay`` to
           ``self._backward_passes_per_step`` and rebind ``p.grad`` to the
           synchronized output via ``set_``.

    Finally the handle table is emptied and ``self._synchronized`` is set.
    """
    # 1. Parameters that never had a handle registered.
    for param in self._requires_update - set(self._handles):
        self._handles[param] = self._allreduce_grad_async(param)

    # 2. Parameters registered with a None placeholder. Only existing keys are
    #    reassigned, so the dict size does not change while iterating.
    for param, pending in self._handles.items():
        if pending is None:
            self._handles[param] = self._allreduce_grad_async(param)

    # 3. Wait for completion and install the reduced gradients.
    for param, pending in self._handles.items():
        reduced = bf.synchronize(pending)
        self._allreduce_delay[param] = self._backward_passes_per_step
        param.grad.set_(reduced)

    self._handles.clear()
    self._synchronized = True
def test_hier_neighbor_allreduce_dynamic_move_dst_weight_fusion(
        hier_setup, dtype, dim):
    """Launch many dynamic hierarchical neighbor allreduces at once and check each result.

    Every machine receives from machine ``machine_rank + 1`` with source
    weight 0.5 and sends to machine ``machine_rank - 1`` with destination
    weight 2.0, using a self weight of 0. Output ``i`` is expected to have
    shape ``[23] * dim`` and to equal ``(machine_rank + 1) % machine_size + i``
    within ``EPSILON``.
    """
    rank, size, local_rank, local_size = hier_setup
    machine_rank = (rank - local_rank) // local_size
    machine_size = size // local_size

    next_machine = (machine_rank + 1) % machine_size
    prev_machine = (machine_rank - 1) % machine_size
    expected_value = next_machine
    src_machine_weights = {next_machine: 0.5}
    dst_machine_weights = {prev_machine: 2.0}

    K = 50  # number of tensors sent in a short time
    shape = [23] * dim
    # Per-rank offset added to the tensor index; presumably chosen so that the
    # average inside one machine equals its machine rank - TODO confirm.
    rank_offset = (rank - (local_size - 1) / 2.0) / local_size

    tensor_list = [
        cast_and_place(
            torch.FloatTensor(*shape).fill_(i + rank_offset), dtype)
        for i in range(K)
    ]
    names = ["index{}_{}_{}".format(i, dtype, dim) for i in range(K)]

    # Launch every operation before waiting on any of them.
    handles = [
        bf.hierarchical_neighbor_allreduce_nonblocking(
            tensor, self_weight=0.0,
            src_machine_weights=src_machine_weights,
            dst_machine_weights=dst_machine_weights,
            name=name)
        for tensor, name in zip(tensor_list, names)
    ]
    outputs = [bf.synchronize(handle) for handle in handles]

    for i, (name, output) in enumerate(zip(names, outputs)):
        assert (
            list(output.shape) == shape
        ), f"{name} (hierarchical neighbor allreduce fusion) produces incorrect reduced shape"
        assert (
            (output - expected_value - i).abs().max() < EPSILON
        ), f"{name} (hierarchical neighbor allreduce fusion) produces incorrect reduced tensor"
def test_hier_neighbor_allreduce_fusion(hier_setup, dtype, dim):
    """Launch many hierarchical neighbor allreduces at once and check each result.

    The expected value is the plain average of this machine's rank and the
    ranks returned by ``bf.in_neighbor_machine_ranks()``. Output ``i`` is
    expected to have shape ``[23] * dim`` and to equal ``expected_value + i``
    within ``EPSILON``.
    """
    # `size` is part of the fixture tuple but is not needed by this test.
    rank, size, local_rank, local_size = hier_setup
    machine_rank = (rank - local_rank) // local_size

    neighbor_ranks = bf.in_neighbor_machine_ranks()
    expected_value = (machine_rank + sum(neighbor_ranks)) / (len(neighbor_ranks) + 1)

    K = 50  # number of tensors sent in a short time
    shape = [23] * dim
    # Per-rank offset added to the tensor index; presumably chosen so that the
    # average inside one machine equals its machine rank - TODO confirm.
    rank_offset = (rank - (local_size - 1) / 2.0) / local_size

    tensor_list = [
        cast_and_place(
            torch.FloatTensor(*shape).fill_(i + rank_offset), dtype)
        for i in range(K)
    ]
    names = ["index{}_{}_{}".format(i, dtype, dim) for i in range(K)]

    # Launch every operation before waiting on any of them.
    handles = [
        bf.hierarchical_neighbor_allreduce_nonblocking(tensor, name=name)
        for tensor, name in zip(tensor_list, names)
    ]
    outputs = [bf.synchronize(handle) for handle in handles]

    for i, (name, output) in enumerate(zip(names, outputs)):
        assert (
            list(output.shape) == shape
        ), f"{name} (hierarchical neighbor allreduce fusion) produces incorrect reduced shape"
        assert ((output - expected_value - i).abs().max() < EPSILON), (
            f"{name} (hierarchical neighbor allreduce fusion) produces incorrect reduced tensor"
            f" when K = {i}")
def _synchronize(self):
    """Wait on each parameter's stored handle and load the result into the parameter.

    Walks every parameter of every group in ``self.param_groups``, waits on
    the handle kept under ``self._states[p]['handle']`` and rebinds the
    parameter to the returned tensor via ``set_`` with autograd disabled.
    """
    with torch.no_grad():
        for group in self.param_groups:
            for param in group['params']:
                pending = self._states[param]['handle']
                param.set_(bf.synchronize(pending))