def gradient_tracking(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    w = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    loss_step(X, y, w, tensor_name='neighbor.allreduce.Grad.Tracking.w',
              loss=loss, rho=rho)
    q = w.grad.data.clone()  # q^0 = grad(w^0)
    w.grad.data.zero_()
    grad_prev = q.clone()

    mse = []
    for _ in range(maxite):
        # Algorithm:
        #     w^{k+1} = neighbor_allreduce(w^k) - alpha*q^k
        #     q^{k+1} = neighbor_allreduce(q^k) + grad(w^{k+1}) - grad(w^k)
        # Notice the communication of neighbor_allreduce can overlap with
        # gradient computation.
        w_handle = bf.neighbor_allreduce_nonblocking(w.data, name='Grad.Tracking.w')
        q_handle = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')
        w.data = bf.synchronize(w_handle) - alpha * q

        # calculate local gradient
        loss_step(X, y, w, tensor_name='neighbor.allreduce.Grad.Tracking.w',
                  loss=loss, rho=rho)
        grad = w.grad.data.clone()
        q = bf.synchronize(q_handle) + grad - grad_prev
        grad_prev = grad
        w.grad.data.zero_()

        # record convergence
        if bf.rank() == 0:
            mse.append(torch.norm(w.data - w_opt.data, p=2))

    return w, mse
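# gradient_tracking above relies on a helper `loss_step` and a module-level
# feature dimension `n` that are defined elsewhere in the example. The sketch
# below is a minimal, hypothetical version of `loss_step` (not the original
# helper), assuming y is a {-1, +1} column vector for logistic regression and
# that `tensor_name` is only used for naming and can be ignored here.
import torch
import bluefog.torch as bf

def loss_step(X, y, w, tensor_name, loss, rho):
    # Compute the local loss on this rank's data shard and backpropagate
    # into w.grad; the caller reads and then zeroes w.grad.
    if loss == 'logistic_regression':
        loss_ = torch.mean(torch.log(1 + torch.exp(-y * X.mm(w)))) \
            + 0.5 * rho * torch.norm(w, p=2) ** 2
    elif loss == 'linear_regression':
        loss_ = torch.norm(X.mm(w) - y, p=2) ** 2 / (2 * X.shape[0])
    else:
        raise NotImplementedError
    loss_.backward()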
def _neighbor_allreduce_data_async(self, p):
    name = self._parameter_names.get(p)
    handle = bf.neighbor_allreduce_nonblocking(
        p.data, name=name,
        self_weight=self.self_weight,
        neighbor_weights=self.neighbor_weights,
        send_neighbors=self.send_neighbors,
        enable_topo_check=self.enable_topo_check)
    return handle
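# For context, a nonblocking handle like the one returned above is typically
# stored per parameter and waited on later, so the neighbor communication
# overlaps with the rest of the backward pass. The class below is a simplified,
# hypothetical sketch of that pattern, not the actual Bluefog optimizer; the
# names NeighborAveragingSGDSketch, register_hooks, _launch, and _handles are
# assumptions introduced for illustration.
import torch
import bluefog.torch as bf

class NeighborAveragingSGDSketch(torch.optim.SGD):
    def __init__(self, params, lr=0.01):
        super().__init__(params, lr=lr)
        self._handles = {}

    def _neighbor_allreduce_data_async(self, p):
        return bf.neighbor_allreduce_nonblocking(p.data)

    def register_hooks(self):
        # Launch the neighbor allreduce as soon as each parameter's gradient
        # is ready, instead of waiting for the whole backward pass to finish.
        for group in self.param_groups:
            for p in group['params']:
                if p.requires_grad:
                    p.register_hook(lambda grad, p=p: self._launch(p, grad))

    def _launch(self, p, grad):
        self._handles[p] = self._neighbor_allreduce_data_async(p)
        return grad

    def step(self, closure=None):
        # Wait for the neighbor averages, overwrite the weights, then apply
        # the local gradient step.
        for p, handle in self._handles.items():
            p.data.copy_(bf.synchronize(handle))
        self._handles.clear()
        return super().step(closure)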
def benchmark_step():
    global w, q, grad_prev, alpha

    if args.computation_mode == "normal":
        w_handle = bf.neighbor_allreduce_nonblocking(w.data, name='Grad.Tracking.w')
        w.data = -alpha * q + bf.synchronize(w_handle)
        q_handle = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')

        # calculate local gradient
        logistic_loss_step(w, rho, X, y,
                           tensor_name='neighbor.allreduce.Grad.Tracking.w',
                           calculate_by_hand=args.no_autograd)
        grad = w.grad.data.clone()
        q = bf.synchronize(q_handle) + grad - grad_prev
        grad_prev = grad
        w.grad.data.zero_()

    elif args.computation_mode == "compute_and_no_communicate":
        w.data = -alpha * q

        # calculate local gradient
        logistic_loss_step(w, rho, X, y,
                           tensor_name='neighbor.allreduce.Grad.Tracking.w',
                           calculate_by_hand=args.no_autograd)
        grad = w.grad.data.clone()
        q = grad - grad_prev
        grad_prev = grad
        w.grad.data.zero_()

    elif args.computation_mode == "sleep_and_communicate":
        w_handle = bf.neighbor_allreduce_nonblocking(w.data, name='Grad.Tracking.w')
        q_handle = bf.neighbor_allreduce_nonblocking(q, name='Grad.Tracking.q')
        w.data = bf.synchronize(w_handle)
        systemtime.sleep(args.sleep_time)
        q = bf.synchronize(q_handle)
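# benchmark_step above isolates the cost of one gradient-tracking iteration:
# computation only ("compute_and_no_communicate"), communication only
# ("sleep_and_communicate"), or both overlapped ("normal"). `systemtime` is
# presumably the standard `time` module imported under an alias, and w, q,
# grad_prev, alpha, rho, X, y, and args are globals of the benchmark script.
# Below is a hedged sketch of a driver that could time it; the names
# run_benchmark, num_iters, and warmup are assumptions, not part of the snippet.
import time
import torch
import bluefog.torch as bf

def run_benchmark(num_iters=50, warmup=5):
    for _ in range(warmup):
        benchmark_step()
    bf.allreduce(torch.zeros(1))   # used as a crude barrier so all ranks start together
    start = time.time()
    for _ in range(num_iters):
        benchmark_step()
    bf.allreduce(torch.zeros(1))   # crude barrier before stopping the clock
    per_iter_ms = 1000.0 * (time.time() - start) / num_iters
    if bf.rank() == 0:
        print("computation_mode=%s: %.2f ms per iteration"
              % (args.computation_mode, per_iter_ms))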
def _neighbor_allreduce_data_async(self, p):
    handle = bf.neighbor_allreduce_nonblocking(p.data)
    return handle
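# Unlike the variant further above, this version omits the name, weight, and
# topology arguments, so neighbor_allreduce_nonblocking falls back to the
# library defaults (uniform averaging over the in-neighbors of the currently
# registered virtual topology).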