def distributed_grad_descent(X, y, loss, maxite=5000, alpha=1e-1, **kwargs):
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    w_opt = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)

    for _ in range(maxite):
        # calculate gradient via pytorch autograd
        loss_step(X, y, w_opt, tensor_name='allreduce.gradient',
                  loss=loss, rho=rho)
        # global gradient
        grad = bf.allreduce(w_opt.grad.data, name='gradient')

        # distributed gradient descent
        w_opt.data = w_opt.data - alpha * grad
        w_opt.grad.data.zero_()

    loss_step(X, y, w_opt, tensor_name='allreduce.gradient',
              loss=loss, rho=rho)
    grad = bf.allreduce(w_opt.grad.data, name='gradient')  # global gradient

    # evaluate the convergence of distributed gradient descent:
    # the norm of the global gradient is expected to be 0 (optimality condition)
    global_grad_norm = torch.norm(grad, p=2)
    print("[DG] Rank {}: global gradient norm: {}".format(
        bf.rank(), global_grad_norm))

    # the norm of the local gradient is NOT expected to be close to 0,
    # because each rank converges to the global solution, not its local one
    local_grad_norm = torch.norm(w_opt.grad.data, p=2)
    print("[DG] Rank {}: local gradient norm: {}".format(
        bf.rank(), local_grad_norm))

    return w_opt
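A minimal driver sketch for the helper above, assuming `bf.init()` has not yet been called and that `loss_step` and the global dimension `n` are defined as in the surrounding example; the problem sizes and synthetic data are illustrative assumptions, not part of the original script.

# Hypothetical usage of distributed_grad_descent; m, n and the random data
# are assumptions. The function reads the module-level `n` defined here.
import torch
import bluefog.torch as bf

bf.init()
m, n = 100, 10  # assumed local sample count / feature dimension
X = torch.randn(m, n, dtype=torch.double)
y = torch.rand(m, 1, dtype=torch.double)
w_opt = distributed_grad_descent(X, y, loss='linear_regression',
                                 maxite=2000, alpha=1e-2)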
def evaluation(model, dataloader, isCUDA):
    mseloss = nn.MSELoss()
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data, target in dataloader:
            if isCUDA:
                data, target = data.cuda(), target.cuda()
            y = model(data)
            loss = mseloss(y, target)
            total_loss += loss * len(target)
    total_loss /= len(dataloader.dataset)
    avg_total_loss = bf.allreduce(total_loss)
    return avg_total_loss.item()
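A hedged usage sketch for `evaluation`: the toy model and loader below are stand-ins, not part of the original script, and `bf.init()` is assumed to run once per process.

# Illustrative call to evaluation(); the linear model and random dataset
# are assumptions. The returned MSE is averaged across all ranks.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import bluefog.torch as bf

bf.init()
model = nn.Linear(4, 1)
loader = DataLoader(TensorDataset(torch.randn(32, 4), torch.randn(32, 1)),
                    batch_size=8)
val_mse = evaluation(model, loader, isCUDA=False)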
def test_hier_allreduce(hier_setup, dtype, dim):
    rank, size, local_rank, local_size = hier_setup
    tensor = torch.FloatTensor(*([23] * dim)).fill_(1).mul_(rank)
    name = "hier_local_allreduce_tensor_{}_{}".format(dim, dtype)
    tensor = cast_and_place(tensor, dtype)

    expected_value = rank - local_rank + (local_size - 1) / 2
    reduced_tensor = bf.allreduce(tensor, average=True,
                                  is_hierarchical_local=True, name=name)

    assert list(reduced_tensor.shape) == [23] * dim, \
        "bf.allreduce (hier_avg) produces incorrect reduced shape"
    assert (reduced_tensor.data - expected_value).abs().max() < EPSILON, \
        "bf.allreduce (hier_avg) produces incorrect reduced tensor"
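A short sanity check of the `expected_value` formula used in the test above: a hierarchical-local allreduce averages only within one machine, and the ranks on the machine hosting `rank` are the consecutive integers from `rank - local_rank` through `rank - local_rank + local_size - 1`, whose mean is the first of them plus `(local_size - 1) / 2`. The 2-machine x 4-rank layout below is an assumption for illustration.

# Pure-Python check of the expected_value derivation (assumed 2 x 4 layout)
local_size = 4
for rank in range(8):
    local_rank = rank % local_size
    node_ranks = range(rank - local_rank, rank - local_rank + local_size)
    assert sum(node_ranks) / local_size == \
        rank - local_rank + (local_size - 1) / 2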
elif args.virtual_topology == "expo4": bf.set_topology(topology_util.ExponentialGraph(bf.size(), base=4)) elif args.virtual_topology == "ring": bf.set_topology(topology_util.RingGraph(bf.size(), connect_style=1)) elif args.virtual_topology == "mesh": bf.set_topology(topology_util.RingGraph(bf.size(), connect_style=0), is_weighted=True) elif args.virtual_topology == "star": bf.set_topology(topology_util.StarGraph(bf.size()), is_weighted=True) elif args.virtual_topology == "full": bf.set_topology(topology_util.FullyConnectedGraph(bf.size())) else: raise ValueError("Unknown args.virtual_topology, supporting options are " + "[expo2(Default), ring, mesh, star].") x_bar = bf.allreduce(x, average=True) mse = [torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2)] if not args.asynchronous_mode: self_weight = None neighbor_weights = None send_neighbors = None if args.enable_dynamic_topology: if args.virtual_topology == "InnerOuterExpo2": dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks( bf.size(), local_size=bf.local_size(), self_rank=bf.rank()) else: dynamic_neighbor_allreduce_gen = topology_util.GetDynamicOnePeerSendRecvRanks( bf.load_topology(), bf.rank())
        loss=args.task, maxite=args.max_iter, alpha=args.lr, rho=rho)
else:
    raise NotImplementedError(
        'Algorithm not supported. This example only supports' +
        ' exact_diffusion, gradient_tracking, and push_diging')

# plot and print result
if bf.rank() == 0:
    # print(mse[-100:])
    plt.semilogy(mse)
    finalize_plot()

# calculate local and global gradient
loss_step(X, y, w, tensor_name='w_buff', loss=args.task, rho=rho)
grad = bf.allreduce(w.grad.data, name='gradient')  # global gradient

# evaluate the convergence of the selected decentralized algorithm:
# the norm of the global gradient is expected to be 0 (optimality condition)
global_grad_norm = torch.norm(grad, p=2)
print("[{}] Rank {}: global gradient norm: {}".format(
    args.method, bf.rank(), global_grad_norm))

# the norm of the local gradient is NOT expected to be close to 0,
# because each rank converges to the global solution, not its local one
local_grad_norm = torch.norm(w.grad.data, p=2)
print("[{}] Rank {}: local gradient norm: {}".format(
    args.method, bf.rank(), local_grad_norm))
elif args.method == "ATC_SGD": t0 = time.time() train_ATC_SGD(model, optimizer, train_loader, loss_fn) t = time.time() train_loss, train_acc = test(model, train_loader, loss_fn) test_loss, test_acc = test(model, test_loader, loss_fn) else: t0 = time.time() train_Diffusion_AVRG(model_0, model_i, optimizer_0, optimizer_i, train_loader, loss_fn) t = time.time() train_loss, train_acc = test(model_i, train_loader, loss_fn) test_loss, test_acc = test(model_i, test_loader, loss_fn) total_time += t-t0 if bf.rank() == 0: print(f"{epoch:3d}/{test_loss:.5f}/{test_acc:.2f}%") res_list.append([epoch, train_loss, test_loss, train_acc, test_acc]) avg_time = total_time/n_epoch res_list = bf.allreduce(torch.tensor(res_list)) if bf.rank() == 0: print(f"Avg Time Per Epoch: {avg_time:.2f}s") with open(f'{args.method}_{args.save_name}.csv', 'w') as f: for res in res_list: epoch = res[0] train_loss = res[1] test_loss = res[2] train_acc = res[3] test_acc = res[4] f.write(f"{epoch},{train_loss},{test_loss},{train_acc},{test_acc}\n")
def metric_average(val, name):
    tensor = torch.tensor(val)  # pylint: disable=not-callable
    avg_tensor = bf.allreduce(tensor, name=name)
    return avg_tensor.item()
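A hedged usage sketch for `metric_average`, averaging a scalar accuracy over all ranks; the local counts below are stand-ins, not from the original script.

# Illustrative call: each rank contributes its local accuracy, and the
# helper returns the mean across ranks.
correct, total = 42, 50  # assumed local prediction counts
test_accuracy = metric_average(correct / total, 'avg_accuracy')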
def log(s, nl=True):
    # only rank 0 prints shared log messages; per-rank results use print()
    if bf.rank() != 0:
        return
    print(s, end='\n' if nl else '', flush=True)


# Warm-up
log('Running warmup...')
timeit.timeit(benchmark_step, number=args.num_warmup_batches)

# Benchmark
log('Running benchmark...')
img_secs = []
enable_profiling = args.profiler and (bf.rank() == 0)
with torch.autograd.profiler.profile(enable_profiling, True) as prof:
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.data_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, 'CPU'))
        img_secs.append(img_sec)

# Results
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)
img_secs_sum = bf.allreduce(torch.from_numpy(np.array(img_secs)),
                            average=False)
img_sec_mean_all = np.mean(img_secs_sum.numpy())
img_sec_conf_all = 1.96 * np.std(img_secs_sum.numpy())
print('[%d] Img/sec per %s: %.1f +-%.1f' %
      (bf.rank(), 'CPU', img_sec_mean, img_sec_conf))
log('Total img/sec on %d %s(s): %.1f +-%.1f' %
    (bf.size(), 'CPU', img_sec_mean_all, img_sec_conf_all))
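The timing loop above assumes a `benchmark_step` callable defined earlier in the script. A minimal sketch of what such a step typically looks like in this kind of synthetic benchmark; `model`, `optimizer`, `data`, and `target` are assumptions standing in for the script's actual setup.

# Hedged sketch of benchmark_step: one forward/backward/update pass on a
# fixed synthetic batch, so timeit measures steady-state training throughput.
import torch.nn.functional as F

def benchmark_step():
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target)
    loss.backward()
    optimizer.step()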
def update(self, val):
    # accumulate the across-rank average of val; n counts the updates
    self.sum += bf.allreduce(val.detach().cpu(), name=self.name)
    self.n += 1
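A hedged sketch of the accumulator class this method likely belongs to, modeled on the common Horovod-style metric helper; the class name, constructor fields, and `avg` property are assumptions beyond the `update` method shown above.

# Assumed surrounding class: tracks the running across-rank mean of a metric.
class Metric:
    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.0)
        self.n = torch.tensor(0.0)

    def update(self, val):
        self.sum += bf.allreduce(val.detach().cpu(), name=self.name)
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n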