def distributed_grad_descent(X, y, loss, maxite=5000, alpha=1e-1, **kwargs):

    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    # w_opt has one entry per feature; n is the feature dimension of X
    n = X.shape[1]
    w_opt = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)

    for _ in range(maxite):
        # calculate gradient via pytorch autograd
        loss_step(X,
                  y,
                  w_opt,
                  tensor_name='allreduce.gradient',
                  loss=loss,
                  rho=rho)
        # global gradient
        grad = bf.allreduce(w_opt.grad.data, name='gradient')

        # distributed gradient descent
        w_opt.data = w_opt.data - alpha * grad
        w_opt.grad.data.zero_()

    loss_step(X,
              y,
              w_opt,
              tensor_name='allreduce.gradient',
              loss=loss,
              rho=rho)
    grad = bf.allreduce(w_opt.grad.data, name='gradient')  # global gradient

    # evaluate the convergence of distributed gradient descent
    # the norm of the global gradient is expected to be close to 0 (optimality condition)
    global_grad_norm = torch.norm(grad, p=2)
    print("[DG] Rank {}: global gradient norm: {}".format(
        bf.rank(), global_grad_norm))

    # the norm of the local gradient is not expected to be close to 0
    # because each rank converges to the global solution, not to its own local solution
    local_grad_norm = torch.norm(w_opt.grad.data, p=2)
    print("[DG] Rank {}: local gradient norm: {}".format(
        bf.rank(), local_grad_norm))

    return w_opt
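A minimal driver sketch for the routine above, assuming Bluefog has been initialized per rank and that `loss_step` (not shown here) fills `w_opt.grad`; the synthetic data and its shapes are purely illustrative:

import torch
import bluefog.torch as bf

bf.init()
torch.manual_seed(bf.rank())        # give every rank its own data shard

m, n = 500, 10                      # assumed local sample count / feature dimension
X = torch.randn(m, n, dtype=torch.double)
w_true = torch.randn(n, 1, dtype=torch.double)
y = (torch.rand(m, 1, dtype=torch.double) < torch.sigmoid(X.mm(w_true))).double()

w_opt = distributed_grad_descent(X, y, loss='logistic_regression',
                                 maxite=2000, alpha=1e-1)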
Example #2
def evaluation(model, dataloader, isCUDA):
    mseloss = nn.MSELoss()
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data, target in dataloader:
            if isCUDA:
                data, target = data.cuda(), target.cuda()
            y = model(data)
            loss = mseloss(y, target)
            total_loss += loss * len(target)
        total_loss /= len(dataloader.dataset)
    avg_total_loss = bf.allreduce(total_loss)
    return avg_total_loss.item()
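The evaluation above gives each rank's mean loss equal weight in the final average, which is only exact when every rank holds the same number of samples. A sample-weighted variant (a sketch; `weighted_evaluation` is not part of the example, and the imports mirror the snippet above) sums losses and counts separately and divides only once:

import torch
import torch.nn as nn
import bluefog.torch as bf

def weighted_evaluation(model, dataloader, isCUDA):
    # accumulate the rank-local sum of squared errors and the local sample count
    mseloss = nn.MSELoss(reduction='sum')
    model.eval()
    local_loss_sum, local_count = 0.0, 0
    with torch.no_grad():
        for data, target in dataloader:
            if isCUDA:
                data, target = data.cuda(), target.cuda()
            local_loss_sum += mseloss(model(data), target).item()
            local_count += len(target)
    # sum both quantities over all ranks, then divide once
    totals = bf.allreduce(torch.tensor([local_loss_sum, float(local_count)]),
                          average=False, name='eval_totals')
    return (totals[0] / totals[1]).item()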
Example #3
def test_hier_allreduce(hier_setup, dtype, dim):
    rank, size, local_rank, local_size = hier_setup
    tensor = torch.FloatTensor(*([23] * dim)).fill_(1).mul_(rank)
    name = "hier_local_allreduce_tensor_{}_{}".format(dim, dtype)
    tensor = cast_and_place(tensor, dtype)

    expected_value = rank - local_rank + (local_size - 1) / 2
    reduced_tensor = bf.allreduce(tensor,
                                  average=True,
                                  is_hierarchical_local=True,
                                  name=name)

    assert (list(reduced_tensor.shape) == [23] *
            dim), "bf.allreduce (hier_avg) produces incorrect reduced shape"
    assert ((reduced_tensor.data - expected_value).abs().max() < EPSILON
            ), "bf.allreduce (hier_avg) produces incorrect reduced tensor"
Example #4
elif args.virtual_topology == "expo4":
    bf.set_topology(topology_util.ExponentialGraph(bf.size(), base=4))
elif args.virtual_topology == "ring":
    bf.set_topology(topology_util.RingGraph(bf.size(), connect_style=1))
elif args.virtual_topology == "mesh":
    bf.set_topology(topology_util.RingGraph(bf.size(), connect_style=0),
                    is_weighted=True)
elif args.virtual_topology == "star":
    bf.set_topology(topology_util.StarGraph(bf.size()), is_weighted=True)
elif args.virtual_topology == "full":
    bf.set_topology(topology_util.FullyConnectedGraph(bf.size()))
else:
    raise ValueError("Unknown args.virtual_topology, supporting options are " +
                     "[expo2(Default), ring, mesh, star].")

x_bar = bf.allreduce(x, average=True)
mse = [torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2)]

if not args.asynchronous_mode:
    self_weight = None
    neighbor_weights = None
    send_neighbors = None

    if args.enable_dynamic_topology:
        if args.virtual_topology == "InnerOuterExpo2":
            dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                bf.size(), local_size=bf.local_size(), self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = topology_util.GetDynamicOnePeerSendRecvRanks(
                bf.load_topology(), bf.rank())

if args.method == "exact_diffusion":
    w = exact_diffusion(X, y, w, loss=args.task, maxite=args.max_iter,
                        alpha=args.lr, rho=rho)
elif args.method == "gradient_tracking":
    w = gradient_tracking(X, y, w, loss=args.task, maxite=args.max_iter,
                          alpha=args.lr, rho=rho)
elif args.method == "push_diging":
    w = push_diging(X, y, w, loss=args.task, maxite=args.max_iter,
                    alpha=args.lr, rho=rho)
else:
    raise NotImplementedError(
        'Algorithm not supported. This example only supports' +
        ' exact_diffusion, gradient_tracking, and push_diging')

# plot and print result
if bf.rank() == 0:
    # print(mse[-100:])
    plt.semilogy(mse)
    finalize_plot()

# calculate local and global gradient
loss_step(X, y, w, tensor_name='w_buff', loss=args.task, rho=rho)
grad = bf.allreduce(w.grad.data, name='gradient')  # global gradient

# evaluate the convergence of the selected decentralized algorithm
# the norm of the global gradient is expected to be close to 0 (optimality condition)
global_grad_norm = torch.norm(grad, p=2)
print("[{}] Rank {}: global gradient norm: {}".format(args.method, bf.rank(),
                                                      global_grad_norm))

# the norm of the local gradient is not expected to be close to 0
# because each rank converges to the global solution, not to its own local solution
local_grad_norm = torch.norm(w.grad.data, p=2)
print("[{}] Rank {}: local gradient norm: {}".format(args.method, bf.rank(),
                                                     local_grad_norm))
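`rho` is referenced above but never defined in this snippet; following the convention in the first example, it is the l2-regularization weight chosen per task. A plausible definition (a sketch; the value is assumed to match the first example's default):

# l2 regularization coefficient, selected per task as in the first example
if args.task == 'logistic_regression':
    rho = 1e-1
elif args.task == 'linear_regression':
    rho = 0
else:
    raise NotImplementedError(
        'Task not supported. This example only supports' +
        ' linear_regression and logistic_regression')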
Example #6
    elif args.method == "ATC_SGD":
        t0 = time.time()
        train_ATC_SGD(model, optimizer, train_loader, loss_fn)
        t = time.time()
        train_loss, train_acc = test(model, train_loader, loss_fn)
        test_loss, test_acc = test(model, test_loader, loss_fn)
    else:
        t0 = time.time()
        train_Diffusion_AVRG(model_0, model_i, optimizer_0, optimizer_i, train_loader, loss_fn)
        t = time.time()
        train_loss, train_acc = test(model_i, train_loader, loss_fn)
        test_loss, test_acc = test(model_i, test_loader, loss_fn)
    total_time += t-t0
    if bf.rank() == 0:
        print(f"{epoch:3d}/{test_loss:.5f}/{test_acc:.2f}%")
    
    res_list.append([epoch, train_loss, test_loss, train_acc, test_acc])

avg_time = total_time/n_epoch
res_list = bf.allreduce(torch.tensor(res_list))

if bf.rank() == 0:
    print(f"Avg Time Per Epoch: {avg_time:.2f}s")
    with open(f'{args.method}_{args.save_name}.csv', 'w') as f:
        for res in res_list:
            # each averaged row is [epoch, train_loss, test_loss, train_acc, test_acc]
            epoch, train_loss, test_loss, train_acc, test_acc = res.tolist()
            f.write(f"{int(epoch)},{train_loss},{test_loss},{train_acc},{test_acc}\n")
Example #7
def metric_average(val, name):
    tensor = torch.tensor(val)  # pylint: disable=not-callable
    avg_tensor = bf.allreduce(tensor, name=name)
    return avg_tensor.item()
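A typical use of the helper after a validation pass, assuming `test_loss` and `test_accuracy` are rank-local Python floats (both names hypothetical):

# Average the rank-local statistics across all ranks before reporting.
test_loss = metric_average(test_loss, 'avg_test_loss')
test_accuracy = metric_average(test_accuracy, 'avg_test_accuracy')
if bf.rank() == 0:
    print(f'Test loss: {test_loss:.4f}, accuracy: {100.0 * test_accuracy:.2f}%')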
Example #8
def log(s, nl=True):
    print(s, end='\n' if nl else '', flush=True)


# Warm-up
log('Running warmup...')
timeit.timeit(benchmark_step, number=args.num_warmup_batches)

# Benchmark
log('Running benchmark...')
img_secs = []
enable_profiling = args.profiler and (bf.rank() == 0)

with torch.autograd.profiler.profile(enable_profiling, True) as prof:
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.data_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, 'CPU'))
        img_secs.append(img_sec)

# Results
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)
img_secs_sum = bf.allreduce(torch.from_numpy(np.array(img_secs)),
                            average=False)
img_sec_mean_all = np.mean(img_secs_sum.numpy())
img_sec_conf_all = 1.96 * np.std(img_secs_sum.numpy())
print('[%d] Img/sec per %s: %.1f +-%.1f' %
      (bf.rank(), 'CPU', img_sec_mean, img_sec_conf))
log('Total img/sec on %d %s(s): %.1f +-%.1f' %
    (bf.size(), 'CPU', img_sec_mean_all, img_sec_conf_all))
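`benchmark_step` is referenced above but not shown; in this kind of synthetic benchmark it is usually a single forward/backward/update pass on fixed random data. A minimal sketch under that assumption (`model`, `optimizer`, `data`, and `target` are assumed to exist):

import torch.nn.functional as F

def benchmark_step():
    # one synthetic training step: forward, backward, parameter update
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target)
    loss.backward()
    optimizer.step()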
Example #9
    def update(self, val):
        self.sum += bf.allreduce(val.detach().cpu(), name=self.name)
        self.n += 1
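The `update` method above is shown in isolation; a minimal class it could belong to (a sketch in the spirit of the Metric helper used in Horovod/Bluefog example scripts, with the surrounding layout assumed) keeps a running allreduced sum and exposes the average:

import torch
import bluefog.torch as bf

class Metric:
    """Running value averaged across ranks via allreduce (sketch)."""

    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.0)
        self.n = torch.tensor(0.0)

    def update(self, val):
        # allreduce defaults to averaging, so each update adds the across-rank mean
        self.sum += bf.allreduce(val.detach().cpu(), name=self.name)
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n

A typical use is one instance per tracked quantity, e.g. loss_metric = Metric('train_loss'), updated once per batch and read back through loss_metric.avg.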