Example #1
import os
import pytest
import bluefog.torch as bf

@pytest.fixture  # consumed as a fixture by the tests, e.g. Example #5 below
def hier_setup():
    # Simulate two processes per machine so hierarchical topologies can be
    # exercised on a single host.
    os.environ['BLUEFOG_NODES_PER_MACHINE'] = '2'
    bf.init()
    assert bf.size() % 2 == 0
    machine_size = bf.size() // 2
    bf.set_machine_topology(bf.ExponentialGraph(machine_size))
    return bf.rank(), bf.size(), bf.local_rank(), bf.local_size()
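
For context, a minimal sketch of a test consuming this fixture; the test name and assertions are illustrative, not from the original suite:

def test_bluefog_hier_ranks(hier_setup):
    rank, size, local_rank, local_size = hier_setup
    # With two processes per simulated machine, each local group has size <= 2.
    assert 0 <= local_rank < local_size <= 2
    assert 0 <= rank < size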
Example #2
import torch
import bluefog.torch as bf

def cast_and_place(tensor, dtype):
    if dtype.is_cuda:
        # NCCL allows at most one process per GPU on each machine.
        if bf.nccl_built() and bf.local_size() > torch.cuda.device_count():
            raise EnvironmentError(
                "Cannot run more processes on one machine than the GPU"
                " device count in an NCCL environment")
        # Round-robin processes over the available GPU devices.
        return tensor.cuda(bf.rank() % torch.cuda.device_count()).type(dtype)
    return tensor.type(dtype)
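
A short usage sketch, assuming bf.init() has already been called; the shapes and dtypes are illustrative:

x = torch.randn(4, 4)
x_cpu = cast_and_place(x, torch.FloatTensor)             # stays on CPU
if torch.cuda.is_available():
    x_gpu = cast_and_place(x, torch.cuda.DoubleTensor)   # placed on this rank's GPU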
Example #3
else:
    raise ValueError("Unknown args.virtual_topology; supported options are "
                     "[expo2 (default), ring, mesh, star].")

x_bar = bf.allreduce(x, average=True)
mse = [torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2)]

if not args.asynchronous_mode:
    self_weight = None
    neighbor_weights = None
    send_neighbors = None

    if args.enable_dynamic_topology:
        if args.virtual_topology == "InnerOuterExpo2":
            dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                bf.size(), local_size=bf.local_size(), self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = topology_util.GetDynamicOnePeerSendRecvRanks(
                bf.load_topology(), bf.rank())

    for ite in range(args.max_iters):
        if args.enable_dynamic_topology:
            send_neighbors, recv_neighbors = next(
                dynamic_neighbor_allreduce_gen)
            neighbor_weights = {
                r: 1 / (len(recv_neighbors) + 1)
                for r in recv_neighbors
            }
            self_weight = 1 / (len(recv_neighbors) + 1)

        # Weighted average of x with the dynamically chosen neighbors.
        x = bf.neighbor_allreduce(x,
                                  self_weight=self_weight,
                                  neighbor_weights=neighbor_weights,
                                  send_neighbors=send_neighbors)
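
The mse list initialized above would typically be extended after each mixing step to track the relative consensus error; a one-line sketch of the loop tail, indented to sit inside the for-loop:

        mse.append(torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2))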
Example #4
elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
    optimizer = bf.DistributedHierarchicalNeighborAllreduceOptimizer(
        optimizer, model=model)
elif args.dist_optimizer == 'horovod':
    optimizer = bf.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
else:
    raise ValueError('Unknown args.dist_optimizer type -- ' +
                     args.dist_optimizer + '\n' +
                     'Please set the argument to be one of ' +
                     '[neighbor_allreduce, hierarchical_neighbor_allreduce, ' +
                     'gradient_allreduce, allreduce, win_put, horovod]')

if not args.disable_dynamic_topology and (args.dist_optimizer != 'horovod'):
    if args.dist_optimizer == 'neighbor_allreduce':
        if bf.is_homogeneous() and bf.size() > bf.local_size():
            dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                bf.size(), local_size=bf.local_size(), self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = topology_util.GetDynamicSendRecvRanks(
                bf.load_topology(), bf.rank())
    elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
        # So far, this optimizer only supports the following dynamic topology.
        dynamic_machine_neighbor_allreduce_gen = topology_util.GetExp2DynamicSendRecvMachineRanks(
            world_size=bf.size(),
            local_size=bf.local_size(),
            self_rank=bf.rank(),
            local_rank=bf.local_rank())
    else:
        dynamic_neighbor_allreduce_gen = topology_util.GetDynamicSendRecvRanks(
            bf.load_topology(), bf.rank())
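
These generators are typically drained once per training step to refresh the optimizer's communication pattern. A hedged sketch of such an update for the neighbor_allreduce case; the optimizer attributes (self_weight, neighbor_weights, send_neighbors) mirror the weight arguments used in Example #3 and are assumptions about this particular script:

def dynamic_topology_update():
    # Pull the next send/recv pairing and install uniform averaging weights
    # on the distributed optimizer before the next gradient step.
    send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen)
    optimizer.self_weight = 1 / (len(recv_neighbors) + 1)
    optimizer.neighbor_weights = {
        r: 1 / (len(recv_neighbors) + 1) for r in recv_neighbors}
    optimizer.send_neighbors = send_neighbors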
Example #5
def test_bluefog_local_size(hier_setup):
    _, true_size = mpi_env_rank_and_size()
    local_size = bf.local_size()
    # hier_setup pins two processes per simulated machine, so the local
    # group never exceeds 2 (or the world size, whichever is smaller).
    assert local_size == min(2, true_size)
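
mpi_env_rank_and_size is a test helper rather than part of the bluefog API; a minimal sketch of what it might look like, assuming Open MPI style environment variables:

import os

def mpi_env_rank_and_size():
    # Fall back to a single-process run when no MPI launcher is detected.
    rank = int(os.environ.get('OMPI_COMM_WORLD_RANK', 0))
    size = int(os.environ.get('OMPI_COMM_WORLD_SIZE', 1))
    return rank, size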
Example #6
        lr_adj = 1.0
    elif epoch < 60:
        lr_adj = 1e-1
    elif epoch < 80:
        lr_adj = 1e-2
    else:
        lr_adj = 1e-3
    # Linear-scaling rule: the base learning rate grows with the number of
    # workers and the number of locally accumulated batches.
    for param_group in optimizer.param_groups:
        param_group["lr"] = (
            args.base_lr * bf.size() * args.batches_per_allreduce * lr_adj
        )


if not args.disable_dynamic_topology and (args.dist_optimizer != 'horovod'):
    if args.dist_optimizer == 'neighbor_allreduce':
        if bf.is_homogeneous() and bf.size() > bf.local_size():
            dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                bf.size(),
                local_size=bf.local_size(),
                self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = topology_util.GetDynamicSendRecvRanks(
                bf.load_topology(), bf.rank())
    elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
        # So far, this optimizer only supports the following dynamic topology.
        dynamic_machine_neighbor_allreduce_gen = topology_util.GetExp2DynamicSendRecvMachineRanks(
            world_size=bf.size(),
            local_size=bf.local_size(),
            self_rank=bf.rank(),
            local_rank=bf.local_rank()
        )
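
The machine-level generator would be consumed per step in the same spirit as the rank-level one; in the following sketch the optimizer attribute names (neighbor_machine_weights, send_neighbor_machines) are assumptions modeled on the rank-level pattern above:

def dynamic_machine_topology_update():
    # Assumed attribute names; mirrors the rank-level update shown earlier.
    send_machines, recv_machines = next(dynamic_machine_neighbor_allreduce_gen)
    optimizer.self_weight = 1 / (len(recv_machines) + 1)
    optimizer.neighbor_machine_weights = {
        r: 1 / (len(recv_machines) + 1) for r in recv_machines}
    optimizer.send_neighbor_machines = send_machines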