def hier_setup():
    os.environ['BLUEFOG_NODES_PER_MACHINE'] = '2'
    bf.init()
    assert bf.size() % 2 == 0
    machine_size = int(bf.size() // 2)
    bf.set_machine_topology(bf.ExponentialGraph(machine_size))
    return bf.rank(), bf.size(), bf.local_rank(), bf.local_size()
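# Sketch of the machine-level topology that hier_setup installs, assuming
# 8 MPI processes (so machine_size = 4). In an exponential graph, machine i
# sends to machines (i + 2**k) % machine_size for k = 0, 1, ...; the graph
# can be inspected directly outside MPI since it is a plain directed graph.
# The 4-machine size below is an assumption for illustration.
import bluefog.torch as bf

G = bf.ExponentialGraph(4)
print(sorted(G.successors(0)))  # out-neighbors of machine 0 (may include a self-loop)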
def cast_and_place(tensor, dtype):
    if dtype.is_cuda:
        if bf.nccl_built() and bf.local_size() > torch.cuda.device_count():
            raise EnvironmentError(
                "Cannot run more processes on one machine than the GPU "
                "device count in an NCCL environment")
        return tensor.cuda(bf.rank() % torch.cuda.device_count()).type(dtype)
    return tensor.type(dtype)
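# Hypothetical usage of cast_and_place (the tensor and dtypes are chosen for
# illustration, and bf.init() must already have been called since the helper
# reads bf.rank()): a CUDA dtype places the tensor on this rank's GPU, while
# a CPU dtype only casts.
import torch

x = torch.zeros(16)
x_cpu = cast_and_place(x, torch.DoubleTensor)          # stays on CPU, float64
if torch.cuda.is_available():
    x_gpu = cast_and_place(x, torch.cuda.FloatTensor)  # GPU rank % device_count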
else:
    raise ValueError("Unknown args.virtual_topology, supporting options are "
                     "[expo2(Default), ring, mesh, star].")

x_bar = bf.allreduce(x, average=True)
mse = [torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2)]

if not args.asynchronous_mode:
    self_weight = None
    neighbor_weights = None
    send_neighbors = None

    if args.enable_dynamic_topology:
        if args.virtual_topology == "InnerOuterExpo2":
            dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                bf.size(), local_size=bf.local_size(), self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = topology_util.GetDynamicOnePeerSendRecvRanks(
                bf.load_topology(), bf.rank())

    for ite in range(args.max_iters):
        if args.enable_dynamic_topology:
            send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen)
            neighbor_weights = {
                r: 1 / (len(recv_neighbors) + 1) for r in recv_neighbors}
            self_weight = 1 / (len(recv_neighbors) + 1)

        x = bf.neighbor_allreduce(x, self_weight=self_weight,
                                  neighbor_weights=neighbor_weights,
                                  send_neighbors=send_neighbors)
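# Quick check of the weight convention used above, with hypothetical recv
# ranks: the self weight and the uniform neighbor weights form a convex
# combination, i.e. they sum to exactly one, which is what makes each
# neighbor_allreduce step an average that preserves the global mean x_bar.
recv_neighbors_example = [1, 3, 5]
w = 1 / (len(recv_neighbors_example) + 1)
weights_example = {r: w for r in recv_neighbors_example}
assert abs(w + sum(weights_example.values()) - 1.0) < 1e-12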
elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
    optimizer = bf.DistributedHierarchicalNeighborAllreduceOptimizer(
        optimizer, model=model)
elif args.dist_optimizer == 'horovod':
    optimizer = bf.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
else:
    raise ValueError('Unknown args.dist-optimizer type -- ' + args.dist_optimizer + '\n' +
                     'Please set the argument to be one of '
                     '[neighbor_allreduce, hierarchical_neighbor_allreduce, '
                     'gradient_allreduce, allreduce, win_put, horovod]')

if not args.disable_dynamic_topology and (args.dist_optimizer != 'horovod'):
    if args.dist_optimizer == 'neighbor_allreduce':
        if bf.is_homogeneous() and bf.size() > bf.local_size():
            dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                bf.size(), local_size=bf.local_size(), self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = topology_util.GetDynamicSendRecvRanks(
                bf.load_topology(), bf.rank())
    elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
        # So far, this optimizer supports only the dynamic Exp2 machine topology.
        dynamic_machine_neighbor_allreduce_gen = topology_util.GetExp2DynamicSendRecvMachineRanks(
            world_size=bf.size(), local_size=bf.local_size(),
            self_rank=bf.rank(), local_rank=bf.local_rank())
    else:
        dynamic_neighbor_allreduce_gen = topology_util.GetDynamicSendRecvRanks(
            bf.load_topology(), bf.rank())
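# Minimal sketch of the default ('neighbor_allreduce') branch of the chain
# above, assuming a torch model and an SGD base optimizer (both names are
# illustrative, not from the original source). The wrapper replaces the
# global gradient allreduce with a parameter average over the current
# virtual neighbors.
import torch.optim as optim

base_optimizer = optim.SGD(model.parameters(), lr=args.base_lr * bf.size())
optimizer = bf.DistributedNeighborAllreduceOptimizer(base_optimizer, model=model)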
def test_bluefog_local_size(hier_setup):
    _, true_size = mpi_env_rank_and_size()
    local_size = bf.local_size()
    assert local_size == min(2, true_size)
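# Hypothetical launch for this test (the file name is an assumption):
#   mpirun -np 4 python -m pytest -s test_hier_setup.py
# With BLUEFOG_NODES_PER_MACHINE=2, each virtual machine hosts at most two
# ranks, which is the cap the min(2, true_size) expectation encodes.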
    lr_adj = 1.0
elif epoch < 60:
    lr_adj = 1e-1
elif epoch < 80:
    lr_adj = 1e-2
else:
    lr_adj = 1e-3
for param_group in optimizer.param_groups:
    param_group["lr"] = (
        args.base_lr * bf.size() * args.batches_per_allreduce * lr_adj
    )

if not args.disable_dynamic_topology and (args.dist_optimizer != 'horovod'):
    if args.dist_optimizer == 'neighbor_allreduce':
        if bf.is_homogeneous() and bf.size() > bf.local_size():
            dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                bf.size(), local_size=bf.local_size(), self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = topology_util.GetDynamicSendRecvRanks(
                bf.load_topology(), bf.rank())
    elif args.dist_optimizer == 'hierarchical_neighbor_allreduce':
        # So far, this optimizer supports only the dynamic Exp2 machine topology.
        dynamic_machine_neighbor_allreduce_gen = topology_util.GetExp2DynamicSendRecvMachineRanks(
            world_size=bf.size(), local_size=bf.local_size(),
            self_rank=bf.rank(), local_rank=bf.local_rank()
        )
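# Worked example of the scaled learning rate above (the numbers are
# assumptions for illustration): with base_lr = 0.0125, 8 ranks, 4 batches
# per allreduce, and an epoch in [60, 80) so that lr_adj = 1e-2:
#   lr = 0.0125 * 8 * 4 * 1e-2 = 4e-3
base_lr_ex, size_ex, batches_ex, lr_adj_ex = 0.0125, 8, 4, 1e-2
assert abs(base_lr_ex * size_ex * batches_ex * lr_adj_ex - 4e-3) < 1e-12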