def MPIBroadcast(inputs, root, mpi_ranks=None, **kwargs):
    """Broadcast a tensor to all nodes in the ``MPIGroup``.

    Parameters
    ----------
    inputs : Tensor
        The tensor to broadcast.
    root : int
        The world rank of the root node.
    mpi_ranks : sequence of int, optional
        The world ranks of the nodes in the group. Default is ``None`` (use all).

    Returns
    -------
    Tensor
        The broadcast output.

    Notes
    -----
    For the root, the output **shares** the input.

    For the other nodes, the input is **inaccessible**.

    """
    arguments = ParseArgs(locals())
    if mpi_ranks is None:
        num_nodes = mpi.Size()
        mpi_ranks = [i for i in range(0, num_nodes)]
    # Wrap a single rank into a list
    if not isinstance(mpi_ranks, list): mpi_ranks = [mpi_ranks]
    comm, group = mpi.CreateGroup(root, incl=mpi_ranks)
    arguments = {'inputs': arguments['inputs'], 'comm': comm, 'group': group}
    return Tensor.CreateOperator('MPIBroadcast', **arguments)
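# A minimal usage sketch of MPIBroadcast (hedged: how the input tensor is
# created depends on the surrounding framework; 'weights' and the 4-node
# group below are hypothetical placeholders, only the MPIBroadcast call
# itself follows the signature defined above):
#
#     weights = ...  # a Tensor that exists on every node
#     synced = MPIBroadcast(weights, root=0, mpi_ranks=[0, 1, 2, 3])
#
# On rank 0 (the root), 'synced' shares the storage of 'weights'; on the
# other ranks it holds the values received from the root.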
def MPIGather(inputs, root, mpi_ranks=None, **kwargs):
    """Gather a tensor from all nodes to the root in the ``MPIGroup``.

    Parameters
    ----------
    inputs : Tensor
        The tensor to gather.
    root : int
        The world rank of the root node.
    mpi_ranks : int, sequence of int, optional
        The world ranks of the nodes in the group. Default is ``None`` (use all).

    Returns
    -------
    Tensor or list of Tensor
        The gathered outputs.

    Notes
    -----
    The number of outputs is determined by the number of ``mpi_ranks``.

    The outputs are **accessible** only on the root; on the other nodes they are **inaccessible**.

    """
    CheckInputs(inputs, 1)
    arguments = ParseArguments(locals())
    if mpi_ranks is None:
        num_nodes = mpi.Size()
        mpi_ranks = [i for i in range(0, num_nodes)]
    # Wrap a single rank into a list
    if not isinstance(mpi_ranks, list): mpi_ranks = [mpi_ranks]
    comm, group = mpi.CreateGroup(root, incl=mpi_ranks)
    arguments = {'inputs': arguments['inputs'], 'comm': comm, 'group': group}
    outputs = Tensor.CreateOperator(nout=len(mpi_ranks), op_type='MPIGather', **arguments)
    # Each gathered output keeps the shape of the input (if known)
    if inputs.shape is not None:
        if isinstance(outputs, list):
            for output in outputs:
                output.shape = inputs.shape[:]
        else:
            outputs.shape = inputs.shape[:]
    return outputs
def MPIGather(inputs, root, mpi_ranks=None, **kwargs):
    """Gather a tensor from all nodes to the root in the ``MPIGroup``.

    Parameters
    ----------
    inputs : Tensor
        The tensor to gather.
    root : int
        The world rank of the root node.
    mpi_ranks : sequence of int, optional
        The world ranks of the nodes in the group. Default is ``None`` (use all).

    Returns
    -------
    sequence of Tensor
        The gathered outputs.

    Notes
    -----
    The number of outputs is determined by the number of ``mpi_ranks``.

    The outputs are **accessible** only on the root; on the other nodes they are **inaccessible**.

    """
    arguments = ParseArgs(locals())
    if mpi_ranks is None:
        num_nodes = mpi.Size()
        mpi_ranks = [i for i in range(0, num_nodes)]
    # Wrap a single rank into a list
    if not isinstance(mpi_ranks, list): mpi_ranks = [mpi_ranks]
    comm, group = mpi.CreateGroup(root, incl=mpi_ranks)
    arguments = {
        'inputs': arguments['inputs'],
        'comm': comm,
        'group': group,
        'num_outputs': len(mpi_ranks),
    }
    return Tensor.CreateOperator('MPIGather', **arguments)
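# A minimal usage sketch of MPIGather (hedged: 'local_stat' is a hypothetical
# placeholder for a tensor that exists on every node; only the MPIGather call
# itself follows the signature defined above):
#
#     gathered = MPIGather(local_stat, root=0, mpi_ranks=[0, 1, 2, 3])
#
# 'gathered' contains len(mpi_ranks) == 4 outputs, one per rank in the group;
# they are accessible only on the root node.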
max_iters = 360000
cfg.DATA_DIR = '/home/workspace/datasets/VOC'
# Split the global batch across the MPI nodes (one GPU per node)
cfg.IMS_PER_BATCH = cfg.IMS_PER_BATCH / len(gpus)

if __name__ == '__main__':
    # fix the random seeds (numpy and caffe) for reproducibility
    np.random.seed(cfg.RNG_SEED)
    caffe.set_random_seed(cfg.RNG_SEED)

    # setup caffe
    caffe.set_mode_gpu()

    # setup mpi
    if len(gpus) != mpi.Size():
        raise ValueError('Expected {} mpi nodes, but got {}.'.format(
            len(gpus), mpi.Size()))
    caffe.set_device(gpus[mpi.Rank()])
    mpi.Parallel([i for i in xrange(len(gpus))])
    mpi.Snapshot([0])
    if mpi.Rank() != 0:
        caffe.set_root_solver(False)

    # setup database
    cfg.DATABASE = imdb_name
    imdb = get_imdb(imdb_name)
    print 'Database({}): {} images will be used to train.'.format(
        cfg.DATABASE, imdb.db_size)
    output_dir = osp.abspath(
        osp.join(cfg.ROOT_DIR, 'output', cfg.EXP_DIR, args.imdb_name))
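# Launch sketch (hedged: the script name is a placeholder; the check above
# requires the number of MPI processes to equal len(gpus)). For example, with
# gpus = [0, 1] and cfg.IMS_PER_BATCH = 2, each process trains on one image
# per iteration on its own GPU:
#
#     mpirun -np 2 python train_net.py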