Beispiel #1
0
def MPIBroadcast(inputs, root, mpi_ranks=None, **kwargs):
    """Broadcast a tensor to all nodes in the ``MPIGroup``.

    Parameters
    ----------
    inputs : Tensor
        The tensor to broadcast.
    root : int
        The world rank of root node.
    mpi_ranks : int or sequence of int, optional
        The world rank of nodes in group. Default is ``None`` (Use All).
        A single rank is wrapped into a one-element list, and a tuple
        is converted to a list, before the group is created.

    Returns
    -------
    Tensor
        The broadcast output.

    Notes
    -----
    For root, the output **shares** the input.

    For others, the input is **inaccessible**.

    """
    # Keep this as the first statement: ``locals()`` must only contain
    # the call arguments when it is handed to ``ParseArgs``.
    arguments = ParseArgs(locals())

    # Normalize ``mpi_ranks`` to a list of world ranks.
    if mpi_ranks is None:
        # No explicit group: use every rank reported by ``mpi.Size()``.
        mpi_ranks = list(range(mpi.Size()))
    elif isinstance(mpi_ranks, tuple):
        mpi_ranks = list(mpi_ranks)
    elif not isinstance(mpi_ranks, list):
        # A scalar rank. The previous code assigned the wrapped value to
        # a misspelled name, so the scalar reached ``CreateGroup`` as is.
        mpi_ranks = [mpi_ranks]

    comm, group = mpi.CreateGroup(root, incl=mpi_ranks)

    # Only the parsed inputs and the MPI handles are forwarded.
    arguments = {'inputs': arguments['inputs'], 'comm': comm, 'group': group}
    return Tensor.CreateOperator('MPIBroadcast', **arguments)
Beispiel #2
0
def MPIGather(inputs, root, mpi_ranks=None, **kwargs):
    """Gather a tensor from every node of the ``MPIGroup`` onto root.

    Parameters
    ----------
    inputs : Tensor
        The tensor to gather.
    root : int
        The world rank of root node.
    mpi_ranks: int, list of int or None
        The world rank of nodes in group. Default is ``None`` (Use All).

    Returns
    -------
    Tensor or list of Tensor
        The gathered outputs.

    Notes
    -----
    One output is requested per entry of ``mpi_ranks``.

    The outputs are **accessible** only for root and vice versa.

    When ``inputs.shape`` is set, each output receives its own copy of it.

    """
    CheckInputs(inputs, 1)
    # Snapshot the call arguments before any other local is bound.
    parsed = ParseArguments(locals())

    # Resolve the participating ranks into a list.
    if mpi_ranks is None:
        mpi_ranks = list(range(mpi.Size()))
    if not isinstance(mpi_ranks, list):
        mpi_ranks = [mpi_ranks]

    comm, group = mpi.CreateGroup(root, incl=mpi_ranks)

    outputs = Tensor.CreateOperator(
        nout=len(mpi_ranks),
        op_type='MPIGather',
        inputs=parsed['inputs'],
        comm=comm,
        group=group)

    # Without a known input shape there is nothing to propagate.
    if inputs.shape is None:
        return outputs

    # Handle the single-output and multi-output cases uniformly.
    targets = outputs if isinstance(outputs, list) else [outputs]
    for gathered in targets:
        gathered.shape = inputs.shape[:]

    return outputs
Beispiel #3
0
def MPIGather(inputs, root, mpi_ranks=None, **kwargs):
    """Gather a tensor from all nodes to root in the ``MPIGroup``.

    Parameters
    ----------
    inputs : Tensor
        The tensor to gather.
    root : int
        The world rank of root node.
    mpi_ranks : int or sequence of int, optional
        The world rank of nodes in group. Default is ``None`` (Use All).
        A single rank is wrapped into a one-element list, and a tuple
        is converted to a list, before the group is created.

    Returns
    -------
    sequence of Tensor
        The gathered outputs.

    Notes
    -----
    The number of outputs is decided on the number of ``mpi_ranks``.

    The outputs are **accessible** only for root and vice versa.

    """
    # Keep this as the first statement: ``locals()`` must only contain
    # the call arguments when it is handed to ``ParseArgs``.
    arguments = ParseArgs(locals())

    # Normalize ``mpi_ranks`` to a list of world ranks.
    if mpi_ranks is None:
        # No explicit group: use every rank reported by ``mpi.Size()``.
        mpi_ranks = list(range(mpi.Size()))
    elif isinstance(mpi_ranks, tuple):
        # Previously a tuple was wrapped whole, producing a one-element
        # group and ``num_outputs == 1`` regardless of its length.
        mpi_ranks = list(mpi_ranks)
    elif not isinstance(mpi_ranks, list):
        mpi_ranks = [mpi_ranks]

    comm, group = mpi.CreateGroup(root, incl=mpi_ranks)

    # One output is requested per participating rank.
    arguments = {
        'inputs': arguments['inputs'],
        'comm': comm,
        'group': group,
        'num_outputs': len(mpi_ranks)
    }

    return Tensor.CreateOperator('MPIGather', **arguments)
# Iteration count for this run.
# NOTE(review): presumably consumed by the solver/training loop defined
# elsewhere in this script -- confirm.
max_iters = 360000

# Override the configured data directory with the local VOC dataset path.
cfg.DATA_DIR = '/home/workspace/datasets/VOC'
# Divide the configured images-per-batch by the number of entries in
# ``gpus``. NOTE(review): this script uses Python 2 syntax, where ``/``
# floors when both operands are ints -- confirm IMS_PER_BATCH is an int
# that is evenly divisible by len(gpus).
cfg.IMS_PER_BATCH = cfg.IMS_PER_BATCH / len(gpus)

if __name__ == '__main__':

    # fix the random seeds (numpy and caffe) for reproducibility
    np.random.seed(cfg.RNG_SEED)
    caffe.set_random_seed(cfg.RNG_SEED)

    # setup caffe
    caffe.set_mode_gpu()

    # setup mpi
    # The launch must provide exactly one MPI node per entry in ``gpus``;
    # otherwise abort before touching any device.
    # NOTE(review): the message below reads 'Excepted'; 'Expected' was
    # probably intended.
    if len(gpus) != mpi.Size():
        raise ValueError('Excepted {} mpi nodes, but got {}.'.format(
            len(gpus), mpi.Size()))
    # Pick this process's device by indexing ``gpus`` with its MPI rank.
    caffe.set_device(gpus[mpi.Rank()])
    # Pass ranks 0..len(gpus)-1 to mpi.Parallel and rank 0 to mpi.Snapshot.
    # NOTE(review): presumably this enables data-parallel training on all
    # ranks and restricts snapshotting to rank 0 -- confirm against the
    # mpi module.
    mpi.Parallel([i for i in xrange(len(gpus))])
    mpi.Snapshot([0])
    # Every rank other than 0 is flagged as a non-root solver.
    if mpi.Rank() != 0:
        caffe.set_root_solver(False)

    # setup database
    # NOTE(review): ``imdb_name`` is used here while ``args.imdb_name`` is
    # used for the output directory below; confirm both are defined
    # elsewhere in the script and refer to the same value.
    cfg.DATABASE = imdb_name
    imdb = get_imdb(imdb_name)
    print 'Database({}): {} images will be used to train.'.format(
        cfg.DATABASE, imdb.db_size)
    # Absolute output path: <ROOT_DIR>/output/<EXP_DIR>/<imdb name>.
    output_dir = osp.abspath(
        osp.join(cfg.ROOT_DIR, 'output', cfg.EXP_DIR, args.imdb_name))