Example #1
def decompose_survey_data(first, second, attrs, logger, smoothing, domain_factor=2,
                            angular=False, return_cartesian=False):
    """
    Perform a domain decomposition on survey data, returning the
    domain-decomposed position and weight arrays for each object in the
    correlating pair.

    The domain decomposition is based on the Cartesian coordinates of
    the input data (assumed to be in sky coordinates).

    Load balancing is required since the distribution in Cartesian space
    will likely not be uniform.

    The implementation proceeds as follows:

    1. Decompose the first source and balance the particle load, such that
       the first source is evenly distributed across all ranks and the
       objects are spatially tight on a given rank.
    2. Decompose the second source, ensuring a given rank holds all
       particles within the desired maximum separation.

    Parameters
    ----------
    first : CatalogSource
        the first source we are correlating
    second : CatalogSource
        the second source we are correlating
    attrs : dict
        dict of parameters from the pair counting algorithm
    logger : logging.Logger
        the current active logger
    smoothing : float
        the maximum Cartesian separation implied by the user's binning
    domain_factor : int, optional
        the factor by which we over-sample the mesh with cells in a given
        direction; higher values can lead to better performance
    angular : bool, optional
        if ``True``, the Cartesian positions used in the domain
        decomposition are on the unit sphere
    return_cartesian : bool, optional
        whether to return the positions as (ra, dec, z) or as Cartesian (x, y, z)

    Returns
    -------
    (pos1, w1), (pos2, w2) : array_like
        the (decomposed) set of positions and weights to correlate
    """
    from nbodykit.transform import StackColumns
    comm = first.comm

    # either (ra,dec) or (ra,dec,redshift)
    poscols = [attrs['ra'], attrs['dec']]
    if not angular: poscols += [attrs['redshift']]

    # determine processor division for domain decomposition
    np = split_size_3d(comm.size)
    if comm.rank == 0:
        logger.info("using cpu grid decomposition: %s" %str(np))

    # stack position and compute
    pos1 = StackColumns(*[first[col] for col in poscols])
    pos1, w1 = first.compute(pos1, first[attrs['weight']])
    N1 = comm.allreduce(len(pos1))

    # only need cosmo if not angular
    cosmo = attrs.get('cosmo', None) if not angular else None
    if not angular and cosmo is None:
        raise ValueError("need a cosmology to decompose non-angular survey data")
    cpos1, cpos1_min, cpos1_max, rdist1 = get_cartesian(comm, pos1, cosmo=cosmo)

    # pass in comoving dist to Corrfunc instead of redshift
    if not angular:
        pos1 = pos1.copy() # dask >= 0.18.1 may not return a fresh copy; copy before the in-place write
        pos1[:,2] = rdist1

    # set up position for second too
    if second is not None:

        # stack position and compute for "second"
        pos2 = StackColumns(*[second[col] for col in poscols])
        pos2, w2 = second.compute(pos2, second[attrs['weight']])
        N2 = comm.allreduce(len(pos2))

        # get comoving dist and boxsize
        cpos2, cpos2_min, cpos2_max, rdist2 = get_cartesian(comm, pos2, cosmo=cosmo)

        # pass in comoving distance instead of redshift
        if not angular:
            pos2 = pos2.copy() # dask >= 0.18.1 may not return a fresh copy; copy before the in-place write
            pos2[:,2] = rdist2
    else:
        pos2 = pos1
        w2 = w1
        N2 = N1
        cpos2_min = cpos1_min
        cpos2_max = cpos1_max
        cpos2 = cpos1

    # determine global boxsize
    if second is None:
        cpos_min = cpos1_min
        cpos_max = cpos1_max
    else:
        cpos_min = numpy.min(numpy.vstack([cpos1_min, cpos2_min]), axis=0)
        cpos_max = numpy.max(numpy.vstack([cpos1_max, cpos2_max]), axis=0)

    boxsize = cpos_max - cpos_min

    if comm.rank == 0:
        logger.info("position variable range on rank 0 (max, min) = %s, %s" % (cpos_max, cpos_min))

    # initialize the domain
    # NOTE: over-decompose by domain_factor (default 2) to aid load balancing
    grid = [
        numpy.linspace(cpos_min[0], cpos_max[0], domain_factor*np[0] + 1, endpoint=True),
        numpy.linspace(cpos_min[1], cpos_max[1], domain_factor*np[1] + 1, endpoint=True),
        numpy.linspace(cpos_min[2], cpos_max[2], domain_factor*np[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm, periodic=False)

    # balance the load
    domain.loadbalance(domain.load(cpos1))

    if comm.rank == 0:
        logger.info("Load balance done")

    # if we want to return cartesian, redefine pos
    if return_cartesian:
        pos1 = cpos1
        pos2 = cpos2

    # decompose based on cartesian positions
    layout = domain.decompose(cpos1, smoothing=0)
    pos1   = layout.exchange(pos1)
    w1     = layout.exchange(w1)

    # get the position/weight of the secondaries; if the maximum separation
    # is a large fraction of the box, give every rank the full second catalog
    # rather than exchanging with a very large smoothing radius
    if smoothing > boxsize.max() * 0.25:
        pos2 = numpy.concatenate(comm.allgather(pos2), axis=0)
        w2   = numpy.concatenate(comm.allgather(w2), axis=0)
    else:
        layout  = domain.decompose(cpos2, smoothing=smoothing)
        pos2 = layout.exchange(pos2)
        w2   = layout.exchange(w2)

    # log the decomposition breakdown
    log_decomposition(comm, logger, N1, N2, pos1, pos2)

    return (pos1, w1), (pos2, w2)
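
A minimal usage sketch (hypothetical, not from the source: ``cat`` stands for an existing nbodykit CatalogSource with 'RA', 'DEC', 'Z', and 'Weight' columns; the ``attrs`` keys mirror what the function reads):

import logging
from nbodykit.cosmology import Planck15

logger = logging.getLogger('paircount')

# the column names and the cosmology entry are read via attrs[...] above
attrs = {
    'ra': 'RA',           # right ascension column (degrees)
    'dec': 'DEC',         # declination column (degrees)
    'redshift': 'Z',      # redshift column
    'weight': 'Weight',   # per-object weight column
    'cosmo': Planck15,    # needed to map redshift to comoving distance
}

# smoothing is the largest pair separation the binning can produce;
# second=None requests the auto-correlation decomposition
(pos1, w1), (pos2, w2) = decompose_survey_data(cat, None, attrs, logger,
                                               smoothing=150.0)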
Example #2
def decompose_survey_data(first, second, attrs, logger, smoothing, domain_factor=2,
                            angular=False, return_cartesian=False):
    """
    Perform a domain decomposition on survey data, returning the
    domain-decomposed position and weight arrays for each object in the
    correlating pair.

    The domain decomposition is based on the Cartesian coordinates of
    the input data (assumed to be in sky coordinates).

    Load balancing is required since the distribution in Cartesian space
    will likely not be uniform.

    The implementation proceeds as follows:

    1. Decompose the first source and balance the particle load, such that
       the first source is evenly distributed across all ranks and the
       objects are spatially tight on a given rank.
    2. Decompose the second source, ensuring a given rank holds all
       particles within the desired maximum separation.

    Parameters
    ----------
    first : CatalogSource
        the first source we are correlating
    second : CatalogSource
        the second source we are correlating
    attrs : dict
        dict of parameters from the pair counting algorithm
    logger : logging.Logger
        the current active logger
    smoothing : float
        the maximum Cartesian separation implied by the user's binning
    domain_factor : int, optional
        the factor by which we over-sample the mesh with cells in a given
        direction; higher values can lead to better performance
    angular : bool, optional
        if ``True``, the Cartesian positions used in the domain
        decomposition are on the unit sphere
    return_cartesian : bool, optional
        whether to return the positions as (ra, dec, z) or as Cartesian (x, y, z)

    Returns
    -------
    (pos1, w1), (pos2, w2) : array_like
        the (decomposed) set of positions and weights to correlate
    """
    from nbodykit.transform import StackColumns
    comm = first.comm

    # either (ra,dec) or (ra,dec,redshift)
    poscols = [attrs['ra'], attrs['dec']]
    if not angular: poscols += [attrs['redshift']]

    # determine processor division for domain decomposition
    np = split_size_3d(comm.size)
    if comm.rank == 0:
        logger.info("using cpu grid decomposition: %s" %str(np))

    # stack position and compute
    pos1 = StackColumns(*[first[col] for col in poscols])
    pos1, w1 = first.compute(pos1, first[attrs['weight']])
    N1 = comm.allreduce(len(pos1))

    # only need cosmo if not angular
    cosmo = attrs.get('cosmo', None) if not angular else None
    if not angular and cosmo is None:
        raise ValueError("need a cosmology to decompose non-angular survey data")
    cpos1, cpos1_min, cpos1_max, rdist1 = get_cartesian(comm, pos1, cosmo=cosmo)

    # pass in comoving dist to Corrfunc instead of redshift
    if not angular:
        pos1 = pos1.copy() # we need to overwrite it; dask doesn't always return a copy after 0.18.1
        pos1[:,2] = rdist1

    # set up position for second too
    if second is not None:

        # stack position and compute for "second"
        pos2 = StackColumns(*[second[col] for col in poscols])
        pos2, w2 = second.compute(pos2, second[attrs['weight']])
        N2 = comm.allreduce(len(pos2))

        # get comoving dist and boxsize
        cpos2, cpos2_min, cpos2_max, rdist2 = get_cartesian(comm, pos2, cosmo=cosmo)

        # pass in comoving distance instead of redshift
        if not angular:
            pos2 = pos2.copy() # we need to overwrite it; dask doesn't always return a copy after 0.18.1
            pos2[:,2] = rdist2
    else:
        pos2 = pos1
        w2 = w1
        N2 = N1
        cpos2_min = cpos1_min
        cpos2_max = cpos1_max
        cpos2 = cpos1

    # determine global boxsize
    if second is None:
        cpos_min = cpos1_min
        cpos_max = cpos1_max
    else:
        cpos_min = numpy.min(numpy.vstack([cpos1_min, cpos2_min]), axis=0)
        cpos_max = numpy.max(numpy.vstack([cpos1_max, cpos2_max]), axis=0)

    boxsize = cpos_max - cpos_min

    if comm.rank == 0:
        logger.info("position variable range on rank 0 (max, min) = %s, %s" % (cpos_max, cpos_min))

    # initialize the domain
    # NOTE: over-decompose by domain_factor (default 2) to aid load balancing
    grid = [
        numpy.linspace(cpos_min[0], cpos_max[0], domain_factor*np[0] + 1, endpoint=True),
        numpy.linspace(cpos_min[1], cpos_max[1], domain_factor*np[1] + 1, endpoint=True),
        numpy.linspace(cpos_min[2], cpos_max[2], domain_factor*np[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm, periodic=False)

    # balance the load
    domain.loadbalance(domain.load(cpos1))

    if comm.rank == 0:
        logger.info("Load balance done")

    # if we want to return cartesian, redefine pos
    if return_cartesian:
        pos1 = cpos1
        pos2 = cpos2

    # decompose based on cartesian positions
    layout = domain.decompose(cpos1, smoothing=0)
    pos1   = layout.exchange(pos1)
    w1     = layout.exchange(w1)

    # get the position/weight of the secondaries; if the maximum separation
    # is a large fraction of the box, give every rank the full second catalog
    # rather than exchanging with a very large smoothing radius
    if smoothing > boxsize.max() * 0.25:
        pos2 = numpy.concatenate(comm.allgather(pos2), axis=0)
        w2   = numpy.concatenate(comm.allgather(w2), axis=0)
    else:
        layout  = domain.decompose(cpos2, smoothing=smoothing)
        pos2 = layout.exchange(pos2)
        w2   = layout.exchange(w2)

    # log the decomposition breakdown
    log_decomposition(comm, logger, N1, N2, pos1, pos2)

    return (pos1, w1), (pos2, w2)
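
The defensive ``.copy()`` calls above exist because dask (0.18.1 and later) does not always return a fresh array from ``compute``, so an in-place write could touch memory dask still references. A hypothetical helper isolating the pattern:

def overwrite_redshift_column(pos, rdist):
    # pos may share memory with a dask-managed buffer; detach it
    # before replacing the redshift column with comoving distance
    pos = pos.copy()
    pos[:, 2] = rdist
    return pos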
Example #3
def fof(source, linking_length, comm, periodic, domain_factor, logger):
    """
    Run Friends-of-friends halo finder.

    Friends-of-friends was first used by Davis et al. (1985) to define
    halos in the hierarchical structure formation of cosmological
    simulations. The algorithm is also known as DBSCAN in computer
    science. The subroutine here implements a parallel version of FOF.

    The underlying local FOF algorithm is from `kdcount.cluster`,
    which is an adaptation of the implementation in Volker Springel's
    Gadget and Martin White's PM. It could have been done faster.

    Parameters
    ----------
    source: CatalogSource
        the input source of particles; must support 'Position' column;
        ``source.attrs['BoxSize']`` is also used
    linking_length: float
        linking length in data units (usually Mpc/h)
    comm: MPI.Comm
        the MPI communicator
    periodic: bool
        if ``True``, use periodic boundary conditions with the box size
        taken from ``source.attrs['BoxSize']``
    domain_factor: int
        the factor by which the domain grid over-samples the processor
        mesh in each direction
    logger:
        the current active logger

    Returns
    -------
    minid: array_like
        the group label of each position; labels identify groups uniquely
        but are not contiguous and do not start from 0
    """
    from pmesh.domain import GridND

    np = split_size_3d(comm.size)
    nd = np * domain_factor

    if periodic:
        BoxSize = source.attrs.get('BoxSize', None)
        if BoxSize is None:
            raise ValueError(
                "cannot compute FOF clustering of source without 'BoxSize' in ``attrs`` dict"
            )
        if numpy.isscalar(BoxSize):
            BoxSize = [BoxSize, BoxSize, BoxSize]

        left = [0, 0, 0]
        right = BoxSize
    else:
        BoxSize = None
        left = numpy.min(comm.allgather(
            source['Position'].min(axis=0).compute()),
                         axis=0)
        right = numpy.max(comm.allgather(
            source['Position'].max(axis=0).compute()),
                          axis=0)

    grid = [
        numpy.linspace(left[0], right[0], nd[0] + 1, endpoint=True),
        numpy.linspace(left[1], right[1], nd[1] + 1, endpoint=True),
        numpy.linspace(left[2], right[2], nd[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm, periodic=periodic)

    Position = source.compute(source['Position'])
    np = comm.allgather(len(Position))
    if comm.rank == 0:
        logger.info(
            "Number of particles max/min = %d / %d before spatial decomposition"
            % (max(np), min(np)))

    # balance the load
    domain.loadbalance(domain.load(Position))

    layout = domain.decompose(Position, smoothing=linking_length * 1)

    np = comm.allgather(layout.recvlength)
    if comm.rank == 0:
        logger.info(
            "Number of particles max/min = %d / %d after spatial decomposition"
            % (max(np), min(np)))

    comm.barrier()
    minid = _fof_local(layout, Position, BoxSize, linking_length, comm)

    comm.barrier()
    minid = _fof_merge(layout, minid, comm)

    return minid
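
A minimal usage sketch (hypothetical: ``cat`` stands for a CatalogSource with a 'Position' column and 'BoxSize' in its ``attrs``; within nbodykit this routine is normally driven by the higher-level FOF class rather than called directly):

import logging

logger = logging.getLogger('fof')

# a common choice: 0.2 times the mean inter-particle separation
# (BoxSize assumed scalar here; cat.csize is the collective size)
b = 0.2 * cat.attrs['BoxSize'] / cat.csize ** (1. / 3.)

minid = fof(cat, linking_length=b, comm=cat.comm, periodic=True,
            domain_factor=2, logger=logger)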
Example #4
def fof(source, linking_length, comm, periodic, domain_factor, logger):
    """
    Run Friends-of-friends halo finder.

    Friends-of-friends was first used by Davis et al. (1985) to define
    halos in the hierarchical structure formation of cosmological
    simulations. The algorithm is also known as DBSCAN in computer
    science. The subroutine here implements a parallel version of FOF.

    The underlying local FOF algorithm is from `kdcount.cluster`,
    which is an adaptation of the implementation in Volker Springel's
    Gadget and Martin White's PM. It could have been done faster.

    Parameters
    ----------
    source: CatalogSource
        the input source of particles; must support 'Position' column;
        ``source.attrs['BoxSize']`` is also used
    linking_length: float
        linking length in data units (usually Mpc/h)
    comm: MPI.Comm
        the MPI communicator
    periodic: bool
        if ``True``, use periodic boundary conditions with the box size
        taken from ``source.attrs['BoxSize']``
    domain_factor: int
        the factor by which the domain grid over-samples the processor
        mesh in each direction
    logger:
        the current active logger

    Returns
    -------
    minid: array_like
        the group label of each position; labels identify groups uniquely
        but are not contiguous and do not start from 0
    """
    from pmesh.domain import GridND

    np = split_size_3d(comm.size)
    nd = np * domain_factor

    if periodic:
        BoxSize = source.attrs.get('BoxSize', None)
        if BoxSize is None:
            raise ValueError("cannot compute FOF clustering of source without 'BoxSize' in ``attrs`` dict")
        if numpy.isscalar(BoxSize):
            BoxSize = [BoxSize, BoxSize, BoxSize]

        left = [0, 0, 0]
        right = BoxSize
    else:
        BoxSize = None
        left = numpy.min(comm.allgather(source['Position'].min(axis=0).compute()), axis=0)
        right = numpy.max(comm.allgather(source['Position'].max(axis=0).compute()), axis=0)

    grid = [
        numpy.linspace(left[0], right[0], nd[0] + 1, endpoint=True),
        numpy.linspace(left[1], right[1], nd[1] + 1, endpoint=True),
        numpy.linspace(left[2], right[2], nd[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm, periodic=periodic)

    Position = source.compute(source['Position'])
    np = comm.allgather(len(Position))
    if comm.rank == 0:
        logger.info("Number of particles max/min = %d / %d before spatial decomposition" % (max(np), min(np)))

    # balance the load
    domain.loadbalance(domain.load(Position))

    layout = domain.decompose(Position, smoothing=linking_length * 1)

    np = comm.allgather(layout.newlength)
    if comm.rank == 0:
        logger.info("Number of particles max/min = %d / %d after spatial decomposition" % (max(np), min(np)))

    comm.barrier()
    minid = _fof_local(layout, Position, BoxSize, linking_length, comm)

    comm.barrier()
    minid = _fof_merge(layout, minid, comm)

    return minid
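
Since the returned labels are unique but not contiguous, a common follow-up is to compact them to 0..Ngroups-1. A serial sketch with numpy (the parallel relabeling inside nbodykit is more involved):

import numpy

# return_inverse maps each particle's minid to a contiguous group index
unique_ids, labels = numpy.unique(minid, return_inverse=True)
ngroups = len(unique_ids)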