Example #1
    def run(self):
        """
        Compute the density proxy. This attaches the following attribute:

        - :attr:`density`

        Attributes
        ----------
        density : array_like, length: :attr:`size`
            a unitless proxy density value for each object on the local
            rank, computed as the inverse cube of the distance to the
            nearest neighbor
        """

        # do the domain decomposition
        Np = split_size_3d(self.comm.size)
        edges = [
            numpy.linspace(0,
                           self.attrs['BoxSize'][d],
                           Np[d] + 1,
                           endpoint=True) for d in range(3)
        ]
        domain = GridND(comm=self.comm, periodic=True, edges=edges)

        # read all position and exchange
        pos = self._source.compute(self._source['Position'])
        layout = domain.decompose(pos,
                                  smoothing=self.attrs['margin'] *
                                  self.attrs['meansep'])
        xpos = layout.exchange(pos)

        # workaround until scipy 0.19.1: require a cubic box and rescale
        # positions to the unit cube for the periodic KDTree
        assert all(self.attrs['BoxSize'] == self.attrs['BoxSize'][0])
        xpos[...] /= self.attrs['BoxSize']
        xpos %= 1

        # build a periodic KDTree in the unit cube and query the distance
        # to the 8th nearest neighbor (k=[8] returns only that neighbor)
        tree = KDTree(xpos, boxsize=1.0)
        d, i = tree.query(xpos, k=[8])
        d = d[:, 0]

        # gather back to the original ranks, taking the minimum distance
        # over any ghost copies
        d = layout.gather(d, mode=numpy.fmin)
        self.density = 1 / (d**3 * self.attrs['BoxSize'].prod())
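
Every example in this collection calls split_size_3d to factor the
communicator size into a near-cubic 3D process grid. The helper itself is
not shown; below is a minimal sketch consistent with the inline factoring
loops in Example #8, not necessarily the exact nbodykit implementation.

import numpy

def split_size_3d(s):
    # factor s into three integers a <= b <= c with a * b * c == s,
    # starting near the cube root and working down
    a = int(s ** (1.0 / 3.0)) + 1
    while a > 1 and s % a != 0:
        a -= 1
    rem = s // a
    b = int(rem ** 0.5) + 1
    while b > 1 and rem % b != 0:
        b -= 1
    c = rem // b
    return numpy.array(sorted([a, b, c]))

# e.g. split_size_3d(12) -> array([2, 2, 3])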
Example #2
    def __init__(self, source, domain=None, position='Position', columns=None):
        comm = source.comm

        if domain is None:
            # determine processor division for domain decomposition
            np = split_size_3d(comm.size)

            if comm.rank == 0:
                self.logger.info("using cpu grid decomposition: %s" % str(np))

            grid = [
                numpy.linspace(0,
                               source.attrs['BoxSize'][0],
                               np[0] + 1,
                               endpoint=True),
                numpy.linspace(0,
                               source.attrs['BoxSize'][1],
                               np[1] + 1,
                               endpoint=True),
                numpy.linspace(0,
                               source.attrs['BoxSize'][2],
                               np[2] + 1,
                               endpoint=True),
            ]

            domain = GridND(grid, comm=comm)

        self.domain = domain
        self.source = source

        layout = domain.decompose(source[position].compute())

        self._size = layout.recvlength

        CatalogSource.__init__(self, comm=comm)
        self.attrs.update(source.attrs)

        self._frozen = {}
        if columns is None: columns = source.columns

        for column in columns:
            data = source[column].compute()
            self._frozen[column] = self.make_column(layout.exchange(data))
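
A hypothetical usage sketch for the constructor above. The class name
DecomposedCatalog is assumed from context, and `cat` stands for any
CatalogSource with a 'Position' column; after construction, each rank
holds its spatially-local particles in the frozen column cache.

decomposed = DecomposedCatalog(cat, position='Position')
pos_local = decomposed._frozen['Position']   # spatially-local particles
assert len(pos_local) == decomposed._size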
Example #3
def decompose_box_data(first, second, attrs, logger, smoothing):
    """
    Perform a domain decomposition on simulation box data, returning the
    domain-decomposed position and weight arrays for each object in the
    correlating pair.

    No load balancing is required since the particles are assumed to
    be in a box.

    The implementation follows:

    1. Decompose the first source such that the objects are spatially
       tight on a given rank.
    2. Decompose the second source, ensuring a given rank holds all
       particles within the desired maximum separation.

    Parameters
    ----------
    first : CatalogSource
        the first source we are correlating
    second : CatalogSource
        the second source we are correlating
    attrs : dict
        dict of parameters from the pair counting algorithm
    logger :
        the current active logger
    smoothing :
        the maximum Cartesian separation implied by the user's binning

    Returns
    -------
    (pos1, w1), (pos2, w2) : array_like
        the (decomposed) set of positions and weights to correlate
    """
    comm = first.comm

    # determine processor division for domain decomposition
    np = split_size_3d(comm.size)
    if comm.rank == 0:
        logger.info("using cpu grid decomposition: %s" %str(np))

    # get the (periodic-enforced) position for first
    pos1 = first['Position']
    if attrs['periodic']:
        pos1 %= attrs['BoxSize']
    pos1, w1 = first.compute(pos1, first[attrs['weight']])
    N1 = comm.allreduce(len(pos1))

    # get the (periodic-enforced) position for second
    if second is not None:
        pos2 = second['Position']
        if attrs['periodic']:
            pos2 %= attrs['BoxSize']
        pos2, w2 = second.compute(pos2, second[attrs['weight']])
        N2 = comm.allreduce(len(pos2))
    else:
        pos2 = pos1
        w2 = w1
        N2 = N1

    # domain decomposition
    grid = [
        numpy.linspace(0, attrs['BoxSize'][0], np[0] + 1, endpoint=True),
        numpy.linspace(0, attrs['BoxSize'][1], np[1] + 1, endpoint=True),
        numpy.linspace(0, attrs['BoxSize'][2], np[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm)

    # exchange first particles
    layout = domain.decompose(pos1, smoothing=0)
    pos1 = layout.exchange(pos1)
    w1 = layout.exchange(w1)

    # exchange second particles; if the maximum separation is a sizable
    # fraction of the box, give every rank a full copy of the secondaries
    if smoothing > attrs['BoxSize'].max() * 0.25:
        pos2 = numpy.concatenate(comm.allgather(pos2), axis=0)
        w2   = numpy.concatenate(comm.allgather(w2), axis=0)
    else:
        layout  = domain.decompose(pos2, smoothing=smoothing)
        pos2 = layout.exchange(pos2)
        w2   = layout.exchange(w2)

    # log the decomposition breakdown
    log_decomposition(comm, logger, N1, N2, pos1, pos2)

    return (pos1, w1), (pos2, w2)
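
The log_decomposition helper called above is not shown in this collection.
A guessed sketch, based only on how it is called (the real helper may log
different details):

def log_decomposition(comm, logger, N1, N2, pos1, pos2):
    # report the global totals and the per-rank load after the exchange
    load1 = comm.allgather(len(pos1))
    load2 = comm.allgather(len(pos2))
    if comm.rank == 0:
        logger.info("%d particles in the first source; max/min per rank = %d / %d"
                    % (N1, max(load1), min(load1)))
        logger.info("%d particles in the second source; max/min per rank = %d / %d"
                    % (N2, max(load2), min(load2)))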
Example #4
def decompose_survey_data(first, second, attrs, logger, smoothing, domain_factor=2,
                            angular=False, return_cartesian=False):
    """
    Perform a domain decomposition on survey data, returning the
    domain-decomposed position and weight arrays for each object in the
    correlating pair.

    The domain decomposition is based on the Cartesian coordinates of
    the input data (assumed to be in sky coordinates).

    Load balancing is required since the distribution in Cartesian space
    will likely not be uniform.

    The implementation follows:

    1. Decompose the first source and balance the particle load, such that
       the first source is evenly distributed across all ranks and the
       objects are spatially tight on a given rank.
    2. Decompose the second source, ensuring a given rank holds all
       particles within the desired maximum separation.

    Parameters
    ----------
    first : CatalogSource
        the first source we are correlating
    second : CatalogSource
        the second source we are correlating
    attrs : dict
        dict of parameters from the pair counting algorithm
    logger :
        the current active logger
    smoothing :
        the maximum Cartesian separation implied by the user's binning
    domain_factor : int, optional
        the factor by which we over-sample the mesh with cells in a given
        direction; higher values can lead to better performance
    angular : bool, optional
        if ``True``, the Cartesian positions used in the domain
        decomposition are on the unit sphere
    return_cartesian : bool, optional
        whether to return the positions as (ra, dec, z) or as the
        Cartesian (x, y, z)

    Returns
    -------
    (pos1, w1), (pos2, w2) : array_like
        the (decomposed) set of positions and weights to correlate
    """
    from nbodykit.transform import StackColumns
    comm = first.comm

    # either (ra,dec) or (ra,dec,redshift)
    poscols = [attrs['ra'], attrs['dec']]
    if not angular: poscols += [attrs['redshift']]

    # determine processor division for domain decomposition
    np = split_size_3d(comm.size)
    if comm.rank == 0:
        logger.info("using cpu grid decomposition: %s" %str(np))

    # stack position and compute
    pos1 = StackColumns(*[first[col] for col in poscols])
    pos1, w1 = first.compute(pos1, first[attrs['weight']])
    N1 = comm.allreduce(len(pos1))

    # only need cosmo if not angular
    cosmo = attrs.get('cosmo', None) if not angular else None
    if not angular and cosmo is None:
        raise ValueError("need a cosmology to decompose non-angular survey data")
    cpos1, cpos1_min, cpos1_max, rdist1 = get_cartesian(comm, pos1, cosmo=cosmo)

    # pass in comoving dist to Corrfunc instead of redshift
    if not angular:
        pos1[:,2] = rdist1

    # set up position for second too
    if second is not None:

        # stack position and compute for "second"
        pos2 = StackColumns(*[second[col] for col in poscols])
        pos2, w2 = second.compute(pos2, second[attrs['weight']])
        N2 = comm.allreduce(len(pos2))

        # get comoving dist and boxsize
        cpos2, cpos2_min, cpos2_max, rdist2 = get_cartesian(comm, pos2, cosmo=cosmo)

        # pass in comoving distance instead of redshift
        if not angular:
            pos2[:,2] = rdist2
    else:
        pos2 = pos1
        w2 = w1
        N2 = N1
        cpos2_min = cpos1_min
        cpos2_max = cpos1_max
        cpos2 = cpos1

    # determine global boxsize
    if second is None:
        cpos_min = cpos1_min
        cpos_max = cpos1_max
    else:
        cpos_min = numpy.min(numpy.vstack([cpos1_min, cpos2_min]), axis=0)
        cpos_max = numpy.max(numpy.vstack([cpos1_max, cpos2_max]), axis=0)

    boxsize = cpos_max - cpos_min

    if comm.rank == 0:
        logger.info("position variable range on rank 0 (max, min) = %s, %s" % (cpos_max, cpos_min))

    # initialize the domain
    # NOTE: over-decompose by domain_factor (default 2) to trigger load balancing
    grid = [
        numpy.linspace(cpos_min[0], cpos_max[0], domain_factor*np[0] + 1, endpoint=True),
        numpy.linspace(cpos_min[1], cpos_max[1], domain_factor*np[1] + 1, endpoint=True),
        numpy.linspace(cpos_min[2], cpos_max[2], domain_factor*np[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm, periodic=False)

    # balance the load
    domain.loadbalance(domain.load(cpos1))

    if comm.rank == 0:
        logger.info("Load balance done")

    # if we want to return cartesian, redefine pos
    if return_cartesian:
        pos1 = cpos1
        pos2 = cpos2

    # decompose based on cartesian positions
    layout = domain.decompose(cpos1, smoothing=0)
    pos1   = layout.exchange(pos1)
    w1     = layout.exchange(w1)

    # get the position/weight of the secondaries; if the separation is a
    # sizable fraction of the box, give every rank a full copy instead
    if smoothing > boxsize.max() * 0.25:
        pos2 = numpy.concatenate(comm.allgather(pos2), axis=0)
        w2   = numpy.concatenate(comm.allgather(w2), axis=0)
    else:
        layout  = domain.decompose(cpos2, smoothing=smoothing)
        pos2 = layout.exchange(pos2)
        w2   = layout.exchange(w2)

    # log the decomposition breakdown
    log_decomposition(comm, logger, N1, N2, pos1, pos2)

    return (pos1, w1), (pos2, w2)
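
decompose_survey_data relies on a get_cartesian helper that is not shown.
A sketch under stated assumptions: the input columns are (ra, dec[, redshift])
in degrees, the return signature matches the call sites above, and the
cosmology object exposes a comoving_distance method (as nbodykit's does).

import numpy

def get_cartesian(comm, pos, cosmo=None):
    # convert sky coordinates to Cartesian positions; points lie on the
    # unit sphere when no cosmology is given (the angular case)
    ra, dec = numpy.radians(pos[:, 0]), numpy.radians(pos[:, 1])
    cpos = numpy.column_stack([numpy.cos(dec) * numpy.cos(ra),
                               numpy.cos(dec) * numpy.sin(ra),
                               numpy.sin(dec)])
    rdist = None
    if cosmo is not None:
        rdist = cosmo.comoving_distance(pos[:, 2])  # comoving distance in Mpc/h
        cpos = cpos * rdist[:, None]
    # global bounding box across all ranks
    cpos_min = numpy.min(comm.allgather(cpos.min(axis=0)), axis=0)
    cpos_max = numpy.max(comm.allgather(cpos.max(axis=0)), axis=0)
    return cpos, cpos_min, cpos_max, rdist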
Example #5
def fof(datasource, linking_length, nmin, comm=MPI.COMM_WORLD, log_level=logging.DEBUG):
    """ Run Friend-of-friend halo finder.

        Friend-of-friend was first used by Davis et al 1985 to define
        halos in hierachical structure formation of cosmological simulations.
        The algorithm is also known as DBSCAN in computer science. 
        The subroutine here implements a parallel version of the FOF. 

        The underlying local FOF algorithm is from `kdcount.cluster`, 
        which is an adaptation of the implementation in Volker Springel's 
        Gadget and Martin White's PM. It could have been done faster.

        Parameters
        ----------
        datasource: DataSource
            the input datasource; must support the 'Position' column.
            datasource.BoxSize is used as well.
        linking_length: float
            linking length in data units. (Usually Mpc/h).
        nmin: int
            the minimal number of particles in a halo. Features with
            fewer than nmin particles are considered noise and are
            removed from the catalogue.

        comm: MPI.Comm
            The mpi communicator.

        Returns
        -------
        label: array_like
            The halo label of each position. A label of 0 stands for
            not being in any halo.
 
    """
    if log_level is not None: logger.setLevel(log_level)

    np = split_size_3d(comm.size)

    grid = [
        numpy.linspace(0, datasource.BoxSize[0], np[0] + 1, endpoint=True),
        numpy.linspace(0, datasource.BoxSize[1], np[1] + 1, endpoint=True),
        numpy.linspace(0, datasource.BoxSize[2], np[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm)

    with datasource.open() as stream:
        [[Position]] = stream.read(['Position'], full=True)

    if comm.rank == 0: logger.info("ll %g. " % linking_length)
    if comm.rank == 0: logger.debug('grid: %s' % str(grid))

    layout = domain.decompose(Position, smoothing=linking_length * 1)

    comm.barrier()
    if comm.rank == 0: logger.info("Starting local fof.")

    minid = local_fof(layout, Position, datasource.BoxSize, linking_length, comm)
    
    comm.barrier()
    if comm.rank == 0: logger.info("Finished local fof.")

    if comm.rank == 0: logger.info("Merged global FOF.")

    minid = fof_merge(layout, minid, comm)
    del layout
    # sort and compute the final halo catalogue labels
    label = fof_halo_label(minid, comm, thresh=nmin)

    return label
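
Given the labels returned by fof() above, per-halo particle counts follow
directly from numpy.bincount; a tiny self-contained illustration with toy
labels (label 0 means "not in any halo" and is excluded):

import numpy

label = numpy.array([0, 1, 1, 2, 0, 1])   # toy labels for six particles
sizes = numpy.bincount(label)[1:]          # particles per halo -> [3, 1]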
Example #6
def fof(source, linking_length, comm, periodic):
    """
    Run Friends-of-friends halo finder.

    Friends-of-friends was first used by Davis et al 1985 to define
    halos in hierarchical structure formation of cosmological simulations.
    The algorithm is also known as DBSCAN in computer science.
    The subroutine here implements a parallel version of the FOF.

    The underlying local FOF algorithm is from `kdcount.cluster`,
    which is an adaptation of the implementation in Volker Springel's
    Gadget and Martin White's PM. It could have been done faster.

    Parameters
    ----------
    source: CatalogSource
        the input source of particles; must support 'Position' column;
        ``source.attrs['BoxSize']`` is also used
    linking_length: float
        linking length in data units. (Usually Mpc/h).
    comm: MPI.Comm
        The mpi communicator.
    periodic: bool
        whether the box is periodic; if True, ``source.attrs['BoxSize']``
        is required

    Returns
    -------
    minid: array_like
        A unique label for each position; the labels are not
        contiguous starting from 0.
    """
    from pmesh.domain import GridND

    np = split_size_3d(comm.size)

    if periodic:
        BoxSize = source.attrs.get('BoxSize', None)
        if BoxSize is None:
            raise ValueError("cannot compute FOF clustering of source without 'BoxSize' in ``attrs`` dict")
        if numpy.isscalar(BoxSize):
            BoxSize = [BoxSize, BoxSize, BoxSize]

        left = [0, 0, 0]
        right = BoxSize
    else:
        BoxSize = None
        left = numpy.min(comm.allgather(source['Position'].min(axis=0).compute()), axis=0)
        right = numpy.max(comm.allgather(source['Position'].max(axis=0).compute()), axis=0)

    grid = [
        numpy.linspace(left[0], right[0], np[0] + 1, endpoint=True),
        numpy.linspace(left[1], right[1], np[1] + 1, endpoint=True),
        numpy.linspace(left[2], right[2], np[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm, periodic=periodic)

    Position = source.compute(source['Position'])
    layout = domain.decompose(Position, smoothing=linking_length * 1)

    comm.barrier()
    minid = _fof_local(layout, Position, BoxSize, linking_length, comm)

    comm.barrier()
    minid = _fof_merge(layout, minid, comm)

    return minid
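
The minid values returned above are unique per group but not contiguous.
On a single rank, relabeling them to 0..Nhalo-1 is a one-liner; a serial
sketch (the parallel pipeline performs an equivalent collective step and
additionally applies a minimum-size threshold):

import numpy

def compact_labels(minid):
    # map arbitrary minid values to contiguous labels 0..Nhalo-1
    _, labels = numpy.unique(minid, return_inverse=True)
    return labels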
Example #7
    def run(self):
        """
        Compute the cylindrical groups, saving the results to the
        :attr:`groups` attribute

        Attributes
        ----------
        groups : :class:`~nbodykit.source.catalog.array.ArrayCatalog`
            a catalog holding the result of the grouping. The catalog has
            the same length as the input source, i.e., its length equals
            the :attr:`size` attribute. The relevant fields are:

            #. cgm_type :
                a flag specifying the type for each object,
                with 0 specifying CGM central and 1 denoting CGM satellite
            #. cgm_haloid :
                The index of the CGM object this object belongs to; an integer
                between 0 and the total number of CGM halos
            #. num_cgm_sats :
                The number of satellites in the CGM halo
        """
        from pmesh.domain import GridND
        from nbodykit.algorithms.fof import split_size_3d

        comm = self.comm
        rperp, rpar = self.attrs['rperp'], self.attrs['rpar']
        rankby = self.attrs['rankby']

        if self.attrs['periodic']:
            boxsize = self.attrs['BoxSize']
        else:
            boxsize = None

        np = split_size_3d(self.comm.size)
        if self.comm.rank == 0:
            self.logger.info("using cpu grid decomposition: %s" % str(np))

        # add a column for original index
        self.source['origind'] = self.source.Index

        # sort the data
        data = self.source.sort(self.attrs['rankby'],
                                usecols=['Position', 'origind'])

        # add a column to track sorted index
        data['sortindex'] = data.Index

        # global min/max across all ranks
        pos = data.compute(data['Position'])
        posmin = numpy.asarray(comm.allgather(pos.min(axis=0))).min(axis=0)
        posmax = numpy.asarray(comm.allgather(pos.max(axis=0))).max(axis=0)

        # domain decomposition
        grid = [
            numpy.linspace(posmin[0], posmax[0], np[0] + 1, endpoint=True),
            numpy.linspace(posmin[1], posmax[1], np[1] + 1, endpoint=True),
            numpy.linspace(posmin[2], posmax[2], np[2] + 1, endpoint=True),
        ]
        domain = GridND(grid, comm=comm)

        # run the CGM algorithm
        groups = cgm(comm, data, domain, rperp, rpar,
                     self.attrs['flat_sky_los'], boxsize)

        # make the final structured array
        self.groups = ArrayCatalog(groups, comm=self.comm, **self.attrs)

        # log some info
        N_cen = (groups['cgm_type'] == 0).sum()
        isolated_N_cen = ((groups['cgm_type'] == 0) &
                          (groups['num_cgm_sats'] == 0)).sum()
        N_cen = self.comm.allreduce(N_cen)
        isolated_N_cen = self.comm.allreduce(isolated_N_cen)
        if self.comm.rank == 0:
            self.logger.info("found %d CGM centrals total" % N_cen)
            self.logger.info("%d/%d are isolated centrals (no satellites)" %
                             (isolated_N_cen, N_cen))

        # delete the column we added to source
        del self.source['origind']
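
A small helper mirroring the central/satellite counting at the end of
run() above; a sketch that assumes `groups` supports boolean masking of
its fields, as the structured array returned by cgm does here.

def summarize_groups(groups, comm):
    # count CGM centrals and isolated centrals across all ranks
    is_cen = groups['cgm_type'] == 0
    N_cen = comm.allreduce(int(is_cen.sum()))
    isolated = comm.allreduce(int((is_cen & (groups['num_cgm_sats'] == 0)).sum()))
    return N_cen, isolated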
Example #8
def compute_brutal_corr(datasources,
                        redges,
                        Nmu=0,
                        comm=None,
                        subsample=1,
                        los='z',
                        poles=[]):
    r"""
    Compute the correlation function by direct pair summation, either as a function
    of separation (`R`) or as a function of separation and line-of-sight angle (`R`, `mu`)
    
    The estimator used to compute the correlation function is:
    
    .. math:: 
        
        \xi(r, \mu) = DD(r, \mu) / RR(r, \mu) - 1.
    
    where `DD` is the number of data-data pairs, and `RR` is the number of random-random pairs,
    which is determined solely by the binning used, assuming a constant number density
    
    Parameters
    ----------
    datasources : list of DataSource objects
        the list of data instances from which the 3D correlation will be computed
    redges : array_like
        the bin edges for the `R` variable
    Nmu : int, optional
        the number of desired `mu` bins, where `mu` is the cosine 
        of the angle from the line-of-sight. Default is `0`, in 
        which case the correlation function is binned as a function of `R` only
    comm : MPI.Communicator, optional
        the communicator to pass to the ``ParticleMesh`` object. If not
        provided, ``MPI.COMM_WORLD`` is used
    subsample : int, optional
        downsample the input datasources by choosing 1 out of every `N` points. 
        Default is `1` (no subsampling).
    los : str, {'x', 'y', 'z'}, optional
        the dimension to treat as the line-of-sight; default is 'z'.
    poles : list of int, optional
        integers specifying the multipoles to compute from the 2D correlation function
        
    Returns
    -------
    pc : :class:`kdcount.correlate.paircount`
        the pair counting instance 
    xi : array_like
        the correlation function result; if `poles` supplied, the shape is 
        `(len(redges)-1, len(poles))`, otherwise, the shape is either `(len(redges)-1, )`
        or `(len(redges)-1, Nmu)`
    RR : array_like
        the number of random-random pairs (used as normalization of the data-data pairs)
    """
    from pmesh.domain import GridND
    from kdcount import correlate

    # some setup
    if los not in "xyz": raise ValueError("`los` must be `x`, `y`, or `z`")
    los = "xyz".index(los)
    poles = numpy.array(poles)
    Rmax = redges[-1]
    if comm is None: comm = MPI.COMM_WORLD

    # determine processor division for domain decomposition
    for Nx in range(int(comm.size**0.3333) + 1, 0, -1):
        if comm.size % Nx == 0: break
    else:
        Nx = 1
    for Ny in range(int(comm.size**0.5) + 1, 0, -1):
        if (comm.size // Nx) % Ny == 0: break
    else:
        Ny = 1
    Nz = comm.size // Nx // Ny
    Nproc = [Nx, Ny, Nz]

    # log some info
    if comm.rank == 0:
        logger.info('Nproc = %s' % str(Nproc))
        logger.info('Rmax = %g' % Rmax)

    # domain decomposition
    grid = [
        numpy.linspace(0,
                       datasources[0].BoxSize[i],
                       Nproc[i] + 1,
                       endpoint=True) for i in range(3)
    ]
    domain = GridND(grid, comm=comm)

    # read position for field #1
    with datasources[0].open() as stream:
        [[pos1]] = stream.read(['Position'], full=True)
    pos1 = pos1[comm.rank * subsample // comm.size::subsample]
    N1 = comm.allreduce(len(pos1))

    # read position for field #2
    if len(datasources) > 1:
        with datasources[1].open() as stream:
            [[pos2]] = stream.read(['Position'], full=True)
        pos2 = pos2[comm.rank * subsample // comm.size::subsample]
        N2 = comm.allreduce(len(pos2))
    else:
        pos2 = pos1
        N2 = N1

    # exchange field #1 positions
    layout = domain.decompose(pos1, smoothing=0)
    pos1 = layout.exchange(pos1)
    if comm.rank == 0: logger.info('exchange pos1')

    # exchange field #2 positions
    if Rmax > datasources[0].BoxSize[0] * 0.25:
        pos2 = numpy.concatenate(comm.allgather(pos2), axis=0)
    else:
        layout = domain.decompose(pos2, smoothing=Rmax)
        pos2 = layout.exchange(pos2)
    if comm.rank == 0: logger.info('exchange pos2')

    # initialize the trees to hold the field points
    tree1 = correlate.points(pos1, boxsize=datasources[0].BoxSize)
    tree2 = correlate.points(pos2, boxsize=datasources[0].BoxSize)

    # log the sizes of the trees
    logger.info('rank %d correlating %d x %d' %
                (comm.rank, len(tree1), len(tree2)))
    if comm.rank == 0: logger.info('all correlating %d x %d' % (N1, N2))

    # use multipole binning
    if len(poles):
        bins = correlate.FlatSkyMultipoleBinning(redges,
                                                 poles,
                                                 los,
                                                 compute_mean_coords=True)
    # use (R, mu) binning
    elif Nmu > 0:
        bins = correlate.FlatSkyBinning(redges,
                                        Nmu,
                                        los,
                                        compute_mean_coords=True)
    # use R binning
    else:
        bins = correlate.RBinning(redges, compute_mean_coords=True)

    # do the pair counting; usefast=False is required to get the mean
    # bin centers, otherwise an exception is thrown
    pc = correlate.paircount(tree2, tree1, bins, np=0, usefast=False)
    pc.sum1[:] = comm.allreduce(pc.sum1)

    # get the mean bin values, reducing from all ranks
    pc.pair_counts[:] = comm.allreduce(pc.pair_counts)
    with numpy.errstate(invalid='ignore'):
        if bins.Ndim > 1:
            for i in range(bins.Ndim):
                pc.mean_centers[i][:] = comm.allreduce(
                    pc.mean_centers_sum[i]) / pc.pair_counts
        else:
            pc.mean_centers[:] = comm.allreduce(
                pc.mean_centers_sum[0]) / pc.pair_counts

    # compute the random pairs from the fractional volume
    RR = 1. * N1 * N2 / datasources[0].BoxSize.prod()
    if Nmu > 0:
        dr3 = numpy.diff(pc.edges[0]**3)
        dmu = numpy.diff(pc.edges[1])
        RR *= 2. / 3. * numpy.pi * dr3[:, None] * dmu[None, :]
    else:
        RR *= 4. / 3. * numpy.pi * numpy.diff(pc.edges**3)

    # return the correlation and the pair count object
    xi = (1. * pc.sum1 / RR) - 1.0
    if len(poles):
        xi = xi.T  # makes ell the second axis
        xi[:, poles != 0] += 1.0  # only monopole gets the minus one

    return pc, xi, RR
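
The analytic RR normalization above assumes uniform randoms: the expected
number of random-random pairs in a shell [r0, r1] of a box with volume V is
N1*N2/V * 4/3 * pi * (r1^3 - r0^3). A quick standalone check with made-up
numbers:

import numpy

N1 = N2 = 1000                         # made-up particle counts
V = 100.0 ** 3                         # made-up box volume
redges = numpy.array([10., 20., 30.])  # two R bins
RR = 1. * N1 * N2 / V * 4. / 3. * numpy.pi * numpy.diff(redges ** 3)
# RR[i] is the expected random-random count in bin i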
Example #9
def decompose_box_data(first, second, attrs, logger, smoothing):
    """
    Perform a domain decomposition on simulation box data, returning the
    domain-decomposed position and weight arrays for each object in the
    correlating pair.

    No load balancing is required since the particles are assumed to
    be in a box.

    The implementation follows:

    1. Decompose the first source such that the objects are spatially
       tight on a given rank.
    2. Decompose the second source, ensuring a given rank holds all
       particles within the desired maximum separation.

    Parameters
    ----------
    first : CatalogSource
        the first source we are correlating
    second : CatalogSource
        the second source we are correlating
    attrs : dict
        dict of parameters from the pair counting algorithm
    logger :
        the current active logger
    smoothing :
        the maximum Cartesian separation implied by the user's binning

    Returns
    -------
    (pos1, w1), (pos2, w2) : array_like
        the (decomposed) set of positions and weights to correlate
    """
    comm = first.comm

    # determine processor division for domain decomposition
    np = split_size_3d(comm.size)
    if comm.rank == 0:
        logger.info("using cpu grid decomposition: %s" %str(np))

    # get the (periodic-enforced) position for first
    pos1 = first[attrs['position']]
    if attrs['periodic']:
        pos1 %= attrs['BoxSize']
    pos1, w1 = first.compute(pos1, first[attrs['weight']])
    N1 = comm.allreduce(len(pos1))

    # get the (periodic-enforced) position for second
    if second is not None:
        pos2 = second[attrs['position']]
        if attrs['periodic']:
            pos2 %= attrs['BoxSize']
        pos2, w2 = second.compute(pos2, second[attrs['weight']])
        N2 = comm.allreduce(len(pos2))
    else:
        pos2 = pos1
        w2 = w1
        N2 = N1

    # domain decomposition
    grid = [
        numpy.linspace(0, attrs['BoxSize'][0], np[0] + 1, endpoint=True),
        numpy.linspace(0, attrs['BoxSize'][1], np[1] + 1, endpoint=True),
        numpy.linspace(0, attrs['BoxSize'][2], np[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm)

    # exchange first particles
    layout = domain.decompose(pos1, smoothing=0)
    pos1 = layout.exchange(pos1)
    w1 = layout.exchange(w1)

    # exchange second particles
    if smoothing > attrs['BoxSize'].max() * 0.25:
        pos2 = numpy.concatenate(comm.allgather(pos2), axis=0)
        w2   = numpy.concatenate(comm.allgather(w2), axis=0)
    else:
        layout  = domain.decompose(pos2, smoothing=smoothing)
        pos2 = layout.exchange(pos2)
        w2   = layout.exchange(w2)

    # log the decomposition breakdown
    log_decomposition(comm, logger, N1, N2, pos1, pos2)

    return (pos1, w1), (pos2, w2)
Example #10
def decompose_survey_data(first, second, attrs, logger, smoothing, domain_factor=2,
                            angular=False, return_cartesian=False):
    """
    Perform a domain decomposition on survey data, returning the
    domain-decomposed position and weight arrays for each object in the
    correlating pair.

    The domain decomposition is based on the Cartesian coordinates of
    the input data (assumed to be in sky coordinates).

    Load balancing is required since the distribution in Cartesian space
    will likely not be uniform.

    The implementation follows:

    1. Decompose the first source and balance the particle load, such that
       the first source is evenly distributed across all ranks and the
       objects are spatially tight on a given rank.
    2. Decompose the second source, ensuring a given rank holds all
       particles within the desired maximum separation.

    Parameters
    ----------
    first : CatalogSource
        the first source we are correlating
    second : CatalogSource
        the second source we are correlating
    attrs : dict
        dict of parameters from the pair counting algorithm
    logger :
        the current active logger
    smoothing :
        the maximum Cartesian separation implied by the user's binning
    domain_factor : int, optional
        the factor by which we over-sample the mesh with cells in a given
        direction; higher values can lead to better performance
    angular : bool, optional
        if ``True``, the Cartesian positions used in the domain
        decomposition are on the unit sphere
    return_cartesian : bool, optional
        whether to return the positions as (ra, dec, z) or as the
        Cartesian (x, y, z)

    Returns
    -------
    (pos1, w1), (pos2, w2) : array_like
        the (decomposed) set of positions and weights to correlate
    """
    from nbodykit.transform import StackColumns
    comm = first.comm

    # either (ra,dec) or (ra,dec,redshift)
    poscols = [attrs['ra'], attrs['dec']]
    if not angular: poscols += [attrs['redshift']]

    # determine processor division for domain decomposition
    np = split_size_3d(comm.size)
    if comm.rank == 0:
        logger.info("using cpu grid decomposition: %s" %str(np))

    # stack position and compute
    pos1 = StackColumns(*[first[col] for col in poscols])
    pos1, w1 = first.compute(pos1, first[attrs['weight']])
    N1 = comm.allreduce(len(pos1))

    # only need cosmo if not angular
    cosmo = attrs.get('cosmo', None) if not angular else None
    if not angular and cosmo is None:
        raise ValueError("need a cosmology to decompose non-angular survey data")
    cpos1, cpos1_min, cpos1_max, rdist1 = get_cartesian(comm, pos1, cosmo=cosmo)

    # pass in comoving dist to Corrfunc instead of redshift
    if not angular:
        pos1 = pos1.copy() # we need to overwrite it; dask doesn't always return a copy after 0.18.1
        pos1[:,2] = rdist1

    # set up position for second too
    if second is not None:

        # stack position and compute for "second"
        pos2 = StackColumns(*[second[col] for col in poscols])
        pos2, w2 = second.compute(pos2, second[attrs['weight']])
        N2 = comm.allreduce(len(pos2))

        # get comoving dist and boxsize
        cpos2, cpos2_min, cpos2_max, rdist2 = get_cartesian(comm, pos2, cosmo=cosmo)

        # pass in comoving distance instead of redshift
        if not angular:
            pos2 = pos2.copy() # we need to overwrite it; dask doesn't always return a copy after 0.18.1
            pos2[:,2] = rdist2
    else:
        pos2 = pos1
        w2 = w1
        N2 = N1
        cpos2_min = cpos1_min
        cpos2_max = cpos1_max
        cpos2 = cpos1

    # determine global boxsize
    if second is None:
        cpos_min = cpos1_min
        cpos_max = cpos1_max
    else:
        cpos_min = numpy.min(numpy.vstack([cpos1_min, cpos2_min]), axis=0)
        cpos_max = numpy.max(numpy.vstack([cpos1_max, cpos2_max]), axis=0)

    boxsize = cpos_max - cpos_min

    if comm.rank == 0:
        logger.info("position variable range on rank 0 (max, min) = %s, %s" % (cpos_max, cpos_min))

    # initialize the domain
    # NOTE: over-decompose by domain_factor (default 2) to trigger load balancing
    grid = [
        numpy.linspace(cpos_min[0], cpos_max[0], domain_factor*np[0] + 1, endpoint=True),
        numpy.linspace(cpos_min[1], cpos_max[1], domain_factor*np[1] + 1, endpoint=True),
        numpy.linspace(cpos_min[2], cpos_max[2], domain_factor*np[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm, periodic=False)

    # balance the load
    domain.loadbalance(domain.load(cpos1))

    if comm.rank == 0:
        logger.info("Load balance done")

    # if we want to return cartesian, redefine pos
    if return_cartesian:
        pos1 = cpos1
        pos2 = cpos2

    # decompose based on cartesian positions
    layout = domain.decompose(cpos1, smoothing=0)
    pos1   = layout.exchange(pos1)
    w1     = layout.exchange(w1)

    # get the position/weight of the secondaries
    if smoothing > boxsize.max() * 0.25:
        pos2 = numpy.concatenate(comm.allgather(pos2), axis=0)
        w2   = numpy.concatenate(comm.allgather(w2), axis=0)
    else:
        layout  = domain.decompose(cpos2, smoothing=smoothing)
        pos2 = layout.exchange(pos2)
        w2   = layout.exchange(w2)

    # log the decomposition breakdown
    log_decomposition(comm, logger, N1, N2, pos1, pos2)

    return (pos1, w1), (pos2, w2)
Example #11
File: fof.py Project: bccp/nbodykit
def fof(source, linking_length, comm, periodic, domain_factor, logger):
    """
    Run Friends-of-friends halo finder.

    Friends-of-friends was first used by Davis et al 1985 to define
    halos in hierarchical structure formation of cosmological simulations.
    The algorithm is also known as DBSCAN in computer science.
    The subroutine here implements a parallel version of the FOF.

    The underlying local FOF algorithm is from `kdcount.cluster`,
    which is an adaptation of the implementation in Volker Springel's
    Gadget and Martin White's PM. It could have been done faster.

    Parameters
    ----------
    source: CatalogSource
        the input source of particles; must support 'Position' column;
        ``source.attrs['BoxSize']`` is also used
    linking_length: float
        linking length in data units. (Usually Mpc/h).
    comm: MPI.Comm
        The mpi communicator.
    periodic: bool
        whether the box is periodic; if True, ``source.attrs['BoxSize']``
        is required
    domain_factor: int
        the factor by which to over-sample the domain grid in each
        direction, giving the load balancing step finer cells to work with
    logger:
        the current active logger

    Returns
    -------
    minid: array_like
        A unique label for each position; the labels are not
        contiguous starting from 0.
    """
    from pmesh.domain import GridND

    np = split_size_3d(comm.size)
    nd = np * domain_factor

    if periodic:
        BoxSize = source.attrs.get('BoxSize', None)
        if BoxSize is None:
            raise ValueError("cannot compute FOF clustering of source without 'BoxSize' in ``attrs`` dict")
        if numpy.isscalar(BoxSize):
            BoxSize = [BoxSize, BoxSize, BoxSize]

        left = [0, 0, 0]
        right = BoxSize
    else:
        BoxSize = None
        left = numpy.min(comm.allgather(source['Position'].min(axis=0).compute()), axis=0)
        right = numpy.max(comm.allgather(source['Position'].max(axis=0).compute()), axis=0)

    grid = [
        numpy.linspace(left[0], right[0], nd[0] + 1, endpoint=True),
        numpy.linspace(left[1], right[1], nd[1] + 1, endpoint=True),
        numpy.linspace(left[2], right[2], nd[2] + 1, endpoint=True),
    ]
    domain = GridND(grid, comm=comm, periodic=periodic)

    Position = source.compute(source['Position'])
    np = comm.allgather(len(Position))
    if comm.rank == 0:
        logger.info("Number of particles max/min = %d / %d before spatial decomposition" % (max(np), min(np)))

    # balance the load
    domain.loadbalance(domain.load(Position))

    layout = domain.decompose(Position, smoothing=linking_length * 1)

    np = comm.allgather(layout.newlength)
    if comm.rank == 0:
        logger.info("Number of particles max/min = %d / %d after spatial decomposition" % (max(np), min(np)))

    comm.barrier()
    minid = _fof_local(layout, Position, BoxSize, linking_length, comm)

    comm.barrier()
    minid = _fof_merge(layout, minid, comm)

    return minid
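
This variant adds load balancing: over-sampling the grid by domain_factor
gives the load balancer finer cells to shuffle between ranks. A hypothetical
call, where `cat` is assumed to be a CatalogSource with a 'Position' column
and attrs['BoxSize'] set:

import logging

minid = fof(cat, linking_length=0.2, comm=cat.comm, periodic=True,
            domain_factor=2, logger=logging.getLogger("FOF"))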