Ejemplo n.º 1
0
    def __init__(self, filetype, args=(), kwargs=None, comm=None):
        """
        Build a catalog backed by one or more files on disk.

        Parameters
        ----------
        filetype : subclass of :class:`nbodykit.io.base.FileType`
            the file-like class used to load the data from file
        args : tuple, optional
            positional arguments passed to ``filetype`` when opening files
        kwargs : dict, optional
            keyword arguments passed to ``filetype`` when opening files
        comm : MPI Communicator, optional
            the MPI communicator instance
        """
        # avoid a mutable default argument: create a fresh dict per call
        # instead of sharing a single dict across every instantiation
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # open the FileStack on the root rank only and broadcast it, so the
        # files on disk are inspected exactly once
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # evenly partition the global size across ranks; start with full file.
        lstart = self.comm.rank * self._source.size // self.comm.size
        lend = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = lend - lstart

        self.start = 0
        self.end = self._source.size

        self._lstart = lstart  # offset in the file for this rank
        self._lend = lend  # offset in the file for this rank

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s %s" %
                             (str(args), str(kwargs)))

        CatalogSource.__init__(self, comm=comm)
Ejemplo n.º 2
0
def test_single_path(comm):

    with TemporaryDirectory() as tmpdir:

        # fabricate TPM-format particle data: header, positions, velocities, ids
        npart = 2048
        positions = numpy.random.random(size=(npart, 3)).astype('f4')
        velocities = numpy.random.random(size=(npart, 3)).astype('f4')
        ids = numpy.arange(npart, dtype='u8')
        header = numpy.ones(28, dtype='?')

        # split the particles evenly across two binary files
        for fnum, basename in enumerate(['tpm.000', 'tpm.001']):
            chunk = slice(fnum * 1024, (fnum + 1) * 1024)
            target = os.path.join(tmpdir, basename)
            with open(target, 'wb') as stream:
                for block in (header, positions[chunk],
                              velocities[chunk], ids[chunk]):
                    block.tofile(stream)

        # a stack built from a single explicit path sees exactly one file
        f = FileStack(TPMBinaryFile,
                      os.path.join(tmpdir, 'tpm.000'),
                      precision='f4')
        assert f.size == 1024
        assert f.nfiles == 1
Ejemplo n.º 3
0
    def __init__(self, filetype, args=(), kwargs=None, comm=None):
        """
        Build a catalog backed by one or more files on disk.

        Parameters
        ----------
        filetype : subclass of :class:`nbodykit.io.base.FileType`
            the file-like class used to load the data from file
        args : tuple, optional
            positional arguments passed to ``filetype`` when opening files
        kwargs : dict, optional
            keyword arguments passed to ``filetype`` when opening files
        comm : MPI Communicator, optional
            the MPI communicator instance
        """
        # avoid a mutable default argument: create a fresh dict per call
        # instead of sharing a single dict across every instantiation
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # open the FileStack on the root rank only and broadcast it, so the
        # files on disk are inspected exactly once
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # evenly partition the global size across ranks; start with full file.
        lstart = self.comm.rank * self._source.size // self.comm.size
        lend = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = lend - lstart

        self.start = 0
        self.end = self._source.size

        self._lstart = lstart # offset in the file for this rank
        self._lend = lend     # offset in the file for this rank

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s %s" % (str(args), str(kwargs)))

        CatalogSource.__init__(self, comm=comm)
Ejemplo n.º 4
0
def test_data(comm):

    with TemporaryDirectory() as tmpdir:

        # fabricate TPM-format particle data
        npart = 2048
        pos = numpy.random.random(size=(npart, 3)).astype('f4')
        vel = numpy.random.random(size=(npart, 3)).astype('f4')
        uid = numpy.arange(npart, dtype='u8')
        hdr = numpy.ones(28, dtype='?')

        # write half of the particles to each of two binary files
        for fnum, basename in enumerate(['tpm.000', 'tpm.001']):
            part = slice(1024 * fnum, 1024 * (fnum + 1))
            with open(os.path.join(tmpdir, basename), 'wb') as stream:
                for arr in (hdr, pos[part], vel[part], uid[part]):
                    arr.tofile(stream)

        # initialize the stack from a glob pattern matching both files
        f = FileStack(TPMBinaryFile,
                      os.path.join(tmpdir, 'tpm.00*'),
                      precision='f4')

        # the stack spans both files
        assert f.size == 2048
        assert f.ndim == 1

        # the concatenated columns round-trip exactly
        numpy.testing.assert_almost_equal(pos, f['Position'][:])
        numpy.testing.assert_almost_equal(vel, f['Velocity'][:])
        numpy.testing.assert_almost_equal(uid, f['ID'][:])

        # an explicit list of paths works as well
        paths = [os.path.join(tmpdir, fn) for fn in ['tpm.000', 'tpm.001']]
        f = FileStack(TPMBinaryFile, paths, precision='f4')

        # check size and file count
        assert f.size == 2048
        assert f.nfiles == 2

        # attrs accepts new entries like a plain dict
        f.attrs['size'] = 2048
Ejemplo n.º 5
0
def test_data(comm):

    with TemporaryDirectory() as tmpdir:

        # fabricate TPM-format particle data
        position = numpy.random.random(size=(2048, 3)).astype('f4')
        velocity = numpy.random.random(size=(2048, 3)).astype('f4')
        ident = numpy.arange(2048, dtype='u8')
        header = numpy.ones(28, dtype='?')

        # each file receives a contiguous half of the particles
        for idx, basename in enumerate(['tpm.000', 'tpm.001']):
            lo, hi = idx * 1024, (idx + 1) * 1024
            target = os.path.join(tmpdir, basename)
            with open(target, 'wb') as out:
                header.tofile(out)
                position[lo:hi].tofile(out)
                velocity[lo:hi].tofile(out)
                ident[lo:hi].tofile(out)

        # initialize the stack from a glob pattern
        stack = FileStack(TPMBinaryFile,
                          os.path.join(tmpdir, 'tpm.00*'),
                          precision='f4')

        # the stack spans both files
        assert stack.size == 2048

        # the concatenated columns round-trip exactly
        numpy.testing.assert_almost_equal(position, stack['Position'][:])
        numpy.testing.assert_almost_equal(velocity, stack['Velocity'][:])
        numpy.testing.assert_almost_equal(ident, stack['ID'][:])

        # an explicit list of paths works as well
        paths = [os.path.join(tmpdir, fn) for fn in ['tpm.000', 'tpm.001']]
        stack = FileStack(TPMBinaryFile, paths, precision='f4')

        # check size and file count
        assert stack.size == 2048
        assert stack.nfiles == 2

        # attrs accepts new entries like a plain dict
        stack.attrs['size'] = 2048
Ejemplo n.º 6
0
    def __init__(self, filetype, args=(), kwargs=None, comm=None):
        """
        Build a catalog backed by one or more files on disk.

        Parameters
        ----------
        filetype : subclass of :class:`nbodykit.io.base.FileType`
            the file-like class used to load the data from file
        args : tuple, optional
            positional arguments passed to ``filetype`` when opening files
        kwargs : dict, optional
            keyword arguments passed to ``filetype`` when opening files
        comm : MPI Communicator, optional
            the MPI communicator instance
        """
        # avoid a mutable default argument: create a fresh dict per call
        # instead of sharing a single dict across every instantiation
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # open the FileStack on the root rank only and broadcast it, so the
        # files on disk are inspected exactly once
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # evenly partition the global size across the ranks
        start = self.comm.rank * self._source.size // self.comm.size
        end = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = end - start

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s" % str(args))

        CatalogSource.__init__(self, comm=comm)
Ejemplo n.º 7
0
def test_bad_path(comm):

    with TemporaryDirectory() as tmpdir:

        # fabricate TPM-format particle data
        pos = numpy.random.random(size=(2048, 3)).astype('f4')
        vel = numpy.random.random(size=(2048, 3)).astype('f4')
        uid = numpy.arange(2048, dtype='u8')
        hdr = numpy.ones(28, dtype='?')

        # each file receives a contiguous half of the particles
        for fnum, basename in enumerate(['tpm.000', 'tpm.001']):
            part = slice(1024 * fnum, 1024 * (fnum + 1))
            fname = os.path.join(tmpdir, basename)
            with open(fname, 'wb') as ff:
                hdr.tofile(ff)
                pos[part].tofile(ff)
                vel[part].tofile(ff)
                uid[part].tofile(ff)

        # passing something that is not a valid path name (here, the closed
        # file object left over from the loop) must raise ValueError
        with pytest.raises(ValueError):
            f = FileStack(TPMBinaryFile, ff, precision='f4')
Ejemplo n.º 8
0
class FileCatalogBase(CatalogSource):
    """
    Base class to create a source of particles from a
    single file, or multiple files, on disk.

    Files of a specific type should be subclasses of this class.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    args : tuple, optional
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    kwargs : dict, optional
        the keyword arguments to pass to the ``filetype`` class when
        constructing each file object
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    """
    @CurrentMPIComm.enable
    def __init__(self, filetype, args=(), kwargs=None, comm=None):

        # avoid a mutable default argument: create a fresh dict per call
        # instead of sharing a single dict across every instantiation
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # open the FileStack on the root rank only and broadcast it, so the
        # files on disk are inspected exactly once
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # evenly partition the global size across the ranks
        start = self.comm.rank * self._source.size // self.comm.size
        end = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = end - start

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s" % str(args))

        CatalogSource.__init__(self, comm=comm)

    def __repr__(self):
        # show the basename for a single file, the file count otherwise
        path = self._source.path
        name = self.__class__.__name__
        if isinstance(path, string_types):
            args = (name, self.size, os.path.basename(path))
            return "%s(size=%d, file='%s')" % args
        else:
            args = (name, self.size, self._source.nfiles)
            return "%s(size=%d, nfiles=%d)" % args

    @property
    def hardcolumns(self):
        """
        The union of the columns in the file and any transformed columns.
        """
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._source.dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying file source.

        Columns are returned as dask arrays.
        """
        if col in self._source.dtype.names:
            # slice out only this rank's evenly-partitioned portion
            start = self.comm.rank * self._source.size // self.comm.size
            end = (self.comm.rank + 1) * self._source.size // self.comm.size
            return self._source.get_dask(col)[start:end]
        else:
            return CatalogSource.get_hardcolumn(self, col)
Ejemplo n.º 9
0
class FileCatalogBase(CatalogSource):
    """
    Base class to create a source of particles from a
    single file, or multiple files, on disk.

    Files of a specific type should be subclasses of this class.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    args : tuple, optional
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    kwargs : dict, optional
        the keyword arguments to pass to the ``filetype`` class when
        constructing each file object
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    """
    @CurrentMPIComm.enable
    def __init__(self, filetype, args=(), kwargs=None, comm=None):

        # avoid a mutable default argument: create a fresh dict per call
        # instead of sharing a single dict across every instantiation
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # open the FileStack on the root rank only and broadcast it, so the
        # files on disk are inspected exactly once
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # compute the size; start with full file.
        lstart = self.comm.rank * self._source.size // self.comm.size
        lend = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = lend - lstart

        self.start = 0
        self.end = self._source.size

        self._lstart = lstart # offset in the file for this rank
        self._lend = lend     # offset in the file for this rank

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s %s" % (str(args), str(kwargs)))

        CatalogSource.__init__(self, comm=comm)

    def query_range(self, start, end):
        """
            Seek to a range in the file catalog.

            Parameters
            ----------
            start : int
                start of the file relative to the physical file

            end : int
                end of the file relative to the physical file

            Returns
            -------
            A new catalog that only accesses the given region of the file.

            If the original catalog (self) contains any assigned columns not directly
            obtained from the file, then the function will raise ValueError, since
            the operation in that case is not well defined.

        """
        if len(CatalogSource.hardcolumns.fget(self)) > 0:
            raise ValueError("cannot seek if columns have been attached to the FileCatalog")

        # re-partition the requested [start, end) window across the ranks
        other = self.copy()
        other._lstart = self.start + start +  self.comm.rank * (end - start) // self.comm.size
        other._lend = self.start + start + (self.comm.rank + 1) * (end - start) // self.comm.size
        other._size = other._lend - other._lstart
        other.start = start
        other.end = end
        CatalogSource.__init__(other, comm=self.comm)
        return other

    def __repr__(self):
        path = self._source.path
        name = self.__class__.__name__
        args = (name, self.size, repr(self._source))

        return "%s(size=%d, %s)" % args

    @property
    def hardcolumns(self):
        """
        The union of the columns in the file and any transformed columns.
        """
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._source.dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying file source.

        Columns are returned as dask arrays.
        """
        if col in self._source.dtype.names:
            # slice out only this rank's portion of the file
            return self._source.get_dask(col)[self._lstart:self._lend]
        else:
            return CatalogSource.get_hardcolumn(self, col)
Ejemplo n.º 10
0
class FileCatalogBase(CatalogSource):
    """
    Base class to create a source of particles from a
    single file, or multiple files, on disk.

    Files of a specific type should be subclasses of this class.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    args : tuple, optional
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    kwargs : dict, optional
        the keyword arguments to pass to the ``filetype`` class when
        constructing each file object
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    """
    @CurrentMPIComm.enable
    def __init__(self, filetype, args=(), kwargs=None, comm=None):

        # avoid a mutable default argument: create a fresh dict per call
        # instead of sharing a single dict across every instantiation
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # open the FileStack on the root rank only and broadcast it, so the
        # files on disk are inspected exactly once
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # compute the size; start with full file.
        lstart = self.comm.rank * self._source.size // self.comm.size
        lend = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = lend - lstart

        self.start = 0
        self.end = self._source.size

        self._lstart = lstart  # offset in the file for this rank
        self._lend = lend  # offset in the file for this rank

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s %s" %
                             (str(args), str(kwargs)))

        CatalogSource.__init__(self, comm=comm)

    def query_range(self, start, end):
        """
            Seek to a range in the file catalog.

            Parameters
            ----------
            start : int
                start of the file relative to the physical file

            end : int
                end of the file relative to the physical file

            Returns
            -------
            A new catalog that only accesses the given region of the file.

            If the original catalog (self) contains any assigned columns not directly
            obtained from the file, then the function will raise ValueError, since
            the operation in that case is not well defined.

        """
        if len(CatalogSource.hardcolumns.fget(self)) > 0:
            raise ValueError(
                "cannot seek if columns have been attached to the FileCatalog")

        # re-partition the requested [start, end) window across the ranks
        other = self.copy()
        other._lstart = self.start + start + self.comm.rank * (
            end - start) // self.comm.size
        other._lend = self.start + start + (self.comm.rank + 1) * (
            end - start) // self.comm.size
        other._size = other._lend - other._lstart
        other.start = start
        other.end = end
        CatalogSource.__init__(other, comm=self.comm)
        return other

    def __repr__(self):
        path = self._source.path
        name = self.__class__.__name__
        args = (name, self.size, repr(self._source))

        return "%s(size=%d, %s)" % args

    @property
    def hardcolumns(self):
        """
        The union of the columns in the file and any transformed columns.
        """
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._source.dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying file source.

        Columns are returned as dask arrays.
        """
        if col in self._source.dtype.names:
            # slice out only this rank's portion of the file
            return self._source.get_dask(col)[self._lstart:self._lend]
        else:
            return CatalogSource.get_hardcolumn(self, col)