import os
from tempfile import TemporaryDirectory

import numpy
import pytest

# import locations assumed from the nbodykit package layout
from nbodykit.io.stack import FileStack
from nbodykit.io.tpm import TPMBinaryFile


def test_single_path(comm):

    with TemporaryDirectory() as tmpdir:

        # generate TPM-format data
        pos = numpy.random.random(size=(2048, 3)).astype('f4')
        vel = numpy.random.random(size=(2048, 3)).astype('f4')
        uid = numpy.arange(2048, dtype='u8')
        hdr = numpy.ones(28, dtype='?')

        for i, name in enumerate(['tpm.000', 'tpm.001']):
            sl = slice(i * 1024, (i + 1) * 1024)

            # write to file
            fname = os.path.join(tmpdir, name)
            with open(fname, 'wb') as ff:
                hdr.tofile(ff)
                pos[sl].tofile(ff)
                vel[sl].tofile(ff)
                uid[sl].tofile(ff)

        # single path: only the first of the two files
        f = FileStack(TPMBinaryFile, os.path.join(tmpdir, 'tpm.000'), precision='f4')
        assert f.size == 1024
        assert f.nfiles == 1
def test_data(comm):

    with TemporaryDirectory() as tmpdir:

        # generate TPM-format data
        pos = numpy.random.random(size=(2048, 3)).astype('f4')
        vel = numpy.random.random(size=(2048, 3)).astype('f4')
        uid = numpy.arange(2048, dtype='u8')
        hdr = numpy.ones(28, dtype='?')

        for i, name in enumerate(['tpm.000', 'tpm.001']):
            sl = slice(i * 1024, (i + 1) * 1024)

            # write to file
            fname = os.path.join(tmpdir, name)
            with open(fname, 'wb') as ff:
                hdr.tofile(ff)
                pos[sl].tofile(ff)
                vel[sl].tofile(ff)
                uid[sl].tofile(ff)

        # initialize the stack from a glob pattern
        path = os.path.join(tmpdir, 'tpm.00*')
        f = FileStack(TPMBinaryFile, path, precision='f4')

        # check size
        assert f.size == 2048
        assert f.ndim == 1

        # and data
        numpy.testing.assert_almost_equal(pos, f['Position'][:])
        numpy.testing.assert_almost_equal(vel, f['Velocity'][:])
        numpy.testing.assert_almost_equal(uid, f['ID'][:])

        # pass a list of paths instead of a glob
        paths = [os.path.join(tmpdir, name) for name in ['tpm.000', 'tpm.001']]
        f = FileStack(TPMBinaryFile, paths, precision='f4')

        # check size
        assert f.size == 2048
        assert f.nfiles == 2

        # and add an entry to the attrs dict
        f.attrs['size'] = 2048
def test_bad_path(comm):

    with TemporaryDirectory() as tmpdir:

        # generate TPM-format data
        pos = numpy.random.random(size=(2048, 3)).astype('f4')
        vel = numpy.random.random(size=(2048, 3)).astype('f4')
        uid = numpy.arange(2048, dtype='u8')
        hdr = numpy.ones(28, dtype='?')

        for i, name in enumerate(['tpm.000', 'tpm.001']):
            sl = slice(i * 1024, (i + 1) * 1024)

            # write to file
            fname = os.path.join(tmpdir, name)
            with open(fname, 'wb') as ff:
                hdr.tofile(ff)
                pos[sl].tofile(ff)
                vel[sl].tofile(ff)
                uid[sl].tofile(ff)

        # bad path name: a pattern matching no files should raise ValueError
        with pytest.raises(ValueError):
            f = FileStack(TPMBinaryFile, os.path.join(tmpdir, 'bad.00*'), precision='f4')
class FileCatalogBase(CatalogSource):
    """
    Base class to create a source of particles from a single file, or
    multiple files, on disk.

    Files of a specific type should be subclasses of this class.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    args : tuple, optional
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    kwargs : dict, optional
        the keyword arguments to pass to the ``filetype`` class when
        constructing each file object
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    """
    @CurrentMPIComm.enable
    def __init__(self, filetype, args=(), kwargs={}, comm=None):

        self.comm = comm
        self.filetype = filetype

        # bcast the FileStack
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # compute the size
        start = self.comm.rank * self._source.size // self.comm.size
        end = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = end - start

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s" % str(args))

        CatalogSource.__init__(self, comm=comm)

    def __repr__(self):
        path = self._source.path
        name = self.__class__.__name__
        if isinstance(path, string_types):
            args = (name, self.size, os.path.basename(path))
            return "%s(size=%d, file='%s')" % args
        else:
            args = (name, self.size, self._source.nfiles)
            return "%s(size=%d, nfiles=%d)" % args

    @property
    def hardcolumns(self):
        """
        The union of the columns in the file and any transformed columns.
        """
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._source.dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying file source.

        Columns are returned as dask arrays.
        """
        if col in self._source.dtype.names:
            start = self.comm.rank * self._source.size // self.comm.size
            end = (self.comm.rank + 1) * self._source.size // self.comm.size
            return self._source.get_dask(col)[start:end]
        else:
            return CatalogSource.get_hardcolumn(self, col)
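# --- illustration, not nbodykit API ------------------------------------------
# A minimal sketch of the rank decomposition used by ``__init__`` and
# ``get_hardcolumn`` above: each of ``size`` ranks owns the contiguous,
# half-open slice [rank * N // size, (rank + 1) * N // size) of the N rows in
# the file. The slices tile the file with no gaps or overlap, even when
# ``size`` does not divide N evenly. ``rank_range`` is a hypothetical helper
# written only for this demonstration.

def rank_range(rank, size, total):
    """Return the half-open [start, end) row range owned by ``rank``."""
    start = rank * total // size
    end = (rank + 1) * total // size
    return start, end

# e.g. 2048 rows over 5 ranks: 409 + 410 + 409 + 410 + 410 rows
ranges = [rank_range(r, 5, 2048) for r in range(5)]
assert ranges[0][0] == 0 and ranges[-1][1] == 2048
assert all(prev[1] == nxt[0] for prev, nxt in zip(ranges, ranges[1:]))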
# A later revision of FileCatalogBase: the per-rank decomposition is stored in
# _lstart/_lend so that query_range can re-slice the file after construction.
class FileCatalogBase(CatalogSource):
    """
    Base class to create a source of particles from a single file, or
    multiple files, on disk.

    Files of a specific type should be subclasses of this class.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    args : tuple, optional
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    kwargs : dict, optional
        the keyword arguments to pass to the ``filetype`` class when
        constructing each file object
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    """
    @CurrentMPIComm.enable
    def __init__(self, filetype, args=(), kwargs={}, comm=None):

        self.comm = comm
        self.filetype = filetype

        # bcast the FileStack
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # compute the size; start with the full file
        lstart = self.comm.rank * self._source.size // self.comm.size
        lend = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = lend - lstart
        self.start = 0
        self.end = self._source.size
        self._lstart = lstart  # offset in the file for this rank
        self._lend = lend      # offset in the file for this rank

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s %s" % (str(args), str(kwargs)))

        CatalogSource.__init__(self, comm=comm)

    def query_range(self, start, end):
        """
        Seek to a range in the file catalog.

        Parameters
        ----------
        start : int
            start of the range, relative to the start of the physical file
        end : int
            end of the range, relative to the start of the physical file

        Returns
        -------
        A new catalog that only accesses the given region of the file.

        If the original catalog (self) contains any assigned columns not
        directly obtained from the file, then this function raises
        ValueError, since the operation in that case is not well defined.
        """
        if len(CatalogSource.hardcolumns.fget(self)) > 0:
            raise ValueError("cannot seek if columns have been attached to the FileCatalog")

        other = self.copy()

        other._lstart = self.start + start + self.comm.rank * (end - start) // self.comm.size
        other._lend = self.start + start + (self.comm.rank + 1) * (end - start) // self.comm.size
        other._size = other._lend - other._lstart
        other.start = start
        other.end = end

        CatalogSource.__init__(other, comm=self.comm)
        return other

    def __repr__(self):
        name = self.__class__.__name__
        args = (name, self.size, repr(self._source))
        return "%s(size=%d, %s)" % args

    @property
    def hardcolumns(self):
        """
        The union of the columns in the file and any transformed columns.
        """
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._source.dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying file source.

        Columns are returned as dask arrays.
        """
        if col in self._source.dtype.names:
            return self._source.get_dask(col)[self._lstart:self._lend]
        else:
            return CatalogSource.get_hardcolumn(self, col)
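# --- illustration, not nbodykit API ------------------------------------------
# A hedged usage sketch for ``query_range``, meant to run on a single MPI
# rank. The import locations below and the direct instantiation of
# FileCatalogBase are assumptions made for this demonstration; in practice a
# concrete file-type subclass would normally be used instead.

import os
from tempfile import TemporaryDirectory

import numpy

from nbodykit.io.tpm import TPMBinaryFile                  # assumed location
from nbodykit.source.catalog.file import FileCatalogBase   # assumed location

with TemporaryDirectory() as tmpdir:

    # write a single TPM-format file with 2048 rows, as in the tests above
    pos = numpy.random.random(size=(2048, 3)).astype('f4')
    vel = numpy.random.random(size=(2048, 3)).astype('f4')
    uid = numpy.arange(2048, dtype='u8')

    path = os.path.join(tmpdir, 'tpm.000')
    with open(path, 'wb') as ff:
        numpy.ones(28, dtype='?').tofile(ff)
        pos.tofile(ff)
        vel.tofile(ff)
        uid.tofile(ff)

    cat = FileCatalogBase(TPMBinaryFile, args=(path,), kwargs={'precision': 'f4'})

    # a new catalog viewing only rows [512, 1024) of the file;
    # ``cat`` itself is unchanged
    sub = cat.query_range(512, 1024)
    assert sub.size == 512            # local size on a single rank

    # columns are still lazy dask arrays, now over just the sub-range
    pos_sub = sub['Position']

# Note: attaching a derived column first (e.g. ``cat['Mass'] = ...``) makes
# query_range raise ValueError, since such columns cannot be re-sliced
# against the file.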