def _repr_nosync(self):

    # main line
    r = '%s(' % type(self).__name__
    if self.name:
        r += '%s, ' % self.name
    r += '%s, ' % str(self._shape)
    r += '%s, ' % str(self._dtype)
    r += 'chunks=%s, ' % str(self._chunks)
    r += 'order=%s' % self._order
    r += ')'

    # storage size info
    r += '\n  nbytes: %s' % human_readable_size(self._nbytes)
    if self.nbytes_stored > 0:
        r += '; nbytes_stored: %s' % human_readable_size(
            self.nbytes_stored)
        r += '; ratio: %.1f' % (self._nbytes / self.nbytes_stored)
    r += '; initialized: %s/%s' % (self.nchunks_initialized,
                                   self._nchunks)

    # filters
    if self._filters:
        # first line
        r += '\n  filters: %r' % self._filters[0]
        # subsequent lines, aligned under the first filter
        for f in self._filters[1:]:
            r += '\n           %r' % f

    # compressor
    if self._compressor:
        r += '\n  compressor: %r' % self._compressor

    # storage and synchronizer classes
    r += '\n  store: %s' % type(self._store).__name__
    if self._store != self._chunk_store:
        r += '; chunk_store: %s' % type(self._chunk_store).__name__
    if self._synchronizer is not None:
        r += '; synchronizer: %s' % type(self._synchronizer).__name__

    return r
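For orientation, the string assembled by this method has roughly the shape below; the numbers and compressor settings are illustrative, not taken from a real array:

    Array((8000, 7500), float32, chunks=(2000, 7500), order=C)
      nbytes: 228.9M; nbytes_stored: 11.4M; ratio: 20.1; initialized: 4/4
      compressor: Blosc(cname='lz4', clevel=5, shuffle=1)
      store: DirectoryStore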
def test_human_readable_size():
    assert '100' == human_readable_size(100)
    assert '1.0K' == human_readable_size(2**10)
    assert '1.0M' == human_readable_size(2**20)
    assert '1.0G' == human_readable_size(2**30)
    assert '1.0T' == human_readable_size(2**40)
    assert '1.0P' == human_readable_size(2**50)
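For reference, a minimal implementation consistent with these assertions might look like the sketch below; zarr's actual helper may differ in detail.

def human_readable_size(size):
    # a minimal sketch consistent with the test above, not zarr's
    # actual implementation.  Sizes below 1K print as plain integers;
    # larger sizes get one decimal place and a binary-prefix suffix
    # (K, M, G, T, P).
    if size < 2**10:
        return '%s' % size
    elif size < 2**20:
        return '%.1fK' % (size / float(2**10))
    elif size < 2**30:
        return '%.1fM' % (size / float(2**20))
    elif size < 2**40:
        return '%.1fG' % (size / float(2**30))
    elif size < 2**50:
        return '%.1fT' % (size / float(2**40))
    else:
        return '%.1fP' % (size / float(2**50))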
#
# * [zarr](http://zarr.readthedocs.io/en/latest/?badge=latest) keeps the h5py
#   interface (which is similar to numpy's), but allows different choices for
#   file compression and is fully multithreaded.  See
#   [Alistair Miles' original blog entry](http://alimanfoo.github.io/2016/05/16/cpu-blues.html)
#   for a discussion of the motivation behind zarr.
#
# ## dask
#
# * [dask](http://dask.pydata.org/en/latest/) is a Python library that
#   implements lazy data structures (array, dataframe, bag) and a clever
#   thread/process scheduler.  It integrates with zarr to allow calculations
#   on datasets that don't fit into core memory, either on a single node or
#   across a cluster.

# %% [markdown]
# ### Example: write and read zarr arrays using multiple threads

# %% [markdown]
# ### Create 230 Mbytes of fake data

# %%
import numpy as np
import zarr

wvel_data = np.random.normal(2000, 1000, size=[8000, 7500]).astype(np.float32)
human_readable_size(wvel_data.nbytes)

# %% [markdown]
# ### Copy to a zarr file on disk, using multiple threads

# %%
item = 'disk1_data'
store = zarr.DirectoryStore(item)
group = zarr.hierarchy.group(store=store, overwrite=True,
                             synchronizer=zarr.ThreadSynchronizer())
the_var = 'wvel'
out_zarr1 = group.zeros(the_var, shape=wvel_data.shape,
                        dtype=wvel_data.dtype, chunks=[2000, 7500])
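# %% [markdown]
# The cell above only allocates the zarr array; nothing has been written
# yet.  One way to do the actual copy with multiple threads (a sketch,
# assuming dask.array drives the write, with out_zarr1 and wvel_data as
# defined above) is to wrap the numpy array in a dask array and store it:

# %%
import dask.array as da

# chunk the dask array the same way as the zarr array, so each task
# writes whole zarr chunks
wvel_dask = da.from_array(wvel_data, chunks=(2000, 7500))
# store() evaluates in parallel on dask's threaded scheduler; lock=False
# is safe here because the group was created with a ThreadSynchronizer
wvel_dask.store(out_zarr1, lock=False)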