Example #1
import os

import zarr
from torch.utils.data import DataLoader
from tqdm import tqdm


def convert_data_set(path, data_set, batch_size=1000):
    loader = DataLoader(data_set,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=4)

    num_examples = len(data_set)

    os.makedirs(path, exist_ok=True)
    with zarr.LMDBStore(path) as store:
        root = zarr.group(store=store, overwrite=True)
        images_set = root.zeros('images',
                                shape=(num_examples, 3, 96, 96),
                                chunks=(1, None, None, None),
                                dtype='u1')
        labels_set = root.zeros('labels',
                                shape=(num_examples, ),
                                chunks=(1, ),
                                dtype='u1')
        current_iter = 0
        for images, labels in tqdm(loader):
            size = images.shape[0]
            images_set[current_iter:current_iter + size] = images
            labels_set[current_iter:current_iter + size] = labels
            current_iter += size
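The fixed (3, 96, 96) image shape matches a 96x96 RGB dataset such as torchvision's STL10. A minimal usage sketch under that assumption (the dataset choice and paths are illustrative, not from the source):

import torchvision.transforms as T
from torchvision.datasets import STL10

# Hypothetical usage: PILToTensor yields uint8 CHW tensors, matching the 'u1' arrays above.
data_set = STL10('/tmp/stl10', split='train', download=True,
                 transform=T.PILToTensor())
convert_data_set('/tmp/stl10.lmdb', data_set)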
Example #2
def get_storage_map(fs, path, memcache=2 ** 26, lock=True, storage_cache=2 ** 28):
    # _get_storage_map, get_cache_path and LRUCache are helpers defined
    # elsewhere in the source project.
    store = _get_storage_map(fs, path)
    cache_path = get_cache_path(path)
    if storage_cache and storage_cache > 0:
        os.makedirs(cache_path, exist_ok=True)
        store = LRUCache(
            zarr.LMDBStore(cache_path, buffers=True, lock=lock), store, storage_cache
        )
    if memcache and memcache > 0:
        store = LRUCache(zarr.MemoryStore(), store, memcache)
    return store
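The result is a two-level cache hierarchy: reads are served from the in-memory LRU (up to memcache bytes) when possible, then from the on-disk LMDB cache (up to storage_cache bytes), and only reach the underlying store on a miss in both.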
Example #3
    def _initialise(self, filename=None):
        """
        Initialise the basic state of the data container.
        """
        self.store = None
        self.data = zarr.group()
        if filename is not None:
            self.store = zarr.LMDBStore(filename, subdir=False)
            self.data = zarr.open_group(store=self.store)
        self.data.attrs[FORMAT_NAME_KEY] = self.FORMAT_NAME
        self.data.attrs[FORMAT_VERSION_KEY] = self.FORMAT_VERSION
        self.data.attrs["uuid"] = str(uuid.uuid4())
Example #4
    def __getitem__(self, idx):
        if self.datasets is None:
            store = zarr.LMDBStore(self.path)
            zarr_db = zarr.group(store=store)
            self.datasets = {key: zarr_db[key] for key in self.keys}

        items = []
        for key in self.keys:
            item = self.datasets[key][idx]
            if key in self.transforms:
                item = self.transforms[key](item)
            items.append(item)
        return items
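Note that the LMDB store is opened lazily on the first __getitem__ call rather than in the constructor (see Example #8 below, which sets self.datasets to None). This is likely deliberate: with a multi-worker DataLoader, as in Example #1, each worker process then opens its own LMDB handle instead of sharing one inherited across the fork.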
Example #5
    def _open_readonly(self, filename):
        # We set the map size here because LMDB will map 1TB of virtual memory if
        # we don't, making it hard to figure out how much memory we're actually
        # using.
        map_size = None
        try:
            map_size = os.path.getsize(filename)
        except OSError:
            # Ignore any exceptions here and let LMDB handle them.
            pass
        self.store = zarr.LMDBStore(filename,
                                    map_size=map_size,
                                    readonly=True,
                                    subdir=False,
                                    lock=False)
        self.data = zarr.open_group(store=self.store)
        self.check_format()
Example #6
def get_obs_vector(session_ID, var, layer="X"):
    # save_analysis_path and use_zarr are module-level globals in the source project.
    save_dir = save_analysis_path + str(session_ID) + "/"

    if use_zarr:
        zarr_cache_dir = save_dir + "adata_cache" + ".zarr"
        if os.path.exists(zarr_cache_dir):
            with zarr.LMDBStore(zarr_cache_dir) as store:
                group = zarr.open_group(store=store, mode='r')

                if var in group.obs.keys():
                    ret = list(group.obs[var])
                else:
                    idx = list(group.var["gene_ID"]).index(var)
                    if layer == "X":
                        ret = group["X_dense"][:, idx]
                    else:
                        ret = group["layers_dense"][layer][:, idx]
            return ret
Example #7
    def __init__(self,
                 data_dir_pth,
                 desired_chunk_size_bytes=1. * 1024**2,
                 datastore_type=DatastoreType.LMDB,
                 compression_type=CompressionType.BLOSC):
        """
        :param data_dir_pth: Path to the zarr LMDB file
        :param desired_chunk_size_bytes: The size (in bytes) of the chunks each
            array is split into
        :param datastore_type: LMDB uses the lmdb database, which must be
            installed on the system. If it is not available, use the DIRECTORY
            type, which stores data through the OS filesystem.
        :param compression_type: BLOSC uses the blosc library through numcodecs,
            which requires blosc to be installed on the system (or a platform
            where it is built automatically when installing numcodecs). If blosc
            is not available, use LZMA, which uses Python's built-in lzma module.
        """

        import zarr

        self.zarr = zarr

        # Check for existence *before* opening the store: LMDBStore creates the
        # file as soon as it is opened, which would make this check always true.
        existed = os.path.exists(data_dir_pth)

        self.datastore_type = datastore_type
        if datastore_type == DatastoreType.LMDB:
            self.store = zarr.LMDBStore(data_dir_pth)
        elif datastore_type == DatastoreType.DIRECTORY:
            self.store = zarr.DirectoryStore(data_dir_pth)
        else:
            raise RuntimeError(
                'Unknown datastore type: {}'.format(datastore_type))

        if compression_type == CompressionType.BLOSC:
            from numcodecs import Blosc
            self.compressor = Blosc(cname='blosclz',
                                    clevel=9,
                                    shuffle=Blosc.BITSHUFFLE)
        elif compression_type == CompressionType.LZMA:
            from numcodecs import LZMA
            self.compressor = LZMA()
        else:
            raise RuntimeError(
                'Unknown compression type: {}'.format(compression_type))

        self.desired_chunk_size_bytes = desired_chunk_size_bytes

        # Overwrite only when creating a brand-new store.
        self.f = zarr.group(store=self.store, overwrite=not existed)

        self.i = 0
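A hedged usage sketch for this constructor; the class name ZarrWriter is an assumption, since the snippet shows only the __init__:

# Hypothetical usage; ZarrWriter stands in for whatever class this __init__ belongs to.
writer = ZarrWriter('/tmp/data.zarr',
                    datastore_type=DatastoreType.DIRECTORY,  # fallback when lmdb is unavailable
                    compression_type=CompressionType.LZMA)   # fallback when blosc is unavailable
arr = writer.f.zeros('features', shape=(1000, 128), dtype='f4',
                     compressor=writer.compressor)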
Example #8
    def __init__(self, path, transforms=None):
        self.path = path
        self.keys = ('images', 'labels')
        assert os.path.exists(path), 'file `{}` does not exist!'.format(path)

        with zarr.LMDBStore(path) as store:
            zarr_db = zarr.group(store=store)
            self.num_examples = zarr_db['labels'].shape[0]
        self.datasets = None

        if transforms is None:
            transforms = {
                'labels':
                lambda v: torch.tensor(v, dtype=torch.long),
                'images':
                lambda v: torch.tensor(
                    (v - 127.5) / 127.5, dtype=torch.float32)
            }
        self.transforms = transforms
Example #9
    def __init__(self, data_dir_pth, desired_chunk_size_bytes=1. * 1024 ** 2):
        """
        :param data_dir_pth: Path to the zarr LMDB file
        :param desired_chunk_size_bytes: The size (in bytes) of the chunks each
            array is split into
        """

        import zarr
        from numcodecs import Blosc

        self.zarr = zarr

        # Check for existence *before* opening the store: LMDBStore creates the
        # file as soon as it is opened, which would make this check always true.
        existed = os.path.exists(data_dir_pth)

        self.store = zarr.LMDBStore(data_dir_pth)
        self.compressor = Blosc(cname='blosclz', clevel=9, shuffle=Blosc.BITSHUFFLE)

        self.desired_chunk_size_bytes = desired_chunk_size_bytes

        # Overwrite only when creating a brand-new store.
        self.f = zarr.group(store=self.store, overwrite=not existed)

        self.i = 0
Example #10
"""
Quick script to patch up the sequence length attribute in version
1.0 sample files to make sure they are not 0. Older version supported
this but new versions will not.
"""

import tsinfer
import zarr
import sys
import os.path

filename = sys.argv[1]
sample_data = tsinfer.load(filename)
sequence_length = sample_data.sites_position[-1] + 1
sample_data.close()

# Add a megabyte to the map size in case the file size goes up.
map_size = os.path.getsize(filename) + 1024**2
store = zarr.LMDBStore(filename, subdir=False, map_size=map_size)
data = zarr.open(store=store, mode="a")  # read/write without truncating
data.attrs["sequence_length"] = sequence_length
store.close()

sample_data = tsinfer.load(filename)
print("patched up sequence length")
print(sample_data)