Example #1
    def test_write_single_time(self):
        fpath = _get_tempfile()
        array = np.arange(0, 100, dtype='float32').reshape(-1, 5)

        with MmapArrayWriter(path=fpath,
                             shape=array.shape,
                             dtype=array.dtype,
                             remove_exist=True) as f:
            f.write(array)
        x = MmapArray(fpath)
        self.assertTrue(np.all(array == x))

        with MmapArrayWriter(path=fpath, remove_exist=False) as f:
            f.write(array)
        x = MmapArray(fpath)
        self.assertTrue(np.all(np.concatenate([array, array], axis=0) == x))
Example #2
    def test_write_multiple_time(self):
        fpath = _get_tempfile()
        array = np.arange(0, 1000, dtype='float32').reshape(-1, 2, 5)

        with MmapArrayWriter(path=fpath,
                             shape=(0, ) + array.shape[1:],
                             dtype=array.dtype,
                             remove_exist=True) as f:
            for i in range(0, array.shape[0], 8):
                f.write(array[i:i + 8])
        x = MmapArray(fpath)
        self.assertTrue(np.all(array == x))

        array1 = np.arange(0, 100, dtype='float32').reshape(-1, 2, 5)
        array[10:10 + array1.shape[0]] = array1
        with MmapArrayWriter(path=fpath, remove_exist=False) as f:
            f.write(array1, start_position=10)
        x = MmapArray(fpath)
        self.assertTrue(np.all(array == x))
Example #3
 def __init__(self,
              path='~/tensorflow_datasets/3dshapes.h5',
              cache_dir=None,
              seed=8):
     path = os.path.abspath(os.path.expanduser(path))
     assert os.path.exists(path), "Path to file %s must exist" % path
     self.path = path
     if cache_dir is None:
         cache_dir = os.path.dirname(path)
     if not os.path.exists(cache_dir):
         os.mkdir(cache_dir)
     image_path = os.path.join(cache_dir, '3dshapes.images')
     label_path = os.path.join(cache_dir, '3dshapes.labels')
     # ====== read the dataset and cache it again ====== #
     if not os.path.exists(image_path) or not os.path.exists(label_path):
         import h5py
         with h5py.File(path, 'r') as dataset:
             images = dataset['images']
             labels = dataset['labels']
             with MmapArrayWriter(image_path,
                                  shape=images.shape,
                                  dtype=images.dtype,
                                  remove_exist=True) as img, \
               MmapArrayWriter(label_path,
                               shape=labels.shape,
                               dtype=labels.dtype,
                               remove_exist=True) as lab:
                 for start, end in tqdm(list(
                         batching(8000, n=images.shape[0])),
                                        desc="Caching data"):
                     img.write(images[start:end])
                     lab.write(labels[start:end])
     # ====== load the data ====== #
     self.images = MmapArray(image_path)
     self.factors = MmapArray(label_path)
     # ====== split the dataset ====== #
     rand = np.random.RandomState(seed=seed)
     n = len(self.images)
     ids = rand.permutation(n)
     # train:85% valid:5% test:10%
     self.train_indices = ids[:int(0.85 * n)]
     self.valid_indices = ids[int(0.85 * n):int(0.9 * n)]
     self.test_indices = ids[int(0.9 * n):]
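The `__init__` above caches the HDF5 content into two memory-mapped files once, then reuses them on later runs. A minimal usage sketch, assuming the enclosing class is named `Shapes3D` (the class name is not shown above, so it is a placeholder):

# Hypothetical usage; `Shapes3D` stands in for the class owning the __init__ above.
ds = Shapes3D(path='~/tensorflow_datasets/3dshapes.h5', seed=8)
# images/factors are MmapArray instances, so indexing reads only the
# requested rows from disk instead of loading the whole dataset.
batch_ids = ds.train_indices[:32]
x = ds.images[batch_ids]
y = ds.factors[batch_ids]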
Example #4
    def test_write_multiprocessing(self):
        fpath = _get_tempfile()
        jobs = [(i, np.random.rand(12, 25, 8), fpath, (300, 25, 8))
                for i in range(25)]
        with Pool(2) as pool:
            pool.map(_fn_write, jobs)

        # checking the output
        array = np.concatenate([x[1] for x in jobs], axis=0)
        x = MmapArray(fpath)
        self.assertTrue(np.all(array == x))
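The `_fn_write` helper used by the pool is not shown in this example. A plausible sketch, assuming each job writes its 12-row block at offset `i * 12` into the shared (300, 25, 8) file; the actual helper in the test suite may create and synchronize the file differently:

from bigarray import MmapArrayWriter  # import path assumed

def _fn_write(job):
    # job = (job_index, data_block, file_path, full_shape), as built above
    i, data, path, shape = job
    # remove_exist=False: every worker opens the same pre-sized file
    with MmapArrayWriter(path=path, shape=shape, dtype=data.dtype,
                         remove_exist=False) as f:
        # write this worker's block at its own offset so blocks do not overlap
        f.write(data, start_position=i * data.shape[0])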
Example #5
 def test_read_multiprocessing(self):
     fpath = _get_tempfile()
     array = np.random.rand(1200, 25, 8)
     # first write the array
     with MmapArrayWriter(fpath, (None, 25, 8), array.dtype) as f:
         f.write(array)
     x = MmapArray(fpath)
     self.assertTrue(np.all(array == x))
     # use multiprocessing to randomly read the array
     jobs = [(x,
              sorted(
                  np.random.randint(0,
                                    array.shape[0],
                                    size=(2, ),
                                    dtype='int32'))) for i in range(25)]
     with Pool(2) as pool:
         for start, end, data in pool.map(_fn_read, jobs):
             data = zlib.decompress(data)
             data = np.frombuffer(data).reshape(-1, 25, 8)
             self.assertTrue(np.all(data == array[start:end]))
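`_fn_read` is likewise not shown. A sketch consistent with how its result is consumed above, returning a `(start, end, compressed_bytes)` tuple that the parent decodes with `zlib.decompress` and `np.frombuffer`:

import zlib
import numpy as np

def _fn_read(job):
    # job = (mmap_array, (start, end)) as built above
    x, (start, end) = job
    # only the requested slice is read from the memory-mapped file;
    # compressing the raw bytes keeps the data returned through the
    # multiprocessing pipe small
    data = np.ascontiguousarray(x[start:end])
    return start, end, zlib.compress(data.tobytes())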
Example #6
# ====== reading ====== #
print()

start = timeit.default_timer()
with open(numpy_path, 'rb') as f:
    y = np.load(f)
numpy_open_time = timeit.default_timer() - start
print('Load Numpy array:', numpy_open_time, 's')

start = timeit.default_timer()
hdf5 = h5py.File(hdf5_path, 'r')
h5py_open_time = timeit.default_timer() - start
print('Load HDF5 data  :', h5py_open_time, 's')

start = timeit.default_timer()
mmap = MmapArray(mmap_path)
mmap_open_time = timeit.default_timer() - start
print('Load Memmap data:', mmap_open_time, 's')

print()
print('Test correctness of stored data')
print('Numpy :', np.all(y == X))
print('HDF5  :', np.all(hdf5['X'][:] == X))
print('Memmap:', np.all(mmap[:] == X))

# ====== iterating over dataset ====== #
print()
start = timeit.default_timer()
for epoch in range(0, 3):
    for i in range(0, N, 256):
        x = X[i:i + 256]
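The snippet above stops inside the iteration benchmark. A plausible continuation, assumed here to follow the same `timeit` pattern and also iterate the HDF5 and memory-mapped copies in 256-sample batches:

print('Iterate Numpy array :', timeit.default_timer() - start, 's')

start = timeit.default_timer()
for epoch in range(0, 3):
    for i in range(0, N, 256):
        x = hdf5['X'][i:i + 256]
print('Iterate HDF5 data   :', timeit.default_timer() - start, 's')

start = timeit.default_timer()
for epoch in range(0, 3):
    for i in range(0, N, 256):
        x = mmap[i:i + 256]
print('Iterate Memmap data :', timeit.default_timer() - start, 's')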
Example #7
    def transform(self,
                  X,
                  indices=None,
                  sad=None,
                  save_ivecs=False,
                  keep_stats=False,
                  name=None):
        """
    Parameters
    ----------
    X : ndarray
      Training data [n_samples, n_features]
    indices : {Mapping, tuple, list}
      in case the data is given as a list of files, `indices`
      acts as a file indicator, mapping
      'file_name' -> (start_index_in_X, end_index_in_X).
      This mapping can be provided as a dictionary or a list of
      tuples.
    sad : ndarray
      inspired by "Speech Activity Detection" (SAD) indexing,
      this array is an indicator of which samples will be included
      in training; the shape should be [n_samples,] or [n_samples, 1]
    save_ivecs : bool
      if True, save the extracted i-vectors to disk at path `ivec_[name]`;
      if False, return the i-vectors directly without saving

    keep_stats : bool
      if True, keep the zero- and first-order statistics.
      The first-order statistics can consume a huge amount
      of disk space. Otherwise, they are deleted after training
    name : {None, str}
      identity of the i-vectors (for re-use in the future).
      If None, a random name is used
    """
        if not self.is_fitted:
            raise ValueError(
                "Ivector has not been fitted, call Ivector.fit(...) first")
        n_files = X.shape[0] if indices is None else len(indices)
        if name is None:
            name = uuid(length=8)
        else:
            name = str(name)
        # ====== init ====== #
        z_path = self.get_z_path(name)
        f_path = self.get_f_path(name)
        if save_ivecs:
            i_path = self.get_i_path(name)
        else:
            i_path = None
        name_path = self.get_name_path(name)
        # ====== check exist i-vector file ====== #
        if i_path is not None and os.path.exists(i_path):
            ivec = MmapArray(path=i_path)
            assert ivec.shape[0] == n_files and ivec.shape[1] == self.tv_dim,\
            "Need i-vectors for %d files, found existing data at path:'%s' with shape:%s" % \
            (n_files, i_path, ivec.shape)
            return ivec
        # ====== extract Z and F ====== #
        if os.path.exists(z_path) and os.path.exists(f_path):
            pass
        else:
            if os.path.exists(z_path):
                os.remove(z_path)
            if os.path.exists(f_path):
                os.remove(f_path)
            if os.path.exists(name_path):
                os.remove(name_path)
            _extract_zero_and_first_stats(X=X,
                                          sad=sad,
                                          indices=indices,
                                          gmm=self.gmm,
                                          z_path=z_path,
                                          f_path=f_path,
                                          name_path=name_path)
        Z = MmapArray(path=z_path)
        F = MmapArray(path=f_path)
        # ====== extract I-vec ====== #
        ivec = self.tmat.transform_to_disk(path=i_path,
                                           Z=Z,
                                           F=F,
                                           dtype='float32')
        # ====== clean ====== #
        Z.close()
        F.close()
        if not keep_stats:
            if os.path.exists(z_path):
                os.remove(z_path)
            if os.path.exists(f_path):
                os.remove(f_path)
        else:
            print("Zero-order stats saved at:", ctext(z_path, 'cyan'))
            print("First-order stats saved at:", ctext(f_path, 'cyan'))
        return ivec
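A hedged usage sketch for `transform`, illustrating the `indices` convention described in the docstring; `ivec` is assumed to be an already-fitted `Ivector` instance and `X` the stacked feature matrix (neither is constructed here):

# indices maps 'file_name' -> (start, end) row range inside X, for example:
indices = {
    'utt_001': (0, 1200),      # rows 0..1199 of X belong to utt_001
    'utt_002': (1200, 2750),
}
# with save_ivecs=True, calling transform again with the same `name`
# reuses the cached i-vector MmapArray on disk instead of recomputing
i_vectors = ivec.transform(X, indices=indices, sad=None,
                           save_ivecs=True, keep_stats=False, name='train')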
Example #8
    def fit(self,
            X,
            indices=None,
            sad=None,
            refit_gmm=False,
            refit_tmat=False,
            extract_ivecs=False,
            keep_stats=False):
        """
    Parameters
    ----------
    X : ndarray
      Training data [n_samples, n_features]

    indices : {Mapping, tuple, list}
      in case the data is given as a list of files, `indices`
      acts as a file indicator, mapping
      'file_name' -> (start_index_in_X, end_index_in_X).
      This mapping can be provided as a dictionary or a list of
      tuples.
      Note: the order provided in indices will be preserved

    sad : ndarray
      inspired by "Speech Activity Detection" (SAD) indexing,
      this array is an indicator of which samples will be included
      in training; the shape should be [n_samples,] or [n_samples, 1]

    refit_gmm : bool
      if True, re-fit the GMM even if it is already fitted;
      consequently, the T-matrix will also be re-fitted

    refit_tmat : bool
      if True, re-fit the T-matrix even if it is already fitted

    extract_ivecs : bool
      if True, extract the i-vectors for the training data

    keep_stats : bool
      if True, keep the zero- and first-order statistics.
      The first-order statistics can consume a huge amount
      of disk space. Otherwise, they are deleted after training
    """
        new_gmm = (not self.gmm.is_fitted or refit_gmm)
        # ====== clean error files ====== #
        if os.path.exists(self.z_path):
            Z = MmapArray(self.z_path)
            if Z.shape[0] == 0:  # empty file
                os.remove(self.z_path)
            Z.close()
        if os.path.exists(self.f_path):
            F = MmapArray(self.f_path)
            if F.shape[0] == 0:  # empty file
                os.remove(self.f_path)
            F.close()
        if os.path.exists(self.ivec_path):
            ivec = MmapArray(self.ivec_path)
            if ivec.shape[0] == 0:  # empty file
                os.remove(self.ivec_path)
            ivec.close()
        # ====== Training the GMM first ====== #
        if new_gmm:
            input_data = [X]
            if sad is not None:
                input_data.append(sad)
            if indices is not None:
                input_data.append(indices)
            self.gmm.fit(input_data)
        # ====== some fun, and confusing, logic ====== #
        # GMM need to be fitted before creating T-matrix model
        new_tmat = (not self.tmat.is_fitted or new_gmm or refit_tmat)
        # New i-vectors are needed only when:
        # - `extract_ivecs=True`, and
        # - a new T-matrix was trained, or no i-vectors have been extracted yet
        new_ivec = extract_ivecs and \
        (new_tmat or not os.path.exists(self.ivec_path))
        # new stats are only needed when:
        # - the GMM is updated
        # - a new T-matrix is trained and Z and F do not exist
        # - new i-vectors are extracted and Z and F do not exist
        if not new_gmm and \
        (os.path.exists(self.z_path) and os.path.exists(self.f_path)):
            new_stats = False
        else:
            new_stats = new_gmm or new_tmat or new_ivec
        # ====== extract the statistics ====== #
        if new_stats:
            _extract_zero_and_first_stats(X=X,
                                          sad=sad,
                                          indices=indices,
                                          gmm=self.gmm,
                                          z_path=self.z_path,
                                          f_path=self.f_path,
                                          name_path=self.name_path)
        # ====== Training the T-matrix and extract i-vector ====== #
        if new_tmat or new_ivec:
            Z = MmapArray(path=self.z_path)
            F = MmapArray(path=self.f_path)
            if new_tmat:
                self.tmat.fit((Z, F))
            if new_ivec:
                self.tmat.transform_to_disk(path=self.ivec_path,
                                            Z=Z,
                                            F=F,
                                            dtype='float32',
                                            device='gpu',
                                            override=True)
            Z.close()
            F.close()
        # ====== clean ====== #
        if not keep_stats:
            if os.path.exists(self.z_path):
                os.remove(self.z_path)
            if os.path.exists(self.f_path):
                os.remove(self.f_path)
        return self
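A matching sketch for `fit`, using the same `indices` convention; `ivec`, `X`, `indices` and `sad` are assumed to already exist (see the `transform` example above):

# sad: optional [n_samples,] indicator of which rows of X are used for training
ivec.fit(X,
         indices=indices,     # 'file_name' -> (start, end) mapping, order preserved
         sad=sad,
         refit_gmm=False,     # reuse the GMM if it is already fitted
         refit_tmat=False,    # reuse the T-matrix if it is already fitted
         extract_ivecs=True,  # also extract i-vectors for the training data
         keep_stats=False)    # drop the (large) zero/first-order stats afterwards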
Example #9
def _parse_data_descriptor(path, read_only):
    """ Return mapping: name -> (dtype, shape, Data, path) """
    if not os.path.isfile(path):
        return None
    file_ext = os.path.splitext(path)[-1].lower()
    file_name = os.path.basename(path)
    # ====== ignore ====== #
    if os.path.basename(path) in _ignore_files:
        return None
    # ====== audio file ====== #
    if file_ext in _audio_ext:
        return [(file_name, ('audio', 'unknown', None, path))]
    # ====== image file ====== #
    if file_ext in _image_ext:
        return [(file_name, ('image', 'unknown', None, path))]
    # ====== text file .txt ====== #
    if file_ext in ('.txt', ):
        return [(file_name, ('txt', 'unknown', None, path))]
    # ====== check if is csv file ====== #
    if file_ext in ('.csv', '.tsv'):
        sep = _infer_separator(path)
        data = []
        # reading the file manually is much faster than numpy.genfromtxt
        with open(path, 'r') as f:
            for line in f:
                line = line[:-1]
                data.append(line.split(sep))
            data = np.array(data, dtype=str)
        return [('.'.join(file_name.split('.')[:-1]), ('csv', data.shape, data,
                                                       path))]
    # ====== check if a file is Data ====== #
    try:
        dtype, shape = read_mmaparray_header(path)
        # ensure read-only mode here,
        # i.e. change only applied in memory, not saved to disk
        kw = dict(mode='c') if read_only else dict()
        data = MmapArray(path, **kw)
        assert np.dtype(dtype) == data.dtype and shape == data.shape, \
          "Metadata mismatch for MmapArray"
        return [(file_name, (data.dtype, data.shape, data, path))]
    except Exception:  # cannot read the header of MmapArray
        pass
    # ====== try to load pickle file if possible ====== #
    try:  # try with unpickling
        with open(path, 'rb') as f:
            data = cPickle.load(f)
            shape_info = 0
            if hasattr(data, 'shape'):
                shape_info = data.shape
            elif hasattr(data, '__len__'):
                shape_info = len(data)
            return [(file_name, (str(data.dtype) if hasattr(data, 'dtype') else
                                 type(data).__name__, shape_info, data, path))]
    except cPickle.UnpicklingError:
        try:  # try again with numpy load
            with open(path, 'rb') as f:
                data = np.load(f)
                return [(file_name,
                         (str(data.dtype)
                          if hasattr(data, 'dtype') else type(data).__name__,
                          len(data) if hasattr(data, '__len__') else 0, data,
                          path))]
        except Exception:
            pass
    # ====== load memmap dict ====== #
    try:
        data = MmapDict(path, read_only=read_only)
        return [(file_name, ('memdict', len(data), data, path))]
    except Exception as e:
        pass
    # ====== load SQLiteDict ====== #
    if '.db' in os.path.splitext(path)[1]:
        try:
            db = SQLiteDict(path, read_only=read_only)
            name = os.path.basename(path).replace('.db', '')
            return [(tab if tab != SQLiteDict._DEFAULT_TABLE else name,
                     ('sqlite', len(db.set_table(tab)), db.as_table(tab),
                      path)) for tab in db.get_all_tables()]
        except Exception as e:
            pass
    # ====== unknown datatype ====== #
    return [(file_name, ('unknown', 'unknown', None, path))]
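`_parse_data_descriptor` returns a list of `(name, (dtype, shape, Data, path))` pairs, or `None` for non-files and ignored files. A small sketch of how such a helper might be used to index a dataset directory; the wrapper function below is hypothetical:

import os

def scan_dataset(ds_path, read_only=True):
    """Hypothetical wrapper: build a name -> (dtype, shape, Data, path) index."""
    descriptors = {}
    for fname in sorted(os.listdir(ds_path)):
        result = _parse_data_descriptor(os.path.join(ds_path, fname), read_only)
        if result is None:  # directories, ignored files, unreadable entries
            continue
        for name, desc in result:
            descriptors[name] = desc
    return descriptors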
Example #10
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r""" Predefined procedure for downloading and preprocessing a 10x dataset into
  a `SingleCellOMIC`, i.e. a scanpy.AnnData object

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html

  """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items() for dsname in j]
    found = []
    for exp, version, dsname in flatten_datasets:
        if name == dsname:
            found.append((exp, version, dsname))
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta       :", found)
            print(" File       :", filename)
            print(" URL        :", url)
            print(" Download   :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose) for info in f
                         if info.isfile()]
        for name, data in MPI(jobs=all_files,
                              func=_read_tarinfo,
                              batch=1,
                              ncpu=4):
            contents[name] = data
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is an ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(
                np.float32)
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            save_metadata = dict(main_omic=OMIC.atac.name,
                                 barcodes=barcodes,
                                 chromatin_var=X_col_name)
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody ID, Antibody Name
            y = X[:, prot_ids]
            y_col = X_col[prot_ids][:, 0]  # the id
            y_col_name = X_col[prot_ids][:, 1]  # the name
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col = X_col[pmhc_ids][:, 0]  # the id
                z_col_name = X_col[pmhc_ids][:, 1]  # the name
            # Gene ID, Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]  # the name
            X_col = X_col[gene_ids][:, 0]  # the id
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for name, data in save_data:
            outpath = os.path.join(preprocessed_path, name)
            n_samples, n_features = data.shape
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    prog = tqdm(f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        indices = sco.get_var_indices()
        markers = (MARKER_GENES
                   if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
        for name in markers:
            idx = indices.get(name, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(
                f"Filtering genes {n_genes} to {sco.shape[1]} variable genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    omics = [
        name for name in os.listdir(preprocessed_path)
        if name not in ('metadata', 'top_genes') and '_' not in name
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        name: MmapArray(os.path.join(preprocessed_path,
                                     name)).astype(np.float32)
        for name in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
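A hedged usage sketch for `read_dataset10x`; the dataset name below is a placeholder, since valid names come from the `all_datasets` registry referenced inside the function:

# '<dataset-name>' is a placeholder for one of the names in all_datasets
sco = read_dataset10x(name='<dataset-name>',
                      filtered_cells=True,   # use the 'filtered' 10x matrices
                      filtered_genes=True,   # keep only the cached top genes
                      override=False,        # reuse the preprocessed cache if present
                      verbose=True)
print(sco.shape)  # SingleCellOMIC behaves like a scanpy AnnData object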
Example #11
mmap.write(X)
print('Writing data to Memmap:', timeit.default_timer() - start, 's')

hdf5.flush()
hdf5.close()
mmap.flush()
mmap.close()

# ====== reading ====== #
print()
start = timeit.default_timer()
hdf5 = h5py.File('tmp.hdf5', 'r')
print('Load HDF5 data  :', timeit.default_timer() - start, 's')

start = timeit.default_timer()
mmap = MmapArray('tmp.mmap')
print('Load Memmap data:', timeit.default_timer() - start, 's')

print()
print('Test correctness of stored data')
print('HDF5  :', np.all(hdf5['X'][:] == X))
print('Memmap:', np.all(mmap[:] == X))

# ====== iterating over dataset ====== #
print()
start = timeit.default_timer()
for epoch in range(0, 3):
    for i in range(0, N, 256):
        x = X[i:i + 256]
print('Iterate Numpy data   :', timeit.default_timer() - start, 's')