def test_write_single_time(self):
  fpath = _get_tempfile()
  array = np.arange(0, 100, dtype='float32').reshape(-1, 5)
  with MmapArrayWriter(path=fpath,
                       shape=array.shape,
                       dtype=array.dtype,
                       remove_exist=True) as f:
    f.write(array)
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))
  with MmapArrayWriter(path=fpath, remove_exist=False) as f:
    f.write(array)
  x = MmapArray(fpath)
  self.assertTrue(np.all(np.concatenate([array, array], axis=0) == x))
def test_write_multiple_time(self):
  fpath = _get_tempfile()
  array = np.arange(0, 1000, dtype='float32').reshape(-1, 2, 5)
  with MmapArrayWriter(path=fpath,
                       shape=(0,) + array.shape[1:],
                       dtype=array.dtype,
                       remove_exist=True) as f:
    for i in range(0, array.shape[0], 8):
      f.write(array[i:i + 8])
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))

  array1 = np.arange(0, 100, dtype='float32').reshape(-1, 2, 5)
  array[10:10 + array1.shape[0]] = array1
  with MmapArrayWriter(path=fpath, remove_exist=False) as f:
    f.write(array1, start_position=10)
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))
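# A minimal sketch of the writer pattern exercised by the two tests above:
# create a file, append to it, overwrite a range, then read it back.
# The file name is illustrative and the `bigarray` import path is an
# assumption, not something stated in this section.
#
#   import numpy as np
#   from bigarray import MmapArray, MmapArrayWriter
#
#   data = np.random.rand(16, 5).astype('float32')
#   with MmapArrayWriter('/tmp/example.mmap', shape=(0, 5),
#                        dtype='float32', remove_exist=True) as w:
#     w.write(data)                    # rows are appended at the end
#   with MmapArrayWriter('/tmp/example.mmap', remove_exist=False) as w:
#     w.write(data, start_position=0)  # overwrite rows starting at index 0
#   print(MmapArray('/tmp/example.mmap').shape)  # (16, 5)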
def __init__(self, path='~/tensorflow_datasets/3dshapes.h5',
             cache_dir=None, seed=8):
  path = os.path.abspath(os.path.expanduser(path))
  assert os.path.exists(path), "Path to file %s must exist" % path
  self.path = path
  if cache_dir is None:
    cache_dir = os.path.dirname(path)
  if not os.path.exists(cache_dir):
    os.mkdir(cache_dir)
  image_path = os.path.join(cache_dir, '3dshapes.images')
  label_path = os.path.join(cache_dir, '3dshapes.labels')
  # ====== read the dataset and cache it again ====== #
  if not os.path.exists(image_path) or not os.path.exists(label_path):
    import h5py
    with h5py.File(path, 'r') as dataset:
      images = dataset['images']
      labels = dataset['labels']
      with MmapArrayWriter(image_path,
                           shape=images.shape,
                           dtype=images.dtype,
                           remove_exist=True) as img, \
           MmapArrayWriter(label_path,
                           shape=labels.shape,
                           dtype=labels.dtype,
                           remove_exist=True) as lab:
        for start, end in tqdm(list(batching(8000, n=images.shape[0])),
                               desc="Caching data"):
          img.write(images[start:end])
          lab.write(labels[start:end])
  # ====== load the data ====== #
  self.images = MmapArray(image_path)
  self.factors = MmapArray(label_path)
  # ====== split the dataset ====== #
  rand = np.random.RandomState(seed=seed)
  n = len(self.images)
  ids = rand.permutation(n)
  # train:85% valid:5% test:10%
  self.train_indices = ids[:int(0.85 * n)]
  self.valid_indices = ids[int(0.85 * n):int(0.9 * n)]
  self.test_indices = ids[int(0.9 * n):]
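# A hedged usage sketch for the constructor above. The enclosing class name
# `Shapes3D` and the HDF5 path are assumptions inferred from the default
# argument; they are not confirmed by this section.
#
#   ds = Shapes3D(path='~/tensorflow_datasets/3dshapes.h5')
#   print(ds.images.shape, ds.factors.shape)
#   train_images = ds.images[ds.train_indices[:32]]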
def test_write_multiprocessing(self):
  fpath = _get_tempfile()
  jobs = [(i, np.random.rand(12, 25, 8), fpath, (300, 25, 8))
          for i in range(25)]
  with Pool(2) as pool:
    pool.map(_fn_write, jobs)
  # checking the output
  array = np.concatenate([x[1] for x in jobs], axis=0)
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))
def test_read_multiprocessing(self):
  fpath = _get_tempfile()
  array = np.random.rand(1200, 25, 8)
  # first write the array
  with MmapArrayWriter(fpath, (None, 25, 8), array.dtype) as f:
    f.write(array)
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))
  # use multiprocessing to randomly read the array
  jobs = [(x,
           sorted(np.random.randint(0, array.shape[0], size=(2,),
                                    dtype='int32')))
          for i in range(25)]
  with Pool(2) as pool:
    for start, end, data in pool.map(_fn_read, jobs):
      data = zlib.decompress(data)
      data = np.frombuffer(data).reshape(-1, 25, 8)
      self.assertTrue(np.all(data == array[start:end]))
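# The helpers `_fn_write` and `_fn_read` used by the two tests above are not
# shown in this section. The sketch below is only consistent with how the
# tests build the job tuples and consume the results; the real
# implementations may differ.
#
#   def _fn_write(job):
#     # job = (index, array, path, full_shape); each worker writes its block
#     # at its own offset so the concatenated result keeps the job order.
#     idx, array, path, shape = job
#     with MmapArrayWriter(path, shape=shape, dtype=array.dtype,
#                          remove_exist=False) as f:
#       f.write(array, start_position=idx * array.shape[0])
#
#   def _fn_read(job):
#     # job = (memmap, [start, end]); return compressed bytes so the parent
#     # process can verify the slice after zlib.decompress + np.frombuffer.
#     x, (start, end) = job
#     data = np.ascontiguousarray(x[start:end])
#     return start, end, zlib.compress(data.tobytes())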
# ====== reading ====== #
print()
start = timeit.default_timer()
with open(numpy_path, 'rb') as f:
  y = np.load(f)
numpy_open_time = timeit.default_timer() - start
print('Load Numpy array:', numpy_open_time, 's')

start = timeit.default_timer()
hdf5 = h5py.File(hdf5_path, 'r')
h5py_open_time = timeit.default_timer() - start
print('Load HDF5 data :', h5py_open_time, 's')

start = timeit.default_timer()
mmap = MmapArray(mmap_path)
mmap_open_time = timeit.default_timer() - start
print('Load Memmap data:', mmap_open_time, 's')

print()
print('Test correctness of stored data')
print('Numpy :', np.all(y == X))
print('HDF5 :', np.all(hdf5['X'][:] == X))
print('Memmap:', np.all(mmap[:] == X))

# ====== iterating over dataset ====== #
print()
start = timeit.default_timer()
for epoch in range(0, 3):
  for i in range(0, N, 256):
    x = X[i:i + 256]
def transform(self, X, indices=None, sad=None,
              save_ivecs=False, keep_stats=False, name=None):
  """
  Parameters
  ----------
  X : ndarray
    Training data [n_samples, n_features]
  indices : {Mapping, tuple, list}
    in case the data is given by a list of files, `indices` acts as a file
    indicator mapping from 'file_name' -> (start_index_in_X, end_index_in_X).
    This mapping can be provided by a dictionary, or a list of tuples.
  sad : ndarray
    inspired by "Speech Activity Detection" (SAD) indexing, this array is an
    indicator of which samples will be taken into training; the shape should
    be [n_samples,] or [n_samples, 1]
  save_ivecs : bool
    if True, save the extracted i-vectors to disk at path `ivec_[name]`;
    if False, return the i-vectors directly without saving
  keep_stats : bool
    if True, keep the zero and first order statistics.
    The first order statistics could consume a huge amount of disk space.
    Otherwise, they are deleted after training
  name : {None, str}
    identity of the i-vectors (for re-using in the future).
    If None, a random name is used
  """
  if not self.is_fitted:
    raise ValueError(
        "Ivector has not been fitted, call Ivector.fit(...) first")
  n_files = X.shape[0] if indices is None else len(indices)
  if name is None:
    name = uuid(length=8)
  else:
    name = str(name)
  # ====== init ====== #
  z_path = self.get_z_path(name)
  f_path = self.get_f_path(name)
  if save_ivecs:
    i_path = self.get_i_path(name)
  else:
    i_path = None
  name_path = self.get_name_path(name)
  # ====== check existing i-vector file ====== #
  if i_path is not None and os.path.exists(i_path):
    ivec = MmapArray(path=i_path)
    assert ivec.shape[0] == n_files and ivec.shape[1] == self.tv_dim, \
        "Need i-vectors for %d files, found existing data at path:'%s' with shape:%s" % \
        (n_files, i_path, ivec.shape)
    return ivec
  # ====== extract Z and F ====== #
  if os.path.exists(z_path) and os.path.exists(f_path):
    pass
  else:
    if os.path.exists(z_path):
      os.remove(z_path)
    if os.path.exists(f_path):
      os.remove(f_path)
    if os.path.exists(name_path):
      os.remove(name_path)
    _extract_zero_and_first_stats(X=X, sad=sad, indices=indices,
                                  gmm=self.gmm,
                                  z_path=z_path, f_path=f_path,
                                  name_path=name_path)
  Z = MmapArray(path=z_path)
  F = MmapArray(path=f_path)
  # ====== extract i-vectors ====== #
  ivec = self.tmat.transform_to_disk(path=i_path, Z=Z, F=F, dtype='float32')
  # ====== clean ====== #
  Z.close()
  F.close()
  if not keep_stats:
    if os.path.exists(z_path):
      os.remove(z_path)
    if os.path.exists(f_path):
      os.remove(f_path)
  else:
    print("Zero-order stats saved at:", ctext(z_path, 'cyan'))
    print("First-order stats saved at:", ctext(f_path, 'cyan'))
  return ivec
def fit(self, X, indices=None, sad=None,
        refit_gmm=False, refit_tmat=False,
        extract_ivecs=False, keep_stats=False):
  """
  Parameters
  ----------
  X : ndarray
    Training data [n_samples, n_features]
  indices : {Mapping, tuple, list}
    in case the data is given by a list of files, `indices` acts as a file
    indicator mapping from 'file_name' -> (start_index_in_X, end_index_in_X).
    This mapping can be provided by a dictionary, or a list of tuples.
    Note: the order provided in `indices` will be preserved
  sad : ndarray
    inspired by "Speech Activity Detection" (SAD) indexing, this array is an
    indicator of which samples will be taken into training; the shape should
    be [n_samples,] or [n_samples, 1]
  refit_gmm : bool
    if True, re-fit the GMM even though it is fitted; consequently,
    the T-matrix will be re-fitted
  refit_tmat : bool
    if True, re-fit the T-matrix even though it is fitted
  extract_ivecs : bool
    if True, extract the i-vectors for the training data
  keep_stats : bool
    if True, keep the zero and first order statistics.
    The first order statistics could consume a huge amount of disk space.
    Otherwise, they are deleted after training
  """
  new_gmm = (not self.gmm.is_fitted or refit_gmm)
  # ====== clean error files ====== #
  if os.path.exists(self.z_path):
    Z = MmapArray(self.z_path)
    if Z.shape[0] == 0:  # empty file
      os.remove(self.z_path)
    Z.close()
  if os.path.exists(self.f_path):
    F = MmapArray(self.f_path)
    if F.shape[0] == 0:  # empty file
      os.remove(self.f_path)
    F.close()
  if os.path.exists(self.ivec_path):
    ivec = MmapArray(self.ivec_path)
    if ivec.shape[0] == 0:  # empty file
      os.remove(self.ivec_path)
    ivec.close()
  # ====== Training the GMM first ====== #
  if new_gmm:
    input_data = [X]
    if sad is not None:
      input_data.append(sad)
    if indices is not None:
      input_data.append(indices)
    self.gmm.fit(input_data)
  # ====== some fun, and confusing logics ====== #
  # the GMM needs to be fitted before creating the T-matrix model
  new_tmat = (not self.tmat.is_fitted or new_gmm or refit_tmat)
  # new i-vectors are needed only when:
  # - `extract_ivecs=True`
  # - and a new T-matrix is trained but no i-vector is extracted yet
  new_ivec = extract_ivecs and \
      (new_tmat or not os.path.exists(self.ivec_path))
  # new stats are only needed when:
  # - the GMM is updated
  # - a new T-matrix is trained and Z and F do not exist
  # - new i-vectors are extracted and Z and F do not exist
  if not new_gmm and \
      (os.path.exists(self.z_path) and os.path.exists(self.f_path)):
    new_stats = False
  else:
    new_stats = new_gmm or new_tmat or new_ivec
  # ====== extract the statistics ====== #
  if new_stats:
    _extract_zero_and_first_stats(X=X, sad=sad, indices=indices,
                                  gmm=self.gmm,
                                  z_path=self.z_path,
                                  f_path=self.f_path,
                                  name_path=self.name_path)
  # ====== Training the T-matrix and extract i-vectors ====== #
  if new_tmat or new_ivec:
    Z = MmapArray(path=self.z_path)
    F = MmapArray(path=self.f_path)
    if new_tmat:
      self.tmat.fit((Z, F))
    if new_ivec:
      self.tmat.transform_to_disk(path=self.ivec_path, Z=Z, F=F,
                                  dtype='float32', device='gpu',
                                  override=True)
    Z.close()
    F.close()
  # ====== clean ====== #
  if not keep_stats:
    if os.path.exists(self.z_path):
      os.remove(self.z_path)
    if os.path.exists(self.f_path):
      os.remove(self.f_path)
  return self
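# A hedged end-to-end sketch of how `fit` and `transform` above are meant to
# be used together. The feature array, the `indices` mapping, and the file
# names are illustrative assumptions, not values taken from this section.
#
#   # acoustic features stacked over all utterances: [n_samples, n_features]
#   X = np.load('features.npy')
#   # per-file ranges into X: 'file_name' -> (start_index, end_index)
#   indices = {'utt_001': (0, 1200), 'utt_002': (1200, 2750)}
#   ivector = Ivector(...)  # constructor arguments omitted
#   ivector.fit(X, indices=indices, extract_ivecs=True)
#   ivecs = ivector.transform(X, indices=indices,
#                             save_ivecs=True, name='train')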
def _parse_data_descriptor(path, read_only):
  """ Return mapping: name -> (dtype, shape, Data, path) """
  if not os.path.isfile(path):
    return None
  file_ext = os.path.splitext(path)[-1].lower()
  file_name = os.path.basename(path)
  # ====== ignore ====== #
  if os.path.basename(path) in _ignore_files:
    return None
  # ====== audio file ====== #
  if file_ext in _audio_ext:
    return [(file_name, ('audio', 'unknown', None, path))]
  # ====== image file ====== #
  if file_ext in _image_ext:
    return [(file_name, ('image', 'unknown', None, path))]
  # ====== text file .txt ====== #
  if file_ext in ('.txt',):
    return [(file_name, ('txt', 'unknown', None, path))]
  # ====== check if it is a csv file ====== #
  if file_ext in ('.csv', '.tsv'):
    sep = _infer_separator(path)
    data = []
    # reading the file manually is much faster than numpy.genfromtxt
    with open(path, 'r') as f:
      for line in f:
        line = line[:-1]
        data.append(line.split(sep))
    data = np.array(data, dtype=str)
    return [('.'.join(file_name.split('.')[:-1]),
             ('csv', data.shape, data, path))]
  # ====== check if the file is a MmapArray ====== #
  try:
    dtype, shape = read_mmaparray_header(path)
    # ensure read-only mode here, i.e. changes are only applied in memory,
    # not saved to disk
    kw = dict(mode='c') if read_only else dict()
    data = MmapArray(path, **kw)
    assert np.dtype(dtype) == data.dtype and shape == data.shape, \
        "Metadata mismatch for MmapArray"
    return [(file_name, (data.dtype, data.shape, data, path))]
  except Exception:  # cannot read the header of MmapArray
    pass
  # ====== try to load pickle file if possible ====== #
  try:  # try with unpickling
    with open(path, 'rb') as f:
      data = cPickle.load(f)
    shape_info = 0
    if hasattr(data, 'shape'):
      shape_info = data.shape
    elif hasattr(data, '__len__'):
      shape_info = len(data)
    return [(file_name,
             (str(data.dtype) if hasattr(data, 'dtype') else
              type(data).__name__,
              shape_info, data, path))]
  except cPickle.UnpicklingError:
    try:  # try again with numpy load
      with open(path, 'rb') as f:
        data = np.load(f)
      return [(file_name,
               (str(data.dtype) if hasattr(data, 'dtype') else
                type(data).__name__,
                len(data) if hasattr(data, '__len__') else 0, data, path))]
    except Exception:
      pass
  # ====== load memmap dict ====== #
  try:
    data = MmapDict(path, read_only=read_only)
    return [(file_name, ('memdict', len(data), data, path))]
  except Exception as e:
    pass
  # ====== load SQLiteDict ====== #
  if '.db' in os.path.splitext(path)[1]:
    try:
      db = SQLiteDict(path, read_only=read_only)
      name = os.path.basename(path).replace('.db', '')
      return [(tab if tab != SQLiteDict._DEFAULT_TABLE else name,
               ('sqlite', len(db.set_table(tab)), db.as_table(tab), path))
              for tab in db.get_all_tables()]
    except Exception as e:
      pass
  # ====== unknown datatype ====== #
  return [(file_name, ('unknown', 'unknown', None, path))]
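# A hedged example of the descriptor format returned above; the file names,
# dtypes, and shapes are illustrative, not real outputs.
#
#   _parse_data_descriptor('/data/x.mmap', read_only=True)
#   # -> [('x.mmap', (dtype('float32'), (1000, 80), <MmapArray>, '/data/x.mmap'))]
#   _parse_data_descriptor('/data/labels.csv', read_only=True)
#   # -> [('labels', ('csv', (1000, 2), <ndarray of str>, '/data/labels.csv'))]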
def read_dataset10x(name, filtered_cells=True, filtered_genes=True,
                    override=False, verbose=True) -> SingleCellOMIC:
  r""" Predefined procedure for downloading and preprocessing a 10x dataset
  into `SingleCellOMIC`, i.e. a scanpy.AnnData object.

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html
  """
  ### prepare the URL
  name = str(name).lower().strip()
  spec = 'filtered' if filtered_cells else 'raw'
  flatten_datasets = [(exp, version, dsname)
                      for exp, i in all_datasets.items()
                      for version, j in i.items()
                      for dsname in j]
  found = []
  for exp, version, dsname in flatten_datasets:
    if name == dsname:
      found.append((exp, version, dsname))
  if not found:
    raise ValueError(f"Cannot find data with name {name}, "
                     f"all available datasets are: {flatten_datasets}")
  if len(found) > 1:
    raise RuntimeError(f"Found multiple datasets {found} with name='{name}'")
  exp, version, name = found[0]
  dataset_name = name + '_' + spec
  url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
  ### prepare the output path
  filename = os.path.basename(url)
  # download path
  download_path = os.path.join(DOWNLOAD_DIR, exp, version)
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  # preprocessing path
  preprocessed_path = os.path.join(DATA_DIR,
                                   f'10x_{exp}_{name}_{spec}_preprocessed')
  if override and os.path.exists(preprocessed_path):
    if verbose:
      print("Overriding path: %s" % preprocessed_path)
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if len(os.listdir(preprocessed_path)) == 0:
    if verbose:
      print("Dataset10X:")
      print(" Meta       :", found)
      print(" File       :", filename)
      print(" URL        :", url)
      print(" Download   :", download_path)
      print(" Preprocess :", preprocessed_path)
    ### download the tar file
    path = download_file(url=url,
                         filename=os.path.join(download_path, filename),
                         override=False,
                         md5=_MD5.get(f"{exp}*{version}*{name}*{spec}", None))
    if not tarfile.is_tarfile(path):
      raise RuntimeError("Expecting tarfile but received: %s" % path)
    contents = {}
    with tarfile.open(path, mode="r:gz") as f:
      all_files = [(path, info.name, info.size, verbose)
                   for info in f
                   if info.isfile()]
    for name, data in MPI(jobs=all_files, func=_read_tarinfo,
                          batch=1, ncpu=4):
      contents[name] = data
    # cell barcodes
    barcodes = contents['barcodes']
    ### cell-atac
    if exp == 'cell-atac':
      n_top_genes = 20000  # this is an ad-hoc value
      X = contents['matrix'].T.todense()
      peaks = contents['peaks']
      X_peaks = (peaks[:, 2].astype(np.float32) -
                 peaks[:, 1].astype(np.float32))
      X_col_name = np.array([':'.join(i) for i in peaks])
      save_data = [(OMIC.atac.name, X)]
      save_metadata = dict(main_omic=OMIC.atac.name,
                           barcodes=barcodes,
                           chromatin_var=X_col_name)
      sco = SingleCellOMIC(X,
                           cell_id=barcodes,
                           gene_id=X_col_name,
                           omic=OMIC.atac,
                           name=name)
    ### cell-exp and cell-vdj
    elif exp in ('cell-exp', 'cell-vdj'):
      n_top_genes = 2000
      # feature (Id, Name, Type(antibody or gene-expression))
      X_col = (contents['features']
               if 'features' in contents else contents['genes'])
      # data matrix
      X = contents['matrix'].T
      if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
        X = X.tocsr()
      X = X.astype('float32')
      assert X.shape[0] == barcodes.shape[0] and X.shape[1] == X_col.shape[0]
      # antibody and gene are provided
      prot_ids = []
      pmhc_ids = []
      gene_ids = []
      if X_col.shape[1] == 3:
        for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
          if feat_type == 'Antibody Capture':
            if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
              pmhc_ids.append(idx)
            else:
              prot_ids.append(idx)
          elif feat_type == 'Gene Expression':
            gene_ids.append(idx)
          else:
            raise ValueError(
                f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}")
      elif X_col.shape[1] == 2:
        gene_ids = slice(None, None)
      else:
        raise ValueError(f"No support for features matrix\n{X_col}")
      # Antibody ID, Antibody Name
      y = X[:, prot_ids]
      y_col = X_col[prot_ids][:, 0]  # the id
      y_col_name = X_col[prot_ids][:, 1]  # the name
      # pMHC peptide
      if len(pmhc_ids) > 0:
        z = X[:, pmhc_ids]
        z_col = X_col[pmhc_ids][:, 0]  # the id
        z_col_name = X_col[pmhc_ids][:, 1]  # the name
      # Gene ID, Gene Name
      X = X[:, gene_ids].todense()
      X_col_name = X_col[gene_ids][:, 1]  # the name
      X_col = X_col[gene_ids][:, 0]  # the id
      assert np.min(X) >= 0 and np.max(X) < 65000, \
          f"Only support uint16 data type, given data with max={np.max(X)}"
      # data and metadata
      sco = SingleCellOMIC(X,
                           cell_id=barcodes,
                           gene_id=X_col_name,
                           omic=OMIC.transcriptomic,
                           name=name)
      save_data = [(OMIC.transcriptomic.name, X), (OMIC.proteomic.name, y)]
      save_metadata = {
          'main_omic': OMIC.transcriptomic.name,
          'barcodes': barcodes,
          f"{OMIC.transcriptomic.name}_var": X_col_name,
          f"{OMIC.proteomic.name}_var": y_col_name
      }
      if len(pmhc_ids) > 0:
        save_data.append((OMIC.pmhc.name, z))
        save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
    ### others
    else:
      raise NotImplementedError(f"No support for experiment: {exp}")
    ### save data and metadata
    for name, data in save_data:
      outpath = os.path.join(preprocessed_path, name)
      n_samples, n_features = data.shape
      if n_samples == 0 or n_features == 0:
        continue
      with MmapArrayWriter(outpath,
                           shape=(0, n_features),
                           dtype=np.uint16,
                           remove_exist=True) as f:
        if verbose:
          prog = tqdm(desc=f"Saving {outpath}", total=n_samples,
                      unit='samples')
        for s, e in batching(batch_size=5120, n=n_samples):
          x = data[s:e]
          if hasattr(x, 'todense'):
            x = x.todense()
          f.write(x)
          if verbose:
            prog.update(e - s)
        if verbose:
          prog.clear()
          prog.close()
    # save metadata
    outpath = os.path.join(preprocessed_path, 'metadata')
    with open(outpath, 'wb') as f:
      pickle.dump(save_metadata, f)
    if verbose:
      print(f"Saved metadata to path {outpath}")
    ### filter genes, follow 10x and use the Cell Ranger recipe,
    # this is copied from Scanpy
    n_genes = sco.shape[1]
    sc.pp.filter_genes(sco, min_counts=1)
    # normalize with total UMI count per cell
    sc.pp.normalize_total(sco, key_added='n_counts_all')
    filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                  flavor='cell_ranger',
                                                  n_top_genes=n_top_genes,
                                                  log=False)
    gene_subset = filter_result.gene_subset
    indices = sco.get_var_indices()
    markers = (MARKER_GENES
               if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
    for name in markers:
      idx = indices.get(name, None)
      if idx is not None:
        gene_subset[idx] = True
    sco._inplace_subset_var(gene_subset)  # filter genes
    if verbose:
      print(f"Filtering genes {n_genes} to {sco.shape[1]} variable genes.")
    with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
      pickle.dump(sco.var_names.values, f)
  # ******************** load and return the dataset ******************** #
  omics = [
      name for name in os.listdir(preprocessed_path)
      if name not in ('metadata', 'top_genes') and '_' not in name
  ]
  with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
    metadata = pickle.load(f)
  with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
    top_genes = pickle.load(f)
  data = {
      name: MmapArray(os.path.join(preprocessed_path,
                                   name)).astype(np.float32)
      for name in omics
  }
  main_omic = metadata['main_omic']
  X = data[main_omic]
  var_names = metadata[f'{main_omic}_var']
  if filtered_genes:
    var_ids = {j: i for i, j in enumerate(var_names)}
    ids = [var_ids[i] for i in top_genes]
    X = X[:, ids]
    var_names = var_names[ids]
  sco = SingleCellOMIC(
      X,
      cell_id=metadata['barcodes'],
      gene_id=var_names,
      omic=main_omic,
      name=f"{dataset_name}{'' if filtered_genes else 'all'}")
  for o in omics:
    if o != main_omic:
      sco.add_omic(omic=o,
                   X=data[o],
                   var_names=np.asarray(metadata[f'{o}_var']))
  return sco
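# A hedged usage sketch for `read_dataset10x` above; '<dataset_name>' is a
# placeholder for one of the names collected in `all_datasets`.
#
#   sco = read_dataset10x('<dataset_name>', filtered_cells=True,
#                         filtered_genes=True, verbose=True)
#   print(sco)  # a SingleCellOMIC (scanpy.AnnData) object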
mmap.write(X)
print('Writing data to Memmap:', timeit.default_timer() - start, 's')

hdf5.flush()
hdf5.close()
mmap.flush()
mmap.close()
# ====== reading ====== #
print()
start = timeit.default_timer()
hdf5 = h5py.File('tmp.hdf5', 'r')
print('Load HDF5 data :', timeit.default_timer() - start, 's')

start = timeit.default_timer()
mmap = MmapArray('tmp.mmap')
print('Load Memmap data:', timeit.default_timer() - start, 's')

print()
print('Test correctness of stored data')
print('HDF5 :', np.all(hdf5['X'][:] == X))
print('Memmap:', np.all(mmap[:] == X))

# ====== iterating over dataset ====== #
print()
start = timeit.default_timer()
for epoch in range(0, 3):
  for i in range(0, N, 256):
    x = X[i:i + 256]
print('Iterate Numpy data :', timeit.default_timer() - start, 's')