def get_dataset(clazz, ext='', override=False):
  """Download, decrypt and open the dataset described by ``clazz``.

  A previously unpacked, non-empty output folder is reused as-is unless
  ``override`` is True.  Otherwise the encrypted zip is fetched (or taken
  from the local cache), extracted, and deleted afterwards.

  Parameters
  ----------
  clazz : a DataLoader subclass providing get_name/get_zip_path/get_ds_path
  ext : str
      Name suffix forwarded to the ``clazz`` path helpers.
  override : bool
      If True, discard both the unpacked folder and any cached zip.

  Returns
  -------
  Dataset (read-only)
  """
  # Resolve every path up-front: remote URL, local zip cache, output folder.
  name = clazz.get_name(ext) + '.zip'
  path = base64.decodebytes(DataLoader.ORIGIN).decode() + name
  zip_path = clazz.get_zip_path(ext)
  out_path = clazz.get_ds_path(ext)
  # The output location must be a directory (or not exist at all).
  if os.path.isfile(out_path):
    raise RuntimeError("Found a file at path: %s, we need a folder "
                       "to unzip downloaded files." % out_path)
  if os.path.isdir(out_path):
    reusable = not override and len(os.listdir(out_path)) > 0
    if reusable:
      return Dataset(out_path, read_only=True)
    # Empty or overridden: wipe it so extraction starts clean.
    shutil.rmtree(out_path)
  # Fetch the encrypted archive unless a cached copy can be reused.
  if override and os.path.exists(zip_path):
    os.remove(zip_path)
  if not os.path.exists(zip_path):
    get_file(name, path, DataLoader.BASE_DIR)
  # Decrypt + extract, open the result, then drop the archive to save space.
  unzip_aes(in_path=zip_path, out_path=out_path)
  dataset = Dataset(out_path, read_only=True)
  if os.path.exists(zip_path):
    os.remove(zip_path)
  return dataset
def read_CITEseq_CBMC(filtered_genes: bool = True,
                      override: bool = False,
                      verbose: bool = True):
  """Load the CITE-seq CBMC dataset as a `SingleCellOMIC` object.

  On first call the encrypted archive is downloaded, decrypted and
  preprocessed into `_CITEseq_CBMC_PREPROCESSED`; subsequent calls read the
  cached on-disk `Dataset` back directly.

  Parameters
  ----------
  filtered_genes : bool
      If True, subset the genes to the highly-dispersed set (up to 2000
      genes) selected during preprocessing and stored in the 'top_genes'
      pickle.
  override : bool
      If True, delete and rebuild the preprocessed folder.
      NOTE(review): the downloaded zip is still cached
      (`download_file(..., override=False)`), so only preprocessing is
      redone — confirm this is intended.
  verbose : bool
      Print progress messages.

  Returns
  -------
  SingleCellOMIC
      Transcriptomic matrix (cells x genes) with an additional 'proteomic'
      omic (cells x proteins) attached.
  """
  download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  preprocessed_path = _CITEseq_CBMC_PREPROCESSED
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  elif override:
    if verbose:
      print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
    shutil.rmtree(_CITEseq_CBMC_PREPROCESSED)
    os.mkdir(_CITEseq_CBMC_PREPROCESSED)
  # ******************** preprocessed data NOT found ******************** #
  # Presence of the 'X' file marks a completed preprocessing run.
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, X_row, X_col = [], None, None
    y, y_row, y_col = [], None, None
    # ====== download the data ====== #
    # _URL is stored base64-encoded at module level; decode to the real URL.
    url = str(base64.decodebytes(_URL), 'utf-8')
    base_name = os.path.basename(url)
    zip_path = os.path.join(download_path, base_name)
    download_file(filename=zip_path,
                  url=url,
                  override=False,
                  md5=r"beb76d01a67707c61c21bfb188e1b69f")
    # ====== extract the data ====== #
    # crypto.unzip_aes yields (member_name, raw_bytes) pairs from the
    # password-protected archive; members are either .npz sparse matrices
    # or .csv string tables.
    data_dict = {}
    for name, data in crypto.unzip_aes(zip_path,
                                       password=_PASSWORD,
                                       verbose=False):
      base_name = os.path.splitext(name)[0]
      if '.npz' in name:
        data = sp.sparse.load_npz(BytesIO(data)).todense()
      elif '.csv' in name:
        data = np.loadtxt(StringIO(str(data, 'utf-8')),
                          dtype=str, delimiter=',')
      else:
        raise RuntimeError("Unknown format: %s" % name)
      data_dict[base_name] = data
    # ====== post-processing ====== #
    # X: gene counts (cells x genes); drop genes that are all-zero.
    X = np.array(data_dict['X'].astype('float32'))
    X_row, X_col = data_dict['X_row'], data_dict['X_col']
    X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
    assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
    # y: protein (antibody) counts (cells x proteins).
    y = data_dict['y'].astype('float32')
    y_row, y_col = data_dict['y_row'], data_dict['y_col']
    assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
    # Both matrices must describe the same cells in the same order.
    assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
    # save data
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path, X, X_col, y, y_col,
                    rowname=X_row, print_log=verbose)
    # Select highly-variable genes with scanpy on a throwaway copy and
    # persist only the chosen gene names for later filtering.
    sco = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
    sc.pp.filter_cells(sco, min_genes=200)
    sc.pp.filter_genes(sco, min_cells=3)
    sc.pp.normalize_total(sco, target_sum=1e4)
    result = sc.pp.filter_genes_dispersion(sco.X,
                                           min_mean=0.0125, max_mean=3,
                                           min_disp=0.5, log=False,
                                           n_top_genes=2000)
    sco._inplace_subset_var(result.gene_subset)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
      pickle.dump(set(sco.var_names.values), f)
    del sco
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  sco = SingleCellOMIC(
      X=ds['X'],
      cell_id=ds['X_row'],
      gene_id=ds['X_col'],
      omic='transcriptomic',
      name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
  ).add_omic('proteomic', ds['y'], ds['y_col'])
  if filtered_genes:
    # Restrict to the gene set chosen during preprocessing.
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
      top_genes = pickle.load(f)
    sco._inplace_subset_var([i in top_genes for i in sco.var_names])
  return sco
def read_FACS(n_protein, override=False, verbose=False):
  """Load the FACS CITE-seq dataset with ``n_protein`` surface proteins.

  On first call the encrypted archive is downloaded, decrypted and
  preprocessed into a cache folder; subsequent calls open the cached
  on-disk `Dataset` directly.

  Parameters
  ----------
  n_protein : int
      Number of measured proteins; must be 2 or 5 (selects both the
      download URL and the cache folder).
  override : bool
      If True, delete any existing preprocessed folder (and cached zip)
      and rebuild everything.
  verbose : bool
      Print progress messages.

  Returns
  -------
  Dataset (read-only)
      Contains X/X_row/X_col (gene counts) and y/y_col (protein counts).
  """
  download_path = os.path.join(DOWNLOAD_DIR, "FACS_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  n_protein = int(n_protein)
  assert n_protein in (2, 5)
  preprocessed_path = _FACS_PREPROCESSED % n_protein
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  elif override:
    # Consistency with read_CITEseq_CBMC: announce the rebuild.
    if verbose:
      print("Overriding path: %s" % preprocessed_path)
    shutil.rmtree(preprocessed_path)
    os.mkdir(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  # Presence of the 'X' file marks a completed preprocessing run.
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    # ====== download the data ====== #
    # _URL is stored base64-encoded; it contains a %d slot for n_protein.
    url = str(base64.decodebytes(_URL), 'utf-8') % n_protein
    base_name = os.path.basename(url)
    zip_path = os.path.join(download_path, base_name)
    # BUGFIX: previously urlretrieve ran unconditionally, re-downloading
    # the archive on every preprocessing run.  Reuse the cached zip when
    # present (matching get_dataset), and drop it first on override.
    if override and os.path.exists(zip_path):
      os.remove(zip_path)
    if not os.path.exists(zip_path):
      urlretrieve(url=url, filename=zip_path)
    # ====== extract the data ====== #
    # crypto.unzip_aes yields (member_name, raw_bytes) pairs; members are
    # either .npz sparse matrices or .csv string tables.
    data_dict = {}
    for name, data in crypto.unzip_aes(zip_path,
                                       password=_PASSWORD,
                                       verbose=False):
      base_name = os.path.splitext(name)[0]
      if '.npz' in name:
        data = sp.sparse.load_npz(BytesIO(data)).todense()
      elif '.csv' in name:
        data = np.loadtxt(StringIO(str(data, 'utf-8')),
                          dtype=str, delimiter=',')
      else:
        raise RuntimeError("Unknown format: %s" % name)
      data_dict[base_name] = data
      if verbose:
        print('%-12s' % base_name, ':', data.shape)
    # ====== post-processing ====== #
    # X: gene counts (cells x genes); densified to a plain ndarray.
    X = np.array(data_dict['X'].astype('float32'))
    X_row, X_col = data_dict['X_row'], data_dict['X_col']
    assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
    # y: protein counts (cells x proteins).
    y = data_dict['y'].astype('float32')
    y_row, y_col = data_dict['y_row'], data_dict['y_col']
    assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
    # Both matrices must describe the same cells in the same order.
    assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
    # ====== filter zero columns ====== #
    X, X_col = remove_allzeros_columns(matrix=X, colname=X_col,
                                       print_log=verbose)
    save_to_dataset(path=preprocessed_path,
                    X=X, X_col=X_col,
                    y=y, y_col=y_col,
                    rowname=X_row, print_log=verbose)
  # ******************** read preprocessed data ******************** #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds