def read_Hemato(override=False, verbose=False):
  """Load the HEMATO dataset, building the preprocessed cache if needed.

  The cache lives in ``<DATA_DIR>/HEMATO_preprocessed``. When the file ``X``
  is missing from that folder, the data is copied from scVI's
  ``HematoDataset`` and stored with ``_save_data_to_path``; three extra
  pickle files (``labels_name``, ``labels_bin``, ``labels_val``) holding a
  two-class version of the labels are written next to it.

  Parameters
  ----------
  override : bool
    If True, the cache folder is removed and re-created before anything
    else, which forces the data to be copied from scVI again.
  verbose : bool
    Forwarded unchanged to ``_save_data_to_path``.

  Returns
  -------
  Dataset
    ``Dataset(preprocessed_path, read_only=True)`` over the cache folder.

  Raises
  ------
  RuntimeError
    If the cache has to be built and ``scvi.dataset`` cannot be imported.
  """
  cache_dir = select_path(os.path.join(DATA_DIR, 'HEMATO_preprocessed'),
                          create_new=True)
  if override:
    # start from an empty folder so the 'X' check below fails
    shutil.rmtree(cache_dir)
    os.mkdir(cache_dir)

  # ====== build the cache from scVI when it is missing ====== #
  is_cached = os.path.exists(os.path.join(cache_dir, 'X'))
  if not is_cached:
    try:
      from scvi.dataset import HematoDataset
    except ImportError:
      raise RuntimeError("Require `scVI` package for HEMATO dataset")

    hemato = HematoDataset(save_path=os.path.join(DOWNLOAD_DIR, 'HEMATO/'))

    X = hemato._X
    gene_names = np.array(hemato.gene_names)
    assert len(gene_names) == X.shape[1]

    # every meta column except the first one is used as a label column
    y = hemato.meta.values[:, 1:]
    label_names = np.array(hemato.cell_types_levels)
    assert len(label_names) == y.shape[1]

    n_cells = X.shape[0]
    cell_names = np.array(['Cell#%d' % idx for idx in range(n_cells)])

    _save_data_to_path(cache_dir, X, y, gene_names, label_names, cell_names,
                       verbose)

    # ====== two-class labels for testing ====== #
    binary_names = np.array(["Erythroblasts", "Granulocytes"])
    raw_labels = hemato.labels
    lowest = np.min(raw_labels)
    highest = np.max(raw_labels)
    # min-max rescaling of the raw labels into the range [-1, 1]
    y_val = 2 * (raw_labels - lowest) / (highest - lowest) - 1
    # class index = whichever of the two meta columns is larger
    er_column = hemato.meta.iloc[:, 1].values[:, None]  # Er
    gr_column = hemato.meta.iloc[:, 2].values[:, None]  # Gr
    y_bin = np.argmax(np.hstack((er_column, gr_column)), axis=-1)

    extra_files = (
        ('labels_name', binary_names),
        ('labels_bin', y_bin),
        ('labels_val', y_val),
    )
    for file_name, payload in extra_files:
      with open(os.path.join(cache_dir, file_name), 'wb') as f:
        pickle.dump(payload, f)

  # ====== read preprocessed data ====== #
  return Dataset(cache_dir, read_only=True)
def _read_scvi_dataset(name, clazz_name, override, verbose):
  """Load a dataset shipped with scVI, building the preprocessed cache if needed.

  The cache lives in ``<DATA_DIR>/<name>_preprocessed``. When the file ``X``
  is missing from that folder, the class ``clazz_name`` is looked up in
  ``scvi.dataset``, instantiated with ``save_path=DOWNLOAD_DIR`` and its
  content is stored with ``_save_data_to_path``.

  Parameters
  ----------
  name : str
    Dataset name; used for the cache folder name and in the error message.
  clazz_name : str
    Name of the attribute of ``scvi.dataset`` to instantiate.
  override : bool
    If True, the cache folder is removed and re-created first, which forces
    the data to be copied from scVI again.
  verbose : bool
    Forwarded unchanged to ``_save_data_to_path``.

  Returns
  -------
  Dataset
    ``Dataset(preprocessed_path, read_only=True)`` over the cache folder.

  Raises
  ------
  RuntimeError
    If the cache has to be built and ``scvi.dataset`` cannot be imported.
  """
  preprocessed_path = select_path(
      os.path.join(DATA_DIR, '%s_preprocessed' % name), create_new=True)
  if override:
    # start from an empty folder so the 'X' check below fails
    shutil.rmtree(preprocessed_path)
    os.mkdir(preprocessed_path)
  # ====== copy the dataset from scVI ====== #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    try:
      import scvi.dataset as scvi_dataset
    except ImportError:
      # report the dataset actually requested (the message used to always
      # say "PBMC" regardless of `name`)
      raise RuntimeError("Require `scVI` package for %s dataset" % name)
    clazz = getattr(scvi_dataset, clazz_name)
    gene_dataset = clazz(save_path=DOWNLOAD_DIR)

    X = gene_dataset._X
    # objects exposing `todense` are converted to a plain numpy array
    if hasattr(X, 'todense'):
      X = np.array(X.todense())

    gene_names = np.array(gene_dataset.gene_names)
    # convert gene identifier to gene symbol (i.e. name)
    if hasattr(gene_dataset, 'de_metadata'):
      from sisua.data.utils import get_gene_id2name
      meta = gene_dataset.de_metadata
      converter = dict(zip(meta.ENSG, meta.GS))
      pbmc8kconverter = get_gene_id2name()
      # the mapping from `get_gene_id2name` takes priority; the dataset's own
      # ENSG -> GS table is only consulted for identifiers it does not contain
      gene_names = np.array([
          pbmc8kconverter[i] if i in pbmc8kconverter else converter[i]
          for i in gene_names
      ])
    assert len(gene_names) == X.shape[1]

    label_names = np.array(gene_dataset.cell_types)
    y = one_hot(gene_dataset.labels.ravel(), nb_classes=len(label_names))
    assert len(label_names) == y.shape[1]

    cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])
    _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                       cell_names, verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
import os
import pickle

import numpy as np
from scipy.io import savemat

from odin import fuel as F
from odin import visual as V
from odin.utils import (ctext, Progbar, get_exppath, select_path)
from odin.stats import (train_valid_test_split, sampling_iter)
from odin.preprocessing.signal import segment_axis

# ====== names of the file lists ====== #
# drop the '_quarter' suffix to work with the full training data
FILE_LIST = "voxceleb_files_quarter"
TRAIN_LIST = "voxceleb_sys_train_with_labels_quarter"
TRIAL_LIST = "voxceleb_trials"

# ====== paths ====== #
HOME_PATH = os.path.expanduser('~')
# location of the 'voxceleb1_wav' folder, picked from a fixed list of
# candidates (nothing is created here)
PATH_TO_WAV = select_path(
    '/media/data2/SRE_DATA/voxceleb',
    '/mnt/sdb1/SRE_DATA/voxceleb',
    os.path.join(HOME_PATH, 'data', 'voxceleb'),
    os.path.join(HOME_PATH, 'voxceleb'),
    create_new=False)
# folder holding the experiment results
PATH_EXP = get_exppath('voxceleb')
# folder receiving the acoustic features, created on first use
PATH_ACOUSTIC_FEAT = os.path.join(PATH_EXP, 'voxceleb_feat')
if not os.path.exists(PATH_ACOUSTIC_FEAT):
  os.mkdir(PATH_ACOUSTIC_FEAT)

# ====== load the file list ====== #
ds = F.load_voxceleb_list()
# dictionary mapping 'file_path' -> 'file_name'
WAV_FILES = dict()
EXP_DIR = get_exppath('sre', override=False)

# ====== sub-folders of the experiment directory ====== #
# vectors : extracted vectors for training the backend and extracting scores
VECTORS_DIR = os.path.join(EXP_DIR, 'vectors')
# results : the results
RESULT_DIR = os.path.join(EXP_DIR, 'results')
# analysis : the analysis
ANALYSIS_DIR = os.path.join(EXP_DIR, 'analysis')
# create whichever of them does not exist yet (same order as listed above)
for _folder in (VECTORS_DIR, RESULT_DIR, ANALYSIS_DIR):
  if not os.path.exists(_folder):
    os.mkdir(_folder)
del _folder  # keep the loop variable out of the module namespace

# ====== raw data ====== #
PATH_BASE = select_path(
    '/media/data2/SRE_DATA',
    '/mnt/sda1/SRE_DATA',
    '/mnt/sdb1/SRE_DATA',
    default='')
# path to the directory that contains the following folders:
###############
# * fisher
# * mx6
# * sre04
# * sre05
# * sre06
# * sre08
# * sre10
# * swb
# * voxceleb1
# * voxceleb2
###############
# * musan
import pickle

import numpy as np
from scipy.io import savemat

from odin import fuel as F
from odin import visual as V
from odin.utils import (ctext, Progbar, get_exppath, select_path)
from odin.stats import (train_valid_test_split, sampling_iter)
from odin.preprocessing.signal import segment_axis

# ====== names of the file lists ====== #
# drop the '_quarter' suffix to work with the full training data
FILE_LIST = "voxceleb_files_quarter"
TRAIN_LIST = "voxceleb_sys_train_with_labels_quarter"
TRIAL_LIST = "voxceleb_trials"

# ====== paths ====== #
HOME_PATH = os.path.expanduser('~')
# location of the 'voxceleb1_wav' folder, picked from a fixed list of
# candidates (nothing is created here)
PATH_TO_WAV = select_path('/media/data2/SRE_DATA/voxceleb',
                          '/mnt/sdb1/SRE_DATA/voxceleb',
                          os.path.join(HOME_PATH, 'data', 'voxceleb'),
                          os.path.join(HOME_PATH, 'voxceleb'),
                          create_new=False)
# folder holding the experiment results
PATH_EXP = get_exppath('voxceleb')
# folder receiving the acoustic features, created on first use
PATH_ACOUSTIC_FEAT = os.path.join(PATH_EXP, 'voxceleb_feat')
if not os.path.exists(PATH_ACOUSTIC_FEAT):
  os.mkdir(PATH_ACOUSTIC_FEAT)

# ====== load the file list ====== #
ds = F.load_voxceleb_list()
import os
from os.path import expanduser

from odin.utils import get_script_path, select_path

DEFAULT_BASE_DIR = expanduser("~")

# PATH for storing the datasets: taken from the environment variable
# SISUA_DATA when it is set, otherwise '<home>/bio_data'
if 'SISUA_DATA' in os.environ:
  DATA_DIR = os.environ['SISUA_DATA']
  if os.path.isfile(DATA_DIR):
    raise RuntimeError("Store data path at '%s' must be a folder" % DATA_DIR)
  # makedirs (not mkdir) so a user-supplied path whose parent folders are
  # missing is created as well; nothing happens if the folder already exists
  os.makedirs(DATA_DIR, exist_ok=True)
else:
  DATA_DIR = select_path(os.path.join(DEFAULT_BASE_DIR, 'bio_data'),
                         create_new=True)

# downloaded files are kept in a sub-folder of the data directory
DOWNLOAD_DIR = select_path(os.path.join(DATA_DIR, 'downloads'),
                           create_new=True)

# PATH for saving experiments results: taken from the environment variable
# SISUA_EXP when it is set, otherwise '<home>/bio_exp'
if 'SISUA_EXP' in os.environ:
  EXP_DIR = os.environ['SISUA_EXP']
  if os.path.isfile(EXP_DIR):
    raise RuntimeError("Experiment path at '%s' must be a folder" % EXP_DIR)
  # same reasoning as for DATA_DIR: allow paths with missing parent folders
  os.makedirs(EXP_DIR, exist_ok=True)
else:
  EXP_DIR = select_path(os.path.join(DEFAULT_BASE_DIR, 'bio_exp'),
                        create_new=True)
# ====== sub-folders of the experiment directory ====== #
# vectors : extracted vectors for training the backend and extracting scores
VECTORS_DIR = os.path.join(EXP_DIR, 'vectors')
# results : the results
RESULT_DIR = os.path.join(EXP_DIR, 'results')
# analysis : the analysis
ANALYSIS_DIR = os.path.join(EXP_DIR, 'analysis')
# create whichever of them does not exist yet (same order as listed above)
for _folder in (VECTORS_DIR, RESULT_DIR, ANALYSIS_DIR):
  if not os.path.exists(_folder):
    os.mkdir(_folder)
del _folder  # keep the loop variable out of the module namespace

# ====== raw data ====== #
PATH_BASE = select_path('/media/data2/SRE_DATA',
                        '/mnt/sda1/SRE_DATA',
                        '/mnt/sdb1/SRE_DATA',
                        default='')
# path to the directory that contains the following folders:
###############
# * fisher
# * mx6
# * sre04
# * sre05
# * sre06
# * sre08
# * sre10
# * swb
# * voxceleb1
# * voxceleb2
###############
# Const # =========================================================================== # top 5000 variable genes _URL_5000 = b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL0dTRTEwMDg2Nl9QQk1DLnJhd0Nv\ndW50RGF0YS41MDAwLmh2Zy5jc3Yuemlw\n' _MD5_5000 = '46150f63e5a3c81d4f07445a759faa2b' # raw Count Gene _URL_FULL = b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL0dTRTEwMDg2Nl9QQk1DLnJhd0Nv\ndW50RGF0YS5jc3Yuemlw\n' _MD5_FULL = '7481cc9d20adef4d06fdb601d9d99e77' # protein _URL_PROTEIN = b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL0dTRTEwMDg2Nl9QQk1DLnJhd0Nv\ndW50UHJvdGVpbi5jc3Yuemlw\n' _MD5_PROTEIN = '7dc5f64c2916d864568f1b739679717e' _CITEseq_PBMC_PREPROCESSED = select_path(os.path.join( DATA_DIR, 'PBMC_citeseq_preprocessed'), create_new=True) _5000_PBMC_PREPROCESSED = select_path(os.path.join( DATA_DIR, 'PBMC_citeseq_5000_preprocessed'), create_new=True) _PASSWORD = '******' # =========================================================================== # Main # =========================================================================== def read_CITEseq_PBMC(override=False, verbose=True, filtered_genes=False) -> SingleCellOMIC: download_path = os.path.join(