Beispiel #1
0
def read_Hemato(override=False, verbose=False):
    """Return the HEMATO dataset, building the on-disk cache when needed.

    The cache lives in ``<DATA_DIR>/HEMATO_preprocessed``.  When the cache
    has no ``X`` entry, the data is fetched through ``scvi``'s
    ``HematoDataset``, stored with ``_save_data_to_path`` and three extra
    pickles (``labels_name``, ``labels_bin``, ``labels_val``) are written
    next to it.

    Parameters
    ----------
    override : bool
        If True, the cache folder is removed and recreated before loading.
    verbose : bool
        Forwarded to ``_save_data_to_path``.

    Returns
    -------
    Dataset
        The cache folder opened with ``read_only=True``.

    Raises
    ------
    RuntimeError
        If the cache must be built and ``scvi`` cannot be imported.
    """
    preprocessed_path = select_path(
        os.path.join(DATA_DIR, 'HEMATO_preprocessed'), create_new=True)

    # start from an empty cache folder when asked to
    if override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)

    # ====== cache already available, just open it ====== #
    if os.path.exists(os.path.join(preprocessed_path, 'X')):
        return Dataset(preprocessed_path, read_only=True)

    # ====== otherwise copy the dataset from scVI ====== #
    try:
        from scvi.dataset import HematoDataset
    except ImportError:
        raise RuntimeError("Require `scVI` package for HEMATO dataset")

    hemato = HematoDataset(save_path=os.path.join(DOWNLOAD_DIR, 'HEMATO/'))

    counts = hemato._X
    genes = np.array(hemato.gene_names)
    assert len(genes) == counts.shape[1]

    # every metadata column except the first is used as a label column
    targets = hemato.meta.values[:, 1:]
    target_names = np.array(hemato.cell_types_levels)
    assert len(target_names) == targets.shape[1]

    cells = np.array(['Cell#%d' % idx for idx in range(counts.shape[0])])

    _save_data_to_path(preprocessed_path, counts, targets, genes,
                       target_names, cells, verbose)

    # ====== extra binary classes for testing ====== #
    binary_names = np.array(["Erythroblasts", "Granulocytes"])
    lowest = np.min(hemato.labels)
    highest = np.max(hemato.labels)
    # rescale the labels linearly so that min -> -1 and max -> 1
    y_val = 2 * (hemato.labels - lowest) / (highest - lowest) - 1
    first_col = hemato.meta.iloc[:, 1].values[:, None]  # Er
    second_col = hemato.meta.iloc[:, 2].values[:, None]  # Gr
    # index (0 or 1) of the larger of the two columns for each row
    y_bin = np.argmax(np.hstack((first_col, second_col)), axis=-1)

    extras = (
        ('labels_name', binary_names),
        ('labels_bin', y_bin),
        ('labels_val', y_val),
    )
    for file_name, payload in extras:
        with open(os.path.join(preprocessed_path, file_name), 'wb') as f:
            pickle.dump(payload, f)

    # ====== read preprocessed data ====== #
    return Dataset(preprocessed_path, read_only=True)
Beispiel #2
0
def _read_scvi_dataset(name, clazz_name, override, verbose):
    """Cache a dataset class from ``scvi.dataset`` on disk and open it.

    The cache lives in ``<DATA_DIR>/<name>_preprocessed``.  When the cache
    has no ``X`` entry, the class ``clazz_name`` is looked up in
    ``scvi.dataset``, instantiated with ``save_path=DOWNLOAD_DIR`` and its
    content is stored with ``_save_data_to_path``.

    Parameters
    ----------
    name : str
        Dataset name; used for the cache folder and in error messages.
    clazz_name : str
        Attribute name of the dataset class inside ``scvi.dataset``.
    override : bool
        If True, the cache folder is removed and recreated before loading.
    verbose : bool
        Forwarded to ``_save_data_to_path``.

    Returns
    -------
    Dataset
        The cache folder opened with ``read_only=True``.

    Raises
    ------
    RuntimeError
        If the cache must be built and ``scvi`` cannot be imported.
    """
    preprocessed_path = select_path(os.path.join(DATA_DIR,
                                                 '%s_preprocessed' % name),
                                    create_new=True)
    if override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ====== copy the dataset from scVI ====== #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        try:
            import scvi.dataset as scvi_dataset
        except ImportError as err:
            # report the dataset that was actually requested (the message
            # used to always say "PBMC") and keep the original cause
            raise RuntimeError(
                "Require `scVI` package for %s dataset" % name) from err
        clazz = getattr(scvi_dataset, clazz_name)
        gene_dataset = clazz(save_path=DOWNLOAD_DIR)

        X = gene_dataset._X
        # sparse-like containers are converted to a plain dense array
        if hasattr(X, 'todense'):
            X = np.array(X.todense())

        gene_names = np.array(gene_dataset.gene_names)
        # convert gene identifier to gene symbol (i.e. name)
        if hasattr(gene_dataset, 'de_metadata'):
            from sisua.data.utils import get_gene_id2name
            meta = gene_dataset.de_metadata
            converter = dict(zip(meta.ENSG, meta.GS))
            # NOTE(review): presumably a mapping gene-id -> gene-name;
            # it takes priority over the dataset's own metadata
            pbmc8kconverter = get_gene_id2name()
            gene_names = np.array([
                pbmc8kconverter[i] if i in pbmc8kconverter else converter[i]
                for i in gene_names
            ])
        assert len(gene_names) == X.shape[1]

        label_names = np.array(gene_dataset.cell_types)
        y = one_hot(gene_dataset.labels.ravel(), nb_classes=len(label_names))
        assert len(label_names) == y.shape[1]

        cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])
        _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                           cell_names, verbose)
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
Beispiel #3
0
import os
import pickle

import numpy as np
from scipy.io import savemat

from odin import fuel as F, visual as V
from odin.utils import ctext, Progbar, get_exppath, select_path
from odin.stats import train_valid_test_split, sampling_iter
from odin.preprocessing.signal import segment_axis

HOME_PATH = os.path.expanduser('~')
# fixed path to 'voxceleb1_wav' folder, picked among the known locations
PATH_TO_WAV = select_path('/media/data2/SRE_DATA/voxceleb',
                          '/mnt/sdb1/SRE_DATA/voxceleb',
                          os.path.join(HOME_PATH, 'data', 'voxceleb'),
                          os.path.join(HOME_PATH, 'voxceleb'),
                          create_new=False)
# path to the folder that contains the experiment results
PATH_EXP = get_exppath('voxceleb')
# output path for the acoustic features directory
PATH_ACOUSTIC_FEAT = os.path.join(PATH_EXP, 'voxceleb_feat')
# `makedirs(exist_ok=True)` avoids the check-then-create race and also
# works when a parent folder is missing
os.makedirs(PATH_ACOUSTIC_FEAT, exist_ok=True)
# ====== remove '_quarter' if you want full training data ====== #
FILE_LIST = "voxceleb_files_quarter"
TRAIN_LIST = "voxceleb_sys_train_with_labels_quarter"
TRIAL_LIST = "voxceleb_trials"
# ====== Load the file list ====== #
ds = F.load_voxceleb_list()
WAV_FILES = {}  # dictionary mapping 'file_path' -> 'file_name'
Beispiel #4
0
EXP_DIR = get_exppath('sre', override=False)
# this folder stores the extracted vectors used for training the backend
# and for extracting scores
VECTORS_DIR = os.path.join(EXP_DIR, 'vectors')
# this folder stores the results
RESULT_DIR = os.path.join(EXP_DIR, 'results')
# this folder stores the analysis
ANALYSIS_DIR = os.path.join(EXP_DIR, 'analysis')
# `makedirs(exist_ok=True)` avoids the check-then-create race and also
# works when a parent folder is missing
os.makedirs(VECTORS_DIR, exist_ok=True)
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(ANALYSIS_DIR, exist_ok=True)
# ====== raw data ====== #
PATH_BASE = select_path('/media/data2/SRE_DATA',
                        '/mnt/sda1/SRE_DATA',
                        '/mnt/sdb1/SRE_DATA',
                        default='')
# path to the directory that contains the following folders:
##############
#   * fisher
#   * mx6
#   * sre04
#   * sre05
#   * sre06
#   * sre08
#   * sre10
#   * swb
#   * voxceleb1
#   * voxceleb2
###############
#   * musan
Beispiel #5
0
import os  # needed by the path handling below
import pickle

import numpy as np
from scipy.io import savemat

from odin import fuel as F, visual as V
from odin.utils import ctext, Progbar, get_exppath, select_path
from odin.stats import train_valid_test_split, sampling_iter
from odin.preprocessing.signal import segment_axis

HOME_PATH = os.path.expanduser('~')
# fixed path to 'voxceleb1_wav' folder, picked among the known locations
PATH_TO_WAV = select_path(
    '/media/data2/SRE_DATA/voxceleb',
    '/mnt/sdb1/SRE_DATA/voxceleb',
    os.path.join(HOME_PATH, 'data', 'voxceleb'),
    os.path.join(HOME_PATH, 'voxceleb'),
    create_new=False
)
# path to the folder that contains the experiment results
PATH_EXP = get_exppath('voxceleb')
# output path for the acoustic features directory
PATH_ACOUSTIC_FEAT = os.path.join(PATH_EXP, 'voxceleb_feat')
# `makedirs(exist_ok=True)` avoids the check-then-create race and also
# works when a parent folder is missing
os.makedirs(PATH_ACOUSTIC_FEAT, exist_ok=True)
# ====== remove '_quarter' if you want full training data ====== #
FILE_LIST = "voxceleb_files_quarter"
TRAIN_LIST = "voxceleb_sys_train_with_labels_quarter"
TRIAL_LIST = "voxceleb_trials"
# ====== Load the file list ====== #
ds = F.load_voxceleb_list()
Beispiel #6
0
import os
from os.path import expanduser

from odin.utils import get_script_path, select_path

DEFAULT_BASE_DIR = expanduser("~")

# PATH for storing the datasets: the 'SISUA_DATA' environment variable wins,
# otherwise fall back to '~/bio_data'
if 'SISUA_DATA' in os.environ:
    DATA_DIR = os.environ['SISUA_DATA']
    if os.path.isfile(DATA_DIR):
        raise RuntimeError("Store data path at '%s' must be a folder" %
                           DATA_DIR)
    # user-supplied path: create it together with any missing parents,
    # without failing when it already exists
    os.makedirs(DATA_DIR, exist_ok=True)
else:
    DATA_DIR = select_path(os.path.join(DEFAULT_BASE_DIR, 'bio_data'),
                           create_new=True)

DOWNLOAD_DIR = select_path(os.path.join(DATA_DIR, 'downloads'),
                           create_new=True)

# PATH for saving experiments results: the 'SISUA_EXP' environment variable
# wins, otherwise fall back to '~/bio_exp'
if 'SISUA_EXP' in os.environ:
    EXP_DIR = os.environ['SISUA_EXP']
    if os.path.isfile(EXP_DIR):
        raise RuntimeError("Experiment path at '%s' must be a folder" %
                           EXP_DIR)
    # same handling as for the data folder
    os.makedirs(EXP_DIR, exist_ok=True)
else:
    EXP_DIR = select_path(os.path.join(DEFAULT_BASE_DIR, 'bio_exp'),
                          create_new=True)
Beispiel #7
0
# this folder stores the extracted vectors used for training the backend
# and for extracting scores
VECTORS_DIR = os.path.join(EXP_DIR, 'vectors')
# this folder stores the results
RESULT_DIR = os.path.join(EXP_DIR, 'results')
# this folder stores the analysis
ANALYSIS_DIR = os.path.join(EXP_DIR, 'analysis')
# `makedirs(exist_ok=True)` avoids the check-then-create race and also
# works when a parent folder is missing
os.makedirs(VECTORS_DIR, exist_ok=True)
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(ANALYSIS_DIR, exist_ok=True)
# ====== raw data ====== #
PATH_BASE = select_path(
    '/media/data2/SRE_DATA',
    '/mnt/sda1/SRE_DATA',
    '/mnt/sdb1/SRE_DATA',
    default='',
)
# path to the directory that contains the following folders:
##############
#   * fisher
#   * mx6
#   * sre04
#   * sre05
#   * sre06
#   * sre08
#   * sre10
#   * swb
#   * voxceleb1
#   * voxceleb2
###############
Beispiel #8
0
# Const
# ===========================================================================
# NOTE(review): the `_URL_*` byte strings look like base64 text (embedded
# line breaks included); each one is paired with an `_MD5_*` hex digest,
# presumably the checksum of the downloaded file -- confirm at the call site.

# top 5000 variable genes
_URL_5000 = (
    b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL0dTRTEwMDg2Nl9QQk1DLnJhd0Nv\n'
    b'dW50RGF0YS41MDAwLmh2Zy5jc3Yuemlw\n'
)
_MD5_5000 = '46150f63e5a3c81d4f07445a759faa2b'

# raw Count Gene
_URL_FULL = (
    b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL0dTRTEwMDg2Nl9QQk1DLnJhd0Nv\n'
    b'dW50RGF0YS5jc3Yuemlw\n'
)
_MD5_FULL = '7481cc9d20adef4d06fdb601d9d99e77'

# protein
_URL_PROTEIN = (
    b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL0dTRTEwMDg2Nl9QQk1DLnJhd0Nv\n'
    b'dW50UHJvdGVpbi5jc3Yuemlw\n'
)
_MD5_PROTEIN = '7dc5f64c2916d864568f1b739679717e'

# cache folders for the preprocessed data (full and top-5000 variants)
_CITEseq_PBMC_PREPROCESSED = select_path(
    os.path.join(DATA_DIR, 'PBMC_citeseq_preprocessed'),
    create_new=True,
)
_5000_PBMC_PREPROCESSED = select_path(
    os.path.join(DATA_DIR, 'PBMC_citeseq_5000_preprocessed'),
    create_new=True,
)

_PASSWORD = '******'


# ===========================================================================
# Main
# ===========================================================================
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
  download_path = os.path.join(