Example 1
def experiment_seurat_ari(data_names, namespace):
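    """Measure how well Seurat clusters computed on a subsample agree
    with clusters computed on the full dataset.

    The full dataset is clustered once; then, for each sample size N,
    both a geometric sketch and a uniform subsample are clustered, the
    labels are extended to all cells, and the adjusted Rand index (ARI)
    against the full-data clustering is logged.
    """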
    datasets, genes_list, n_cells = load_names(data_names, norm=False)
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)
    X_dimred = reduce_dimensionality(normalize(X))

    name = 'data/{}'.format(namespace)
    Ns = [500, 1000, 2000, 5000, 10000]

    if not os.path.isfile('{}/matrix.mtx'.format(name)):
        save_mtx(name, csr_matrix(X), genes)
    log('Seurat clustering full dataset...')
    cluster_labels_full = seurat_cluster(name)
    log('Seurat clustering done.')

    for N in Ns:
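        # Compare a geometric sketch ('gs') against a uniform random
        # subsample ('uni') of the same size N.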
        gs_idx = gs(X_dimred, N)
        save_mtx(name + '/gs{}'.format(N), csr_matrix(X[gs_idx, :]), genes)
        log('Seurat clustering GS N = {}...'.format(N))
        seurat_labels = seurat_cluster(name + '/gs{}'.format(N))
        log('Seurat clustering GS N = {} done.'.format(N))
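        # Extend the sketch's cluster labels to every cell so the result
        # can be compared against the full-data clustering.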
        cluster_labels = label_approx(X_dimred, X_dimred[gs_idx],
                                      seurat_labels)
        log('N = {}, GS ARI = {}'.format(
            N, adjusted_rand_score(cluster_labels_full, cluster_labels)))

        uni_idx = uniform(X_dimred, N)
        save_mtx(name + '/uni{}'.format(N), csr_matrix(X[uni_idx, :]), genes)
        log('Seurat clustering uniform N = {}...'.format(N))
        seurat_labels = seurat_cluster(name + '/uni{}'.format(N))
        log('Seurat clustering uniform N = {} done.'.format(N))
        cluster_labels = label_approx(X_dimred, X_dimred[uni_idx],
                                      seurat_labels)
        log('N = {}, Uniform ARI = {}'.format(
            N, adjusted_rand_score(cluster_labels_full, cluster_labels)))
Example 2

def load_data():
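    """Load the norman2019_k562 dataset as an AnnData with perturbation labels."""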
    data_names = [
        'data/norman2019_k562',
    ]

    [ X ], [ genes ], n_cells = load_names(data_names, norm=False)

    qc_idx, perturbs = load_meta(data_names[0])
    X = X[qc_idx]
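    # Normalize each cell to 1e5 total counts, then log-transform.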
    X = normalize(X, norm='l1') * 1e5
    X = X.log1p()

    adata = AnnData(X)
    adata.var_names = genes
    adata.var_names_make_unique()
    adata.obs['perturb'] = perturbs

    sc.pp.highly_variable_genes(adata, n_top_genes=5000)

    return adata
Example 3

import numpy as np
import scipy.stats

from process import load_names
from scanorama import *

def kl_divergence(cell_labels, samp_idx, expected):
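    """KL divergence between a sample's cluster proportions and an
    expected distribution.

    cluster_hist holds the fraction of sampled cells falling in each
    cluster; scipy.stats.entropy(p, q) then computes KL(p || q).
    """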
    cluster_labels = cell_labels[samp_idx]
    clusters = sorted(set(cell_labels))
    max_cluster = max(clusters)
    cluster_hist = np.zeros(max_cluster + 1)
    for c in range(max_cluster + 1):
        if c in clusters:
            cluster_hist[c] = np.sum(cluster_labels == c)
    cluster_hist /= np.sum(cluster_hist)
    return scipy.stats.entropy(cluster_hist, expected)


if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names, norm=False)
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)

    k = DIMRED
    U, s, Vt = pca(normalize(X), k=k)
    X_dimred = U[:, :k] * s[:k]

    Xs = []
    labels = []
    translate = X_dimred.max(0)
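    # Stack three copies of the data, each 10x smaller than the last,
    # offset in the embedding so the copies do not overlap.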
    for i in range(3):
        rand_idx = np.random.choice(X.shape[0],
                                    size=int(X.shape[0] / (10**i)),
                                    replace=False)
        Xs.append(X_dimred[rand_idx, :] + (translate * 2 * i))
Example 4

from time import time

from process import load_names
from scanorama import *

data_names = [
    'data/mouse_brain/nuclei',
    'data/mouse_brain/dropviz/Cerebellum_ALT',
    'data/mouse_brain/dropviz/Cortex_noRep5_FRONTALonly',
    'data/mouse_brain/dropviz/Cortex_noRep5_POSTERIORonly',
    'data/mouse_brain/dropviz/EntoPeduncular',
    'data/mouse_brain/dropviz/GlobusPallidus',
    'data/mouse_brain/dropviz/Hippocampus',
    'data/mouse_brain/dropviz/Striatum',
    'data/mouse_brain/dropviz/SubstantiaNigra',
    'data/mouse_brain/dropviz/Thalamus',
]

if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names)

    datasets, genes = merge_datasets(datasets, genes_list, ds_names=data_names)

    datasets_dimred, genes = process_data(datasets, genes, verbose=True)

    t0 = time()
    datasets_dimred = assemble(
        datasets_dimred,
        batch_size=BATCH_SIZE,
    )
    print('Integrated panoramas in {:.3f}s'.format(time() - t0))

    t0 = time()
    datasets_dimred, datasets, genes = correct(datasets,
                                               genes_list,
                                               ds_names=data_names,
                                               return_dimred=True)
    print('Batch corrected panoramas in {:.3f}s'.format(time() - t0))
Example 5

import numpy as np
from scipy.sparse import vstack
from sklearn.preprocessing import normalize, LabelEncoder
import sys

from process import load_names
from scanorama import *

NAMESPACE = 'polarized'

data_names = [
    'data/macrophage/gmcsf_day6_1',
    'data/macrophage/gmcsf_day6_2',
    'data/macrophage/mcsf_day6_1',
    'data/macrophage/mcsf_day6_2',
]

if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names, log1p=True)
    datasets, genes = merge_datasets(datasets, genes_list)
    datasets_dimred, genes = process_data(datasets, genes)

    labels = []
    names = []
    curr_label = 0
    for i, a in enumerate(datasets):
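        # Cells from datasets[i] all get integer label i.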
        labels += list(np.zeros(a.shape[0]) + curr_label)
        names.append(data_names[i])
        curr_label += 1
    labels = np.array(labels, dtype=int)

    polarized_genes = ['PRDX1']

    #embedding = visualize(datasets_dimred,
Example 6

import numpy as np
from scipy.sparse import csr_matrix
from process import load_names, merge_datasets
from utils import *

NAMESPACE = 'human_cordblood_ica'
DIMRED = 100
DR_METHOD = 'svd'

data_names = [
    'data/ica/ica_cord_blood_h5',
]
namespaces = [
    'ica_cord_blood',
]

[X], [genes], _ = load_names(data_names)

# Keep cells with at least 500 total UMI counts.
umi_sum = np.sum(X, axis=1)
gt_idx = [i for i, s in enumerate(umi_sum) if s >= 500]

# Identify ribosomal protein genes (RPS/RPL prefixes).
low_idx = [
    idx for idx, gene in enumerate(genes)
    if gene.startswith('RPS') or gene.startswith('RPL')
]

# Keep cells in which ribosomal genes account for at most half of the UMIs.
lt_idx = [
    i for i, s in enumerate(np.sum(X[:, low_idx], axis=1) / umi_sum)
    if s <= 0.5
]

# Apply both filters and re-sparsify.
qc_idx = sorted(set(gt_idx) & set(lt_idx))
X = csr_matrix(X[qc_idx])
Example 7

from process import load_names
from scanorama import *
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import normalize

NAMESPACE = 'hsc'

data_names = [
    'data/hsc/hsc_mars',
    'data/hsc/hsc_ss2',
]

# Assesses whether the batch-corrected SS2 dataset is better explained by
# the original SS2 distribution or by the distribution of the corrected
# MARS-Seq dataset.
if __name__ == '__main__':
    # Load data.
    datasets, genes_list, n_cells = load_names(data_names, verbose=False)
    datasets, genes = merge_datasets(datasets, genes_list, verbose=False)
    datasets, genes = process_data(datasets, genes)
    datasets = [normalize(ds, axis=1) for ds in datasets]

    # Fit a Gaussian mixture to the SS2 data before correction.
    gm_ss2 = GaussianMixture(n_components=3, n_init=3).fit(datasets[1])

    # Do batch correction.
    datasets = assemble(datasets,
                        verbose=False,
                        knn=KNN,
                        sigma=SIGMA,
                        approx=APPROX)
    datasets = [normalize(ds, axis=1) for ds in datasets]
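
    # The probability described in the header comment could be computed by
    # also fitting a mixture to the corrected MARS-Seq data and comparing
    # average log-likelihoods of the corrected SS2 cells under both models.
    # A minimal sketch; gm_mars and the printed ratio are illustrative
    # additions, not part of the original script.
    gm_mars = GaussianMixture(n_components=3, n_init=3).fit(datasets[0])
    ll_ss2 = gm_ss2.score(datasets[1])   # mean log-likelihood, original SS2 model
    ll_mars = gm_mars.score(datasets[1]) # mean log-likelihood, corrected MARS model
    print('Log-likelihood ratio (MARS - SS2): {:.3f}'.format(ll_mars - ll_ss2))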
Example 8

import numpy as np

from pancreas_tests import *
from process import load_names
from scanorama import *

NAMESPACE = 'pancreas'

data_names = [
    'data/pancreas/pancreas_inDrop',
    'data/pancreas/pancreas_multi_celseq2_expression_matrix',
    'data/pancreas/pancreas_multi_celseq_expression_matrix',
    'data/pancreas/pancreas_multi_fluidigmc1_expression_matrix',
    'data/pancreas/pancreas_multi_smartseq2_expression_matrix',
]

if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names)

    labels = []
    names = []
    curr_label = 0
    for i, a in enumerate(datasets):
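        # Cells from datasets[i] all get integer label i.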
        labels += list(np.zeros(a.shape[0]) + curr_label)
        names.append(data_names[i])
        curr_label += 1
    labels = np.array(labels, dtype=int)

    datasets_dimred, datasets, genes = correct(datasets,
                                               genes_list,
                                               ds_names=data_names,
                                               return_dimred=True)