Example #1
0
def get_data(numgroups):
    """Run a splatter R simulation and return [sim, sim_true] AnnData objects.

    numgroups selects which R script to source (2 or 6 groups); any other
    value sources nothing and reads whatever the R globals already hold.
    Returns the observed (dropout-affected) data and the ground-truth data,
    the latter restricted to the genes kept in the observed object.
    """
    with localconverter(ro.default_converter + pandas2ri.converter):
        if numgroups == 2:
            r.source('~/Documents/rscripts/splatter-2.R')
        elif numgroups == 6:
            r.source('~/Documents/rscripts/splatter-6.R')
        counts = r2py(r['counts'])  # cell-by-gene dataframe
        cellinfo = r2py(r['cellinfo'])  # Cell, Batch, Group
        geneinfo = r2py(r['geneinfo'])  # Gene

        # Observed (dropout-affected) counts.
        sim = sc.AnnData(counts.values, obs=cellinfo, var=geneinfo)
        sim.obs_names = cellinfo.Cell
        sim.var_names = geneinfo.Gene
        if numgroups == 2:
            sc.pp.filter_genes(
                sim, min_counts=1
            )  # omitted in 6 case so we can generalize to diff dropout %s

        truecounts = r2py(r['truecounts'])
        # presumably a 0/1 indicator of dropped entries — verify in R script
        dropout = r2py(r['dropout'])
        print("percent dropout: {}".format(
            np.sum(dropout.values) / (sim.n_obs * sim.n_vars)))

        # Ground truth, aligned to the (possibly filtered) gene set of `sim`.
        sim_true = sc.AnnData(truecounts.values, obs=cellinfo, var=geneinfo)
        sim_true.obs_names = cellinfo.Cell
        sim_true.var_names = geneinfo.Gene
        sim_true = sim_true[:, sim.var_names]

        return [sim, sim_true]
Example #2
0
def test_api():
    """End-to-end smoke test of the scArches API: train a reference model,
    then adapt it to a query dataset with unseen conditions."""
    # reference adata
    X = np.random.normal(size=(1000, 10))
    adata = sc.AnnData(X=X)
    adata.obs['cell_type'] = list("ABCDE") * 200  # 5 types x 200 = 1000 cells
    adata.obs['condition'] = list("MNOP") * 250  # 4 conditions x 250
    adata.raw = adata
    adata.obs['size_factors'] = 1.0

    # print(len(adata.obs['condition'].unique().tolist()))
    # print(len(adata.obs['cell_type'].unique().tolist()))

    # 10 latent dims, built for the four reference conditions M/N/O/P.
    model = sca.models.scArches(10, list('MNOP'))
    model.train(adata, "condition", 0.8)

    print(model.network_kwargs)
    # query adata (conditions Q/R/S/T not seen by the reference model)
    X = np.random.normal(size=(1000, 10))
    adata = sc.AnnData(X=X)
    adata.obs['cell_type'] = list("ABCL") * 250
    adata.obs['condition'] = list("QRST") * 250
    adata.raw = adata
    adata.obs['size_factors'] = 1.0

    # Extend the trained model to the query's condition set and fine-tune.
    new_model = sca.operate(model, "new_task",
                            adata.obs['condition'].unique().tolist())
    print(new_model.network_kwargs)
    # print(new_model.n_conditions, new_model.n_mmd_conditions, new_model.condition_encoder)
    new_model.train(adata, "condition", 0.8, n_epochs=1)
Example #3
0
    def latent_as_anndata(self):
        """Return the model's latent representation wrapped in an AnnData
        annotated with 'cell_type' and 'batch'.

        TOTALVI models use ``get_latent_representation`` directly; other
        models sample from the posterior z (passing batch indices only when
        ``self.modified`` is set). When ``self.annotated``, a second latent
        z2 is also computed from z1 and stored in ``self.post_adata_2``.
        """
        if type(self.outer_model) is TOTALVI:
            latent = self.outer_model.get_latent_representation(self.adata)
        else:
            if self.modified:
                latents = self.model.sample_from_posterior_z(
                    self.x_tensor,
                    y=self.label_tensor,
                    batch_index=self.batch_tensor)
            else:
                latents = self.model.sample_from_posterior_z(
                    self.x_tensor,
                    y=self.label_tensor,
                )

            if self.annotated:
                latent = latents.cpu().detach().numpy()
                # Encode z1 (+ labels) into the second latent layer z2.
                latent2, _, _ = self.model.encoder_z2_z1(
                    latents, self.label_tensor)
                latent2 = latent2.cpu().detach().numpy()
                post_adata_2 = sc.AnnData(latent2)
                post_adata_2.obs['cell_type'] = self.cell_types
                post_adata_2.obs['batch'] = self.batch_names
                self.post_adata_2 = post_adata_2
            else:
                latent = latents.cpu().detach().numpy()

        post_adata = sc.AnnData(latent)
        post_adata.obs['cell_type'] = self.cell_types
        post_adata.obs['batch'] = self.batch_names
        return post_adata
Example #4
0
def check_rep_results(func, X, **kwargs):
    """Checks that the results of a computation add values/ mutate the anndata object in a consistent way."""

    def _zeros():
        # Fresh all-zero array matching X's shape/dtype.
        return np.zeros(shape=X.shape, dtype=X.dtype)

    # Three objects, each carrying X in a different representation slot.
    adata_X = sc.AnnData(
        X=X.copy(),
        layers={"layer": _zeros()},
        obsm={"obsm": _zeros()},
    )
    adata_layer = sc.AnnData(
        X=_zeros(),
        layers={"layer": X.copy()},
        obsm={"obsm": _zeros()},
    )
    adata_obsm = sc.AnnData(
        X=_zeros(),
        layers={"layer": _zeros()},
        obsm={"obsm": X.copy()},
    )

    # Run the computation against each representation.
    func(adata_X, **kwargs)
    func(adata_layer, layer="layer", **kwargs)
    func(adata_obsm, obsm="obsm", **kwargs)

    # Zero out the data slots so only the added results are compared.
    adata_X.X = _zeros()
    adata_layer.layers["layer"] = _zeros()
    adata_obsm.obsm["obsm"] = _zeros()

    # All three must now be identical.
    assert_equal(adata_X, adata_layer)
    assert_equal(adata_X, adata_obsm)
Example #5
0
def test_obs_df():
    """sc.get.obs_df: pulling values from X, obs, obsm, layers and raw,
    with and without gene-symbol mapping, plus error cases."""
    adata = sc.AnnData(
        X=np.ones((2, 2)),
        obs=pd.DataFrame({"obs1": [0, 1], "obs2": ["a", "b"]}, index=["cell1", "cell2"]),
        var=pd.DataFrame({"gene_symbols": ["genesymbol1", "genesymbol2"]}, index=["gene1", "gene2"]),
        obsm={"eye": np.eye(2), "sparse": sparse.csr_matrix(np.eye(2))},
        layers={"double": np.ones((2, 2)) * 2}
    )
    adata.raw = sc.AnnData(
        X=np.zeros((2, 2)),
        var=pd.DataFrame({"gene_symbols": ["raw1", "raw2"]}, index=["gene1", "gene2"])
    )
    # var names + obs columns + obsm columns combined into one frame
    assert np.all(np.equal(
        sc.get.obs_df(adata, keys=["gene2", "obs1"], obsm_keys=[("eye", 0), ("sparse", 1)]),
        pd.DataFrame({"gene2": [1, 1], "obs1": [0, 1], "eye-0": [1, 0], "sparse-1": [0, 1]}, index=adata.obs_names)
    ))
    # same lookup, but genes addressed via the gene_symbols var column
    assert np.all(np.equal(
        sc.get.obs_df(adata, keys=["genesymbol2", "obs1"], obsm_keys=[("eye", 0), ("sparse", 1)], gene_symbols="gene_symbols"),
        pd.DataFrame({"genesymbol2": [1, 1], "obs1": [0, 1], "eye-0": [1, 0], "sparse-1": [0, 1]}, index=adata.obs_names)
    ))
    # gene values taken from a named layer instead of X
    assert np.all(np.equal(
        sc.get.obs_df(adata, keys=["gene2", "obs1"], layer="double"),
        pd.DataFrame({"gene2": [2, 2], "obs1": [0, 1]}, index=adata.obs_names)
    ))
    # gene values taken from .raw
    assert np.all(np.equal(
        sc.get.obs_df(adata, keys=["raw2", "obs1"], gene_symbols="gene_symbols", use_raw=True),
        pd.DataFrame({"raw2": [0, 0], "obs1": [0, 1]}, index=adata.obs_names)
    ))
    # unknown keys raise KeyError mentioning every bad key
    badkeys = ["badkey1", "badkey2"]
    with pytest.raises(KeyError) as badkey_err:
        sc.get.obs_df(adata, keys=badkeys)
    # use_raw and layer are mutually exclusive
    with pytest.raises(AssertionError):
        sc.get.obs_df(adata, keys=["gene1"], use_raw=True, layer="double")
    assert all(badkey_err.match(k) for k in badkeys)
Example #6
0
def do_latent_evaluation(
    spliced_net, sc_dual_full_dataset, outdir: str, prefix: str = ""
):
    """
    Pull out latent space and write to file

    Writes one h5ad per modality when the corresponding sub-dataset exposes
    ``data_raw`` (whose obs frame is deep-copied onto the encodings).
    """
    logging.info("Inferring latent representations")
    encoded_from_rna, encoded_from_atac = spliced_net.get_encoded_layer(
        sc_dual_full_dataset
    )

    if hasattr(sc_dual_full_dataset.dataset_x, "data_raw"):
        encoded_from_rna_adata = sc.AnnData(
            encoded_from_rna,
            obs=sc_dual_full_dataset.dataset_x.data_raw.obs.copy(deep=True),
        )
        # strip("_") drops the leading underscore when prefix is empty.
        encoded_from_rna_adata.write(
            os.path.join(outdir, f"{prefix}_rna_encoded_adata.h5ad".strip("_"))
        )
    if hasattr(sc_dual_full_dataset.dataset_y, "data_raw"):
        encoded_from_atac_adata = sc.AnnData(
            encoded_from_atac,
            obs=sc_dual_full_dataset.dataset_y.data_raw.obs.copy(deep=True),
        )
        encoded_from_atac_adata.write(
            os.path.join(outdir, f"{prefix}_atac_encoded_adata.h5ad".strip("_"))
        )
Example #7
0
def generate_simulated_pca(path, actual_data, clust_typ, source_cell, sim_data,
                           first_cell):
    """Plot a PCA of source, target and predicted cells on one figure.

    Builds labelled AnnData objects for the target cluster, the source
    cluster and the simulated (predicted) data, concatenates them, and
    saves a PCA plot colored by those labels.

    NOTE(review): ``path`` is unused here, and neighbors are computed
    before PCA although ``sc.pl.pca`` does not need the neighbor graph —
    confirm intent.
    """
    target = actual_data[actual_data.obs["clusters"] == clust_typ]
    target = sc.AnnData(
        target.X,
        obs={"cell_type": ["Target_" + clust_typ] * len(target)},
        var={"var_names": target.var_names})

    source = actual_data[actual_data.obs["clusters"] == source_cell]

    source = sc.AnnData(
        source.X,
        obs={"cell_type": ["Source_" + source_cell] * len(source)},
        var={"var_names": source.var_names})
    predicted = sc.AnnData(sim_data.X,
                           obs={"cell_type": ["Predicted"] * len(sim_data)},
                           var={"var_names": sim_data.var_names})

    # Combine all three groups so they share one PCA space.
    combined_data = source.concatenate(target)
    combined_data = combined_data.concatenate(predicted)

    sc.pp.neighbors(combined_data)
    sc.tl.pca(combined_data, svd_solver='arpack')
    sc.pl.pca(combined_data,
              color=["cell_type"],
              legend_fontsize=12,
              palette=['r', 'k', 'y'],
              frameon=True,
              s=35,
              save="_" + first_cell + "_to_" + clust_typ + "_celltypes.pdf")
Example #8
0
def group_cells(data1, data2):
    """Collapse cells into per-cluster "meta cells".

    For each ClusterID in ``data1``, averages expression across the
    cluster's cells (preferring the 'norm_data' layer, falling back to
    ``.X`` when that layer is absent) and averages the matching barcodes
    of ``data2``. Returns two AnnData objects (one per input) whose
    'Weights' obs column holds the cluster sizes.
    """
    meta_cells = data1.obs['ClusterID'].unique()
    meta1 = []
    meta2 = []
    weight = []
    for cluster in meta_cells:
        idx = np.where(data1.obs['ClusterID'] == cluster)
        bc_set = data1.obs['ClusterID'].index[idx]  # barcodes in this cluster
        # Narrowed from a bare `except:` — only a missing layer should
        # trigger the fallback; other errors must surface.
        try:
            meta1.append(data1.layers['norm_data'][idx].mean(axis=0))
        except KeyError:
            meta1.append(data1.X[idx].mean(axis=0))
        meta2.append(data2[bc_set, ].X.mean(axis=0))
        weight.append(len(idx[0]))
    df1 = pd.DataFrame(np.array(meta1),
                       columns=data1.var_names,
                       index=meta_cells)
    df2 = pd.DataFrame(np.array(meta2),
                       columns=data2.var_names,
                       index=meta_cells)
    adata1 = sc.AnnData(df1)
    adata2 = sc.AnnData(df2)
    adata1.obs['Weights'] = weight
    adata2.obs['Weights'] = weight
    return adata1, adata2
Example #9
0
def test_normalize_total(typ):
    """sc.pp.normalize_total row sums for container type ``typ``
    (X_total / X_frac are module-level fixtures)."""
    adata = sc.AnnData(typ(X_total, dtype='float32'))
    # Default target: rows of X_total end up summing to 3.
    sc.pp.normalize_total(adata, key_added='n_counts')
    assert np.allclose(np.ravel(adata.X.sum(axis=1)), [3., 3., 3.])
    # Explicit target_sum scales every row to exactly 1.
    sc.pp.normalize_total(adata, target_sum=1, key_added='n_counts2')
    assert np.allclose(np.ravel(adata.X.sum(axis=1)), [1., 1., 1.])

    # fraction variant: only columns 1-2 are expected to sum to 1 afterwards.
    adata = sc.AnnData(typ(X_frac, dtype='float32'))
    sc.pp.normalize_total(adata, fraction=0.7)
    assert np.allclose(np.ravel(adata.X[:, 1:3].sum(axis=1)), [1., 1., 1.])
Example #10
0
def adatas():
    """Split pbmc68k_reduced into a processed reference set and an
    unprocessed query set; returns (adata_ref, adata_new)."""
    pbmc = sc.datasets.pbmc68k_reduced()
    split_at = 500

    # First 500 cells form the reference, the remainder the query.
    reference = sc.AnnData(pbmc.X[:split_at, :], obs=pbmc.obs.iloc[:split_at])
    query = sc.AnnData(pbmc.X[split_at:, :])

    # Embed only the reference; the query stays raw.
    sc.pp.pca(reference)
    sc.pp.neighbors(reference)
    sc.tl.umap(reference)

    return reference, query
Example #11
0
def test_regress_out_constants_equivalent():
    """Constant (all-zero) columns must not change regress_out results for
    the remaining variables, since constant-value support is implemented
    by us."""
    from sklearn.datasets import make_blobs

    X, cat = make_blobs(100, 20)

    # Same data twice: once padded with five constant zero columns.
    padded = sc.AnnData(
        np.hstack([X, np.zeros((100, 5))]),
        obs={"cat": pd.Categorical(cat)},
    )
    plain = sc.AnnData(X, obs={"cat": pd.Categorical(cat)})

    sc.pp.regress_out(padded, "cat")
    sc.pp.regress_out(plain, "cat")

    # The shared variables must agree exactly.
    np.testing.assert_equal(padded[:, plain.var_names].X, plain.X)
Example #12
0
def generate_simulated_reg_plots(path, actual_data, clust_typ, cells):
    """For every .h5ad prediction file in ``path``, plot predicted-vs-actual
    mean expression for cluster ``clust_typ`` and collect the regression
    values; trigger UMAP generation for predictions with R^2 >= 0.40.

    Returns a list of [first_cell, reg_val[0], reg_val[1]] per file.

    Fix: the 'actual' AnnData and the top-gene list are loop-invariant and
    are now built once instead of being rebuilt for every file.
    """
    os.chdir(path)
    actual_subset = actual_data[actual_data.obs["cell_type"] == clust_typ]
    # Loop-invariant: label the actual cells once, up front.
    actual_data_temp = sc.AnnData(
        actual_subset.X,
        obs={"comparison_typ": ["actual"] * len(actual_subset)},
        var={"var_names": actual_subset.var_names})
    # Loop-invariant: ranked gene list for this cluster.
    top_100_gene_list = list(
        actual_data.uns["rank_genes_groups"]['names'][clust_typ])

    reg_mean_vals = []
    for file in glob.glob("*.h5ad"):
        print(file)
        adata = sc.read(file)

        pred_data = sc.AnnData(adata.X,
                               obs={"comparison_typ": ["pred"] * len(adata)},
                               var={"var_names": adata.var_names})

        first_cell = file[0:file.find('.')]  # filename stem up to first dot

        plot_data = actual_data_temp.concatenate(pred_data)

        reg_val = reg_mean_plot(plot_data,
                                condition_key="comparison_typ",
                                axis_keys={
                                    "x": "actual",
                                    "y": "pred"
                                },
                                path_to_save="./reg_mean_" + file + "_TO_" +
                                clust_typ + ".png",
                                legend=False,
                                labels={
                                    "x": "actual",
                                    "y": "pred"
                                },
                                show=False,
                                gene_list=top_100_gene_list[:5],
                                top_100_genes=top_100_gene_list,
                                fontsize=14,
                                textsize=14)
        reg_mean_vals.append([first_cell, reg_val[0], reg_val[1]])

        # Good fits (R^2 >= 0.40) also get UMAP visualizations.
        if reg_val[1] >= 0.40:
            source_cell = [string for string in cells if string in file]
            source_cell = source_cell[0]
            generate_simulated_umaps(path, actual_data, clust_typ, source_cell,
                                     adata, first_cell)
    return reg_mean_vals
Example #13
0
def merge_matrix(ad, obskeys=None, use_raw=False, keep_only_mutual=False):
    '''merge matrix stored in ad

    ad: dictionary of anndata to merge
    obskeys: list of obs columns to carry over; if falsy they are inferred
        from the columns of the individual objects
    use_raw: if True, merge from .raw.X
    keep_only_mutual: if True, keep only obs columns present in every
        sample; otherwise missing columns are filled with 'n/a'

    Note: mutates the input objects (adds a 'name' obs column and possibly
    'n/a' filler columns).
    '''
    from scipy.sparse import vstack

    smp_list = list(ad.keys())
    obs_dict = defaultdict(list)
    obs_names = []

    # Tag every cell with the sample it came from.
    for smp in smp_list:
        ad[smp].obs['name'] = smp

    if not obskeys:
        # Infer which obs columns to merge from their frequency across samples.
        obskey_list = []
        obskeys = []
        for sample in smp_list:
            obskey_list.extend(list(ad[sample].obs.columns))
        for obskey, number in Counter(obskey_list).items():
            if number == len(smp_list):
                # Present in every sample: always merged.
                obskeys.append(obskey)
            elif not keep_only_mutual:
                # Fill the gaps so the column can still be merged.
                for sample in smp_list:
                    if obskey not in ad[sample].obs.columns:
                        ad[sample].obs[obskey] = 'n/a'
                obskeys.append(obskey)

    # Collect obs names and column values in sample order.
    for sample in smp_list:
        obs_names.extend(list(ad[sample].obs_names))
        for key in obskeys:
            obs_dict[key].extend(list(ad[sample].obs[key]))

    # Stack the matrices; var is taken from the first sample.
    if use_raw:
        stack = vstack([ad[x].raw.X for x in smp_list])  # stack data
        adata = sc.AnnData(stack, var=ad[smp_list[0]].raw.var)
    else:
        stack = vstack([ad[x].X for x in smp_list])  # stack data
        adata = sc.AnnData(stack, var=ad[smp_list[0]].var)

    adata.obs_names = obs_names
    print(len(adata))
    for obs_col in obs_dict:
        print(obs_col)
        adata.obs[obs_col] = obs_dict[obs_col]
    return adata
Example #14
0
def test_qc_metrics_format():
    """calculate_qc_metrics must give identical results for dense input and
    each sparse format (csr/csc/coo)."""
    a = np.random.binomial(100, .005, (1000, 1000))
    # First 100 genes flagged as mitochondrial for the qc_vars computation.
    init_var = pd.DataFrame({
        "mito":
        np.concatenate((np.ones(100, dtype=bool), np.zeros(900, dtype=bool)))
    })
    adata_dense = sc.AnnData(X=a, var=init_var.copy())
    sc.pp.calculate_qc_metrics(adata_dense, qc_vars=["mito"], inplace=True)
    for fmt in [sparse.csr_matrix, sparse.csc_matrix, sparse.coo_matrix]:
        adata = sc.AnnData(X=fmt(a), var=init_var.copy())
        sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)
        assert np.allclose(adata.obs, adata_dense.obs)
        for col in adata.var:  # np.allclose doesn't like mix of types
            assert np.allclose(adata.var[col], adata_dense.var[col])
Example #15
0
def regress_batch_v2(adata, batch_key, confounder_key):
    '''batch regression tool

    batch_key=list of observation categories to be regressed out
    confounder_key=list of observation categories to be kept
    returns (ndata, X_explained): ndata with corrected X plus the
    batch-explained component that was subtracted'''

    from sklearn.linear_model import Ridge

    print('fitting linear model...')
    # One-hot encode batch + confounder labels; keep all levels so each
    # design-matrix column maps to exactly one category.
    dummy = pd.get_dummies(adata.obs[batch_key + confounder_key],
                           drop_first=False)
    X_exp = adata.X  # scaled data
    if scipy.sparse.issparse(X_exp):
        X_exp = X_exp.todense()
    LR = Ridge(fit_intercept=False, alpha=1.0)
    LR.fit(dummy, X_exp)

    # Boolean mask over design-matrix columns belonging to batch covariates.
    if len(batch_key) > 1:
        batch_index = np.logical_or.reduce(
            np.vstack([dummy.columns.str.startswith(x) for x in batch_key]))
    else:
        batch_index = np.vstack(
            [dummy.columns.str.startswith(x) for x in batch_key])[0]

    print('correcting batch...')  # fixed typo: was 'corrcting'
    dm = np.array(dummy)[:, batch_index]
    # Subtract the component of expression explained by the batch columns.
    X_explained = dm.dot(LR.coef_[:, batch_index].T)
    X_remain = X_exp - X_explained
    ndata = sc.AnnData(X_remain)
    # NOTE(review): obs/var are assigned from the input by reference —
    # confirm downstream code doesn't mutate them expecting independence.
    ndata.obs = adata.obs
    ndata.var = adata.var
    return ndata, X_explained
def calcular_leiden(array, res, subres, seed):
    """Leiden clustering with optional per-cluster sub-clustering.

    Clusters ``array`` (NaNs zero-filled) at resolution ``res``. If
    ``subres`` > 0, each cluster is re-clustered at resolution ``subres``
    and the sub-cluster ids are offset by a running counter so labels stay
    globally unique. Returns an integer label per row.
    """
    print('Calculando leiden')

    adata = sc.AnnData(X=np.nan_to_num(array))
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata, resolution=res, random_state=seed)

    if subres > 0:
        array_return = np.zeros(len(adata))
        # Cluster labels in first-seen order (dict preserves insertion order).
        clusters = list(dict.fromkeys(adata.obs['leiden'].values))
        n_clusters = 0  # running offset so sub-cluster ids never collide

        for cluster in clusters:
            index_cluster = np.argwhere(
                adata.obs['leiden'] == cluster).flatten()
            subadata = adata[index_cluster].copy()
            subadata.X = np.nan_to_num(subadata.X)
            sc.pp.neighbors(subadata)
            sc.tl.leiden(subadata, resolution=subres, random_state=seed)
            array_return[index_cluster] = subadata.obs['leiden'].values.astype(
                int) + n_clusters
            n_clusters += len(list(dict.fromkeys(subadata.obs['leiden'])))

        return array_return
    else:
        return adata.obs['leiden'].values.astype(int)
Example #17
0
def check_rep_results(func, X, *, fields=("layer", "obsm"), **kwargs):
    """Checks that the results of a computation add values/ mutate the anndata object in a consistent way.

    ``fields`` names the representations to exercise alongside ``X``
    (default: layer and obsm). Fix: the default is now a tuple — the old
    list default was a mutable default argument.
    """
    # Gen data: one template with every representation zeroed.
    empty_X = np.zeros(shape=X.shape, dtype=X.dtype)
    adata = sc.AnnData(
        X=empty_X.copy(),
        layers={"layer": empty_X.copy()},
        obsm={"obsm": empty_X.copy()},
    )

    adata_X = adata.copy()
    adata_X.X = X.copy()

    # One object per field, with X placed in that representation.
    adatas_proc = {}
    for field in fields:
        cur = adata.copy()
        sc.get._set_obs_rep(cur, X.copy(), **{field: field})
        adatas_proc[field] = cur

    # Apply function
    func(adata_X, **kwargs)
    for field in fields:
        func(adatas_proc[field], **{field: field}, **kwargs)

    # Reset X so only the side results (annotations) are compared.
    adata_X.X = empty_X.copy()
    for field in fields:
        sc.get._set_obs_rep(adatas_proc[field], empty_X.copy(),
                            **{field: field})

    # Every pair of processed objects must be identical.
    for field_a, field_b in permutations(fields, 2):
        assert_equal(adatas_proc[field_a], adatas_proc[field_b])
    for field in fields:
        assert_equal(adata_X, adatas_proc[field])
Example #18
0
def check_rep_mutation(func, X, *, fields=("layer", "obsm"), **kwargs):
    """Check that only the array meant to be modified is modified.

    ``func`` is run once per representation (X, then each of ``fields``);
    only the targeted representation may change. Fix: the default for
    ``fields`` is now a tuple — the old list was a mutable default argument.
    """
    adata = sc.AnnData(X=X.copy(), dtype=X.dtype)
    for field in fields:
        sc.get._set_obs_rep(adata, X, **{field: field})
    X_array = asarray(X)

    adata_X = func(adata, copy=True, **kwargs)
    adatas_proc = {
        field: func(adata, copy=True, **{field: field}, **kwargs)
        for field in fields
    }

    # Modified fields: each targeted representation matches the X result.
    for field in fields:
        result_array = asarray(
            sc.get._get_obs_rep(adatas_proc[field], **{field: field}))
        np.testing.assert_array_equal(asarray(adata_X.X), result_array)

    # Unmodified fields: everything not targeted still equals the input.
    for field in fields:
        np.testing.assert_array_equal(X_array, asarray(adatas_proc[field].X))
        np.testing.assert_array_equal(
            X_array, asarray(sc.get._get_obs_rep(adata_X, **{field: field})))
    for field_a, field_b in permutations(fields, 2):
        result_array = asarray(
            sc.get._get_obs_rep(adatas_proc[field_a], **{field_b: field_b}))
        np.testing.assert_array_equal(X_array, result_array)
Example #19
0
def data_process(dta, min_genes = 100, min_cells = 10, mt_pct = 10, npcs = 50, oversd = None):
    """Standard scRNA-seq preprocessing: QC filtering, normalization,
    log-transform, highly-variable-gene selection and PCA.

    Returns the processed AnnData (raw counts preserved in ``.raw``)."""
    adata = sc.AnnData(dta)
    adata.var_names_make_unique()

    # --- quality control -------------------------------------------------
    sc.pp.filter_cells(adata, min_genes = min_genes)
    sc.pp.filter_genes(adata, min_cells = min_cells)
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, inplace=True)

    # Optionally drop cells whose gene count exceeds mean + oversd * sd.
    if oversd is not None:
        gene_counts = adata.obs.n_genes_by_counts
        upper = np.mean(gene_counts) + oversd * np.std(gene_counts)
        adata = adata[gene_counts < upper, :]

    # Drop cells with too high a mitochondrial fraction.
    adata = adata[adata.obs.pct_counts_mt < mt_pct, :]

    # --- normalization ---------------------------------------------------
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # --- highly variable genes -------------------------------------------
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]

    # --- PCA -------------------------------------------------------------
    sc.tl.pca(adata, svd_solver='arpack', n_comps = npcs)

    return adata
Example #20
0
def visualize_trained_network_results(data_dict,
                                      z_dim=100,
                                      subsample=None,
                                      arch_style=1):
    """Restore a trained trVAE model and save UMAP plots of both the input
    data and its latent representation, colored by cell type.

    data_dict must carry 'name', 'metadata' and 'cell_type' keys and may
    carry 'spec_cell_types'. NOTE(review): ``metadata_path``,
    ``spec_cell_type`` and ``cell_types`` are read but not used below —
    confirm whether they belong to a longer routine.
    """
    plt.close("all")
    data_name = data_dict['name']
    metadata_path = data_dict['metadata']
    cell_type_key = data_dict['cell_type']
    spec_cell_type = data_dict.get("spec_cell_types", None)

    data = sc.read(
        f"../data/{data_name}/anna/processed_adata_Cusanovich_brain_May29_2019_5000.h5ad"
    )
    # Shift so the minimum value is zero (input may contain negatives).
    data.X += abs(data.X.min())
    if subsample is not None:
        data = data[:subsample]
    cell_types = data.obs[cell_type_key].unique().tolist()

    path_to_save = f"../results/VAE/{data_name}/{arch_style}-{z_dim}/Visualizations/"
    os.makedirs(path_to_save, exist_ok=True)
    sc.settings.figdir = os.path.abspath(path_to_save)

    train_data = data.copy()

    # Rebuild the network with matching architecture, then load weights.
    network = trvae.VAE(
        x_dimension=data.shape[1],
        z_dimension=z_dim,
        arch_style=arch_style,
        model_path=f"../models/VAE/{data_name}-{arch_style}/{z_dim}/",
    )

    network.restore_model()

    if sparse.issparse(data.X):
        data.X = data.X.A  # densify before feeding the network

    feed_data = data.X

    latent = network.to_z_latent(feed_data)

    latent = sc.AnnData(X=latent)
    latent.obs[cell_type_key] = data.obs[cell_type_key].values

    color = [cell_type_key]

    # UMAP of the original training data...
    sc.pp.neighbors(train_data)
    sc.tl.umap(train_data)
    sc.pl.umap(train_data,
               color=color,
               save=f'_{data_name}_train_data.pdf',
               show=False)

    # ...and of the latent embedding.
    sc.pp.neighbors(latent)
    sc.tl.umap(latent)
    sc.pl.umap(latent,
               color=color,
               save=f"_{data_name}_latent.pdf",
               show=False)

    plt.close("all")
Example #21
0
def test_linear_works():
    """Smoke test: parallel_runs completes on a linear-regression dataset
    whose target has been binned into three string-labelled classes."""
    X, y = make_regression(
        n_samples=1000,
        n_features=100,
        n_informative=10,
        n_targets=1,
    )

    # Bin the continuous target into three equal-frequency "day" groups.
    day_labels = pd.qcut(y, 3, labels=["0", "1", "2"])
    frame = pd.DataFrame({"day": day_labels})
    frame.index = frame.index.map(str)

    adata = sc.AnnData(X=X, obs=frame)

    _ = parallel_runs(
        adata,
        n_processes=4,
        n_bootstraps=32,
        X_noise=0.01,
        y_noise=0.5,
        alpha=0.9,
        lambda_path=np.geomspace(10, 0.01, num=10),
        target_col="day",
        target_map={"0": 0, "1": 1, "2": 2},
    )
Example #22
0
def run_pca(data, n_components=300, use_hvg=True):
    """Run PCA

    :param data: Dataframe of cells X genes (or an AnnData, used as-is).
        Typicaly multiscale space diffusion components
    :param n_components: Number of principal components used when HVG
        selection is off, and the fallback when the 85% variance target
        cannot be met
    :param use_hvg: If True, restrict PCA to highly variable genes and pick
        the smallest component count explaining >85% of variance
    :return: PCA projections of the data and the explained variance
    """
    # isinstance instead of `type(...) is` — also accepts AnnData subclasses.
    if isinstance(data, sc.AnnData):
        ad = data
    else:
        ad = sc.AnnData(data.values)

    # Run PCA
    if not use_hvg:
        n_comps = n_components
    else:
        # Probe with a large PCA, then keep just enough components to
        # explain 85% of the variance.
        sc.pp.pca(ad, n_comps=1000, use_highly_variable=True, zero_center=False)
        try:
            n_comps = np.where(np.cumsum(ad.uns['pca']['variance_ratio']) > 0.85)[0][0]
        except IndexError:
            # 85% never reached within 1000 components: use the default.
            n_comps = n_components

    # Rerun with selection number of components
    sc.pp.pca(ad, n_comps=n_comps, use_highly_variable=use_hvg, zero_center=False)

    # Return PCA projections if it is a dataframe
    pca_projections = pd.DataFrame(ad.obsm['X_pca'], index=ad.obs_names)
    return pca_projections, ad.uns['pca']['variance_ratio']
Example #23
0
def scanpy_first():
    """Load the human-brain counts CSV (transposed so rows become
    observations), run a standard scanpy pipeline (normalize, log1p, HVG,
    neighbors, leiden, UMAP) and plot the UMAP colored by cluster."""
    # results_file = 'scanpy_output\scanpy_output.h5ad'
    # file_path = '../dataset/scanpy_data/'
    # file_path = 'human_brain_output/scIGANs-brainTags.csv-src_label.txt-100-15-16-5.0-2.0.csv'
    file_path = '../dataset/human_brain/brainTags.csv'

    # label_path = '../dataset/pollen_labels.txt'
    # label_set = pd.read_table(label_path, header=None, index_col=False)
    # src_label = pd.Categorical(label_set.iloc[:, 1]).codes

    adata = sc.AnnData(pd.read_csv(file_path, header=0, index_col=0).transpose())

    # adata = sc.read_10x_mtx(file_path, var_names='gene_symbols', cache=True)
    # adata = ad.read_csv(file_path, first_column_names=True)
    # adata.var_names_make_unique()       # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

    # print('X:', adata.X, ' \ncells:', adata.obs, ' \ngenes:', adata.var)
    # print('cell name:', adata.obs_names, '\ngene name:', adata.var_names)
    # print(adata.obs.shape)
    # print(adata.var.shape)

    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    # sc.pl.highly_variable_genes(adata)
    # save the original (pre-downstream) data
    adata.raw = adata

    sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40)
    sc.tl.leiden(adata)
    sc.tl.umap(adata)
    sc.pl.umap(adata, color=['leiden'])
Example #24
0
def do_evaluation_atac_from_rna(
    spliced_net,
    sc_dual_full_dataset,
    gene_names: str,
    atac_names: str,
    outdir: str,
    ext: str,
    marker_genes: List[str],
    prefix: str = "",
):
    """Translate RNA profiles to ATAC predictions, write them to disk, and
    — when ground truth is available and ``ext`` is given — plot an AUROC
    of predictions vs. truth.

    NOTE(review): ``gene_names`` and ``marker_genes`` are unused in this
    function — confirm whether they belong to a wider evaluation routine.
    """
    ### RNA > ATAC
    logging.info("Inferring ATAC from RNA")
    sc_rna_atac_full_preds = spliced_net.translate_1_to_2(sc_dual_full_dataset)
    # Store predictions as CSR sparse before writing.
    sc_rna_atac_full_preds_anndata = sc.AnnData(
        scipy.sparse.csr_matrix(sc_rna_atac_full_preds),
        obs=sc_dual_full_dataset.dataset_x.data_raw.obs,
    )
    sc_rna_atac_full_preds_anndata.var_names = atac_names
    logging.info("Writing ATAC from RNA")
    # strip("_") drops the leading underscore when prefix is empty.
    sc_rna_atac_full_preds_anndata.write(
        os.path.join(outdir, f"{prefix}_rna_atac_adata.h5ad".strip("_"))
    )

    if hasattr(sc_dual_full_dataset.dataset_y, "data_raw") and ext is not None:
        logging.info("Plotting ATAC from RNA")
        plot_utils.plot_auroc(
            utils.ensure_arr(sc_dual_full_dataset.dataset_y.data_raw.X).flatten(),
            utils.ensure_arr(sc_rna_atac_full_preds).flatten(),
            title_prefix=f"{DATASET_NAME} RNA > ATAC".strip(),
            fname=os.path.join(outdir, f"{prefix}_rna_atac_auroc.{ext}".strip("_")),
        )
Example #25
0
def simulate_multiple_cell(path, data, model, z_dim, feature):
    """For each category of ``feature``, take one representative cell's
    latent vector, sweep every latent dimension across its observed range,
    and write the reconstructions per (cell, dim) to h5ad files.

    Fix: the latent seed is now copied per increment. Previously
    ``cell_latent = cell_one`` aliased the seed row, so every
    ``iloc`` assignment mutated it and later dimensions swept from a
    corrupted base holding each earlier dimension's last increment.
    """
    variable_names = data.var_names
    data_latent = model.to_latent(data.X)
    latent_df = pd.DataFrame(data_latent)
    latent_df[feature] = list(data.obs[feature])
    cells = list(set(data.obs[feature]))
    try:
        os.makedirs(path + "/gene_heatmaps/")
    except OSError:
        pass  # directory already exists
    x_dim = data.shape[1]

    for cell in cells:
        data_ast = latent_df[latent_df[feature] == cell]
        # First cell of this type. NOTE(review): only the first 5 latent
        # columns are selected — confirm z_dim <= 5 or widen the selection.
        cell_one = data_ast.iloc[[0], [0, 1, 2, 3, 4]]

        for dim in range(z_dim):
            # Observed range of this latent dimension, in 0.01 steps.
            increment_range = np.arange(min(data_latent[:, dim]),
                                        max(data_latent[:, dim]), 0.01)
            result_array = np.empty((0, x_dim))
            for inc in increment_range:
                # Copy so each reconstruction perturbs only `dim` of the
                # pristine seed (the old aliasing accumulated mutations).
                cell_latent = cell_one.copy()
                cell_latent.iloc[:, dim] = inc
                cell_recon = model.reconstruct(cell_latent)
                result_array = np.append(result_array, cell_recon, axis=0)

            result_adata = sc.AnnData(result_array,
                                      obs={"inc_vals": increment_range},
                                      var={"var_names": variable_names})
            result_adata.write(path + "/gene_heatmaps/" + str(cell) + "_" +
                               str(dim) + ".h5ad")
Example #26
0
def simulate_one_cell(path, data, cell, model, z_dim, feature):
    """Sweep each latent dimension of one representative cell of type
    ``cell`` across its observed range and write the reconstructions per
    dimension to h5ad files.

    Fix: the latent seed is copied per increment. Previously
    ``cell_latent = cell_one`` aliased the array, so ``cell_latent[:, dim]
    = inc`` mutated the seed in place and every later dimension swept from
    a base still holding the previous dimension's last increment.
    """
    variable_names = data.var_names
    data_latent = model.to_latent(data.X)
    try:
        os.makedirs(path + "/gene_heatmaps/")
    except OSError:
        pass  # directory already exists
    x_dim = data.shape[1]
    data_ast = data[data.obs[feature] == cell]
    # Latent representation of the first cell of the requested type.
    cell_one = data_ast[0, :].X
    cell_one = np.reshape(cell_one, (1, x_dim))
    cell_one = model.to_latent(cell_one)

    for dim in range(z_dim):
        # Observed range of this latent dimension, in 0.01 steps.
        increment_range = np.arange(min(data_latent[:, dim]),
                                    max(data_latent[:, dim]), 0.01)
        result_array = np.empty((0, x_dim))
        for inc in increment_range:
            cell_latent = cell_one.copy()  # perturb only this dimension
            cell_latent[:, dim] = inc
            cell_recon = model.reconstruct(cell_latent)
            result_array = np.append(result_array, cell_recon, axis=0)

        result_adata = sc.AnnData(result_array,
                                  obs={"inc_vals": increment_range},
                                  var={"var_names": variable_names})
        result_adata.write(path + "/gene_heatmaps/" + str(cell) + "_" +
                           str(dim) + ".h5ad")
Example #27
0
def test_rank_genes_groups_df():
    """sc.get.rank_genes_groups_df: pval/log2fc filters and the ``key``
    argument for results stored under a non-default uns key."""
    # 3 genes, 20 cells; gene0 is expressed only in the first 10 cells,
    # which form group "a".
    a = np.zeros((20, 3))
    a[:10, 0] = 5
    adata = sc.AnnData(
        a,
        obs=pd.DataFrame(
            {"celltype": list(chain(repeat("a", 10), repeat("b", 10)))},
            index=[f"cell{i}" for i in range(a.shape[0])]),
        var=pd.DataFrame(index=[f"gene{i}" for i in range(a.shape[1])]),
    )
    sc.tl.rank_genes_groups(adata, groupby="celltype", method="wilcoxon")
    dedf = sc.get.rank_genes_groups_df(adata, "a")
    assert dedf["pvals"].value_counts()[1.] == 2
    assert sc.get.rank_genes_groups_df(adata, "a", log2fc_max=.1).shape[0] == 2
    assert sc.get.rank_genes_groups_df(adata, "a", log2fc_min=.1).shape[0] == 1
    assert sc.get.rank_genes_groups_df(adata, "a",
                                       pval_cutoff=.9).shape[0] == 1
    # After deleting the default key, lookups without `key` must fail...
    del adata.uns["rank_genes_groups"]
    sc.tl.rank_genes_groups(adata,
                            groupby="celltype",
                            method="wilcoxon",
                            key_added="different_key")
    with pytest.raises(KeyError):
        sc.get.rank_genes_groups_df(adata, "a")
    # ...while the explicit key yields the same frame as before.
    dedf2 = sc.get.rank_genes_groups_df(adata, "a", key="different_key")
    pd.testing.assert_frame_equal(dedf, dedf2)
Example #28
0
def impute_neighbor(bdata, n_neighbor=10):
    """Smooth raw expression by averaging each cell with its nearest
    neighbors in UMAP space.

    Processes cells in chunks of 10k to bound memory and returns a new
    AnnData with the imputed matrix and copies of obs/var/obsm/uns.

    Fix: the inner comprehension variable (was ``i``) shadowed the chunk
    loop variable ``i``; renamed for clarity. The unused KDTree import
    was dropped.
    """
    from scipy.spatial import cKDTree
    import multiprocessing as mp
    import scipy

    n_jobs = mp.cpu_count()

    # Neighborhood structure in UMAP space (each cell is its own neighbor 0).
    ckd = cKDTree(bdata.obsm["X_umap"])
    ckdout = ckd.query(x=bdata.obsm["X_umap"], k=n_neighbor, n_jobs=n_jobs)
    indices = ckdout[1]

    sum_list = []
    n_cells = bdata.raw.X.shape[0]
    for start in range(0, n_cells, 10000):
        end = min(start + 10000, n_cells)
        # One matrix per neighbor rank, rows aligned with this chunk.
        X_list = [
            bdata.raw.X[indices[start:end, k]] for k in range(n_neighbor)
        ]
        # Element-wise sum of the neighbor matrices, averaged.
        X_sum = scipy.sparse.csr_matrix(np.sum(X_list) / n_neighbor)
        sum_list.append(X_sum)
        print(start)

    imputed = scipy.sparse.vstack(sum_list)
    idata = sc.AnnData(imputed)
    idata.obs = bdata.obs.copy()
    idata.var = bdata.raw.var.copy()
    idata.obsm = bdata.obsm.copy()
    idata.uns = bdata.uns.copy()

    return idata
Example #29
0
def blobs(n_variables=11, n_centers=5, cluster_std=1.0, n_observations=640):
    """Gaussian Blobs.

    Parameters
    ----------
    n_variables : `int`, optional (default: 11)
        Dimension of feature space.
    n_centers : `int`, optional (default: 5)
        Number of cluster centers.
    cluster_std : `float`, optional (default: 1.0)
        Standard deviation of clusters.
    n_observations : `int`, optional (default: 640)
        Number of observations. By default, this is the same observation number as in
        ``sc.datasets.krumsiek11()``.

    Returns
    -------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix containing a observation annotation 'blobs' that
        indicates cluster identity.
    """
    from sklearn.datasets import make_blobs

    # Fixed random_state keeps the dataset reproducible across calls.
    features, labels = make_blobs(
        n_samples=n_observations,
        n_features=n_variables,
        centers=n_centers,
        cluster_std=cluster_std,
        random_state=0,
    )
    return sc.AnnData(features, obs={'blobs': labels.astype(str)})
Example #30
0
def test_logistic_works():
    """Smoke test: parallel_runs completes on a regression target squashed
    into (0, 1) and binarized into two string-labelled "day" classes."""
    X, y, coef = make_regression(n_samples=1000,
                                 n_features=10,
                                 n_informative=1,
                                 n_targets=1,
                                 coef=True)
    y = expit(y / 100)  # squash into the open interval (0, 1)

    # Two equal-frequency "day" bins with string labels.
    frame = pd.DataFrame({"day": y})
    frame["day"] = pd.qcut(frame["day"], 2, labels=["0", "1"])
    frame.index = frame.index.map(str)

    gene_frame = pd.DataFrame({"Gene name": [str(i) for i in range(X.shape[1])]})
    gene_frame.index = gene_frame.index.astype(str)

    adata = sc.AnnData(X=X, obs=frame, var=gene_frame)

    _ = parallel_runs(
        adata,
        n_processes=4,
        n_bootstraps=32,
        X_noise=0.01,
        alpha=0.9,
        lambda_path=np.geomspace(10, 0.01, num=10),
        target_col="day",
    )