def zeisel():
    """Prepare the Zeisel dataset.

    Cell types in the mouse cortex and hippocampus revealed by single-cell
    RNA-seq by Zeisel, et al. in Science. 2015.

    The tab-separated source file is read twice: once skipping file rows
    1-10 to obtain the expression table, and once limited to the first 10
    rows to obtain the per-cell annotations. The resulting AnnData is
    cluster-processed on the "group" column, written to
    ``data/zeisel/zeisel.h5ad``, and random folds are saved to
    ``output/zeisel_folds.npz``.
    """
    source_path = "data/zeisel/expression_mRNA_17-Aug-2014.txt"

    # Expression table: rows 1-10 of the file hold annotations and are
    # skipped here; the table is transposed after loading.
    df = pd.read_csv(
        source_path,
        sep="\t",
        header=0,
        index_col=0,
        skiprows=list(range(1, 11)),
    ).T
    # The first row of the transposed table is dropped from both the values
    # and the observation names.
    adata = AnnData(df.values[1:, :])
    adata.obs_names = df.index[1:]
    adata.var_names = df.columns

    # Annotation table: only the first 10 data rows, indexed by the second
    # column, then transposed.
    anndf = pd.read_csv(
        source_path,
        sep="\t",
        header=0,
        index_col=1,
        nrows=10,
    ).T
    # Drop the first row and the last column of the transposed annotations.
    annotations = anndf.iloc[1:, :-1]

    # obs column name -> annotation column name, in the original order.
    obs_columns = {
        "group": "group #",
        "sex": "sex",
        "tot mRNA": "total mRNA mol",
        "age": "age",
        "diameter": "diameter",
    }
    for obs_key, annotation_key in obs_columns.items():
        adata.obs[obs_key] = annotations[annotation_key]

    pr.read.process_clusts(adata, "group")
    sc.write("data/zeisel/zeisel.h5ad", adata)

    fold_tester = pr.performance.FoldTester(adata)
    fold_tester.makefolds(random=True)
    fold_tester.savefolds("output/zeisel_folds.npz")
def green():
    """Prepare the Green dataset.

    A Comprehensive Roadmap of Murine Spermatogenesis Defined by Single-Cell
    RNA-Seq by Green et al. in Developmental Cell. 2018.

    Loads the merged expression table, transposes it, stores the matrix in
    CSC sparse form, left-joins the per-cell attribute table onto ``obs`` by
    cell barcode, writes ``data/green/green.h5ad``, processes clusters on the
    "CellType" column and saves random folds to ``output/green_folds.npz``.
    """
    adata = sc.read_csv(
        "data/green/GSE112393_MergedAdultMouseST25_DGE.txt.gz",
        delimiter="\t",
    ).T
    adata.X = scipy.sparse.csc_matrix(adata.X)

    # The attribute file starts with three lines that are skipped before the
    # header row.
    cell_attributes = pd.read_csv(
        "data/green/GSE112393_MergedAdultMouseST25_PerCellAttributes.txt.gz",
        sep="\t",
        skiprows=3,
    ).set_index("#CellBarcode")

    # validate="1:1" makes pandas reject duplicated barcodes on either side.
    adata.obs = adata.obs.merge(
        cell_attributes,
        how="left",
        left_index=True,
        right_index=True,
        validate="1:1",
    )

    # NOTE(review): the .h5ad is written before cluster processing here --
    # confirm that saving the unprocessed object is intended.
    sc.write("data/green/green.h5ad", adata)

    # process_clusts is reached through the `read` namespace; `performance`
    # is used only for FoldTester.
    pr.read.process_clusts(adata, "CellType")

    fold_tester = pr.performance.FoldTester(adata)
    fold_tester.makefolds(random=True)
    fold_tester.savefolds("output/green_folds.npz")
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"):
    """Run the PCA-space prediction baseline for one held-out cell type.

    A 100-component PCA is fitted on every cell except the stimulated cells
    of ``cell_type``. Control and stimulated training cells, and the control
    cells of ``cell_type``, are projected with that PCA and handed to
    ``predict``. Control, real stimulated and predicted cells are then
    stacked into one AnnData (labelled "ctrl", "real_stim", "pred_stim" in
    ``obs["condition"]``) and written to disk.

    Args:
        data_name: Dataset to load from ``../data/train_{data_name}.h5ad``;
            one of "pbmc", "hpoly" or "salmonella".
        cell_type: Cell type whose stimulated cells are held out.
        p_type: "unbiased" balances the training subsets with
            ``scgen.util.balancer`` and writes ``PCA_CD4T.h5ad``; any other
            value skips balancing and writes ``PCA_CD4T_biased.h5ad``. The
            value is also forwarded to ``predict``.

    Raises:
        ValueError: If ``data_name`` is not a known dataset.
    """
    # (control label, stimulated label, cell-type column) per dataset.
    dataset_keys = {
        "pbmc": ("control", "stimulated", "cell_type"),
        "hpoly": ("Control", "Hpoly.Day10", "cell_label"),
        "salmonella": ("Control", "Salmonella", "cell_label"),
    }
    # Fail fast, before any I/O, instead of hitting an unbound name later.
    if data_name not in dataset_keys:
        raise ValueError(
            f"unknown data_name {data_name!r}; "
            f"expected one of {sorted(dataset_keys)}")
    ctrl_key, stim_key, cell_type_key = dataset_keys[data_name]

    import scipy.sparse as sparse

    def to_dense(matrix):
        """Return `matrix` as a dense array, converting only when sparse."""
        return matrix.toarray() if sparse.issparse(matrix) else matrix

    train_path = f"../data/train_{data_name}.h5ad"
    data = sc.read(train_path)
    print("data has been loaded!")

    held_out = ((data.obs["condition"] == stim_key) &
                (data.obs[cell_type_key] == cell_type))
    train_data = data[~held_out]

    pca = PCA(n_components=100)
    pca.fit(to_dense(train_data.X))

    # Use the dataset-specific condition labels rather than the pbmc ones so
    # that the "hpoly" and "salmonella" datasets select the right cells.
    # NOTE(review): balancer is called with its defaults -- confirm it copes
    # with datasets whose cell-type column is not "cell_type".
    train_real_cd = train_data[train_data.obs["condition"] == ctrl_key, :]
    if p_type == "unbiased":
        train_real_cd = scgen.util.balancer(train_real_cd)
    train_real_stimulated = train_data[
        train_data.obs["condition"] == stim_key, :]
    if p_type == "unbiased":
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)

    train_real_stimulated_PCA = pca.transform(
        to_dense(train_real_stimulated.X))
    train_real_cd_PCA = pca.transform(to_dense(train_real_cd.X))

    adata_list = scgen.util.extractor(data, cell_type, {
        "ctrl": ctrl_key,
        "stim": stim_key
    })
    # Entries 1 and 2 are used as the control / stimulated cells of
    # `cell_type`; entry 3 supplies the variable names.
    ctrl_x = to_dense(adata_list[1].X)
    stim_x = to_dense(adata_list[2].X)

    ctrl_PCA = pca.transform(ctrl_x)
    predicted_cells = predict(pca, train_real_cd_PCA,
                              train_real_stimulated_PCA, ctrl_PCA, p_type)

    all_Data = sc.AnnData(np.concatenate([ctrl_x, stim_x, predicted_cells]))
    all_Data.obs["condition"] = (["ctrl"] * len(ctrl_x) +
                                 ["real_stim"] * len(stim_x) +
                                 ["pred_stim"] * len(predicted_cells))
    all_Data.var_names = adata_list[3].var_names

    # The output file name is fixed and does not depend on `cell_type`.
    if p_type == "unbiased":
        sc.write("../data/reconstructed/PCAVecArithm/PCA_CD4T.h5ad", all_Data)
    else:
        sc.write("../data/reconstructed/PCAVecArithm/PCA_CD4T_biased.h5ad",
                 all_Data)
def reconstruct():
    """Run the PCA-space prediction baseline for every pbmc cell type.

    For each cell type in ``../data/train_pbmc.h5ad`` a 100-component PCA is
    fitted on all cells except that type's stimulated cells. Balanced
    stimulated (excluding the type) and balanced control cells are projected
    and passed to ``predict`` together with the projected control cells of
    the type. Control, real stimulated and predicted cells are stacked,
    labelled ``{cell_type}_ctrl`` / ``_real_stim`` / ``_pred_stim``, and the
    per-type results are concatenated and written to
    ``../data/reconstructed/PCAVecArithm/PCA_pbmc.h5ad``.
    """

    def to_dense(matrix):
        """Return `matrix` as a dense array, converting only when sparse."""
        return matrix.toarray() if sparse.issparse(matrix) else matrix

    train_path = "../data/train_pbmc.h5ad"
    ctrl_key = "control"
    stim_key = "stimulated"

    data = sc.read(train_path)
    cell_types = data.obs["cell_type"].unique().tolist()
    print(cell_types)

    # Condition masks do not depend on the cell type, so build them once.
    is_stim = data.obs["condition"] == stim_key
    is_ctrl = data.obs["condition"] == ctrl_key

    # Stays an empty AnnData if the dataset has no cell types at all.
    all_data = anndata.AnnData()
    for idx, cell_type in enumerate(cell_types):
        is_target = data.obs["cell_type"] == cell_type

        pca = PCA(n_components=100)
        # Densify only when needed so dense inputs work as well.
        pca.fit(to_dense(data[~(is_stim & is_target)].X))
        print(cell_type, end="\t")

        # Balance stimulated before control, keeping the original call order.
        train_real_stimulated = data[is_stim, :]
        train_real_stimulated = train_real_stimulated[
            train_real_stimulated.obs["cell_type"] != cell_type]
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)
        train_real_stimulated_PCA = pca.transform(
            to_dense(train_real_stimulated.X))

        train_real_cd = scgen.util.balancer(data[is_ctrl, :])
        train_real_cd_PCA = pca.transform(to_dense(train_real_cd.X))

        cell_type_adata = data[is_target]
        cell_type_ctrl = cell_type_adata[
            cell_type_adata.obs["condition"] == ctrl_key]
        cell_type_stim = cell_type_adata[
            cell_type_adata.obs["condition"] == stim_key]
        ctrl_x = to_dense(cell_type_ctrl.X)
        stim_x = to_dense(cell_type_stim.X)

        predicted_cells = predict(pca, train_real_cd_PCA,
                                  train_real_stimulated_PCA,
                                  pca.transform(ctrl_x))

        n_ctrl = cell_type_ctrl.shape[0]
        n_stim = cell_type_stim.shape[0]
        n_pred = len(predicted_cells)

        all_Data = sc.AnnData(
            np.concatenate([ctrl_x, stim_x, predicted_cells]))
        all_Data.obs["condition"] = (
            [f"{cell_type}_ctrl"] * n_ctrl +
            [f"{cell_type}_real_stim"] * n_stim +
            [f"{cell_type}_pred_stim"] * n_pred)
        all_Data.obs["cell_type"] = [f"{cell_type}"] * (n_ctrl + n_stim +
                                                        n_pred)
        all_Data.var_names = cell_type_adata.var_names

        # The first result replaces the empty placeholder; later ones append.
        if idx == 0:
            all_data = all_Data
        else:
            all_data = all_data.concatenate(all_Data)
        print(cell_type)

    sc.write("../data/reconstructed/PCAVecArithm/PCA_pbmc.h5ad", all_data)
Esempio n. 5
0
    def predict(self,
                adata,
                colnames=None,
                dimreduce=True,
                reconstruct=True,
                error=True):
        """Run the encoder, the full model and the 'mean_norm' model on adata.

        Args:
            adata: Object providing ``X``, ``obs.size_factors``,
                ``var_names.values`` and ``obs_names.values``.
            colnames: Column labels for the written matrices; defaults to
                ``adata.var_names.values``.
            dimreduce: Accepted but not read in this method body.
            reconstruct: Accepted but not read in this method body.
            error: Accepted but not read in this method body.

        Returns:
            dict with the keys ``'reduced'``, ``'mean'`` and ``'mean_norm'``
            holding the respective ``predict`` outputs.

        When ``self.file_path`` is truthy the three matrices are also written
        as TSV files into that directory, ``sc.settings.writedir`` is pointed
        at it, and ``adata`` is saved through ``sc.write('output', adata)``.
        """
        if colnames is None:
            colnames = adata.var_names.values
        rownames = adata.obs_names.values

        print('Calculating low dimensional representations...')
        reduced = self.encoder.predict({
            'count': adata.X,
            'size_factors': adata.obs.size_factors,
        })

        print('Calculating reconstructions...')
        mean = self.model.predict({
            'count': adata.X,
            'size_factors': adata.obs.size_factors,
        })
        mean_norm = self.extra_models['mean_norm'].predict(adata.X)

        res = {'reduced': reduced, 'mean': mean, 'mean_norm': mean_norm}

        # Nothing to persist without an output directory.
        out_dir = self.file_path
        if not out_dir:
            return res

        print('Saving files...')
        os.makedirs(out_dir, exist_ok=True)

        reduced_path = os.path.join(out_dir, 'reduced.tsv')
        write_text_matrix(res['reduced'],
                          reduced_path,
                          rownames=rownames,
                          transpose=False)

        mean_path = os.path.join(out_dir, 'mean.tsv')
        write_text_matrix(res['mean'],
                          mean_path,
                          rownames=rownames,
                          colnames=colnames,
                          transpose=True)

        # Redirects scanpy's global write directory before saving adata.
        sc.settings.writedir = out_dir + '/'
        sc.write('output', adata)

        mean_norm_path = os.path.join(out_dir, 'mean_norm.tsv')
        write_text_matrix(res['mean_norm'],
                          mean_norm_path,
                          rownames=rownames,
                          colnames=colnames,
                          transpose=True)

        return res
def zheng():
    """Prepare the Zheng dataset.

    Massively parallel digital transcriptional profiling of single cells. by
    Zheng GX, et al. in Nature Communications. 2017.

    Reads the 10x matrix directory, attaches the bulk labels from the
    header-less label file as ``obs["bulk_labels"]``, processes clusters on
    that column, writes the result as .h5ad and saves random folds.
    """
    adata = sc.read_10x_mtx("data/zheng/filtered_matrices_mex/hg19/")

    # The label file has no header row; its values are assigned as-is.
    label_table = pd.read_csv("data/zheng/zheng17_bulk_lables.txt",
                              header=None)
    adata.obs["bulk_labels"] = label_table.values

    pr.read.process_clusts(adata, "bulk_labels")
    sc.write("data/zheng/fresh_68k_bulk_labels.h5ad", adata)

    fold_tester = pr.performance.FoldTester(adata)
    fold_tester.makefolds(random=True)
    fold_tester.savefolds("output/zheng_folds.npz")
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"):
    """Run the gene-space vector-arithmetic baseline for one cell type.

    Control cells and stimulated cells (the latter excluding ``cell_type``)
    form the training subsets handed to ``predict`` together with the control
    cells of ``cell_type``. Control, real stimulated and predicted cells are
    then stacked into one AnnData (labelled "ctrl", "real_stim", "pred_stim"
    in ``obs["condition"]``) and written to disk.

    Args:
        data_name: Dataset to load from ``../data/train_{data_name}.h5ad``;
            one of "pbmc", "hpoly" or "salmonella".
        cell_type: Cell type to predict; its stimulated cells are kept out of
            the stimulated training subset.
        p_type: "unbiased" balances the training subsets with
            ``scgen.util.balancer`` and writes ``VecArithm_CD4T.h5ad``; any
            other value skips balancing and writes
            ``VecArithm_CD4T_biased.h5ad``.

    Raises:
        ValueError: If ``data_name`` is not a known dataset.
    """
    # (control label, stimulated label, cell-type column) per dataset.
    dataset_keys = {
        "pbmc": ("control", "stimulated", "cell_type"),
        "hpoly": ("Control", "Hpoly.Day10", "cell_label"),
        "salmonella": ("Control", "Salmonella", "cell_label"),
    }
    # Fail fast, before any I/O, instead of hitting an unbound name later.
    if data_name not in dataset_keys:
        raise ValueError(
            f"unknown data_name {data_name!r}; "
            f"expected one of {sorted(dataset_keys)}")
    ctrl_key, stim_key, cell_type_key = dataset_keys[data_name]

    import scipy.sparse as sparse

    def to_dense(matrix):
        """Return `matrix` as a dense array, converting only when sparse."""
        return matrix.toarray() if sparse.issparse(matrix) else matrix

    train_path = f"../data/train_{data_name}.h5ad"
    data = sc.read(train_path)
    print("data has been loaded!")

    is_ctrl = data.obs["condition"] == ctrl_key
    is_stim = data.obs["condition"] == stim_key
    is_target = data.obs[cell_type_key] == cell_type

    ctrl_cell = data[is_ctrl & is_target]
    stim_cell = data[is_stim & is_target]

    # Use the dataset-specific labels / column and the requested cell type
    # rather than the pbmc + "CD4T" literals, so non-default arguments select
    # the intended cells.
    # NOTE(review): balancer is called with its defaults -- confirm it copes
    # with datasets whose cell-type column is not "cell_type".
    train_real_cd = data[is_ctrl, :]
    if p_type == "unbiased":
        train_real_cd = scgen.util.balancer(train_real_cd)
    train_real_stimulated = data[is_stim, :]
    train_real_stimulated = train_real_stimulated[
        train_real_stimulated.obs[cell_type_key] != cell_type]
    if p_type == "unbiased":
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)

    # Each matrix is checked on its own instead of inferring sparsity of one
    # from another.
    ctrl_x = to_dense(ctrl_cell.X)
    stim_x = to_dense(stim_cell.X)
    predicted_cells = predict(to_dense(train_real_cd.X),
                              to_dense(train_real_stimulated.X), ctrl_x)

    print("Prediction has been finished")
    all_Data = sc.AnnData(np.concatenate([ctrl_x, stim_x, predicted_cells]))
    all_Data.obs["condition"] = (["ctrl"] * ctrl_cell.shape[0] +
                                 ["real_stim"] * stim_cell.shape[0] +
                                 ["pred_stim"] * len(predicted_cells))
    all_Data.var_names = ctrl_cell.var_names

    # The output file name is fixed and does not depend on `cell_type`.
    if p_type == "unbiased":
        sc.write("../data/reconstructed/VecArithm/VecArithm_CD4T.h5ad",
                 all_Data)
    else:
        sc.write("../data/reconstructed/VecArithm/VecArithm_CD4T_biased.h5ad",
                 all_Data)
Esempio n. 8
0
def write(adata, version, name):
    """Write `adata` to ``version + name`` and print an assignment-style line.

    The printed line has the form ``<identifier> = '<target>'`` where the
    identifier is the target name with every "." replaced by "_".
    """
    target = version + name
    sc.write(target, adata)
    identifier = target.replace(".", "_")
    print(f"{identifier} = '{target}'")
Esempio n. 9
0
import scanpy.api as sc
import scipy.sparse as sp_sparse

# Script: load an .h5ad file, make variable names unique, densify X if it is
# sparse, keep the first 100 observations and write them back out.
#
# Alternative input, currently disabled:
# andata = sc.read_h5ad("./ExprMatrix.h5ad")
# NOTE(review): the active input path and the output path below point at the
# same file in the working directory -- confirm that overwriting is intended.
andata = sc.read_h5ad("./100_test_data.h5ad")
print("Finished reading.")
andata.var_names_make_unique()
if sp_sparse.issparse(andata.X):
    # Replace the sparse matrix with its dense array form.
    andata.X = andata.X.toarray()
# Slice out the first 100 observations across all variables.
partial_data = andata[:100, :]
print("Finished processing")
sc.write("100_test_data.h5ad", partial_data)
print("Finished writing.")