Exemple #1
0
 def __init__(
     self,
     save_path: str = "data/",
     save_path_10X: str = None,
     remove_extracted_data: bool = False,
     delayed_populating: bool = False,
 ):
     """Set up the PBMC dataset and register its metadata files.

     :param save_path: Directory handed to the parent initializer for the
         metadata files (``gene_info_pbmc.csv``, ``pbmc_metadata.pickle``).
     :param save_path_10X: Directory used for the underlying 10X datasets;
         falls back to ``save_path`` when ``None``.
     :param remove_extracted_data: Stored on the instance for later use.
     :param delayed_populating: Forwarded to the parent initializer. When
         true, the ``pbmc8k`` and ``pbmc4k`` 10X datasets are also
         instantiated in delayed mode.
     """
     if save_path_10X is None:
         save_path_10X = save_path
     self.save_path_10X = save_path_10X
     self.remove_extracted_data = remove_extracted_data
     self.barcodes = None

     metadata_urls = [
         "https://github.com/YosefLab/scVI-data/raw/master/gene_info.csv",
         "https://github.com/YosefLab/scVI-data/raw/master/pbmc_metadata.pickle",
     ]
     metadata_filenames = ["gene_info_pbmc.csv", "pbmc_metadata.pickle"]
     super().__init__(
         urls=metadata_urls,
         filenames=metadata_filenames,
         save_path=save_path,
         delayed_populating=delayed_populating,
     )

     if not delayed_populating:
         return

     # Instantiating the 10X datasets in delayed mode downloads the files
     # needed by a future call to populate.
     for name_10x in ("pbmc8k", "pbmc4k"):
         Dataset10X(
             name_10x,
             save_path=self.save_path_10X,
             delayed_populating=True,
         )
Exemple #2
0
    def populate(self):
        """Load the pbmc8k/pbmc4k 10X datasets and attach the PBMC metadata.

        Reads ``gene_info_pbmc.csv`` and ``pbmc_metadata.pickle`` from
        ``self.save_path``, merges the two 10X datasets, restricts the cells
        to the barcodes listed in the metadata, restricts the genes to those
        listed in the ``ENSG`` column of the gene-info table, and stores
        labels, cell types, design and QC tables on the instance.
        """
        self.de_metadata = pd.read_csv(
            os.path.join(self.save_path, "gene_info_pbmc.csv"), sep=","
        )
        # NOTE(security): unpickling can execute arbitrary code; only load
        # this file when it comes from a trusted source.
        metadata_path = os.path.join(self.save_path, "pbmc_metadata.pickle")
        with open(metadata_path, "rb") as metadata_file:
            pbmc_metadata = pickle.load(metadata_file)

        datasets = [
            Dataset10X(
                dataset_name,
                save_path=self.save_path_10X,
                remove_extracted_data=self.remove_extracted_data,
                measurement_names_column=0,
            )
            for dataset_name in ("pbmc8k", "pbmc4k")
        ]
        self.populate_from_datasets(datasets)

        # filter cells according to barcodes
        dict_barcodes = dict(zip(self.barcodes, np.arange(len(self.barcodes))))
        # Builtin ``str``/``bool`` replace the ``np.str``/``np.bool`` aliases,
        # which no longer exist in recent NumPy releases.
        barcodes_metadata = (
            pbmc_metadata["barcodes"].index.values.ravel().astype(str)
        )
        # barcodes with end -11 filtered on 10X website (49 cells)
        subset_cells = [
            dict_barcodes[barcode]
            for barcode in barcodes_metadata
            if barcode in dict_barcodes
        ]
        self.update_cells(subset_cells=np.asarray(subset_cells))

        idx_metadata = np.asarray(
            [not barcode.endswith("11") for barcode in barcodes_metadata],
            dtype=bool,
        )
        labels = pbmc_metadata["clusters"][idx_metadata].reshape(-1, 1)[: len(self)]
        self.labels, self.n_labels = remap_categories(labels)
        self.cell_types = pbmc_metadata["list_clusters"][: self.n_labels]

        # only keep the genes for which we have de data; genes missing from
        # the dataset are dropped (happens only in unit tests)
        available_genes = set(self.gene_names)
        genes_to_keep = [
            gene
            for gene in self.de_metadata["ENSG"].values
            if gene in available_genes
        ]
        self.filter_genes_by_attribute(genes_to_keep)
        # this would only affect the unit tests
        self.de_metadata = self.de_metadata.head(len(genes_to_keep))

        self.design = pbmc_metadata["design"][idx_metadata]
        self.raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
        self.qc_names = self.raw_qc.columns
        self.qc = self.raw_qc.values

        self.qc_pc = pbmc_metadata["qc_pc"][idx_metadata]
        self.normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]
Exemple #3
0
 def test_populate_and_train_one_v1(self):
     """Build the ``cd4_t_helper`` 10X dataset and hand it to the one-epoch helper."""
     cd4_dataset = Dataset10X(
         dataset_name="cd4_t_helper",
         save_path="tests/data/10X",
         remove_extracted_data=True,
     )
     unsupervised_training_one_epoch(cd4_dataset)
Exemple #4
0
 def populate(self):
     """Load every dataset in ``self.dataset_names`` and merge them.

     Each 10X dataset gets a ``labels`` attribute mapped to a single-entry
     ``cell_types`` array holding the dataset's own name; the datasets are
     then passed together to ``populate_from_datasets``.
     """
     loaded = []
     for name in self.dataset_names:
         current = Dataset10X(
             name,
             save_path=self.save_path,
             remove_extracted_data=self.remove_extracted_data,
         )
         # One cell type per source dataset, named after that dataset.
         cell_type_names = np.asarray([name], dtype="<U128")
         current.initialize_mapped_attribute(
             "labels", "cell_types", cell_type_names
         )
         loaded.append(current)
     self.populate_from_datasets(loaded)
Exemple #5
0
    def __init__(
        self,
        save_path: str = "data/",
        subset_datasets: Union[List[int], np.ndarray] = None,
        remove_extracted_data: bool = False,
        delayed_populating: bool = False,
    ):
        """Select the purified PBMC sub-datasets and initialize the parent.

        :param save_path: Directory forwarded to the parent initializer and
            to each ``Dataset10X``.
        :param subset_datasets: Indices into the list of dataset names.
            ``None`` or an empty selection keeps every dataset.
        :param remove_extracted_data: Stored on the instance for later use.
        :param delayed_populating: Forwarded to the parent initializer. When
            true, each selected ``Dataset10X`` is also instantiated in
            delayed mode.
        """
        # NOTE(review): "cd4_t_helper" appears twice (positions 0 and 7).
        # Left untouched because removing it would shift the indices that
        # callers pass through ``subset_datasets`` — confirm whether the
        # second entry should be a different dataset.
        self.dataset_names = np.asarray([
            "cd4_t_helper",
            "regulatory_t",
            "naive_t",
            "memory_t",
            "cytotoxic_t",
            "naive_cytotoxic",
            "b_cells",
            "cd4_t_helper",
            "cd34",
            "cd56_nk",
            "cd14_monocytes",
        ])
        # A plain truthiness test is wrong for numpy arrays: it raises for
        # arrays with several elements and treats ``np.array([0])`` as empty.
        # Arrays are therefore checked by size; everything else keeps the
        # usual "falsy means all datasets" rule.
        if isinstance(subset_datasets, np.ndarray):
            select_all = subset_datasets.size == 0
        else:
            select_all = not subset_datasets
        if select_all:
            subset_datasets = slice(None)
        self.remove_extracted_data = remove_extracted_data
        self.dataset_names = self.dataset_names[subset_datasets]
        super().__init__(save_path=save_path,
                         delayed_populating=delayed_populating)

        if delayed_populating:
            for dataset_name in self.dataset_names:
                Dataset10X(
                    dataset_name,
                    save_path=save_path,
                    delayed_populating=delayed_populating,
                )

        self.filter_genes_by_count()
        self.filter_cells_by_count()
    """
    bayes_f = np.log(bayes_f + 1e-8) - np.log(1 - bayes_f + 1e-8)
    auc_1 = roc_auc_score(np.abs(log_fold_change) >= 0.6, np.abs(bayes_f))
    auc_2 = roc_auc_score(np.abs(log_fold_change) >= 0.8, np.abs(bayes_f))
    spear = spearmanr(bayes_f, log_fold_change)[0]
    kend = kendalltau(bayes_f, log_fold_change)[0]
    return auc_1, auc_2, spear, kend


# Script section: build a combined PBMC + 68k-donor dataset and look up the
# label indices of two cell populations.
save_path = "../symsim_scVI/symsim_result/DE/"

pbmc = PbmcDataset()
de_data  = pbmc.de_metadata
# Pass a boolean mask selecting the cells whose batch index equals 0
# (presumably update_cells keeps only those cells — confirm against its definition).
pbmc.update_cells(pbmc.batch_indices.ravel()==0)

donor = Dataset10X('fresh_68k_pbmc_donor_a')
# Use gene symbols as gene names for the donor dataset.
donor.gene_names = donor.gene_symbols
# Give every donor cell the same label 0, shaped as a (n_cells, 1) column,
# with a single placeholder cell type.
donor.labels = np.repeat(0,len(donor)).reshape(len(donor),1)
donor.cell_types = ['unlabelled']
# NOTE(review): called with the full gene count, so presumably no gene is
# dropped here — confirm what side effects subsample_genes has.
donor.subsample_genes(donor.nb_genes)

gene_dataset = GeneExpressionDataset.concat_datasets(pbmc, donor)


################## Generate Mis-labels
######################################################################################
# Flat copy of the labels of the combined dataset.
labels = np.asarray(gene_dataset.labels.ravel())
# Alternative population pair, currently disabled:
# pop1 = np.where(gene_dataset.cell_types=='B cells')[0][0]
# pop2 = np.where(gene_dataset.cell_types=='Dendritic Cells')[0][0]
# Position of the first matching entry in cell_types for each population;
# the [0][0] lookup raises IndexError if the name is absent.
pop1 = np.where(gene_dataset.cell_types=='CD4 T cells')[0][0]
pop2 = np.where(gene_dataset.cell_types=='CD8 T cells')[0][0]