Esempio n. 1
0
    def __init__(self,
                 filename,
                 save_path='data/',
                 type='filtered',
                 dense=False,
                 remote=True):
        """Resolve where the 10X dataset lives, then download/load it.

        With ``remote=True`` the archive URL is derived from the ``to_groups``
        lookup table; otherwise ``filename`` must already be a directory under
        ``save_path``.
        """
        self.remote = remote
        self.save_path = save_path
        if not self.remote:
            local_dir = os.path.join(self.save_path, filename)
            try:
                assert os.path.isdir(local_dir)
            except AssertionError:
                # Re-raise after telling the user which path was checked.
                print("The file %s was not found in the location you gave"
                      % filename)
                raise
            self.save_path = local_dir
        else:
            dataset_group = to_groups[filename]
            self.url = ("http://cf.10xgenomics.com/samples/cell-exp/"
                        "%s/%s/%s_%s_gene_bc_matrices.tar.gz"
                        % (dataset_group, filename, filename, type))
            self.save_path = os.path.join(save_path, '10X/%s/' % filename)
            self.save_name = '%s_gene_bc_matrices' % type
            self.download_name = self.save_name + '.tar.gz'

        self.dense = dense

        expression_data, gene_names = self.download_and_preprocess()
        attributes = GeneExpressionDataset.get_attributes_from_matrix(
            expression_data)
        super(Dataset10X, self).__init__(*attributes, gene_names=gene_names)
Esempio n. 2
0
    def __init__(self, filename, save_path='data/', type='filtered', dense=False, remote=True, genecol=0):
        """Resolve the dataset location (remote URL or local dir) and load it.

        ``genecol`` selects which column of the gene annotation file to use as
        gene names during preprocessing.
        """
        self.remote = remote
        self.save_path = save_path
        self.genecol = genecol
        if not self.remote:
            local_dir = os.path.join(self.save_path, filename)
            try:
                assert os.path.isdir(local_dir)
            except AssertionError:
                # Re-raise after telling the user which file was missing.
                print("The file %s was not found in the location you gave" % filename)
                raise
            self.save_path = local_dir
        else:
            dataset_group = to_groups[filename]
            skeleton = group_to_url_skeleton[dataset_group]
            self.url = skeleton.format(dataset_group, filename, filename, type)
            self.save_path = os.path.join(save_path, '10X/%s/' % filename)
            self.save_name = '%s_gene_bc_matrices' % type
            self.download_name = self.save_name + '.tar.gz'

        self.dense = dense

        expression_data, gene_names = self.download_and_preprocess()
        attributes = GeneExpressionDataset.get_attributes_from_matrix(expression_data)
        super().__init__(*attributes, gene_names=gene_names)
Esempio n. 3
0
    def create_dataset(self, path):
        """Load an RDS SingleCellExperiment via rpy2 and build a GeneExpressionDataset.

        Reads log2(1 + x) counts from the R object, inverts the transform back
        to raw counts, maps factor-coded cell-type labels to their level names,
        filters low-quality cells, and wraps everything in a
        GeneExpressionDataset (gene symbols attached as an extra attribute).
        """
        print("Reading rds")
        ro.r("sce<-readRDS('%s')" % path)
        print("Extracting log counts")
        log_counts = ro.r("logcounts(sce)")
        print("Transforming log count to counts")
        # Invert log2(1 + x): exp(x * ln 2) == 2**x.  Cast with the builtin
        # `int`: `np.int` was a deprecated alias of `int` and was removed in
        # NumPy 1.24, so `astype(np.int)` crashes on modern NumPy; `astype(int)`
        # produces the identical dtype.
        counts = (np.exp(log_counts * np.log(2)) - 1).T.astype(int)
        gene_symbols = ro.r("rowData(sce)$feature_symbol")
        labels = ro.r("colData(sce)$cell_type1")
        labels_levels = ro.r("levels(colData(sce)$cell_type1)")
        if labels_levels is not rpy2.rinterface.NULL:
            # R factor codes are 1-based; map them back to their level names.
            labels = np.array([labels_levels[int(l) - 1] for l in labels])

        # Re-encode labels as indices into the sorted list of unique cell types.
        cell_types = list(np.unique(labels))
        labels = np.array([cell_types.index(l) for l in labels])

        valid_idx = (counts.sum(axis=1) >
                     10).ravel()  # Filter bad quality cells
        counts = counts[valid_idx]
        labels = labels[valid_idx]
        gene_expression_dataset = GeneExpressionDataset(
            *GeneExpressionDataset.get_attributes_from_matrix(counts,
                                                              labels=labels),
            cell_types=cell_types)
        gene_expression_dataset.gene_symbols = gene_symbols
        return gene_expression_dataset
Esempio n. 4
0
    def __init__(self, n_proteins=7):
        """Download and load the FACS dataset for a 2-, 5- or 7-protein panel."""
        supported_panels = (2, 5, 7)
        assert n_proteins in supported_panels, \
            "Only support: 2, 5 or 7 protein FACS dataset"

        self.n_proteins = int(n_proteins)
        expression_data = self.download_and_preprocess()
        attributes = GeneExpressionDataset.get_attributes_from_matrix(
            expression_data)
        super().__init__(*attributes)
Esempio n. 5
0
def training_score_scvi(train, **kwargs):
    """Fit scVI on `train` and return the Poisson log-likelihood of the
    training data under the imputed (estimated lambda) rates.

    Mirrors `generalization_score_scvi` but scores the training data itself.
    """
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE
    import torch
    data = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(train))
    vae = VAE(n_input=train.shape[1])
    m = UnsupervisedTrainer(vae, data, verbose=False)
    m.train(n_epochs=100)
    # Training permuted the data for minibatching. Unpermute before "imputing"
    # (estimating lambda).  Disable autograd while imputing, matching
    # generalization_score_scvi -- no gradients are needed at evaluation time
    # and this avoids building a throwaway computation graph.
    with torch.autograd.set_grad_enabled(False):
        lam = np.vstack([
            m.train_set.sequential().imputation(),
            m.test_set.sequential().imputation()
        ])
    return st.poisson(mu=lam).logpmf(train).sum()
Esempio n. 6
0
def generalization_score_scvi(train, test, **kwargs):
    """Fit scVI on `train`, then score `test` via `pois_llik` on the imputed rates."""
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE
    dataset = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(train))
    model = VAE(n_input=train.shape[1])
    trainer = UnsupervisedTrainer(model, dataset, verbose=False)
    trainer.train(n_epochs=100)
    # Training permuted the data for minibatching. Unpermute before "imputing"
    # (estimating lambda)
    with torch.autograd.set_grad_enabled(False):
        imputed_parts = [
            trainer.train_set.sequential().imputation(),
            trainer.test_set.sequential().imputation(),
        ]
        lam = np.vstack(imputed_parts)
        return pois_llik(lam, train, test)
Esempio n. 7
0
def assign_label(cellid, geneid, labels_map, count, cell_type, seurat):
    """Remap Seurat cluster labels via `labels_map` and reorder count rows to
    match the cell order in the Seurat table, returning a GeneExpressionDataset."""
    raw_labels = np.int64(np.asarray(seurat[1:, 4]))
    labels_new = deepcopy(raw_labels)
    # labels_map[i] is the new label for cells whose original label is i.
    for old_label, new_label in enumerate(labels_map):
        labels_new[raw_labels == old_label] = new_label
    # Index count rows by cell barcode, then stack them in Seurat's cell order.
    count_by_cell = dict(zip(cellid, count))
    reordered = [count_by_cell[barcode] for barcode in seurat[1:, 5]]
    new_count = sparse.vstack(reordered)
    return GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(new_count,
                                                          labels=labels_new),
        gene_names=geneid,
        cell_types=cell_type)
Esempio n. 8
0
    def __init__(self,
                 filename,
                 save_path='data/',
                 type='filtered',
                 dense=False):
        """Build the 10X download URL from the group table and load the matrix."""
        import os  # defensive local import; this file's import block is not visible here

        group = to_groups[filename]
        self.url = (
            "http://cf.10xgenomics.com/samples/cell-exp/%s/%s/%s_%s_gene_bc_matrices.tar.gz"
            % (group, filename, filename, type))
        # os.path.join instead of string concatenation: a `save_path` without a
        # trailing separator (e.g. "data") used to yield "data10X/...".
        self.save_path = os.path.join(save_path, '10X/%s/' % filename)
        self.save_name = '%s_gene_bc_matrices' % type
        self.dense = dense

        self.download_name = self.save_name + '.tar.gz'
        expression_data, gene_names = self.download_and_preprocess()
        super(Dataset10X, self).__init__(
            *GeneExpressionDataset.get_attributes_from_matrix(expression_data),
            gene_names=gene_names)
Esempio n. 9
0
                          sep=",",
                          index_col=0)["pop"].values

# Load batch assignments; the "x" column is decremented below, so it is
# presumably 1-based (R-style) -- confirm against the CSV producer.
batch_array = pd.read_csv(os.path.join(save_path, "DE.batchid.csv"),
                          sep=",",
                          index_col=0)["x"].values
batch_array -= 1
# Reshape to a (n_cells, 1) column vector as expected by the dataset ctor.
batch_array = batch_array[:, np.newaxis]
# The counts CSV is transposed after reading, so it stores genes as rows.
count_matrix = pd.read_csv(os.path.join(save_path, "DE.obsv.2.csv"),
                           sep=",",
                           index_col=0).T

gene_names = np.array(count_matrix.columns, dtype=str)

# First dataset: counts from observation file 2, then restricted to batch 0.
dataset1 = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(
        count_matrix.values, labels=label_array, batch_indices=batch_array),
    gene_names=gene_names,
    cell_types=np.unique(label_array))

dataset1.update_cells(batch_array.ravel() == 0)

# Second dataset: same labels/batch assignments, counts from observation file 4.
count_matrix = pd.read_csv(os.path.join(save_path, "DE.obsv.4.csv"),
                           sep=",",
                           index_col=0).T

dataset2 = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(
        count_matrix.values, labels=label_array, batch_indices=batch_array),
    gene_names=gene_names,
    cell_types=np.unique(label_array))
Esempio n. 10
0
 def __init__(self, filename, save_path='/data/scanorama/'):
     """Locate the SCANORAMA file, preprocess it, and load the expression matrix.

     Gene names are upper-cased to normalize symbol capitalization.
     """
     self.save_path = "%s%s" % (save_path, filename)
     count, gene_names = self.preprocess()
     attributes = GeneExpressionDataset.get_attributes_from_matrix(count)
     super(DatasetSCANORAMA, self).__init__(
         *attributes,
         gene_names=np.char.upper(gene_names))
Esempio n. 11
0
 def __init__(self):
     """Fetch and preprocess the expression matrix, then initialize the base dataset."""
     matrix = self.download_and_preprocess()
     attributes = GeneExpressionDataset.get_attributes_from_matrix(matrix)
     super().__init__(*attributes)
Esempio n. 12
0
# Run configuration flags for the scVI model below.
use_labels=False
use_cuda=False
reconstruction_loss="nb"


# Load raw counts plus cell- and gene-level metadata from feather files, and
# index the metadata frames by cell name / gene symbol respectively.
rawcounts = feather.read_dataframe(input_rawcounts)
meta = feather.read_dataframe(input_meta)
meta.index =  meta.loc[:,"cell_name"].values.astype(str)
var = feather.read_dataframe(input_var)
var.index = var.loc[:,"symbol"].values.astype(str)

# Bundle counts + metadata into an AnnData container.
annobj = anndata.AnnData(X=rawcounts)
annobj.obs = meta
annobj.var = var

X, local_mean, local_var, batch_indices, labels = GeneExpressionDataset.get_attributes_from_matrix(annobj.X)

geneExp = GeneExpressionDataset(X, local_mean, local_var, batch_indices, labels, gene_names=annobj.var.index)

# If a batch column name was supplied, factorize it and overwrite the batch
# assignment produced by get_attributes_from_matrix above.
if bool(batch_id) is not False:
    use_batches=True
    plates, plates_ids = pd.factorize(annobj.obs[batch_id])
    geneExp.batch_indices = plates.reshape(-1, 1)
    geneExp.n_batches = np.unique(plates.reshape(-1, 1)).size
else:
    use_batches = False
ldvae = LDVAE(geneExp.nb_genes, 
            n_batch=geneExp.n_batches * use_batches, 
            n_latent=latent,
            n_layers=layer,
Esempio n. 13
0
def imputation(infer,
               name,
               rate=0.1,
               n_samples=1,
               n_epochs=1,
               corruption="uniform"):
    """Imputation benchmark: corrupt a fraction of the data, retrain briefly,
    and collect (original, imputed) value pairs at the corrupted entries.

    Parameters
    ----------
    infer
        Inference object holding the model, gene_dataset and data_loaders.
        NOTE: this function mutates it -- it swaps in the corrupted dataset,
        registers 'corrupted_*' data loaders, and runs extra training epochs.
    name
        Key of the data-loader split to evaluate (paired with 'corrupted_<name>').
    rate
        Fraction of entries to corrupt (of the nonzero entries for "uniform",
        of all entries for "binomial").
    n_samples
        Number of samples per cell requested from get_sample_rate.
    n_epochs
        Number of training epochs run on the corrupted data.
    corruption
        "uniform" or "binomial" (see inline comments for the exact schemes).

    Returns
    -------
    (original_list, imputed_list)
        Per-cell arrays of original vs. imputed values at corrupted positions.
    """
    corrupted_data = copy.deepcopy(infer.gene_dataset.X)

    if corruption == "uniform":  # multiply the entry n with a Ber(0.9) random variable.
        # Only nonzero entries are candidates; ~rate of them are zeroed out.
        i, j = np.nonzero(corrupted_data)
        ix = np.random.choice(range(len(i)),
                              int(np.floor(rate * len(i))),
                              replace=False)
        i, j = i[ix], j[ix]
        corrupted_data[i, j] *= np.random.binomial(n=np.ones(len(ix),
                                                             dtype=np.int64),
                                                   p=0.9)
    elif corruption == "binomial":  # replace the entry n with a Bin(n, 0.2) random variable.
        # Any entry (including zeros) is a candidate here.
        i, j = (k.ravel() for k in np.indices(corrupted_data.shape))
        ix = np.random.choice(range(len(i)),
                              int(np.floor(rate * len(i))),
                              replace=False)
        i, j = i[ix], j[ix]
        corrupted_data[i, j] = np.random.binomial(
            n=corrupted_data[i, j].astype(np.int64), p=0.2)

    # Replace the dataset with its corrupted copy (keeping batch/label info).
    infer.gene_dataset = gene_dataset = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(
            corrupted_data,
            batch_indices=infer.gene_dataset.batch_indices,
            labels=infer.gene_dataset.labels))

    # Redirect the training loop to the corrupted loaders, register a
    # 'corrupted_<key>' loader per existing split (reusing each split's
    # sampler so cell order stays aligned with the originals), train, then
    # restore the original loop.
    original_data_loaders_loop = infer.data_loaders.loop
    infer.data_loaders.loop = [
        'corrupted_%s' % s for s in infer.data_loaders.loop
    ]
    original_keys = list(infer.data_loaders.dict.keys())
    for key in original_keys:
        kwargs = copy.copy(infer.data_loaders.kwargs)
        kwargs['collate_fn'] = gene_dataset.collate_fn
        kwargs['sampler'] = copy.copy(infer.data_loaders[key].sampler)
        infer.data_loaders['corrupted_%s' % key] = DataLoaderWrapper(
            gene_dataset, use_cuda=infer.use_cuda, **kwargs)

    infer.train(n_epochs=n_epochs)
    infer.data_loaders.loop = original_data_loaders_loop

    original_list = []
    imputed_list = []
    # Shrink the batch size so that n_samples copies still fit one batch.
    batch_size = infer.data_loaders.kwargs["batch_size"] // n_samples
    # Walk the original and corrupted splits in lockstep (same samplers,
    # sequential order, so row k refers to the same cell in both).
    for tensors, corrupted_tensors in \
        zip(infer.data_loaders[name].sequential(batch_size=batch_size),
            infer.data_loaders['corrupted_%s' % name].sequential(batch_size=batch_size)):
        batch = tensors[0]
        actual_batch_size = batch.size(0)
        dropout_batch, _, _, batch_index, labels = corrupted_tensors
        px_rate = infer.model.get_sample_rate(dropout_batch,
                                              batch_index=batch_index,
                                              y=labels,
                                              n_samples=n_samples)

        # Entries where original and corrupted differ are the corrupted ones.
        indices_dropout = torch.nonzero(batch - dropout_batch)
        i = indices_dropout[:, 0]
        j = indices_dropout[:, 1]

        # Broadcast the original batch across the n_samples axis to align
        # with px_rate's leading sample dimension.
        batch = batch.unsqueeze(0).expand(
            (n_samples, batch.size(0), batch.size(1)))
        original = np.array(batch[:, i, j].view(-1).cpu())
        imputed = np.array(px_rate[:, i, j].view(-1).cpu())

        cells_index = np.tile(np.array(i.cpu()), n_samples)

        # Group the flattened values back per cell of this batch.
        original_list += [
            original[cells_index == i] for i in range(actual_batch_size)
        ]
        imputed_list += [
            imputed[cells_index == i] for i in range(actual_batch_size)
        ]

    return original_list, imputed_list