Example #1
def test_anndata_loader():
    x = np.random.randint(low=0, high=100, size=(15, 4))
    batch_ids = np.random.randint(low=0, high=2, size=(15, ))
    n_batches = 2
    adata = AnnData(X=x, obs=dict(batch=batch_ids))
    _ = AnnDatasetFromAnnData(adata, batch_label="batch")
    dataset = AnnDatasetFromAnnData(adata, batch_label="batch")
    assert (dataset.n_batches == n_batches
            ), "AnnDatasetFromAnnData should not modify the anndata object"
Example #2
    def test_sparse_data(self):
        data = np.random.poisson(0.2, size=(25, 10))

        sparse_mat = sp_sparse.csr_matrix(data)
        ad = anndata.AnnData(sparse_mat)
        AnnDatasetFromAnnData(ad)

        sparse_mat = sp_sparse.csc_matrix(data)
        ad = anndata.AnnData(sparse_mat)
        AnnDatasetFromAnnData(ad)
Example #3
    def train(self,
              adata,
              condition_key,
              cell_type_key,
              n_epochs=300,
              patience=30,
              lr_reducer=20):
        le = LabelEncoder()
        adata.obs['labels'] = le.fit_transform(adata.obs[cell_type_key].values)
        adata.obs['batch_indices'] = le.fit_transform(
            adata.obs[condition_key].values)

        net_adata = AnnDatasetFromAnnData(adata)

        early_stopping_kwargs = {
            "early_stopping_metric": "elbo",
            "save_best_state_metric": "elbo",
            "patience": patience,
            "threshold": 0,
            "reduce_lr_on_plateau": True,
            "lr_patience": lr_reducer,
            "lr_factor": 0.1,
        }

        self.trainer = UnsupervisedTrainer(
            self.model,
            net_adata,
            train_size=0.8,
            use_cuda=True,
            frequency=1,
            early_stopping_kwargs=early_stopping_kwargs,
        )

        self.trainer.train(n_epochs=n_epochs, lr=0.001)
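A hedged usage sketch for the train() method above; `model` stands for an instance of the (unnamed) wrapper class that defines it, and the obs column names are illustrative:

# hypothetical call; assumes adata.obs carries 'condition' and 'cell_type' columns
model.train(adata, condition_key='condition', cell_type_key='cell_type',
            n_epochs=300, patience=30, lr_reducer=20)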
Example #4
    def run(self):
        n_epochs = 100
        n_latent = 10
        n_hidden = 128
        n_layers = 2
        net_data = self.data.copy()
        net_data.X = self.data.layers['counts']
        del net_data.layers['counts']
        net_data.raw = None  # Ensure that the raw counts are not accidentally used

        # Define batch indices
        le = LabelEncoder()
        net_data.obs['batch_indices'] = le.fit_transform(
            net_data.obs[self.batch].values)
        net_data = AnnDatasetFromAnnData(net_data)
        vae = VAE(net_data.nb_genes,
                  reconstruction_loss='nb',
                  n_batch=net_data.n_batches,
                  n_layers=n_layers,
                  n_latent=n_latent,
                  n_hidden=n_hidden)
        trainer = UnsupervisedTrainer(vae,
                                      net_data,
                                      train_size=1,
                                      use_cuda=False)
        trainer.train(n_epochs=n_epochs, lr=1e-3)
        full = trainer.create_posterior(trainer.model,
                                        net_data,
                                        indices=np.arange(len(net_data)))
        latent, _, _ = full.sequential().get_latent()
        self.data.obsm['X_emb'] = latent
        self.dump_to_h5ad("scvi")
Example #5
    def predict(self,
                adata,
                cell_type_to_predict,
                condition_key,
                cell_type_key,
                target_condition,
                source_condition,
                n_generated_samples=50):
        cell_type_adata = adata.copy()[adata.obs[cell_type_key] ==
                                       cell_type_to_predict]

        real_adata = cell_type_adata[cell_type_adata.obs[condition_key] ==
                                     target_condition]
        ctrl_adata = cell_type_adata[cell_type_adata.obs[condition_key] ==
                                     source_condition]

        le = LabelEncoder()
        le.fit([source_condition, target_condition])
        real_adata.obs['batch_indices'] = le.transform(
            real_adata.obs[condition_key].values)
        ctrl_adata.obs['batch_indices'] = le.transform([target_condition] *
                                                       ctrl_adata.shape[0])

        ctrl_adata = AnnDatasetFromAnnData(ctrl_adata)

        posterior = self.trainer.create_posterior(self.trainer.model,
                                                  ctrl_adata,
                                                  indices=np.arange(
                                                      len(ctrl_adata)))

        generated_samples, _ = posterior.sequential().generate(
            n_generated_samples)

        reconstructed = generated_samples.mean(axis=2)
        reconstructed_adata = sc.AnnData(X=reconstructed)
        reconstructed_adata.obs = ctrl_adata.obs.copy(deep=True)
        reconstructed_adata.obs[condition_key].replace(
            source_condition,
            f'{cell_type_to_predict}_pred_{target_condition}',
            inplace=True)
        reconstructed_adata.var_names = cell_type_adata.var_names

        pred_adata = reconstructed_adata[
            reconstructed_adata.obs[condition_key] ==
            f'{cell_type_to_predict}_pred_{target_condition}']

        sc.pp.normalize_per_cell(pred_adata)
        sc.pp.log1p(pred_adata)
        return pred_adata
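Similarly, a hedged call sketch for the predict() method above (all condition and cell-type names are placeholders):

# hypothetical call; generates counterfactual expression for control cells of one cell type
pred_adata = model.predict(adata,
                           cell_type_to_predict='CD4 T cells',
                           condition_key='condition',
                           cell_type_key='cell_type',
                           target_condition='stimulated',
                           source_condition='control',
                           n_generated_samples=50)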
Example #6
 def test_data_loader(self):
     data = np.ones((25, 10)) * 100
     paired = np.ones((25, 4)) * np.arange(0, 4)
     pair_names = ["gabou", "achille", "pedro", "oclivio"]
     y = CellMeasurement(name="dev",
                         data=paired,
                         columns_attr_name="dev_names",
                         columns=pair_names)
     dataset = GeneExpressionDataset()
     dataset.populate_from_data(data, Ys=[y])
     ad = dataset.to_anndata()
     dataset_ad = AnnDatasetFromAnnData(
         ad, cell_measurements_col_mappings={"dev": "dev_names"})
     self.assertTrue((paired == dataset_ad.dev).all())
     self.assertTrue((dataset.X == dataset_ad.X).all())
     self.assertTrue((dataset.cell_types == dataset_ad.cell_types).all())
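The cell_measurements_col_mappings argument used above maps a paired measurement to the attribute that holds its column names. A minimal sketch of loading such a measurement straight from an AnnData object, assuming (as Examples #12 and #14 below suggest) that the data sits in adata.obsm and the column names in adata.uns:

import numpy as np
import anndata
from scvi.dataset import AnnDatasetFromAnnData

# toy data: 25 cells x 10 genes plus a paired 4-column measurement named "dev"
adata = anndata.AnnData(np.random.poisson(1.0, size=(25, 10)))
adata.obsm["dev"] = np.ones((25, 4), dtype=np.float32)
adata.uns["dev_names"] = ["gabou", "achille", "pedro", "oclivio"]

dataset = AnnDatasetFromAnnData(adata, cell_measurements_col_mappings={"dev": "dev_names"})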
Example #7
    def to_mmd_layer(self, adata, condition_key, cell_type_key):
        le = LabelEncoder()
        adata.obs['labels'] = le.fit_transform(adata.obs[cell_type_key].values)
        adata.obs['batch_indices'] = le.fit_transform(
            adata.obs[condition_key].values)

        net_adata = AnnDatasetFromAnnData(adata)

        posterior = self.trainer.create_posterior(self.trainer.model,
                                                  net_adata,
                                                  indices=np.arange(
                                                      len(net_adata)))

        latent, _, __ = posterior.sequential().get_latent()

        latent_adata = sc.AnnData(X=latent)
        latent_adata.obs = adata.obs.copy(deep=True)
        return latent_adata
Example #8
def correct_scvi(Xs, genes):
    import torch
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    from scvi.dataset import AnnDatasetFromAnnData
    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    all_ann = [AnnDatasetFromAnnData(AnnData(X, var=genes)) for X in Xs]

    all_dataset = GeneExpressionDataset()
    all_dataset.populate_from_datasets(all_ann)

    vae = VAE(all_dataset.nb_genes,
              n_batch=all_dataset.n_batches,
              n_labels=all_dataset.n_labels,
              n_hidden=128,
              n_latent=30,
              n_layers=2,
              dispersion='gene')
    trainer = UnsupervisedTrainer(
        vae,
        all_dataset,
        train_size=1.,
        use_cuda=True,
    )
    n_epochs = 100
    #trainer.train(n_epochs=n_epochs)
    #torch.save(trainer.model.state_dict(),
    #           'data/harmonization.vae.pkl')
    trainer.model.load_state_dict(torch.load('data/harmonization.vae.pkl'))
    trainer.model.eval()

    full = trainer.create_posterior(trainer.model,
                                    all_dataset,
                                    indices=np.arange(len(all_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    return latent
Example #9
 def test_use_raw_flag(self):
     raw_data = np.random.randint(1, 5, size=(4, 7))
     ad = anndata.AnnData(raw_data)
     ad.raw = ad.copy()
     dataset = AnnDatasetFromAnnData(ad, use_raw=True)
     np.testing.assert_array_equal(dataset.X, raw_data)
Example #10
 def test_train_one(self):
     data = np.random.randint(1, 5, size=(4, 7))
     ad = anndata.AnnData(data)
     dataset = AnnDatasetFromAnnData(ad)
     unsupervised_training_one_epoch(dataset)
Example #11
 def test_init(self):
     data = np.random.randint(1, 5, size=(3, 7))
     ad = anndata.AnnData(data)
     dataset = AnnDatasetFromAnnData(ad)
     self.assertEqual(3, dataset.nb_cells)
     self.assertEqual(7, dataset.nb_genes)
Example #12
adatas = []
for b in np.unique(anndataset_111.obs["batch_indices"]):
    adatas.append(anndataset_111[anndataset_111.obs["batch_indices"] == b, :].copy())
    adatas[-1].obs["batch_indices"] *= 0
for b in np.unique(anndataset_206.obs["batch_indices"]):
    adatas.append(anndataset_206[anndataset_206.obs["batch_indices"] == b, :].copy())
    adatas[-1].obs["batch_indices"] *= 0

names = ["111_d1", "111_d2", "206_d1", "206_d2"]

# Iterate over datasets
for n, adata in zip(names, adatas):
    hvg = adata.var["hvg_encode"]

    dataset = AnnDatasetFromAnnData(ad=adata[:, hvg])
    protein_data = CellMeasurement(
        name="protein_expression",
        data=adata.obsm["protein_expression"].astype(np.float32),
        columns_attr_name="protein_names",
        columns=adata.uns["protein_names"],
    )
    dataset.initialize_cell_measurement(protein_data)
    dataset.gene_names = adata[:, hvg].var_names.values
    
    set_seed(0)

    model = TOTALVI(dataset.nb_genes, dataset.protein_expression.shape[1], n_latent=20,)
    use_cuda = True
    lr = 4e-3
    early_stopping_kwargs = {
Example #13
def main():
    usage = 'solo'
    parser = ArgumentParser(usage,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(dest='model_json_file',
                        help='json file to pass VAE parameters')
    parser.add_argument(
        dest='data_path',
        help=
        'path to h5ad, loom or 10x directory containing cell by genes counts')
    parser.add_argument('-d',
                        dest='doublet_depth',
                        default=2.,
                        type=float,
                        help='Depth multiplier for a doublet relative to the \
                        average of its constituents')
    parser.add_argument('-g',
                        dest='gpu',
                        default=True,
                        action='store_true',
                        help='Run on GPU')
    parser.add_argument('-a',
                        dest='anndata_output',
                        default=False,
                        action='store_true',
                        help='output a modified anndata object with solo scores. \
                        Only works for anndata (h5ad) input')
    parser.add_argument('-o', dest='out_dir', default='solo_out')
    parser.add_argument('-r',
                        dest='doublet_ratio',
                        default=2.,
                        type=float,
                        help='Ratio of doublets to true \
                        cells')
    parser.add_argument('-s',
                        dest='seed',
                        default=None,
                        help='Path to a previous solo output \
                        directory. Seeds the VAE model with a previously \
                        trained solo model. The directory structure is assumed \
                        to be the same as the solo output directory structure. It \
                        should at least contain vae.pt (a pickled object of \
                        VAE weights) and latent.npy (an np.ndarray of the \
                        latents of your cells).')
    parser.add_argument('-k',
                        dest='known_doublets',
                        help='Experimentally defined doublets tsv file. \
                        Should be a single column of True/False. True \
                        indicates the cell is a doublet. No header.',
                        type=str)
    parser.add_argument('-t',
                        dest='doublet_type',
                        help='Type of doublet to simulate: \
                        multinomial, average, or sum',
                        default='multinomial',
                        choices=['multinomial', 'average', 'sum'])
    parser.add_argument('-e',
                        dest='expected_number_of_doublets',
                        help='Experimentally expected number of doublets',
                        type=int,
                        default=None)
    parser.add_argument('-p',
                        dest='plot',
                        default=False,
                        action='store_true',
                        help='Plot outputs for solo')
    parser.add_argument('-l',
                        dest='normal_logging',
                        default=False,
                        action='store_true',
                        help='Logging level set to normal (aka not debug)')
    parser.add_argument('--random_size',
                        dest='randomize_doublet_size',
                        default=False,
                        action='store_true',
                        help='Sample depth multipliers from Unif(1, \
                        DoubletDepth) \
                        to provide a diversity of possible doublet depths.')
    args = parser.parse_args()

    if not args.normal_logging:
        scvi._settings.set_verbosity(10)

    model_json_file = args.model_json_file
    data_path = args.data_path
    if args.gpu and not torch.cuda.is_available():
        args.gpu = False
        print('CUDA is not available, switching to CPU!')

    if not os.path.isdir(args.out_dir):
        os.mkdir(args.out_dir)

    ##################################################
    # data

    # read loom/anndata
    data_ext = os.path.splitext(data_path)[-1]
    if data_ext == '.loom':
        scvi_data = LoomDataset(data_path)
    elif data_ext == '.h5ad':
        adata = anndata.read(data_path)
        if issparse(adata.X):
            adata.X = adata.X.todense()
        scvi_data = AnnDatasetFromAnnData(adata)
    elif os.path.isdir(data_path):
        scvi_data = Dataset10X(save_path=data_path,
                               measurement_names_column=1,
                               dense=True)
        cell_umi_depth = scvi_data.X.sum(axis=1)
        fifth, ninetyfifth = np.percentile(cell_umi_depth, [5, 95])
        min_cell_umi_depth = np.min(cell_umi_depth)
        max_cell_umi_depth = np.max(cell_umi_depth)
        if fifth * 10 < ninetyfifth:
            print("""WARNING YOUR DATA HAS A WIDE RANGE OF CELL DEPTHS.
            PLEASE MANUALLY REVIEW YOUR DATA""")
        print(
            f"Min cell depth: {min_cell_umi_depth}, Max cell depth: {max_cell_umi_depth}"
        )
    else:
        msg = f'{data_path} is not a recognized format.\n'
        msg += 'must be one of {h5ad, loom, 10x directory}'
        raise TypeError(msg)

    num_cells, num_genes = scvi_data.X.shape

    if args.known_doublets is not None:
        print('Removing known doublets for in silico doublet generation')
        print('Make sure known doublets are in the same order as your data')
        known_doublets = np.loadtxt(args.known_doublets, dtype=str) == 'True'

        assert len(known_doublets) == scvi_data.X.shape[0]
        known_doublet_data = make_gene_expression_dataset(
            scvi_data.X[known_doublets], scvi_data.gene_names)
        known_doublet_data.labels = np.ones(known_doublet_data.X.shape[0])
        singlet_scvi_data = make_gene_expression_dataset(
            scvi_data.X[~known_doublets], scvi_data.gene_names)
        singlet_num_cells, _ = singlet_scvi_data.X.shape
    else:
        known_doublet_data = None
        singlet_num_cells = num_cells
        known_doublets = np.zeros(num_cells, dtype=bool)
        singlet_scvi_data = scvi_data
    singlet_scvi_data.labels = np.zeros(singlet_scvi_data.X.shape[0])
    scvi_data.labels = known_doublets.astype(int)
    ##################################################
    # parameters

    # check for parameters
    if not os.path.exists(model_json_file):
        raise FileNotFoundError(f'{model_json_file} does not exist.')
    # read parameters
    with open(model_json_file, 'r') as model_json_open:
        params = json.load(model_json_open)

    # set VAE params
    vae_params = {}
    for par in [
            'n_hidden', 'n_latent', 'n_layers', 'dropout_rate', 'ignore_batch'
    ]:
        if par in params:
            vae_params[par] = params[par]
    vae_params['n_batch'] = 0 if params.get('ignore_batch',
                                            False) else scvi_data.n_batches

    # training parameters
    batch_size = params.get('batch_size', 128)
    valid_pct = params.get('valid_pct', 0.1)
    learning_rate = params.get('learning_rate', 1e-3)
    stopping_params = {'patience': params.get('patience', 10), 'threshold': 0}

    # protect against single example batch
    while num_cells % batch_size == 1:
        batch_size = int(np.round(1.25 * batch_size))
        print('Increasing batch_size to %d to avoid single example batch.' %
              batch_size)

    ##################################################
    # VAE

    vae = VAE(n_input=singlet_scvi_data.nb_genes,
              n_labels=2,
              reconstruction_loss='nb',
              log_variational=True,
              **vae_params)

    if args.seed:
        if args.gpu:
            device = torch.device('cuda')
            vae.load_state_dict(torch.load(os.path.join(args.seed, 'vae.pt')))
            vae.to(device)
        else:
            map_loc = 'cpu'
            vae.load_state_dict(
                torch.load(os.path.join(args.seed, 'vae.pt'),
                           map_location=map_loc))

        # save latent representation
        utrainer = \
            UnsupervisedTrainer(vae, singlet_scvi_data,
                                train_size=(1. - valid_pct),
                                frequency=2,
                                metrics_to_monitor=['reconstruction_error'],
                                use_cuda=args.gpu,
                                early_stopping_kwargs=stopping_params,
                                batch_size=batch_size)

        full_posterior = utrainer.create_posterior(utrainer.model,
                                                   singlet_scvi_data,
                                                   indices=np.arange(
                                                       len(singlet_scvi_data)))
        latent, _, _ = full_posterior.sequential(batch_size).get_latent()
        np.save(os.path.join(args.out_dir, 'latent.npy'),
                latent.astype('float32'))

    else:
        stopping_params['early_stopping_metric'] = 'reconstruction_error'
        stopping_params['save_best_state_metric'] = 'reconstruction_error'

        # initialize unsupervised trainer
        utrainer = \
            UnsupervisedTrainer(vae, singlet_scvi_data,
                                train_size=(1. - valid_pct),
                                frequency=2,
                                metrics_to_monitor=['reconstruction_error'],
                                use_cuda=args.gpu,
                                early_stopping_kwargs=stopping_params,
                                batch_size=batch_size)
        utrainer.history['reconstruction_error_test_set'].append(0)
        # initial epoch
        utrainer.train(n_epochs=2000, lr=learning_rate)

        # drop learning rate and continue
        utrainer.early_stopping.wait = 0
        utrainer.train(n_epochs=500, lr=0.5 * learning_rate)

        # save VAE
        torch.save(vae.state_dict(), os.path.join(args.out_dir, 'vae.pt'))

        # save latent representation
        full_posterior = utrainer.create_posterior(utrainer.model,
                                                   singlet_scvi_data,
                                                   indices=np.arange(
                                                       len(singlet_scvi_data)))
        latent, _, _ = full_posterior.sequential(batch_size).get_latent()
        np.save(os.path.join(args.out_dir, 'latent.npy'),
                latent.astype('float32'))

    ##################################################
    # simulate doublets

    non_zero_indexes = np.where(singlet_scvi_data.X > 0)
    cells = non_zero_indexes[0]
    genes = non_zero_indexes[1]
    cells_ids = defaultdict(list)
    for cell_id, gene in zip(cells, genes):
        cells_ids[cell_id].append(gene)

    # choose doublets function type
    if args.doublet_type == 'average':
        doublet_function = create_average_doublet
    elif args.doublet_type == 'sum':
        doublet_function = create_summed_doublet
    else:
        doublet_function = create_multinomial_doublet

    cell_depths = singlet_scvi_data.X.sum(axis=1)
    num_doublets = int(args.doublet_ratio * singlet_num_cells)
    if known_doublet_data is not None:
        num_doublets -= known_doublet_data.X.shape[0]
        # make sure we are making a non-negative number of doublets
        assert num_doublets >= 0

    in_silico_doublets = np.zeros((num_doublets, num_genes), dtype='float32')
    # for desired # doublets
    for di in range(num_doublets):
        # sample two cells
        i, j = np.random.choice(singlet_num_cells, size=2)

        # generate doublets
        in_silico_doublets[di, :] = \
            doublet_function(singlet_scvi_data.X, i, j,
                             doublet_depth=args.doublet_depth,
                             cell_depths=cell_depths, cells_ids=cells_ids,
                             randomize_doublet_size=args.randomize_doublet_size)

    # merge datasets
    # we could maybe upsample the known doublets
    # concatenate
    classifier_data = GeneExpressionDataset()
    classifier_data.populate_from_data(
        X=np.vstack([scvi_data.X, in_silico_doublets]),
        labels=np.hstack(
            [np.ravel(scvi_data.labels),
             np.ones(in_silico_doublets.shape[0])]),
        remap_attributes=False)

    assert (len(np.unique(classifier_data.labels.flatten())) == 2)

    ##################################################
    # classifier

    # model
    classifier = Classifier(n_input=(vae.n_latent + 1),
                            n_hidden=params['cl_hidden'],
                            n_layers=params['cl_layers'],
                            n_labels=2,
                            dropout_rate=params['dropout_rate'])

    # trainer
    stopping_params['early_stopping_metric'] = 'accuracy'
    stopping_params['save_best_state_metric'] = 'accuracy'
    strainer = ClassifierTrainer(classifier,
                                 classifier_data,
                                 train_size=(1. - valid_pct),
                                 frequency=2,
                                 metrics_to_monitor=['accuracy'],
                                 use_cuda=args.gpu,
                                 sampling_model=vae,
                                 sampling_zl=True,
                                 early_stopping_kwargs=stopping_params,
                                 batch_size=batch_size)

    # initial
    strainer.train(n_epochs=1000, lr=learning_rate)

    # drop learning rate and continue
    strainer.early_stopping.wait = 0
    strainer.train(n_epochs=300, lr=0.1 * learning_rate)
    torch.save(classifier.state_dict(),
               os.path.join(args.out_dir, 'classifier.pt'))

    ##################################################
    # post-processing
    # use logits for predictions for better results
    logits_classifier = Classifier(n_input=(vae.n_latent + 1),
                                   n_hidden=params['cl_hidden'],
                                   n_layers=params['cl_layers'],
                                   n_labels=2,
                                   dropout_rate=params['dropout_rate'],
                                   logits=True)
    logits_classifier.load_state_dict(classifier.state_dict())

    # using logits leads to better performance for ranking
    logits_strainer = ClassifierTrainer(logits_classifier,
                                        classifier_data,
                                        train_size=(1. - valid_pct),
                                        frequency=2,
                                        metrics_to_monitor=['accuracy'],
                                        use_cuda=args.gpu,
                                        sampling_model=vae,
                                        sampling_zl=True,
                                        early_stopping_kwargs=stopping_params,
                                        batch_size=batch_size)

    # models evaluation mode
    vae.eval()
    classifier.eval()
    logits_classifier.eval()

    print('Train accuracy: %.4f' % strainer.train_set.accuracy())
    print('Test accuracy:  %.4f' % strainer.test_set.accuracy())

    # compute predictions manually
    # output logits
    train_y, train_score = strainer.train_set.compute_predictions(soft=True)
    test_y, test_score = strainer.test_set.compute_predictions(soft=True)
    # train_y == true label
    # train_score[:, 0] == singlet score; train_score[:, 1] == doublet score
    train_score = train_score[:, 1]
    train_y = train_y.astype('bool')
    test_score = test_score[:, 1]
    test_y = test_y.astype('bool')

    train_auroc = roc_auc_score(train_y, train_score)
    test_auroc = roc_auc_score(test_y, test_score)

    print('Train AUROC: %.4f' % train_auroc)
    print('Test AUROC:  %.4f' % test_auroc)

    train_fpr, train_tpr, train_t = roc_curve(train_y, train_score)
    test_fpr, test_tpr, test_t = roc_curve(test_y, test_score)
    train_t = np.minimum(train_t, 1 + 1e-9)
    test_t = np.minimum(test_t, 1 + 1e-9)

    train_acc = np.zeros(len(train_t))
    for i in range(len(train_t)):
        train_acc[i] = np.mean(train_y == (train_score > train_t[i]))
    test_acc = np.zeros(len(test_t))
    for i in range(len(test_t)):
        test_acc[i] = np.mean(test_y == (test_score > test_t[i]))

    # write predictions
    # softmax predictions
    order_y, order_score = strainer.compute_predictions(soft=True)
    _, order_pred = strainer.compute_predictions()
    doublet_score = order_score[:, 1]
    np.save(os.path.join(args.out_dir, 'no_updates_softmax_scores.npy'),
            doublet_score[:num_cells])
    np.save(os.path.join(args.out_dir, 'no_updates_softmax_scores_sim.npy'),
            doublet_score[num_cells:])

    # logit predictions
    logit_y, logit_score = logits_strainer.compute_predictions(soft=True)
    logit_doublet_score = logit_score[:, 1]
    np.save(os.path.join(args.out_dir, 'logit_scores.npy'),
            logit_doublet_score[:num_cells])
    np.save(os.path.join(args.out_dir, 'logit_scores_sim.npy'),
            logit_doublet_score[num_cells:])

    # update threshold as a function of Solo's estimate of the number of
    # doublets
    # essentially a log odds update
    # TODO put in a function
    diff = np.inf
    counter_update = 0
    solo_scores = doublet_score[:num_cells]
    logit_scores = logit_doublet_score[:num_cells]
    d_s = (args.doublet_ratio / (args.doublet_ratio + 1))
    while (diff > .01) | (counter_update < 5):

        # calculate log odds calibration for logits
        d_o = np.mean(solo_scores)
        c = np.log(d_o / (1 - d_o)) - np.log(d_s / (1 - d_s))

        # update solo scores
        solo_scores = 1 / (1 + np.exp(-(logit_scores + c)))

        # update while conditions
        diff = np.abs(d_o - np.mean(solo_scores))
        counter_update += 1

    np.save(os.path.join(args.out_dir, 'softmax_scores.npy'), solo_scores)

    if args.expected_number_of_doublets is not None:
        k = len(solo_scores) - args.expected_number_of_doublets
        if args.expected_number_of_doublets / len(solo_scores) > .5:
            print('''Make sure you actually expect more than half your cells
                   to be doublets. If not, change your
                   -e parameter value''')
        assert k > 0
        idx = np.argpartition(solo_scores, k)
        threshold = np.max(solo_scores[idx[:k]])
        is_solo_doublet = solo_scores > threshold
    else:
        is_solo_doublet = solo_scores > .5

    is_doublet = known_doublets
    new_doublets_idx = np.where(~(is_doublet) & is_solo_doublet[:num_cells])[0]
    is_doublet[new_doublets_idx] = True

    np.save(os.path.join(args.out_dir, 'is_doublet.npy'),
            is_doublet[:num_cells])
    np.save(os.path.join(args.out_dir, 'is_doublet_sim.npy'),
            is_doublet[num_cells:])

    np.save(os.path.join(args.out_dir, 'preds.npy'), order_pred[:num_cells])
    np.save(os.path.join(args.out_dir, 'preds_sim.npy'),
            order_pred[num_cells:])

    smoothed_preds = knn_smooth_pred_class(X=latent,
                                           pred_class=is_doublet[:num_cells])
    np.save(os.path.join(args.out_dir, 'smoothed_preds.npy'), smoothed_preds)

    if args.anndata_output and data_ext == '.h5ad':
        adata.obs['is_doublet'] = is_doublet[:num_cells]
        adata.obs['logit_scores'] = logit_doublet_score[:num_cells]
        adata.obs['softmax_scores'] = doublet_score[:num_cells]
        adata.write(os.path.join(args.out_dir, "soloed.h5ad"))

    if args.plot:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import seaborn as sns
        # plot ROC
        plt.figure()
        plt.plot(train_fpr, train_tpr, label='Train')
        plt.plot(test_fpr, test_tpr, label='Test')
        plt.gca().set_xlabel('False positive rate')
        plt.gca().set_ylabel('True positive rate')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'roc.pdf'))
        plt.close()

        # plot accuracy
        plt.figure()
        plt.plot(train_t, train_acc, label='Train')
        plt.plot(test_t, test_acc, label='Test')
        plt.axvline(0.5, color='black', linestyle='--')
        plt.gca().set_xlabel('Threshold')
        plt.gca().set_ylabel('Accuracy')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'accuracy.pdf'))
        plt.close()

        # plot distributions
        plt.figure()
        sns.distplot(test_score[test_y], label='Simulated')
        sns.distplot(test_score[~test_y], label='Observed')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'train_v_test_dist.pdf'))
        plt.close()

        plt.figure()
        sns.distplot(doublet_score[:num_cells], label='Observed')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'real_cells_dist.pdf'))
        plt.close()

        scvi_umap = umap.UMAP(n_neighbors=16).fit_transform(latent)
        fig, ax = plt.subplots(1, 1, figsize=(10, 10))
        ax.scatter(scvi_umap[:, 0],
                   scvi_umap[:, 1],
                   c=doublet_score[:num_cells],
                   s=8,
                   cmap="GnBu")

        ax.set_xlabel("UMAP 1")
        ax.set_ylabel("UMAP 2")
        ax.set_xticks([], [])
        ax.set_yticks([], [])
        fig.savefig(os.path.join(args.out_dir, 'umap_solo_scores.pdf'))
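For orientation, a hedged sketch of driving the entry point above from Python; the file names are placeholders, and the call is equivalent to running `solo -o solo_out -p model_params.json counts.h5ad` on the command line:

import sys

# hypothetical arguments: a JSON file of VAE hyperparameters and an h5ad of raw counts
sys.argv = ['solo', '-o', 'solo_out', '-p', 'model_params.json', 'counts.h5ad']
main()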
Example #14
def load_posterior(dir_path: str,
                   model: nn.Module,
                   use_cuda: Optional[Union[bool, str]] = "auto",
                   **posterior_kwargs):
    """Function to use in order to retrieve a posterior that was saved using the ``save_posterior`` method

    Because of pytorch model loading usage, this function needs a scVI model object initialized with exact same parameters
    that during training.
    Because saved posteriors correspond to already trained models, data is loaded sequentially using a ``SequentialSampler``.

    Parameters
    ----------
    dir_path
        directory containing the posterior properties to be retrieved.
    model
        scVI initialized model.
    use_cuda
        Specifies whether the computations should be performed on a GPU.
        Default: ``"auto"``.
        If ``"auto"``, CUDA availability is inferred, with a preference for loading on GPU.
        If ``False``, the model is loaded on the CPU, even if it was trained on a GPU.
    **posterior_kwargs
        additional parameters to feed to the posterior constructor.

    Returns
    -------
    The loaded posterior object.

    Examples
    --------
    >>> model = VAE(nb_genes, n_batches, n_hidden=128, n_latent=10)
    >>> trainer = UnsupervisedTrainer(model, dataset, train_size=0.5, use_cuda=use_cuda)
    >>> trainer.train(n_epochs=200)
    >>> trainer.train_set.save_posterior("./my_run_train_posterior")

    >>> model = VAE(nb_genes, n_batches, n_hidden=128, n_latent=10)
    >>> post = load_posterior("./my_run_train_posterior", model=model)
    """
    # Avoid circular imports
    from scvi.inference.total_inference import TotalPosterior
    from scvi.inference.jvae_trainer import JPosterior
    from scvi.inference.posterior import Posterior
    from scvi.inference.annotation import AnnotationPosterior

    post_type_path = os.path.join(dir_path, "posterior_type.txt")
    dataset_path = os.path.join(dir_path, "anndata_dataset.h5ad")
    model_path = os.path.join(dir_path, "model_params.pt")
    indices_path = os.path.join(dir_path, "indices.npy")
    data_loader_kwargs_path = os.path.join(dir_path, "data_loader_kwargs.h5")

    # Inferring posterior type
    with open(post_type_path, "r") as post_file:
        post_class_str = post_file.readline()
    str_to_classes = dict(
        TotalPosterior=TotalPosterior,
        JPosterior=JPosterior,
        Posterior=Posterior,
        AnnotationPosterior=AnnotationPosterior,
    )
    if post_class_str not in str_to_classes:
        raise ValueError("Posterior type {} not eligible for loading".format(
            post_class_str))
    post_class = str_to_classes[post_class_str]

    # Loading dataset and associated measurements
    ad = anndata.read_h5ad(filename=dataset_path)
    key = "cell_measurements_col_mappings"
    if key in ad.uns:
        cell_measurements_col_mappings = ad.uns[key]
    else:
        cell_measurements_col_mappings = dict()
    dataset = AnnDatasetFromAnnData(
        ad=ad, cell_measurements_col_mappings=cell_measurements_col_mappings)

    # Loading scVI model
    if use_cuda == "auto":
        use_cuda = torch.cuda.is_available()
    use_cuda = use_cuda and torch.cuda.is_available()
    if use_cuda:
        model.load_state_dict(torch.load(model_path))
        model.cuda()
    else:
        device = torch.device("cpu")
        model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Loading data loader options and posterior
    indices = np.load(file=indices_path)
    data_loader_kwargs = pd.read_hdf(data_loader_kwargs_path,
                                     key="data_loader").to_dict()
    my_post = post_class(model=model,
                         gene_dataset=dataset,
                         shuffle=False,
                         indices=indices,
                         use_cuda=use_cuda,
                         data_loader_kwargs=data_loader_kwargs,
                         **posterior_kwargs)
    return my_post
Example #15
 def test_protected_X(self):
     data = np.random.poisson(0.2, size=(25, 10))
     ad = anndata.AnnData(data)
     ad.obs["_X"] = np.zeros(25)
     AnnDatasetFromAnnData(ad)
Example #16
def runScanvi(adata, batch, labels):
    # Use non-normalized (count) data for scanvi!

    # Check for counts data layer
    if 'counts' not in adata.layers:
        raise TypeError(
            'Adata does not contain a `counts` layer in `adata.layers["counts"]`'
        )

    from scvi.models import VAE, SCANVI
    from scvi.inference import UnsupervisedTrainer, SemiSupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData
    import numpy as np

    # STEP 1: prepare the data
    net_adata = adata.copy()
    net_adata.X = adata.layers['counts']
    del net_adata.layers['counts']
    # Ensure that the raw counts are not accidentally used
    del net_adata.raw  # Note that this only works from anndata 0.7

    # Define batch indices
    le = LabelEncoder()
    net_adata.obs['batch_indices'] = le.fit_transform(
        net_adata.obs[batch].values)
    net_adata.obs['labels'] = le.fit_transform(net_adata.obs[labels].values)

    net_adata = AnnDatasetFromAnnData(net_adata)

    print("scANVI dataset object with {} batches and {} cell types".format(
        net_adata.n_batches, net_adata.n_labels))

    #if hvg is True:
    #    # this also corrects for different batches by default
    #    net_adata.subsample_genes(2000, mode="seurat_v3")

    # Defaults from SCVI github tutorials scanpy_pbmc3k and harmonization
    n_epochs_scVI = np.min([round((20000 / adata.n_obs) * 400), 400])  #400
    n_epochs_scANVI = int(np.min([10, np.max([2, round(n_epochs_scVI / 3.)])]))
    n_latent = 30
    n_hidden = 128
    n_layers = 2

    # STEP 2: RUN scVI to initialize scANVI

    vae = VAE(
        net_adata.nb_genes,
        reconstruction_loss='nb',
        n_batch=net_adata.n_batches,
        n_latent=n_latent,
        n_hidden=n_hidden,
        n_layers=n_layers,
    )

    trainer = UnsupervisedTrainer(
        vae,
        net_adata,
        train_size=1.0,
        use_cuda=False,
    )

    trainer.train(n_epochs=n_epochs_scVI, lr=1e-3)

    # STEP 3: RUN scANVI

    scanvi = SCANVI(net_adata.nb_genes,
                    net_adata.n_batches,
                    net_adata.n_labels,
                    n_hidden=n_hidden,
                    n_latent=n_latent,
                    n_layers=n_layers,
                    dispersion='gene',
                    reconstruction_loss='nb')
    scanvi.load_state_dict(trainer.model.state_dict(), strict=False)

    # use default parameter from semi-supervised trainer class
    trainer_scanvi = SemiSupervisedTrainer(scanvi, net_adata)
    # use all cells as labelled set
    trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
        trainer_scanvi.model, net_adata, indices=np.arange(len(net_adata)))
    # put one cell in the unlabelled set
    trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
        indices=[0])
    trainer_scanvi.train(n_epochs=n_epochs_scANVI)

    # extract info from posterior
    scanvi_full = trainer_scanvi.create_posterior(trainer_scanvi.model,
                                                  net_adata,
                                                  indices=np.arange(
                                                      len(net_adata)))
    latent, _, _ = scanvi_full.sequential().get_latent()

    adata.obsm['X_emb'] = latent

    return adata
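A hedged usage sketch for runScanvi, assuming adata carries raw counts in adata.layers['counts'] and batch / cell-type annotations in adata.obs (the column names are illustrative):

# hypothetical call; the corrected latent representation is written to adata.obsm['X_emb']
adata = runScanvi(adata, batch='batch', labels='cell_type')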
Example #17
def scvi(
    adata: AnnData,
    n_hidden: int = 128,
    n_latent: int = 10,
    n_layers: int = 1,
    dispersion: str = "gene",
    n_epochs: int = 400,
    lr: float = 1e-3,
    train_size: float = 1.0,
    batch_key: Optional[str] = None,
    use_highly_variable_genes: bool = True,
    subset_genes: Optional[Sequence[Union[int, str]]] = None,
    linear_decoder: bool = False,
    copy: bool = False,
    use_cuda: bool = True,
    return_posterior: bool = True,
    trainer_kwargs: dict = {},
    model_kwargs: dict = {},
) -> Optional[AnnData]:
    """\
    SCVI [Lopez18]_.

    Fits the scVI model to raw count data given an AnnData object.

    scVI uses stochastic optimization and deep neural networks to aggregate information 
    across similar cells and genes and to approximate the distributions that underlie
    observed expression values, while accounting for batch effects and limited sensitivity.

    To use a linear-decoded Variational Autoencoder model (implementation of [Svensson20]_),
    set ``linear_decoder = True``. Compared to the standard VAE, this model is less powerful, but it can
    be used to inspect which genes contribute to variation in the dataset. It may also be used
    for all scVI tasks, like differential expression, batch correction, imputation, etc.
    However, batch correction may be less powerful as it assumes a linear model.

    .. note::
        More information and bug reports `here <https://github.com/YosefLab/scVI>`__.

    Parameters
    ----------
    adata
        An AnnData object with unnormalized count data in its `X` attribute
    n_hidden
        Number of nodes per hidden layer
    n_latent
        Dimensionality of the latent space
    n_layers
        Number of hidden layers used for encoder and decoder NNs
    dispersion
        One of the following
        * `'gene'` - dispersion parameter of NB is constant per gene across cells
        * `'gene-batch'` - dispersion can differ between different batches
        * `'gene-label'` - dispersion can differ between different labels
        * `'gene-cell'` - dispersion can differ for every gene in every cell
    n_epochs
        Number of epochs to train
    lr
        Learning rate
    train_size
        The train size, either a float between 0 and 1 or an integer for the number of training samples to use
    batch_key
        Column name in anndata.obs for batches.
        If None, no batch correction is performed.
        If not None, batch correction is performed per batch category.
    use_highly_variable_genes
        If true, uses only the genes in anndata.var["highly_variable"]
    subset_genes
        Optional list of indices or gene names to subset anndata.
        If not None, use_highly_variable_genes is ignored.
    linear_decoder
        If true, uses LDVAE model, which is an implementation of [Svensson20]_.
    copy
        If true, a copy of anndata is returned
    return_posterior
        If true, posterior object is returned
    use_cuda
        If true, uses cuda
    trainer_kwargs
        Extra arguments for UnsupervisedTrainer
    model_kwargs
        Extra arguments for VAE or LDVAE model
    
    Returns
    -------
    If `copy` is true, anndata is returned.
    If `return_posterior` is true, the posterior object is returned.
    If both `copy` and `return_posterior` are true, 
    a tuple of anndata and the posterior are returned in that order. 

    `adata.obsm['X_scvi']` stores the latent representations
    `adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial
    `adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial
    
    If linear_decoder is true:
    `adata.uns['ldvae_loadings']` stores the per-gene weights in the linear decoder as a
    genes by n_latent matrix.

    """
    warnings.warn(
        "scvi via scanpy external API is no longer supported. " +
        "Please use the new scvi-tools package from `scvi-tools.org`",
        FutureWarning,
    )

    try:
        from scvi.models import VAE, LDVAE
        from scvi.inference import UnsupervisedTrainer
        from scvi.dataset import AnnDatasetFromAnnData
    except ImportError:
        raise ImportError(
            "Please install scvi package from https://github.com/YosefLab/scVI"
        )

    # check if observations are unnormalized using first 10
    # code from: https://github.com/theislab/dca/blob/89eee4ed01dd969b3d46e0c815382806fbfc2526/dca/io.py#L63-L69
    if len(adata) > 10:
        X_subset = adata.X[:10]
    else:
        X_subset = adata.X
    norm_error = (
        'Make sure that the dataset (adata.X) contains unnormalized count data.'
    )
    if sp.sparse.issparse(X_subset):
        assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error
    else:
        assert np.all(X_subset.astype(int) == X_subset), norm_error

    if subset_genes is not None:
        adata_subset = adata[:, subset_genes]
    elif use_highly_variable_genes and "highly_variable" in adata.var:
        adata_subset = adata[:, adata.var["highly_variable"]]
    else:
        adata_subset = adata

    if batch_key is not None:
        codes, uniques = pd.factorize(adata_subset.obs[batch_key])
        adata_subset.obs['_tmp_scvi_batch'] = codes
        n_batches = len(uniques)
    else:
        n_batches = 0

    dataset = AnnDatasetFromAnnData(adata_subset.copy(),
                                    batch_label='_tmp_scvi_batch')

    if linear_decoder:
        vae = LDVAE(
            n_input=dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers_encoder=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )

    else:
        vae = VAE(
            dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )

    trainer = UnsupervisedTrainer(
        model=vae,
        gene_dataset=dataset,
        use_cuda=use_cuda,
        train_size=train_size,
        **trainer_kwargs,
    )

    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(trainer.model,
                                    dataset,
                                    indices=np.arange(len(dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    if copy:
        adata = adata.copy()

    adata.obsm['X_scvi'] = latent
    adata.obsm['X_scvi_denoised'] = full.sequential().get_sample_scale()
    adata.obsm['X_scvi_sample_rate'] = full.sequential().imputation()

    if linear_decoder:
        loadings = vae.get_loadings()
        df = pd.DataFrame(loadings, index=adata_subset.var_names)
        adata.uns['ldvae_loadings'] = df

    if copy and return_posterior:
        return adata, full
    elif copy:
        return adata
    elif return_posterior:
        return full
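A hedged call sketch for the wrapper above, assuming adata.X holds raw counts and batches are recorded in adata.obs['batch'] (an illustrative column name):

# with the defaults copy=False and return_posterior=True, the posterior is returned and
# the latent representation is written to adata.obsm['X_scvi']
posterior = scvi(adata, n_epochs=400, batch_key='batch', use_cuda=False)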
Example #18
def runScvi(adata, batch, hvg=None):
    # Use non-normalized (count) data for scvi!
    # Expects data only on HVGs

    checkSanity(adata, batch, hvg)

    # Check for counts data layer
    if 'counts' not in adata.layers:
        raise TypeError(
            'Adata does not contain a `counts` layer in `adata.layers["counts"]`'
        )

    from scvi.models import VAE
    from scvi.inference import UnsupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData

    # Defaults from SCVI github tutorials scanpy_pbmc3k and harmonization
    n_epochs = np.min([round((20000 / adata.n_obs) * 400), 400])
    n_latent = 30
    n_hidden = 128
    n_layers = 2

    net_adata = adata.copy()
    net_adata.X = adata.layers['counts']
    del net_adata.layers['counts']
    # Ensure that the raw counts are not accidentally used
    del net_adata.raw  # Note that this only works from anndata 0.7

    # Define batch indices
    le = LabelEncoder()
    net_adata.obs['batch_indices'] = le.fit_transform(
        net_adata.obs[batch].values)

    net_adata = AnnDatasetFromAnnData(net_adata)

    vae = VAE(
        net_adata.nb_genes,
        reconstruction_loss='nb',
        n_batch=net_adata.n_batches,
        n_layers=n_layers,
        n_latent=n_latent,
        n_hidden=n_hidden,
    )

    trainer = UnsupervisedTrainer(
        vae,
        net_adata,
        train_size=1.0,
        use_cuda=False,
    )

    trainer.train(n_epochs=n_epochs, lr=1e-3)

    full = trainer.create_posterior(trainer.model,
                                    net_adata,
                                    indices=np.arange(len(net_adata)))
    latent, _, _ = full.sequential().get_latent()

    adata.obsm['X_emb'] = latent

    return adata
Example #19
                # SCVI
                ##############################################################
                import time
                from scvi.dataset import AnnDatasetFromAnnData
                from scvi.dataset.dataset import GeneExpressionDataset
                from scvi.inference import UnsupervisedTrainer
                from scvi.models import SCANVI, VAE
                from umap import UMAP
                import scanpy as sc

                # TODO: import the datasets into SCVI objects (sigh!)
                # scVI wants raw counts, but who knows about those TabulaMurisSenis data
                # quick and dirty solution for now
                asubr_scvi = asubr.copy()
                asubr_scvi.X.data = asubr_scvi.X.data.astype(np.int64)
                ds_atlas = AnnDatasetFromAnnData(asubr_scvi)

                asub2_scvi = asub2.copy()
                asub2_scvi.X.data = asub2_scvi.X.data.astype(np.int64)
                ds_new = AnnDatasetFromAnnData(asub2_scvi)

                all_dataset = GeneExpressionDataset()
                all_dataset.populate_from_datasets([ds_atlas, ds_new])

                ##############################################################
                t0 = time.time()
                print('Prepare some data structures')
                vae = VAE(
                    all_dataset.nb_genes,
                    n_batch=all_dataset.n_batches,
                    n_labels=all_dataset.n_labels,