Example #1
def test_concatenate_mixed():
    X1 = sparse.csr_matrix(np.array([[1, 2, 0], [4, 0, 6], [0, 0, 9]]))
    X2 = sparse.csr_matrix(np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]]))
    X3 = sparse.csr_matrix(np.array([[1, 0, 3], [0, 0, 6], [0, 8, 0]]))
    X4 = np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]])
    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2", "s3"], anno1=["c1", "c2", "c3"]),
        dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]),
        layers=dict(counts=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s4", "s5", "s6"], anno1=["c3", "c4", "c5"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        layers=dict(counts=X4),  # sic
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s7", "s8", "s9"], anno2=["d3", "d4", "d5"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 2, 3], annoB=[0, 1, 2]),
        layers=dict(counts=X3),
    )
    adata4 = AnnData(
        X4,
        dict(obs_names=["s4", "s5", "s6"], anno1=["c3", "c4", "c5"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        layers=dict(counts=X2),  # sic
    )

    adata_all = AnnData.concatenate(adata1, adata2, adata3, adata4)
    assert isinstance(adata_all.X, sparse.csr_matrix)
    assert isinstance(adata_all.layers["counts"], sparse.csr_matrix)
Example #2
def test_scvi_linear():
    n_samples = 4
    n_genes = 7
    batch1 = np.random.randint(1, 5, size=(n_samples, n_genes))
    batch2 = np.random.randint(1, 5, size=(n_samples, n_genes))
    ad1 = AnnData(batch1)
    ad2 = AnnData(batch2)
    adata = ad1.concatenate(ad2, batch_categories=['test1', 'test2'])
    n_latent = 30
    gene_subset = ['1', '4', '6']
    sce.pp.scvi(
        adata,
        use_cuda=False,
        n_epochs=1,
        n_latent=n_latent,
        return_posterior=True,
        batch_key='batch',
        linear_decoder=True,
        subset_genes=gene_subset,
    )

    assert adata.obsm['X_scvi'].shape == (n_samples * 2, n_latent)
    assert adata.obsm['X_scvi_denoised'].shape == (n_samples * 2,
                                                   len(gene_subset))
    assert adata.obsm['X_scvi_sample_rate'].shape == (n_samples * 2,
                                                      len(gene_subset))
    assert adata.uns['ldvae_loadings'].shape == (len(gene_subset), n_latent)
    assert len(adata.uns['ldvae_loadings'].index) == len(gene_subset)
    assert set(adata.uns['ldvae_loadings'].index) == set(gene_subset)
Example #3
def test_concatenate_mixed():
    X1 = csr_matrix(np.array([[1, 2, 0], [4, 0, 6], [0, 0, 9]]))
    X2 = csr_matrix(np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]]))
    X3 = csr_matrix(np.array([[1, 0, 3], [0, 0, 6], [0, 8, 0]]))
    X4 = np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]])
    adata1 = AnnData(
        X1,
        dict(obs_names=['s1', 's2', 's3'], anno1=['c1', 'c2', 'c3']),
        dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]),
        layers=dict(counts=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=['s4', 's5', 's6'], anno1=['c3', 'c4', 'c5']),
        dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]),
        layers=dict(counts=X4),  # sic
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=['s7', 's8', 's9'], anno2=['d3', 'd4', 'd5']),
        dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]),
        layers=dict(counts=X3),
    )
    adata4 = AnnData(
        X4,
        dict(obs_names=['s4', 's5', 's6'], anno1=['c3', 'c4', 'c5']),
        dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]),
        layers=dict(counts=X2),  # sic
    )

    adata_all = AnnData.concatenate(adata1, adata2, adata3, adata4)
    assert isspmatrix_csr(adata_all.X)
    assert isspmatrix_csr(adata_all.layers['counts'])
Example #4
def load_normalized_data(file_path, log1p=True):
    """Load normalized data.

    1. Load filtered data for both FACS and droplet
    2. Size factor normalization to counts per 10 thousand
    3. log(x+1) transform (skipped if log1p is False)
    4. Combine the data

    Args:
        file_path (str): file path. Should contain both the FACS data facs_filtered.h5ad and the droplet data droplet_filtered.h5ad
        log1p (bool): whether to apply the log(x+1) transform

    Returns:
        adata_combine (AnnData): Combined data for FACS and droplet
    """
    # Load filtered data
    adata_facs = read_h5ad(f'{file_path}/facs_filtered.h5ad')
    adata_droplet = read_h5ad(f'{file_path}/droplet_filtered.h5ad')
    # Size factor normalization
    sc.pp.normalize_per_cell(adata_facs, counts_per_cell_after=1e4)
    sc.pp.normalize_per_cell(adata_droplet, counts_per_cell_after=1e4)
    # log(x+1) transform
    if log1p:
        sc.pp.log1p(adata_facs)
        sc.pp.log1p(adata_droplet)
    # Combine the data
    ind_select = adata_facs.obs['age'].isin(['3m', '18m', '24m'])
    adata_facs = adata_facs[ind_select, ]
    adata_combine = AnnData.concatenate(adata_facs,
                                        adata_droplet,
                                        batch_key='b_method',
                                        batch_categories=['facs', 'droplet'])
    return adata_combine
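
A minimal usage sketch for the loader above (the directory path and the printed column are assumptions; 'b_method' comes from the batch_key used inside the function):

# Hypothetical usage: file_path must contain facs_filtered.h5ad and
# droplet_filtered.h5ad, as the docstring requires.
adata_combine = load_normalized_data('data/tabula_muris', log1p=True)
# cells from the two platforms are labeled in obs['b_method']
print(adata_combine.obs['b_method'].value_counts())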
Example #5
def concat_data(data_list,
                batch_categories=None,
                join='inner',
                batch_key='batch',
                index_unique=None,
                save=None):
    """
    Concat multiple datasets
    """
    if len(data_list) == 1:
        return load_files(data_list[0])
    elif isinstance(data_list, str):
        return load_files(data_list)
    adata_list = []
    for root in data_list:
        adata = load_files(root)
        adata_list.append(adata)

    if batch_categories is None:
        batch_categories = list(map(str, range(len(adata_list))))
    else:
        assert len(adata_list) == len(batch_categories)
    for adata, b in zip(adata_list, batch_categories):
        print(b, adata.shape)
    concat = AnnData.concatenate(*adata_list,
                                 join=join,
                                 batch_key=batch_key,
                                 batch_categories=batch_categories,
                                 index_unique=index_unique)
    if save:
        concat.write(save, compression='gzip')
    return concat
Example #6
    def map_query_data(cls, corrected_reference: AnnData, query: AnnData, reference_model: Union[str, 'scgen'], batch_key: str = 'study', return_latent = True):
        """
        Removes the batch effect between reference and query data.
        Additional training on query data is not needed.

        Parameters
        ----------
        corrected_reference: `~anndata.AnnData`
           Already corrected reference anndata object
        query: `~anndata.AnnData`
            Query anndata object
        reference_model: `str` or `scgen`
            Path to a saved reference model or an already loaded reference model object
        batch_key: `str`
            batch label key in query.obs
        return_latent: `bool`
            if `True` returns corrected latent representation

        Returns
        -------
        integrated: `~anndata.AnnData`
        Returns an integrated query.
        """
        query_batches_labels = query.obs[batch_key].unique().tolist()
        query_adata_by_batches = [query[query.obs[batch_key].isin([batch])].copy() for batch in query_batches_labels]

        reference_query_adata = AnnData.concatenate(corrected_reference, *query_adata_by_batches,
                                                    batch_key="reference_map",
                                                    batch_categories=['reference'] + query_batches_labels,
                                                    index_unique=None)
        reference_query_adata.obs['original_batch'] = reference_query_adata.obs[batch_key].tolist()

        # passed model as file
        if isinstance(reference_model, str):
            attr_dict, model_state_dict, var_names = cls._load_params(reference_model)
            _validate_var_names(query, var_names)
            init_params = cls._get_init_params_from_dict(attr_dict)

            new_model = cls(reference_query_adata, **init_params)
            new_model.model.load_state_dict(model_state_dict)

            integrated_query = new_model.batch_removal(reference_query_adata, batch_key = "reference_map", cell_label_key = "cell_type", return_latent = True)

            return integrated_query

        #passed model as model object
        else:
            # when corrected_reference is already in the passed model
            if np.all(reference_model._get_user_attributes()[0][1].X == corrected_reference.X):
                integrated_query = reference_model.batch_removal(reference_query_adata, batch_key = "reference_map", cell_label_key = "cell_type", return_latent = True)
            else:
                attr_dict = reference_model._get_public_attributes()
                model_state_dict = reference_model.model.state_dict()
                init_params = cls._get_init_params_from_dict(attr_dict)

                new_model = cls(reference_query_adata, **init_params)
                new_model.model.load_state_dict(model_state_dict)

                integrated_query = new_model.batch_removal(reference_query_adata, batch_key = "reference_map", cell_label_key = "cell_type", return_latent = True)

            return integrated_query
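
A hedged call sketch for the classmethod above (all objects and the model path are placeholders, and the class is assumed to be named scgen as in the type hint; the method hard-codes cell_label_key="cell_type", so both objects must carry that obs column):

# corrected_reference: output of a previous batch-removal run
# query_adata: must carry batch labels in obs['study']
integrated = scgen.map_query_data(
    corrected_reference=corrected_reference,
    query=query_adata,
    reference_model='saved_models/scgen_reference',  # str -> loaded from disk
    batch_key='study',
)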
Example #7
def project(
    new_dir_name: str,
    adata_ref: AnnData,
    scv: bool = False,
    batch_categories: Optional[str] = None,
) -> AnnData:
    """
    This function is used to project adata onto adata_ref using the ingest function in scanpy. The procedure is based
    on the scanpy tutorial on integrating data using ingest. Both adata and adata_ref should have been preprocessed by
    running the preprocess function with need_scale set to False.
    :param new_dir_name: the data directory to be projected
    :param adata_ref: the reference data
    :param scv: true for scv-based RNA velocity analysis
    :param batch_categories: the name for the reference should be the first element
    :return: a merged AnnData object with the two original datasets copied and merged.
    """
    # Check if leiden has been conducted.
    if 'leiden' not in adata_ref.obs:
        raise ValueError("run cluster first for the reference data")
    adata = None
    if scv:
        adata = scv_open(new_dir_name)
        scv_preprocess(adata)
    else:
        adata = open_10_genomics_data(new_dir_name)
        # Make sure we do the same thing as in the original data. But we don't want to keep the original data
        adata = preprocess(adata, copy=False, need_scale=False)
        # Check if we need to do permutation
        if imputation_uns_key_name in adata_ref.uns_keys():
            # support magic only
            sc.external.pp.magic(
                adata, solver=adata_ref.uns[imputation_uns_key_name]['solver'])
            if 'min_value' in adata_ref.uns[imputation_uns_key_name].keys():
                adata.X -= adata_ref.uns[imputation_uns_key_name][
                    'min_value']  # scale up as in the original data
    # Make sure both of them have the same set of variables
    shared_var_names = adata_ref.var_names.intersection(adata.var_names)
    sc.logging.info("shared_var_names: {}".format(len(shared_var_names)))
    # slicing the data to make copies
    adata = adata[:, shared_var_names]
    if not scv:
        # Call regress here so that we have almost the same set of genes selected by adata_ref (aka highly variable genes)
        regressout_key = None
        if regressout_uns_key_name in adata.uns_keys():
            regressout_key = adata.uns[regressout_uns_key_name]
            sc.logging.info("Find regressout_keys for projecting: ",
                            str(regressout_key))
        regress(adata, keys=regressout_key)
    adata_ref = adata_ref[:, shared_var_names]
    # ingest based on the leiden clustering
    sc.tl.ingest(adata, adata_ref, obs='leiden')
    # merge two objects
    if not batch_categories:
        batch_categories = ['ref', 'new']
    adata_merged = adata_ref.concatenate(adata,
                                         batch_categories=batch_categories)
    return adata_merged
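
A usage sketch under the docstring's own preconditions (the directory name is an assumption): adata_ref must already carry a leiden clustering and must have been preprocessed with need_scale=False.

# Hypothetical directory containing a new 10x run to project onto adata_ref
merged = project('data/new_run', adata_ref, scv=False,
                 batch_categories=['ref', 'new'])
# merged contains both datasets; the reference batch comes first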
Example #8
def test_concatenate_sparse():
    # sparse data
    from scipy.sparse import csr_matrix

    X1 = csr_matrix([[0, 2, 3], [0, 5, 6]])
    X2 = csr_matrix([[0, 2, 3], [0, 5, 6]])
    X3 = csr_matrix([[1, 2, 0], [0, 5, 6]])

    adata1 = AnnData(
        X1,
        dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']),
        dict(var_names=['a', 'b', 'c']),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']),
        dict(var_names=['d', 'c', 'b']),
        layers=dict(Xs=X2),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=['s5', 's6'], anno2=['d3', 'd4']),
        dict(var_names=['d', 'c', 'b']),
        layers=dict(Xs=X3),
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]
    assert adata.X.toarray().astype(int).tolist() == X_combined
    assert adata.layers['Xs'].toarray().astype(int).tolist() == X_combined

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    assert adata.X.toarray().tolist() == [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0],
    ]
Example #9
def test_concatenate_sparse():
    # sparse data
    from scipy.sparse import csr_matrix

    X1 = csr_matrix([[0, 2, 3], [0, 5, 6]])
    X2 = csr_matrix([[0, 2, 3], [0, 5, 6]])
    X3 = csr_matrix([[1, 2, 0], [0, 5, 6]])

    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c"]),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]),
        dict(var_names=["d", "c", "b"]),
        layers=dict(Xs=X2),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s5", "s6"], anno2=["d3", "d4"]),
        dict(var_names=["d", "c", "b"]),
        layers=dict(Xs=X3),
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]
    assert adata.X.toarray().astype(int).tolist() == X_combined
    assert adata.layers["Xs"].toarray().astype(int).tolist() == X_combined

    # outer join
    adata = adata1.concatenate(adata2, adata3, join="outer")
    assert adata.X.toarray().tolist() == [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0],
    ]
Example #10
def test_concatenate_layers_outer(array_type, fill_val):
    # Testing that issue #368 is fixed
    a = AnnData(
        X=np.ones((10, 20)),
        layers={"a": array_type(sparse.random(10, 20, format="csr"))},
    )
    b = AnnData(X=np.ones((10, 20)))

    c = a.concatenate(b, join="outer", fill_value=fill_val, batch_categories=["a", "b"])

    np.testing.assert_array_equal(
        asarray(c[c.obs["batch"] == "b"].layers["a"]), fill_val
    )
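
The parametrized test above pins down the fill-value semantics of an outer join: a layer missing from one object is padded with fill_value for that object's rows. A self-contained dense-array sketch of the same behavior (assuming dense layers are padded like the sparse one in the test):

import numpy as np
from anndata import AnnData

a = AnnData(X=np.ones((2, 3)), layers={'a': np.zeros((2, 3))})
b = AnnData(X=np.ones((2, 3)))  # has no layer 'a'
c = a.concatenate(b, join='outer', fill_value=-1, batch_categories=['a', 'b'])
# rows that came from b are filled with -1 in layer 'a'
assert (c[c.obs['batch'] == 'b'].layers['a'] == -1).all()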
Example #11
def load_files(root):
    """
    Load single cell dataset from files
    """
    if root.split('/')[-1] == '*':
        adata = []
        for root in sorted(glob(root)):
            adata.append(load_file(root))
        return AnnData.concatenate(*adata,
                                   batch_key='sub_batch',
                                   index_unique=None)
    else:
        return load_file(root)
Example #12
def concat_data(data_list,
                batch_categories=None,
                join='inner',
                batch_key='batch',
                index_unique=None,
                save=None):
    """
    Concatenate multiple datasets along the observations axis with name ``batch_key``.
    
    Parameters
    ----------
    data_list
        A list of paths to AnnData matrices to concatenate. Each matrix is referred to as a “batch”.
    batch_categories
        Categories for the batch annotation. By default, use increasing numbers.
    join
        Use intersection ('inner') or union ('outer') of variables of different batches. Default: 'inner'.
    batch_key
        Add the batch annotation to obs using this key. Default: 'batch'.
    index_unique
        Make the index unique by joining the existing index names with the batch category, using index_unique='-', for instance. Provide None to keep existing indices.
    save
        Path to save the new merged AnnData. Default: None.
        
    Returns
    -------
    New merged AnnData.
    """
    if len(data_list) == 1:
        return load_files(data_list[0])
    elif isinstance(data_list, str):
        return load_files(data_list)
    adata_list = []
    for root in data_list:
        adata = load_files(root)
        adata_list.append(adata)

    if batch_categories is None:
        batch_categories = list(map(str, range(len(adata_list))))
    else:
        assert len(adata_list) == len(batch_categories)
    for adata, b in zip(adata_list, batch_categories):
        print(b, adata.shape)
    concat = AnnData.concatenate(*adata_list,
                                 join=join,
                                 batch_key=batch_key,
                                 batch_categories=batch_categories,
                                 index_unique=index_unique)
    if save:
        concat.write(save, compression='gzip')
    return concat
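
A hedged usage sketch (all paths are placeholders; each list entry is resolved by the load_files helper from the companion example):

adata = concat_data(
    ['data/batch1.h5ad', 'data/batch2.h5ad'],
    batch_categories=['b1', 'b2'],
    join='inner',
    batch_key='batch',
    save='data/merged.h5ad',  # written with gzip compression
)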
Example #13
def test_concatenate_dense_duplicates():
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    # inner join duplicates
    adata1 = AnnData(
        X1,
        dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']),
        dict(
            var_names=['a', 'b', 'c'],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoC=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.0],
        ),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']),
        dict(
            var_names=['a', 'b', 'c'],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoC=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.0],
        ),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']),
        dict(
            var_names=['a', 'b', 'c'],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.1],
        ),
    )

    adata = adata1.concatenate(adata2, adata3)
    assert adata.var_keys() == [
        'annoA',
        'annoB',
        'annoC-0',
        'annoD-0',
        'annoC-1',
        'annoD-1',
        'annoD-2',
    ]
Example #14
    def from_scvi_model(cls,
                        scvi_model: SCVI,
                        adata: Optional[AnnData] = None):
        """
        Instantiate a SOLO model from an scvi model.

        Parameters
        ----------
        scvi_model
            Pre-trained model of :class:`~scvi.model.SCVI`. This model
            should have been trained on data comprising one lane. The
            adata object used to initialize this model should have only
            been setup with count data, i.e., no `batch_key`,
            `labels_key`, etc.
        adata
            Optional anndata to use that is compatible with scvi_model.

        Returns
        -------
        SOLO model
        """
        _validate_scvi_model(scvi_model)
        doublet_adata = cls.create_doublets(scvi_model.adata)

        # if model is using observed lib size, needs to get lib sample
        # which is just observed lib size on log scale
        give_mean_lib = not scvi_model.module.use_observed_lib_size

        # get latent representations and make input anndata
        latent_rep = scvi_model.get_latent_representation()
        lib_size = scvi_model.get_latent_library_size(give_mean=give_mean_lib)
        latent_adata = AnnData(np.concatenate([latent_rep, lib_size], axis=1))
        latent_adata.obs[LABELS_KEY] = "singlet"

        logger.info("Creating doublets, preparing SOLO model.")
        f = io.StringIO()
        with redirect_stdout(f):
            setup_anndata(doublet_adata)
            doublet_latent_rep = scvi_model.get_latent_representation(
                doublet_adata)
            doublet_lib_size = scvi_model.get_latent_library_size(
                doublet_adata, give_mean=give_mean_lib)
            doublet_adata = AnnData(
                np.concatenate([doublet_latent_rep, doublet_lib_size], axis=1))
            doublet_adata.obs[LABELS_KEY] = "doublet"

            full_adata = latent_adata.concatenate(doublet_adata)
            setup_anndata(full_adata, labels_key=LABELS_KEY)
        return cls(full_adata)
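
A hedged end-to-end sketch of how this constructor is typically reached (the surrounding workflow is an assumption; scvi-tools API details vary between versions):

# adata: a single-lane count matrix, set up with counts only
setup_anndata(adata)          # no batch_key / labels_key, as required above
scvi_model = SCVI(adata)
scvi_model.train()
solo = SOLO.from_scvi_model(scvi_model)
solo.train()
predictions = solo.predict()  # per-cell doublet vs. singlet scores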
Example #15
def test_concatenate_dense_duplicates():
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    # inner join duplicates
    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(
            var_names=["a", "b", "c"],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoC=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.0],
        ),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]),
        dict(
            var_names=["a", "b", "c"],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoC=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.0],
        ),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]),
        dict(
            var_names=["a", "b", "c"],
            annoA=[0, 1, 2],
            annoB=[1.1, 1.0, 2.0],
            annoD=[2.1, 2.0, 3.1],
        ),
    )

    adata = adata1.concatenate(adata2, adata3)
    assert adata.var_keys() == [
        "annoA",
        "annoB",
        "annoC-0",
        "annoD-0",
        "annoC-1",
        "annoD-1",
        "annoD-2",
    ]
Example #16
 def train(self,adata_list,adj_list, l_list,
         num_pcs=50, 
         lr=0.005,
         max_epochs=2000,
         weight_decay=0,
         opt="admin",
         init_spa=True,
         init="louvain", #louvain or kmeans
         n_neighbors=10, #for louvain
         n_clusters=None, #for kmeans
         res=0.4, #for louvain
         tol=1e-3):
     self.num_pcs=num_pcs
     self.res=res
     self.lr=lr
     self.max_epochs=max_epochs
     self.weight_decay=weight_decay
     self.opt=opt
     self.init_spa=init_spa
     self.init=init
     self.n_neighbors=n_neighbors
     self.n_clusters=n_clusters
     self.tol=tol
     num_spots=0
     for i in adata_list: 
         num_spots+=i.shape[0]
     adj_exp_all=np.zeros((num_spots, num_spots))  # zeros, not empty: off-diagonal blocks must stay 0
     start=0
     for i in range(len(l_list)):
         l=l_list[i]
         adj=adj_list[i]
         adj_exp=np.exp(-1*(adj**2)/(2*(l**2)))  # Gaussian kernel with bandwidth l
         adj_exp_all[start:start+adj_exp.shape[0],start:start+adj_exp.shape[0]]=adj_exp
         start+=adj_exp.shape[0]  # advance by this block's size only
     self.adata_all=AnnData.concatenate(*adata_list,join='inner',batch_key="dataset_batch",batch_categories=["0","1"])
     pca = PCA(n_components=self.num_pcs)
     if issparse(self.adata_all.X):
         pca.fit(self.adata_all.X.A)
         embed=pca.transform(self.adata_all.X.A)
     else:
         pca.fit(self.adata_all.X)
         embed=pca.transform(self.adata_all.X)
     #----------Train model----------
     self.model=simple_GC_DEC(embed.shape[1],embed.shape[1])
     self.model.fit(embed,adj_exp_all,lr=self.lr,max_epochs=self.max_epochs,weight_decay=self.weight_decay,opt=self.opt,init_spa=self.init_spa,init=self.init,n_neighbors=self.n_neighbors,n_clusters=self.n_clusters,res=self.res, tol=self.tol)
     self.embed=embed
     self.adj_exp=adj_exp_all
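
The loop above assembles a block-diagonal affinity matrix: each dataset's spatial distance matrix is turned into a Gaussian kernel exp(-adj**2 / (2*l**2)) and placed on the diagonal, leaving cross-dataset affinities at zero. A small self-contained sketch of that assembly with toy matrices:

import numpy as np

adj_list = [np.random.rand(3, 3), np.random.rand(2, 2)]  # toy distance matrices
l_list = [0.5, 0.8]                                      # per-dataset bandwidths

n = sum(adj.shape[0] for adj in adj_list)
adj_all = np.zeros((n, n))          # off-diagonal blocks stay zero
start = 0
for adj, l in zip(adj_list, l_list):
    k = adj.shape[0]
    adj_all[start:start + k, start:start + k] = np.exp(-adj**2 / (2 * l**2))
    start += k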
Example #17
def test_load_timepoints_from_anndata_list():
    adata_ref = sc.datasets.pbmc3k()
    start = [596, 615, 1682, 1663, 1409, 1432]
    adata = AnnData.concatenate(
        *(adata_ref[i : i + 1000] for i in start),
        join="outer",
        batch_key="sample",
        batch_categories=[f"sa{i}_Rep{j}" for i, j in product((1, 2, 3), (1, 2))],
    )
    adata.obs["time_points"] = adata.obs["sample"].str.split("_", expand=True)[0]
    adata.obs["time_points"] = adata.obs["time_points"].astype("category")
    sc.pp.normalize_total(adata, target_sum=10000)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=1000, subset=True)

    sce.tl.harmony_timeseries(adata=adata, tp="time_points", n_components=None)
    assert all(
        [adata.obsp['harmony_aff'].shape[0], adata.obsp['harmony_aff_aug'].shape[0]]
    ), "harmony_timeseries augmented affinity matrix Error!"
Example #18
def load_files(root):
    """
    Load single cell dataset from files
    
    Parameters
    ----------
    root
        the root directory storing the single-cell data files; each file represents one dataset

    Returns
    -------
    AnnData
    """
    if root.split('/')[-1] == '*':
        adata = []
        for root in sorted(glob(root)):
            adata.append(load_file(root))
        return AnnData.concatenate(*adata, batch_key='sub_batch', index_unique=None)
    else:
        return load_file(root)
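
Call sketch for the two branches (paths are assumptions): a path ending in /* is globbed and the matching files are concatenated under obs['sub_batch']; anything else is passed straight to load_file.

# Hypothetical layout: every file under data/batches/ is one dataset
adata = load_files('data/batches/*')          # merged, obs['sub_batch'] added
single = load_files('data/batches/b1.h5ad')   # single file, no concatenation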
Example #19
def test_scvi():
    n_samples = 4
    n_genes = 7
    batch1 = np.random.randint(1, 5, size=(n_samples, n_genes))
    batch2 = np.random.randint(1, 5, size=(n_samples, n_genes))
    ad1 = AnnData(batch1)
    ad2 = AnnData(batch2)
    adata = ad1.concatenate(ad2, batch_categories=['test1', 'test2'])
    n_latent = 30
    sce.pp.scvi(
        adata,
        use_cuda=False,
        n_epochs=1,
        n_latent=n_latent,
        return_posterior=True,
        batch_key='batch',
        model_kwargs={'reconstruction_loss': 'nb'},
    )
    assert adata.obsm['X_scvi'].shape == (n_samples * 2, n_latent)
    assert adata.obsm['X_scvi_denoised'].shape == adata.shape
    assert adata.obsm['X_scvi_sample_rate'].shape == adata.shape
Example #20
    def batch_removal(self, adata: Optional[AnnData] = None) -> AnnData:
        """
        Removes batch effects.

        Parameters
        ----------
        adata
            AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
            AnnData object used to initialize the model. Must have been setup with `batch_key` and `labels_key`,
            corresponding to batch and cell type metadata, respectively.

        Returns
        -------
        corrected: `~anndata.AnnData`
            AnnData of corrected gene expression in adata.X and corrected latent space in adata.obsm["latent"].
            A reference to the original AnnData is in `corrected.raw` if the input adata had no `raw` attribute.
        """
        adata = self._validate_anndata(adata)
        latent_all = self.get_latent_representation(adata)
        # use keys registered from `setup_anndata()`
        cell_label_key = self.scvi_setup_dict_["categorical_mappings"]["_scvi_labels"][
            "original_key"
        ]
        batch_key = self.scvi_setup_dict_["categorical_mappings"]["_scvi_batch"][
            "original_key"
        ]

        adata_latent = AnnData(latent_all)
        adata_latent.obs = adata.obs.copy(deep=True)
        unique_cell_types = np.unique(adata_latent.obs[cell_label_key])
        shared_ct = []
        not_shared_ct = []
        for cell_type in unique_cell_types:
            temp_cell = adata_latent[
                adata_latent.obs[cell_label_key] == cell_type
            ].copy()
            if len(np.unique(temp_cell.obs[batch_key])) < 2:
                cell_type_ann = adata_latent[
                    adata_latent.obs[cell_label_key] == cell_type
                ]
                not_shared_ct.append(cell_type_ann)
                continue
            temp_cell = adata_latent[
                adata_latent.obs[cell_label_key] == cell_type
            ].copy()
            batch_list = {}
            batch_ind = {}
            max_batch = 0
            max_batch_ind = ""
            batches = np.unique(temp_cell.obs[batch_key])
            for i in batches:
                temp = temp_cell[temp_cell.obs[batch_key] == i]
                temp_ind = temp_cell.obs[batch_key] == i
                if max_batch < len(temp):
                    max_batch = len(temp)
                    max_batch_ind = i
                batch_list[i] = temp
                batch_ind[i] = temp_ind
            max_batch_ann = batch_list[max_batch_ind]
            for study in batch_list:
                delta = np.average(max_batch_ann.X, axis=0) - np.average(
                    batch_list[study].X, axis=0
                )
                batch_list[study].X = delta + batch_list[study].X
                temp_cell[batch_ind[study]].X = batch_list[study].X
            shared_ct.append(temp_cell)
        all_shared_ann = AnnData.concatenate(
            *shared_ct, batch_key="concat_batch", index_unique=None
        )
        if "concat_batch" in all_shared_ann.obs.columns:
            del all_shared_ann.obs["concat_batch"]
        if len(not_shared_ct) < 1:
            corrected = AnnData(
                self.module.generative(torch.Tensor(all_shared_ann.X))["px"]
                .cpu()
                .numpy(),
                obs=all_shared_ann.obs,
            )
            corrected.var_names = adata.var_names.tolist()
            corrected = corrected[adata.obs_names]
            if adata.raw is not None:
                adata_raw = AnnData(X=adata.raw.X, var=adata.raw.var)
                adata_raw.obs_names = adata.obs_names
                corrected.raw = adata_raw
            corrected.obsm["latent"] = all_shared_ann.X
            corrected.obsm["corrected_latent"] = self.get_latent_representation(
                corrected
            )
            return corrected
        else:
            all_not_shared_ann = AnnData.concatenate(
                *not_shared_ct, batch_key="concat_batch", index_unique=None
            )
            all_corrected_data = AnnData.concatenate(
                all_shared_ann,
                all_not_shared_ann,
                batch_key="concat_batch",
                index_unique=None,
            )
            if "concat_batch" in all_shared_ann.obs.columns:
                del all_corrected_data.obs["concat_batch"]
            corrected = AnnData(
                self.module.generative(torch.Tensor(all_corrected_data.X))["px"]
                .cpu()
                .numpy(),
                obs=all_corrected_data.obs,
            )
            corrected.var_names = adata.var_names.tolist()
            corrected = corrected[adata.obs_names]
            if adata.raw is not None:
                adata_raw = AnnData(X=adata.raw.X, var=adata.raw.var)
                adata_raw.obs_names = adata.obs_names
                corrected.raw = adata_raw
            corrected.obsm["latent"] = all_corrected_data.X
            corrected.obsm["corrected_latent"] = self.get_latent_representation(
                corrected
            )
            return corrected
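
The correction in the loop above is, at its core, a per-cell-type mean shift in latent space: within each cell type present in more than one batch, every batch is translated so its centroid coincides with the centroid of the largest batch. A minimal NumPy sketch of that single step on toy data (not the scgen API):

import numpy as np

rng = np.random.default_rng(0)
latent = rng.normal(size=(6, 2))               # latent vectors of one cell type
batch = np.array(['a', 'a', 'a', 'b', 'b', 'b'])

# 'a' is the largest batch and acts as the reference
delta = latent[batch == 'a'].mean(axis=0) - latent[batch == 'b'].mean(axis=0)
latent[batch == 'b'] += delta                  # shift batch 'b' onto the reference
assert np.allclose(latent[batch == 'a'].mean(0), latent[batch == 'b'].mean(0))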
Example #21
def test_concatenate_with_raw():
    # dense data
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    X4 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        layers=dict(Xs=X2),
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]),
        dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]),
        layers=dict(Xs=X3),
    )

    adata4 = AnnData(
        X4,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c", "z"], annoA=[0, 1, 2, 3]),
        layers=dict(Xs=X4),
    )

    adata1.raw = adata1
    adata2.raw = adata2
    adata3.raw = adata3

    adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == {"b", "c"}
    assert_equal(adata_all.raw.to_adata().obs, adata_all.obs)
    assert np.array_equal(adata_all.raw.X, adata_all.X)

    adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer")
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == set("abcd")
    assert_equal(adata_all.raw.to_adata().obs, adata_all.obs)
    assert np.array_equal(np.nan_to_num(adata_all.raw.X),
                          np.nan_to_num(adata_all.X))

    adata3.raw = adata4
    adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer")
    assert isinstance(adata_all.raw, Raw)
    assert set(adata_all.raw.var_names) == set("abcdz")
    assert set(adata_all.var_names) == set("abcd")
    assert not np.array_equal(np.nan_to_num(adata_all.raw.X),
                              np.nan_to_num(adata_all.X))

    del adata3.raw
    with pytest.warns(
            UserWarning,
            match=("Only some AnnData objects have `.raw` attribute, "
                   "not concatenating `.raw` attributes."),
    ):
        adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert adata_all.raw is None

    del adata1.raw
    del adata2.raw
    assert all(_adata.raw is None for _adata in (adata1, adata2, adata3))
    adata_all = AnnData.concatenate(adata1, adata2, adata3)
    assert adata_all.raw is None
Example #22
def TCR_activation_10X():
    ans = list()
    for sample, activation in zip(samples1[1:3],
                                  ["unstimulated", "stimulated"]):
        print(sample)
        output_prefix = pjoin("results", run, sample, sample)
        b = sc.read(output_prefix + ".filtered.h5ad")
        b.obs = b.obs.assign(activation=activation)
        ans.append(b)

    output_prefix = (run +
                     "/PD2XX1_10xscRNA_Human_Tcells_2S3Qmixed.samples_joined")
    a = AnnData.concatenate(*ans)
    assert (np.asarray(a.var["external_gene_name-0"]) == np.asarray(
        a.var["external_gene_name-1"])).all()
    a.var["external_gene_name"] = a.var["external_gene_name-0"]
    a = a[np.random.choice(a.obs.index.tolist(), a.obs.shape[0],
                           replace=False), :]

    tech_attributes = [
        "log_counts",
        "log_genes",
        "efficiency_ratio",
        "percent_mito",
        "percent_ribo",
        "percent_malat1",
    ]
    attributes = ["activation"]

    params = Params()
    params.plot_raw = False

    sc.pp.scale(a)
    sc.pp.pca(a)
    sc.pp.neighbors(a)
    sc.tl.umap(a)

    # Plot

    # marker genes (tailored for a PBMC sample)
    mark1 = [
        "MALAT1",  # sc
        "CD34",  # HSC
        "CD3D",
        "CD3G",
        "CD247",  # T-cell
        "CD4",
        "FOXP3",
        "CCR7",
        "CTLA4",  # CD4
        "CD8A",
        "NKG7",
        "IL2RA",  # CD8
        "NCAM1",
        "GNLY",  # NK
        "CD14",
        "CST3",  # Monocytes
        "CD79A",
        "CD19",
        "IGHG1",  # B cells  (also MS4A1)
        "FCER1G",
        "CLEC10A",  # dendritic cells
        "SELE",
        "CD93",
        "PECAM1",
        "KDR",  # endothelial cells
        "DCN",
        "COL6A2",  # fibroblasts
        "GZMB",
        "CD68",
        "CD27",
        "MS4A1",
        "CD24",
        "NCR1",
        "CD274",
    ]  # Immune
    mark2 = [
        "IL32",
        "IFNG",
        "IFNGR1",
        "IL4R",
        "IL4",
        "JUN",
        "JUNB",
        "JUND",
        "JAK1",
        "JAK2",
        "GATA1",
        "JARID2",
        "KRAS",
        "MYC",
    ]
    mark3 = ["BTK", "LCK", "E2F4", "CXCR4", "ITGA4", "HBA1", "PTPRC"]
    red_mark = ["CST3", "TCL1A", "GZMB", "NKG7", "CD3D"]
    marker_genes = mark1 + mark2 + mark3

    sc.pl.pca_variance_ratio(a, log=True, show=False)
    plt.gca().figure.savefig(
        output_prefix + ".single_cell.pca_variance_ratio.svg",
        dpi=300,
        bbox_inches="tight",
    )

    # fig = sc.pl.pca(a, color=tech_attributes + attributes, components=['1,2', '2,3', '3,4', '4,5'], return_fig=True)
    # for ax in fig.axes:
    #     ax.get_children()[0].set_rasterized(True)
    # fig.savefig(output_prefix + ".single_cell.pca.svg", dpi=300, bbox_inches="tight")

    fig = sc.pl.umap(a, color=tech_attributes + attributes, return_fig=True)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    fig.savefig(output_prefix + ".single_cell.umap.svg",
                dpi=300,
                bbox_inches="tight")

    sc.pp.combat(a, "activation")

    sc.pp.scale(a)
    sc.pp.pca(a)
    sc.pp.neighbors(a)
    sc.tl.umap(a)
    fig = sc.pl.umap(a, color=tech_attributes + attributes, return_fig=True)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    fig.savefig(
        output_prefix + ".single_cell.post_combat.umap.svg",
        dpi=300,
        bbox_inches="tight",
    )

    sc.pp.regress_out(a, keys=["log_counts"])

    sc.pp.scale(a)
    sc.pp.pca(a)
    sc.pp.neighbors(a)
    sc.tl.umap(a)
    fig = sc.pl.umap(a, color=tech_attributes + attributes, return_fig=True)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    fig.savefig(
        output_prefix +
        ".single_cell.post_combat.post_regressout_counts.umap.svg",
        dpi=300,
        bbox_inches="tight",
    )

    # sc.write(output_prefix + ".regress_out_counts.h5ad", a)

    # Now regress out also efficiency_ratio
    sc.pp.regress_out(a, keys=["efficiency_ratio"])

    sc.pp.scale(a)
    sc.pp.pca(a)
    sc.pp.neighbors(a)
    sc.tl.umap(a)
    fig = sc.pl.umap(a, color=tech_attributes + attributes, return_fig=True)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    fig.savefig(
        output_prefix +
        ".single_cell.post_combat.post_regressout_counts.post_regressout_efficiency.umap.svg",
        dpi=300,
        bbox_inches="tight",
    )

    # sc.write(output_prefix + ".regress_out_counts.regress_out_efficiency_ratio.h5ad", a)
    if params.plot_raw:
        g = [
            y for x in marker_genes for y in a.raw.var.loc[
                a.raw.var["external_gene_name"] == x].index.tolist()
        ]
    else:
        g = [
            x for x in marker_genes
            if x in a.var["external_gene_name"].tolist()
        ]
    color = tech_attributes + attributes + g
    kwargs = dict(
        hspace=0.1,
        wspace=0,
        return_fig=True,
        use_raw=params.plot_raw,
        gene_symbols="external_gene_name" if not params.plot_raw else None,
    )

    fig = sc.pl.umap(a, color=color, **kwargs)
    for ax in fig.axes:
        ax.get_children()[0].set_rasterized(True)
    if params.plot_raw:
        for ax in fig.axes:
            try:
                ax.set_title(a.raw.var.loc[ax.get_title(),
                                           "external_gene_name"])
            except KeyError:
                pass
    fig.savefig(
        output_prefix +
        ".single_cell.post_combat.post_regressout_counts.post_regressout_efficiency.markers_extended.umap.svg",
        dpi=300,
        bbox_inches="tight",
    )

    # differential expression

    max_genes = 500
    attribute = "activation"
    attributes = [attribute]
    a.X += abs(a.X.min())

    # # differential expression
    diff = differential_expression(a, attribute, n_genes=max_genes)
    diff.to_csv(
        output_prefix + f".{attribute}.cluster_comparison.top_values.csv",
        index=True,
    )

    diff = pd.read_csv(
        output_prefix + f".{attribute}.cluster_comparison.top_values.csv",
        index_col=0,
    )

    fig = plot_differential_expression(diff)
    fig.savefig(
        output_prefix + f".{attribute}.differential_expression.ma_plot.svg",
        dpi=300,
        bbox_inches="tight",
    )

    # # differential enrichment
    groups = [x for x in diff["group"].unique() if x not in ["-1", -1]]

    enrichments = differential_enrichment(
        diff,
        groups,
        attribute,
        alpha=0.05,
        alpha_col="pvals_adj",
        max_n=max_genes,
        sort_by="scores",
    )
    enrichments.to_csv(output_prefix +
                       f".{attribute}.differential_enrichment.csv",
                       index=False)
    enrichments = pd.read_csv(output_prefix +
                              f".{attribute}.differential_enrichment.csv",
                              index_col=0)

    g = (enrichments.set_index("description").groupby(
        ["group"])["combined_score"].nlargest(5))
    print("combined_score:\n", g)
    g = (enrichments.set_index("description").groupby(
        ["group"])["p_value"].nsmallest(5))
    print("p_value:\n", g)

    plot_differential_enrichment(enrichments, output_prefix, ntop_terms=20)
Example #23
def mnn_correct(*datas,
                var_index=None,
                var_subset=None,
                batch_key='batch',
                index_unique='-',
                batch_categories=None,
                k=20,
                sigma=1.,
                cos_norm_in=True,
                cos_norm_out=True,
                svd_dim=None,
                var_adj=True,
                compute_angle=False,
                mnn_order=None,
                svd_mode='rsvd',
                do_concatenate=True,
                save_raw=False,
                n_jobs=None,
                **kwargs):
    """
    Apply MNN correct to input data matrices or AnnData objects. Depending on do_concatenate,
    returns matrices or AnnData objects in the original order containing corrected expression
    values, or concatenated matrices or AnnData object.

    :param datas: `numpy.ndarray` or class:`anndata.AnnData`
        Expression matrices or AnnData objects. Matrices should be shaped like n_obs * n_vars
        (n_cell * n_gene) and have consistent number of columns. AnnData objects should have same
        number of vars.

    :param var_index: `list` or `None`, optional (default: None)
        The index (list of str) of vars (genes). Necessary when using only a subset of vars to
        perform MNN correction, and should be supplied with var_subset. When datas are AnnData
        objects, var_index is ignored.

    :param var_subset: `list` or `None`, optional (default: None)
        The subset of vars (list of str) to be used when performing MNN correction. Typically, a
        list of highly variable genes (HVGs). When set to None, uses all vars.

    :param batch_key: `str`, optional (default: 'batch')
        The batch_key for AnnData.concatenate. Only valid when do_concatenate and supplying AnnData
        objects.

    :param index_unique: `str`, optional (default: '-')
        The index_unique for AnnData.concatenate. Only valid when do_concatenate and supplying
        AnnData objects.

    :param batch_categories: `list` or `None`, optional (default: None)
        The batch_categories for AnnData.concatenate. Only valid when do_concatenate and supplying
        AnnData objects.

    :param k: `int`, optional (default: 20)
        Number of mutual nearest neighbors.

    :param sigma: `float`, optional (default: 1)
        The bandwidth of the Gaussian smoothing kernel used to compute the correction vectors.

    :param cos_norm_in: `bool`, optional (default: True)
        Whether cosine normalization should be performed on the input data prior to calculating
        distances between cells.

    :param cos_norm_out: `bool`, optional (default: True)
        Whether cosine normalization should be performed prior to computing corrected expression
        values.

    :param svd_dim: `int` or `None`, optional (default: None)
        The number of dimensions to use for summarizing biological substructure within each batch.
        If set to None, biological components will not be removed from the correction vectors.

    :param var_adj: `bool`, optional (default: True)
        Whether to adjust variance of the correction vectors. Note this step takes most computing
        time.

    :param compute_angle: `bool`, optional (default: False)
        Whether to compute the angle between each cell’s correction vector and the biological
        subspace of the reference batch.

    :param mnn_order: `list` or `None`, optional (default: None)
        The order in which batches are to be corrected. When set to None, datas are corrected
        sequentially.

    :param svd_mode: `str`, optional (default: 'rsvd')
        One of 'svd', 'rsvd', and 'irlb'. 'svd' computes SVD using a non-randomized SVD-via-ID
        algorithm, while 'rsvd' uses a randomized version. 'irlb' performs truncated SVD by
        implicitly restarted Lanczos bidiagonalization (forked from https://github.com/airysen/irlbpy).

    :param do_concatenate: `bool`, optional (default: True)
        Whether to concatenate the corrected matrices or AnnData objects. Default is True.

    :param save_raw: `bool`, optional (default: False)
        Whether to save the original expression data in the .raw attribute of AnnData objects.

    :param n_jobs: `int` or `None`, optional (default: None)
        The number of jobs. When set to None, automatically uses the number of cores.

    :param kwargs: `dict` or `None`, optional (default: None)
        optional keyword arguments for irlb.

    :return:
        datas: `numpy.ndarray` or class:`anndata.AnnData`
            Corrected matrix/matrices or AnnData object/objects, depending on the input type and
            do_concatenate.

        mnn_list_: `list`
            A list containing MNN pairing information as DataFrames in each iteration step.

        angle_list_: `list`
            A list containing angles of each batch.

    """
    if len(datas) < 2:
        return datas
    n_batch = len(datas)
    if mnn_order is not None:
        if sorted(mnn_order) != list(range(n_batch)):
            raise ValueError(
                'The argument mnn_order should contain each value in '
                'range(n_batch) exactly once.')
    if isinstance(datas[0], AnnData):
        if var_index is not None:
            print('Inputs are AnnData objects, var_index ignored.')
        n_batch = len(datas)
        adata_vars = datas[0].var.index
        for i in range(1, n_batch):
            if (datas[i].var.index != adata_vars).any():
                raise ValueError(
                    'The AnnData objects have inconsistent var names.')
        if var_subset is not None and set(adata_vars) == set(var_subset):
            var_subset = None
        corrected = mnn_correct(*(adata.X for adata in datas),
                                var_index=adata_vars,
                                var_subset=var_subset,
                                k=k,
                                sigma=sigma,
                                cos_norm_in=cos_norm_in,
                                cos_norm_out=cos_norm_out,
                                svd_dim=svd_dim,
                                var_adj=var_adj,
                                compute_angle=compute_angle,
                                mnn_order=mnn_order,
                                svd_mode=svd_mode,
                                do_concatenate=do_concatenate,
                                **kwargs)
        print('Packing AnnData object...')
        if do_concatenate:
            adata = AnnData.concatenate(*datas,
                                        batch_key=batch_key,
                                        batch_categories=batch_categories,
                                        index_unique=index_unique)
            if save_raw:
                adata.raw = adata.copy()
            adata.X = corrected[0]
            print('Done.')
            return adata, corrected[1], corrected[2]
        else:
            for adata, new_matrix in zip(datas, corrected[0]):
                if save_raw:
                    adata.raw = adata.copy()
                adata.X = new_matrix
            print('Done.')
            return datas, corrected[1], corrected[2]
    # ------------------------------------------------------------
    if n_jobs is None:
        n_jobs = cpu_count()
    n_cols = datas[0].shape[1]
    if len(var_index) != n_cols:
        raise ValueError(
            'The number of vars is not equal to the length of var_index.')
    for i in range(1, n_batch):
        if datas[i].shape[1] != n_cols:
            raise ValueError(
                'The input matrices have inconsistent number of columns.')
    # ------------------------------------------------------------
    print('Performing cosine normalization...')
    in_batches, out_batches, var_subset, same_set = transform_input_data(
        datas, cos_norm_in, cos_norm_out, var_index, var_subset, n_jobs)
    if mnn_order is None:
        mnn_order = list(range(n_batch))
    ref = mnn_order[0]
    ref_batch_in = in_batches[ref]
    if not same_set:
        ref_batch_out = out_batches[ref]
    res_container = [out_batches[ref]]
    mnn_container = [0]
    angle_container = [0]
    original_batch = [ref] * ref_batch_in.shape[0]
    print('Starting MNN correct iteration. Reference batch: ' + str(ref))
    # ------------------------------------------------------------
    # loop through batches
    for step in range(1, n_batch):
        target = mnn_order[step]
        print('Step ' + str(step) + ' of ' + str(n_batch - 1) +
              ': processing batch ' + str(target))
        new_batch_in = in_batches[target]
        if not same_set:
            new_batch_out = out_batches[target]
        print('  Looking for MNNs...')
        mnn_ref, mnn_new = find_mutual_nn(data1=ref_batch_in,
                                          data2=new_batch_in,
                                          k1=k,
                                          k2=k,
                                          n_jobs=n_jobs)
        print('  Computing correction vectors...')
        correction_in = compute_correction(ref_batch_in, new_batch_in, mnn_ref,
                                           mnn_new, new_batch_in, sigma)
        if not same_set:
            correction_out = compute_correction(ref_batch_out, new_batch_out,
                                                mnn_ref, mnn_new, new_batch_in,
                                                sigma)
        if compute_angle:
            print('  Computing angle...')
            ref_centred = ref_batch_in - np.mean(ref_batch_in, axis=0)
            ref_basis = svd_internal(ref_centred.T,
                                     nu=2,
                                     svd_mode=svd_mode,
                                     **kwargs)
            find_subspace_job = partial(find_shared_subspace,
                                        mat1=ref_basis,
                                        mat2_vec=True)
            with Pool(n_jobs) as p_n:
                angle_out = p_n.map(find_subspace_job, correction_in)
            angle_container.append(angle_out)
        # ------------------------
        if svd_dim is not None and svd_dim != 0:
            print('  Removing components...')
            mnn_ref_u = np.unique(mnn_ref)
            mnn_new_u = np.unique(mnn_new)
            in_span_ref = get_bio_span(ref_batch_in[mnn_ref_u, :],
                                       ndim=svd_dim,
                                       svd_mode=svd_mode,
                                       **kwargs)
            in_span_new = get_bio_span(new_batch_in[mnn_new_u, :],
                                       ndim=svd_dim,
                                       svd_mode=svd_mode,
                                       **kwargs)
            correction_in = subtract_bio(in_span_ref,
                                         in_span_new,
                                         correction=correction_in)
            if not same_set:
                out_span_ref = get_bio_span(ref_batch_out[mnn_ref_u, :],
                                            ndim=svd_dim,
                                            svd_mode=svd_mode,
                                            var_subset=var_subset,
                                            **kwargs)
                out_span_new = get_bio_span(new_batch_out[mnn_new_u, :],
                                            ndim=svd_dim,
                                            svd_mode=svd_mode,
                                            var_subset=var_subset,
                                            **kwargs)
                correction_out = subtract_bio(out_span_ref,
                                              out_span_new,
                                              correction=correction_out,
                                              var_subset=var_subset)
        # ------------------------
        if var_adj:
            print('  Adjusting variance...')
            correction_in = adjust_shift_variance(ref_batch_in, new_batch_in,
                                                  correction_in, sigma, n_jobs)
            if not same_set:
                correction_out = adjust_shift_variance(ref_batch_out,
                                                       new_batch_out,
                                                       correction_out, sigma,
                                                       n_jobs, var_subset)
        # ------------------------
        print('  Applying correction...')
        new_batch_in = new_batch_in + correction_in
        ref_batch_in = np.concatenate((ref_batch_in, new_batch_in))
        if same_set:
            res_container.append(new_batch_in)
        else:
            new_batch_out = new_batch_out + correction_out
            ref_batch_out = np.concatenate((ref_batch_out, new_batch_out))
            res_container.append(new_batch_out)
        mnn_container.append(
            DataFrame({
                'new cell':
                mnn_new,
                'ref cell':
                mnn_ref,
                'original batch': [original_batch[mnn] for mnn in mnn_ref]
            }))
        original_batch += [target] * new_batch_in.shape[0]
    print('MNN correction complete. Gathering output...')
    reflow_order = [0] * n_batch
    for i in range(n_batch):
        reflow_order[mnn_order[i]] = i
    results_ = [np.array(res_container[i]) for i in reflow_order]
    mnn_list_ = [mnn_container[i] for i in reflow_order]
    angle_list_ = [angle_container[i]
                   for i in reflow_order] if compute_angle else None
    if do_concatenate:
        results_ = np.concatenate(tuple(results_))
    return results_, mnn_list_, angle_list_
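
A hedged call sketch matching the AnnData branch of the docstring (the two objects and the HVG list are placeholders):

# adata1 and adata2 must share the same var index; hvgs is optional
corrected, mnn_list, angle_list = mnn_correct(
    adata1, adata2,
    var_subset=hvgs,        # e.g. highly variable genes; None uses all vars
    batch_key='batch',
    k=20,
    do_concatenate=True,    # returns a single merged AnnData
)
# angle_list is None unless compute_angle=True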
Example #24
def label_transfer(adata: AnnData,
                   adata_ref: Optional[AnnData] = None,
                   obs: Optional[str] = None,
                   label_unk: Optional[str] = 'unknown',
                   use_best: Optional[bool] = False,
                   neighbors_key: Optional[str] = 'neighbors',
                   adjacency: Optional[sparse.spmatrix] = None,
                   directed: bool = False,
                   use_weights: bool = False,
                   pca_args: Optional[dict] = {},
                   use_rep: Optional[str] = None,
                   harmony_args: Optional[dict] = {},
                   copy: bool = False) -> Optional[AnnData]:
    """\
    Transfer annotation from one dataset to another using cell affinities.
    If two datasets are given, it uses harmony to perform
    integration and then computes the kNN graph. If no reference is given, it is assumed
    that `adata` already contains the proper kNN graph and that the
    labels to be reassigned have a specified value.
    
    Parameters
    ----------
    adata
        The AnnData object.
    adata_ref
        The optional reference dataset. If None, then all the needed information
        should be included in `adata` (i.e. the kNN graph and the labels)
    obs
        The label that needs to be transferred. Should be in `adata_ref.obs` or in
        `adata.obs` if no `adata_ref` is given
    label_unk
        The label for unassigned cells. If no `adata_ref` is given, this label 
        identifies cells to be assigned in `adata`. If `adata_ref` is given, this
        label will be given to all cells that cannot be assigned.
    use_best
        When assigning labels, some cells may not have enough evidence and are therefore
        left `unknown`. If this parameter is set to `True`, all cells will be assigned
        their best possible label, even if it may not be optimal
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, leiden looks .obsp['connectivities'] for connectivities
        (default storage place for pp.neighbors).
        If specified, leiden looks
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    pca_args
        Parameters to be passed to `sc.tl.pca` before harmony is issued
    use_rep
        If specified use this embedding and do not calculate a pca. Note that the
        embedding must be present in both datasets, with the same number of dimensions 
    harmony_args
    	Parameters to be passed to `sc.external.pp.harmony_integrate`
    copy:
        Return a new object or do everything in place
        

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with added labels 
    in adata.obs[f'{label_ref}']
        
"""
    adata = adata.copy() if copy else adata
    # avoid the mutable-default pitfall for the dict arguments
    pca_args = {} if pca_args is None else pca_args
    harmony_args = {} if harmony_args is None else harmony_args
    if adata_ref is not None:
        from scanpy.tools import pca
        from scanpy.preprocessing import neighbors
        from scanpy.external.pp import harmony_integrate

        # we have to create a merged dataset and integrate; before that,
        # check whether the label is already in the recipient and drop it

        if obs in adata.obs_keys():
            logg.warning(f'{obs} was found in dataset 1, it will be dropped')
            adata.obs.drop(obs, inplace=True, axis='columns')

        if obs not in adata_ref.obs_keys():
            raise ValueError(
                f'Annotation {obs} is not present in the reference dataset.')

        if use_rep:
            revert_to_pca = False
            if use_rep not in adata.obsm.keys():
                logg.warning(
                    f'{use_rep} was not found in dataset 1, reverting to PCA')
                revert_to_pca = True
            elif use_rep not in adata_ref.obsm.keys():
                logg.warning(
                    f'{use_rep} was not found in dataset 2, reverting to PCA')
                revert_to_pca = True
            elif adata.obsm[use_rep].shape[1] != adata_ref.obsm[use_rep].shape[1]:
                logg.warning(
                    f'{use_rep} has a different number of dimensions in the '
                    'two datasets, reverting to PCA')
                revert_to_pca = True
            if revert_to_pca:
                use_rep = None

        # now do the merge; cells coming from `adata` have no value for
        # `obs`, so an explicit "unknown" category is created and filled in
        adata_merge = adata.concatenate(adata_ref,
                                        batch_categories=['_unk', '_ref'],
                                        batch_key='_label_transfer')
        if adata_merge.obs[obs].dtype.name != 'category':
            adata_merge.obs[obs] = pd.Categorical(adata_merge.obs[obs])
        adata_merge.obs[obs] = adata_merge.obs[obs].cat.add_categories(
            label_unk).fillna(label_unk)

        # perform integration using harmony
        if not use_rep:
            pca(adata_merge, **pca_args)
            use_rep = 'X_pca'
        h_rep = f'{use_rep}_harmony'
        harmony_integrate(adata_merge,
                          key='_label_transfer',
                          basis=use_rep,
                          adjusted_basis=h_rep,
                          **harmony_args)
        # now calculate the kNN graph
        n_neighbors = int(np.sqrt(adata_merge.shape[0]) / 2)
        key_added = neighbors_key
        if key_added == 'neighbors':
            key_added = None
        neighbors(adata_merge,
                  use_rep=h_rep,
                  n_neighbors=n_neighbors,
                  key_added=key_added)
    else:
        adata_merge = adata
        if obs not in adata_merge.obs_keys():
            raise ValueError(f'Annotation {obs} is not present in dataset.')
        if label_unk not in adata_merge.obs[obs].cat.categories:
            raise ValueError(f'Label {label_unk} is not present in {obs}.')

    # calculate affinity

    calculate_affinity(adata_merge, group_by=obs, neighbors_key=neighbors_key)

    # now work on affinity, rank it to get the new labels
    categories = adata_merge.obs[obs].cat.categories
    affinity = pd.DataFrame(adata_merge.obsm[f'CA_{obs}'],
                            index=adata_merge.obs_names,
                            columns=categories)
    # if use_best, remove the unknown label from the matrix so it
    # does not get scored
    if use_best:
        affinity.drop(label_unk, axis='columns', inplace=True)

    rank_affinity = affinity.rank(axis=1, ascending=False)
    adata_merge.obs[f'_{obs}_tmp'] = adata_merge.obs[obs].values
    for c in rank_affinity.columns:
        # assign each cell its top-ranked label; with the default rank
        # method, a tie yields no rank equal to 1 and leaves the cell as is
        cells = rank_affinity[rank_affinity[c] == 1].index
        adata_merge.obs.loc[cells, f'_{obs}_tmp'] = c

    # do actual transfer to dataset 1
    # here we assume that concatenation does not change the order of cells
    # only cell names

    labels = adata_merge.obs[f'_{obs}_tmp'].cat.categories
    if adata_ref is not None:
        # transfer has been done between two files
        adata.obs[obs] = adata_merge.obs.query(
            '_label_transfer == "_unk"')[f'_{obs}_tmp'].values
    else:
        # transfer is within dataset
        adata_merge.obs[obs] = adata_merge.obs[f'_{obs}_tmp'].values
        adata_merge.obs.drop(f'_{obs}_tmp', axis='columns', inplace=True)
        adata = adata_merge

    # ensure that it is categorical with proper order
    adata.obs[obs] = pd.Categorical(adata.obs[obs], categories=labels)

    # transfer colors if any
    if adata_ref is not None and f'{obs}_colors' in adata_ref.uns:
        colors = list(adata_ref.uns[f'{obs}_colors'])
        if not use_best:
            # add a placeholder color for the unknown label
            colors.append('#aabbcc')
        adata.uns[f'{obs}_colors'] = colors

    # remove unused categories if "use_best" hence no "unknown"
    if use_best:
        adata.obs[obs].cat.remove_unused_categories()

    return adata if copy else None
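A minimal usage sketch for label_transfer, assuming the function (and its calculate_affinity helper) is importable from wherever it is defined; the datasets and the 'louvain' column come from scanpy's bundled PBMC data and are only illustrative:

import scanpy as sc

adata_ref = sc.datasets.pbmc3k_processed()   # carries obs['louvain']
adata_new = sc.datasets.pbmc3k()             # no labels yet
adata_new.var_names_make_unique()

# integrate with harmony, build a kNN graph on the joint embedding, and
# transfer the 'louvain' annotation onto the query cells in place
label_transfer(adata_new, adata_ref=adata_ref, obs='louvain')
print(adata_new.obs['louvain'].value_counts())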
Exemple #25
0
def test_concatenate_dense():
    # dense data
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    adata1 = AnnData(
        X1,
        dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
        dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2, X_3=X3),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]),
        dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2, X_3=X3),
        layers={"Xs": X2},
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]),
        dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]),
        obsm=dict(X_1=X1, X_2=X2),
        layers=dict(Xs=X3),
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.X.astype(int).tolist() == X_combined
    assert adata.layers["Xs"].astype(int).tolist() == X_combined
    assert adata.obs_keys() == ["anno1", "anno2", "batch"]
    assert adata.var_keys() == ["annoA-0", "annoA-1", "annoB-2"]
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    assert adata.obsm_keys() == ["X_1", "X_2"]
    assert adata.obsm["X_1"].tolist() == np.concatenate([X1, X1, X1]).tolist()

    # with batch_key and batch_categories
    adata = adata1.concatenate(adata2, adata3, batch_key="batch1")
    assert adata.obs_keys() == ["anno1", "anno2", "batch1"]
    adata = adata1.concatenate(adata2,
                               adata3,
                               batch_categories=["a1", "a2", "a3"])
    assert adata.obs["batch"].cat.categories.tolist() == ["a1", "a2", "a3"]
    assert adata.var_names.tolist() == ["b", "c"]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join="outer")

    X_ref = np.array([
        [1.0, 2.0, 3.0, np.nan],
        [4.0, 5.0, 6.0, np.nan],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
    ])
    np.testing.assert_equal(adata.X, X_ref)
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(
        np.array([
            [0.0, np.nan, np.nan],
            [1.0, 2.0, 2.0],
            [2.0, 1.0, 1.0],
            [np.nan, 0.0, 0.0],
        ]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())
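These assertions rely on a detail of `concatenate` in these anndata versions: `var` columns whose values disagree across batches are kept per batch and suffixed with the batch index ('annoA-0', 'annoA-1', ...). A minimal check of that behaviour, following the constructor convention used in these tests (expected output is an assumption based on the assertions above):

import numpy as np
from anndata import AnnData

a = AnnData(np.ones((1, 2)), var=dict(var_names=['g1', 'g2'], annoA=[0, 1]))
b = AnnData(np.ones((1, 2)), var=dict(var_names=['g1', 'g2'], annoA=[2, 3]))

merged = a.concatenate(b)
print(merged.var_keys())   # expected: ['annoA-0', 'annoA-1']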
Exemple #26
0
def test_concatenate_dense():
    # dense data
    X1 = np.array([[1, 2, 3], [4, 5, 6]])
    X2 = np.array([[1, 2, 3], [4, 5, 6]])
    X3 = np.array([[1, 2, 3], [4, 5, 6]])

    adata1 = AnnData(
        X1,
        dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']),
        dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]),
        layers=dict(Xs=X1),
    )
    adata2 = AnnData(
        X2,
        dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']),
        dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]),
        layers={'Xs': X2},
    )
    adata3 = AnnData(
        X3,
        dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']),
        dict(var_names=['d', 'c', 'b'], annoB=[0, 1, 2]),
        layers=dict(Xs=X3),
    )

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.X.astype(int).tolist() == X_combined
    assert adata.layers['Xs'].astype(int).tolist() == X_combined
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
    assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
    adata = adata1.concatenate(adata2,
                               adata3,
                               batch_categories=['a1', 'a2', 'a3'])
    assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert adata.var_names.tolist() == ['b', 'c']

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    from numpy import ma

    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(
        np.array([
            [1.0, 2.0, 3.0, np.nan],
            [4.0, 5.0, 6.0, np.nan],
            [np.nan, 3.0, 2.0, 1.0],
            [np.nan, 6.0, 5.0, 4.0],
            [np.nan, 3.0, 2.0, 1.0],
            [np.nan, 6.0, 5.0, 4.0],
        ]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(
        np.array([
            [0.0, np.nan, np.nan],
            [1.0, 2.0, 2.0],
            [2.0, 1.0, 1.0],
            [np.nan, 0.0, 0.0],
        ]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())
Exemple #27
0
def test_concatenate():
    # dense data
    adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno1': ['c1', 'c2']},
                     {'var_names': ['a', 'b', 'c'],
                      'annoA': [0, 1, 2]})
    adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s3', 's4'],
                      'anno1': ['c3', 'c4']},
                     {'var_names': ['d', 'c', 'b'],
                      'annoA': [0, 1, 2]})
    adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno2': ['d3', 'd4']},
                     {'var_names': ['d', 'c', 'b'],
                      'annoB': [0, 1, 2]})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
    assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
    adata = adata1.concatenate(adata2, adata3, batch_categories=['a1', 'a2', 'a3'])
    assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert adata.var_names.tolist() == ['b', 'c']

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    from numpy import ma
    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(np.array([
        [1.0, 2.0, 3.0, np.nan],
        [4.0, 5.0, 6.0, np.nan],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0]]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(np.array(
        [[0.0, np.nan, np.nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0], [np.nan, 0.0, 0.0]]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())

    # sparse data
    from scipy.sparse import csr_matrix
    adata1 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno1': ['c1', 'c2']},
                     {'var_names': ['a', 'b', 'c']})
    adata2 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
                     {'obs_names': ['s3', 's4'],
                      'anno1': ['c3', 'c4']},
                     {'var_names': ['d', 'c', 'b']})
    adata3 = AnnData(csr_matrix([[1, 2, 0], [0, 5, 6]]),
                     {'obs_names': ['s5', 's6'],
                      'anno2': ['d3', 'd4']},
                     {'var_names': ['d', 'c', 'b']})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.toarray().astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    assert adata.X.toarray().tolist() == [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0]]
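One detail worth calling out in this test: on an outer join, a dense X is padded with NaN while a sparse X is padded with implicit zeros. A minimal side-by-side sketch, following the constructor convention used in these tests (the expected outputs in the comments are inferred from the assertions above):

import numpy as np
from scipy.sparse import csr_matrix
from anndata import AnnData

d1 = AnnData(np.array([[1.0, 2.0]]), var=dict(var_names=['a', 'b']))
d2 = AnnData(np.array([[3.0, 4.0]]), var=dict(var_names=['b', 'c']))
s1 = AnnData(csr_matrix([[1.0, 2.0]]), var=dict(var_names=['a', 'b']))
s2 = AnnData(csr_matrix([[3.0, 4.0]]), var=dict(var_names=['b', 'c']))

print(d1.concatenate(d2, join='outer').X)
# expected: [[ 1.  2. nan], [nan  3.  4.]]
print(s1.concatenate(s2, join='outer').X.toarray())
# expected: [[1. 2. 0.], [0. 3. 4.]]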
Exemple #28
0
    def from_scvi_model(
        cls,
        scvi_model: SCVI,
        adata: Optional[AnnData] = None,
        restrict_to_batch: Optional[str] = None,
        doublet_ratio: int = 2,
        **classifier_kwargs,
    ):
        """
        Instantiate a SOLO model from an scvi model.

        Parameters
        ----------
        scvi_model
            Pre-trained model of :class:`~scvi.model.SCVI`. The
            adata object used to initialize this model should have only
            been setup with count data, and optionally a `batch_key`;
            i.e., no extra covariates or labels, etc.
        adata
            Optional anndata to use that is compatible with scvi_model.
        restrict_to_batch
            Batch category in `batch_key` used to setup adata for scvi_model
            to restrict the Solo model to. This allows training a Solo model
            on one batch of an scvi_model that was trained on multiple
            batches.
        doublet_ratio
            Ratio of doublets to generate relative to the number of cells in
            adata (or the length of indices, if not `None`).
        **classifier_kwargs
            Keyword args for :class:`~scvi.module.Classifier`

        Returns
        -------
        SOLO model
        """
        _validate_scvi_model(scvi_model, restrict_to_batch=restrict_to_batch)
        orig_adata_manager = scvi_model.adata_manager
        orig_batch_key = orig_adata_manager.get_state_registry(
            REGISTRY_KEYS.BATCH_KEY).original_key

        if adata is not None:
            adata_manager = orig_adata_manager.transfer_setup(adata)
            cls.register_manager(adata_manager)
        else:
            adata_manager = orig_adata_manager
        adata = adata_manager.adata

        if restrict_to_batch is not None:
            batch_mask = adata.obs[orig_batch_key] == restrict_to_batch
            if np.sum(batch_mask) == 0:
                raise ValueError(
                    "Batch category given to restrict_to_batch not found.\n" +
                    "Available categories: {}".format(
                        adata.obs[orig_batch_key].astype(
                            "category").cat.categories))
            # indices in adata with restrict_to_batch category
            batch_indices = np.where(batch_mask)[0]
        else:
            # use all indices
            batch_indices = None

        # anndata with only generated doublets
        doublet_adata = cls.create_doublets(adata_manager,
                                            indices=batch_indices,
                                            doublet_ratio=doublet_ratio)
        # if scvi wasn't trained with batch correction, the zeros here
        # do nothing
        doublet_adata.obs[orig_batch_key] = (
            restrict_to_batch if restrict_to_batch is not None else 0)

        # if the model uses the observed library size, the library "sample"
        # is just the observed value on log scale, so request a sample
        # rather than the posterior mean
        give_mean_lib = not scvi_model.module.use_observed_lib_size

        # get latent representations and make input anndata
        latent_rep = scvi_model.get_latent_representation(
            adata, indices=batch_indices)
        lib_size = scvi_model.get_latent_library_size(adata,
                                                      indices=batch_indices,
                                                      give_mean=give_mean_lib)
        latent_adata = AnnData(
            np.concatenate([latent_rep, np.log(lib_size)], axis=1))
        latent_adata.obs[LABELS_KEY] = "singlet"
        orig_obs_names = adata.obs_names
        latent_adata.obs_names = (orig_obs_names[batch_indices]
                                  if batch_indices is not None else
                                  orig_obs_names)

        logger.info("Creating doublets, preparing SOLO model.")
        f = io.StringIO()
        with redirect_stdout(f):
            scvi_model.setup_anndata(doublet_adata, batch_key=orig_batch_key)
            doublet_latent_rep = scvi_model.get_latent_representation(
                doublet_adata)
            doublet_lib_size = scvi_model.get_latent_library_size(
                doublet_adata, give_mean=give_mean_lib)
            doublet_adata = AnnData(
                np.concatenate([doublet_latent_rep,
                                np.log(doublet_lib_size)],
                               axis=1))
            doublet_adata.obs[LABELS_KEY] = "doublet"

            full_adata = latent_adata.concatenate(doublet_adata)
            cls.setup_anndata(full_adata, labels_key=LABELS_KEY)
        return cls(full_adata, **classifier_kwargs)
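A short usage sketch for this classmethod, assuming a recent scvi-tools installation; the toy dataset and the tiny epoch counts are only for illustration:

import scvi
from scvi.external import SOLO

adata = scvi.data.synthetic_iid()                 # toy raw-count AnnData
scvi.model.SCVI.setup_anndata(adata, batch_key="batch")
vae = scvi.model.SCVI(adata)
vae.train(max_epochs=5)

solo = SOLO.from_scvi_model(vae, restrict_to_batch="batch_0")
solo.train(max_epochs=5)
predictions = solo.predict()   # per-cell singlet/doublet probabilities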
Exemple #29
0
    def fit(
            self,
            source_data,  #adata
            target_data,  #adata
            batch_size=256,
            maxiter=1000,
            pretrain_epochs=300,
            epochs_fit=5,
            tol=[0.001],
            alpha=[1.0],
            resolution=[0.2, 0.4, 0.8, 1.2, 1.6],
            n_neighbors=20,
            softmax=False,
            init="glorot_uniform",
            save_atr="isy_trans_True"):
        '''
        Fit the transfer-learning model on the provided data.
        This function includes the preprocessing steps.
        Input: source_data (AnnData), target_data (AnnData).
        Source and target data can be in any form (UMI, TPM or FPKM).
        Return: None.
        '''
        self.batch_size = batch_size
        self.maxiter = maxiter
        self.pretrain_epochs = pretrain_epochs
        self.epochs_fit = epochs_fit
        self.tol = tol
        self.alpha = alpha
        self.source_data = source_data
        self.target_data = target_data
        self.resolution = resolution
        self.n_neighbors = n_neighbors
        self.softmax = softmax
        self.init = init
        self.save_atr = save_atr
        dictionary = {"alpha": alpha, "tol": tol, "resolution": resolution}
        df_expand = expand_grid(dictionary)
        # begin processing
        adata_tmp = []
        source_data.var_names_make_unique(join="-")
        source_data.obs_names_make_unique(join="-")

        # pre-processing
        # 1. pre-filter cells
        prefilter_cells(source_data, min_genes=100)
        # 2. pre-filter genes (avoid genes that are zero in all cells)
        prefilter_genes(source_data, min_cells=10)
        # 3. pre-filter special genes: MT and ERCC
        prefilter_specialgenes(source_data)
        # 4. normalization, log1p, scaling
        sc.pp.normalize_per_cell(source_data)
        sc.pp.log1p(source_data)
        sc.pp.scale(source_data, zero_center=True, max_value=6)
        # upper-case gene names to avoid case mismatches between datasets
        source_data.var_names = [i.upper() for i in source_data.var_names]
        adata_tmp.append(source_data)

        # Target data
        target_data.var_names_make_unique(join="-")
        target_data.obs_names_make_unique(join="-")
        # pre-processing
        # 1. pre-filter cells
        prefilter_cells(target_data, min_genes=100)
        # 2. pre-filter genes (avoid genes that are zero in all cells)
        prefilter_genes(target_data, min_cells=10)
        # 3. pre-filter special genes: MT and ERCC
        prefilter_specialgenes(target_data)
        # 4. normalization (log1p and scaling happen after gene selection)
        sc.pp.normalize_per_cell(target_data)

        # select top genes
        if target_data.X.shape[0] <= 1500:
            ng = 500
        elif 1500 < target_data.X.shape[0] <= 3000:
            ng = 1000
        else:
            ng = 2000

        sc.pp.filter_genes_dispersion(target_data, n_top_genes=ng)
        sc.pp.log1p(target_data)
        sc.pp.scale(target_data, zero_center=True, max_value=6)
        # upper-case gene names to avoid case mismatches between datasets
        target_data.var_names = [i.upper() for i in target_data.var_names]
        adata_tmp.append(target_data)

        # concatenate source and target with an inner join on genes
        full_adata = AnnData.concatenate(*adata_tmp,
                                         join='inner',
                                         batch_key="dataset_batch",
                                         batch_categories=["source", "target"])
        del adata_tmp
        del target_data
        del source_data
        ref_id = full_adata.obs["dataset_batch"] == "source"
        adata_test = full_adata[~ref_id, :].copy()
        adata_train = full_adata[ref_id, :].copy()
        if issparse(adata_train.X):
            x_train = adata_train.X.toarray()
        else:
            x_train = adata_train.X

        y_train = pd.Series(adata_train.obs["celltype"], dtype="category")
        y_train = y_train.cat.rename_categories(
            range(len(y_train.cat.categories)))
        print("The number of training celltypes is: ", len(set(y_train)))

        if issparse(adata_test.X):
            x_test = adata_test.X.toarray()
        else:
            x_test = adata_test.X

        # train the source (DEC) network
        print("Training the source network")
        dims = getdims(x_train.shape)
        #dims=[x_train.shape[1],128,64]
        print("The layer sizes are " + str(dims[1:]))
        print(":".join([
            "The shape of xtrain is",
            str(x_train.shape[0]),
            str(x_train.shape[1])
        ]))
        print(":".join([
            "The shape of xtest is",
            str(x_test.shape[0]),
            str(x_test.shape[1])
        ]))
        assert x_train.shape[1] == x_test.shape[1]
        dec = DEC(dims=dims,
                  y=y_train,
                  x=x_train,
                  alpha=alpha,
                  init=self.init,
                  pretrain_epochs=self.pretrain_epochs,
                  actinlayer1="tanh",
                  softmax=softmax)
        dec.compile(optimizer=SGD(lr=0.01, momentum=0.9))
        #print("dec.init_centroid",type(dec.init_centroid),dec.init_centroid)
        Embeded_z, q_pred = dec.fit_supervise(
            x=x_train, y=y_train, epochs=2e3,
            batch_size=self.batch_size)  # fine-tuning

        #---------------------------------------------------------------------------------------------------
        weights = [i0.get_weights() for i0 in dec.model.layers]
        features = dec.encoder.predict(x_test)
        q = dec.model.predict(x_test, verbose=0)

        #np.savetxt("testq.txt",q)
        print("Training model finished! Start to fit target network!")
        val_y_pre = dec.model.predict(x_train, verbose=0)
        val_y_pre = [np.argmax(i) for i in val_y_pre]
        val_ari = metrics.adjusted_rand_score(val_y_pre, y_train.tolist())
        t0 = time()
        dec2 = DEC(dims=dims,
                   x=x_test,
                   alpha=alpha,
                   init=self.init,
                   pretrain_epochs=self.pretrain_epochs,
                   actinlayer1="tanh",
                   softmax=softmax,
                   transfer_feature=features,
                   model_weights=weights,
                   y_trans=q.argmax(axis=1))
        dec2.compile(optimizer=SGD(0.01, 0.9))
        trajectory_z, trajectory_l, Embeded_z, q_pred = dec2.fit_trajectory(
            x=x_test,
            tol=tol,
            epochs_fit=self.epochs_fit,
            batch_size=self.batch_size)  # fine-tuning
        print("How many trajectories ", len(trajectory_z))
        for i in range(len(trajectory_z)):
            adata_test.obsm["trajectory_Embeded_z_" + str(i)] = trajectory_z[i]
            adata_test.obs["trajectory_" + str(i)] = trajectory_l[i]

        #labels=change_to_continuous(q_pred)
        y_pred = np.asarray(np.argmax(q_pred, axis=1), dtype=int)
        labels = y_pred.astype('U')
        labels = pd.Categorical(values=labels,
                                categories=natsorted(
                                    np.unique(y_pred).astype('U')))

        adata_test.obsm["X_Embeded_z" + str(self.save_atr)] = Embeded_z
        adata_test.obs["dec" + str(self.save_atr)] = labels
        adata_test.obs["maxprob" + str(self.save_atr)] = q_pred.max(1)
        adata_test.obsm["prob_matrix" + str(self.save_atr)] = q_pred
        adata_test.obsm["X_pcaZ" + str(self.save_atr)] = sc.tl.pca(Embeded_z)

        self.adata_train = adata_train
        self.adata_test = adata_test
        self.dec2 = dec2
        self.labels = labels
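A hedged usage sketch for this fit method; the enclosing class is not shown in the snippet, so `TransferModel` below is a hypothetical stand-in, and the file names are placeholders:

import scanpy as sc

source = sc.read_h5ad("source.h5ad")   # must carry obs["celltype"]
target = sc.read_h5ad("target.h5ad")

model = TransferModel()                # hypothetical constructor
model.fit(source, target, maxiter=1000, pretrain_epochs=300)

# transferred labels land on the target slice; with the default save_atr
# the column name is "dec" + "isy_trans_True"
labels = model.adata_test.obs["decisy_trans_True"]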
Exemple #30
0
    [z_mean, z_log_var, z] = net.encoder.predict(adata.X)
    redadata1 = AnnData(X=z_mean, obs=adata.obs)
    path1 = path + "reference_"
    prep.nn_embedding(redadata1, path=path1)
    prep.plotEmbedding(redadata1, path=path1, ncol=10)
    prep.plotEmbedding(redadata1,
                       path=path1,
                       color_col="cell_type_code",
                       ncol=10)

    adata, n_batches, input_size = load_data(filename, batch_size, ref=False)
    [z_mean, z_log_var, z] = net.encoder.predict(adata.X)
    redadata2 = AnnData(X=z_mean, obs=adata.obs)
    path2 = path + "test_"
    prep.nn_embedding(redadata2, path=path2)
    prep.plotEmbedding(redadata2, path=path2, ncol=10)
    prep.plotEmbedding(redadata2,
                       path=path2,
                       color_col="cell_type_code",
                       ncol=10)

    redadata3 = redadata1.concatenate(redadata2)
    path3 = path + "combined_"
    prep.nn_embedding(redadata3, path=path3)
    prep.plotEmbedding(redadata3, path=path3, ncol=10)
    prep.plotEmbedding(redadata3,
                       path=path3,
                       color_col="cell_type_code",
                       ncol=10)
    prep.plotEmbedding(redadata3, path=path3, color_col="batch", ncol=10)
Exemple #31
0
def test_concatenate():
    # dense data
    adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), {
        'obs_names': ['s1', 's2'],
        'anno1': ['c1', 'c2']
    }, {
        'var_names': ['a', 'b', 'c'],
        'annoA': [0, 1, 2]
    })
    adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), {
        'obs_names': ['s3', 's4'],
        'anno1': ['c3', 'c4']
    }, {
        'var_names': ['d', 'c', 'b'],
        'annoA': [0, 1, 2]
    })
    adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), {
        'obs_names': ['s1', 's2'],
        'anno2': ['d3', 'd4']
    }, {
        'var_names': ['d', 'c', 'b'],
        'annoB': [0, 1, 2]
    })

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5],
                                            [3, 2], [6, 5]]
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
    assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
    adata = adata1.concatenate(adata2,
                               adata3,
                               batch_categories=['a1', 'a2', 'a3'])
    assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert adata.var_names.tolist() == ['b', 'c']

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    from numpy import ma
    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(
        np.array([[1.0, 2.0, 3.0, np.nan], [4.0, 5.0, 6.0, np.nan],
                  [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0],
                  [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0]]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(
        np.array([[0.0, np.nan, np.nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0],
                  [np.nan, 0.0, 0.0]]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())

    # sparse data
    from scipy.sparse import csr_matrix
    adata1 = adata_sparse
    adata2 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]), {
        'obs_names': ['s3', 's4'],
        'anno1': ['c3', 'c4']
    }, {'var_names': ['d', 'c', 'b']})
    adata3 = AnnData(csr_matrix([[1, 2, 0], [0, 5, 6]]), {
        'obs_names': ['s5', 's6'],
        'anno2': ['d3', 'd4']
    }, {'var_names': ['d', 'c', 'b']})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.toarray().astype(int).tolist() == [[2, 3], [5, 6], [3, 2],
                                                      [6, 5], [0, 2], [6, 5]]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    assert adata.X.toarray().tolist() == [[0.0, 2.0, 3.0, 0.0],
                                          [0.0, 5.0, 6.0, 0.0],
                                          [0.0, 3.0, 2.0, 0.0],
                                          [0.0, 6.0, 5.0, 0.0],
                                          [0.0, 0.0, 2.0, 1.0],
                                          [0.0, 6.0, 5.0, 0.0]]