Example #1
0
def test_mantonbm():
    """Compare Harmony embeddings on the MantonBM dataset.

    The harmony-pytorch and harmonypy embeddings are loaded from ``./result``
    when cached there; otherwise each is computed from the ``X_pca`` matrix of
    the original data, timed, and saved. A third embedding is always read from
    ``./result/MantonBM_harmony_z.txt``. All three are passed to
    ``check_metric`` (once per norm) and, when ``./plots`` holds fewer than
    four matching PDF files, to ``plot_umap``.
    """
    print("Testing on MantonBM dataset...")

    result_file = "./result/MantonBM_result.h5ad"
    torch_file = "./result/MantonBM_torch_z.npy"
    py_file = "./result/MantonBM_py_z.npy"

    # The source data is read unless three embedding files and the result
    # file are all already present.
    cached_embeddings = [
        name for name in os.listdir("./result")
        if re.match("MantonBM.*_z.(txt|npy)", name)
    ]
    if not (len(cached_embeddings) >= 3 and os.path.exists(result_file)):
        adata = pg.read_input("./data/MantonBM/original_data.h5ad")
        # 'Individual' is the last character of the part of 'Channel' that
        # precedes the first underscore.
        individuals = adata.obs['Channel'].apply(
            lambda channel: channel.split('_')[0][-1])
        adata.obs['Individual'] = pd.Categorical(individuals)

    # harmony-pytorch embedding: compute and cache it, or reuse the cache.
    if not os.path.exists(torch_file):
        torch_began = time.time()
        Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key='Channel')
        torch_elapsed = time.time() - torch_began

        print("Time spent for harmony-pytorch = {:.2f}s.".format(torch_elapsed))
        np.save(torch_file, Z_torch)
    else:
        Z_torch = np.load(torch_file)
        print("Precalculated embedding by harmony-pytorch is loaded.")

    # harmonypy embedding: compute and cache it, or reuse the cache.
    if not os.path.exists(py_file):
        py_began = time.time()
        ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
        py_elapsed = time.time() - py_began

        print("Time spent for harmonypy = {:.2f}s.".format(py_elapsed))

        # The transpose of Z_corr is what gets stored and compared.
        Z_py = np.transpose(ho.Z_corr)
        np.save(py_file, Z_py)
    else:
        Z_py = np.load(py_file)
        print("Precalculated embedding by harmonypy is loaded.")

    Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt")

    for norm_kind in ('r', 'L2'):
        check_metric(Z_torch, Z_py, Z_R, prefix="MantonBM", norm=norm_kind)

    # When the result file already exists, plot_umap is handed None
    # instead of the loaded data.
    if os.path.exists(result_file):
        adata = None

    existing_plots = [
        name for name in os.listdir("./plots")
        if re.match("MantonBM.*.pdf", name)
    ]
    if len(existing_plots) >= 4:
        return
    plot_umap(adata, Z_torch, Z_py, Z_R, prefix="MantonBM", batch_key="Individual")
Example #2
0
def test_pbmc():
    """Compare Harmony embeddings on the 10x pbmc dataset.

    The harmony-pytorch and harmonypy embeddings are loaded from ``./result``
    when cached there; otherwise each is computed from the ``X_pca`` matrix of
    the original data, timed, and saved. A third embedding is always read from
    ``./result/pbmc_harmony_z.txt``. All three are passed to ``check_metric``
    (once per norm) and, when ``./plots`` holds fewer than four matching PDF
    files, to ``plot_umap``.
    """
    print("Testing on 10x pbmc dataset...")

    result_file = "./result/pbmc_result.h5ad"
    torch_file = "./result/pbmc_torch_z.npy"
    py_file = "./result/pbmc_py_z.npy"

    # The source data is read unless three embedding files and the result
    # file are all already present.
    cached_embeddings = [
        name for name in os.listdir("./result")
        if re.match("pbmc.*_z.(txt|npy)", name)
    ]
    if not (len(cached_embeddings) >= 3 and os.path.exists(result_file)):
        adata = pg.read_input("./data/10x_pbmc/original_data.h5ad")

    # harmony-pytorch embedding: compute and cache it, or reuse the cache.
    if not os.path.exists(torch_file):
        torch_began = time.time()
        Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key='Channel')
        torch_elapsed = time.time() - torch_began

        print("Time spent for harmony-pytorch = {:.2f}s.".format(torch_elapsed))
        np.save(torch_file, Z_torch)
    else:
        Z_torch = np.load(torch_file)
        print("Precalculated embedding by harmony-pytorch is loaded.")

    # harmonypy embedding: compute and cache it, or reuse the cache.
    if not os.path.exists(py_file):
        py_began = time.time()
        ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
        py_elapsed = time.time() - py_began

        print(ho.objective_harmony)
        print("Time spent for harmonypy = {:.2f}s.".format(py_elapsed))

        # The transpose of Z_corr is what gets stored and compared.
        Z_py = np.transpose(ho.Z_corr)
        np.save(py_file, Z_py)
    else:
        Z_py = np.load(py_file)
        print("Precalculated embedding by harmonypy is loaded.")

    Z_R = np.loadtxt("./result/pbmc_harmony_z.txt")

    for norm_kind in ('r', 'L2'):
        check_metric(Z_torch, Z_py, Z_R, prefix="pbmc", norm=norm_kind)

    # When the result file already exists, plot_umap is handed None
    # instead of the loaded data.
    if os.path.exists(result_file):
        adata = None

    existing_plots = [
        name for name in os.listdir("./plots")
        if re.match("pbmc.*.pdf", name)
    ]
    if len(existing_plots) >= 4:
        return
    plot_umap(adata, Z_torch, Z_py, Z_R, prefix="pbmc", batch_key="Channel")
Example #3
0
    def run(self, **kwargs):
        """
        Run the harmony algorithm (see https://github.com/slowkow/harmonypy for details) on the
        columns of ``self.data`` selected by ``self.features``, cast to float. The object returned
        by harmonypy is stored in the 'harmony' attribute.

        Parameters
        ----------
        kwargs:
            Additional keyword arguments passed to harmonypy.run_harmony

        Returns
        -------
        None
        """
        feature_matrix = self.data[self.features].astype(float).values
        # "sample_id" is the column of self.meta handed to harmonypy as vars_use.
        self.harmony = harmonypy.run_harmony(data_mat=feature_matrix,
                                             meta_data=self.meta,
                                             vars_use="sample_id",
                                             **kwargs)
Example #4
0
# 296  t293_TAGAATTGTTGGTG    t293   3097      0.021769      t293
# 297  t293_CGGATAACACCACA    t293   3157      0.020411      t293
# 298  t293_GGTACTGAGTCGAT    t293   2685      0.027846      t293
# 299  t293_ACGCTGCTTCTTAC    t293   3513      0.021240      t293

# [300 rows x 5 columns]

# data_mat[:5,:5]
#
# array([[ 0.0071695 , -0.00552724, -0.0036281 , -0.00798025,  0.00028931],
#        [-0.011333  ,  0.00022233, -0.00073589, -0.00192452,  0.0032624 ],
#        [ 0.0091214 , -0.00940727, -0.00106816, -0.0042749 , -0.00029096],
#        [ 0.00866286, -0.00514987, -0.0008989 , -0.00821785, -0.00126997],
#        [-0.00953977,  0.00222714, -0.00374373, -0.00028554,  0.00063737]])

# Run Harmony with the inputs prepared above.
ho = hm.run_harmony(data_mat, meta_data, vars_use)

# Write the adjusted PCs to a new file.
# NOTE(review): Z_corr is written as-is (not transposed) -- confirm that this
# is the orientation the consumer of data/adj.tsv.gz expects.
res = pd.DataFrame(ho.Z_corr)
# Name the columns X1, X2, ... (1-based).
res.columns = ['X{}'.format(i + 1) for i in range(res.shape[1])]
# Tab-separated, without the index column.
res.to_csv("data/adj.tsv.gz", sep="\t", index=False)

# Test 2
########################################################################

import pandas as pd
import numpy as np
from scipy.cluster.vq import kmeans
from scipy.stats.stats import pearsonr
import harmonypy as hm
Example #5
0
def test_cell_lines():
    """Compare Harmony embeddings on the cell lines dataset.

    The harmony-pytorch and harmonypy embeddings are loaded from ``./result``
    when cached there; otherwise each is computed from the PCA matrix in
    ``./data/cell_lines/pca.txt``, timed, and saved. A third embedding is
    always read from ``./result/cell_lines_harmony_z.txt``. All three are
    passed to ``check_metric`` (once per norm) and, when ``./plots`` holds
    fewer than four matching PDF files, to ``plot_umap``.

    Finally, if ``./result/cell_lines_result.h5ad`` exists at that point, it is
    read back and kBET statistics are printed for its 'harmony', 'py' and
    'torch' representations.
    """
    print("Testing on cell lines dataset...")

    # The source data is read unless three embedding files and the result
    # file are all already present.
    z_files = [f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")

    # harmony-pytorch embedding: reuse the cache, or compute and cache it.
    if os.path.exists("./result/cell_lines_torch_z.npy"):
        Z_torch = np.load("./result/cell_lines_torch_z.npy")
        print("Precalculated embedding by harmony-pytorch is loaded.")
    else:
        start_torch = time.time()
        Z_torch = harmonize(X, df_metadata, batch_key = 'dataset')
        end_torch = time.time()

        print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
        np.save("./result/cell_lines_torch_z.npy", Z_torch)

    # harmonypy embedding: reuse the cache, or compute and cache it.
    if os.path.exists("./result/cell_lines_py_z.npy"):
        Z_py = np.load("./result/cell_lines_py_z.npy")
        print("Precalculated embedding by harmonypy is loaded.")
    else:
        start_py = time.time()
        ho = run_harmony(X, df_metadata, ['dataset'])
        end_py = time.time()

        print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))
        print(ho.objective_harmony)

        # The transpose of Z_corr is what gets stored and compared.
        Z_py = np.transpose(ho.Z_corr)
        np.save("./result/cell_lines_py_z.npy", Z_py)

    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'r')
    check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'L2')

    if os.path.exists("./result/cell_lines_result.h5ad"):
        # plot_umap is handed None when the result file already exists.
        adata = None
    else:
        # Wrap the PCA matrix in an AnnData object: X is an empty sparse
        # (n_obs, 2) placeholder and the PCA matrix goes into obsm['X_pca'].
        n_obs = X.shape[0]
        adata = AnnData(X = csr_matrix((n_obs, 2)), obs = df_metadata)
        adata.obsm['X_pca'] = X

        pg.neighbors(adata, rep = 'pca')
        pg.umap(adata)

    umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "cell_lines", batch_key = "dataset")

    # Deliberately probed again instead of reusing the earlier answer: the
    # file may have appeared since the previous check (presumably written by
    # plot_umap -- confirm).
    if os.path.exists("./result/cell_lines_result.h5ad"):
        adata = pg.read_input("./result/cell_lines_result.h5ad", h5ad_mode = 'r')

        # One kBET report per representation, in the original order.
        kbet_reports = (
            ('harmony', "kBET for Harmony: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}"),
            ('py', "kBET for harmonypy: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}"),
            ('torch', "kBET for harmony-pytorch: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}"),
        )
        for rep, report in kbet_reports:
            stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = rep)
            print(report.format(stat = stat, pval = pvalue, ac_rate = ac_rate))
Example #6
0
import os
#import sys
# Work from the directory that contains this script so the relative file
# names used below (input1.csv, input2.csv, output.csv) resolve next to it.
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

# os.chdir("U:\\GitHub\\scGEAToolbox\\+run\\thirdparty\\harmony")
import pandas as pd
import numpy as np
# NOTE(review): kmeans and pearsonr are not used in the statements below;
# `scipy.stats.stats` is also a deprecated import path in recent SciPy
# releases -- confirm whether these two imports are still needed.
from scipy.cluster.vq import kmeans
from scipy.stats.stats import pearsonr
import harmonypy as hm

# Metadata table; it must provide the 'batchidx' column named in vars_use.
meta_data = pd.read_csv("input2.csv")
# data_mat = pd.read_csv("input1.csv", header=None)
# With read_csv's default header handling the first row of input1.csv is
# consumed as column names rather than data.
data_mat = pd.read_csv("input1.csv")

# Hand harmonypy a plain NumPy array rather than a DataFrame.
data_mat = np.array(data_mat)
vars_use = ['batchidx']
ho = hm.run_harmony(data_mat, meta_data, vars_use)

# The corrected matrix is transposed before writing
# (presumably so that each output row is one observation -- confirm).
res = pd.DataFrame(ho.Z_corr.T)
# res.columns = ['X{}'.format(i + 1) for i in range(res.shape[1])]
# Written tab-separated (despite the .csv extension), without index or header.
res.to_csv("output.csv", sep="\t", index=False, header=False)
    morphology_features = infer_cp_features(
        df, compartments=["Cells", "Cytoplasm", "Nuclei"])
    metadata_cols = ["Image_Metadata_Well"] + infer_cp_features(df,
                                                                metadata=True)

    # Fit PCA
    pca = PCA(n_components=num_pcs)
    pca.fit(df.loc[:, morphology_features])

    # Transform PCA
    pc_df = pca.transform(df.loc[:, morphology_features])
    pc_df = pd.DataFrame(pc_df).add_prefix("pca_")

    # Apply harmony per plate
    harmony_out = (hm.run_harmony(data_mat=pc_df,
                                  meta_data=df.loc[:, metadata_cols],
                                  vars_use=harmony_adjust_vars_perplate,
                                  random_state=harmony_random_state))

    # Compile harmony output file
    harmony_df = pd.concat([
        df.loc[:, metadata_cols],
        pd.DataFrame(harmony_out.Z_corr).transpose().add_prefix("harmonized_")
    ],
                           axis="columns")

    # Output harmonized file
    output_file = pathlib.Path(f"{data_dir}/{plate}_{output_file_suffix}")
    harmony_df.to_csv(output_file, index=False, sep=",")

    # Apply an inverse transform to get back to original feature space
    inverse_harmony_df = pd.concat([
Example #8
0
def harmony_integrate(
    adata: AnnData,
    key: str,
    basis: str = "X_pca",
    adjusted_basis: str = "X_pca_harmony",
    **kwargs,
):
    """\
    Integrate different experiments with harmonypy [Korunsky19]_.

    Harmony [Korunsky19]_ is an algorithm for integrating single-cell
    data from multiple experiments. This function calls its python
    port, ``harmonypy``, on the PCA table stored in an AnnData object.
    Because Harmony works by adjusting the principal components, run
    this function after PCA and before computing the neighbor graph,
    as shown in the example below.

    Parameters
    ----------
    adata
        The annotated data matrix.
    key
        The name of the column in ``adata.obs`` that differentiates
        among experiments/batches.
    basis
        The name of the field in ``adata.obsm`` where the PCA table is
        stored. Defaults to ``'X_pca'``, which is the default for
        ``sc.tl.pca()``.
    adjusted_basis
        The name of the field in ``adata.obsm`` where the adjusted PCA
        table will be stored after running this function. Defaults to
        ``X_pca_harmony``.
    kwargs
        Any additional arguments will be passed to
        ``harmonypy.run_harmony()``.

    Returns
    -------
    Nothing is returned. ``adata`` is updated in place with the field
    ``adata.obsm[adjusted_basis]``, containing principal components
    adjusted by Harmony such that different experiments are integrated.

    Example
    -------
    First, load libraries and example dataset, and preprocess.

    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> adata = sc.datasets.pbmc3k()
    >>> sc.pp.recipe_zheng17(adata)
    >>> sc.tl.pca(adata)

    We now arbitrarily assign a batch metadata variable to each cell
    for the sake of example, but during real usage there would already
    be a column in ``adata.obs`` giving the experiment each cell came
    from.

    >>> adata.obs['batch'] = 1350*['a'] + 1350*['b']

    Finally, run harmony. Afterwards, there will be a new table in
    ``adata.obsm`` containing the adjusted PC's.

    >>> sce.pp.harmony_integrate(adata, 'batch')
    >>> 'X_pca_harmony' in adata.obsm
    True
    """
    # harmonypy is imported here so that it is only required when this
    # function is actually called.
    try:
        import harmonypy
    except ImportError:
        raise ImportError(
            "\nplease install harmonypy:\n\n\tpip install harmonypy")

    pca_table = adata.obsm[basis]
    harmony_result = harmonypy.run_harmony(pca_table, adata.obs, key, **kwargs)

    # The transpose of Z_corr is what gets stored under adjusted_basis.
    adata.obsm[adjusted_basis] = harmony_result.Z_corr.T
def _default_out_file_base(pc_path):
    """Return the default output basename derived from the PC file path.

    The directory part and a trailing ``.tsv.gz`` or ``.tsv`` extension are
    removed, then ``-harmony`` is appended.
    """
    name = os.path.basename(pc_path)
    # Remove the extension as a whole suffix. ``str.rstrip('tsv.gz')`` strips
    # a *set of characters* and therefore also eats the end of the name
    # itself (e.g. 'pcs.tsv.gz' -> 'pc').
    for suffix in ('.tsv.gz', '.tsv'):
        if name.endswith(suffix):
            name = name[:-len(suffix)]
            break
    return '{}-harmony'.format(name)


def main():
    """Run CLI.

    Reads a tab-delimited PC file and a tab-delimited metadata file (both
    indexed by ``cell_barcode``), runs Harmony on the selected PCs using the
    requested metadata columns, and writes the adjusted embedding to
    ``<out_file>.tsv.gz`` with ``cell_barcode`` as the first column followed
    by ``harmony1``, ``harmony2``, ...

    Raises
    ------
    ValueError
        If ``--n_pcs`` is negative or larger than the number of PC columns in
        ``--pca_file``.
    """
    parser = argparse.ArgumentParser(description="""
            Runs Harmony on PCs.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-pc',
        '--pca_file',
        action='store',
        dest='pc',
        required=True,
        help='Tab-delimited file of PCs for each cell. First column is\
            cell_barcode. Subsequent columns are PCs.')

    parser.add_argument(
        '-mf',
        '--metadata_file',
        action='store',
        dest='mf',
        required=True,
        help='Tab-delimited metadata file, must have a column labelled\
            cell_barcode that maps to pca_file.')

    parser.add_argument(
        '-mc',
        '--metadata_columns',
        action='store',
        dest='mc',
        required=True,
        help='Comma separated string of columns to use in metadata_file.')

    parser.add_argument(
        '-t',
        '--theta',
        action='store',
        dest='theta',
        default='',
        help='Comma separated string of theta values (corresponding to\
            metadata_columns). If "" then sets theta to 2 for all\
            columns. Larger values of theta result in more diverse\
            clusters.\
            (default: "")')

    parser.add_argument('-npc',
                        '--n_pcs',
                        action='store',
                        dest='npc',
                        default=0,
                        type=int,
                        help='Number of PCs to use.\
            (default: maximum number in tsv_pcs file)')

    parser.add_argument(
        '-of',
        '--out_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: <tsv_pcs>-harmony)')

    options = parser.parse_args()

    # Fixed settings.
    verbose = True

    # Get the out file base; fall back to one derived from the PC file name.
    out_file_base = options.of
    if out_file_base == '':
        out_file_base = _default_out_file_base(options.pc)

    # Load the PCs.
    df_pca = pd.read_csv(options.pc, sep='\t', index_col='cell_barcode')

    # Check that nPCs is valid. 0 means "use every PC column in the file".
    n_pcs = options.npc
    n_available = len(df_pca.columns)
    if n_pcs < 0:
        raise ValueError(
            '--n_pcs ({}) must not be negative.'.format(n_pcs))
    if n_pcs == 0:
        n_pcs = n_available
    elif n_pcs > n_available:
        raise ValueError(
            '--n_pcs ({}) is > than the number of PCs in --pca_file '
            '({}).'.format(n_pcs, n_available))
    if verbose:
        print('Using {} PCs.'.format(n_pcs))

    # Subset down to these PCs.
    df_pca = df_pca.iloc[:, :n_pcs]

    # Get the metadata_file columns that we want to adjust with Harmony.
    metadata_columns = options.mc.split(',')

    # Read in the metadata file.
    df_meta = pd.read_csv(options.mf, sep='\t', index_col='cell_barcode')
    # Ensure cell order in df_meta is the same as df_pca
    df_meta = df_meta.loc[df_pca.index, metadata_columns]
    # Also ensure that the metadata columns are categorical -- run_harmony
    # fails if not categorical. describe() only reports a 'unique' row when
    # it summarises non-numeric columns, so its absence is used as the
    # signal. This only warns; execution continues.
    try:
        df_meta[metadata_columns].describe().loc['unique']
    except KeyError:
        print(
            "metadata_columns contains non-categorical attributes. Harmony does \
        not work with continuous variables. Either make attributes a string or \
        use a different column.")

    # Get the theta values for each column (if none, set to 2 for all columns).
    theta = [2] * len(metadata_columns)
    if options.theta != '':
        theta = [float(i) for i in options.theta.split(',')]

    # Run Harmony
    harmony_embeddings = hm.run_harmony(
        data_mat=df_pca.values,  # Pandas dataframe to numpy.ndarray
        meta_data=df_meta,
        vars_use=metadata_columns,
        theta=theta,
        max_iter_kmeans=500,
        verbose=verbose)
    # NOTE: harmony_embeddings.result() == harmony_embeddings.Z_corr
    df_harmony = pd.DataFrame(np.transpose(harmony_embeddings.Z_corr))
    harmony_cols = [
        'harmony{}'.format(i + 1) for i in range(df_harmony.shape[1])
    ]
    df_harmony.columns = harmony_cols
    # Put the barcode first, followed by the harmony columns.
    df_harmony['cell_barcode'] = df_pca.index
    final_col_order = ['cell_barcode'] + harmony_cols
    df_harmony = df_harmony.loc[:, final_col_order]

    # Save the adjusted embedding as a gzip-compressed, tab-separated file.
    df_harmony.to_csv(
        '{}.tsv.gz'.format(out_file_base),
        sep='\t',
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        na_rep='',
        compression='gzip')