Example 1
def test_cell_lines():
    print("Testing on Cell Lines...")

    z_files = [f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")

    if os.path.exists("./result/cell_lines_cpu_z.npy"):
        Z_cpu = np.load("./result/cell_lines_cpu_z.npy")
        print("Precalculated CPU mode result is loaded.")
    else:
        start_cpu = time.time()
        Z_cpu = harmonize(X, df_metadata, 'dataset')
        end_cpu = time.time()

        print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
        np.save("./result/cell_lines_cpu_z.npy", Z_cpu)

    if os.path.exists("./result/cell_lines_gpu_z.npy"):
        Z_gpu = np.load("./result/cell_lines_gpu_z.npy")
        print("Precalculated GPU mode result is loaded.")
    else:
        start_gpu = time.time()
        Z_gpu = harmonize(X, df_metadata, 'dataset', use_gpu = True)
        end_gpu = time.time()

        print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
        np.save("./result/cell_lines_gpu_z.npy", Z_gpu)

    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    check_metrics(Z_cpu, Z_R, prefix = "cell_lines_cpu")
    check_metrics(Z_gpu, Z_R, prefix = "cell_lines_gpu")

    if os.path.exists("./result/cell_lines_result.h5ad"):
        adata = None
    else:
        n_obs = X.shape[0]
        adata = AnnData(X = csr_matrix((n_obs, 2)), obs = df_metadata)
        adata.obsm['X_pca'] = X

        pg.neighbors(adata, rep = 'pca')
        pg.umap(adata)

    umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix = "cell_lines", batch_key = 'dataset')
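For quick reference, a self-contained sketch (synthetic data, assumed batch labels) of the harmonize call pattern this test exercises on CPU and, optionally, on GPU:

# Minimal sketch, not the test itself: a stand-in PCA matrix plus a 'dataset'
# batch column, corrected by harmony-pytorch on CPU and optionally on GPU.
import numpy as np
import pandas as pd
from harmony import harmonize

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(500, 20))                                            # stand-in PCA matrix
meta_demo = pd.DataFrame({'dataset': np.repeat(['batch_a', 'batch_b'], 250)})  # assumed labels

Z_demo_cpu = harmonize(X_demo, meta_demo, batch_key='dataset')
# Z_demo_gpu = harmonize(X_demo, meta_demo, batch_key='dataset', use_gpu=True)  # needs CUDA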
Example 2
def test_mantonbm():
    print("Testing on MantonBM...")

    z_files = [f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/MantonBM_result.h5ad"):
        adata = pg.read_input("./data/MantonBM/original_data.h5ad")
        adata.obs['Individual'] = pd.Categorical(adata.obs['Channel'].apply(lambda s: s.split('_')[0][-1]))

    if os.path.exists("./result/MantonBM_cpu_z.npy"):
        Z_cpu = np.load("./result/MantonBM_cpu_z.npy")
        print("Precalculated CPU mode result is loaded.")
    else:
        start_cpu = time.time()
        Z_cpu = harmonize(adata.obsm['X_pca'], adata.obs, 'Channel')
        end_cpu = time.time()

        print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
        np.save("./result/MantonBM_cpu_z.npy", Z_cpu)

    if os.path.exists("./result/MantonBM_gpu_z.npy"):
        Z_gpu = np.load("./result/MantonBM_gpu_z.npy")
        print("Precalculated GPU mode result is loaded.")
    else:
        start_gpu = time.time()
        Z_gpu = harmonize(adata.obsm['X_pca'], adata.obs, 'Channel', use_gpu = True)
        end_gpu = time.time()

        print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
        np.save("./result/MantonBM_gpu_z.npy", Z_gpu)

    Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt")

    check_metrics(Z_cpu, Z_R, prefix = "MantonBM_cpu")
    check_metrics(Z_gpu, Z_R, prefix = "MantonBM_gpu")

    if os.path.exists("./result/MantonBM_result.h5ad"):
        adata = None

    umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix = "MantonBM", batch_key = 'Individual')
Example 3
    def transform(self,
                  ds: loompy.LoomConnection,
                  normalizer: Normalizer,
                  cells: np.ndarray = None) -> np.ndarray:
        if cells is None:
            cells = np.arange(ds.shape[1])

        transformed = np.zeros((cells.shape[0], self.pca.n_components_))
        j = 0

        # Support out-of-order datasets
        key = None
        if "Accession" in ds.row_attrs:
            key = "Accession"

        layer = self.layer if self.layer is not None else ""
        for (_, selection, view) in ds.scan(items=cells,
                                            axis=1,
                                            layers=[layer],
                                            key=key):
            vals = normalizer.transform(view.layers[layer][:, :], selection)
            n_cells_in_batch = selection.shape[0]
            transformed[j:j + n_cells_in_batch, :] = self.pca.transform(
                vals[self.genes, :].transpose())
            j += n_cells_in_batch

        if self.test_significance:
            # Must select significant components only once, and reuse for future transformations
            if self.sigs is None:
                pvalue_KS = np.zeros(
                    transformed.shape[1])  # pvalue of each component
                for i in range(1, transformed.shape[1]):
                    (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1],
                                                 transformed[:, i])
                self.sigs = np.where(pvalue_KS < 0.1)[0]
                if len(self.sigs) == 0:
                    self.sigs = (0, 1)

            transformed = transformed[:, self.sigs]

        if self.batch_keys is not None and len(self.batch_keys) > 0:
            keys_df = pd.DataFrame.from_dict(
                {k: ds.ca[k]
                 for k in self.batch_keys})
            transformed = harmonize(transformed,
                                    keys_df,
                                    batch_key=self.batch_keys)
        return transformed
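Because the method above packs several loom column attributes into a DataFrame and passes the whole list as batch_key, here is a hedged, self-contained sketch of that multi-key pattern on synthetic data (the covariate names are made up):

# Sketch of multi-key batch correction; covariate names are hypothetical.
import numpy as np
import pandas as pd
from harmony import harmonize

rng = np.random.default_rng(0)
embedding = rng.normal(size=(1000, 50))               # stand-in for the PCA-transformed cells
keys_df = pd.DataFrame({
    "SampleID": np.repeat(["s1", "s2"], 500),         # hypothetical batch covariates
    "Chemistry": np.tile(["v2", "v3"], 500),
})
corrected = harmonize(embedding, keys_df, batch_key=["SampleID", "Chemistry"])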
Example 4
def pre_step1(adata):
    adata.var['highly_variable'] = meanCVfit(adata)

    adata.raw = adata

    sc.pp.scale(adata, max_value=10)  #scale
    sc.tl.pca(adata, svd_solver='arpack')  #run PCA

    Z = harmonize(adata.obsm['X_pca'], adata.obs, batch_key='Batch')
    adata.obsm['X_harmony'] = Z

    #need these b/c will re-run kNN in UMAP 2D space
    sc.pp.neighbors(adata, n_neighbors=25, use_rep='X_harmony')
    sc.tl.umap(adata)

    return adata
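A hedged usage sketch for pre_step1; it assumes the meanCVfit helper it calls is importable and that the hypothetical input file carries a 'Batch' column in .obs:

# Usage sketch only: the file name and the 'Batch' column are assumptions.
import scanpy as sc

adata = sc.read_h5ad("combined_lognorm.h5ad")   # hypothetical log-normalized input
adata = pre_step1(adata)                        # HVG flag, scale, PCA, Harmony, kNN, UMAP
sc.pl.umap(adata, color='Batch')                # inspect batch mixing in the corrected space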
Example 5
def test_mantonbm():
    print("Testing on MantonBM dataset...")

    z_files = [f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/MantonBM_result.h5ad"):
        adata = pg.read_input("./data/MantonBM/original_data.h5ad")
        adata.obs['Individual'] = pd.Categorical(adata.obs['Channel'].apply(lambda s: s.split('_')[0][-1]))

    if os.path.exists("./result/MantonBM_torch_z.npy"):
        Z_torch = np.load("./result/MantonBM_torch_z.npy")
        print("Precalculated embedding by harmony-pytorch is loaded.")
    else:
        start_torch = time.time()
        Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = 'Channel')
        end_torch = time.time()

        print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
        np.save("./result/MantonBM_torch_z.npy", Z_torch)

    if os.path.exists("./result/MantonBM_py_z.npy"):
        Z_py = np.load("./result/MantonBM_py_z.npy")
        print("Precalculated embedding by harmonypy is loaded.")
    else:
        start_py = time.time()
        ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
        end_py = time.time()

        print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))

        Z_py = np.transpose(ho.Z_corr)
        np.save("./result/MantonBM_py_z.npy", Z_py)


    Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt")

    check_metric(Z_torch, Z_py, Z_R, prefix = "MantonBM", norm = 'r')
    check_metric(Z_torch, Z_py, Z_R, prefix = "MantonBM", norm = 'L2')

    if os.path.exists("./result/MantonBM_result.h5ad"):
        adata = None

    umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "MantonBM", batch_key = "Individual")
Example 6
def test_pbmc():
    print("Testing on 10x pbmc dataset...")

    z_files = [f for f in os.listdir("./result") if re.match("pbmc.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/pbmc_result.h5ad"):
        adata = pg.read_input("./data/10x_pbmc/original_data.h5ad")

    if os.path.exists("./result/pbmc_torch_z.npy"):
        Z_torch = np.load("./result/pbmc_torch_z.npy")
        print("Precalculated embedding by harmony-pytorch is loaded.")
    else:
        start_torch = time.time()
        Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = 'Channel')
        end_torch = time.time()

        print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
        np.save("./result/pbmc_torch_z.npy", Z_torch)

    if os.path.exists("./result/pbmc_py_z.npy"):
        Z_py = np.load("./result/pbmc_py_z.npy")
        print("Precalculated embedding by harmonypy is loaded.")
    else:
        start_py = time.time()
        ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
        end_py = time.time()

        print(ho.objective_harmony)
        print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))

        Z_py = np.transpose(ho.Z_corr)
        np.save("./result/pbmc_py_z.npy", Z_py)

    Z_R = np.loadtxt("./result/pbmc_harmony_z.txt")

    check_metric(Z_torch, Z_py, Z_R, prefix = "pbmc", norm = 'r')
    check_metric(Z_torch, Z_py, Z_R, prefix = "pbmc", norm = 'L2')

    if os.path.exists("./result/pbmc_result.h5ad"):
        adata = None

    umap_list = [f for f in os.listdir("./plots") if re.match("pbmc.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "pbmc", batch_key = "Channel")
Example 7
    def init_recording(self):
        data = self.recorder.toggle()
        if data:
            print(data)
            wave_gen, filename, duration_midi = data
            for i in range(len(duration_midi)):
                if duration_midi[i][0] < 0.12:
                    duration_midi[i] = (duration_midi[i][0], 0)
            duration_midi = harmony.harmonize(duration_midi)
            self.live_wave = wave_gen
            print([[i[1] for i in j] for j in duration_midi])

            tempo = 120
            multiplier = 1 / 60 * tempo * 480
            converted_midi_duration = [[(i * multiplier, j) for i, j in k]
                                       for k in duration_midi]

            for i in converted_midi_duration:
                self.seq.append(
                    NoteSequencer(self.sched, self.synth, 1, (0, 0), i, True))
Example 8
def test_cell_lines():
    print("Testing on cell lines dataset...")

    z_files = [f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f)]
    if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")
        source_loaded = True

    if os.path.exists("./result/cell_lines_torch_z.npy"):
        Z_torch = np.load("./result/cell_lines_torch_z.npy")
        print("Precalculated embedding by harmony-pytorch is loaded.")
    else:
        start_torch = time.time()
        Z_torch = harmonize(X, df_metadata, batch_key = 'dataset')
        end_torch = time.time()

        print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
        np.save("./result/cell_lines_torch_z.npy", Z_torch)

    if os.path.exists("./result/cell_lines_py_z.npy"):
        Z_py = np.load("./result/cell_lines_py_z.npy")
        print("Precalculated embedding by harmonypy is loaded.")
    else:
        start_py = time.time()
        ho = run_harmony(X, df_metadata, ['dataset'])
        end_py = time.time()

        print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))
        print(ho.objective_harmony)

        Z_py = np.transpose(ho.Z_corr)
        np.save("./result/cell_lines_py_z.npy", Z_py)

    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'r')
    check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'L2')

    if os.path.exists("./result/cell_lines_result.h5ad"):
        adata = None
    else:
        n_obs = X.shape[0]
        adata = AnnData(X = csr_matrix((n_obs, 2)), obs = df_metadata)
        adata.obsm['X_pca'] = X

        pg.neighbors(adata, rep = 'pca')
        pg.umap(adata)

    umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "cell_lines", batch_key = "dataset")

    if os.path.exists("./result/cell_lines_result.h5ad"):
        adata = pg.read_input("./result/cell_lines_result.h5ad", h5ad_mode = 'r')

        stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = 'harmony')
        print("kBET for Harmony: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(stat = stat, pval = pvalue, ac_rate = ac_rate))

        stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = 'py')
        print("kBET for harmonypy: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(stat = stat, pval = pvalue, ac_rate = ac_rate))

        stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = 'torch')
        print("kBET for harmony-pytorch: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(stat = stat, pval = pvalue, ac_rate = ac_rate))
Example 9
def run_harmony(
    data: MultimodalData,
    rep: str = 'pca',
    n_jobs: int = -1,
    n_clusters: int = None,
    random_state: int = 0,
) -> str:
    """Batch correction on PCs using Harmony.

    This is a wrapper of `harmony-pytorch <https://github.com/lilab-bcb/harmony-pytorch>`_ package, which is a Pytorch implementation of Harmony algorithm [Korsunsky19]_.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``.
        Which representation to use as input of Harmony, default is PCA.

    n_jobs : ``int``, optional, default: ``-1``.
        Number of threads to use for the KMeans clustering used in Harmony. ``-1`` refers to using all available threads.

    n_clusters: ``int``, optional, default: ``None``.
        Number of Harmony clusters. Default is ``None``, which asks Harmony to estimate this number from the data.

    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by Harmony algorithm.

        This keyword is ``rep + '_harmony'``, where ``rep`` is the input parameter above.

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_rep]``: The embedding calculated by Harmony algorithm.

    Examples
    --------
    >>> pg.run_harmony(data, rep = "pca", n_jobs = 10, random_state = 25)
    """
    if not is_categorical_dtype(data.obs['Channel']):
        data.obs['Channel'] = pd.Categorical(data.obs['Channel'])
    if data.obs['Channel'].cat.categories.size == 1:
        logger.warning("Warning: data only contains 1 channel. Cannot apply Harmony!")
        return rep

    try:
        from harmony import harmonize
    except ImportError as e:
        print(f"ERROR: {e}")
        print("ERROR: Need Harmony! Try 'pip install harmony-pytorch'.")
        import sys
        sys.exit(-1)


    logger.info("Start integration using Harmony.")
    out_rep = rep + '_harmony'
    data.obsm['X_' + out_rep] = harmonize(X_from_rep(data, rep), data.obs, 'Channel', n_clusters = n_clusters, n_jobs_kmeans = n_jobs, random_state = random_state)
    return out_rep
Example 10
#6. PCA
sc.pp.regress_out(adata, ['n_counts', 'perc_others'])
sc.tl.pca(adata, svd_solver='arpack')

#7. copy PCA adata for harmony
ad_pca = adata.copy()

#8. calculate neighbor
sc.pp.neighbors(adata, n_neighbors=nn, n_pcs=npc)

#9. embed- umap
sc.tl.umap(adata, n_components=2, random_state=42)
adata.write(f'{fd_out}/concat_merged.h5ad')

#-----------------------------harmony-----------------------------------
#1. rename ad_pca
adata = ad_pca.copy()

#2. harmony
Z = harmonize(adata.obsm['X_pca'], adata.obs, batch_key='sample')
adata.obsm['X_harmony'] = Z

#3. calculate neighbor
sc.pp.neighbors(adata, n_neighbors=nn, n_pcs=npc, use_rep='X_harmony')

#4. embed- umap
sc.tl.umap(adata, n_components=2, random_state=42)

#5. save
adata.write(f'{fd_out}/harmony_merged.h5ad')
Example 11
def run_Harmony(adata, batch_key='orig.ident'):
    return harmonize(adata.obsm['X_pca'], adata.obs, batch_key=batch_key)
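A hedged usage sketch for the run_Harmony wrapper above, mirroring the downstream steps of the preceding script; the AnnData object and its 'orig.ident' column are assumptions:

# Usage sketch: adata is assumed to already carry .obsm['X_pca'] and an 'orig.ident' column.
import scanpy as sc

adata.obsm['X_harmony'] = run_Harmony(adata, batch_key='orig.ident')
sc.pp.neighbors(adata, use_rep='X_harmony')     # rebuild the kNN graph on the corrected embedding
sc.tl.umap(adata)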
Example 12
def subcluster_iteration(adata_in,
                         min_cells=10,
                         nhvgs=2000,
                         npcs=20,
                         n_neighbors=50,
                         min_dist=1.0,
                         spread=2.0,
                         resolution=1.,
                         umap_genestoplot=['CD14'],
                         pc_genestoplot=['CD14'],
                         other_plot=['DPIc', 'louvain'],
                         random_state=14,
                         harmony=False,
                         harmony_key='frz_status',
                         regress_out_keys=None,
                         n_jobs_regress=1,
                         harmony_theta=2,
                         scale=True):
    ''' Assumes input data is already log TP10K normalized'''

    _adata = adata_in.copy()
    sc.pp.filter_genes(_adata, min_cells=min_cells)
    sc.pp.highly_variable_genes(_adata, n_top_genes=nhvgs)
    _adata = _adata[:, _adata.var['highly_variable']]

    if regress_out_keys is not None:
        _adata = _adata.copy()
        sc.pp.regress_out(_adata,
                          regress_out_keys,
                          n_jobs=n_jobs_regress,
                          copy=False)

    if scale:
        sc.pp.scale(_adata, max_value=10)

    sc.tl.pca(_adata, svd_solver='arpack', random_state=14)

    sc.pl.pca(_adata,
              components=['1,2', '3,4', '5,6', '7,8'],
              color=pc_genestoplot,
              ncols=4,
              use_raw=True)

    sc.pl.pca_loadings(_adata, components=[1, 2, 3, 4, 5])
    sc.pl.pca_variance_ratio(_adata, log=True)

    if harmony:
        Z = harmonize(_adata.obsm['X_pca'],
                      _adata.obs,
                      batch_key=harmony_key,
                      random_state=random_state,
                      theta=harmony_theta)
        _adata.obsm['X_harmony'] = Z
        sc.pp.neighbors(_adata,
                        n_neighbors=n_neighbors,
                        n_pcs=npcs,
                        random_state=random_state,
                        use_rep='X_harmony')
    else:
        sc.pp.neighbors(_adata,
                        n_neighbors=n_neighbors,
                        n_pcs=npcs,
                        random_state=random_state)

    sc.tl.umap(_adata,
               min_dist=min_dist,
               spread=spread,
               random_state=random_state)

    np.random.seed(random_state)
    sc.tl.leiden(_adata, resolution=resolution, random_state=random_state)

    fig = sc.pl.umap(_adata, color=umap_genestoplot, use_raw=True)
    fig = sc.pl.umap(_adata, color=other_plot)

    sc.tl.rank_genes_groups(_adata, 'leiden', method='wilcoxon')
    display(pd.DataFrame(_adata.uns['rank_genes_groups']['names']).head(20))
    return (_adata)
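A hedged call sketch for subcluster_iteration; the input subset and the presence of 'frz_status' and 'louvain' columns in .obs are assumptions consistent with the defaults above:

# Call sketch only: the AnnData object and its obs columns are assumed to exist.
sub = adata_all[adata_all.obs['louvain'] == '3'].copy()   # hypothetical cluster to re-embed
sub_out = subcluster_iteration(sub,
                               harmony=True,
                               harmony_key='frz_status',
                               resolution=0.8)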
Example 13
def run_harmony(
    data: Union[MultimodalData, UnimodalData],
    batch: str = "Channel",
    rep: str = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    n_clusters: int = None,
    random_state: int = 0,
    use_gpu: bool = False,
    max_iter_harmony: int = 10,
) -> str:
    """Batch correction on PCs using Harmony.

    This is a wrapper of `harmony-pytorch <https://github.com/lilab-bcb/harmony-pytorch>`_ package, which is a Pytorch implementation of Harmony algorithm [Korsunsky19]_.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.

    batch: ``str``, optional, default: ``"Channel"``.
        Which attribute in data.obs field represents batches, default is "Channel".

    rep: ``str``, optional, default: ``"pca"``.
        Which representation to use as input of Harmony, default is PCA.

    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.

    n_jobs : ``int``, optional, default: ``-1``.
        Number of threads to use in Harmony. ``-1`` refers to using all physical CPU cores.

    n_clusters: ``int``, optional, default: ``None``.
        Number of Harmony clusters. Default is ``None``, which asks Harmony to estimate this number from the data.

    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator

    use_gpu: ``bool``, optional, default: ``False``.
        If ``True``, use GPU if available. Otherwise, use CPU only.
        
    max_iter_harmony: ``int``, optional, default: ``10``.
        Maximum iterations on running Harmony if not converged.

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by Harmony algorithm.

        This keyword is ``rep + '_harmony'``, where ``rep`` is the input parameter above.

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_rep]``: The embedding calculated by Harmony algorithm.

    Examples
    --------
    >>> pg.run_harmony(data, rep = "pca", n_jobs = 10, random_state = 25)
    """
    if not check_batch_key(data, batch, "Cannot apply Harmony!"):
        return rep

    try:
        from harmony import harmonize
    except ImportError as e:
        import sys
        logger.error(f"{e}\nNeed Harmony! Try 'pip install harmony-pytorch'.")
        sys.exit(-1)

    logger.info("Start integration using Harmony.")
    out_rep = rep + "_harmony"
    data.obsm["X_" + out_rep] = harmonize(
        X_from_rep(data, rep, n_comps),
        data.obs,
        batch,
        n_clusters = n_clusters,
        n_jobs = n_jobs,
        random_state = random_state,
        use_gpu = use_gpu,
        max_iter_harmony = max_iter_harmony,
    )
    return out_rep
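Beyond the one-line docstring example, a hedged sketch of where this wrapper sits in a pegasus workflow; the input path is hypothetical, pg.read_input and pg.neighbors appear in the test scripts above, and the rep argument to pg.umap is an assumption:

# Workflow sketch with an assumed input file; a PCA embedding is expected to be present already.
import pegasus as pg

data = pg.read_input("./data/example.h5ad")                     # hypothetical path
key = run_harmony(data, batch="Channel", max_iter_harmony=20)   # returns e.g. 'pca_harmony'
pg.neighbors(data, rep=key)                                     # kNN graph on the corrected embedding
pg.umap(data, rep=key)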