Example #1
import numpy as np
import pandas as pd
import pegasusio
from natsort import natsorted


def write_output(assignment_file: str, input_mat_file: str, output_zarr_file: str, matching: dict) -> None:
    # translate_donor_name is a module-level helper defined elsewhere in this file.
    df = pd.read_csv(assignment_file, sep='\t', header=0, index_col=0)
    df.index = pd.Index([x[:-2] for x in df.index])  # drop the trailing barcode suffix (e.g. '-1')
    f = np.vectorize(translate_donor_name)
    df['assignment'] = f(df['assignment'].values, matching)
    idx = df['status'].values == 'unassigned'
    df.loc[idx, 'status'] = 'unknown'
    df.loc[idx, 'assignment'] = ''

    type_counts = df['status'].value_counts()
    print("\nSinglets = {}, doublets = {}, unknown = {}.".format(type_counts['singlet'], type_counts['doublet'], type_counts['unknown']))

    idx = df['status'] == 'singlet'
    singlet_counts = df.loc[idx, 'assignment'].value_counts()
    print("Among {} singlets, we have the following statistics:".format(type_counts['singlet']))
    for donor in natsorted(singlet_counts.index):
        print("  Reference donor {}: {}".format(donor, singlet_counts[donor]))
    print()

    data = pegasusio.read_input(input_mat_file)
    data.obs['demux_type'] = ''
    data.obs['assignment'] = ''

    # Transfer demultiplexing results onto the barcodes present in the count matrix.
    idx = data.obs_names.isin(df.index)
    barcodes = data.obs_names[idx]
    ndf = df.loc[barcodes, ['status', 'assignment']]
    data.obs.loc[idx, 'demux_type'] = ndf['status'].values
    data.obs.loc[idx, 'assignment'] = ndf['assignment'].values

    pegasusio.write_output(data, output_zarr_file, zarr_zipstore=True)
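
# A minimal usage sketch for the function above. The file names and the donor-matching
# dict are illustrative only, and translate_donor_name is assumed to be the module-level
# helper that maps reference donor labels through `matching`.
matching = {"donor0": "SampleA", "donor1": "SampleB"}   # hypothetical donor -> display name
write_output(
    assignment_file="demux/assignments.tsv",    # TSV with 'status' and 'assignment' columns
    input_mat_file="raw_counts.h5",             # any format pegasusio.read_input accepts
    output_zarr_file="demux_result.zarr.zip",
    matching=matching,
)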
Example #2
def run_pipeline(input_file: str, output_name: str, **kwargs):
    is_raw = not kwargs["processed"]

    black_list = set()
    if kwargs["black_list"] is not None:
        black_list = set(kwargs["black_list"].split(","))

    # load input data
    data = read_input(input_file, black_list=black_list)

    # process focus_list
    focus_list = kwargs["focus"]
    if len(focus_list) == 0:
        focus_list = [data.current_key()]

    append_data = None
    if kwargs["append"] is not None:
        append_data = data.get_data(kwargs["append"])

    logger.info("Inputs are loaded.")

    if is_raw and not kwargs["subcluster"]:
        # filter out low quality cells/genes
        tools._run_filter_data(
            data,
            focus_list=focus_list,
            output_filt=kwargs["output_filt"],
            plot_filt=kwargs["plot_filt"],
            plot_filt_figsize=kwargs["plot_filt_figsize"],
            min_genes_before_filt=kwargs["min_genes_before_filt"],
            select_singlets=kwargs["select_singlets"],
            remap_string=kwargs["remap_singlets"],
            subset_string=kwargs["subset_singlets"],
            min_genes=kwargs["min_genes"],
            max_genes=kwargs["max_genes"],
            min_umis=kwargs["min_umis"],
            max_umis=kwargs["max_umis"],
            mito_prefix=kwargs["mito_prefix"],
            percent_mito=kwargs["percent_mito"],
            percent_cells=kwargs["percent_cells"],
        )

    for key in focus_list:
        unidata = data.get_data(key)
        analyze_one_modality(unidata, f"{output_name}.{unidata.get_uid()}",
                             is_raw, append_data, **kwargs)

    print()

    # if kwargs["subcluster"]:
    #     unidata = tools.get_anndata_for_subclustering(adata, kwargs["subset_selections"])
    #     is_raw = True  # get submat and then set is_raw to True

    # write out results

    write_output(data, f"{output_name}.zarr.zip")

    print("Results are written.")
    def test_zarr(self):
        data = io.read_input("pegasusio-test-data/case4/MantonBM1_1_dbls.zarr")
        io.write_output(data, "pegasusio-test-data/case4/MantonBM_out.zarr")
        data = io.read_input("pegasusio-test-data/case4/MantonBM_out.zarr")

        self.assertEqual(data.shape, (4274, 19360),
                         "Count matrix shape differs!")
        self.assertEqual(data.get_genome(), "GRCh38", "Genome differs!")
        self.assertEqual(data.get_modality(), "rna", "Modality differs!")
    def test_h5ad(self):
        data = io.read_input("pegasusio-test-data/case1/pbmc3k.h5ad",
                             genome='hg19')
        io.write_output(data, "pegasusio-test-data/case1/pbmc3k_out.h5ad")
        data = io.read_input("pegasusio-test-data/case1/pbmc3k_out.h5ad")

        self.assertEqual(data.shape, (2638, 1838),
                         "Count matrix shape differs!")
        self.assertEqual(data.get_genome(), "hg19", "Genome differs!")
        self.assertEqual(data.get_modality(), "rna", "Modality differs!")
    def test_loom(self):
        data = io.read_input("pegasusio-test-data/case3/pancreas.loom",
                             genome='hg19')
        io.write_output(data, "pegasusio-test-data/case3/pancreas_out.loom")
        data = io.read_input("pegasusio-test-data/case3/pancreas_out.loom")

        self.assertEqual(data.shape, (2544, 58347),
                         "Count matrix shape differs!")
        self.assertEqual(data.get_genome(), "hg19", "Genome differs!")
        self.assertEqual(data.get_modality(), "rna", "Modality differs!")
Example #6
def annotate_data_object(input_file: str, annotation: str) -> None:
    """ For command line use.
        annotation format: anno_name:clust_name:cell_type_1;...;cell_type_n
    """
    from pegasusio import read_input, write_output

    data = read_input(input_file, mode="r")
    anno_name, clust_name, anno_str = annotation.split(":")
    anno_dict = {str(i + 1): x for i, x in enumerate(anno_str.split(";"))}
    annotate(data, anno_name, clust_name, anno_dict)
    write_output(data, input_file)
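
# For example, a call like the following (path and labels are illustrative) builds the
# dict {"1": "T cells", "2": "B cells", ...} from the louvain_labels clusters and passes
# it to annotate under the name "anno":
annotate_data_object(
    "result.zarr.zip",
    "anno:louvain_labels:T cells;B cells;Monocytes;NK cells",
)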
    def test_10x_mtx(self):
        data = io.read_input(
            "pegasusio-test-data/case3/42468c97-1c5a-4c9f-86ea-9eaa1239445a.mtx",
            genome='hg19')
        io.write_output(data, "pegasusio-test-data/case3/test.mtx")
        data = io.read_input("pegasusio-test-data/case3/test.mtx")

        self.assertEqual(data.shape, (2544, 58347),
                         "Count matrix shape differs!")
        self.assertEqual(data.get_genome(), "hg19", "Genome differs!")
        self.assertEqual(data.get_modality(), "rna", "Modality differs!")
Example #8
    def execute(self):
        data = aggregate_matrices(
            self.args["<csv_file>"],
            restrictions=self.args["--restriction"],
            attributes=self.split_string(self.args["--attributes"]),
            default_ref=self.args["--default-reference"],
            append_sample_name=not self.args["--no-append-sample-name"],
            select_singlets=self.args["--select-only-singlets"],
            remap_string=self.args["--remap-singlets"],
            subset_string=self.args["--subset-singlets"],
            min_genes=self.convert_to_int(self.args["--min-genes"]),
            max_genes=self.convert_to_int(self.args["--max-genes"]),
            min_umis=self.convert_to_int(self.args["--min-umis"]),
            max_umis=self.convert_to_int(self.args["--max-umis"]),
            mito_prefix=self.args["--mito-prefix"],
            percent_mito=self.convert_to_float(self.args["--percent-mito"])
        )
        write_output(data, self.args["<output_name>"] + ".zarr.zip")
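
# The self.args dict above follows docopt conventions: positional arguments in angle
# brackets, options with leading dashes. An illustrative sketch of what it might contain
# for a typical aggregation run (all values are made up):
args = {
    "<csv_file>": "samples.csv",
    "<output_name>": "aggregated",
    "--restriction": ["Source:BM"],
    "--attributes": "Source,Donor",
    "--default-reference": "GRCh38",
    "--no-append-sample-name": False,
    "--select-only-singlets": False,
    "--remap-singlets": None,
    "--subset-singlets": None,
    "--min-genes": "500",
    "--max-genes": None,
    "--min-umis": None,
    "--max-umis": None,
    "--mito-prefix": "MT-",
    "--percent-mito": "20.0",
}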
Example #9
def run_de_analysis(
    input_file: str,
    output_excel_file: str,
    cluster: str,
    condition: Optional[str] = None,
    de_key: str = "de_res",
    n_jobs: int = -1,
    auc: bool = True,
    t: bool = True,
    fisher: bool = False,
    mwu: bool = False,
    temp_folder: Optional[str] = None,
    verbose: bool = True,
    alpha: float = 0.05,
    ndigits: int = 3,
) -> None:
    """ For command line only
    """

    from pegasusio import read_input, write_output

    data = read_input(input_file, mode='r')

    de_analysis(
        data,
        cluster,
        condition=condition,
        de_key=de_key,
        n_jobs=n_jobs,
        t=t,
        fisher=fisher,
        temp_folder=temp_folder,
        verbose=verbose,
    )

    write_output(data, input_file)
    logger.info(
        f"Differential expression results are written to varm/{de_key}.")

    results = markers(data, de_key=de_key, alpha=alpha)
    write_results_to_excel(results, output_excel_file, ndigits=ndigits)
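
# A hedged usage sketch of the function above; the input path and cluster attribute
# are placeholders.
run_de_analysis(
    input_file="result.zarr.zip",       # zarr produced by a previous pegasus run
    output_excel_file="result.de.xlsx",
    cluster="louvain_labels",           # any categorical attribute in data.obs
    alpha=0.05,
)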
Example #10
def write_output(assignment_file: str, input_mat_file: str,
                 output_zarr_file: str) -> None:
    df = pd.read_csv(assignment_file, sep='\t', header=0, index_col='BARCODE')
    df.index = pd.Index([x[:-2] for x in df.index])
    df['demux_type'] = df['DROPLET.TYPE'].apply(lambda s: demux_type_dict[s])
    df['assignment'] = ''
    idx_singlet = df['demux_type'] == 'singlet'
    df.loc[idx_singlet, 'assignment'] = df.loc[idx_singlet, 'SNG.BEST.GUESS']
    idx_doublet = df['demux_type'] == 'doublet'
    df.loc[idx_doublet, 'assignment'] = df.loc[
        idx_doublet, 'DBL.BEST.GUESS'].apply(lambda s: ','.join(s.split(',')[:-1]))

    data = io.read_input(input_mat_file)
    data.obs['demux_type'] = ''
    data.obs['assignment'] = ''

    idx = data.obs_names.isin(df.index)
    barcodes = data.obs_names[idx]
    df_valid = df.loc[barcodes, ['demux_type', 'assignment']]
    data.obs.loc[idx, 'demux_type'] = df_valid['demux_type'].values
    data.obs.loc[idx, 'assignment'] = df_valid['assignment'].values

    io.write_output(data, output_zarr_file)
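
# demux_type_dict is a module-level mapping not shown in this excerpt. Assuming the
# popscle/demuxlet DROPLET.TYPE codes SNG/DBL/AMB, a plausible definition (it would
# precede the function in the actual source) is:
demux_type_dict = {"SNG": "singlet", "DBL": "doublet", "AMB": "unknown"}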
Example #11
def run_pipeline(input_rna_file, input_hto_file, output_name, **kwargs):
    # load input rna data
    data = io.read_input(input_rna_file,
                         genome=kwargs["genome"],
                         modality="rna")
    data.concat_data()  # in case of multi-organism mixing data
    rna_key = data.uns["genome"]

    # load input hashing data
    data.update(
        io.read_input(input_hto_file, genome="hashing", modality="hashing"))
    hashing_key = "hashing"

    # Extract rna and hashing data
    rna_data = data.get_data(rna_key)
    hashing_data = data.get_data(hashing_key)

    # Filter the RNA matrix
    rna_data.obs["n_genes"] = rna_data.X.getnnz(axis=1)
    rna_data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
    obs_index = np.logical_and.reduce((
        rna_data.obs["n_genes"] >= kwargs["min_num_genes"],
        rna_data.obs["n_counts"] >= kwargs["min_num_umis"],
    ))
    rna_data._inplace_subset_obs(obs_index)

    # run demuxEM
    estimate_background_probs(hashing_data,
                              random_state=kwargs["random_state"])

    demultiplex(
        rna_data,
        hashing_data,
        min_signal=kwargs["min_signal"],
        alpha=kwargs["alpha"],
        n_threads=kwargs["n_jobs"],
    )

    # annotate raw matrix with demuxEM results
    demux_results = attach_demux_results(input_rna_file, rna_data)

    # generate plots
    if kwargs["gen_plots"]:
        plot_hto_hist(hashing_data,
                      "hto_type",
                      output_name + ".ambient_hashtag.hist.pdf",
                      alpha=1.0)
        plot_bar(
            hashing_data.uns["background_probs"],
            hashing_data.var_names,
            "Sample ID",
            "Background probability",
            output_name + ".background_probabilities.bar.pdf",
        )
        plot_hto_hist(hashing_data,
                      "rna_type",
                      output_name + ".real_content.hist.pdf",
                      alpha=0.5)
        plot_rna_hist(rna_data, output_name + ".rna_demux.hist.pdf")
        logger.info("Diagnostic plots are generated.")

    if len(kwargs["gen_gender_plot"]) > 0:
        rna_data.matrices["raw.X"] = rna_data.X.copy()
        rna_data.as_float()
        scale = 1e5 / rna_data.X.sum(axis=1).A1
        rna_data.X.data *= np.repeat(scale, np.diff(rna_data.X.indptr))
        rna_data.X.data = np.log1p(rna_data.X.data)

        for gene_name in kwargs["gen_gender_plot"]:
            plot_gene_violin(
                rna_data,
                gene_name,
                "{output_name}.{gene_name}.violin.pdf".format(
                    output_name=output_name, gene_name=gene_name),
                title="{gene_name}: a gender-specific gene".format(
                    gene_name=gene_name),
            )

        logger.info(
            "Gender-specific gene expression violin plots are generated.")

    # output results
    io.write_output(demux_results,
                    output_name + "_demux.zarr",
                    zarr_zipstore=True)
    io.write_output(data,
                    output_name + ".out.demuxEM.zarr",
                    zarr_zipstore=True)

    # output summary statistics
    print("\nSummary statistics:")
    print("total\t{}".format(rna_data.shape[0]))
    for name, value in rna_data.obs["demux_type"].value_counts().items():
        print("{}\t{}".format(name, value))
def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool,
                         append_data: UnimodalData, **kwargs) -> None:
    print()
    logger.info(f"Begin to analyze UnimodalData {unidata.get_uid()}.")

    if is_raw:
        # normalize counts and then transform to log space
        tools.log_norm(unidata, kwargs["norm_count"])

        # select highly variable features
        standardize = False  # remains False when HVF selection is skipped
        if kwargs["select_hvf"]:
            if unidata.shape[1] <= kwargs["hvf_ngenes"]:
                logger.warning(
                    f"Number of genes {unidata.shape[1]} is no greater than the target number of highly variable features {kwargs['hvf_ngenes']}. HVF selection is omitted."
                )
            else:
                standardize = True
                tools.highly_variable_features(
                    unidata,
                    kwargs["batch_attr"]
                    if kwargs["batch_correction"] else None,
                    flavor=kwargs["hvf_flavor"],
                    n_top=kwargs["hvf_ngenes"],
                    n_jobs=kwargs["n_jobs"],
                )
                if kwargs["hvf_flavor"] == "pegasus":
                    if kwargs["plot_hvf"] is not None:
                        from pegasus.plotting import hvfplot
                        fig = hvfplot(unidata, return_fig=True)
                        fig.savefig(f"{kwargs['plot_hvf']}.hvf.pdf")

        n_pc = min(kwargs["pca_n"], unidata.shape[0], unidata.shape[1])
        if n_pc < kwargs["pca_n"]:
            logger.warning(
                f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of PCs {kwargs['pca_n']}. Reduce the number of PCs to {n_pc}."
            )

        # Run PCA irrespective of which batch correction method will be applied
        tools.pca(
            unidata,
            n_components=n_pc,
            features="highly_variable_features",
            standardize=standardize,
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
        )
        dim_key = "pca"

        if kwargs["nmf"] or (kwargs["batch_correction"]
                             and kwargs["correction_method"] == "inmf"):
            n_nmf = min(kwargs["nmf_n"], unidata.shape[0], unidata.shape[1])
            if n_nmf < kwargs["nmf_n"]:
                logger.warning(
                    f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of NMF components {kwargs['nmf_n']}. Reduce the number of NMF components to {n_nmf}."
                )

        if kwargs["nmf"]:
            if kwargs["batch_correction"] and kwargs[
                    "correction_method"] == "inmf":
                logger.warning(
                    "NMF is skipped because integrative NMF is run instead.")
            else:
                tools.nmf(
                    unidata,
                    n_components=n_nmf,
                    features="highly_variable_features",
                    n_jobs=kwargs["n_jobs"],
                    random_state=kwargs["random_state"],
                )

        if kwargs["batch_correction"]:
            if kwargs["correction_method"] == "harmony":
                dim_key = tools.run_harmony(
                    unidata,
                    batch=kwargs["batch_attr"],
                    rep="pca",
                    n_jobs=kwargs["n_jobs"],
                    n_clusters=kwargs["harmony_nclusters"],
                    random_state=kwargs["random_state"])
            elif kwargs["correction_method"] == "inmf":
                dim_key = tools.integrative_nmf(
                    unidata,
                    batch=kwargs["batch_attr"],
                    n_components=n_nmf,
                    features="highly_variable_features",
                    lam=kwargs["inmf_lambda"],
                    n_jobs=kwargs["n_jobs"],
                    random_state=kwargs["random_state"])
            elif kwargs["correction_method"] == "scanorama":
                dim_key = tools.run_scanorama(
                    unidata,
                    batch=kwargs["batch_attr"],
                    n_components=n_pc,
                    features="highly_variable_features",
                    standardize=standardize,
                    random_state=kwargs["random_state"])
            else:
                raise ValueError(
                    f"Unknown batch correction method {kwargs['correction_method']}!"
                )

        # Find K neighbors
        tools.neighbors(
            unidata,
            K=kwargs["K"],
            rep=dim_key,
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            full_speed=kwargs["full_speed"],
        )

    if kwargs["calc_sigscore"] is not None:
        sig_files = kwargs["calc_sigscore"].split(",")
        for sig_file in sig_files:
            tools.calc_signature_score(unidata, sig_file)

    # calculate diffmap
    if (kwargs["fle"] or kwargs["net_fle"]):
        if not kwargs["diffmap"]:
            print("Turn on --diffmap option!")
        kwargs["diffmap"] = True

    if kwargs["diffmap"]:
        tools.diffmap(
            unidata,
            n_components=kwargs["diffmap_ndc"],
            rep=dim_key,
            solver=kwargs["diffmap_solver"],
            max_t=kwargs["diffmap_maxt"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
        )

    # calculate kBET
    if ("kBET" in kwargs) and kwargs["kBET"]:
        stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
            unidata,
            kwargs["kBET_batch"],
            rep=dim_key,
            K=kwargs["kBET_K"],
            alpha=kwargs["kBET_alpha"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"])
        print(
            "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}."
            .format(stat_mean, pvalue_mean, accept_rate))

    # clustering
    if kwargs["spectral_louvain"]:
        tools.cluster(
            unidata,
            algo="spectral_louvain",
            rep=dim_key,
            resolution=kwargs["spectral_louvain_resolution"],
            rep_kmeans=kwargs["spectral_louvain_basis"],
            n_clusters=kwargs["spectral_louvain_nclusters"],
            n_clusters2=kwargs["spectral_louvain_nclusters2"],
            n_init=kwargs["spectral_louvain_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            class_label="spectral_louvain_labels",
        )

    if kwargs["spectral_leiden"]:
        tools.cluster(
            unidata,
            algo="spectral_leiden",
            rep=dim_key,
            resolution=kwargs["spectral_leiden_resolution"],
            rep_kmeans=kwargs["spectral_leiden_basis"],
            n_clusters=kwargs["spectral_leiden_nclusters"],
            n_clusters2=kwargs["spectral_leiden_nclusters2"],
            n_init=kwargs["spectral_leiden_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            class_label="spectral_leiden_labels",
        )

    if kwargs["louvain"]:
        tools.cluster(
            unidata,
            algo="louvain",
            rep=dim_key,
            resolution=kwargs["louvain_resolution"],
            random_state=kwargs["random_state"],
            class_label=kwargs["louvain_class_label"],
        )

    if kwargs["leiden"]:
        tools.cluster(
            unidata,
            algo="leiden",
            rep=dim_key,
            resolution=kwargs["leiden_resolution"],
            n_iter=kwargs["leiden_niter"],
            random_state=kwargs["random_state"],
            class_label=kwargs["leiden_class_label"],
        )

    # visualization
    if kwargs["net_umap"]:
        tools.net_umap(
            unidata,
            rep=dim_key,
            n_jobs=kwargs["n_jobs"],
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            full_speed=kwargs["full_speed"],
            net_alpha=kwargs["net_l2"],
            polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
            polish_n_epochs=kwargs["net_umap_polish_nepochs"],
            out_basis=kwargs["net_umap_basis"],
        )

    if kwargs["net_fle"]:
        tools.net_fle(
            unidata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_target_steps=kwargs["net_fle_polish_target_steps"],
            out_basis=kwargs["net_fle_basis"],
        )

    if kwargs["tsne"]:
        tools.tsne(
            unidata,
            rep=dim_key,
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            initialization=kwargs["tsne_init"],
        )

    if kwargs["umap"]:
        tools.umap(
            unidata,
            rep=dim_key,
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            n_jobs=kwargs["n_jobs"],
            full_speed=kwargs["full_speed"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fle"]:
        tools.fle(
            unidata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
        )

    if kwargs["infer_doublets"]:
        channel_attr = "Channel"
        if (channel_attr not in unidata.obs) or (
                unidata.obs["Channel"].cat.categories.size == 1):
            channel_attr = None
        clust_attr = kwargs["dbl_cluster_attr"]
        if (clust_attr is None) or (clust_attr not in unidata.obs):
            clust_attr = None
            for value in [
                    "leiden_labels", "louvain_labels",
                    "spectral_leiden_labels", "spectral_louvain_labels"
            ]:
                if value in unidata.obs:
                    clust_attr = value
                    break

        if channel_attr is not None:
            logger.info(f"For doublet inference, channel_attr={channel_attr}.")
        if clust_attr is not None:
            logger.info(f"For doublet inference, clust_attr={clust_attr}.")

        tools.infer_doublets(
            unidata,
            channel_attr=channel_attr,
            clust_attr=clust_attr,
            expected_doublet_rate=kwargs["expected_doublet_rate"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            plot_hist=output_name)

        dbl_clusts = None
        if clust_attr is not None:
            clusts = []
            for idx, row in unidata.uns["pred_dbl_cluster"].iterrows():
                if row["percentage"] >= 50.0:
                    logger.info(
                        f"Cluster {row['cluster']} (percentage={row['percentage']:.2f}%, q-value={row['qval']:.6g}) is identified as a doublet cluster."
                    )
                    clusts.append(row["cluster"])
            if len(clusts) > 0:
                dbl_clusts = f"{clust_attr}:{','.join(clusts)}"

        tools.mark_doublets(unidata, dbl_clusts=dbl_clusts)

    # calculate diffusion-based pseudotime from roots
    if len(kwargs["pseudotime"]) > 0:
        tools.calc_pseudotime(unidata, kwargs["pseudotime"])

    genome = unidata.uns["genome"]

    if append_data is not None:
        locs = unidata.obs_names.get_indexer(append_data.obs_names)
        idx = locs >= 0
        locs = locs[idx]
        Y = append_data.X[idx, :].tocoo(copy=False)
        Z = coo_matrix((Y.data, (locs[Y.row], Y.col)),
                       shape=(unidata.shape[0], append_data.shape[1])).tocsr()

        idy = Z.getnnz(axis=0) > 0
        n_nonzero = idy.sum()
        if n_nonzero > 0:
            if n_nonzero < append_data.shape[1]:
                Z = Z[:, idy]
                append_df = append_data.feature_metadata.loc[idy, :]
            else:
                append_df = append_data.feature_metadata

            if kwargs["citeseq"]:
                append_df = append_df.copy()
                append_df.index = append_df.index.map(lambda x: f"Ab-{x}")

            rawX = hstack([unidata.get_matrix("counts"), Z], format="csr")

            Zt = Z.astype(np.float32)
            if not kwargs["citeseq"]:
                Zt.data *= np.repeat(unidata.obs["scale"].values,
                                     np.diff(Zt.indptr))
                Zt.data = np.log1p(Zt.data)
            else:
                Zt.data = np.arcsinh(Zt.data / 5.0, dtype=np.float32)

            X = hstack([unidata.get_matrix(unidata.current_matrix()), Zt],
                       format="csr")

            new_genome = unidata.get_genome()
            if new_genome != append_data.get_genome():
                new_genome = f"{new_genome}_and_{append_data.get_genome()}"

            feature_metadata = pd.concat([unidata.feature_metadata, append_df],
                                         axis=0)
            feature_metadata.reset_index(inplace=True)
            _fillna(feature_metadata)
            unidata = UnimodalData(
                unidata.barcode_metadata, feature_metadata, {
                    unidata.current_matrix(): X,
                    "counts": rawX
                }, unidata.uns.mapping, unidata.obsm.mapping,
                unidata.varm.mapping
            )  # uns.mapping, obsm.mapping and varm.mapping are passed by reference
            unidata.uns["genome"] = new_genome

            if kwargs["citeseq"] and kwargs["citeseq_umap"]:
                umap_index = append_df.index.difference(
                    [f"Ab-{x}" for x in kwargs["citeseq_umap_exclude"]])
                unidata.obsm["X_citeseq"] = unidata.X[:,
                                                      unidata.var_names.
                                                      isin(umap_index
                                                           )].toarray()
                tools.umap(
                    unidata,
                    rep="citeseq",
                    n_neighbors=kwargs["umap_K"],
                    min_dist=kwargs["umap_min_dist"],
                    spread=kwargs["umap_spread"],
                    n_jobs=kwargs["n_jobs"],
                    full_speed=kwargs["full_speed"],
                    random_state=kwargs["random_state"],
                    out_basis="citeseq_umap",
                )

    if kwargs["output_h5ad"]:
        import time
        start_time = time.perf_counter()
        adata = unidata.to_anndata()
        if "_tmp_fmat_highly_variable_features" in adata.uns:
            adata.uns["scale.data"] = adata.uns.pop(
                "_tmp_fmat_highly_variable_features")  # assign by reference
            adata.uns["scale.data.rownames"] = unidata.var_names[
                unidata.var["highly_variable_features"] == True].values
        adata.write(f"{output_name}.h5ad", compression="gzip")
        del adata
        end_time = time.perf_counter()
        logger.info(
            f"H5AD file {output_name}.h5ad is written. Time spent = {end_time - start_time:.2f}s."
        )

    # write out results
    if kwargs["output_loom"]:
        write_output(unidata, f"{output_name}.loom")

    # Change genome name back if append_data is True
    if unidata.uns["genome"] != genome:
        unidata.uns["genome"] = genome
    # Eliminate objects starting with _tmp from uns
    unidata.uns.pop("_tmp_fmat_highly_variable_features", None)
Example #13
def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool,
                         append_data: UnimodalData, **kwargs) -> None:
    print()
    logger.info(f"Begin to analyze UnimodalData {unidata.get_uid()}.")
    if kwargs["channel_attr"] is not None:
        unidata.obs["Channel"] = unidata.obs[kwargs["channel_attr"]]

    if is_raw:
        # normalize counts and then transform to log space
        tools.log_norm(unidata, kwargs["norm_count"])
        # set group attribute
        if kwargs["batch_correction"] and kwargs["group_attribute"] is not None:
            tools.set_group_attribute(unidata, kwargs["group_attribute"])

    # select highly variable features
    standardize = False  # remains False when HVF selection is skipped
    if kwargs["select_hvf"]:
        if unidata.shape[1] <= kwargs["hvf_ngenes"]:
            logger.warning(
                f"Number of genes {unidata.shape[1]} is no greater than the target number of highly variable features {kwargs['hvf_ngenes']}. HVF selection is omitted."
            )
        else:
            standardize = True
            tools.highly_variable_features(
                unidata,
                kwargs["batch_correction"],
                flavor=kwargs["hvf_flavor"],
                n_top=kwargs["hvf_ngenes"],
                n_jobs=kwargs["n_jobs"],
            )
            if kwargs["hvf_flavor"] == "pegasus":
                if kwargs["plot_hvf"] is not None:
                    from pegasus.plotting import hvfplot
                    fig = hvfplot(unidata, return_fig=True)
                    fig.savefig(f"{kwargs['plot_hvf']}.hvf.pdf")

    # batch correction: L/S
    if kwargs["batch_correction"] and kwargs["correction_method"] == "L/S":
        tools.correct_batch(unidata, features="highly_variable_features")

    if kwargs["calc_sigscore"] is not None:
        sig_files = kwargs["calc_sigscore"].split(",")
        for sig_file in sig_files:
            tools.calc_signature_score(unidata, sig_file)

    n_pc = min(kwargs["pca_n"], unidata.shape[0], unidata.shape[1])
    if n_pc < kwargs["pca_n"]:
        logger.warning(
            f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of PCs {kwargs['pca_n']}. Reduce the number of PCs to {n_pc}."
        )

    if kwargs["batch_correction"] and kwargs[
            "correction_method"] == "scanorama":
        pca_key = tools.run_scanorama(unidata,
                                      n_components=n_pc,
                                      features="highly_variable_features",
                                      standardize=standardize,
                                      random_state=kwargs["random_state"])
    else:
        # PCA
        tools.pca(
            unidata,
            n_components=n_pc,
            features="highly_variable_features",
            standardize=standardize,
            robust=kwargs["pca_robust"],
            random_state=kwargs["random_state"],
        )
        pca_key = "pca"

    # batch correction: Harmony
    if kwargs["batch_correction"] and kwargs["correction_method"] == "harmony":
        pca_key = tools.run_harmony(unidata,
                                    rep="pca",
                                    n_jobs=kwargs["n_jobs"],
                                    n_clusters=kwargs["harmony_nclusters"],
                                    random_state=kwargs["random_state"])

    # Find K neighbors
    tools.neighbors(
        unidata,
        K=kwargs["K"],
        rep=pca_key,
        n_jobs=kwargs["n_jobs"],
        random_state=kwargs["random_state"],
        full_speed=kwargs["full_speed"],
    )

    # calculate diffmap
    if (kwargs["fle"] or kwargs["net_fle"]):
        if not kwargs["diffmap"]:
            print("Turn on --diffmap option!")
        kwargs["diffmap"] = True

    if kwargs["diffmap"]:
        tools.diffmap(
            unidata,
            n_components=kwargs["diffmap_ndc"],
            rep=pca_key,
            solver=kwargs["diffmap_solver"],
            random_state=kwargs["random_state"],
            max_t=kwargs["diffmap_maxt"],
        )
        if kwargs["diffmap_to_3d"]:
            tools.reduce_diffmap_to_3d(unidata,
                                       random_state=kwargs["random_state"])

    # calculate kBET
    if ("kBET" in kwargs) and kwargs["kBET"]:
        stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
            unidata,
            kwargs["kBET_batch"],
            rep=pca_key,
            K=kwargs["kBET_K"],
            alpha=kwargs["kBET_alpha"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"])
        print(
            "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}."
            .format(stat_mean, pvalue_mean, accept_rate))

    # clustering
    if kwargs["spectral_louvain"]:
        tools.cluster(
            unidata,
            algo="spectral_louvain",
            rep=pca_key,
            resolution=kwargs["spectral_louvain_resolution"],
            rep_kmeans=kwargs["spectral_louvain_basis"],
            n_clusters=kwargs["spectral_louvain_nclusters"],
            n_clusters2=kwargs["spectral_louvain_nclusters2"],
            n_init=kwargs["spectral_louvain_ninit"],
            random_state=kwargs["random_state"],
            class_label="spectral_louvain_labels",
        )

    if kwargs["spectral_leiden"]:
        tools.cluster(
            unidata,
            algo="spectral_leiden",
            rep=pca_key,
            resolution=kwargs["spectral_leiden_resolution"],
            rep_kmeans=kwargs["spectral_leiden_basis"],
            n_clusters=kwargs["spectral_leiden_nclusters"],
            n_clusters2=kwargs["spectral_leiden_nclusters2"],
            n_init=kwargs["spectral_leiden_ninit"],
            random_state=kwargs["random_state"],
            class_label="spectral_leiden_labels",
        )

    if kwargs["louvain"]:
        tools.cluster(
            unidata,
            algo="louvain",
            rep=pca_key,
            resolution=kwargs["louvain_resolution"],
            random_state=kwargs["random_state"],
            class_label=kwargs["louvain_class_label"],
        )

    if kwargs["leiden"]:
        tools.cluster(
            unidata,
            algo="leiden",
            rep=pca_key,
            resolution=kwargs["leiden_resolution"],
            n_iter=kwargs["leiden_niter"],
            random_state=kwargs["random_state"],
            class_label=kwargs["leiden_class_label"],
        )

    # visualization
    if kwargs["net_tsne"]:
        tools.net_tsne(
            unidata,
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_learning_frac=kwargs["net_tsne_polish_learing_frac"],
            polish_n_iter=kwargs["net_tsne_polish_niter"],
            out_basis=kwargs["net_tsne_basis"],
        )

    if kwargs["net_umap"]:
        tools.net_umap(
            unidata,
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            full_speed=kwargs["full_speed"],
            net_alpha=kwargs["net_l2"],
            polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
            polish_n_epochs=kwargs["net_umap_polish_nepochs"],
            out_basis=kwargs["net_umap_basis"],
        )

    if kwargs["net_fle"]:
        tools.net_fle(
            unidata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_target_steps=kwargs["net_fle_polish_target_steps"],
            out_basis=kwargs["net_fle_basis"],
        )

    if kwargs["tsne"]:
        tools.tsne(
            unidata,
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fitsne"]:
        tools.fitsne(
            unidata,
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["umap"]:
        tools.umap(
            unidata,
            rep=pca_key,
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fle"]:
        tools.fle(
            unidata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
        )

    # calculate diffusion-based pseudotime from roots
    if len(kwargs["pseudotime"]) > 0:
        tools.calc_pseudotime(unidata, kwargs["pseudotime"])

    genome = unidata.uns["genome"]

    if append_data is not None:
        locs = unidata.obs_names.get_indexer(append_data.obs_names)
        idx = locs >= 0
        locs = locs[idx]
        Y = append_data.X[idx, :].tocoo(copy=False)
        Z = coo_matrix((Y.data, (locs[Y.row], Y.col)),
                       shape=(unidata.shape[0], append_data.shape[1])).tocsr()

        idy = Z.getnnz(axis=0) > 0
        n_nonzero = idy.sum()
        if n_nonzero > 0:
            if n_nonzero < append_data.shape[1]:
                Z = Z[:, idy]
                append_df = append_data.feature_metadata.loc[idy, :]
            else:
                append_df = append_data.feature_metadata

            rawX = hstack([unidata.get_matrix("raw.X"), Z], format="csr")

            Zt = Z.astype(np.float32)
            Zt.data *= np.repeat(unidata.obs["scale"].values,
                                 np.diff(Zt.indptr))
            Zt.data = np.log1p(Zt.data)

            X = hstack([unidata.get_matrix("X"), Zt], format="csr")

            new_genome = f"{unidata.get_genome()}_and_{append_data.get_genome()}"

            feature_metadata = pd.concat([unidata.feature_metadata, append_df],
                                         axis=0)
            feature_metadata.reset_index(inplace=True)
            feature_metadata.fillna(
                value=_get_fillna_dict(unidata.feature_metadata), inplace=True)

            unidata = UnimodalData(
                unidata.barcode_metadata, feature_metadata, {
                    "X": X,
                    "raw.X": rawX
                }, unidata.uns.mapping, unidata.obsm.mapping,
                unidata.varm.mapping
            )  # uns.mapping, obsm.mapping and varm.mapping are passed by reference
            unidata.uns["genome"] = new_genome

    if kwargs["output_h5ad"]:
        adata = unidata.to_anndata()
        adata.uns["scale.data"] = adata.uns.pop(
            "_tmp_fmat_highly_variable_features")  # assign by reference
        adata.uns["scale.data.rownames"] = unidata.var_names[
            unidata.var["highly_variable_features"]].values
        adata.write(f"{output_name}.h5ad", compression="gzip")
        del adata

    # write out results
    if kwargs["output_loom"]:
        write_output(unidata, f"{output_name}.loom")

    # Change genome name back if append_data is True
    if unidata.uns["genome"] != genome:
        unidata.uns["genome"] = genome
    # Eliminate temporary objects (prefixed with _tmp) from uns
    unidata.uns.pop("_tmp_fmat_highly_variable_features", None)