Example #1
def plot_down_sampling(
    rna_file,
    adt_file,
    out_file,
    probs=[i / 10.0 for i in range(9, 0, -1)],
    n_threads=1,
    dpi=500,
    figsize=None,
):
    data_gt = read_input(rna_file)
    adt_gt = read_input(adt_file)
    fracs, accuracy = down_sampling(data_gt,
                                    adt_gt,
                                    probs,
                                    n_threads=n_threads)
    plt.plot(fracs, accuracy, ".-")
    ax = plt.gca()
    ax.set_xlim(1.0, 0.0)
    ax.set_ylim(0.79, 1.01)
    vals = ax.get_yticks()
    ax.set_yticklabels(["{:.0%}".format(v) for v in vals])
    ax.set_xlabel("Fraction of hashtag UMIs")
    ax.set_ylabel("Consistency")
    if figsize is not None:
        plt.gcf().set_size_inches(*figsize)
    plt.savefig(out_file, dpi=dpi)
    plt.close()
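A hypothetical invocation for reference; the file names and keyword values below are placeholders, not from the source:

# plot_down_sampling("rna.h5ad", "adt.h5ad", "down_sampling.pdf",
#                    n_threads=4, figsize=(6, 4))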
Example #2
def show_attributes(
    input_file: str,
    show_attributes: bool,
    show_gene_attributes: bool,
    show_values_for_attributes: str,
) -> None:
    """ Show data attributes. For command line use.
    """

    data = read_input(input_file, h5ad_mode="r")
    if show_attributes:
        print(
            "Available sample attributes in input dataset: {0}".format(
                ", ".join(data.obs.columns.values)
            )
        )
    if show_gene_attributes:
        print(
            "Available gene attributes in input dataset: {0}".format(
                ", ".join(data.var.columns.values)
            )
        )
    if show_values_for_attributes is not None:
        for attr in show_values_for_attributes.split(","):
            print(
                "Available values for attribute {0}: {1}.".format(
                    attr, ", ".join(np.unique(data.obs[attr]))
                )
            )
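A hypothetical command-line style call; the attribute names are made up for illustration:

# show_attributes("data.h5ad", show_attributes=False, show_gene_attributes=False,
#                 show_values_for_attributes="Channel,assignment")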
Example #3
def run_annotate_cluster(
    input_file: str,
    output_file: str,
    marker_file: str,
    de_test: str,
    de_alpha: float = 0.05,
    de_key: str = "de_res",
    threshold: float = 0.5,
    ignore_nonde: bool = False,
) -> None:
    """ For command line use.
    """
    import time
    from sccloud.io import read_input

    start = time.time()
    data = read_input(input_file, h5ad_mode="r")
    infer_cell_types(
        data,
        marker_file,
        de_test,
        de_alpha=de_alpha,
        de_key=de_key,
        threshold=threshold,
        ignore_nonde=ignore_nonde,
        output_file=output_file,
    )
    data.file.close()
    end = time.time()
    logger.info("Time spent for annotating clusters is {:.2f}s.".format(end - start))
Example #4
def make_interactive_plots(input_file, plot_type, output_file, **kwargs):
    adata = read_input(input_file, h5ad_mode="r")
    basis = transform_basis(plot_type)
    if plot_type == "diffmap" or plot_type == "diffmap_pca":
        df = pd.DataFrame(
            adata.obsm["X_{}".format(plot_type)][:, 0:3],
            index=adata.obs.index,
            columns=[basis + i for i in ["1", "2", "3"]],
        )
        if kwargs["isgene"]:
            coln = adata.var.index.get_loc(kwargs["attr"])
            df.insert(0, "Annotation", adata.X[:, coln].toarray().ravel())
        else:
            df.insert(0, "Annotation", adata.obs[kwargs["attr"]])
        if not kwargs["isreal"]:
            iplot_library.scatter3d(df, output_file)
        else:
            iplot_library.scatter3d_real(df, output_file, kwargs["log10"])
    else:
        df = pd.DataFrame(
            adata.obsm["X_{}".format(plot_type)],
            index=adata.obs.index,
            columns=[basis + i for i in ["1", "2"]],
        )
        if kwargs["isgene"]:
            coln = adata.var.index.get_loc(kwargs["attr"])
            df.insert(0, "Annotation", adata.X[:, coln].toarray().ravel())
        else:
            df.insert(0, "Annotation", adata.obs[kwargs["attr"]])
        if not kwargs["isreal"]:
            iplot_library.scatter(df, output_file)
        else:
            iplot_library.scatter_real(df, output_file, kwargs["log10"])
    print(output_file + " is generated.")
    adata.file.close()
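A hypothetical call; the file names are placeholders, while the keyword arguments match the kwargs keys read inside the function:

# make_interactive_plots("result.h5ad", "tsne", "tsne.html",
#                        isgene=True, attr="CD3E", isreal=True, log10=False)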
Example #5
def merge_rna_and_adt_data(input_raw_h5, input_csv, antibody_control_csv,
                           output_name):
    data = read_input(input_raw_h5, return_type="MemData")
    print("Loaded the RNA matrix.")

    keyword = "CITE_Seq_" + data.listKeys()[0]
    data_citeseq = read_input(input_csv, return_type="MemData", genome=keyword)
    print("Loaded the ADT matrix.")

    array2d = data_citeseq.getData(keyword)
    if antibody_control_csv is None:
        array2d.matrix = array2d.matrix.log1p()
    else:
        size = array2d.feature_metadata.shape[0]
        idx = np.zeros(size, dtype=bool)
        antibody_to_pos = pd.Series(data=range(size),
                                    index=array2d.feature_metadata.index)

        adt_matrix = array2d.matrix.toarray().astype(float)

        # read_csv(squeeze=True) and Series.iteritems() were removed in pandas 2.0
        series = pd.read_csv(antibody_control_csv, header=0,
                             index_col=0).squeeze("columns")
        for antibody, control in series.items():
            pos_a = antibody_to_pos[antibody]
            pos_c = antibody_to_pos[control]
            idx[pos_a] = True
            # log-transform and subtract the matched control's log signal, floored at 0
            adt_matrix[:, pos_a] = np.maximum(
                np.log(adt_matrix[:, pos_a] + 1.0) -
                np.log(adt_matrix[:, pos_c] + 1.0),
                0.0,
            )

        array2d.feature_metadata = array2d.feature_metadata[idx]
        array2d.matrix = csr_matrix(adt_matrix[:, idx])

    data.addData(keyword, array2d)
    write_output(data, output_name)

    print("Merged output is written.")
Example #6
def annotate_anndata_object(input_file: str, annotation: str) -> None:
    """ For command line use.
        annotation:  anno_name:clust_name:cell_type1;...cell_typen
    """
    from sccloud.io import read_input, write_output

    data = read_input(input_file, h5ad_mode="r+")
    anno_name, clust_name, anno_str = annotation.split(":")
    anno_dict = {str(i + 1): x for i, x in enumerate(anno_str.split(";"))}
    annotate(data, anno_name, clust_name, anno_dict)
    write_output(data, input_file, whitelist=["obs"])
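For illustration, a hypothetical annotation string and the dict it parses into (cluster labels are 1-based, as produced by the dict comprehension above):

# annotate_anndata_object("result.h5ad", "anno:louvain_labels:T cells;B cells;NK cells")
# -> anno_dict == {"1": "T cells", "2": "B cells", "3": "NK cells"}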
Example #7
def run_conversion(input_h5ad_file, output_name, nthreads):
    start = time.time()
    data = read_input(input_h5ad_file)
    end = time.time()
    print("Time spent for loading the expression matrix is {:.2f}s.".format(
        end - start))

    start = time.time()
    convert_to_parquet(data, output_name, nthreads)
    end = time.time()
    print(
        "Time spent on generating the PARQUET file is {:.2f}s.".format(end -
                                                                       start))
Example #8
def run_de_analysis(
    input_file: str,
    output_excel_file: str,
    cluster: str,
    result_key: str = "de_res",
    n_jobs: int = -1,
    auc: bool = True,
    t: bool = True,
    fisher: bool = False,
    mwu: bool = False,
    temp_folder: str = None,
    verbose: bool = True,
    alpha: float = 0.05,
    ndigits: int = 3,
) -> None:
    """ For command line only
    """
    start = time.time()

    from sccloud.io import read_input, write_output

    data = read_input(input_file, h5ad_mode="r+")

    de_analysis(
        data,
        cluster,
        result_key=result_key,
        n_jobs=n_jobs,
        auc=auc,
        t=t,
        fisher=fisher,
        mwu=mwu,
        temp_folder=temp_folder,
        verbose=verbose,
    )

    write_output(data, input_file, whitelist=["varm/{}".format(result_key)])
    logger.info(
        "Differential expression results are written to varm/{} in h5ad file.".
        format(result_key))

    results = markers(data, de_key=result_key, alpha=alpha)

    write_results_to_excel(results, output_excel_file, ndigits=ndigits)

    end = time.time()
    logger.info("run_de_analysis is finished in {:.2f}s.".format(end - start))
Example #9
def make_static_plots(input_file, plot_type, output_file, dpi=500, **kwargs):
    adata = read_input(input_file, h5ad_mode="r")

    if plot_type == "qc_violin":
        if kwargs["attr"] is None:
            plot_qc_violin(
                adata,
                kwargs["qc_type"],
                output_file,
                xattr=kwargs["cluster"],
                xlabel=kwargs["cluster"],
                xtick_font=kwargs["qc_xtick_font"],
                xtick_rotation=kwargs["qc_xtick_rotation"],
                figsize=kwargs["subplot_size"],
                linewidth=kwargs["qc_line_width"],
            )
        else:
            plot_qc_violin(
                adata,
                kwargs["qc_type"],
                output_file,
                xattr=kwargs["cluster"],
                hue=kwargs["attr"],
                xlabel=kwargs["cluster"],
                xtick_font=kwargs["qc_xtick_font"],
                xtick_rotation=kwargs["qc_xtick_rotation"],
                split=True,
                figsize=kwargs["subplot_size"],
                linewidth=kwargs["qc_line_width"],
            )
    else:
        fig = getattr(plot_library, "plot_" + plot_type)(adata, **kwargs)
        fig.savefig(output_file, dpi=dpi)

    print(output_file + " is generated.")
    adata.file.close()
Example #10
def run_pipeline(input_file, output_name, **kwargs):
    is_raw = not kwargs["processed"]

    if "seurat_compatible" not in kwargs:
        kwargs["seurat_compatible"] = False

    # load input data
    adata = io.read_input(
        input_file,
        genome=kwargs["genome"],
        concat_matrices=not kwargs["cite_seq"],
        h5ad_mode=("a" if (is_raw or kwargs["subcluster"]) else "r+"),
        select_singlets=kwargs["select_singlets"],
        channel_attr=kwargs["channel_attr"],
        black_list=(kwargs["black_list"].split(",")
                    if kwargs["black_list"] is not None else []),
    )

    if not kwargs["cite_seq"]:
        if is_raw:
            values = adata.X.getnnz(axis=1)
            if values.min() == 0:  # 10x raw data
                adata._inplace_subset_obs(values >= kwargs["min_genes_on_raw"])
    else:
        data_list = adata
        assert len(data_list) == 2
        adata = cdata = None
        for i in range(len(data_list)):
            if data_list[i].uns["genome"].startswith("CITE_Seq"):
                cdata = data_list[i]
            else:
                adata = data_list[i]
        assert adata is not None and cdata is not None
    print("Inputs are loaded.")

    if kwargs["seurat_compatible"]:
        assert is_raw and kwargs["select_hvf"]

    if kwargs["subcluster"]:
        adata = tools.get_anndata_for_subclustering(
            adata, kwargs["subset_selections"])
        is_raw = True  # get submat and then set is_raw to True

    if is_raw:
        if not kwargs["subcluster"]:
            # filter out low quality cells/genes
            tools.run_filter_data(
                adata,
                output_filt=kwargs["output_filt"],
                plot_filt=kwargs["plot_filt"],
                plot_filt_figsize=kwargs["plot_filt_figsize"],
                mito_prefix=kwargs["mito_prefix"],
                min_genes=kwargs["min_genes"],
                max_genes=kwargs["max_genes"],
                min_umis=kwargs["min_umis"],
                max_umis=kwargs["max_umis"],
                percent_mito=kwargs["percent_mito"],
                percent_cells=kwargs["percent_cells"],
            )

            if kwargs["seurat_compatible"]:
                raw_data = adata.copy()  # raw as count

            # normalize counts and then transform to log space
            tools.log_norm(adata, kwargs["norm_count"])

            # set group attribute
            if kwargs["batch_correction"] and kwargs[
                    "group_attribute"] is not None:
                tools.set_group_attribute(adata, kwargs["group_attribute"])

        # select highly variable features
        if kwargs["select_hvf"]:
            tools.highly_variable_features(
                adata,
                kwargs["batch_correction"],
                flavor=kwargs["hvf_flavor"],
                n_top=kwargs["hvf_ngenes"],
                n_jobs=kwargs["n_jobs"],
            )
            if kwargs["hvf_flavor"] == "sccloud":
                if kwargs["plot_hvf"] is not None:
                    from sccloud.plotting import plot_hvf

                    robust_idx = adata.var["robust"].values
                    plot_hvf(
                        adata.var.loc[robust_idx, "mean"],
                        adata.var.loc[robust_idx, "var"],
                        adata.var.loc[robust_idx, "hvf_loess"],
                        adata.var.loc[robust_idx, "highly_variable_features"],
                        kwargs["plot_hvf"] + ".hvf.pdf",
                    )

        # batch correction
        if kwargs["batch_correction"]:
            tools.correct_batch(adata, features="highly_variable_features")

        # PCA
        tools.pca(
            adata,
            n_components=kwargs["nPC"],
            features="highly_variable_features",
            random_state=kwargs["random_state"],
        )

        # Find K neighbors
        tools.neighbors(
            adata,
            K=kwargs["K"],
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            full_speed=kwargs["full_speed"],
        )

        # calculate diffmap
        if (kwargs["fle"] or kwargs["net_fle"]):
            if not kwargs["diffmap"]:
                print("Turn on --diffmap option!")
            kwargs["diffmap"] = True

        if kwargs["diffmap"]:
            tools.diffmap(
                adata,
                n_components=kwargs["diffmap_ndc"],
                rep="pca",
                solver=kwargs["diffmap_solver"],
                random_state=kwargs["random_state"],
                max_t=kwargs["diffmap_maxt"],
            )
            if kwargs["diffmap_to_3d"]:
                tools.reduce_diffmap_to_3d(adata,
                                           random_state=kwargs["random_state"])

    # calculate kBET
    if ("kBET" in kwargs) and kwargs["kBET"]:
        stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
            adata,
            kwargs["kBET_batch"],
            K=kwargs["kBET_K"],
            alpha=kwargs["kBET_alpha"],
            n_jobs=kwargs["n_jobs"],
        )
        print(
            "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}."
            .format(stat_mean, pvalue_mean, accept_rate))

    # clustering
    if kwargs["spectral_louvain"]:
        tools.spectral_louvain(
            adata,
            rep="pca",
            resolution=kwargs["spectral_louvain_resolution"],
            rep_kmeans=kwargs["spectral_louvain_basis"],
            n_clusters=kwargs["spectral_louvain_nclusters"],
            n_init=kwargs["spectral_louvain_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            temp_folder=kwargs["temp_folder"],
            class_label="spectral_louvain_labels",
        )

    if kwargs["spectral_leiden"]:
        tools.spectral_leiden(
            adata,
            rep="pca",
            resolution=kwargs["spectral_leiden_resolution"],
            rep_kmeans=kwargs["spectral_leiden_basis"],
            n_clusters=kwargs["spectral_leiden_nclusters"],
            n_init=kwargs["spectral_leiden_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            temp_folder=kwargs["temp_folder"],
            class_label="spectral_leiden_labels",
        )

    if kwargs["louvain"]:
        tools.louvain(
            adata,
            rep="pca",
            resolution=kwargs["louvain_resolution"],
            random_state=kwargs["random_state"],
            class_label=kwargs["louvain_class_label"],
        )

    if kwargs["leiden"]:
        tools.leiden(
            adata,
            rep="pca",
            resolution=kwargs["leiden_resolution"],
            n_iter=kwargs["leiden_niter"],
            random_state=kwargs["random_state"],
            class_label=kwargs["leiden_class_label"],
        )

    # visualization
    if kwargs["net_tsne"]:
        tools.net_tsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_learning_frac=kwargs["net_tsne_polish_learing_frac"],
            polish_n_iter=kwargs["net_tsne_polish_niter"],
            out_basis=kwargs["net_tsne_basis"],
        )

    if kwargs["net_umap"]:
        tools.net_umap(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            full_speed=kwargs["full_speed"],
            net_alpha=kwargs["net_l2"],
            polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
            polish_n_epochs=kwargs["net_umap_polish_nepochs"],
            out_basis=kwargs["net_umap_basis"],
        )

    if kwargs["net_fle"]:
        tools.net_fle(
            adata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_target_steps=kwargs["net_fle_polish_target_steps"],
            out_basis=kwargs["net_fle_basis"],
        )

    if kwargs["tsne"]:
        tools.tsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fitsne"]:
        tools.fitsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["umap"]:
        tools.umap(
            adata,
            rep="pca",
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fle"]:
        tools.fle(
            adata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
        )

    # calculate diffusion-based pseudotime from roots
    if len(kwargs["pseudotime"]) > 0:
        tools.calc_pseudotime(adata, kwargs["pseudotime"])

    # merge cite-seq data and run t-SNE
    if kwargs["cite_seq"]:
        adt_matrix = np.zeros((adata.shape[0], cdata.shape[1]),
                              dtype="float32")
        idx = adata.obs_names.isin(cdata.obs_names)
        adt_matrix[idx, :] = cdata[adata.obs_names[idx], ].X.toarray()
        if abs(100.0 - kwargs["cite_seq_capping"]) > 1e-4:
            cite_seq.capping(adt_matrix, kwargs["cite_seq_capping"])

        var_names = np.concatenate(
            [adata.var_names, ["AD-" + x for x in cdata.var_names]])

        new_data = anndata.AnnData(
            X=hstack([adata.X, csr_matrix(adt_matrix)], format="csr"),
            obs=adata.obs,
            obsm=adata.obsm,
            uns=adata.uns,
            var={
                "var_names": var_names,
                "gene_ids": var_names,
                "n_cells": np.concatenate(
                    [adata.var["n_cells"].values, [0] * cdata.shape[1]]),
                "percent_cells": np.concatenate(
                    [adata.var["percent_cells"].values,
                     [0.0] * cdata.shape[1]]),
                "robust": np.concatenate(
                    [adata.var["robust"].values, [False] * cdata.shape[1]]),
                "highly_variable_features": np.concatenate(
                    [adata.var["highly_variable_features"].values,
                     [False] * cdata.shape[1]]),
            },
        )
        new_data.obsm["X_CITE-Seq"] = adt_matrix
        adata = new_data
        print("ADT count matrix is attached.")

        tools.fitsne(
            adata,
            rep="CITE-Seq",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            out_basis="citeseq_fitsne",
        )
        print("Antibody embedding is done.")

    if kwargs["seurat_compatible"]:
        seurat_data = adata.copy()
        seurat_data.raw = raw_data
        seurat_data.uns["scale.data"] = adata.uns[
            "fmat_highly_variable_features"]
        seurat_data.uns["scale.data.rownames"] = adata.var_names[
            adata.var["highly_variable_features"]].values
        io.write_output(seurat_data, output_name + ".seurat.h5ad")

    # write out results
    io.write_output(adata, output_name + ".h5ad")

    if kwargs["output_loom"]:
        io.write_output(adata, output_name + ".loom")

    print("Results are written.")
Example #11
def run_demuxEM_pipeline(input_adt_file, input_rna_file, output_name, **kwargs):
    # load input data
    adt = io.read_input(input_adt_file, genome="_ADT_")
    print("ADT file is loaded.")
    data = io.read_input(input_rna_file, genome=kwargs["genome"], concat_matrices=True)
    print("RNA file is loaded.")

    # Filter the RNA matrix
    data.obs["n_genes"] = data.X.getnnz(axis=1)
    data.obs["n_counts"] = data.X.sum(axis=1).A1
    obs_index = np.logical_and.reduce(
        (
            data.obs["n_genes"] >= kwargs["min_num_genes"],
            data.obs["n_counts"] >= kwargs["min_num_umis"],
        )
    )
    data._inplace_subset_obs(obs_index)
    data.var["robust"] = True

    # run demuxEM
    demuxEM.estimate_background_probs(adt, random_state=kwargs["random_state"])
    print("Background probability distribution is estimated.")
    demuxEM.demultiplex(
        data,
        adt,
        min_signal=kwargs["min_signal"],
        alpha=kwargs["alpha"],
        n_threads=kwargs["n_jobs"],
    )
    print("Demultiplexing is done.")

    # annotate raw matrix with demuxEM results
    genome_indexed_raw_data = io.read_input(
        input_rna_file, return_type="MemData", concat_matrices=False
    )
    for keyword in genome_indexed_raw_data.listKeys():
        array2d = genome_indexed_raw_data.getData(keyword)
        barcodes = array2d.barcode_metadata.index
        idx = barcodes.isin(data.obs_names)
        selected = barcodes[idx]

        demux_type = np.empty(barcodes.size, dtype="object")
        demux_type[:] = ""
        demux_type[idx] = data.obs.loc[selected, "demux_type"]
        array2d.barcode_metadata["demux_type"] = demux_type

        assignment = np.empty(barcodes.size, dtype="object")
        assignment[:] = ""
        assignment[idx] = data.obs.loc[selected, "assignment"]
        array2d.barcode_metadata["assignment"] = assignment

        if "assignment.dedup" in data.obs:
            assignment_dedup = np.empty(barcodes.size, dtype="object")
            assignment_dedup[:] = ""
            assignment_dedup[idx] = data.obs.loc[selected, "assignment.dedup"]
            array2d.barcode_metadata["assignment.dedup"] = assignment_dedup

    print("Demultiplexing results are added to raw expression matrices.")

    # generate plots
    if kwargs["gen_plots"]:
        demuxEM.plot_adt_hist(
            adt, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0
        )
        demuxEM.plot_bar(
            adt.uns["background_probs"],
            adt.var_names,
            "Sample ID",
            "Background probability",
            output_name + ".background_probabilities.bar.pdf",
        )
        demuxEM.plot_adt_hist(
            adt, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5
        )
        demuxEM.plot_rna_hist(data, output_name + ".rna_demux.hist.pdf")
        print("Diagnostic plots are generated.")

    if len(kwargs["gen_gender_plot"]) > 0:
        tools.log_norm(data, 1e5)
        for gene_name in kwargs["gen_gender_plot"]:
            demuxEM.plot_violin(
                data,
                {"gene": gene_name},
                "{output_name}.{gene_name}.violin.pdf".format(
                    output_name=output_name, gene_name=gene_name
                ),
                title="{gene_name}: a gender-specific gene".format(gene_name=gene_name),
            )
        print("Gender-specific gene expression violin plots are generated.")

    # output results
    io.write_output(adt, output_name + "_ADTs.h5ad")
    print(
        "Hashtag count information is written to {output_name}_ADTs.h5ad .".format(
            output_name=output_name
        )
    )
    io.write_output(data, output_name + "_demux.h5ad")
    print(
        "Demutiplexed RNA expression information is written to {output_name}_demux.h5ad .".format(
            output_name=output_name
        )
    )
    io.write_output(genome_indexed_raw_data, output_name + "_demux")
    print(
        "Raw sccloud-format hdf5 file with demultiplexing results is written to {output_name}_demux.h5sc .".format(
            output_name=output_name
        )
    )

    # output summary statistics
    print("\nSummary statistics:")
    print("total\t{}".format(data.shape[0]))
    for name, value in data.obs["demux_type"].value_counts().items():
        print("{}\t{}".format(name, value))
Example #12
def aggregate_matrices(
    csv_file: str,
    what_to_return: str = "AnnData",
    restrictions: List[str] = [],
    attributes: List[str] = [],
    google_cloud: bool = False,
    select_singlets: bool = False,
    ngene: int = None,
    concat_matrices: bool = False,
) -> "None or AnnData or MemData":
    """Aggregate channel-specific count matrices into one big count matrix.

    This function takes as input a csv_file with at least two columns: Sample, the sample name, and Location, the path to the file containing the count matrices (e.g. filtered_gene_bc_matrices_h5.h5). Matrices from the same genome are merged together. Depending on what_to_return, the merged results are either written to a sccloud-formatted HDF5 file or returned as an AnnData or MemData object.

    Parameters
    ----------

    csv_file : `str`
        The CSV file containing information about each channel.
    what_to_return : `str`, optional (default: 'AnnData')
        If this value is equal to 'AnnData' or 'MemData', an AnnData or MemData object will be returned. Otherwise, results will be written into 'what_to_return.sccloud.h5' file and None is returned.
    restrictions : `list[str]`, optional (default: [])
        A list of restrictions used to select channels. Each restriction takes the format name:value,...,value or name:~value,...,value, where ~ negates the selection.
    attributes : `list[str]`, optional (default: [])
        A list of attributes to be incorporated into the output count matrix.
    google_cloud : `bool`, optional (default: False)
        Whether the channel-specific count matrices are stored in a Google bucket.
    select_singlets : `bool`, optional (default: False)
        For demultiplexed data, turning on this option makes sccloud include only barcodes predicted as singlets.
    ngene : `int`, optional (default: None)
        The minimum number of expressed genes required to keep a barcode.
    concat_matrices : `bool`, optional (default: False)
        Whether to concatenate multiple matrices. If so, only one AnnData object is returned; otherwise, a list of AnnData objects may be returned.

    Returns
    -------
    `None` or `AnnData` or `MemData`
        Either `None` or an `AnnData` object or a `MemData` object.

    Examples
    --------
    >>> scc.aggregate_matrix('example.csv', 'example_10x.h5', ['Source:pbmc', 'Donor:1'], ['Source', 'Platform', 'Donor'])
    """

    df = pd.read_csv(csv_file, header=0, index_col="Sample")
    df["Sample"] = df.index

    # Select channels
    rvec = [parse_restriction_string(x) for x in restrictions]

    idx = pd.Series([True] * df.shape[0], index=df.index, name="selected")
    for name, isin, content in rvec:
        assert name in df.columns
        if isin:
            idx = idx & df[name].isin(content)
        else:
            idx = idx & (~(df[name].isin(content)))

    df = df.loc[idx]

    if df.shape[0] == 0:
        raise ValueError("No channels pass the restrictions!")

    # Load channels
    tot = 0
    aggrData = MemData()
    dest_paths = []
    for sample_name, row in df.iterrows():
        input_file = os.path.expanduser(
            os.path.expandvars(row["Location"].rstrip(os.sep)))
        file_format, copy_path, copy_type = infer_file_format(input_file)
        if google_cloud:
            base_name = os.path.basename(copy_path)
            dest_path = sample_name + "_tmp_" + base_name

            if copy_type == "directory":
                check_call(["mkdir", "-p", dest_path])
                call_args = ["gsutil", "-m", "cp", "-r", copy_path, dest_path]
            else:
                call_args = ["gsutil", "-m", "cp", copy_path, dest_path]
            check_call(call_args)
            dest_paths.append(dest_path)

            if file_format == "csv" and copy_type == "directory":
                # keep the original csv file name; take the basename before
                # input_file is overwritten with the destination directory
                input_file = os.path.join(dest_path,
                                          os.path.basename(input_file))
            else:
                input_file = dest_path

        genome = None
        if file_format in ["dge", "csv", "mtx", "loom"]:
            assert "Reference" in row
            genome = row["Reference"]

        data = read_input(
            input_file,
            genome=genome,
            return_type="MemData",
            ngene=ngene,
            select_singlets=select_singlets,
        )
        data.update_barcode_metadata_info(sample_name, row, attributes)
        aggrData.addAggrData(data)

        tot += 1
        print("Processed {}.".format(input_file))

    # Delete temporary file
    for dest_path in dest_paths:
        check_call(["rm", "-rf", dest_path])

    # Merge channels
    t1 = time.time()
    aggrData.aggregate()
    t2 = time.time()
    print("Data aggregation is finished in {:.2f}s.".format(t2 - t1))

    if what_to_return == "AnnData":
        aggrData = aggrData.convert_to_anndata(concat_matrices)
    elif what_to_return != "MemData":
        write_output(aggrData, what_to_return)
        aggrData = None

    print("Aggregated {tot} files.".format(tot=tot))

    return aggrData
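parse_restriction_string is not shown in this listing; a minimal sketch, assuming only the docstring format name:value,...,value / name:~value,...,value and the (name, isin, content) unpacking above, could look like this:

def parse_restriction_string(rstr):
    # "Source:pbmc,bmmc" -> ("Source", True, {"pbmc", "bmmc"})
    # "Donor:~1"         -> ("Donor", False, {"1"})
    name, content = rstr.split(":", 1)
    isin = not content.startswith("~")  # "~" negates the selection
    if not isin:
        content = content[1:]
    return name, isin, set(content.split(","))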
Example #13
def run_find_markers(
    input_h5ad_file: str,
    output_file: str,
    label_attr: str,
    de_key: str = "de_res",
    n_jobs: int = -1,
    min_gain: float = 1.0,
    random_state: int = 0,
    remove_ribo: bool = False,
) -> None:
    """
    For command line use.
    """
    import xlsxwriter
    from natsort import natsorted

    data = read_input(input_h5ad_file)
    markers = find_markers(
        data,
        label_attr,
        de_key=de_key,
        n_jobs=n_jobs,
        min_gain=min_gain,
        random_state=random_state,
        remove_ribo=remove_ribo,
    )

    keywords = [("strong", "strong_gain"), ("weak", "weak_gain"),
                ("down", "down_gain")]

    writer = pd.ExcelWriter(output_file, engine="xlsxwriter")

    for clust_id in natsorted(markers.keys()):
        clust_markers = markers[clust_id]

        sizes = []
        for keyword in keywords:
            sizes.append(len(clust_markers[keyword[0]]))

        arr = np.zeros((max(sizes), 8), dtype=object)
        arr[:] = ""

        for i in range(3):
            arr[0:sizes[i], i * 3] = clust_markers[keywords[i][0]]
            arr[0:sizes[i], i * 3 + 1] = clust_markers[keywords[i][1]]

        df = pd.DataFrame(
            data=arr,
            columns=[
                "strongly up-regulated",
                "gain",
                "",
                "weakly up-regulated",
                "gain",
                "",
                "down-regulated",
                "gain",
            ],
        )
        df.to_excel(writer, sheet_name=clust_id, index=False)

    writer.close()  # ExcelWriter.save() was removed in pandas 2.0; close() saves the file