def test_zarr(self):
        """Round-trip a Zarr dataset through ``write_output``/``read_input``.

        The reloaded copy must report the expected matrix shape, genome and
        modality.
        """
        source_path = "pegasusio-test-data/case4/MantonBM1_1_dbls.zarr"
        roundtrip_path = "pegasusio-test-data/case4/MantonBM_out.zarr"

        # Load the reference file, write it back out, then reload the copy.
        mdata = io.read_input(source_path)
        io.write_output(mdata, roundtrip_path)
        mdata = io.read_input(roundtrip_path)

        expected_shape = (4274, 19360)
        self.assertEqual(mdata.shape, expected_shape,
                         "Count matrix shape differs!")
        self.assertEqual(mdata.get_genome(), "GRCh38", "Genome differs!")
        self.assertEqual(mdata.get_modality(), "rna", "Modality differs!")
    def test_loom(self):
        """Round-trip a Loom dataset and verify shape, genome and modality.

        The genome is supplied explicitly (``hg19``) on the first read only;
        the second read relies on what was written to the output file.
        """
        source_path = "pegasusio-test-data/case3/pancreas.loom"
        roundtrip_path = "pegasusio-test-data/case3/pancreas_out.loom"

        mdata = io.read_input(source_path, genome='hg19')
        io.write_output(mdata, roundtrip_path)
        mdata = io.read_input(roundtrip_path)

        expected_shape = (2544, 58347)
        self.assertEqual(mdata.shape, expected_shape,
                         "Count matrix shape differs!")
        self.assertEqual(mdata.get_genome(), "hg19", "Genome differs!")
        self.assertEqual(mdata.get_modality(), "rna", "Modality differs!")
    def test_h5ad(self):
        """Round-trip an h5ad dataset and verify shape, genome and modality.

        The genome is supplied explicitly (``hg19``) on the first read only;
        the second read relies on what was written to the output file.
        """
        source_path = "pegasusio-test-data/case1/pbmc3k.h5ad"
        roundtrip_path = "pegasusio-test-data/case1/pbmc3k_out.h5ad"

        mdata = io.read_input(source_path, genome='hg19')
        io.write_output(mdata, roundtrip_path)
        mdata = io.read_input(roundtrip_path)

        expected_shape = (2638, 1838)
        self.assertEqual(mdata.shape, expected_shape,
                         "Count matrix shape differs!")
        self.assertEqual(mdata.get_genome(), "hg19", "Genome differs!")
        self.assertEqual(mdata.get_modality(), "rna", "Modality differs!")
    def test_10x_mtx(self):
        """Round-trip an mtx dataset and verify shape, genome and modality.

        The genome is supplied explicitly (``hg19``) on the first read only;
        the second read relies on what was written to the output file.
        """
        source_path = (
            "pegasusio-test-data/case3/42468c97-1c5a-4c9f-86ea-9eaa1239445a.mtx"
        )
        roundtrip_path = "pegasusio-test-data/case3/test.mtx"

        mdata = io.read_input(source_path, genome='hg19')
        io.write_output(mdata, roundtrip_path)
        mdata = io.read_input(roundtrip_path)

        expected_shape = (2544, 58347)
        self.assertEqual(mdata.shape, expected_shape,
                         "Count matrix shape differs!")
        self.assertEqual(mdata.get_genome(), "hg19", "Genome differs!")
        self.assertEqual(mdata.get_modality(), "rna", "Modality differs!")
Exemple #5
0
def plot_down_sampling(
    demuxEM_res_file: str,
    out_file: str,
    probs: List[float] = None,
    n_threads: int = 1,
    dpi: int = 500,
    figsize: Tuple[float, float] = None,
):
    """Plot demultiplexing consistency against the fraction of hashtag UMIs kept.

    Parameters
    ----------
    demuxEM_res_file: ``str``
        File readable by ``read_input`` that holds both an ``"rna"`` and a
        ``"hashing"`` modality.
    out_file: ``str``
        Path of the figure to write.
    probs: ``List[float]``, optional
        Down-sampling probabilities handed to ``down_sampling``. When ``None``
        (the default) the values 0.9, 0.8, ..., 0.1 are used.
    n_threads: ``int``, optional, default: ``1``
        Forwarded to ``down_sampling``.
    dpi: ``int``, optional, default: ``500``
        Resolution passed to ``plt.savefig``.
    figsize: ``Tuple[float, float]``, optional
        Figure size in inches; the current figure size is kept when ``None``.
    """
    # Local import: only this function needs the formatter.
    from matplotlib.ticker import PercentFormatter

    # Build the default per call instead of sharing one mutable list object
    # across every invocation.
    if probs is None:
        probs = [i / 10.0 for i in range(9, 0, -1)]

    data = read_input(demuxEM_res_file)
    rna_gt = data.get_data(modality="rna")
    hto_gt = data.get_data(modality="hashing")

    fracs, accuracy = down_sampling(rna_gt, hto_gt, probs, n_threads=n_threads)

    plt.plot(fracs, accuracy, ".-")
    ax = plt.gca()
    ax.set_xlim(1.0, 0.0)  # reversed x axis: fraction decreases left to right
    ax.set_ylim(0.79, 1.01)
    # Format ticks as percentages through a formatter rather than freezing the
    # label strings: frozen labels stay put while the automatic locator may
    # pick different tick positions once the figure is resized below.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))
    ax.set_xlabel("Fraction of hashtag UMIs")
    ax.set_ylabel("Consistency")
    if figsize is not None:
        plt.gcf().set_size_inches(*figsize)
    plt.savefig(out_file, dpi=dpi)
    plt.close()
Exemple #6
0
def run_annotate_cluster(
    input_file: str,
    output_file: str,
    markers: str,
    de_test: str,
    de_alpha: float = 0.05,
    de_key: str = "de_res",
    threshold: float = 0.5,
    ignore_nonde: bool = False,
) -> None:
    """Command-line entry point for cluster annotation.

    Opens ``input_file`` with ``read_input`` in ``"r"`` mode and forwards it,
    together with every other argument, to ``infer_cell_types``.
    """
    from pegasusio import read_input

    inference_options = {
        "de_alpha": de_alpha,
        "de_key": de_key,
        "threshold": threshold,
        "ignore_nonde": ignore_nonde,
        "output_file": output_file,
    }
    loaded = read_input(input_file, mode="r")
    infer_cell_types(loaded, markers, de_test, **inference_options)
Exemple #7
0
def write_output(assignment_file: str, input_mat_file: str, output_zarr_file: str, matching: dict) -> None:
	"""Attach donor assignments to a count matrix and save it as a zipped Zarr file.

	Parameters
	----------
	assignment_file: ``str``
		Tab-separated file with a header row; the first column is used as the
		index and the ``status`` and ``assignment`` columns are read.
	input_mat_file: ``str``
		Count matrix readable by ``pegasusio.read_input``.
	output_zarr_file: ``str``
		Destination path, written with ``zarr_zipstore = True``.
	matching: ``dict``
		Passed as second argument to ``translate_donor_name`` for every
		assignment value.
	"""
	df = pd.read_csv(assignment_file, sep = '\t', header = 0, index_col = 0)
	# Drop the last two characters of every barcode (presumably a '-1' style suffix -- confirm).
	df.index = pd.Index([x[:-2] for x in df.index])
	f = np.vectorize(translate_donor_name)
	df['assignment'] = f(df['assignment'].values, matching)
	idx = df['status'].values == 'unassigned'
	df.loc[idx, 'status'] = 'unknown'
	df.loc[idx, 'assignment'] = ''

	type_counts = df['status'].value_counts()
	# value_counts() omits categories that never occur, so fall back to 0
	# instead of raising KeyError when e.g. no doublets were called.
	n_singlets = type_counts.get('singlet', 0)
	n_doublets = type_counts.get('doublet', 0)
	n_unknown = type_counts.get('unknown', 0)
	print("\nSinglets = {}, doublets = {}, unknown = {}.".format(n_singlets, n_doublets, n_unknown))

	idx = df['status'] == 'singlet'
	singlet_counts = df.loc[idx, 'assignment'].value_counts()
	print("Among {} singlets, we have the following statistics:".format(n_singlets))
	for donor in natsorted(singlet_counts.index):
		print("  Reference donor {}: {}".format(donor, singlet_counts[donor]))
	print()

	data = pegasusio.read_input(input_mat_file)
	data.obs['demux_type'] = ''
	data.obs['assignment'] = ''

	# Only barcodes present in the assignment table receive a call; all others keep ''.
	idx = data.obs_names.isin(df.index)
	barcodes = data.obs_names[idx]
	ndf = df.loc[barcodes, ['status', 'assignment']]
	data.obs.loc[idx, 'demux_type'] = ndf['status'].values
	data.obs.loc[idx, 'assignment'] = ndf['assignment'].values

	pegasusio.write_output(data, output_zarr_file, zarr_zipstore = True)
Exemple #8
0
def run_pipeline(input_file: str, output_name: str, **kwargs):
    """Load ``input_file``, analyze every focused modality and save the result.

    ``kwargs`` carries all pipeline options. This function itself reads the
    keys ``processed``, ``black_list``, ``focus``, ``append``, ``subcluster``
    and the filtering options listed below; the full dictionary is also
    forwarded to ``analyze_one_modality``. Results are written to
    ``<output_name>.zarr.zip``.
    """
    is_raw = not kwargs["processed"]

    # Comma-separated names forwarded to read_input as ``black_list``.
    excluded = kwargs["black_list"]
    black_list = set() if excluded is None else set(excluded.split(","))

    data = read_input(input_file, black_list=black_list)

    # Fall back to the currently selected key when no focus was requested.
    focus_list = kwargs["focus"]
    if len(focus_list) == 0:
        focus_list = [data.current_key()]

    append_key = kwargs["append"]
    append_data = data.get_data(append_key) if append_key is not None else None

    logger.info("Inputs are loaded.")

    if is_raw and not kwargs["subcluster"]:
        # Filter out low quality cells/genes.
        filter_options = {
            "output_filt": kwargs["output_filt"],
            "plot_filt": kwargs["plot_filt"],
            "plot_filt_figsize": kwargs["plot_filt_figsize"],
            "min_genes_before_filt": kwargs["min_genes_before_filt"],
            "select_singlets": kwargs["select_singlets"],
            "remap_string": kwargs["remap_singlets"],
            "subset_string": kwargs["subset_singlets"],
            "min_genes": kwargs["min_genes"],
            "max_genes": kwargs["max_genes"],
            "min_umis": kwargs["min_umis"],
            "max_umis": kwargs["max_umis"],
            "mito_prefix": kwargs["mito_prefix"],
            "percent_mito": kwargs["percent_mito"],
            "percent_cells": kwargs["percent_cells"],
        }
        tools._run_filter_data(data, focus_list=focus_list, **filter_options)

    for key in focus_list:
        unidata = data.get_data(key)
        modality_prefix = f"{output_name}.{unidata.get_uid()}"
        analyze_one_modality(unidata, modality_prefix, is_raw, append_data,
                             **kwargs)

    print()

    # NOTE: the subclustering branch is currently disabled:
    # if kwargs["subcluster"]:
    #     unidata = tools.get_anndata_for_subclustering(adata, kwargs["subset_selections"])
    #     is_raw = True  # get submat and then set is_raw to True

    # Write out results.
    result_path = f"{output_name}.zarr.zip"
    write_output(data, result_path)

    print("Results are written.")
    def test_zarr(self):
        """Check the matrix shapes stored in ``tests/inmf_result.zarr.zip``.

        Expected dimensions come from the ``n_cells``, ``n_features``,
        ``n_factors``, ``n_batches`` and ``n_hvfs`` attributes of the test case.
        """
        import pegasusio as io

        result = io.read_input("tests/inmf_result.zarr.zip")
        cells_by_factors = (self.n_cells, self.n_factors)

        self.assertEqual(result.shape, (self.n_cells, self.n_features), "Count matrix shape not correct!")
        self.assertEqual(result.obsm['H'].shape, cells_by_factors, "H shape not correct!")
        self.assertEqual(result.uns['V'].shape, (self.n_batches, self.n_factors, self.n_hvfs), "V shape not correct!")
        self.assertEqual(result.uns['W'].shape, (self.n_hvfs, self.n_factors), "W shape not correct!")

        self.assertEqual(result.obsm['X_inmf'].shape, cells_by_factors, "iNMF embedding shape not correct!")
def attach_demux_results(input_rna_file: str,
                         rna_data: UnimodalData) -> MultimodalData:
    """ Copy demultiplexing calls onto the raw gene expression matrices.

    The raw file is re-read and restricted to the ``rna`` modality. Barcodes
    that also occur in ``rna_data`` receive its ``demux_type``, ``assignment``
    and (when present) ``assignment.dedup`` values; all other barcodes get an
    empty string.

    Parameters
    ----------
    input_rna_file: ``str``
        Input file for the raw gene count matrix.

    rna_data: ``UnimodalData``
        Processed gene count matrix containing demultiplexing results

    Returns
    -------
    ``MultimodalData``
    A multimodal data object.

    Examples
    --------
    >>> data = attach_demux_results('raw_data.h5', rna_data)
    """
    demux_results = read_input(input_rna_file)
    demux_results.subset_data(modality_subset=['rna'])
    # Assume all matrices are of the same dimension
    assert demux_results.uns["modality"] == "rna"

    barcodes = demux_results.obs_names
    in_processed = barcodes.isin(rna_data.obs_names)
    shared_barcodes = barcodes[in_processed]

    def _expand(column):
        # Object array aligned with the raw barcodes; '' where no call exists.
        values = np.full(barcodes.size, "", dtype="object")
        values[in_processed] = rna_data.obs.loc[shared_barcodes, column]
        return values

    new_columns = {
        "demux_type": _expand("demux_type"),
        "assignment": _expand("assignment"),
    }
    if "assignment.dedup" in rna_data.obs:
        new_columns["assignment.dedup"] = _expand("assignment.dedup")

    for keyword in demux_results.list_data():
        unidata = demux_results.get_data(keyword)
        assert unidata.uns["modality"] == "rna"
        for column, values in new_columns.items():
            unidata.obs[column] = values

    logger.info("Demultiplexing results are added to raw expression matrices.")

    return demux_results
Exemple #11
0
def annotate_data_object(input_file: str, annotation: str) -> None:
    """ Command-line helper: annotate clusters and save the file in place.

        annotation:  anno_name:clust_name:cell_type1;...cell_typen

        The i-th cell type (1-based) is mapped to cluster label ``str(i)``;
        the annotated data is written back to ``input_file``.
    """
    from pegasusio import read_input, write_output

    data = read_input(input_file, mode="r")

    # Exactly three ':'-separated fields are expected; anything else raises
    # ValueError on unpacking.
    anno_name, clust_name, anno_str = annotation.split(":")

    cell_types = anno_str.split(";")
    anno_dict = {}
    for position, cell_type in enumerate(cell_types, start=1):
        anno_dict[str(position)] = cell_type

    annotate(data, anno_name, clust_name, anno_dict)
    write_output(data, input_file)
 def test_mixture_data(self):
     """Check both genomes of the human/mouse mixture file.

     Each unimodal dataset is selected in turn (mouse first, then human) and
     its shape, genome and modality are compared with the expected values.
     """
     data = io.read_input(
         "pegasusio-test-data/case2/1k_hgmm_v3_filtered_feature_bc_matrix.h5"
     )

     expectations = (
         ('mm10-rna', (1063, 54232), "mm10",
          ("Mouse data shape differs!",
           "Mouse data genome differs!",
           "Mouse data modality differs!")),
         ('hg19-rna', (1063, 57905), "hg19",
          ("Human data shape differs!",
           "Human data genome differs!",
           "Human data modality differs!")),
     )

     for key, shape, genome, messages in expectations:
         shape_msg, genome_msg, modality_msg = messages
         data.select_data(key)
         self.assertEqual(data.shape, shape, shape_msg)
         self.assertEqual(data.get_genome(), genome, genome_msg)
         self.assertEqual(data.get_modality(), "rna", modality_msg)
    def execute(self):
        """Build plotting options from the parsed arguments and save the figure.

        Raises ``KeyError`` when an option required by the chosen plot type
        (``scatter``, ``compo`` or ``wordcloud``) is missing. Otherwise the
        input file is read, the matching ``pegasus.plotting`` function is
        called and the returned figure is saved to ``<output_file>``.
        """
        args = self.args
        as_list = self.convert_to_list
        as_int = self.convert_to_int
        as_float = self.convert_to_float

        kwargs = {
            "restrictions": args["--restriction"],
            "attrs": as_list(args["--attributes"]),
            "basis": args["--basis"],
            "alpha": as_list(args["--alpha"], converter=float),
            "legend_loc": as_list(args["--legend-loc"]),
            "palettes": args["--palette"],
            "show_background": args["--show-background"],
            "nrows": as_int(args["--nrows"]),
            "ncols": as_int(args["--ncols"]),
            "panel_size": as_list(args["--panel-size"], converter=float),
            "left": as_float(args["--left"]),
            "bottom": as_float(args["--bottom"]),
            "wspace": as_float(args["--wspace"]),
            "hspace": as_float(args["--hspace"]),
            "groupby": args["--groupby"],
            "condition": args["--condition"],
            "style": args["--style"],
            "factor": None if args["--factor"] is None else int(args["--factor"]),
            "max_words": int(args["--max-words"]),
            "return_fig": True,
            "dpi": int(args["--dpi"]),
        }

        # Layout options left unset are dropped so the plotting function's own
        # defaults apply.
        droppable = ("nrows", "ncols", "panel_size", "left", "bottom", "wspace", "hspace")
        kwargs = {
            name: value
            for name, value in kwargs.items()
            if not (name in droppable and value is None)
        }

        plot_type = args["<plot_type>"]
        if plot_type == "scatter" and kwargs["attrs"] is None:
            raise KeyError("--attributes must be provided for scatter plots!")
        if plot_type == "compo" and None in (kwargs["groupby"], kwargs["condition"]):
            raise KeyError("--groupby and --condition must be provided for composition plots!")
        if plot_type == "wordcloud" and kwargs["factor"] is None:
            raise KeyError("--factor must be provided for word cloud plots!")

        plot_type2keyword = {"scatter": "scatter", "compo": "compo_plot", "wordcloud": "wordcloud"}

        data = read_input(args["<input_file>"])
        plot_func = getattr(pegasus.plotting, plot_type2keyword[plot_type])
        fig = plot_func(data, **kwargs)

        output_file = args["<output_file>"]
        fig.savefig(output_file)
        logger.info(f"{output_file} is generated.")
def run_de_analysis(
    input_file: str,
    output_excel_file: str,
    cluster: str,
    condition: Optional[str] = None,
    de_key: str = "de_res",
    n_jobs: int = -1,
    auc: bool = True,
    t: bool = True,
    fisher: bool = False,
    mwu: bool = False,
    temp_folder: str = None,
    verbose: bool = True,
    alpha: float = 0.05,
    ndigits: int = 3,
) -> None:
    """ Command-line entry point for differential expression analysis.

    Runs ``de_analysis`` on ``input_file``, writes the updated data back to
    the same file, then exports the table returned by ``markers`` to
    ``output_excel_file``.
    """

    from pegasusio import read_input, write_output

    data = read_input(input_file, mode='r')

    # NOTE(review): ``auc`` and ``mwu`` are accepted by this function but are
    # not forwarded to ``de_analysis`` -- confirm whether that is intentional.
    de_options = dict(
        condition=condition,
        de_key=de_key,
        n_jobs=n_jobs,
        t=t,
        fisher=fisher,
        temp_folder=temp_folder,
        verbose=verbose,
    )
    de_analysis(data, cluster, **de_options)

    # The input file is overwritten with the data returned from the analysis.
    write_output(data, input_file)
    logger.info(
        f"Differential expression results are written to varm/{de_key}.")

    marker_table = markers(data, de_key=de_key, alpha=alpha)
    write_results_to_excel(marker_table, output_excel_file, ndigits=ndigits)
Exemple #15
0
def write_output(assignment_file: str, input_mat_file: str,
                 output_zarr_file: str) -> None:
    """Attach droplet-type calls and donor assignments to a count matrix.

    ``assignment_file`` is a tab-separated table indexed by ``BARCODE`` with
    ``DROPLET.TYPE``, ``SNG.BEST.GUESS`` and ``DBL.BEST.GUESS`` columns. The
    annotated matrix is written to ``output_zarr_file``.
    """
    calls = pd.read_csv(assignment_file, sep='\t', header=0,
                        index_col='BARCODE')
    # Drop the last two characters of every barcode (presumably a '-1' style
    # suffix -- confirm).
    calls.index = pd.Index([barcode[:-2] for barcode in calls.index])
    calls['demux_type'] = calls['DROPLET.TYPE'].apply(
        lambda s: demux_type_dict[s])
    calls['assignment'] = ''

    # Singlets take their best single-donor guess as is.
    is_singlet = calls['demux_type'] == 'singlet'
    calls.loc[is_singlet, 'assignment'] = calls.loc[is_singlet,
                                                    'SNG.BEST.GUESS']

    # Doublets keep every comma-separated field of the guess but the last.
    is_doublet = calls['demux_type'] == 'doublet'
    doublet_guess = calls.loc[is_doublet, 'DBL.BEST.GUESS']
    calls.loc[is_doublet, 'assignment'] = doublet_guess.apply(
        lambda s: ','.join(s.split(',')[:-1]))

    data = io.read_input(input_mat_file)
    data.obs['demux_type'] = ''
    data.obs['assignment'] = ''

    # Only barcodes found in the assignment table are annotated.
    matched = data.obs_names.isin(calls.index)
    matched_barcodes = data.obs_names[matched]
    matched_calls = calls.loc[matched_barcodes, ['demux_type', 'assignment']]
    data.obs.loc[matched, 'demux_type'] = matched_calls['demux_type'].values
    data.obs.loc[matched, 'assignment'] = matched_calls['assignment'].values

    io.write_output(data, output_zarr_file)
#!/usr/bin/env python

from sys import argv, exit
import pegasusio

# Expect exactly three command-line arguments after the script name.
if len(argv) != 4:
    usage = "Usage: python extract_barcodes_from_rna.py input_raw.h5 output_barcodes.tsv ngene"
    print(usage)
    exit(-1)

input_path, output_path, ngene_arg = argv[1], argv[2], argv[3]

# ``ngene`` is forwarded to read_input as given on the command line.
data = pegasusio.read_input(input_path, ngene=int(ngene_arg))

# One barcode per line, each suffixed with '-1', newline-terminated.
with open(output_path, "w") as fout:
    suffixed = [barcode + '-1' for barcode in data.obs_names]
    fout.write('\n'.join(suffixed) + '\n')
Exemple #17
0
def run_pipeline(input_rna_file, input_hto_file, output_name, **kwargs):
    """Run demultiplexing on an RNA / hashtag pair and write results and plots.

    Parameters
    ----------
    input_rna_file:
        Raw RNA count matrix; read once for the analysis and once more by
        ``attach_demux_results``.
    input_hto_file:
        Hashtag count matrix, loaded with genome and modality ``"hashing"``.
    output_name:
        Prefix of every generated file.
    **kwargs:
        Options read here: ``genome``, ``min_num_genes``, ``min_num_umis``,
        ``random_state``, ``min_signal``, ``alpha``, ``n_jobs``, ``gen_plots``
        and ``gen_gender_plot`` (a collection of gene names).

    Outputs
    -------
    ``<output_name>_demux.zarr`` and ``<output_name>.out.demuxEM.zarr`` (both
    written with ``zarr_zipstore=True``), optional PDF plots, and summary
    statistics printed to stdout.
    """
    # load input rna data
    data = io.read_input(input_rna_file,
                         genome=kwargs["genome"],
                         modality="rna")
    data.concat_data()  # in case of multi-organism mixing data
    rna_key = data.uns["genome"]

    # load input hashing data
    data.update(
        io.read_input(input_hto_file, genome="hashing", modality="hashing"))
    hashing_key = "hashing"

    # Extract rna and hashing data
    rna_data = data.get_data(rna_key)
    hashing_data = data.get_data(hashing_key)

    # Filter the RNA matrix
    rna_data.obs["n_genes"] = rna_data.X.getnnz(axis=1)
    rna_data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
    obs_index = np.logical_and.reduce((
        rna_data.obs["n_genes"] >= kwargs["min_num_genes"],
        rna_data.obs["n_counts"] >= kwargs["min_num_umis"],
    ))
    rna_data._inplace_subset_obs(obs_index)

    # run demuxEM
    estimate_background_probs(hashing_data,
                              random_state=kwargs["random_state"])

    demultiplex(
        rna_data,
        hashing_data,
        min_signal=kwargs["min_signal"],
        alpha=kwargs["alpha"],
        n_threads=kwargs["n_jobs"],
    )

    # annotate raw matrix with demuxEM results
    demux_results = attach_demux_results(input_rna_file, rna_data)

    # generate plots
    if kwargs["gen_plots"]:
        plot_hto_hist(hashing_data,
                      "hto_type",
                      output_name + ".ambient_hashtag.hist.pdf",
                      alpha=1.0)
        plot_bar(
            hashing_data.uns["background_probs"],
            hashing_data.var_names,
            "Sample ID",
            "Background probability",
            output_name + ".background_probabilities.bar.pdf",
        )
        plot_hto_hist(hashing_data,
                      "rna_type",
                      output_name + ".real_content.hist.pdf",
                      alpha=0.5)
        plot_rna_hist(rna_data, output_name + ".rna_demux.hist.pdf")
        logger.info("Diagnostic plots are generated.")

    if len(kwargs["gen_gender_plot"]) > 0:
        # Keep the raw counts, then scale every cell to 1e5 total counts and
        # log-transform in place.
        rna_data.matrices["raw.X"] = rna_data.X.copy()
        rna_data.as_float()
        scale = 1e5 / rna_data.X.sum(axis=1).A1
        # One scale factor per stored value: repeat each row's factor by that
        # row's number of stored entries. The row pointers must come from the
        # same matrix that is being scaled (rna_data.X), not from whichever
        # matrix ``data`` currently exposes.
        rna_data.X.data *= np.repeat(scale, np.diff(rna_data.X.indptr))
        rna_data.X.data = np.log1p(rna_data.X.data)

        for gene_name in kwargs["gen_gender_plot"]:
            plot_gene_violin(
                rna_data,
                gene_name,
                "{output_name}.{gene_name}.violin.pdf".format(
                    output_name=output_name, gene_name=gene_name),
                title="{gene_name}: a gender-specific gene".format(
                    gene_name=gene_name),
            )

        logger.info(
            "Gender-specific gene expression violin plots are generated.")

    # output results
    io.write_output(demux_results,
                    output_name + "_demux.zarr",
                    zarr_zipstore=True)
    io.write_output(data,
                    output_name + ".out.demuxEM.zarr",
                    zarr_zipstore=True)

    # output summary statistics
    print("\nSummary statistics:")
    print("total\t{}".format(rna_data.shape[0]))
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for name, value in rna_data.obs["demux_type"].value_counts().items():
        print("{}\t{}".format(name, value))
def aggregate_matrices(
    csv_file: Union[str, Dict[str, np.ndarray], pd.DataFrame],
    restrictions: Optional[Union[List[str], str]] = [],
    attributes: Optional[Union[List[str], str]] = [],
    default_ref: Optional[str] = None,
    append_sample_name: Optional[bool] = True,
    select_singlets: Optional[bool] = False,
    remap_string: Optional[str] = None,
    subset_string: Optional[str] = None,
    min_genes: Optional[int] = None,
    max_genes: Optional[int] = None,
    min_umis: Optional[int] = None,
    max_umis: Optional[int] = None,
    mito_prefix: Optional[str] = None,
    percent_mito: Optional[float] = None,
) -> MultimodalData:
    """Aggregate channel-specific count matrices into one big count matrix.

    This function takes as input a csv_file, which contains at least 2 columns: Sample, the sample name; and Location, the file that contains the count matrices (e.g. filtered_gene_bc_matrices_h5.h5). It merges matrices from the same genome together. If multi-modality exists, a third Modality column might be required. An aggregated Multimodal Data will be returned.

    Rows are processed in order of their Sample value; consecutive rows sharing a sample name are merged into one data object before filtering.

    Parameters
    ----------

    csv_file : `str`
        The CSV file containing information about each channel. Alternatively, a dictionary or pd.DataFrame can be passed.
    restrictions : `list[str]` or `str`, optional (default: [])
        A list of restrictions used to select channels, each restriction takes the format of name:value,...,value or name:~value,...,value, where ~ refers to not. If only one restriction is provided, it can be provided as a string instead of a list. ``None`` is treated as no restriction.
    attributes : `list[str]` or `str`, optional (default: [])
        A list of attributes need to be incorporated into the output count matrix. If only one attribute is provided, this attribute can be provided as a string instead of a list. ``None`` is treated as no attribute.
    default_ref : `str`, optional (default: None)
        Default reference name to use. If there is no Reference column in the csv_file, a Reference column will be added with default_ref as its value. This argument can also be used for replacing genome names. For example, if default_ref is 'hg19:GRCh38,GRCh38', we will change any genome with name 'hg19' to 'GRCh38' and if no genome is provided, 'GRCh38' is the default.
    append_sample_name : `bool`, optional (default: True)
        By default, append sample_name to each channel. Turn this option off if each channel has distinct barcodes.
    select_singlets : `bool`, optional (default: False)
        If we have demultiplexed data, turning on this option will make pegasus only include barcodes that are predicted as singlets.
    remap_string: ``str``, optional, default ``None``
        Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
    subset_string: ``str``, optional, default ``None``
        If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sample 1 and 3 using "sample1,sample3".
    min_genes: ``int``, optional, default: None
       Only keep cells with at least ``min_genes`` genes.
    max_genes: ``int``, optional, default: None
       Only keep cells with less than ``max_genes`` genes.
    min_umis: ``int``, optional, default: None
       Only keep cells with at least ``min_umis`` UMIs.
    max_umis: ``int``, optional, default: None
       Only keep cells with less than ``max_umis`` UMIs.
    mito_prefix: ``str``, optional, default: None
       Prefix for mitochondrial genes.
    percent_mito: ``float``, optional, default: None
       Only keep cells with percent mitochondrial genes less than ``percent_mito`` % of total counts. Only when both mito_prefix and percent_mito set, the mitochondrial filter will be triggered.

    Returns
    -------
    `MultimodalData` object.
        The aggregated count matrix as an MultimodalData object.

    Raises
    ------
    ValueError
        If no row of the sample sheet passes the restrictions.
    AssertionError
        If a restriction names a column that is not in the sample sheet.

    Examples
    --------
    >>> data = aggregate_matrices('example.csv', restrictions=['Source:pbmc', 'Donor:1'], attributes=['Source', 'Platform', 'Donor'])
    """

    def _as_set(values):
        # Accept None, a single string, or any iterable of strings; the result
        # is always a fresh set, so the list defaults are never mutated.
        if values is None:
            return set()
        if isinstance(values, str):
            return {values}
        return set(values)

    if isinstance(csv_file, str):
        df = pd.read_csv(csv_file, header=0,
                         index_col=False)  # load sample sheet
    elif isinstance(csv_file, dict):
        df = pd.DataFrame(csv_file)
    else:
        df = csv_file

    # Remove duplicated items
    restrictions = _as_set(restrictions)
    attributes = _as_set(attributes)

    # Select data
    rvec = [_parse_restriction_string(x) for x in restrictions]

    idx = pd.Series([True] * df.shape[0], index=df.index, name="selected")
    for name, isin, content in rvec:
        assert name in df.columns
        if isin:
            idx = idx & df[name].isin(content)
        else:
            idx = idx & (~(df[name].isin(content)))

    if idx.sum() == 0:
        raise ValueError("No data pass the restrictions!")

    df = df.loc[idx].sort_values(by="Sample")  # sort by sample_name

    # parse default_ref
    def_genome, genome_dict = _parse_genome_string(default_ref)

    # Load data
    tot = 0
    dest_paths = [
    ]  # record localized file paths so that we can remove them later
    curr_sample = ""
    curr_row = curr_data = None
    aggrData = AggrData()

    def _finalize_sample(sample_data, sample_row):
        # Filter a fully merged sample, attach its metadata and queue it for
        # aggregation.
        sample_data._propogate_genome()
        sample_data.filter_data(select_singlets=select_singlets,
                                remap_string=remap_string,
                                subset_string=subset_string,
                                min_genes=min_genes,
                                max_genes=max_genes,
                                min_umis=min_umis,
                                max_umis=max_umis,
                                mito_prefix=mito_prefix,
                                percent_mito=percent_mito)
        sample_data._update_barcode_metadata_info(sample_row, attributes,
                                                  append_sample_name)
        aggrData.add_data(sample_data)

    for idx_num, row in df.iterrows():
        input_file = os.path.expanduser(
            os.path.expandvars(row["Location"].rstrip(
                os.sep)))  # extend all user variables
        file_type, copy_path, copy_type = infer_file_type(
            input_file)  # infer file type

        if row["Location"].lower().startswith('gs://'):  # if Google bucket
            base_name = os.path.basename(copy_path)
            dest_path = f"{idx_num}_tmp_{base_name}"  # idx_num will make sure dest_path is unique in the sample sheet
            if not os.path.exists(
                    dest_path
            ):  # if dest_path exists, we may try to localize it once and may have the file cached
                if copy_type == "directory":
                    check_call(["mkdir", "-p", dest_path])
                    call_args = [
                        "gsutil", "-m", "rsync", "-r", copy_path, dest_path
                    ]
                else:
                    call_args = ["gsutil", "-m", "cp", copy_path, dest_path]
                check_call(call_args)
            dest_paths.append(dest_path)

            if input_file == copy_path:
                input_file = dest_path
            else:
                input_file = os.path.join(dest_path,
                                          os.path.basename(input_file))

        genome = row.get("Reference", None)
        if (genome
                is not None) and (not isinstance(genome, str)):  # to avoid NaN
            genome = None
        if genome is None:
            genome = def_genome
        modality = row.get("Modality", None)
        data = read_input(input_file,
                          file_type=file_type,
                          genome=genome,
                          modality=modality)
        if len(genome_dict) > 0:
            data._update_genome(genome_dict)

        if row["Sample"] != curr_sample:
            # A new sample starts: flush the previous one before switching.
            if curr_data is not None:
                _finalize_sample(curr_data, curr_row)
            curr_data = data
            curr_row = row
            curr_sample = row["Sample"]
        else:
            # Same sample as the previous row: merge into the current object.
            curr_data.update(data)

        tot += 1

    # Flush the last sample, which the loop never finalizes.
    if curr_data is not None:
        _finalize_sample(curr_data, curr_row)

    # Merge data
    aggregated_data = aggrData.aggregate()
    attributes.add("Channel")
    aggregated_data._convert_attributes_to_categorical(attributes)
    logger.info(f"Aggregated {tot} files.")

    # Delete temporary file
    if len(dest_paths) > 0:
        for dest_path in dest_paths:
            check_call(["rm", "-rf", dest_path])
        logger.info("Temporary files are deleted.")

    return aggregated_data
Exemple #19
0
def run_find_markers(
    input_h5ad_file: str,
    output_file: str,
    label_attr: str,
    de_key: str = "de_res",
    n_jobs: int = -1,
    min_gain: float = 1.0,
    random_state: int = 0,
    remove_ribo: bool = False,
) -> None:
    """
    For command line use.

    Reads ``input_h5ad_file``, runs ``find_markers`` and writes one Excel
    sheet per cluster to ``output_file``. Each sheet lists the strongly
    up-regulated, weakly up-regulated and down-regulated markers with their
    gains, separated by an empty column.
    """
    # NOTE(review): xlsxwriter is not referenced directly; presumably imported
    # so a missing Excel engine fails before any computation -- confirm.
    import xlsxwriter
    from natsort import natsorted

    data = read_input(input_h5ad_file)
    markers = find_markers(
        data,
        label_attr,
        de_key=de_key,
        n_jobs=n_jobs,
        min_gain=min_gain,
        random_state=random_state,
        remove_ribo=remove_ribo,
    )

    # (marker list key, gain list key) for each of the three column groups.
    keywords = [("strong", "strong_gain"), ("weak", "weak_gain"),
                ("down", "down_gain")]
    columns = [
        "strongly up-regulated",
        "gain",
        "",
        "weakly up-regulated",
        "gain",
        "",
        "down-regulated",
        "gain",
    ]

    # The context manager saves and closes the workbook, also on error;
    # ExcelWriter.save() no longer exists in pandas 2.0.
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        for clust_id in natsorted(markers.keys()):
            clust_markers = markers[clust_id]

            sizes = [len(clust_markers[name_key]) for name_key, _ in keywords]

            # Pad every group with '' up to the longest list of the cluster.
            arr = np.zeros((max(sizes), 8), dtype=object)
            arr[:] = ""

            for i, (name_key, gain_key) in enumerate(keywords):
                # Each group occupies two columns followed by a blank spacer.
                arr[0:sizes[i], i * 3] = clust_markers[name_key]
                arr[0:sizes[i], i * 3 + 1] = clust_markers[gain_key]

            df = pd.DataFrame(data=arr, columns=columns)
            df.to_excel(writer, sheet_name=clust_id, index=False)
#!/usr/bin/env python

from sys import argv, exit
import pegasusio as io

# Expect exactly three command-line arguments after the script name.
if len(argv) != 4:
    usage = "Usage: python extract_barcodes_from_rna.py input_raw.h5 output_barcodes.tsv ngene"
    print(usage)
    exit(-1)

input_path, output_path, ngene_arg = argv[1], argv[2], argv[3]

# Load everything first, then filter with ``min_genes`` set from the third
# command-line argument.
data = io.read_input(input_path)
data.filter_data(min_genes=int(ngene_arg))

# One barcode per line, each suffixed with '-1', newline-terminated.
with open(output_path, "w") as fout:
    suffixed = [barcode + '-1' for barcode in data.obs_names]
    fout.write('\n'.join(suffixed) + '\n')
Exemple #21
0
                            type=int,
                            help='Random seed',
                            default=0)

    parser_plot = subparsers.add_parser('plot',
                                        help='Plot topic modelling stats')
    parser_plot.add_argument('stats', type=str, nargs='+')
    args = parser.parse_args()

    if args.sub_parser == 'prepare':

        prefix_exclude = None
        if args.prefix_exclude is not None:
            prefix_exclude = args.prefix_exclude.split(',')
        input_path = args.input
        d = pio.read_input(input_path)
        lda_setup(adata=d,
                  prefix_exclude=prefix_exclude,
                  min_percent=args.min_percent,
                  max_percent=args.max_percent)
    elif args.sub_parser == 'run':
        dictionary = gensim.corpora.Dictionary.load(args.dictionary)
        corpus = gensim.corpora.MmCorpus(args.corpus)
        compute_lda(corpus=corpus,
                    cell_ids=pd.read_csv(args.cell_ids,
                                         index_col=0).index.values,
                    dictionary=dictionary,
                    topics=args.topics,
                    random_state=args.random_seed)
    elif args.sub_parser == 'plot':
        stats = []