Beispiel #1
0
def chipseq_analysis_with_peaks(chipseq_analysis):
    """
    Write randomly generated peak files for every comparison of an analysis.

    For each comparison the output directory is created and, for both the
    "original" and "filtered" peak types, one headerless tab-separated file
    per peak caller is written with a random 30% sample of the analysis'
    sites.

    Parameters
    ----------
    chipseq_analysis
        Object exposing ``sites`` (with a ``to_dataframe`` method) and a
        ``comparisons`` mapping whose values hold "output_dir" and
        "peak_calls" entries.

    Returns
    -------
    The ``chipseq_analysis`` object that was passed in.
    """
    import numpy as np
    from ngs_toolkit.utils import bed_to_index

    # column layout used for peak callers with "homer" in their name
    homer_columns = ["name", "chrom", "start", "end", "score", "strand"]

    sites = chipseq_analysis.sites.to_dataframe()

    # extra columns required by the homer layout
    sites["name"] = bed_to_index(sites)  # dummy column
    sites["score"] = np.random.random(sites.shape[0])  # p-value
    sites["strand"] = "."  # dummy column

    for _, comparison in chipseq_analysis.comparisons.items():
        os.makedirs(comparison["output_dir"])

        for peak_type in ("original", "filtered"):
            calls = comparison["peak_calls"][peak_type]
            for peak_caller, path in calls.items():
                # every file gets its own random 30% of all peaks
                subset = sites.sample(frac=0.3)

                # homer puts the name column first
                if "homer" in peak_caller:
                    subset = subset[homer_columns]

                with open(path, "w") as handle:
                    handle.writelines(
                        "\t".join(row.astype(str)) + "\n"
                        for _, row in subset.iterrows())

    return chipseq_analysis
Beispiel #2
0
def get_random_genomic_locations(n_regions,
                                 width_mean=500,
                                 width_std=400,
                                 min_width=300,
                                 genome_assembly="hg38"):
    """
    Get ``n_regions`` random genomic locations within ``genome_assembly``.

    Region widths are drawn from a normal distribution
    (``width_mean``, ``width_std``), taken as absolute integer values;
    any region narrower than ``min_width`` has ``min_width`` added to its
    end. The regions are then shuffled across the genome with bedtools,
    sorted, and passed through ``bed_to_index``.
    """
    from ngs_toolkit.utils import bed_to_index

    # keep only chromosomes without "_" in their name, with their end position
    chrom_ends = {}
    for chrom_name, bounds in dict(
            pybedtools.chromsizes(genome_assembly)).items():
        if "_" in chrom_name:
            continue
        chrom_ends[chrom_name] = bounds[-1]

    # weight chromosomes by their share of the total size
    total_size = sum(chrom_ends.values())
    weights = {name: size / total_size for name, size in chrom_ends.items()}

    chroms = pd.Series(
        np.random.choice(
            a=list(weights.keys()),
            size=n_regions,
            p=list(weights.values())))
    starts = np.array([0] * n_regions)
    raw_widths = np.random.normal(width_mean, width_std, n_regions)
    ends = np.absolute(raw_widths).astype(int)

    intervals = pd.DataFrame(
        [chroms.tolist(), starts.tolist(), ends.tolist()]).T
    too_narrow = (intervals[2] - intervals[1]) < min_width
    intervals.loc[too_narrow, 2] += min_width

    shuffled = pybedtools.BedTool.from_dataframe(intervals).shuffle(
        genome=genome_assembly,
        chromFirst=True,
        noOverlapping=True,
        chrom=True)
    return bed_to_index(shuffled.sort().to_dataframe())
Beispiel #3
0
def main(cli=None):
    """
    Run region context enrichment for the regions of a BED file.

    Parses the command line (``cli`` is forwarded to ``parse_args``), loads
    an ``ATACSeqAnalysis`` from the given PEP, reads the BED file, and
    writes the enrichment table as CSV to the output file.

    Returns 0 without doing any work when the output file already exists
    and ``overwrite`` is not set; otherwise returns ``None``.
    """
    print("Region type analysis")
    # Parse command-line arguments.
    args = parse_arguments().parse_args(cli)
    output_exists = os.path.exists(args.output_file)
    if output_exists and not args.overwrite:
        print("Output exists and `overwrite` is False, so not doing anything.")
        return 0

    print("Reading up the analysis object.")
    analysis = ATACSeqAnalysis(from_pep=args.pep)
    analysis.load_data()
    # (
    #     "genomic_region",
    #     "region_annotation_mapping",
    #     "region_annotation_b_mapping",
    # ),
    # (
    #     "chromatin_state",
    #     "chrom_state_annotation_mapping",
    #     "chrom_state_annotation_b_mapping",
    # ),

    print("Reading up the BED file.")
    regions = pd.read_csv(args.bed_file, sep="\t", header=None)
    regions.columns = ['chrom', 'start', 'end']

    print("Getting the index.")
    region_index = bed_to_index(regions)

    print("Doing enrichment.")
    enrichment = analysis.region_context_enrichment(region_index)

    print("Saving.")
    enrichment.to_csv(args.output_file)
    print("Done.")
Beispiel #4
0
def get_genomic_bins(n_bins, genome_assembly="hg38", resolution=None):
    """
    Get the first ``n_bins`` windows tiling ``genome_assembly``.

    The window width is the summed length of all chromosome intervals
    divided by ``n_bins``. When ``resolution`` is given (an integer, or a
    string such as "10kb"), every window's end is reset to
    ``start + resolution``. The result is passed through ``bed_to_index``.
    """
    from ngs_toolkit.utils import bed_to_index

    chrom_table = pd.DataFrame(
        dict(pybedtools.chromsizes(genome_assembly))).T.reset_index()
    genome_bed = pybedtools.BedTool.from_dataframe(chrom_table)

    genome_length = sum(interval.length for interval in genome_bed)
    windows = genome_bed.makewindows(
        genome=genome_assembly,
        w=genome_length / n_bins).to_dataframe()

    if resolution is not None:
        # string resolutions like "10kb" become the integer 10000
        if isinstance(resolution, str):
            resolution = int(resolution.replace("kb", "000"))
        windows["end"] = windows["start"] + resolution

    return bed_to_index(windows.head(n_bins))
Beispiel #5
0
    def calculate_peak_support(
            self, samples=None, region_type="summits", peak_type="filtered", permissive=True,
            comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"):
        """
        Calculate a measure of support for each region in peak set
        (i.e. ratio of samples containing a peak overlapping region in union set of peaks).

        Two CSV files are written to ``self.results_dir``:
        ``<name>_peaks.binary_overlap_support.csv`` (per comparison/peak caller
        overlap counts) and ``<name>_peaks.support.csv`` (same, plus a
        "support" column).

        Parameters
        ----------
        samples: :obj:`list`
            Not used. Provided for compatibility with ATACSeqAnalysis class.
        region_type: :obj:`str`
            Not used. Provided for compatibility with ATACSeqAnalysis class.
        peak_type : :obj:`str`, optional
            Key selecting which peak calls of each comparison to use
            (``comp["peak_calls"][peak_type]``).

            Defaults to "filtered".
        permissive: :obj:`bool`
            Whether a peak file that cannot be intersected should be skipped
            (with a warning) instead of re-raising the error.

            Defaults to :obj:`True`.
        comparison_table : :obj:`pandas.DataFrame`, optional
            DataFrame with signal/background combinations used to call peaks

            Defaults to analysis' own `comparison_table`.
        peak_dir : :obj:`str`, optional
            Path to peaks output directory.
            Defaults to {analysis.results_dir}/chipseq_peaks

        Raises
        ------
        ValueError, pybedtools.MalformedBedLineError, pybedtools.helpers.BEDToolsError
            If not ``permissive`` and a peak file cannot be intersected.

        Attributes
        ----------
        support : :obj:`pandas.DataFrame`
            DataFrame indexed by region with one column per
            (comparison, peak_caller) holding the overlap count reported by
            the intersection, plus a "support" column with the fraction of
            those columns that are non-zero.
        """
        import pybedtools
        from tqdm import tqdm
        from ngs_toolkit.utils import bed_to_index

        if comparison_table is None:
            comparison_table = self.comparison_table

        peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

        # get index
        index = bed_to_index(self.sites.to_dataframe())

        # calculate support (number of samples overlaping each merged peak)
        support = pd.DataFrame(index=index)
        for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):
            for peak_caller, peak_file in comp['peak_calls'][peak_type].items():
                try:
                    sample_support = self.sites.intersect(peak_file, wa=True, c=True).to_dataframe()
                except (
                    ValueError,
                    pybedtools.MalformedBedLineError,
                    pybedtools.helpers.BEDToolsError,
                ):
                    # Pass the values as separate lazy-format arguments: a single
                    # tuple cannot fill two "%s" placeholders and the record
                    # would fail to format.
                    _LOGGER.warning(
                        "Peaks for comparison %s (%s) not found!", name, peak_file)
                    if permissive:
                        continue
                    else:
                        raise
                sample_support.index = index
                # fourth column of the intersection output is taken as the overlap count
                support[(name, peak_caller)] = sample_support.iloc[:, 3]

        # Make multiindex labeling comparisons and peak type
        support.columns = pd.MultiIndex.from_tuples(
            support.columns, names=["comparison", "peak_caller"]
        )
        support.to_csv(
            os.path.join(
                self.results_dir, self.name + "_peaks.binary_overlap_support.csv"
            ),
            index=True,
        )

        # divide sum (of unique overlaps) by total to get support value between 0 and 1
        support["support"] = support.astype(bool).sum(axis=1) / float(support.shape[1])

        # save
        support.to_csv(
            os.path.join(self.results_dir, self.name + "_peaks.support.csv"), index=True
        )

        self.support = support
Beispiel #6
0
    def get_cnv_data(self,
                     resolutions=None,
                     samples=None,
                     save=True,
                     assign=True,
                     permissive=False):
        """
        Load CNV data from ATAC-seq CNV pipeline and create CNV matrix at various resolutions.

        For every resolution, each sample's log2 read count file is read and
        the sample's column minus the last column of that file is stored as
        the sample's signal.

        Parameters
        ----------
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        samples : :obj:`list`, optional
            Samples to restrict analysis to.

            Defaults to samples in Analysis object.
        save: :obj:`bool`, optional
            Whether results should be saved to disc.

            Defaults to :obj:`True`
        assign: :obj:`bool`, optional
            Whether results should be assigned to an attribute in the Analysis object.

            Defaults to :obj:`True`
        permissive: :obj:`bool`, optional
            Whether missing files should be allowed.
            Samples that lack the attribute or whose file cannot be read
            are then skipped.

            Defaults to :obj:`False`

        Returns
        -------
        dict
            Dictionary with CNV matrices for each resolution.

        Raises
        -------
        IOError
            If not permissive and input files can't be read.
        AttributeError
            If not permissive and a sample has no 'log2_read_counts' attribute.
        ValueError
            If no sample had a readable file for a given resolution.

        Attributes
        ----------
        matrix_raw : :obj:`dict`
            Sets a `matrix_raw` dictionary with CNV matrices for each resolution.
        """
        # TODO: figure out a way of having the input file specified before hand
        from tqdm import tqdm
        from ngs_toolkit.utils import bed_to_index

        if resolutions is None:
            resolutions = self.resolutions

        if samples is None:
            samples = self.samples

        matrix_raw = dict()

        for resolution in tqdm(resolutions,
                               total=len(resolutions),
                               desc="Resolution"):
            matrix_raw[resolution] = pd.DataFrame()
            # Coverage table of the last sample successfully read at THIS
            # resolution; reset so a table from a previous resolution is
            # never mistaken for a valid one.
            cov = None

            for sample in tqdm(samples, total=len(samples), desc="Sample"):
                # Read log2 file
                if not hasattr(sample, "log2_read_counts"):
                    msg = "Sample does not have a 'log2_read_counts' attribute."
                    warn_or_raise(AttributeError(msg), permissive)
                    # warn_or_raise is expected to raise unless permissive;
                    # if it returns, skip the sample rather than dereference
                    # the missing attribute.
                    continue

                input_file = sample.log2_read_counts[resolution].format(
                    resolution=resolution)
                try:
                    sample_cov = pd.read_csv(
                        input_file, sep="\t", comment="#").set_index("Feature")
                except IOError:
                    e = IOError(
                        "Sample %s does not have a 'log2_read_counts' file: '%s'."
                        % (sample.name, input_file))
                    warn_or_raise(e, permissive)
                    # Skip this sample so no undefined or stale table is used.
                    continue

                # TODO: this is specific to CopyWriter, should be removed later
                # and probably replaced with the column position
                # The patterns are literal file-name fragments, so regex
                # interpretation is disabled explicitly.
                sample_cov.columns = (
                    sample_cov.columns
                    .str.replace("log2.", "", regex=False)
                    .str.replace(".trimmed.bowtie2.filtered.bam", "", regex=False)
                    .str.replace(".merged.sorted.subsample.bam", "", regex=False))

                # normalize signal to control
                # # TODO: check whether there was a reason I was previously
                # # undoing and redoing the log
                # matrix_raw[resolution][sample.name] = np.log2(
                #     (
                #         (0.1 + (2 ** cov.loc[:, sample.name]))
                #         / (0.1 + (2 ** cov.iloc[:, -1]))
                #     )
                # )
                cov = sample_cov
                matrix_raw[resolution][
                    sample.name] = cov.loc[:, sample.name] - cov.iloc[:, -1]
            if cov is None:
                msg = "None of the samples had a valid 'log2_read_counts' file."
                _LOGGER.error(msg)
                raise ValueError(msg)

            # first three columns of the coverage table are used as coordinates
            c = cov.columns.tolist()
            c[:3] = ["chrom", "start", "end"]
            cov.columns = c
            matrix_raw[resolution].index = bed_to_index(cov)

            if save:
                matrix_raw[resolution].to_csv(
                    os.path.join(
                        self.results_dir,
                        self.name + ".{}.matrix_raw.csv".format(resolution),
                    ),
                    index=True,
                )

        if assign:
            self.matrix_raw = matrix_raw

        return matrix_raw