Example 1
def create_requirements_file(project_name, project_dir, requirements=None, overwrite=False):
    """
    Create a requirements.txt file with pip requirements.
    """

    def get_current_requirements():
        import requests
        import ngs_toolkit  # needed for ngs_toolkit.__version__ below

        package_name = "ngs-toolkit"
        url = "https://pypi.python.org/pypi/" + str(package_name) + "/json"
        data = requests.get(url).json()
        requirements = [
            x.replace(r" ", "").replace("(", "").replace(")", "")
            for x in data["info"]["requires_dist"]
            if "extra" not in x
        ]
        requirements.append("ngs_toolkit=={}".format(ngs_toolkit.__version__))
        return requirements

    if requirements is None:
        requirements = get_current_requirements()

    requirements_file = os.path.join(project_dir, "requirements.txt")

    if os.path.exists(requirements_file):
        if not overwrite:
            _LOGGER.warning("'requirements.txt' file already existing, skipping.")
            return

    requirements_filecontent = "\n".join(requirements)

    # write requirements file
    with open(requirements_file, "w", 1) as handle:
        handle.write(textwrap.dedent(requirements_filecontent) + "\n")
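# Usage sketch (hypothetical values, not from the source; assumes the
# module-level `os`, `textwrap` and `_LOGGER` the function relies on are
# configured as in ngs_toolkit):
create_requirements_file(
    project_name="my_project",               # hypothetical project name
    project_dir="/tmp/my_project",           # hypothetical, must exist
    requirements=["pandas>=1.0", "pyyaml"],  # explicit list skips the PyPI lookup
    overwrite=True,
)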
Example 2
def warn_or_raise(exception, permissive=False):
    """Log an exception as a warning if `permissive`; otherwise log it as an error and raise it."""
    from ngs_toolkit import _LOGGER

    msg = exception.args[0]
    if permissive:
        _LOGGER.warning(msg)
    else:
        _LOGGER.error(msg)
        raise exception
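# Usage sketch (hypothetical): the same exception either warns or propagates
# depending on `permissive`.
try:
    raise ValueError("Sample sheet is missing the 'genome' column.")
except ValueError as e:
    warn_or_raise(e, permissive=True)  # logs a warning and execution continues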
Example 3
def count_reads_in_intervals(bam, intervals, permissive=True):
    """
    Count total number of reads in an iterable holding strings
    representing genomic intervals of the form ``"chrom:start-end"``.

    Please make sure ``intervals`` and the ``bam`` file use the same
    coordinate convention (zero- or one-based).

    Parameters
    ----------
    bam : :obj:`str`
        Path to BAM file.

    intervals : :obj:`list`
        List of strings with genomic coordinates in format
        ``"chrom:start-end"``.
    permissive : :obj:`bool`
        Whether intervals that cannot be counted should be skipped,
        with a summary warning at the end, instead of raising.

    Returns
    -------
    :obj:`dict`
        Dict of read counts for each interval.
    """
    import pysam
    from ngs_toolkit import _LOGGER

    counts = dict()

    bam = pysam.AlignmentFile(bam, mode="rb")

    errors: int = 0
    for interval in intervals:
        try:
            counts[interval] = bam.count(region=interval)
        except ValueError:
            if permissive:
                errors += 1
            else:
                raise
            # if fix_off_by_one:
            #     i = interval.split(":")[1]
            #     s = (
            #         interval.split(":")[0] +
            #         ":" + str(int(i.split("-")[0]) + 1) +
            #         "-" + str(int(i.split("-")[1]) + 1))
            #     counts[interval] = bam.count(region=s)
    bam.close()
    if errors > 0:
        _LOGGER.warning("There have been %i errors. Beware.", errors)

    return counts
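# Usage sketch (hypothetical paths/regions; the BAM must be coordinate-sorted
# and indexed for pysam's region lookup to work):
counts = count_reads_in_intervals(
    "sample1.bam",
    ["chr1:1000-2000", "chr2:5000-6000"],
    permissive=True,
)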
Example 4
def get_this_file_or_timestamped(file, permissive=True):
    """
    Get a path to an existing timestamped file based on a non-timestamped path.

    Parameters
    ----------
    file : :obj:`str`
        File name of analysis output to record.
    permissive : :obj:`bool`
        Whether failure to find a timestamped file should return the original file
        or raise an IndexError.

    Raises
    ----------
    IndexError
        If not `permissive` and can't find timestamped file.
    """
    from glob import glob
    import re

    from ngs_toolkit.utils import sorted_nicely
    from ngs_toolkit import _LOGGER

    split = file.split(".")
    body = ".".join(split[:-1])
    end = split[-1]

    res = sorted_nicely(glob(body + "*" + end))
    res = [x for x in res if re.search(body + r"\.\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.", x)]
    if len(res) > 1:
        _LOGGER.warning(
            "Could not get unequivocal timestamped file for '{}'.".format(file)
            + " Returning latest: '{}'.".format(res[-1])
        )
    try:
        # get newest file
        return res[-1]
    except IndexError:
        if permissive:
            return file
        else:
            msg = "Could not remove timestamp from file path."
            msg += " Probabably it does not exist."
            _LOGGER.error(msg)
            raise IndexError(msg)
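# Usage sketch (hypothetical file names): resolves e.g. "analysis.csv" to a
# timestamped sibling such as "analysis.2020-01-31-12:00:00.csv" if one exists,
# otherwise returns the original path when permissive.
path = get_this_file_or_timestamped("analysis.csv", permissive=True)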
Example 5
    def collect_esat_output(self, samples=None, permissive=True):
        """
        Collect gene expression (read counts, gene-level) output from ESAT
        into expression matrix for `samples`.
        """
        if samples is None:
            samples = self.samples

        first = True
        for sample in samples:
            fname = os.path.join(
                sample.sample_root,
                "ESAT_{}".format(sample.genome),
                sample.name + ".gene.txt",
            )
            try:
                c = pd.read_csv(fname, sep="\t")
            except IOError:
                if permissive:
                    # warn with the path that was actually attempted
                    _LOGGER.warning("Sample '%s' is missing file: %s",
                                    sample.name, fname)
                    continue
                else:
                    raise
            # extract only gene ID and counts
            c = c[["Symbol", "Exp1"]]
            c = c.rename(columns={
                "Symbol": "gene_symbol",
                "Exp1": sample.name
            }).set_index("gene_symbol")

            # Append
            if first:
                expr = c
            else:
                expr = expr.join(c)
            first = False

        return expr.sort_index()
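# Usage sketch (hypothetical): with an RNA-seq analysis whose samples have
# ESAT output on disk, collect a gene-by-sample count matrix.
expression = rnaseq_analysis.collect_esat_output(permissive=True)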
Example 6
    def __init__(
        self,
        name=None,
        from_pep=False,
        from_pickle=False,
        root_dir=None,
        data_dir="data",
        results_dir="results",
        prj=None,
        samples=None,
        **kwargs
    ):
        # The check for existence is to make sure other classes can inherit from this
        default_args = {
            "data_type": "ChIP-seq",
            "__data_type__": "ChIP-seq",
            "var_unit_name": "region",
            "quantity": "binding",
            "norm_units": "RPM"}
        for k, v in default_args.items():
            if not hasattr(self, k):
                setattr(self, k, v)

        super(ChIPSeqAnalysis, self).__init__(
            name=name,
            from_pep=from_pep,
            from_pickle=from_pickle,
            root_dir=root_dir,
            data_dir=data_dir,
            results_dir=results_dir,
            prj=prj,
            samples=samples,
            **kwargs
        )

        if hasattr(self, "comparison_table"):
            self.set_comparisons()
        else:
            msg = "No comparison table was given. Will not prefill peak calling comparisons."
            _LOGGER.warning(msg)
Example 7
# Detect whether we are running on a development branch:
# first via CI environment variables, then by asking git directly.
DEV = False
try:
    DEV = os.environ["TRAVIS_BRANCH"] == "dev"
except KeyError:
    pass
try:
    DEV = os.environ["GITHUB_REF"] == "dev"
except KeyError:
    import subprocess

    try:
        o = subprocess.check_output("git status".split(" "))
        DEV = "dev" in o.decode().split("\n")[0]
    except subprocess.CalledProcessError:
        msg = "Could not detect whether on a development branch."
        _LOGGER.warning(msg)

# Test-specific options
# # Note:
# # The DESeq2 1.24.0 version in Debian archives
# # differs from the DESeq2 1.24.0 version in bioconductor version 3.9
# # If estimateDispersions with default fitType="parametric" fails,
# # (as often happens with the quickly generated synthetic data from tests),
# # it tries to use local fit using the locfit package, but in Debian
# # version this is not a valid choice of fit, causing failure.
# # Due to this, and since I'm using Debian packages for faster testing
# # I'm manually setting fitType="mean" for testing only.
Analysis.differential_analysis = partialmethod(
    Analysis.differential_analysis, deseq_kwargs={"fitType": "mean"})
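# For reference, a minimal self-contained sketch (not from the source) of what
# `partialmethod` does above: it pre-binds keyword arguments on the class so
# every later call picks up the new default.
from functools import partialmethod

class _Greeter:
    def greet(self, name, punctuation="!"):
        return "Hello, " + name + punctuation

# Pre-bind `punctuation`, as done above for `deseq_kwargs`.
_Greeter.greet = partialmethod(_Greeter.greet, punctuation="?")
assert _Greeter().greet("world") == "Hello, world?"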

# This is part of the "example" config that is required for some analyses
Example 8
    def calculate_peak_support(
            self, samples=None, region_type="summits", peak_type="filtered", permissive=True,
            comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"):
        """
        Calculate a measure of support for each region in the peak set
        (i.e., the fraction of samples whose peaks overlap each region in the union set of peaks).

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`, optional
            DataFrame with signal/background combinations used to call peaks

            Defaults to analysis' own `comparison_table`.
        peak_dir : :obj:`str`, optional
            Path to peaks output directory.
            Defaults to {analysis.results_dir}/chipseq_peaks
        samples: :obj:`list`
            Not used. Provided for compatibility with ATACSeqAnalysis class.
        region_type: :obj:`str`
            Not used. Provided for compatibility with ATACSeqAnalysis class.
        permissive: :obj:`bool`
            Not used. Provided for compatibility with ATACSeqAnalysis class.

        Attributes
        ----------
        support : :obj:`pandas.DataFrame`
            DataFrame with signal/background combinations used to call peaks
        """
        import pybedtools
        from tqdm import tqdm
        from ngs_toolkit.utils import bed_to_index

        if comparison_table is None:
            comparison_table = self.comparison_table

        peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

        # get index
        index = bed_to_index(self.sites.to_dataframe())

        # calculate support (number of samples overlapping each merged peak)
        support = pd.DataFrame(index=index)
        for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):
            for peak_caller, peak_file in comp['peak_calls'][peak_type].items():
                try:
                    sample_support = self.sites.intersect(peak_file, wa=True, c=True).to_dataframe()
                except (
                    ValueError,
                    pybedtools.MalformedBedLineError,
                    pybedtools.helpers.BEDToolsError,
                ):
                    _LOGGER.warning(
                        "Peaks for comparison %s (%s) not found!", name, peak_file)
                    if permissive:
                        continue
                    else:
                        raise
                sample_support.index = index
                support[(name, peak_caller)] = sample_support.iloc[:, 3]

        # Make multiindex labeling comparisons and peak type
        support.columns = pd.MultiIndex.from_tuples(
            support.columns, names=["comparison", "peak_caller"]
        )
        support.to_csv(
            os.path.join(
                self.results_dir, self.name + "_peaks.binary_overlap_support.csv"
            ),
            index=True,
        )

        # divide sum (of unique overlaps) by total to get support value between 0 and 1
        support["support"] = support.astype(bool).sum(axis=1) / float(support.shape[1])

        # save
        support.to_csv(
            os.path.join(self.results_dir, self.name + "_peaks.support.csv"), index=True
        )

        self.support = support
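# Usage sketch (hypothetical): assumes comparisons were set up (see
# `set_comparisons`) and a consensus peak set exists in `analysis.sites`.
chipseq_analysis.calculate_peak_support(peak_type="filtered")
support = chipseq_analysis.support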
Example 9
    def get_consensus_sites(
            self,
            samples=None,
            region_type="summits",
            peak_type="filtered",
            extension=250,
            blacklist_bed=None,
            filter_chroms=True,
            permissive=False,
            save=True,
            assign=True,
            **kwargs):
        """
        Get consensus (union) of enriched sites (peaks) across all comparisons.
        There are two modes possible, defined by the value of ``region_type``:

         * peaks: simple union of all sites;
         * summits: peak summits are extended by ``extension`` and a union is made.

        For ChIP-seq, the ``comparison_table`` keyword argument or a
        ``comparison_table`` attribute set is required. Peaks/summits will be
        aggregated for the peaks called in each sample comparison.

        Parameters
        ----------
        samples : :obj:`list`
            Iterable of :class:`peppy.Sample` objects to restrict to.
            Must have a ``peaks`` attribute set.

            Defaults to all samples in the analysis (``samples`` attribute).
        region_type : :obj:`str`
            The type of region to use to create the consensus region set
            - one of "summits" or "peaks".
            If "summits", peak summits will be extended by ``extension``
            before union.
            If "peaks", sample peaks will be used with no modification prior to
            union.

            Default is "summits".
        extension : :obj:`int`
            Amount to extend peaks summits by in both directions.

            Default is 250.
        blacklist_bed : {:obj:`False`, :obj:`str`}
            Either :obj:`False` or a path to a BED file with genomic positions
            to exclude from consensus peak set.

            Default is to use a blacklist file for the analysis ``genome``.
        filter_chroms : {:obj:`list`, :obj:`str`}
            A list of chromosomes to filter out or
            a string with a pattern to match to exclude chromosomes.
            Uses Pandas string methods :class:`pandas.Series.str.match`.
            Pass for example `'.*_.*|chrM'` to filter out chromosomes with a "_"
            character and a "chrM" chromosome.

            Default is not to filter anything.
        permissive : :obj:`bool`
            Whether samples whose ``region_type`` file does not exist
            should simply be skipped or an error thrown.

        comparison_table : :obj:`pandas.DataFrame`, optional
            DataFrame with signal/background combinations used to call peaks.
            Part of kwargs.

            Defaults to analysis own ``comparison_table``.
        peak_dir : :obj:`str`, optional
            Path to peaks output directory. Part of kwargs.

            Defaults to "{analysis.results_dir}/chipseq_peaks".

        Attributes
        ----------
        sites : :class:`pybedtools.BedTool`
            Bedtool with consensus sites.
        """
        import re
        from ngs_toolkit.general import get_blacklist_annotations
        import pybedtools
        from tqdm import tqdm
        import tempfile

        if "comparison_table" not in kwargs:
            # TODO: allow not requiring peak_dir to be passed if specifying a new table
            self.set_comparisons(kwargs["comparison_table"], peak_dir=kwargs["peak_dir"])

        if region_type not in ["summits", "peaks"]:
            msg = "`region_type` attribute must be one of 'summits' or 'peaks'!"
            _LOGGER.error(msg)
            raise ValueError(msg)

        if blacklist_bed is None:
            _LOGGER.info("Blacklist file not provided. Downloading...")
            try:
                blacklist_bed = get_blacklist_annotations(self.organism, self.genome)
            except AttributeError:
                msg = "Blacklist file was not provided and cannot"
                msg += " get one without analysis having `organism` and `genome` set."
                _LOGGER.error(msg)
                raise AttributeError(msg)

        # Simply concatenate all peaks in one file
        f = tempfile.NamedTemporaryFile()
        with open(f.name, "a") as handle:
            for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):
                for peak_caller, peak_file in comp['peak_calls'][peak_type].items():
                    try:
                        # TODO: check if homer has summits and they match this pattern
                        summit = re.sub("_peaks.narrowPeak", "_summits.bed", peak_file)
                        file = (
                            pybedtools.BedTool(summit).slop(b=extension, genome=comp['genome']).fn
                            if region_type == "summits"
                            else peak_file)
                    except (ValueError, FileNotFoundError):
                        _LOGGER.warning(
                            "Input file for comparison %s (%s) not found!", name, peak_file)
                        if not permissive:
                            raise
                        continue

                    with open(file, "r") as fh:
                        for line in fh:
                            handle.write(line)

        # Merge overlapping peaks across comparisons
        sites = pybedtools.BedTool(f.name).sort().merge()

        # Filter
        # # remove blacklist regions
        if blacklist_bed is not False:
            if not isinstance(blacklist_bed, pybedtools.BedTool):
                blacklist_bed = pybedtools.BedTool(blacklist_bed)
            sites = sites.intersect(v=True, b=blacklist_bed)

        # # filter requested chromosomes
        if filter_chroms is not None:
            if isinstance(filter_chroms, list):
                sites = sites.filter(lambda x: x.chrom not in filter_chroms).saveas()
            elif isinstance(filter_chroms, str):
                s = sites.to_dataframe()
                sites = pybedtools.BedTool.from_dataframe(s.loc[~s['chrom'].str.match(filter_chroms)])

        # Save and assign
        if save:
            output_file = os.path.join(self.results_dir, self.name + ".peak_set.bed")
            sites.saveas(output_file)
            sites = pybedtools.BedTool(output_file)
        if assign:
            self.sites = sites
        return sites
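# Usage sketch (hypothetical): build a consensus set from summits extended by
# 250 bp on each side, filtering out unplaced contigs and chrM.
sites = chipseq_analysis.get_consensus_sites(
    region_type="summits", extension=250, filter_chroms=r".*_.*|chrM")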
Example 10
    def summarize_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        filtered=True,
        permissive=True,
    ):
        """
        Summarize peaks called for ChIP-seq samples, given an annotation of which samples
        belong in each comparison and which samples represent signal or background.

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`, optional
            Comparison table with the following required columns:
            "comparison_name", "sample_name", "comparison_side", "sample_group".

            Defaults to analysis' own `comparison_table`.
        output_dir : :obj:`str`
            Parent directory where peaks will be created. Will be created if does not exist.
        permissive: :obj:`bool`
            If incomplete/incoherent comparisons should be skipped or an error should be thrown.

        Raises
        ----------
        ValueError
            Will be raised if not `permissive` and incomplete/incoherent comparisons are detected.
        """
        from ngs_toolkit.utils import homer_peaks_to_bed

        if comparison_table is None:
            comparison_table = self.comparison_table

        req_columns = [
            "comparison_name",
            "sample_name",
            "comparison_side",
            "sample_group",
        ]
        msg = "Comparison table is missing some of the following columns: '{}'.".format(
            ",".join(req_columns)
        )
        if not all([col in comparison_table.columns for col in req_columns]):
            _LOGGER.error(msg)
            raise AssertionError(msg)

        # Complement default `output_dir`
        if "{results_dir}" in output_dir:
            output_dir = os.path.abspath(
                output_dir.format(results_dir=self.results_dir)
            )

        # For each comparison, count called peaks
        peak_type = "filtered" if filtered else "original"
        peak_counts = list()
        for name, comp in self.comparisons.items():
            _LOGGER.info(name)

            for peak_caller, file in comp['peak_calls'][peak_type].items():
                error = "Peak files for comparison '%s' with '%s' parameters don't exist."

                if "homer" in peak_caller and not filtered:
                    try:
                        homer_peaks_to_bed(file, file.replace("narrowPeak", "bed"))
                    except IOError:
                        if permissive:
                            _LOGGER.warning(error, name, peak_caller)
                            peak_counts.append([name, peak_caller, np.nan])
                            continue
                        else:
                            raise
                    except pd.errors.EmptyDataError:
                        peak_counts.append([name, peak_caller, 0.0])
                        continue
                    file = file.replace("narrowPeak", "bed")
                try:
                    df = pd.read_csv(file, sep="\t")
                except IOError:
                    if permissive:
                        _LOGGER.warning(error, name, peak_caller)
                        peak_counts.append([name, peak_caller, np.nan])
                        continue
                    else:
                        raise
                except pd.errors.EmptyDataError:
                    peak_counts.append([name, peak_caller, 0.0])
                    continue
                peak_counts.append([name, peak_caller, df.shape[0]])
        peak_counts = pd.DataFrame(peak_counts, columns=["comparison_name", "peak_caller", "peak_counts"])

        return peak_counts  # .fillna(0)
Example 11
    def call_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        permissive=True,
        overwrite=True,
        distributed=True,
    ):
        """
        Call peaks for ChIP-seq samples using an annotation of which samples
        belong in each comparison and which samples represent signal or background.

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`
            Comparison table with the following required columns:
            "comparison_name", "sample_name", "comparison_side", "sample_group".

            Defaults to analysis' own `comparison_table`.
        output_dir : :obj:`str`
            Parent directory where peaks will be created.

            Will be created if does not exist.
        permissive: :obj:`bool`
            If incomplete/incoherent comparisons should be skipped or an error should be thrown.

            Default is :obj:`True`.
        overwrite: :obj:`bool`
            Whether existing peak call files should be overwritten.

            Default is :obj:`True`.
        distributed: :obj:`bool`
            Whether peak calling should be run in serial or in distributed mode as jobs.

            Default is :obj:`True`.

        Raises
        ----------
        ValueError
            If not `permissive` and incomplete/incoherent comparisons are detected.
        """
        import subprocess

        from ngs_toolkit.utils import (
            macs2_call_chipseq_peak,
            homer_call_chipseq_peak_job,
            filter_kwargs_by_callable
        )
        from tqdm import tqdm

        if comparison_table is None:
            comparison_table = self.comparison_table
        req_columns = [
            "comparison_name",
            "sample_name",
            "comparison_side",
            "sample_group",
        ]
        msg = "Comparison table is missing some of the following columns: '{}'.".format(
            ",".join(req_columns)
        )
        if not all([col in comparison_table.columns for col in req_columns]):
            _LOGGER.error(msg)
            raise AssertionError(msg)

        # Complement default `output_dir`
        if "{results_dir}" in output_dir:
            output_dir = os.path.abspath(
                output_dir.format(results_dir=self.results_dir)
            )
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # For each comparison
        for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):

            _LOGGER.info(
                "Doing comparison '{}' with positive samples '{}' and background samples '{}'".format(
                    name,
                    [s.name for s in comp['signal_samples']],
                    [s.name for s in comp['control_samples']],
                )
            )
            # Call peaks
            cmds = list()
            bkws = filter_kwargs_by_callable(comp, macs2_call_chipseq_peak)
            kwargs = {
                "name": name, "distributed": distributed, **bkws}
            if overwrite:
                cmds += [macs2_call_chipseq_peak(**kwargs), homer_call_chipseq_peak_job(**kwargs)]
            else:
                if not os.path.exists(comp['peak_calls']['original']['macs']):
                    cmds += [macs2_call_chipseq_peak(**kwargs)]
                if not os.path.exists(comp['peak_calls']['original']['homer_factor']):
                    cmds += [homer_call_chipseq_peak_job(**kwargs)]
                else:
                    _LOGGER.warning("Peak files for comparison '%s' already exist. Skipping.", name)
            if not distributed:
                for cmd in cmds:
                    _LOGGER.info("Calling peaks for comparison '%s' with command: '%s'.\n", (name, cmd))
                    subprocess.call(cmd.split(" "))
Example 12
    def set_comparisons(self, comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"):
        """
        Set up an attribute containing information about the
        sample comparisons necessary for peak calling.

        Structure:

            * comparison_name:
                * signal_samples
                * control_samples
                * output_dir
                * prefix
                * genome
                * peak_calls
                    * original (macs, homer_factor, homer_histone)
                    * filtered (same keys, filtered BED files)

        Parameters
        ----------
        comparison_table : :obj:`str`, optional
            Comparison table with peak comparisons.

            Defaults to one from PEP project if available.
        peak_dir : :obj:`str`, optional
            Directory with peak calls.

            Defaults to "{results_dir}/chipseq_peaks".

        Returns
        -------
        :obj:`dict`
            The dictionary with the attributes.

        Attributes
        ----------
        :obj:`dict`
            The dictionary with the attributes.

        Raises
        ------
        ValueError
            If comparisons are not correctly specified.
        """
        if comparison_table is None:
            comparison_table = self.comparison_table

        comparison_names = (
            comparison_table.loc[
                comparison_table['comparison_type'] == 'peaks',
                "comparison_name"]
            .drop_duplicates().sort_values()).tolist()
        if not comparison_names:
            _LOGGER.warning("Could not find any comparisons of type 'peak'.")

        peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

        self.comparisons = dict()
        for name in comparison_names:
            _LOGGER.info("Setting comparison '%s' up", name)

            # If there aren't two sides to each comparison, skip it or throw error
            if len(set(comparison_table.query("comparison_name == '{}'".format(name))["comparison_side"])) != 2:
                error = "Comparison '{}' does not contain two sides.".format(name)
                _LOGGER.error(error)
                raise ValueError(error)

            # Get the sample names of samples in each side
            pos_names = comparison_table.loc[
                (comparison_table["comparison_name"] == name) & (comparison_table["comparison_side"] == 1),
                "sample_name"].tolist()
            neg_names = comparison_table.loc[
                (comparison_table["comparison_name"] == name) & (comparison_table["comparison_side"] < 1),
                "sample_name"].tolist()

            signal_samples = [s for s in self.samples if s.name in pos_names]
            control_samples = [s for s in self.samples if s.name in neg_names]

            co = dict()
            co['signal_samples'] = signal_samples
            co['control_samples'] = control_samples

            # Additional info
            co['output_dir'] = os.path.join(peak_dir, name)
            co['prefix'] = os.path.join(co['output_dir'], name)
            g = comparison_table.query("comparison_name == '{}'".format(name))['comparison_genome'].drop_duplicates().squeeze()
            if not isinstance(g, str):
                msg = "Could not determine genome of comparison '%s'." % g
                _LOGGER.error(msg)
                raise AssertionError(msg)
            co['genome'] = g

            # resulting files
            res = dict()
            res['macs'] = co['prefix'] + "_peaks.narrowPeak"
            res["homer_factor"] = co['prefix'] + "_homer_peaks.factor.narrowPeak"
            res["homer_histone"] = co['prefix'] + "_homer_peaks.histone.narrowPeak"
            co['peak_calls'] = dict()
            co['peak_calls']["original"] = res
            co['peak_calls']["filtered"] = {
                k: v.replace(".narrowPeak", ".filtered.bed")
                for k, v in res.items()}

            self.comparisons[name] = co
        return self.comparisons
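# Usage sketch (hypothetical data) of the comparison table columns this method
# queries; side 1 marks signal samples, side < 1 marks background samples.
import pandas as pd

comparison_table = pd.DataFrame({
    "comparison_name": ["KO_vs_WT", "KO_vs_WT"],
    "comparison_type": ["peaks", "peaks"],
    "comparison_side": [1, 0],
    "sample_name": ["KO_rep1", "WT_rep1"],
    "comparison_genome": ["hg38", "hg38"],
})
chipseq_analysis.set_comparisons(comparison_table)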
Example 13
    def collect_bitseq_output(self,
                              samples=None,
                              permissive=True,
                              expression_type="counts"):
        """
        Collect gene expression (read counts, transcript-level) output from Bitseq
        into expression matrix for `samples`.
        """
        # TODO: drop support for legacy pipeline output and assume one input file with all required columns
        # TODO: add support for RPKM
        if samples is None:
            samples = self.samples

        if expression_type != "counts":
            msg = "`expression_type` must be 'counts'!"
            _LOGGER.error(msg)
            raise NotImplementedError(msg)

        expr = list()
        for i, sample in enumerate(samples):
            _LOGGER.debug(
                "Reading transcriptome files for sample '{}'.".format(
                    sample.name))
            tr_file = os.path.join(
                sample.sample_root,
                "bowtie1_{}".format(sample.transcriptome),
                "bitSeq",
                sample.name + ".tr",
            )
            counts_file = os.path.join(
                sample.sample_root,
                "bowtie1_{}".format(sample.transcriptome),
                "bitSeq",
                sample.name + ".counts",
            )

            # read the "tr" file of one sample to get indexes
            try:
                tr = pd.read_csv(
                    tr_file,
                    sep=" ",
                    header=None,
                    skiprows=1,
                    names=[
                        "ensembl_gene_id", "ensembl_transcript_id", "v1", "v2"
                    ],
                )
            except IOError:
                msg = "Could not open file '{}'' is missing.".format(tr_file)
                if permissive:
                    _LOGGER.warning(msg)
                    continue
                else:
                    raise

            # read the "counts" file of one sample to get indexes
            try:
                e = pd.read_csv(counts_file, sep=" ")
            except IOError:
                msg = "Could not open file '{}'' is missing.".format(
                    counts_file)
                if permissive:
                    _LOGGER.warning(msg)
                    continue
                else:
                    raise

            e = tr.drop(["v1", "v2"], axis=1).join(e)
            e.loc[:, "sample_name"] = sample.name
            expr.append(e)

        if len(expr) == 0:
            msg = "No sample had a valid expression file!"
            if permissive:
                _LOGGER.warning(msg)
                return
            else:
                _LOGGER.error(msg)
                raise IOError(msg)

        expr = (pd.concat(expr, axis=0, sort=False).melt(id_vars=[
            "ensembl_gene_id", "ensembl_transcript_id", "sample_name"
        ]).pivot_table(
            index=["ensembl_gene_id", "ensembl_transcript_id"],
            columns="sample_name",
            values="value",
            fill_value=0,
        ).astype(int, downcast=True))

        return expr
Example 14
def plot_features(
    analysis=None,
    knockout_genes=None,
    matrix="matrix_norm",
    samples=None,
    differential_results=None,
    output_dir=None,
    output_prefix="knockout_expression",
):
    """
    Plot expression of genes in samples or sample groups.

    Parameters
    ----------

    analysis : :class:`ngs_toolkit.RNASeqAnalysis`, optional
        Analysis object.

        Not required if `matrix` is given.
    knockout_genes : :obj:`list`, optional
        List of perturbed genes to plot.

        Defaults to the set of `knockout` attributes in the analysis'
        samples if `analysis` is given. Otherwise must be given.
    matrix : str, optional
        Matrix with expression values to use.

        Defaults to "matrix_norm"
    samples : :obj:`list`, optional
        Samples to restrict the matrix to.

        Defaults to :obj:`None` (use all samples).
    differential_results : :obj:`pandas.DataFrame`, optional
        Differential results with "comparison_name", "padj" and
        "log2FoldChange" columns, indexed by gene.

        Defaults to the analysis' own ``differential_results``, if present.
    output_dir : :obj:`str`, optional
        Directory for output files.

        Defaults to the analysis' ``results_dir``, or the current directory.
    output_prefix : str, optional
        Prefix for output files.

        Defaults to "knockout_expression"
    """
    from ngs_toolkit.graphics import clustermap_varieties

    if (analysis is None) and (matrix is None):
        raise AssertionError("One of `analysis` or `matrix` must be provided.")

    msg = "If an `analysis` object is not provided, you must provide a list of `knockout_genes`."
    if (analysis is None) and (knockout_genes is None):
        raise AssertionError(msg)
    elif (analysis is not None) and (knockout_genes is None):
        msg = "If `knockout_genes` is not given, Samples in `analysis` must have a `knockout` attribute."
        try:
            knockout_genes = list(set([s.knockout for s in analysis.samples]))
        except KeyError as e:
            # re-raise with the explanatory message
            raise KeyError(msg) from e

    matrix = analysis.get_matrix(matrix=matrix, samples=samples)

    if output_dir is None:
        if analysis is not None:
            output_dir = analysis.results_dir
        else:
            output_dir = os.path.curdir

    knockout_genes = sorted(knockout_genes)

    missing = [k for k in knockout_genes if k not in matrix.index]
    msg = "Some `knockout_genes` were not found in the expression matrix: '%s'"
    if len(missing) > 0:
        _LOGGER.warning(msg % ", ".join(missing))
    knockout_genes = [k for k in knockout_genes if k in matrix.index]

    ko = matrix.loc[knockout_genes, :]
    msg = "None of the `knockout_genes` were found in the expression matrix.\nCannot proceed."
    if ko.empty:
        _LOGGER.warning(msg)
        return

    # expression values
    clustermap_varieties(ko,
                         output_dir=output_dir,
                         output_prefix=output_prefix)

    # p-values and fold-changes for knockout genes
    if differential_results is None:
        differential_results = getattr(analysis, "differential_results", None)
    if differential_results is None:
        return

    if len(differential_results["comparison_name"].unique()) <= 1:
        msg = "Could not plot values per comparison as only one found!"
        _LOGGER.warning(msg)
        return

    # p-values
    p_table = pd.pivot_table(
        differential_results.loc[knockout_genes, :].reset_index(),
        index="comparison_name",
        columns="index",
        values="padj",
    )
    p_table.index.name = "Knockout gene"
    p_table.columns.name = "Gene"
    p_table = -np.log10(p_table.loc[:, knockout_genes].dropna())
    p_table = p_table.replace(np.inf, p_table[p_table != np.inf].max().max())
    p_table = p_table.replace(-np.inf, 0)

    clustermap_varieties(
        p_table,
        output_dir=output_dir,
        output_prefix=output_prefix + ".p_value",
        quantity="-log10(FDR p-value)",
    )
    clustermap_varieties(
        p_table,
        output_dir=output_dir,
        output_prefix=output_prefix + ".p_value.thresholded",
        steps=["base", "sorted"],
        quantity="-log10(FDR p-value)",
        vmax=1.3 * 5,
    )

    # logfoldchanges
    fc_table = pd.pivot_table(
        differential_results.loc[knockout_genes, :].reset_index(),
        index="comparison_name",
        columns="index",
        values="log2FoldChange",
    )
    fc_table.index.name = "Knockout gene"
    fc_table.columns.name = "Gene"
    fc_table = fc_table.loc[:, knockout_genes].dropna()

    clustermap_varieties(
        fc_table,
        output_dir=output_dir,
        output_prefix=output_prefix + "log_fc",
        steps=["base", "sorted"],
        quantity="log2(fold-change)",
    )
    clustermap_varieties(
        fc_table,
        output_dir=output_dir,
        output_prefix=output_prefix + "log_fc.thresholded",
        steps=["base", "sorted"],
        quantity="log2(fold-change)",
        vmin=-2,
        vmax=2,
    )
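# Usage sketch (hypothetical): plot expression of two knockout genes from an
# existing RNA-seq analysis with a normalized matrix and differential results.
plot_features(
    analysis=rnaseq_analysis,
    knockout_genes=["TP53", "MYC"],
    output_prefix="knockout_expression",
)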
Example 15
    def _copy_cnv_profile_plots(
        self,
        output_dir="{results_dir}/cnv_profiles",
        output_prefix="log2_profile",
        resolutions=None,
        samples=None,
        permissive=True,
    ):
        """
        Convenience to copy output plots from running several samples independently
        to a given directory.

        Parameters
        ----------
        output_dir : :obj:`str`, optional
            Directory to copy to.

            Defaults to "{results_dir}/cnv_profiles".
        output_prefix : :obj:`str`, optional
            Prefix for copied files.

            Defaults to "log2_profile".
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        samples : :obj:`list`, optional
            Samples to restrict analysis to.

            Defaults to samples in Analysis object.
        permissive: :obj:`bool`, optional
            Whether missing files should be tolerated (a warning is issued)
            instead of raising an error.

            Defaults to :obj:`True`.
        """
        from tqdm import tqdm
        from glob import glob
        from shutil import copyfile

        if resolutions is None:
            resolutions = self.resolutions

        if samples is None:
            samples = self.samples

        output_dir = self._format_string_with_attributes(output_dir)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for resolution in tqdm(resolutions,
                               total=len(resolutions),
                               desc="Resolution"):

            for sample in tqdm(samples, total=len(samples), desc="Sample"):
                # Read log2 file
                if not hasattr(sample, "log2_read_counts"):
                    sample.log2_read_counts = os.path.join(
                        self.data_dir,
                        sample.sample_root,
                        sample.name + "_{resolution}",
                        "CNAprofiles",
                        "log2_read_counts.igv",
                    )
                if "{resolution}" in sample.log2_read_counts:
                    input_file = sample.log2_read_counts.format(
                        resolution=resolution)
                f = glob(input_file)
                if len(f) == 1:
                    f = f[0]
                else:
                    msg = "Sample '{}' does not have a PDF file!".format(
                        sample.name)
                    if permissive:
                        _LOGGER.warning(msg)
                        continue
                    else:
                        raise OSError(msg)

                d = os.path.join(
                    output_dir, sample.name + "." + resolution + "." +
                    output_prefix + ".pdf")
                try:
                    copyfile(f, d)
                except OSError:
                    msg = "Could not copy file '{}' to '{}'!".format(f, d)
                    if permissive:
                        _LOGGER.warning(msg)
                    else:
                        raise OSError(msg)
Example 16
    def load_data(
        self,
        output_map=None,
        only_these_keys=None,
        resolutions=None,
        prefix="{results_dir}/{name}",
        permissive=True,
    ):
        """
        Load the output files of the major functions of the Analysis.

        Parameters
        ----------
        output_map : :obj:`dict`
            Dictionary with {attribute_name: (file_path, kwargs)} to load the files.
            The kwargs in the tuple will be passed to :meth:`pandas.read_csv`.

            Defaults to what is required to read the keys in ``only_these_keys``.
        only_these_keys : :obj:`list`, optional
            Iterable of analysis attributes to load up.
            Possible attributes:

                * "matrix_raw"
                * "matrix_norm"
                * "matrix_features"
                * "differential_results"

            Defaults to all of the above.

        resolutions: :obj:`list`
            List of resolution strings to get data for.

            Defaults to value of ``resolutions`` attribute of Analysis.
        prefix : :obj:`str`, optional
            String prefix of files to load.
            Variables in curly braces will be formatted with attributes of analysis.

            Defaults to "{results_dir}/{name}".
        permissive : :obj:`bool`, optional
            Whether an error should be ignored if reading a file causes IOError.

            Default is :obj:`True`.

        Attributes
        ----------
        pandas.DataFrame
            Dataframes holding the respective data, available as attributes described
            in the `only_these_keys` parameter.

        Raises
        ----------
        IOError
            If not permissive and a file is not found
        """
        from ngs_toolkit.utils import fix_dataframe_header

        prefix = self._format_string_with_attributes(prefix)

        if resolutions is None:
            resolutions = self.resolutions

        if output_map is None:
            kwargs = {"index_col": 0}
            output_map = {
                "matrix_raw": {
                    r: (prefix + ".{}.matrix_raw.csv".format(r), kwargs)
                    for r in resolutions
                },
                "matrix_norm": {
                    r: (prefix + ".{}.matrix_norm.csv".format(r), kwargs)
                    for r in resolutions
                },
                "segmentation": {
                    r: (prefix + ".{}.segmentation.csv".format(r), {})
                    for r in resolutions
                },
                "segmentation_annot": {
                    r:
                    (prefix + ".{}.segmentation.annotated.csv".format(r), {})
                    for r in resolutions
                },
            }
        if only_these_keys is None:
            only_these_keys = list(output_map.keys())

        output_map = {
            k: v
            for k, v in output_map.items() if k in only_these_keys
        }

        for name, f in output_map.items():
            for resolution, (file, kwargs) in f.items():
                file = file.format(resolution)
                _LOGGER.info(
                    "Loading '{}' analysis attribute for resolution '{}'.".
                    format(name, resolution))
                if not hasattr(self, name):
                    setattr(self, name, {resolution: None})
                try:
                    getattr(self,
                            name)[resolution] = pd.read_csv(file, **kwargs)
                    # Fix possible multiindex for matrix_norm
                    if name == "matrix_norm":
                        getattr(self, name)[resolution] = fix_dataframe_header(
                            getattr(self, name)[resolution])
                except IOError as e:
                    if not permissive:
                        raise e
                    else:
                        _LOGGER.warning(e)
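# Usage sketch (hypothetical resolutions): reload previously saved matrices for
# a CNV analysis, ignoring files that are missing.
cnv_analysis.load_data(
    only_these_keys=["matrix_raw", "matrix_norm"],
    resolutions=["100kb", "1Mb"],
    permissive=True,
)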