Ejemplo n.º 1
0
def all_to_igv(matrix, output_prefix, **kwargs):
    """
    Convert CNV data at several resolutions to IGV format.

    Parameters
    ----------
    matrix : :obj:`dict`
        Mapping of resolution to the :obj:`pandas.DataFrame` with the CNV
        data of that resolution.

    output_prefix : str
        Prefix of the output files. The file of each resolution is named
        "<output_prefix>_<resolution>.igv".

    **kwargs : :obj:`dict`, optional
        Additional parameters will be passed to ngs_toolkit.cnv.to_igv

    Returns
    -------
    dict
        Dictionary of CNV data in IGV format for each resolution.
    """
    from tqdm import tqdm

    resolutions = matrix.keys()
    progress = tqdm(resolutions, total=len(resolutions), desc="Resolution")

    converted = dict()
    for res in progress:
        _LOGGER.info(
            "Making IGV visualization for resolution '{}'.".format(res))
        target = "{}_{}.igv".format(output_prefix, res)
        converted[res] = to_igv(matrix[res], output_file=target, **kwargs)
    return converted
Ejemplo n.º 2
0
def to_igv(matrix, output_file=None, save=True, view_limits=(-2, 2)):
    """
    Convert DataFrame with CNV data to IGV format.

    The index of ``matrix`` is parsed as "<chromosome>:<start>-<end>" strings
    to build the "Chromosome", "Start", "End" and "Name" columns which are
    placed in front of the data columns.

    Parameters
    ----------
    matrix : :obj:`pandas.DataFrame`
        DataFrame with CNV data to convert.
    output_file : str, optional
        Output file.

        Required if `save` is True.
    save: :obj:`bool`, optional
        Whether results should be saved to disc.

        Defaults to :obj:`True`.
    view_limits : tuple, optional
        Extreme values (min, max) of color scale used to visualize in IGV.
        Written verbatim as "viewLimits=<min>:<max>" in the track line.

        Defaults to (-2, 2).

    Returns
    -------
    pandas.DataFrame
        CNV data in IGV format.

    Raises
    -------
    ValueError:
        If `save` is True but `output_file` is None.
    """
    _LOGGER.info("Making IGV visualization")

    # Validate the output destination before doing any work.
    if save and output_file is None:
        raise ValueError(
            "If the 'save' option is specified, 'output_file' must also be!"
        )

    # Parse each "chrom:start-end" index entry only once.
    chroms, starts, ends = [], [], []
    for region in matrix.index.str.split(":"):
        bounds = region[1].split("-")
        chroms.append(region[0])
        starts.append(int(bounds[0]))
        ends.append(int(bounds[1]))

    # as IGV file
    igv = pd.DataFrame(index=matrix.index)
    igv["Chromosome"] = chroms
    igv["Start"] = starts
    igv["End"] = ends
    igv["Name"] = igv.index

    igv = igv.join(matrix).reset_index(drop=True)

    if save:
        # A single handle, closed even if writing fails.
        with open(output_file, "w") as output_handle:
            # The limits are written as given: the lower limit carries its
            # own sign, so no extra "-" is prepended.
            output_handle.write(
                "#track viewLimits={}:{} graphType=heatmap color=255,0,0\n".
                format(*view_limits))
            igv.to_csv(output_handle, sep="\t", index=False)

    return igv
Ejemplo n.º 3
0
def test_config_has_all_required_fields(log):
    """
    Check that the package logger is a real logger that writes to ``log``.

    The size of the file at path ``log`` is compared before and after
    emitting one INFO message; it has to grow.
    """
    import logging
    from ngs_toolkit import _LOGGER

    assert isinstance(_LOGGER, logging.Logger)

    size_before = os.stat(log).st_size
    _LOGGER.info("Testing logger")
    size_after = os.stat(log).st_size

    assert size_before < size_after
Ejemplo n.º 4
0
def main():
    """
    Program's main entry point.

    Dispatches on the parsed sub-command:

     * "create": creates the project, then its requirements file and Makefile;
     * "recipe": lists the available recipes or runs the requested one.

    Returns
    -------
    int or None
        The non-zero value returned by ``create_project`` if project
        initialization failed, otherwise nothing is returned.
    """
    _LOGGER.debug("Parsing command-line arguments")
    args = parse_arguments()

    if args.command == "recipe":
        if not args.list_only:
            _LOGGER.info("Running recipe '{}'.".format(args.recipe_name))
            run_recipe(recipe_name=args.recipe_name, project_config=args.project_config)
        else:
            import pkgutil
            import ngs_toolkit.recipes

            modules = pkgutil.iter_modules(ngs_toolkit.recipes.__path__)
            recipe_names = [module[1] for module in modules]
            print("Available ngs_toolkit recipes: '{}'.".format("', '".join(recipe_names)))

    elif args.command == "create":
        _LOGGER.info("Creating project '{}' in '{}'.".format(args.project_name, args.root_dir))

        # "assembly_key:assembly_value" pairs separated by commas.
        genome_assemblies = dict()
        for pair in args.genome_assemblies.split(","):
            fields = pair.split(":")
            genome_assemblies[fields[0]] = fields[1]

        # Create project; anything but 0 signals failure.
        git_ok = create_project(
            project_name=args.project_name,
            genome_assemblies=genome_assemblies,
            overwrite=args.overwrite,
            root_projects_dir=args.root_dir,
        )
        if git_ok != 0:
            _LOGGER.error("Initialization of project failed.")
            return git_ok

        project_dir = os.path.join(args.root_dir, args.project_name)

        # Create requirements file.
        _LOGGER.info("Creating requirements file for project '{}'.".format(args.project_name))
        create_requirements_file(
            project_name=args.project_name,
            project_dir=project_dir,
            overwrite=args.overwrite,
        )

        # Create Makefile.
        _LOGGER.info("Creating Makefile file for project '{}'.".format(args.project_name))
        create_makefile(
            project_name=args.project_name,
            project_dir=project_dir,
            overwrite=args.overwrite,
        )

    _LOGGER.debug("Completed.")
Ejemplo n.º 5
0
    def get_consensus_sites(
            self,
            samples=None,
            region_type="summits",
            peak_type="filtered",
            extension=250,
            blacklist_bed=None,
            filter_chroms=True,
            permissive=False,
            save=True,
            assign=True,
            **kwargs):
        """
        Get consensus (union) of enriched sites (peaks) across all comparisons.
        There are two modes possible, defined by the value of ``region_type``:

         * peaks: simple union of all sites;
         * summits: peak summits are extended by ``extension`` and a union is made.

        For ChIP-seq, the ``comparison_table`` keyword argument or a
        ``comparison_table`` attribute set is required. Peaks/summits will be
        aggregated for the peaks called in each sample comparison.

        Parameters
        ----------
        samples : :obj:`list`
            Accepted for interface compatibility.
            NOTE: not used by this implementation, sites are collected
            from the analysis ``comparisons``.
        region_type : :obj:`str`
            The type of region to use to create the consensus region set
            - one of "summits" or "peaks".
            If "summits", peak summits will be extended by ``extension``
            before union.
            If "peaks", sample peaks will be used with no modification prior to
            union.

            Default is "summits".
        peak_type : :obj:`str`
            Key of the ``peak_calls`` entry of each comparison to use.

            Default is "filtered".
        extension : :obj:`int`
            Amount to extend peaks summits by in both directions.

            Default is 250.
        blacklist_bed : {:obj:`False`, :obj:`str`, :class:`pybedtools.BedTool`}
            Either :obj:`False`, a path to a BED file or a BedTool with
            genomic positions to exclude from consensus peak set.

            Default is to use a blacklist file for the analysis ``genome``.
        filter_chroms : {:obj:`list`, :obj:`str`}
            A list of chromosomes to filter out or
            a string with a pattern to match to exclude chromosomes.
            Uses Pandas string methods :class:`pandas.Series.str.match`.
            Pass for example `'.*_.*|chrM'` to filter out chromosomes with a "_"
            character and a "chrM" chromosome.

            Any other value (including the default) filters nothing.
        permissive : :obj:`bool`
            Whether comparisons whose input file does not exist should be
            simply skipped (with a warning) or an error thrown.

            Default is :obj:`False`.
        save : :obj:`bool`
            Whether to write the sites to
            "{results_dir}/{name}.peak_set.bed".

            Default is :obj:`True`.
        assign : :obj:`bool`
            Whether to set the result as the ``sites`` attribute.

            Default is :obj:`True`.
        comparison_table : :obj:`pandas.DataFrame`, optional
            DataFrame with signal/background combinations used to call peaks.
            Part of kwargs.

            Defaults to analysis own ``comparison_table``.
        peak_dir : :obj:`str`, optional
            Path to peaks output directory. Part of kwargs.

            Defaults to the default of ``set_comparisons``.

        Returns
        -------
        :class:`pybedtools.BedTool`
            Bedtool with consensus sites.

        Attributes
        ----------
        sites : :class:`pybedtools.BedTool`
            Bedtool with consensus sites (only if ``assign``).

        Raises
        ------
        ValueError
            If ``region_type`` is not one of "summits" or "peaks".
        AttributeError
            If no blacklist is given and one cannot be retrieved.
        """
        import re
        from ngs_toolkit.general import get_blacklist_annotations
        import pybedtools
        from tqdm import tqdm
        import tempfile

        # (Re)build the comparisons when the caller customizes them, or when
        # none were set up yet. Only explicitly passed options are forwarded
        # so that `set_comparisons` applies its own defaults otherwise.
        custom = {k: kwargs[k] for k in ("comparison_table", "peak_dir") if k in kwargs}
        if custom or getattr(self, "comparisons", None) is None:
            self.set_comparisons(**custom)

        if region_type not in ["summits", "peaks"]:
            msg = "`region_type` attribute must be one of 'summits' or 'peaks'!"
            _LOGGER.error(msg)
            raise ValueError(msg)

        if blacklist_bed is None:
            _LOGGER.info("Blacklist file not provided. Downloading...")
            try:
                blacklist_bed = get_blacklist_annotations(self.organism, self.genome)
            except AttributeError:
                msg = "Blacklist file was not provided and cannot"
                msg += " get one without analysis having `organism` and `genome` set."
                _LOGGER.error(msg)
                raise AttributeError(msg)

        # Simply concatenate all peaks in one file
        with tempfile.NamedTemporaryFile(mode="w") as handle:
            for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):
                for peak_caller, peak_file in comp['peak_calls'][peak_type].items():
                    # TODO: check if homer has summits and they match this pattern
                    wanted = (
                        re.sub("_peaks.narrowPeak", "_summits.bed", peak_file)
                        if region_type == "summits"
                        else peak_file)
                    try:
                        source = (
                            pybedtools.BedTool(wanted).slop(b=extension, genome=comp['genome']).fn
                            if region_type == "summits"
                            else wanted)
                        # Reading happens inside the `try` so that a missing
                        # file is handled the same way in both modes.
                        with open(source, 'r') as peaks:
                            handle.writelines(peaks)
                    except (ValueError, FileNotFoundError):
                        _LOGGER.warning("Input file for comparison %s (%s) not found!", name, wanted)
                        if not permissive:
                            raise
                        # permissive: nothing was written for this file, go on

            # Make sure everything is on disk before bedtools reads the file.
            handle.flush()

            # Merge overlaping peaks across comparisons
            sites = pybedtools.BedTool(handle.name).sort().merge()

        # Filter
        # # remove blacklist regions
        if blacklist_bed is not False:
            blacklist = (
                blacklist_bed
                if isinstance(blacklist_bed, pybedtools.BedTool)
                else pybedtools.BedTool(blacklist_bed))
            sites = sites.intersect(v=True, b=blacklist)

        # # filter requested chromosomes
        if filter_chroms is not None:
            if isinstance(filter_chroms, list):
                sites = sites.filter(lambda x: x.chrom not in filter_chroms).saveas()
            elif isinstance(filter_chroms, str):
                s = sites.to_dataframe()
                sites = pybedtools.BedTool.from_dataframe(s.loc[~s['chrom'].str.match(filter_chroms)])

        # Save and assign
        if save:
            output_file = os.path.join(self.results_dir, self.name + ".peak_set.bed")
            sites.saveas(output_file)
            sites = pybedtools.BedTool(output_file)
        if assign:
            self.sites = sites
        return sites
Ejemplo n.º 6
0
    def summarize_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        filtered=True,
        permissive=True,
    ):
        """
        Count the peaks called by each peak caller in each comparison.

        One row is produced per comparison and peak caller found in the
        analysis ``comparisons`` attribute.

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`, optional
            Comparison table with the following required columns:
            "comparison_name", "sample_name", "comparison_side", "sample_group".
            Only used to validate that these columns are present.

            Defaults to analysis' own `comparison_table`.
        output_dir : :obj:`str`
            Accepted for interface compatibility.
            NOTE: not used, file paths come from the ``comparisons`` attribute.
        filtered : :obj:`bool`
            Whether to count the "filtered" (:obj:`True`) or the "original"
            peak calls. Original HOMER calls are first converted to BED.

            Default is :obj:`True`.
        permissive: :obj:`bool`
            If peak files that cannot be read should be reported with a
            missing count (and a warning) or an error should be thrown.

            Default is :obj:`True`.

        Returns
        -------
        :obj:`pandas.DataFrame`
            Table with columns "comparison_name", "peak_caller" and
            "peak_counts". Unreadable files have a count of NaN and empty
            files a count of 0.

        Raises
        ----------
        AssertionError
            If `comparison_table` lacks one of the required columns.
        IOError
            If not `permissive` and a peak file cannot be read.
        """
        from ngs_toolkit.utils import homer_peaks_to_bed

        if comparison_table is None:
            comparison_table = self.comparison_table

        req_columns = [
            "comparison_name",
            "sample_name",
            "comparison_side",
            "sample_group",
        ]
        msg = "Comparison table is missing some of the following columns: '{}'.".format(
            ",".join(req_columns)
        )
        if not all([col in comparison_table.columns for col in req_columns]):
            _LOGGER.error(msg)
            raise AssertionError(msg)

        # For each comparison, count called peaks
        peak_type = "filtered" if filtered else "original"
        error = "Peak files for comparison '%s' with '%s' parameters don't exist."
        peak_counts = list()
        for name, comp in self.comparisons.items():
            _LOGGER.info(name)

            for peak_caller, peak_file in comp['peak_calls'][peak_type].items():
                # Exactly one row is appended per peak caller, whatever
                # the outcome of reading its file.
                try:
                    if "homer" in peak_caller and not filtered:
                        bed_file = peak_file.replace("narrowPeak", "bed")
                        homer_peaks_to_bed(peak_file, bed_file)
                        peak_file = bed_file
                    count = pd.read_csv(peak_file, sep="\t").shape[0]
                except IOError:
                    if not permissive:
                        raise
                    _LOGGER.warning(error, name, peak_caller)
                    count = np.nan
                except pd.errors.EmptyDataError:
                    count = 0.0
                peak_counts.append([name, peak_caller, count])
        peak_counts = pd.DataFrame(peak_counts, columns=["comparison_name", "peak_caller", "peak_counts"])

        return peak_counts  # .fillna(0)
Ejemplo n.º 7
0
    def filter_peaks(
        self,
        comparison_table=None,
        filter_bed=None,
        peaks_dir="{results_dir}/chipseq_peaks",
    ):
        """
        Remove from the peak calls of every comparison the entries that overlap another BED file.

        MACS2 peaks are filtered directly; HOMER peaks (factor and histone) are
        first converted to a temporary BED file which is removed afterwards.

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`, optional
            Comparison table with the following required columns:
            "comparison_name", "sample_name", "comparison_side", "sample_group".

            Defaults to analysis' own `comparison_table`.
        filter_bed : :obj:`str`
            BED file with entries to filter out from the BED files of each comparison.

            Defaults to the set of Blacklisted regions from the analysis' genome.
            In that case it will be fetched if not present.
        peaks_dir : :obj:`str`
            Parent directory where peak calls for each comparison exist.
            Will be created if does not exist.

            Defaults to "{results_dir}/chipseq_peaks".

        Raises
        ----------
        AttributeError
            If `filter_bed` is not given and fails to be retrieved.
        """
        from ngs_toolkit.utils import homer_peaks_to_bed, filter_bed_file

        if comparison_table is None:
            comparison_table = self.comparison_table

        if filter_bed is None:
            _LOGGER.info("Blacklist file not provided. Downloading...")
            try:
                resources = self.get_resources(steps=["blacklist"])
                filter_bed = resources["blacklist_file"]
            except AttributeError:
                msg = (
                    "Blacklist file was not provided and cannot be"
                    " get one without analysis having `organism` and `genome` set."
                )
                _LOGGER.error(msg)
                raise AttributeError(msg)

        peaks_dir = self._format_string_with_attributes(peaks_dir)
        if not os.path.exists(peaks_dir):
            os.makedirs(peaks_dir)

        for name, comp in self.comparisons.items():
            original = comp['peak_calls']['original']
            filtered = comp['peak_calls']['filtered']

            # MACS2
            filter_bed_file(original['macs'], filter_bed, filtered['macs'])

            # HOMER: convert to a temporary BED, filter it, clean up
            for caller in ('homer_factor', 'homer_histone'):
                tmp_bed = original[caller].replace(".narrowPeak", ".bed")
                homer_peaks_to_bed(original[caller], tmp_bed)
                filter_bed_file(tmp_bed, filter_bed, filtered[caller])
                os.remove(tmp_bed)
Ejemplo n.º 8
0
    def call_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        permissive=True,
        overwrite=True,
        distributed=True,
    ):
        """
        Call peaks for ChIP-seq samples using an annotation of which samples
        belong in each comparison and which samples represent signal or background.

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`
            Comparison table with the following required columns:
            "comparison_name", "sample_name", "comparison_side", "sample_group".

            Defaults to analysis' own `comparison_table`.
        output_dir : :obj:`str`
            Parent directory where peaks will be created.

            Will be created if does not exist.
        permissive: :obj:`bool`
            If incomplete/incoherent comparisons should be skipped or an error should be thrown.
            NOTE: not consulted by this implementation.

            Default is :obj:`True`.
        overwrite: :obj:`bool`
            Whether peaks should be called again for comparisons whose
            peak files already exist. If :obj:`False`, a peak caller is
            skipped (with a warning) when its output file exists.

            Default is :obj:`True`.
        distributed: :obj:`bool`
            Whether peak calling should be run in serial or in distributed mode as jobs.
            In serial mode the commands are run here, one after the other.

            Default is :obj:`True`.

        Raises
        ----------
        AssertionError
            If `comparison_table` lacks one of the required columns.
        """
        import subprocess

        from ngs_toolkit.utils import (
            macs2_call_chipseq_peak,
            homer_call_chipseq_peak_job,
            filter_kwargs_by_callable
        )
        from tqdm import tqdm

        if comparison_table is None:
            comparison_table = self.comparison_table
        req_columns = [
            "comparison_name",
            "sample_name",
            "comparison_side",
            "sample_group",
        ]
        msg = "Comparison table is missing some of the following columns: '{}'.".format(
            ",".join(req_columns)
        )
        if not all([col in comparison_table.columns for col in req_columns]):
            _LOGGER.error(msg)
            raise AssertionError(msg)

        # Complement default `output_dir`
        if "{results_dir}" in output_dir:
            output_dir = os.path.abspath(
                output_dir.format(results_dir=self.results_dir)
            )
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Output file key of each peak caller and the function that calls it,
        # in the order in which they are run.
        callers = (
            ("macs", macs2_call_chipseq_peak),
            ("homer_factor", homer_call_chipseq_peak_job),
        )

        # For each comparison
        for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):

            _LOGGER.info(
                "Doing comparison '{}' with positive samples '{}' and background samples '{}'".format(
                    name,
                    [s.name for s in comp['signal_samples']],
                    [s.name for s in comp['control_samples']],
                )
            )
            # Call peaks
            cmds = list()
            bkws = filter_kwargs_by_callable(comp, macs2_call_chipseq_peak)
            kwargs = {
                "name": name, "distributed": distributed, **bkws}
            for key, call_peaks in callers:
                if overwrite or not os.path.exists(comp['peak_calls']['original'][key]):
                    cmds.append(call_peaks(**kwargs))
                else:
                    # Reported per peak caller, so the message is accurate
                    # when only one of the outputs exists.
                    _LOGGER.warning(
                        "Peak files of '%s' for comparison '%s' already exist. Skipping.", key, name)
            if not distributed:
                for cmd in cmds:
                    _LOGGER.info("Calling peaks for comparison '%s' with command: '%s'.\n", name, cmd)
                    subprocess.call(cmd.split(" "))
Ejemplo n.º 9
0
    def set_comparisons(self, comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"):
        """
        Set up an attribute containing information about the
        sample comparisons necessary for peak calling.

        Only comparisons with a "comparison_type" of "peaks" are considered.

        Structure:

            * comparison_name:
                * signal_samples
                * control_samples
                * output_dir
                * prefix
                * genome
                * peak_calls
                    * original
                        * macs
                        * homer_factor
                        * homer_histone
                    * filtered
                        * macs
                        * homer_factor
                        * homer_histone

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`, optional
            Comparison table with peak comparisons. The columns
            "comparison_name", "comparison_type", "comparison_side",
            "comparison_genome" and "sample_name" are read.

            Defaults to the ``comparison_table`` attribute of the analysis.
        peak_dir : :obj:`str`, optional
            Directory with peak calls.

            Defaults to "{results_dir}/chipseq_peaks".

        Returns
        -------
        :obj:`dict`
            The dictionary with the attributes.

        Attributes
        ----------
        comparisons : :obj:`dict`
            The dictionary with the attributes.

        Raises
        ------
        ValueError
            If a comparison does not have exactly two sides.
        AssertionError
            If a comparison does not have a single genome.
        """
        if comparison_table is None:
            comparison_table = self.comparison_table

        comparison_names = (
            comparison_table.loc[
                comparison_table['comparison_type'] == 'peaks',
                "comparison_name"]
            .drop_duplicates().sort_values()).tolist()
        if not comparison_names:
            _LOGGER.warning("Could not find any comparisons of type 'peaks'.")

        peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

        self.comparisons = dict()
        for name in comparison_names:
            _LOGGER.info("Setting comparison '%s' up", name)

            # Select the rows of the comparison once, with a boolean mask:
            # unlike a string-built query this is safe for any name.
            rows = comparison_table.loc[comparison_table["comparison_name"] == name]

            # If there aren't two sides to each comparison, throw error
            if len(set(rows["comparison_side"])) != 2:
                error = "Comparison '{}' does not contain two sides.".format(name)
                _LOGGER.error(error)
                raise ValueError(error)

            # Get the sample names of samples in each side:
            # side 1 is signal, anything below 1 is background.
            pos_names = set(rows.loc[rows["comparison_side"] == 1, "sample_name"].tolist())
            neg_names = set(rows.loc[rows["comparison_side"] < 1, "sample_name"].tolist())

            co = dict()
            co['signal_samples'] = [s for s in self.samples if s.name in pos_names]
            co['control_samples'] = [s for s in self.samples if s.name in neg_names]

            # Additional info
            co['output_dir'] = os.path.join(peak_dir, name)
            co['prefix'] = os.path.join(co['output_dir'], name)
            # `squeeze` only yields a plain string if there is one unique genome
            g = rows['comparison_genome'].drop_duplicates().squeeze()
            if not isinstance(g, str):
                msg = "Could not determine genome of comparison '%s'." % name
                _LOGGER.error(msg)
                raise AssertionError(msg)
            co['genome'] = g

            # resulting files
            res = dict()
            res['macs'] = co['prefix'] + "_peaks.narrowPeak"
            res["homer_factor"] = co['prefix'] + "_homer_peaks.factor.narrowPeak"
            res["homer_histone"] = co['prefix'] + "_homer_peaks.histone.narrowPeak"
            co['peak_calls'] = dict()
            co['peak_calls']["original"] = res
            co['peak_calls']["filtered"] = {
                k: v.replace(".narrowPeak", ".filtered.bed")
                for k, v in res.items()}

            self.comparisons[name] = co
        return self.comparisons
Ejemplo n.º 10
0
    def load_data(
        self,
        output_map=None,
        only_these_keys=None,
        resolutions=None,
        prefix="{results_dir}/{name}",
        permissive=True,
    ):
        """
        Load the output files of the major functions of the Analysis.

        Each file is read with :func:`pandas.read_csv` and stored in a
        dictionary attribute of the analysis, keyed by resolution.

        Parameters
        ----------
        output_map : :obj:`dict`
            Dictionary with {attribute_name: {resolution: (file_path, kwargs)}}
            to load the files.
            The kwargs in the tuple will be passed to :meth:`pandas.read_csv`.

            Defaults to the files of the attributes listed below, for each
            resolution in ``resolutions``.
        only_these_keys : :obj:`list`, optional
            Iterable of analysis attributes to load up.
            Attributes in the default ``output_map``:

                * "matrix_raw"
                * "matrix_norm"
                * "segmentation"
                * "segmentation_annot"

            Defaults to all keys of ``output_map``.

        resolutions: :obj:`list`
            List of resolution strings to get data for.

            Defaults to value of ``resolutions`` attribute of Analysis.
        prefix : :obj:`str`, optional
            String prefix of files to load.
            Variables in curly braces will be formated with attributes of analysis.

            Defaults to "{results_dir}/{name}".
        permissive : :obj:`bool`, optional
            Whether an error should be ignored if reading a file causes IOError.

            Default is :obj:`True`.

        Attributes
        ----------
        dict
            Dictionaries of :obj:`pandas.DataFrame` by resolution, available
            as the attributes named by the keys of ``output_map``.

        Raises
        ----------
        IOError
            If not permissive and a file is not found
        """
        from ngs_toolkit.utils import fix_dataframe_header

        prefix = self._format_string_with_attributes(prefix)

        if resolutions is None:
            resolutions = self.resolutions

        if output_map is None:
            # Default layout: "<prefix>.<resolution>.<suffix>" for each attribute.
            indexed = {"index_col": 0}
            defaults = [
                ("matrix_raw", ".{}.matrix_raw.csv", indexed),
                ("matrix_norm", ".{}.matrix_norm.csv", indexed),
                ("segmentation", ".{}.segmentation.csv", None),
                ("segmentation_annot", ".{}.segmentation.annotated.csv", None),
            ]
            output_map = dict()
            for attr, suffix, options in defaults:
                output_map[attr] = {
                    r: (prefix + suffix.format(r), {} if options is None else options)
                    for r in resolutions
                }
        if only_these_keys is None:
            only_these_keys = list(output_map.keys())

        selected = dict()
        for key, files in output_map.items():
            if key in only_these_keys:
                selected[key] = files

        for name, files in selected.items():
            for resolution, (path, read_kwargs) in files.items():
                path = path.format(resolution)
                _LOGGER.info(
                    "Loading '{}' analysis attribute for resolution '{}'.".
                    format(name, resolution))
                # Make sure there is a container to store the data in.
                if not hasattr(self, name):
                    setattr(self, name, {resolution: None})
                try:
                    frame = pd.read_csv(path, **read_kwargs)
                    getattr(self, name)[resolution] = frame
                    # Fix possible multiindex for matrix_norm
                    if name == "matrix_norm":
                        fixed = fix_dataframe_header(getattr(self, name)[resolution])
                        getattr(self, name)[resolution] = fixed
                except IOError as e:
                    if permissive:
                        _LOGGER.warning(e)
                    else:
                        raise e
Ejemplo n.º 11
0
def main(cli=None):
    """
    Entry point of the 'merge_signal' recipe.

    Loads the analysis from the PEP configuration file, optionally drops
    samples that did not pass QC, keeps the samples of the supported
    protocols and merges their signal by the requested attributes.

    Parameters
    ----------
    cli : :obj:`list`, optional
        Command-line arguments to parse, passed on to ``parse_args``.

    Returns
    -------
    int or None
        1 if no attributes to group samples by could be determined,
        otherwise nothing is returned.

    Raises
    ------
    ValueError
        If the analysis has no samples left after the optional QC filtering.
    """
    args = parse_arguments().parse_args(cli)
    _LOGGER.info(
        "This is the 'merge_signal' recipe from ngs_toolkit, "
        "version: %s", __version__)

    # Start project
    _LOGGER.debug(
        "Starting Analysis with PEP project configuration file: "
        "'%s'", args.config_file)
    an = Analysis(from_pep=args.config_file)

    if args.pass_qc:
        _LOGGER.info(
            "Filtering samples out which didn't pass QC as specified in sample "
            "annotation in column 'pass_qc'")
        failed_values = ["0", 0, "False", False]
        an.samples = [
            sample for sample in an.samples
            if getattr(sample, "pass_qc") not in failed_values
        ]

    if not an.samples:
        raise ValueError(
            "There were no valid samples after filtering for quality!")
    sample_names = [sample.name for sample in an.samples]
    print(
        "Samples under consideration: '{}'. ".format(",".join(sample_names))
        + "\nTotal of {} samples.".format(len(sample_names)))

    # Get only samples with signal
    signal_protocols = ["ATAC-seq", "ChIP-seq", "ChIPmentation"]
    an.samples = [
        sample for sample in an.samples
        if getattr(sample, "protocol", None) in signal_protocols
    ]
    an.set_organism_genome()
    sheet = an.prj.sheet.reindex([sample.name for sample in an.samples])

    _LOGGER.info(
        "Selecting samples with appropriate data type."
        "\nSamples under consideration: '%s'. "
        "\nTotal of %i samples.", ",".join(sheet["sample_name"].tolist()),
        sheet.shape[0])

    # Get default attributes if not set
    if args.attributes is not None:
        # Comma-separated on the command line; always end up with a list.
        args.attributes = (
            args.attributes.split(",")
            if "," in args.attributes
            else [args.attributes])
    elif an.group_attributes:
        args.attributes = an.group_attributes
    else:
        _LOGGER.error(
            "Sample attributes to group by were not set and none could be"
            " found in project configuration file!"
            " Aborting!")
        return 1

    n_groups = len(sheet.groupby(args.attributes).groups.items())
    _LOGGER.info(
        "Using the following attributes to merge samples: '%s', "
        "resulting in a total of %i groups.", "', '".join(args.attributes),
        n_groups)

    merge_signal(
        sheet,
        an.samples,
        args.attributes,
        output_dir=args.output_dir,
        normalization_method=args.normalization_method,
        nucleosome=args.nucleosome,
        overwrite=args.overwrite,
        cpus=args.cpus,
        as_job=args.as_job,
        dry_run=args.dry_run,
    )