def all_to_igv(matrix, output_prefix, **kwargs):
    """
    Convert CNV data at several resolutions to IGV format.

    Parameters
    ----------
    matrix : :obj:`dict`
        Mapping of resolution to the :obj:`pandas.DataFrame` with the CNV
        data of that resolution.

    output_prefix : str
        Prefix of the output files. For each resolution,
        "{output_prefix}_{resolution}.igv" is passed to ``to_igv`` as
        ``output_file``.

    **kwargs : :obj:`dict`, optional
        Additional parameters will be passed to ngs_toolkit.cnv.to_igv

    Returns
    -------
    dict
        Dictionary of CNV data in IGV format for each resolution.
    """
    from tqdm import tqdm

    converted = {}
    keys = matrix.keys()
    for res in tqdm(keys, total=len(keys), desc="Resolution"):
        _LOGGER.info(
            "Making IGV visualization for resolution '{}'.".format(res))
        target = "{}_{}.igv".format(output_prefix, res)
        converted[res] = to_igv(matrix[res], output_file=target, **kwargs)
    return converted
def to_igv(matrix, output_file=None, save=True, view_limits=(-2, 2)):
    """
    Convert DataFrame with CNV data to IGV format.

    Parameters
    ----------
    matrix : :obj:`pandas.DataFrame`
        DataFrame with CNV data to convert. Its index must hold region
        strings of the form "chrom:start-end".

    output_file : str, optional
        Output file.

        Required if `save` is True.

    save: :obj:`bool`, optional
        Whether results should be saved to disc.

        Defaults to :obj:`True`.

    view_limits : tuple, optional
        Extreme values (min, max) of color scale used to visualize in IGV.

        Defaults to (-2, 2).

    Returns
    -------
    pandas.DataFrame
        CNV data in IGV format: the columns "Chromosome", "Start", "End" and
        "Name" followed by the columns of `matrix`.

    Raises
    -------
    ValueError:
        If `save` is True but `output_file` is None.
    """
    # Fail before doing any work if the result could not be written anyway.
    if save and output_file is None:
        raise ValueError(
            "If the 'save' option is specified, 'output_file' must also be!"
        )

    _LOGGER.info("Making IGV visualization")

    # Parse every "chrom:start-end" region name once.
    chroms, starts, ends = [], [], []
    for region in matrix.index:
        fields = region.split(":")
        coords = fields[1].split("-")
        chroms.append(fields[0])
        starts.append(int(coords[0]))
        ends.append(int(coords[1]))

    igv = pd.DataFrame(
        {
            "Chromosome": chroms,
            "Start": starts,
            "End": ends,
            "Name": list(matrix.index),
        },
        index=matrix.index,
    )
    igv = igv.join(matrix).reset_index(drop=True)

    if save:
        # `view_limits` already carries the sign of the lower bound, so it is
        # written as-is ("min:max"); a hard-coded "-" in front of it produced
        # "--2:2" for the default limits.
        header = "#track viewLimits={}:{} graphType=heatmap color=255,0,0\n".format(
            *view_limits)
        # A single handle, closed even if writing fails.
        with open(output_file, "w") as handle:
            handle.write(header)
            igv.to_csv(handle, sep="\t", index=False)
    return igv
def test_config_has_all_required_fields(log):
    """
    Check that the package logger is a :class:`logging.Logger` and that
    emitting a message through it makes the file at ``log`` grow.

    ``log`` is a path to a file on disk (presumably provided by a test
    fixture pointing at the package log file - confirm against the fixture).
    """
    import logging
    from ngs_toolkit import _LOGGER

    assert isinstance(_LOGGER, logging.Logger)

    # Compare the file size right before and right after logging a message.
    size_before = os.path.getsize(log)
    _LOGGER.info("Testing logger")
    size_after = os.path.getsize(log)
    assert size_before < size_after
def main():
    """
    Program's main entry point.

    Parses the command-line arguments and dispatches on ``args.command``:

    * "create": creates the project, then its requirements file and Makefile;
    * "recipe": lists the available recipes or runs the requested one.

    Returns
    -------
    int or None
        The non-zero value returned by ``create_project`` if initialization
        of the project failed, otherwise :obj:`None`.
    """
    _LOGGER.debug("Parsing command-line arguments")
    args = parse_arguments()

    if args.command == "create":
        _LOGGER.info("Creating project '{}' in '{}'.".format(args.project_name, args.root_dir))

        # "genome:assembly,genome:assembly" -> {genome: assembly}
        genome_assemblies = {}
        for pair in args.genome_assemblies.split(","):
            fields = pair.split(":")
            genome_assemblies[fields[0]] = fields[1]

        # Create project.
        status = create_project(
            project_name=args.project_name,
            genome_assemblies=genome_assemblies,
            overwrite=args.overwrite,
            root_projects_dir=args.root_dir,
        )
        if status != 0:
            _LOGGER.error("Initialization of project failed.")
            return status

        project_dir = os.path.join(args.root_dir, args.project_name)

        # Create requirements file.
        _LOGGER.info("Creating requirements file for project '{}'.".format(args.project_name))
        create_requirements_file(
            project_name=args.project_name,
            project_dir=project_dir,
            overwrite=args.overwrite,
        )

        # Create Makefile.
        _LOGGER.info("Creating Makefile file for project '{}'.".format(args.project_name))
        create_makefile(
            project_name=args.project_name,
            project_dir=project_dir,
            overwrite=args.overwrite,
        )

    elif args.command == "recipe":
        if not args.list_only:
            _LOGGER.info("Running recipe '{}'.".format(args.recipe_name))
            run_recipe(recipe_name=args.recipe_name, project_config=args.project_config)
        else:
            import pkgutil
            import ngs_toolkit.recipes

            # Second field of each entry yielded by iter_modules is the module name
            modules = pkgutil.iter_modules(ngs_toolkit.recipes.__path__)
            names = [module[1] for module in modules]
            print("Available ngs_toolkit recipes: '{}'.".format("', '".join(names)))

    _LOGGER.debug("Completed.")
def get_consensus_sites(
        self,
        samples=None,
        region_type="summits",
        peak_type="filtered",
        extension=250,
        blacklist_bed=None,
        filter_chroms=True,
        permissive=False,
        save=True,
        assign=True,
        **kwargs):
    """
    Get consensus (union) of enriched sites (peaks) across all comparisons.
    There are two modes possible, defined by the value of ``region_type``:

     * peaks: simple union of all sites;
     * summits: peak summits are extended by ``extension`` and a union is made.

    For ChIP-seq, the ``comparison_table`` keyword argument or a
    ``comparisons`` attribute set (see ``set_comparisons``) is required.
    Peaks/summits will be aggregated for the peaks called in each
    sample comparison.

    Parameters
    ----------
    samples : :obj:`list`
        Accepted for API compatibility; not used by this implementation,
        which aggregates over the analysis' ``comparisons``.

    region_type : :obj:`str`
        The type of region to use to create the consensus region set
        - one of "summits" or "peaks".
        If "summits", peak summits will be extended by ``extension``
        before union.
        If "peaks", peak files will be used with no modification prior to union.

        Default is "summits".

    peak_type : :obj:`str`
        Which set of peak calls of each comparison to use: a key of
        ``comparison["peak_calls"]``, e.g. "filtered" or "original".

        Default is "filtered".

    extension : :obj:`int`
        Amount to extend peaks summits by in both directions.

        Default is 250.

    blacklist_bed : {:obj:`False`, :obj:`str`, :class:`pybedtools.BedTool`}
        Either :obj:`False` (do not remove blacklisted regions), or a path
        to a BED file / a BedTool with genomic positions to exclude from
        the consensus peak set.

        Default is to use a blacklist file for the analysis ``genome``.

    filter_chroms : {:obj:`list`, :obj:`str`}
        A list of chromosomes to filter out or
        a string with a pattern to match to exclude chromosomes.
        Uses Pandas string methods :class:`pandas.Series.str.match`.
        Pass for example `'.*_.*|chrM'` to filter out chromosomes with a "_"
        character and a "chrM" chromosome.
        Any other value (including the default :obj:`True`) leaves
        chromosomes unfiltered.

    permissive : :obj:`bool`
        Whether a peak file that cannot be found should simply be skipped
        (with a warning) instead of raising an error.

        Default is :obj:`False`.

    save : :obj:`bool`
        Whether to write the sites to
        "{results_dir}/{name}.peak_set.bed".

        Default is :obj:`True`.

    assign : :obj:`bool`
        Whether to set the result as the ``sites`` attribute.

        Default is :obj:`True`.

    comparison_table : :obj:`pandas.DataFrame`, optional
        DataFrame with signal/background combinations used to call peaks.
        Part of kwargs. If given, comparisons are set up again from it.

        Defaults to analysis own ``comparison_table``.

    peak_dir : :obj:`str`, optional
        Path to peaks output directory. Part of kwargs and only used
        together with ``comparison_table``.

        Defaults to the default of ``set_comparisons``.

    Returns
    -------
    :class:`pybedtools.BedTool`
        Bedtool with consensus sites.

    Attributes
    ----------
    sites : :class:`pybedtools.BedTool`
        Bedtool with consensus sites (if ``assign``).

    Raises
    ------
    ValueError
        If ``region_type`` is not one of "summits" or "peaks".
    AttributeError
        If no blacklist was given and one cannot be retrieved.
    """
    import re
    import tempfile

    import pybedtools
    from tqdm import tqdm

    from ngs_toolkit.general import get_blacklist_annotations

    # Re-define the comparisons only when a table was actually passed
    # (the check used to be inverted, guaranteeing a KeyError).
    if "comparison_table" in kwargs:
        set_kwargs = {"peak_dir": kwargs["peak_dir"]} if "peak_dir" in kwargs else {}
        self.set_comparisons(kwargs["comparison_table"], **set_kwargs)

    if region_type not in ["summits", "peaks"]:
        msg = "`region_type` attribute must be one of 'summits' or 'peaks'!"
        _LOGGER.error(msg)
        raise ValueError(msg)

    if blacklist_bed is None:
        _LOGGER.info("Blacklist file not provided. Downloading...")
        try:
            blacklist_bed = get_blacklist_annotations(self.organism, self.genome)
        except AttributeError:
            msg = "Blacklist file was not provided and cannot"
            msg += " get one without analysis having `organism` and `genome` set."
            _LOGGER.error(msg)
            raise AttributeError(msg)

    # Simply concatenate all peaks in one file
    with tempfile.NamedTemporaryFile(mode="w") as concat:
        for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):
            for peak_file in comp['peak_calls'][peak_type].values():
                try:
                    if region_type == "summits":
                        # TODO: check if homer has summits and they match this pattern
                        summit = re.sub(r"_peaks\.narrowPeak", "_summits.bed", peak_file)
                        source = (
                            pybedtools.BedTool(summit)
                            .slop(b=extension, genome=comp['genome']).fn)
                    else:
                        source = peak_file
                    with open(source, "r") as peaks:
                        concat.writelines(peaks)
                except (ValueError, FileNotFoundError):
                    _LOGGER.warning(
                        "Input file for comparison %s (%s) not found!", name, peak_file)
                    if not permissive:
                        raise
                    # Skip this file instead of re-using a stale/undefined one
                    continue

        # Make sure everything is on disk before bedtools reads the file
        concat.flush()

        # Merge overlaping peaks across comparisons
        sites = pybedtools.BedTool(concat.name).sort().merge()

    # Filter
    # # remove blacklist regions
    if blacklist_bed is not False:
        if isinstance(blacklist_bed, pybedtools.BedTool):
            blacklist = blacklist_bed
        else:
            blacklist = pybedtools.BedTool(blacklist_bed)
        sites = sites.intersect(v=True, b=blacklist)

    # # filter requested chromosomes
    if filter_chroms is not None:
        if isinstance(filter_chroms, list):
            sites = sites.filter(lambda x: x.chrom not in filter_chroms).saveas()
        elif isinstance(filter_chroms, str):
            s = sites.to_dataframe()
            sites = pybedtools.BedTool.from_dataframe(s.loc[~s['chrom'].str.match(filter_chroms)])

    # Save and assign
    if save:
        output_file = os.path.join(self.results_dir, self.name + ".peak_set.bed")
        sites.saveas(output_file)
        sites = pybedtools.BedTool(output_file)
    if assign:
        self.sites = sites
    return sites
def summarize_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        filtered=True,
        permissive=True,
):
    """
    Count the peaks called for each comparison and peak caller.

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`, optional
        Comparison table with the following required columns:
        "comparison_name", "sample_name", "comparison_side", "sample_group".

        Defaults to analysis' own `comparison_table`.

    output_dir : :obj:`str`
        Accepted for API compatibility; not used, since peak file paths are
        taken from the analysis' ``comparisons`` attribute.

    filtered : :obj:`bool`
        Whether to count the "filtered" peak calls of each comparison
        instead of the "original" ones.

        Default is :obj:`True`.

    permissive: :obj:`bool`
        Whether peak files that cannot be read should be reported with a
        missing (NaN) count and a warning instead of raising an error.

        Default is :obj:`True`.

    Returns
    -------
    :obj:`pandas.DataFrame`
        One row per comparison and peak caller with columns
        "comparison_name", "peak_caller" and "peak_counts".
        Empty peak files are given a count of 0.

    Raises
    ----------
    AssertionError
        If `comparison_table` lacks any of the required columns.
    IOError
        If not `permissive` and a peak file cannot be read.
    """
    if comparison_table is None:
        comparison_table = self.comparison_table

    req_columns = [
        "comparison_name",
        "sample_name",
        "comparison_side",
        "sample_group",
    ]
    if not all(col in comparison_table.columns for col in req_columns):
        msg = "Comparison table is missing some of the following columns: '{}'.".format(
            ",".join(req_columns)
        )
        _LOGGER.error(msg)
        raise AssertionError(msg)

    error = "Peak files for comparison '%s' with '%s' parameters don't exist."

    # For each comparison, count called peaks
    peak_type = "filtered" if filtered else "original"
    peak_counts = list()
    for name, comp in self.comparisons.items():
        _LOGGER.info(name)
        for peak_caller, peak_file in comp['peak_calls'][peak_type].items():
            # Exactly one row is recorded per (comparison, peak caller):
            # a peak count, 0 for an empty file, or NaN for an unreadable one.
            try:
                if "homer" in peak_caller and not filtered:
                    # Unfiltered HOMER output must be converted to BED first
                    from ngs_toolkit.utils import homer_peaks_to_bed

                    bed_file = peak_file.replace("narrowPeak", "bed")
                    homer_peaks_to_bed(peak_file, bed_file)
                    peak_file = bed_file
                # NOTE(review): with the default `header`, read_csv uses the
                # first line as column names, so header-less BED/narrowPeak
                # files would be under-counted by one - confirm the file
                # layout and pass `header=None` if they have no header.
                count = pd.read_csv(peak_file, sep="\t").shape[0]
            except IOError:
                if not permissive:
                    raise
                _LOGGER.warning(error, name, peak_caller)
                count = np.nan
            except pd.errors.EmptyDataError:
                count = 0.0
            peak_counts.append([name, peak_caller, count])
    peak_counts = pd.DataFrame(peak_counts, columns=["comparison_name", "peak_caller", "peak_counts"])

    return peak_counts  # .fillna(0)
def filter_peaks(
        self,
        comparison_table=None,
        filter_bed=None,
        peaks_dir="{results_dir}/chipseq_peaks",
):
    """
    Filter the peak calls of every comparison, keeping only entries that do
    not overlap the regions of another BED file.

    For each comparison the "original" MACS2 calls are filtered directly,
    while the "original" HOMER calls (factor and histone) are first converted
    to a temporary BED file which is removed after filtering. Results are
    written to the "filtered" peak files of the comparison.

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`, optional
        Comparison table. It is not used beyond being defaulted.

        Defaults to analysis' own `comparison_table`.

    filter_bed : :obj:`str`
        BED file with entries to filter out from the BED files of each comparison.

        Defaults to the "blacklist_file" returned by the analysis'
        ``get_resources(steps=["blacklist"])``.

    peaks_dir : :obj:`str`
        Parent directory where peak calls for each comparison exist.
        Will be created if does not exist.

        Defaults to "{results_dir}/chipseq_peaks".

    Raises
    ----------
    AttributeError
        If `filter_bed` is not given and fails to be retrieved.
    """
    from ngs_toolkit.utils import homer_peaks_to_bed, filter_bed_file

    if comparison_table is None:
        comparison_table = self.comparison_table

    if filter_bed is None:
        _LOGGER.info("Blacklist file not provided. Downloading...")
        try:
            resources = self.get_resources(steps=["blacklist"])
            filter_bed = resources["blacklist_file"]
        except AttributeError:
            msg = (
                "Blacklist file was not provided and cannot be"
                " get one without analysis having `organism` and `genome` set.")
            _LOGGER.error(msg)
            raise AttributeError(msg)

    peaks_dir = self._format_string_with_attributes(peaks_dir)
    if not os.path.exists(peaks_dir):
        os.makedirs(peaks_dir)

    for comp in self.comparisons.values():
        original = comp['peak_calls']['original']
        filtered = comp['peak_calls']['filtered']

        # MACS2
        filter_bed_file(original['macs'], filter_bed, filtered['macs'])

        # HOMER: convert to a temporary BED file, filter it, clean up
        for caller in ("homer_factor", "homer_histone"):
            tmp_bed = original[caller].replace(".narrowPeak", ".bed")
            homer_peaks_to_bed(original[caller], tmp_bed)
            filter_bed_file(tmp_bed, filter_bed, filtered[caller])
            os.remove(tmp_bed)
def call_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        permissive=True,
        overwrite=True,
        distributed=True,
):
    """
    Call peaks for ChIP-seq samples using an annotation of which samples
    belong in each comparison and which samples represent signal or background.

    Peaks are called with MACS2 and HOMER for every comparison in the
    analysis' ``comparisons`` attribute.

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`
        Comparison table with the following required columns:
        "comparison_name", "sample_name", "comparison_side", "sample_group".

        Defaults to analysis' own `comparison_table`.

    output_dir : :obj:`str`
        Parent directory where peaks will be created.

        Will be created if does not exist.

    permissive: :obj:`bool`
        Accepted for API compatibility; not used by this implementation.

        Default is :obj:`True`.

    overwrite: :obj:`bool`
        Whether peaks should be called even if the output files already
        exist. If :obj:`False`, each peak caller is run only when its
        output file is missing.

        Default is :obj:`True`.

    distributed: :obj:`bool`
        Whether peak calling should be run in serial or in distributed mode as jobs.
        If :obj:`False`, the commands returned by the peak calling helpers
        are run here, one after the other.

        Default is :obj:`True`.

    Raises
    ----------
    AssertionError
        If `comparison_table` lacks any of the required columns.
    """
    import subprocess

    from ngs_toolkit.utils import (
        macs2_call_chipseq_peak,
        homer_call_chipseq_peak_job,
        filter_kwargs_by_callable
    )
    from tqdm import tqdm

    if comparison_table is None:
        comparison_table = self.comparison_table

    req_columns = [
        "comparison_name",
        "sample_name",
        "comparison_side",
        "sample_group",
    ]
    if not all(col in comparison_table.columns for col in req_columns):
        msg = "Comparison table is missing some of the following columns: '{}'.".format(
            ",".join(req_columns)
        )
        _LOGGER.error(msg)
        raise AssertionError(msg)

    # Complement default `output_dir`
    if "{results_dir}" in output_dir:
        output_dir = os.path.abspath(
            output_dir.format(results_dir=self.results_dir)
        )
    # exist_ok avoids failing if the directory appears between check and creation
    os.makedirs(output_dir, exist_ok=True)

    # For each comparison
    for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):
        _LOGGER.info(
            "Doing comparison '{}' with positive samples '{}' and background samples '{}'".format(
                name,
                [s.name for s in comp['signal_samples']],
                [s.name for s in comp['control_samples']],
            )
        )

        # Call peaks
        bkws = filter_kwargs_by_callable(comp, macs2_call_chipseq_peak)
        call_kwargs = {"name": name, "distributed": distributed, **bkws}
        outputs = comp['peak_calls']['original']

        cmds = list()
        if overwrite or not os.path.exists(outputs['macs']):
            cmds.append(macs2_call_chipseq_peak(**call_kwargs))
        if overwrite or not os.path.exists(outputs['homer_factor']):
            cmds.append(homer_call_chipseq_peak_job(**call_kwargs))

        if not cmds:
            # Only report a skip when nothing at all had to be run
            _LOGGER.warning("Peak files for comparison '%s' already exist. Skipping.", name)
            continue

        if not distributed:
            for cmd in cmds:
                _LOGGER.info(
                    "Calling peaks for comparison '%s' with command: '%s'.\n", name, cmd)
                subprocess.call(cmd.split(" "))
def set_comparisons(self, comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"):
    """
    Set up an attribute containing information about the
    sample comparisons necessary for peak calling.

    Structure:

        * comparison_name:
            * signal_samples
            * control_samples
            * output_dir
            * prefix
            * genome
            * peak_calls
                * original
                    * macs
                    * homer_factor
                    * homer_histone
                * filtered
                    * macs
                    * homer_factor
                    * homer_histone

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`, optional
        Comparison table with peak comparisons. Only comparisons with
        "comparison_type" equal to "peaks" are set up. Rows with
        "comparison_side" equal to 1 are signal and rows with a value
        below 1 are control.

        Defaults to the analysis' own ``comparison_table`` attribute.

    peak_dir : :obj:`str`, optional
        Directory with peak calls.

        Defaults to "{results_dir}/chipseq_peaks".

    Returns
    -------
    :obj:`dict`
        The dictionary with the attributes.

    Attributes
    ----------
    comparisons : :obj:`dict`
        The dictionary with the attributes.

    Raises
    ------
    ValueError
        If a comparison does not contain two sides.
    AssertionError
        If a comparison does not have exactly one genome.
    """
    if comparison_table is None:
        comparison_table = self.comparison_table

    comparison_names = (
        comparison_table.loc[
            comparison_table['comparison_type'] == 'peaks',
            "comparison_name"]
        .drop_duplicates().sort_values()).tolist()
    if not comparison_names:
        _LOGGER.warning("Could not find any comparisons of type 'peaks'.")

    peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

    self.comparisons = dict()
    for name in comparison_names:
        _LOGGER.info("Setting comparison '%s' up", name)

        # Boolean mask instead of a string-built query, which would break on
        # comparison names containing quotes.
        rows = comparison_table.loc[comparison_table["comparison_name"] == name]
        sides = rows["comparison_side"]

        # Each comparison must have exactly two sides
        if len(set(sides)) != 2:
            error = "Comparison '{}' does not contain two sides.".format(name)
            _LOGGER.error(error)
            raise ValueError(error)

        # Get the sample names of samples in each side
        pos_names = set(rows.loc[sides == 1, "sample_name"])
        neg_names = set(rows.loc[sides < 1, "sample_name"])

        co = dict()
        co['signal_samples'] = [s for s in self.samples if s.name in pos_names]
        co['control_samples'] = [s for s in self.samples if s.name in neg_names]

        # Additional info
        co['output_dir'] = os.path.join(peak_dir, name)
        co['prefix'] = os.path.join(co['output_dir'], name)

        # A single, unique genome squeezes to a plain string
        g = rows['comparison_genome'].drop_duplicates().squeeze()
        if not isinstance(g, str):
            msg = "Could not determine genome of comparison '%s'." % name
            _LOGGER.error(msg)
            raise AssertionError(msg)
        co['genome'] = g

        # resulting files
        res = dict()
        res['macs'] = co['prefix'] + "_peaks.narrowPeak"
        res["homer_factor"] = co['prefix'] + "_homer_peaks.factor.narrowPeak"
        res["homer_histone"] = co['prefix'] + "_homer_peaks.histone.narrowPeak"
        co['peak_calls'] = dict()
        co['peak_calls']["original"] = res
        co['peak_calls']["filtered"] = {
            k: v.replace(".narrowPeak", ".filtered.bed")
            for k, v in res.items()}

        self.comparisons[name] = co
    return self.comparisons
def load_data(
        self,
        output_map=None,
        only_these_keys=None,
        resolutions=None,
        prefix="{results_dir}/{name}",
        permissive=True,
):
    """
    Load CSV output files of the Analysis into attributes, one DataFrame
    per resolution.

    Parameters
    ----------
    output_map : :obj:`dict`
        Dictionary with {attribute_name: {resolution: (file_path, kwargs)}}
        to load the files.
        The kwargs in the tuple will be passed to :meth:`pandas.read_csv`.

        Defaults to the files of the following attributes, named
        "{prefix}.{resolution}.<suffix>":

            * "matrix_raw"
            * "matrix_norm"
            * "segmentation"
            * "segmentation_annot"

    only_these_keys : :obj:`list`, optional
        Iterable of analysis attributes (keys of ``output_map``) to load up.

        Defaults to all keys of ``output_map``.

    resolutions: :obj:`list`
        List of resolution strings to get data for.

        Defaults to value of ``resolutions`` attribute of Analysis.

    prefix : :obj:`str`, optional
        String prefix of files to load.
        Variables in curly braces will be formated with attributes of analysis.

        Defaults to "{results_dir}/{name}".

    permissive : :obj:`bool`, optional
        Whether an IOError raised while reading a file should only be logged
        as a warning instead of being raised.

        Default is :obj:`True`.

    Attributes
    ----------
    dict
        For each loaded key, a dictionary of {resolution: DataFrame}.
        A resolution whose file could not be read may be left as
        :obj:`None`.

    Raises
    ----------
    IOError
        If not permissive and a file is not found
    """
    from ngs_toolkit.utils import fix_dataframe_header

    prefix = self._format_string_with_attributes(prefix)

    if resolutions is None:
        resolutions = self.resolutions

    if output_map is None:
        # Matrices are read with their first column as index; segmentation
        # files are read with no extra options.
        index_first = {"index_col": 0}
        templates = (
            ("matrix_raw", ".{}.matrix_raw.csv", True),
            ("matrix_norm", ".{}.matrix_norm.csv", True),
            ("segmentation", ".{}.segmentation.csv", False),
            ("segmentation_annot", ".{}.segmentation.annotated.csv", False),
        )
        output_map = {
            key: {
                r: (prefix + template.format(r), index_first if indexed else {})
                for r in resolutions
            }
            for key, template, indexed in templates
        }
    if only_these_keys is None:
        only_these_keys = list(output_map.keys())

    selected = {
        key: files for key, files in output_map.items()
        if key in only_these_keys
    }

    for name, files in selected.items():
        for resolution, (path, read_kwargs) in files.items():
            # Kept from the original: the path is formatted with the resolution
            path = path.format(resolution)
            _LOGGER.info(
                "Loading '{}' analysis attribute for resolution '{}'.".
                format(name, resolution))
            if not hasattr(self, name):
                setattr(self, name, {resolution: None})
            try:
                frame = pd.read_csv(path, **read_kwargs)
                store = getattr(self, name)
                store[resolution] = frame
                # Fix possible multiindex for matrix_norm
                if name == "matrix_norm":
                    store[resolution] = fix_dataframe_header(frame)
            except IOError as e:
                if not permissive:
                    raise
                _LOGGER.warning(e)
def main(cli=None):
    """
    Entry point of the 'merge_signal' recipe.

    Builds an Analysis from the PEP configuration file, optionally drops
    samples which did not pass QC, keeps only samples whose protocol is
    ATAC-seq, ChIP-seq or ChIPmentation and calls ``merge_signal`` on the
    samples grouped by the requested attributes.

    Parameters
    ----------
    cli : :obj:`list`, optional
        Command-line arguments handed to the argument parser.

    Returns
    -------
    int or None
        1 if no attributes to group samples by could be determined,
        otherwise :obj:`None`.

    Raises
    ------
    ValueError
        If there are no samples to consider before the protocol filter.
    """
    args = parse_arguments().parse_args(cli)
    _LOGGER.info(
        "This is the 'merge_signal' recipe from ngs_toolkit, version: %s",
        __version__)

    # Start project
    _LOGGER.debug(
        "Starting Analysis with PEP project configuration file: '%s'",
        args.config_file)
    an = Analysis(from_pep=args.config_file)

    if args.pass_qc:
        _LOGGER.info(
            "Filtering samples out which didn't pass QC as specified in "
            "sample annotation in column 'pass_qc'")
        failed_values = ["0", 0, "False", False]
        an.samples = [
            s for s in an.samples
            if getattr(s, "pass_qc") not in failed_values
        ]

    if not an.samples:
        raise ValueError(
            "There were no valid samples after filtering for quality!")
    names = [s.name for s in an.samples]
    print(
        "Samples under consideration: '{}'. ".format(",".join(names))
        + "\nTotal of {} samples.".format(len(names)))

    # Get only samples with signal
    signal_protocols = ["ATAC-seq", "ChIP-seq", "ChIPmentation"]
    an.samples = [
        s for s in an.samples
        if getattr(s, "protocol", None) in signal_protocols
    ]
    an.set_organism_genome()
    sheet = an.prj.sheet.reindex([s.name for s in an.samples])

    _LOGGER.info(
        "Selecting samples with appropriate data type."
        "\nSamples under consideration: '%s'. \nTotal of %i samples.",
        ",".join(sheet["sample_name"].tolist()),
        sheet.shape[0])

    # Get default attributes if not set
    if args.attributes is not None:
        args.attributes = (
            args.attributes.split(",")
            if "," in args.attributes
            else [args.attributes])
    elif an.group_attributes:
        args.attributes = an.group_attributes
    else:
        _LOGGER.error(
            "Sample attributes to group by were not set and none could be "
            "found in project configuration file! Aborting!")
        return 1

    _LOGGER.info(
        "Using the following attributes to merge samples: '%s', "
        "resulting in a total of %i groups.",
        "', '".join(args.attributes),
        len(sheet.groupby(args.attributes).groups.items()))

    merge_signal(
        sheet,
        an.samples,
        args.attributes,
        output_dir=args.output_dir,
        normalization_method=args.normalization_method,
        nucleosome=args.nucleosome,
        overwrite=args.overwrite,
        cpus=args.cpus,
        as_job=args.as_job,
        dry_run=args.dry_run,
    )