def chipseq_analysis_with_peaks(chipseq_analysis):
    """
    Write mock peak files for every comparison of a ChIP-seq analysis.

    For each comparison in ``chipseq_analysis.comparisons`` and for both the
    "original" and "filtered" peak types, a random 30% sample of the
    analysis' ``sites`` is written as a tab-separated file (no header) to the
    path registered for each peak caller.

    Parameters
    ----------
    chipseq_analysis
        Analysis object exposing ``sites`` (with a ``to_dataframe`` method)
        and ``comparisons`` (a mapping whose values hold ``"output_dir"`` and
        ``"peak_calls"`` entries).

    Returns
    -------
    The same ``chipseq_analysis`` object that was passed in.
    """
    import numpy as np

    from ngs_toolkit.utils import bed_to_index

    df = chipseq_analysis.sites.to_dataframe()

    # Extra columns so the files have name/score/strand fields.
    df["name"] = bed_to_index(df)  # dummy column
    df["score"] = np.random.random(df.shape[0])  # stands in for a p-value
    df["strand"] = "."  # dummy column

    # for homer the column order is different
    homer_columns = ["name", "chrom", "start", "end", "score", "strand"]

    for comp in chipseq_analysis.comparisons.values():
        # exist_ok: do not fail when the directory is already there
        # (e.g. shared between comparisons or left over from a previous run).
        os.makedirs(comp["output_dir"], exist_ok=True)
        for peak_type in ["original", "filtered"]:
            for peak_caller, peak_file in comp["peak_calls"][peak_type].items():
                # select 30% of all peaks to be present in each sample
                peaks = df.sample(frac=0.3)
                if "homer" in peak_caller:
                    peaks = peaks[homer_columns]
                with open(peak_file, "w") as handle:
                    for _, entry in peaks.iterrows():
                        handle.write("\t".join(entry.astype(str)) + "\n")
    return chipseq_analysis
def get_random_genomic_locations(
    n_regions,
    width_mean=500,
    width_std=400,
    min_width=300,
    genome_assembly="hg38",
):
    """
    Get ``n_regions`` random genomic locations respecting the boundaries
    of the ``genome_assembly``.

    Chromosomes are drawn with probability proportional to their size
    (chromosomes with ``_`` in their name are excluded). Region widths are
    the absolute value of draws from a normal distribution, truncated to
    integers; regions narrower than ``min_width`` get ``min_width`` added.
    The regions are then placed with ``bedtools shuffle`` (same chromosome,
    no overlaps) and sorted.

    Parameters
    ----------
    n_regions : int
        Number of regions to generate.
    width_mean : number, optional
        Mean of the normal distribution of region widths.
    width_std : number, optional
        Standard deviation of the normal distribution of region widths.
    min_width : int, optional
        Width below which a region is enlarged by ``min_width``.
    genome_assembly : str, optional
        Genome assembly name passed to ``pybedtools``.

    Returns
    -------
    The output of ``bed_to_index`` for the shuffled, sorted regions.
    """
    from ngs_toolkit.utils import bed_to_index

    # weight chroms by their size, excluding others
    all_sizes = dict(pybedtools.chromsizes(genome_assembly))
    chrom_sizes = {
        name: bounds[-1] for name, bounds in all_sizes.items() if "_" not in name
    }
    genome_size = sum(chrom_sizes.values())
    chrom_names = list(chrom_sizes)
    chrom_weights = [size / genome_size for size in chrom_sizes.values()]

    # NOTE: the order of the two random draws (chromosomes, then widths)
    # matters for reproducibility under a fixed numpy seed.
    chrom = pd.Series(
        np.random.choice(a=chrom_names, size=n_regions, p=chrom_weights)
    )
    start = np.zeros(n_regions, dtype=int)
    widths = np.random.normal(width_mean, width_std, n_regions)
    end = np.absolute(widths).astype(int)

    # Columns are positional (0: chrom, 1: start, 2: end).
    regions = pd.DataFrame([chrom.tolist(), start.tolist(), end.tolist()]).T
    too_narrow = (regions[2] - regions[1]) < min_width
    regions.loc[too_narrow, 2] += min_width

    shuffled = pybedtools.BedTool.from_dataframe(regions).shuffle(
        genome=genome_assembly,
        chromFirst=True,
        noOverlapping=True,
        chrom=True,
    )
    bed = shuffled.sort().to_dataframe()
    return bed_to_index(bed)
def main(cli=None):
    """
    Run region-context enrichment for the regions of a BED file.

    Loads an ``ATACSeqAnalysis`` from the PEP given on the command line,
    reads the first three columns of the BED file, computes
    ``region_context_enrichment`` for those regions and writes the result
    as CSV to the output file.

    Parameters
    ----------
    cli : list, optional
        Command-line arguments to parse instead of ``sys.argv``
        (passed straight to ``parse_args``).

    Returns
    -------
    int
        ``0`` both when the work was done and when it was skipped because
        the output already exists and ``overwrite`` is not set.
    """
    print("Region type analysis")
    # Parse command-line arguments.
    args = parse_arguments().parse_args(cli)
    if os.path.exists(args.output_file) and (not args.overwrite):
        print("Output exists and `overwrite` is False, so not doing anything.")
        return 0

    print("Reading up the analysis object.")
    a = ATACSeqAnalysis(from_pep=args.pep)
    a.load_data()

    print("Reading up the BED file.")
    df = pd.read_csv(args.bed_file, sep="\t", header=None)
    # BED files may carry optional columns (name, score, strand, ...);
    # only the first three are needed, and assigning three names to a wider
    # frame would raise a ValueError.
    df = df.iloc[:, :3]
    df.columns = ["chrom", "start", "end"]

    print("Getting the index.")
    index = bed_to_index(df)

    print("Doing enrichment.")
    enr = a.region_context_enrichment(index)

    print("Saving.")
    enr.to_csv(args.output_file)
    print("Done.")
    # Same exit status as the early-exit path above.
    return 0
def get_genomic_bins(n_bins, genome_assembly="hg38", resolution=None):
    """
    Get the first ``n_bins`` genomic bins tiling the ``genome_assembly``.

    The genome is split with ``bedtools makewindows`` into windows of width
    ``total genome length // n_bins``; the first ``n_bins`` windows are
    returned. If ``resolution`` is given, each window's end is overwritten
    with ``start + resolution``.

    Parameters
    ----------
    n_bins : int
        Number of bins to return.
    genome_assembly : str, optional
        Genome assembly name passed to ``pybedtools``.
    resolution : int or str, optional
        Fixed bin width. A string such as ``"10kb"`` is converted by
        replacing ``"kb"`` with ``"000"``; other unit suffixes are not
        handled.

    Returns
    -------
    The output of ``bed_to_index`` for the selected bins.
    """
    from ngs_toolkit.utils import bed_to_index

    chrom_sizes = pd.DataFrame(
        dict(pybedtools.chromsizes(genome_assembly))
    ).T.reset_index()
    bed = pybedtools.BedTool.from_dataframe(chrom_sizes)

    # bedtools expects an integer window size; true division would hand it
    # a float such as "1234.5".
    window_size = int(sum(i.length for i in bed) // n_bins)
    w = bed.makewindows(genome=genome_assembly, w=window_size).to_dataframe()

    if resolution is not None:
        if isinstance(resolution, str):
            resolution = int(resolution.replace("kb", "000"))
        w["end"] = w["start"] + resolution
    return bed_to_index(w.head(n_bins))
def calculate_peak_support(
    self,
    samples=None,
    region_type="summits",
    peak_type="filtered",
    permissive=True,
    comparison_table=None,
    peak_dir="{results_dir}/chipseq_peaks",
):
    """
    Calculate a measure of support for each region in peak set
    (i.e. ratio of samples containing a peak overlapping region in
    union set of peaks).

    Two CSV files are written to ``self.results_dir``: the per-comparison
    overlap counts (``<name>_peaks.binary_overlap_support.csv``) and the
    same table with an added ``support`` column
    (``<name>_peaks.support.csv``).

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`, optional
        DataFrame with signal/background combinations used to call peaks

        Defaults to analysis' own `comparison_table`.
    peak_dir : :obj:`str`, optional
        Path to peaks output directory.

        Defaults to {analysis.results_dir}/chipseq_peaks
    peak_type : :obj:`str`, optional
        Key selecting which set of peak files of each comparison to use
        (``comp["peak_calls"][peak_type]``).

        Defaults to "filtered".
    permissive : :obj:`bool`, optional
        Whether a peak file that cannot be intersected should be skipped
        (after a warning) instead of raising the error.

        Defaults to :obj:`True`.
    samples : :obj:`list`
        Not used. Provided for compatibility with ATACSeqAnalysis class.
    region_type : :obj:`str`
        Not used. Provided for compatibility with ATACSeqAnalysis class.

    Attributes
    ----------
    support : :obj:`pandas.DataFrame`
        Number of overlapping peaks per region for each
        (comparison, peak_caller) pair, plus a ``support`` column with the
        fraction of those pairs that have at least one overlap.
    """
    import pybedtools
    from tqdm import tqdm

    from ngs_toolkit.utils import bed_to_index

    # NOTE: `comparison_table` and `peak_dir` are resolved for interface
    # compatibility but are not used further down; peak file paths come
    # from `self.comparisons`.
    if comparison_table is None:
        comparison_table = self.comparison_table

    peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

    # get index
    index = bed_to_index(self.sites.to_dataframe())

    # calculate support (number of samples overlapping each merged peak)
    support = pd.DataFrame(index=index)
    for name, comp in tqdm(
        self.comparisons.items(), total=len(self.comparisons), desc="Comparison"
    ):
        for peak_caller, peak_file in comp["peak_calls"][peak_type].items():
            try:
                sample_support = self.sites.intersect(
                    peak_file, wa=True, c=True
                ).to_dataframe()
            except (
                ValueError,
                pybedtools.MalformedBedLineError,
                pybedtools.helpers.BEDToolsError,
            ):
                # Pass the two values as separate arguments: a single tuple
                # cannot fill two `%s` placeholders.
                _LOGGER.warning(
                    "Peaks for comparison %s (%s) not found!", name, peak_file
                )
                if permissive:
                    continue
                raise
            sample_support.index = index
            # 4th column holds the overlap count appended by `intersect -c`
            support[(name, peak_caller)] = sample_support.iloc[:, 3]

    # Make multiindex labeling comparisons and peak type
    support.columns = pd.MultiIndex.from_tuples(
        support.columns, names=["comparison", "peak_caller"]
    )
    support.to_csv(
        os.path.join(
            self.results_dir, self.name + "_peaks.binary_overlap_support.csv"
        ),
        index=True,
    )

    # divide sum (of unique overlaps) by total to get support value between 0 and 1
    support["support"] = support.astype(bool).sum(axis=1) / float(support.shape[1])

    # save
    support.to_csv(
        os.path.join(self.results_dir, self.name + "_peaks.support.csv"), index=True
    )

    self.support = support
def get_cnv_data(
    self, resolutions=None, samples=None, save=True, assign=True, permissive=False
):
    """
    Load CNV data from ATAC-seq CNV pipeline and create CNV matrix at various resolutions.

    Parameters
    ----------
    resolutions : :obj:`list`, optional
        Resolutions of analysis.

        Defaults to resolutions in Analysis object.
    samples : :obj:`list`, optional
        Samples to restrict analysis to.

        Defaults to samples in Analysis object.
    save : :obj:`bool`, optional
        Whether results should be saved to disc.

        Defaults to :obj:`True`
    assign : :obj:`bool`, optional
        Whether results should be assigned to an attribute in the Analysis object.

        Defaults to :obj:`True`
    permissive : :obj:`bool`, optional
        Whether missing files should be allowed. Samples without a usable
        input are skipped with a warning.

        Defaults to :obj:`False`

    Returns
    -------
    dict
        Dictionary with CNV matrices for each resolution.

    Raises
    ------
    AttributeError
        If not permissive and a sample has no 'log2_read_counts' attribute.
    IOError
        If not permissive and input files can't be read.
    ValueError
        If none of the samples had a readable input file for a resolution.

    Attributes
    ----------
    matrix_raw : :obj:`dict`
        Sets a `matrix_raw` dictionary with CNV matrices for each resolution.
    """
    # TODO: figure out a way of having the input file specified before hand
    from tqdm import tqdm

    from ngs_toolkit.utils import bed_to_index

    if resolutions is None:
        resolutions = self.resolutions

    if samples is None:
        samples = self.samples

    matrix_raw = dict()

    for resolution in tqdm(resolutions, total=len(resolutions), desc="Resolution"):
        matrix_raw[resolution] = pd.DataFrame()
        # Last table read successfully *for this resolution*; reset here so
        # a table from a previous resolution is never reused by accident.
        cov = None

        for sample in tqdm(samples, total=len(samples), desc="Sample"):
            # Read log2 file
            if not hasattr(sample, "log2_read_counts"):
                msg = "Sample does not have a 'log2_read_counts' attribute."
                warn_or_raise(AttributeError(msg), permissive)
                # Only reached in permissive mode: nothing to read.
                continue

            input_file = sample.log2_read_counts[resolution].format(
                resolution=resolution
            )
            try:
                sample_cov = pd.read_csv(
                    input_file, sep="\t", comment="#"
                ).set_index("Feature")
            except IOError:
                err = IOError(
                    "Sample %s does not have a 'log2_read_counts' file: '%s'."
                    % (sample.name, input_file)
                )
                warn_or_raise(err, permissive)
                # Only reached in permissive mode: skip this sample rather
                # than reusing the previous sample's table.
                continue
            cov = sample_cov

            # TODO: this is specific to CopyWriter, should be removed later
            # and probably replaced with the column position
            # regex=False: the patterns contain literal dots and must not
            # be interpreted as regular expressions.
            cov.columns = (
                cov.columns.str.replace("log2.", "", regex=False)
                .str.replace(".trimmed.bowtie2.filtered.bam", "", regex=False)
                .str.replace(".merged.sorted.subsample.bam", "", regex=False)
            )

            # normalize signal to control (last column of the table)
            # TODO: check whether there was a reason for previously undoing
            # and redoing the log before taking the ratio
            matrix_raw[resolution][sample.name] = (
                cov.loc[:, sample.name] - cov.iloc[:, -1]
            )

        if cov is None:
            msg = "None of the samples had a valid 'log2_read_counts' file."
            _LOGGER.error(msg)
            raise ValueError(msg)

        # Name the first three columns so `bed_to_index` can build the
        # region index from them.
        c = cov.columns.tolist()
        c[:3] = ["chrom", "start", "end"]
        cov.columns = c
        matrix_raw[resolution].index = bed_to_index(cov)

        if save:
            matrix_raw[resolution].to_csv(
                os.path.join(
                    self.results_dir,
                    self.name + ".{}.matrix_raw.csv".format(resolution),
                ),
                index=True,
            )

    if assign:
        self.matrix_raw = matrix_raw

    return matrix_raw