def makePureSignal(row):
    """Write the control-subtracted coverage track for one signal/control pair.

    ``row`` is indexed positionally: ``row[0]`` is the signal sample name,
    ``row[1]`` the control sample name and ``row[2]`` the factor passed as
    ``scale`` when computing the control coverage.

    Steps, all relative to ``../H3K27me3/``:

    1. bedGraph coverage of ``bw/<signal>.bam`` -> ``bw/<signal>.bdg``.
    2. Scaled bedGraph coverage of ``control/<control>.bam`` ->
       ``control/<control>.bdg``.
    3. Union of both tracks -> ``bw/<signal>_<control>.bdg`` (removed again
       once it has been read).
    4. ``signal - control`` with negative values replaced by 0; rows whose
       difference is 0 are dropped and the rest is written to
       ``bw/pure_<signal>.bdg``.

    NOTE(review): the result is written with pandas' default index and header
    row, which plain bedGraph consumers may not expect -- confirm intended.
    """

    def _floor_at_zero(value):
        # Negative differences mean the control exceeds the signal here.
        return 0 if value < 0 else value

    signal, control, norm_factor = row[0], row[1], row[2]

    signal_dir = '../H3K27me3/bw/'
    signal_bdg = signal_dir + signal + '.bdg'
    BedTool(signal_dir + signal + '.bam').genome_coverage(
        bg=True, output=signal_bdg)

    control_dir = '../H3K27me3/control/'
    control_bdg = control_dir + control + '.bdg'
    BedTool(control_dir + control + '.bam').genome_coverage(
        bg=True, scale=norm_factor, output=control_bdg)

    # Put both tracks on a common set of intervals so they can be subtracted.
    tracks = [BedTool(signal_bdg), BedTool(control_bdg)]
    union_bdg = signal_dir + signal + '_' + control + '.bdg'
    BedTool().union_bedgraphs(i=[track.fn for track in tracks],
                              output=union_bdg)

    merged = pd.read_csv(union_bdg, sep='\t', index_col=False, header=None)
    # Column 3 holds the signal depth, column 4 the control depth.
    merged['diff'] = (merged[3] - merged[4]).apply(_floor_at_zero)
    merged = merged.drop([3, 4], axis=1)
    enriched = merged[merged['diff'] != 0]

    os.remove(union_bdg)
    enriched.to_csv(signal_dir + 'pure_' + signal + '.bdg', sep='\t')
def cov_at_loci(bam, cov_fn, max_gap=1000):
    """Merge nearby covered intervals into loci and rank them by mean depth.

    Coverage of ``bam`` is computed in bedGraph form. Within each chromosome,
    an interval whose start lies less than ``max_gap`` bases after the end of
    the current locus is merged into that locus; otherwise it starts a new
    one. For every locus the unweighted mean of the depths of its member
    intervals is computed.

    The result is written to ``cov_fn`` as a tab-separated table with the
    header ``chr``, ``s``, ``e``, ``cov``, sorted by ``cov`` in descending
    order.

    Args:
        bam: input accepted by ``BedTool(...)`` (used with genome_coverage).
        cov_fn: path of the output table.
        max_gap: merge distance in bases (default 1000, the previous
            hard-coded value).

    Notes:
        * Earlier versions wrote a header-less scratch table and re-read it
          with ``header=0``, which silently discarded the first locus. The
          table is now built in memory, so every locus is kept.
        * The mean is accumulated while merging instead of re-querying the
          whole coverage table per locus. This assumes each chromosome's
          intervals arrive sorted and non-overlapping, as in bedGraph
          coverage output -- TODO confirm for unusual inputs.
    """
    cov_df = BedTool(bam).genome_coverage(bg=True).to_dataframe()

    # chrom -> list of [start, end, depth_sum, n_intervals]
    loci = {}
    for chrom, start, end, depth in cov_df.itertuples(index=False, name=None):
        start = int(start)
        end = int(end)
        chrom_loci = loci.setdefault(chrom, [])
        if chrom_loci and start - chrom_loci[-1][1] < max_gap:
            current = chrom_loci[-1]
            current[1] = end
            current[2] += depth
            current[3] += 1
        else:
            chrom_loci.append([start, end, depth, 1])

    records = [
        (chrom, start, end, depth_sum / n_intervals)
        for chrom, chrom_loci in loci.items()
        for start, end, depth_sum, n_intervals in chrom_loci
    ]
    ranked = pd.DataFrame(records, columns=['chr', 's', 'e', 'cov'])
    ranked = ranked.sort_values('cov', ascending=False)
    ranked.to_csv(cov_fn, sep='\t', index=False)
def _strand_intervals(bam_bt, strand, scratch_prefix):
    """Return the intervals covered on one strand as a Chr/Start/End/Strand frame."""
    scratch = scratch_prefix + str(uuid.uuid4())
    try:
        bam_bt.genome_coverage(bg=True, strand=strand).saveas(scratch)
        # The fourth bedGraph column (depth) is read under the name 'Strand'
        # and overwritten below, so only the interval coordinates survive.
        intervals = pd.read_csv(scratch, sep='\t', header=None,
                                names=['Chr', 'Start', 'End', 'Strand'],
                                index_col=None)
    finally:
        # Remove the scratch file even when coverage or parsing fails.
        if os.path.exists(scratch):
            os.remove(scratch)
    intervals['Strand'] = strand
    return intervals


def make_bg(files):
    """Write a sorted, strand-labelled table of covered intervals for a BAM.

    Args:
        files: indexable pair. ``files[0]`` is the path of the input ``.bam``;
            ``files[1]`` is a string prefix (typically a directory ending in
            a path separator) that is prepended to the output and scratch
            file names.

    Returns:
        1 on completion.

    The output path is ``files[1]`` followed by the BAM's base name with
    ``.bam`` replaced by ``.bg``. It holds header-less, tab-separated rows of
    chromosome, start, end and strand (``+`` or ``-``), covering every
    interval reported by per-strand bedGraph coverage, sorted with
    ``BedTool.sort``. No coverage threshold is applied.

    Files are combined and removed from Python rather than through shell
    commands, so paths containing spaces or shell metacharacters are handled
    correctly.
    """
    bam_path, out_prefix = files[0], files[1]
    out_path = out_prefix + bam_path.split("/")[-1].replace('.bam', '.bg')

    bam_bt = BedTool(bam_path)
    plus = _strand_intervals(bam_bt, '+', out_prefix)
    minus = _strand_intervals(bam_bt, '-', out_prefix)

    # Plus-strand rows first, then minus-strand rows appended to the same file.
    plus.to_csv(out_path, sep='\t', index=False, header=False)
    minus.to_csv(out_path, sep='\t', index=False, header=False, mode='a')

    # sort() produces a new BedTool, which then overwrites the unsorted file.
    BedTool(out_path).sort().saveas(out_path)
    return 1
def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  plot_x_limits, plot_y_limits, base_q_threshold,
                  mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, plot_only_non_duplicates=False,
                  bin_large_plots=False, binning_summary_statistic="max",
                  out_summary=None):
    ''' Generate a coverage plot from an aligned bam file

        Per-position depth is computed for every reference segment in
        ``in_bam`` and drawn end-to-end along one x axis, one colour per
        segment.

        Depth source:
          * ``plot_only_non_duplicates`` false: ``bedtools genomecov -d`` via
            pybedtools. The quality/depth/length thresholds are not applied
            (a warning is logged for each one that is set).
          * ``plot_only_non_duplicates`` true: reads with flag 1024 are
            filtered out and ``samtools depth -aa`` is used with ``-q``
            (base_q_threshold), ``-Q`` (mapping_q_threshold), ``-m``
            (max_coverage_depth) and ``-l`` (read_length_threshold) for each
            threshold that is truthy.

        Plot options:
          * ``plot_data_style``: "filled", "line" or "dots"; any other value
            draws no data.
          * ``plot_width`` / ``plot_height``: size in pixels (divided by the
            DPI to obtain inches); ``plot_dpi`` falls back to the current
            figure's DPI when falsy.
          * ``plot_x_limits`` / ``plot_y_limits``: ``(min, max)`` pairs or
            None.
          * ``bin_large_plots``: summarise depths in bins of roughly one bin
            per horizontal pixel using ``binning_summary_statistic``
            ("min", "max", "mean" or "median"; unknown names fall back to
            "max").

        ``out_summary``: path that receives the depth table; when None a
        temporary table is used and removed afterwards.

        Raises Exception when ``in_bam`` contains no mapped reads.

        Note: the input bam is indexed in place when it is already sorted and
        no duplicate filtering is requested.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # An unaligned bam has nothing to plot; fail early with a hint.
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if the plot input bam file contains reads and you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    summary_is_temp = out_summary is None
    if summary_is_temp:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    # Temp names are requested only on the paths that actually write to them,
    # so nothing is allocated and then abandoned.
    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does
        # not count marked duplicates.
        # Exclude reads with the 1024 flag set (PCR or optical duplicates).
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024", '-@', '3'], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # Only sort if not already sorted.
    should_remove_sorted = True
    if not util.file.bam_is_sorted(bam_dupe_processed):
        bam_sorted = util.file.mkstempfname('.sorted.bam')
        samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
        if plot_only_non_duplicates:
            os.unlink(bam_dupe_processed)
    else:
        bam_sorted = bam_dupe_processed
        if not plot_only_non_duplicates:
            # The original in_bam is passed through directly; never delete it.
            should_remove_sorted = False

    samtools.index(bam_sorted)

    # Options for "samtools depth"; '-aa' reports "absolutely all" positions.
    # "bedtools genomecov" counts duplicates but exposes no quality filters,
    # while "samtools depth" ignores marked duplicates and supports them, so
    # the thresholds only take effect on the samtools path.
    opts = ['-aa']
    depth_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, threshold in depth_filters:
        if not threshold:
            continue
        if not plot_only_non_duplicates:
            log.warning(
                "'%s' ignored since --plotOnlyNonDuplicates is absent" % flag)
        opts += [flag, str(threshold)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        BedTool(bam_sorted).genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)

    # Only remove the sorted bam if it is not the original input bam.
    if should_remove_sorted:
        os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> depth at each position, in file order
    segment_depths = OrderedDict()
    domain_max = 0  # total number of positions across all segments
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(float(row[2]))
            domain_max += 1

    with matplotlib.pyplot.style.context(plot_style):
        fig = matplotlib.pyplot.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI),
                            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        # Create the (empty) axes so tick labels can be styled up front.
        ax = matplotlib.pyplot.subplot()
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Binning
        bin_size = 1
        summary_name = binning_summary_statistic
        if bin_large_plots:
            binning_fn = {
                "min": min,
                "max": max,
                "mean": mean,
                "median": median
            }
            # Fall back to the max *function* (not the string "max") so an
            # unknown statistic name still yields a callable.
            binning_action = binning_fn.get(binning_summary_statistic, max)
            if binning_summary_statistic not in binning_fn:
                summary_name = "max"

            # Width of the actual plot area (sans whitespace and y axis text).
            inner_plot_width_inches = ax.get_window_extent().transformed(
                fig.dpi_scale_trans.inverted()).width
            inner_plot_width_px = inner_plot_width_inches * fig.dpi
            bins_per_pixel = 1  # increase for smaller (less visible) bins
            bin_size = 1 + int(
                domain_max / (inner_plot_width_px * bins_per_pixel))

            binned_segment_depths = OrderedDict()
            for segment_name, position_depths in segment_depths.items():
                binned_segment_depths[segment_name] = [
                    binning_action(position_depths[i:i + bin_size])
                    for i in range(0, len(position_depths), bin_size)
                ]
            segment_depths = binned_segment_depths

        # Plotting: segments are laid out one after another along x.
        colors = list(
            matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()['color'])
        domain_max = 0
        for segment_num, position_depths in enumerate(
                segment_depths.values()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            # Pick a colour from the style's cycle, offset by segment index.
            segment_color = colors[segment_num % len(colors)]

            # Scale bin indices back to base-pair coordinates.
            x_values = [
                x * bin_size for x in range(prior_domain_max, domain_max)
            ]

            if plot_data_style == "filled":
                matplotlib.pyplot.fill_between(x_values,
                                               position_depths,
                                               [0] * len(position_depths),
                                               linewidth=0,
                                               antialiased=True,
                                               color=segment_color)
            elif plot_data_style == "line":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       antialiased=True,
                                       color=segment_color)
            elif plot_data_style == "dots":
                # Marker-only format; the colour comes from the keyword alone
                # (a colour in the format string would be redundant).
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       'o',
                                       antialiased=True,
                                       color=segment_color)

        matplotlib.pyplot.title(plot_title, fontsize=font_size * 1.2)
        matplotlib.pyplot.xlabel("bp", fontsize=font_size * 1.1)

        ylabel = "read depth"
        if bin_size > 1:
            ylabel = "read depth ({summary} in {size}-bp bin)".format(
                size=bin_size, summary=summary_name)
        matplotlib.pyplot.ylabel(ylabel, fontsize=font_size * 1.1)

        if plot_x_limits is not None:
            x_min, x_max = plot_x_limits
            matplotlib.pyplot.xlim(x_min, x_max)
        if plot_y_limits is not None:
            y_min, y_max = plot_y_limits
            matplotlib.pyplot.ylim(y_min, y_max)

        # to squash a backend renderer error on OSX related to tight layout
        if matplotlib.pyplot.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        matplotlib.pyplot.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: " + out_plot_file)

    # Remove the depth table only when it was a temp file created here.
    if summary_is_temp:
        os.unlink(coverage_tsv_file)
def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style,
                  plot_style, plot_width, plot_height, plot_dpi, plot_title,
                  base_q_threshold, mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, plot_only_non_duplicates=False,
                  out_summary=None):
    ''' Generate a coverage plot from an aligned bam file

        Per-position depth is computed for every reference segment in
        ``in_bam`` and drawn end-to-end along one x axis, one colour per
        segment. The bam is always sorted into a temporary file first.

        Depth source:
          * ``plot_only_non_duplicates`` false: ``bedtools genomecov -d`` via
            pybedtools. The quality/depth/length thresholds are not applied
            (a warning is logged for each one that is set).
          * ``plot_only_non_duplicates`` true: reads with flag 1024 are
            filtered out and ``samtools depth -aa`` is used with ``-q``
            (base_q_threshold), ``-Q`` (mapping_q_threshold), ``-m``
            (max_coverage_depth) and ``-l`` (read_length_threshold) for each
            threshold that is truthy.

        Plot options:
          * ``plot_data_style``: "filled", "line" or "dots"; any other value
            draws no data.
          * ``plot_width`` / ``plot_height``: size in pixels (divided by the
            DPI to obtain inches); ``plot_dpi`` falls back to the current
            figure's DPI when falsy.

        ``out_summary``: path that receives the depth table; when None a
        temporary table is used and removed afterwards.

        Raises Exception when ``in_bam`` contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # An unaligned bam has nothing to plot; fail early with a hint.
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    summary_is_temp = out_summary is None
    if summary_is_temp:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    # The duplicate-filtered temp name is requested only when it is written,
    # so nothing is allocated and then abandoned.
    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does
        # not count marked duplicates.
        # Exclude reads with the 1024 flag set (PCR or optical duplicates).
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    samtools.index(bam_sorted)

    # Options for "samtools depth"; '-aa' reports "absolutely all" positions.
    # "bedtools genomecov" counts duplicates but exposes no quality filters,
    # while "samtools depth" ignores marked duplicates and supports them, so
    # the thresholds only take effect on the samtools path.
    opts = ['-aa']
    depth_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, threshold in depth_filters:
        if not threshold:
            continue
        if not plot_only_non_duplicates:
            log.warning(
                "'%s' ignored since --plotOnlyNonDuplicates is absent" % flag)
        opts += [flag, str(threshold)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        BedTool(bam_sorted).genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> depth at each position, in file order
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI),
                            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        # Create the (empty) axes so tick labels can be styled up front.
        ax = plt.subplot()
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Segments are laid out one after another along x.
        colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])
        domain_max = 0
        for segment_num, position_depths in enumerate(
                segment_depths.values()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)
            x_values = range(prior_domain_max, domain_max)

            # Pick a colour from the style's cycle, offset by segment index.
            segment_color = colors[segment_num % len(colors)]

            if plot_data_style == "filled":
                plt.fill_between(x_values,
                                 position_depths,
                                 [0] * len(position_depths),
                                 linewidth=0,
                                 antialiased=True,
                                 color=segment_color)
            elif plot_data_style == "line":
                plt.plot(x_values,
                         position_depths,
                         antialiased=True,
                         color=segment_color)
            elif plot_data_style == "dots":
                # Marker-only format; the colour comes from the keyword alone
                # (a colour in the format string would be redundant).
                plt.plot(x_values,
                         position_depths,
                         'o',
                         antialiased=True,
                         color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: " + out_plot_file)

    # Remove the depth table only when it was a temp file created here.
    if summary_is_temp:
        os.unlink(coverage_tsv_file)