def test_readsToWiggle_pysam_wiggles(self):
    """
    Compares the wiggle track returned by readsToWiggle_pysam (last
    argument True) against the expected per-position values, to 4
    decimal places.
    """
    bam_path = os.path.join(clipper.test_dir(), "allup_test.bam")
    bam = pysam.Samfile(bam_path)
    region_reads = bam.fetch(region="chr15:91536649-91537641")

    wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
        region_reads, 91537632, 91537675, '-', 'center', True)

    # Expected track, written as runs: 8 + 25 + 8 + 3 = 44 positions.
    wiggle_true = ([0.06060606060606061] * 8 +
                   [0.33333333333333326] * 25 +
                   [0.2727272727272727] * 8 +
                   [0.0] * 3)

    print(wiggle_true)
    print(wiggle)

    for expected, observed in zip(wiggle_true, wiggle):
        self.assertAlmostEqual(observed, expected, 4)
def test_readsToWiggle_pysam(self):
    """
    Checks the wiggle track, the per-position counts and the read
    lengths returned by readsToWiggle_pysam (last argument False)
    against their expected values.
    """
    bam_path = os.path.join(clipper.test_dir(), "allup_test.bam")
    bam = pysam.Samfile(bam_path)
    region_reads = bam.fetch(region="chr15:91536649-91537641")

    wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
        region_reads, 91537632, 91537675, '-', 'center', False)

    # Expected wiggle track, written as runs: 8 + 25 + 8 + 3 = 44 positions.
    wiggle_true = [2.] * 8 + [11.] * 25 + [9.] * 8 + [0.] * 3
    print(wiggle)
    for expected, observed in zip(wiggle_true, wiggle):
        self.assertEqual(observed, expected)

    # Expected positional counts: all zero except index 16 and index 24.
    pos_counts_true = [0.] * 16 + [2.] + [0.] * 7 + [9.] + [0.] * 19
    for expected, observed in zip(pos_counts_true, pos_counts):
        self.assertEqual(observed, expected)

    assert lengths == [33] * 11
def count_gene(bam_file, gene, flip):
    """
    Sum the wiggle track over every region of one gene.

    bam_file - path of a BAM file; it is opened here with pysam.Samfile
               and always closed again, even if reading fails
    gene - dict providing 'chrom', 'start', 'stop', 'strand', 'gene_id',
           'frea' and 'regions' (an iterable of (start, stop) pairs)
    flip - "flip" swaps '+' and '-' before counting, "both" passes 0 as
           the strand to readsToWiggle_pysam, anything else keeps the
           gene's own strand

    Returns a list with one (key, info) tuple per region, where key is
    "<gene_id>:<start>-<stop>" and info is a dict holding chrom, start,
    stop, strand, gene_id, frea and "counts", the latter being
    count(gene_sum, region_sum).
    """
    def region_key(region_start, region_stop):
        return gene['gene_id'] + ":" + str(region_start) + "-" + str(region_stop)

    # Coerce once so the fetch window and the offsets below agree.
    gene_start = int(gene["start"])
    gene_stop = int(gene["stop"])

    # determine strand to keep based on flip option
    keep_strand = gene["strand"]
    if str(flip) == "flip":
        if str(keep_strand) == '-':
            keep_strand = '+'
        elif str(keep_strand) == '+':
            keep_strand = '-'
    elif str(flip) == "both":
        keep_strand = 0

    bam_handle = pysam.Samfile(bam_file, 'rb')
    try:
        # fetch reads overlapping the gene and turn them into a wiggle track
        subset_reads = bam_handle.fetch(reference=gene['chrom'],
                                        start=gene_start,
                                        end=gene_stop)
        wig, jxns, nr_counts, read_lengths, reads = readsToWiggle_pysam(
            subset_reads, gene_start, gene_stop, keep_strand, 'center', True)
    finally:
        bam_handle.close()

    gene_sum = 0
    region_counts = {}
    for region_start, region_stop in gene['regions']:
        # wig is indexed relative to the start of the gene
        offset_start = int(region_start) - gene_start
        offset_stop = int(region_stop) - gene_start
        region_sum = sum(wig[offset_start:offset_stop])
        gene_sum += region_sum
        region_counts[region_key(region_start, region_stop)] = region_sum

    # Second pass: every entry needs the final gene-wide total.
    return [(region_key(start, stop),
             {"chrom": gene['chrom'],
              "start": start,
              "stop": stop,
              "strand": gene["strand"],
              "gene_id": gene['gene_id'],
              'frea': gene["frea"],
              "counts": count(gene_sum, region_counts[region_key(start, stop)])})
            for start, stop in gene['regions']]
def test_readsToWiggle_pysam_wiggles(self):
    """
    Compares the wiggle track returned by readsToWiggle_pysam (last
    argument True) against the expected per-position values, to 4
    decimal places.
    """
    bam_path = os.path.join(clipper.test_dir(), "allup_test.bam")
    bam = pysam.Samfile(bam_path)
    region_reads = bam.fetch(region="chr15:91536649-91537641")

    wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
        region_reads, 91537632, 91537675, '-', 'center', True)

    # Expected track, written as runs: 8 + 25 + 8 + 3 = 44 positions.
    wiggle_true = ([0.06060606060606061] * 8 +
                   [0.33333333333333326] * 25 +
                   [0.2727272727272727] * 8 +
                   [0.0] * 3)

    print(wiggle_true)
    print(wiggle)

    for expected, observed in zip(wiggle_true, wiggle):
        self.assertAlmostEqual(observed, expected, 4)
def test_readsToWiggle_pysam(self):
    """
    Checks the wiggle track, the per-position counts and the read
    lengths returned by readsToWiggle_pysam (last argument False)
    against their expected values.
    """
    bam_path = os.path.join(clipper.test_dir(), "allup_test.bam")
    bam = pysam.Samfile(bam_path)
    region_reads = bam.fetch(region="chr15:91536649-91537641")

    wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
        region_reads, 91537632, 91537675, '-', 'center', False)

    # Expected wiggle track, written as runs: 8 + 25 + 8 + 3 = 44 positions.
    wiggle_true = [2.] * 8 + [11.] * 25 + [9.] * 8 + [0.] * 3
    print(wiggle)
    for expected, observed in zip(wiggle_true, wiggle):
        self.assertEqual(observed, expected)

    # Expected positional counts: all zero except index 16 and index 24.
    pos_counts_true = [0.] * 16 + [2.] + [0.] * 7 + [9.] + [0.] * 19
    for expected, observed in zip(pos_counts_true, pos_counts):
        self.assertEqual(observed, expected)

    assert lengths == [33] * 11
def call_peaks(interval, gene_length, bam_file=None, max_gap=25,
               fdr_alpha=0.05, user_threshold=None, binom_alpha=0.05,
               method="binomial", min_reads=3, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000, SloP=False,
               max_width=None, min_width=None, algorithm="spline",
               reverse_strand=False, input_bam=None):
    """
    Calls peaks for an individual gene.

    interval - gene interval to query; its chrom, start, stop, strand,
               name and attrs['gene_id'] / attrs['effective_length'] are
               read.  NOTE: interval.chrom is modified in place when it
               lacks a "chr" prefix.
    gene_length - length of the gene; converted to int, used to cap the
                  read lengths and passed to the FDR cutoff functions
    bam_file - BAM file location, opened with pysam.Samfile and with
               Robust_BAM_Reader
    max_gap - space between sections for calling new peaks; also passed
              to the classic peak calling algorithm
    fdr_alpha - alpha passed to get_FDR_cutoff_mean (method "random")
    user_threshold - user defined threshold; when given it replaces the
                     computed gene-wide and super-local thresholds
    binom_alpha - alpha passed to get_FDR_cutoff_binom (method "binomial")
    method - "binomial" or "random", selects how the threshold is computed
    min_reads - minimum reads in a section to try and call peaks
    poisson_cutoff - not used inside this function
    plotit - makes figures and waits for a character on stdin at the end
    w_cutoff - not used inside this function
    windowsize - distance left and right of a peak center used for the
                 area read count
    SloP - use a super-local threshold (section +/- 500 bases), never
           lower than the gene-wide one
    max_width - int, maximum width passed to the classic algorithm
    min_width - int, minimum width passed to the classic algorithm
    algorithm - "spline", "gaussian" or "classic"
    reverse_strand - count reads on the strand opposite to the interval's
    input_bam - optional input BAM location; its per-peak read count is
                computed but currently not stored on the Peak

    Returns peak_dict with keys 'clusters' (list of Peak objects),
    'sections' (per-section dict: 'nreads', 'tried' and, for sections
    that were tried, 'threshold', 'nPeaks' and further diagnostics),
    'nreads', 'threshold', 'loc' (the interval) and 'Nclusters'.

    Raises ValueError for an unknown method or algorithm and TypeError
    when the gene-wide threshold is not an int.
    """
    if plotit:
        plt.rcParams['interactive'] = True

    pysam_handle = pysam.Samfile(bam_file, 'rb')
    try:
        #fixes non-standard chrom file names (without the chr)
        if not interval.chrom.startswith("chr"):
            interval.chrom = "chr" + interval.chrom

        subset_reads = list(pysam_handle.fetch(reference=str(interval.chrom),
                                               start=interval.start,
                                               end=interval.stop))

        strand = str(interval.strand)
        if reverse_strand:
            if strand == "+":
                strand = "-"
            elif strand == "-":
                strand = "+"

        (wiggle, jxns, pos_counts,
         lengths, allreads) = readsToWiggle_pysam(subset_reads,
                                                  interval.start,
                                                  interval.stop,
                                                  strand, "start", False)
    finally:
        # the pysam handle is not needed past this point
        pysam_handle.close()

    #This is the worst of hacks, need to factor out pysam eventually
    bam_fileobj = Robust_BAM_Reader(bam_file)
    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom),
                                          start=interval.start,
                                          end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam:
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(
            input_bam_fileobj.fetch(reference=str(interval.chrom),
                                    start=interval.start,
                                    end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads,
                                          interval.start, interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    # cap read lengths so that none reaches the gene length
    lengths = [gene_length - 1 if read >= gene_length else read
               for read in lengths]

    if user_threshold is None:
        if method == "binomial":
            #Uses Binomial Distribution to get cutoff
            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length,
                                                  binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError("gene threshold must be an int, got %r"
                        % (gene_threshold,))

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = interval

    peak_number = 0
    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                             sectstart + interval.start,
                                             sectstop + interval.start + 1,
                                             strand)
        Nreads = count_reads_in_interval(cur_interval, array_of_reads)
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))

        section_info = {}
        peak_dict['sections'][sect] = section_info
        section_info['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" %
                         (Nreads, sect))
            section_info['tried'] = False
            continue
        logging.info("""Analyzing section %s with %d reads""" %
                     (sect, Nreads))

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0,
                                    sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start

                cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                     section_start,
                                                     section_stop,
                                                     strand)
                expanded_Nreads = get_reads_in_interval(cur_interval,
                                                        array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                sect_read_lengths = [
                    sect_length - 1 if read > sect_length else read
                    for read in sect_read_lengths
                ]
                section_info['expanded_Nreads'] = len(expanded_Nreads)

                #the super-local threshold never goes below the gene-wide one
                if method == "binomial":
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_binom(sect_read_lengths,
                                             expanded_sect_length,
                                             binom_alpha))
                elif method == "random":
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_mean(readlengths=sect_read_lengths,
                                            genelength=expanded_sect_length,
                                            alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" % (threshold))
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        section_info['threshold'] = threshold
        section_info['nreads'] = int(Nreads)
        section_info['tried'] = True
        section_info['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not excede threshold, stopping")
            continue

        if algorithm == "spline":
            data = [float(value) for value in data]
            #Magic number for initial smoothing, but it works
            # NOTE(review): without true division (Python 2, no
            # `from __future__ import division`) 1 / 3 is 0, so this is
            # always 11; with true division it is a cube root + 10.
            # Left untouched because changing it alters peak calls.
            initial_smoothing_value = (
                (sectstop - sectstart + 1)**(1 / 3)) + 10
            section_info['smoothing_factor'] = initial_smoothing_value
            logging.info("initial smoothing value: %.2f" %
                         initial_smoothing_value)
            fitter = SmoothingSpline(
                xvals, data,
                smoothing_factor=initial_smoothing_value,
                lossFunction="get_turn_penalized_residuals",
                threshold=threshold,
                num_reads=Nreads)
        elif algorithm == "gaussian":
            cts = [float(value) for value in cts]
            fitter = GaussMix(xvals, cts)
        elif algorithm == "classic":
            data = [float(value) for value in data]
            fitter = Classic(xvals, data, max_width, min_width, max_gap)
        else:
            # previously fell through and failed later on an unbound name
            raise ValueError("Algorithm %s does not exist" % (algorithm))

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" %
                         fitter.smoothing_factor)
            section_info['final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                # no peaks found: keep the loop below from iterating None
                peak_definitions = []
            numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))
        except Exception as error:
            logging.error("peak finding failed:, %s, %s" %
                          (interval.name, error))
            raise

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop
        for peak_start, peak_stop, peak_center in peak_definitions:
            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 genomic_start,
                                                 genomic_stop,
                                                 strand)
            number_reads_in_peak = count_reads_in_interval(cur_interval,
                                                           array_of_reads)

            # computed for the input sample but not yet passed on to Peak
            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(
                    cur_interval, input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1
            logging.info("""Peak %d (%d - %d) has %d reads""" %
                         (peak_number, peak_start, (peak_stop + 1),
                          number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            #super local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize,
                            len(wiggle))
            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 interval.start + area_start,
                                                 interval.start + area_stop,
                                                 strand)
            number_reads_in_area = count_reads_in_interval(cur_interval,
                                                           array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(
                Peak(chrom=interval.chrom,
                     genomic_start=genomic_start,
                     genomic_stop=genomic_stop,
                     gene_name=interval.attrs['gene_id'],
                     strand=interval.strand,
                     thick_start=thick_start,
                     thick_stop=thick_stop,
                     peak_number=peak_number,
                     number_reads_in_peak=number_reads_in_peak,
                     size=peak_length,
                     p=0,
                     effective_length=int(interval.attrs['effective_length']),
                     peak_length=peak_length,
                     area_reads=number_reads_in_area,
                     area_size=area_length,
                     nreads_in_gene=nreads_in_gene,
                     #nreads_in_input=input_number_reads_in_peak,
                     ))

            peak_number += 1
            section_info['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        # block until a character arrives so the figures stay visible
        sys.stdin.read(1)

    return peak_dict
def call_peaks(interval, gene_length, bam_file=None, max_gap=25,
               fdr_alpha=0.05, user_threshold=None, binom_alpha=0.05,
               method="binomial", min_reads=3, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000, SloP=False,
               max_width=None, min_width=None, algorithm="spline",
               reverse_strand=False, input_bam=None):
    """
    Calls peaks for an individual gene.

    interval - gene interval to query; its chrom, start, stop, strand,
               name and attrs['gene_id'] / attrs['effective_length'] are
               read.  NOTE: interval.chrom is modified in place when it
               lacks a "chr" prefix.
    gene_length - length of the gene; converted to int, used to cap the
                  read lengths and passed to the FDR cutoff functions
    bam_file - BAM file location, opened with pysam.Samfile and with
               Robust_BAM_Reader
    max_gap - space between sections for calling new peaks; also passed
              to the classic peak calling algorithm
    fdr_alpha - alpha passed to get_FDR_cutoff_mean (method "random")
    user_threshold - user defined threshold; when given it replaces the
                     computed gene-wide and super-local thresholds
    binom_alpha - alpha passed to get_FDR_cutoff_binom (method "binomial")
    method - "binomial" or "random", selects how the threshold is computed
    min_reads - minimum reads in a section to try and call peaks
    poisson_cutoff - not used inside this function
    plotit - makes figures and waits for a character on stdin at the end
    w_cutoff - not used inside this function
    windowsize - distance left and right of a peak center used for the
                 area read count
    SloP - use a super-local threshold (section +/- 500 bases), never
           lower than the gene-wide one
    max_width - int, maximum width passed to the classic algorithm
    min_width - int, minimum width passed to the classic algorithm
    algorithm - "spline", "gaussian" or "classic"
    reverse_strand - count reads on the strand opposite to the interval's
    input_bam - optional input BAM location; its per-peak read count is
                computed but currently not stored on the Peak

    Returns peak_dict with keys 'clusters' (list of Peak objects),
    'sections' (per-section dict: 'nreads', 'tried' and, for sections
    that were tried, 'threshold', 'nPeaks' and further diagnostics;
    'expanded_Nreads' is only present when the super-local threshold was
    computed), 'nreads', 'threshold', 'loc' (the interval) and
    'Nclusters'.

    Raises ValueError for an unknown method or algorithm and TypeError
    when the gene-wide threshold is not an int.
    """
    if plotit:
        plt.rcParams['interactive'] = True

    pysam_handle = pysam.Samfile(bam_file, 'rb')
    try:
        #fixes non-standard chrom file names (without the chr)
        if not interval.chrom.startswith("chr"):
            interval.chrom = "chr" + interval.chrom

        subset_reads = list(pysam_handle.fetch(reference=str(interval.chrom),
                                               start=interval.start,
                                               end=interval.stop))

        strand = str(interval.strand)
        if reverse_strand:
            if strand == "+":
                strand = "-"
            elif strand == "-":
                strand = "+"

        (wiggle, jxns, pos_counts,
         lengths, allreads) = readsToWiggle_pysam(subset_reads,
                                                  interval.start,
                                                  interval.stop,
                                                  strand, "start", False)
    finally:
        # the pysam handle is not needed past this point
        pysam_handle.close()

    #This is the worst of hacks, need to factor out pysam eventually
    bam_fileobj = Robust_BAM_Reader(bam_file)
    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom),
                                          start=interval.start,
                                          end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam:
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(
            input_bam_fileobj.fetch(reference=str(interval.chrom),
                                    start=interval.start,
                                    end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads,
                                          interval.start, interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    # cap read lengths so that none reaches the gene length
    lengths = [gene_length - 1 if read >= gene_length else read
               for read in lengths]

    if user_threshold is None:
        if method == "binomial":
            #Uses Binomial Distribution to get cutoff
            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length,
                                                  binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError("gene threshold must be an int, got %r"
                        % (gene_threshold,))

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = interval

    peak_number = 0
    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                             sectstart + interval.start,
                                             sectstop + interval.start + 1,
                                             strand)
        Nreads = count_reads_in_interval(cur_interval, array_of_reads)
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))

        section_info = {}
        peak_dict['sections'][sect] = section_info
        section_info['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" %
                         (Nreads, sect))
            section_info['tried'] = False
            continue
        logging.info("""Analyzing section %s with %d reads""" %
                     (sect, Nreads))

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0,
                                    sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start

                cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                     section_start,
                                                     section_stop,
                                                     strand)
                expanded_Nreads = get_reads_in_interval(cur_interval,
                                                        array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                sect_read_lengths = [
                    sect_length - 1 if read > sect_length else read
                    for read in sect_read_lengths
                ]
                # Recorded here because expanded_Nreads only exists on the
                # super-local path; storing it unconditionally raised
                # NameError whenever SloP was off or a user threshold
                # was supplied.
                section_info['expanded_Nreads'] = len(expanded_Nreads)

                #the super-local threshold never goes below the gene-wide one
                if method == "binomial":
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_binom(sect_read_lengths,
                                             expanded_sect_length,
                                             binom_alpha))
                elif method == "random":
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_mean(readlengths=sect_read_lengths,
                                            genelength=expanded_sect_length,
                                            alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" % (threshold))
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        section_info['threshold'] = threshold
        section_info['nreads'] = int(Nreads)
        section_info['tried'] = True
        section_info['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not excede threshold, stopping")
            continue

        if algorithm == "spline":
            data = [float(value) for value in data]
            #Magic number for initial smoothing, but it works
            # NOTE(review): without true division (Python 2, no
            # `from __future__ import division`) 1/3 is 0, so this is
            # always 11; with true division it is a cube root + 10.
            # Left untouched because changing it alters peak calls.
            initial_smoothing_value = ((sectstop - sectstart + 1)**(1/3)) + 10
            section_info['smoothing_factor'] = initial_smoothing_value
            logging.info("initial smoothing value: %.2f" %
                         initial_smoothing_value)
            fitter = SmoothingSpline(
                xvals, data,
                smoothing_factor=initial_smoothing_value,
                lossFunction="get_turn_penalized_residuals",
                threshold=threshold,
                num_reads=Nreads)
        elif algorithm == "gaussian":
            cts = [float(value) for value in cts]
            fitter = GaussMix(xvals, cts)
        elif algorithm == "classic":
            data = [float(value) for value in data]
            fitter = Classic(xvals, data, max_width, min_width, max_gap)
        else:
            # previously fell through and failed later on an unbound name
            raise ValueError("Algorithm %s does not exist" % (algorithm))

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" %
                         fitter.smoothing_factor)
            section_info['final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                # no peaks found: keep the loop below from iterating None
                peak_definitions = []
            numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))
        except Exception as error:
            logging.error("peak finding failed:, %s, %s" %
                          (interval.name, error))
            raise

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop
        for peak_start, peak_stop, peak_center in peak_definitions:
            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 genomic_start,
                                                 genomic_stop,
                                                 strand)
            number_reads_in_peak = count_reads_in_interval(cur_interval,
                                                           array_of_reads)

            # computed for the input sample but not yet passed on to Peak
            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(
                    cur_interval, input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1
            logging.info("""Peak %d (%d - %d) has %d reads""" %
                         (peak_number, peak_start, (peak_stop + 1),
                          number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            #super local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize,
                            len(wiggle))
            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 interval.start + area_start,
                                                 interval.start + area_stop,
                                                 strand)
            number_reads_in_area = count_reads_in_interval(cur_interval,
                                                           array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(
                Peak(chrom=interval.chrom,
                     genomic_start=genomic_start,
                     genomic_stop=genomic_stop,
                     gene_name=interval.attrs['gene_id'],
                     strand=interval.strand,
                     thick_start=thick_start,
                     thick_stop=thick_stop,
                     peak_number=peak_number,
                     number_reads_in_peak=number_reads_in_peak,
                     size=peak_length,
                     p=0,
                     effective_length=int(interval.attrs['effective_length']),
                     peak_length=peak_length,
                     area_reads=number_reads_in_area,
                     area_size=area_length,
                     nreads_in_gene=nreads_in_gene,
                     #nreads_in_input=input_number_reads_in_peak,
                     ))

            peak_number += 1
            section_info['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        # block until a character arrives so the figures stay visible
        sys.stdin.read(1)

    return peak_dict
def call_peaks(interval, gene_length, bam_file=None, max_gap=25, fdr_alpha=0.05, user_threshold=None, binom_alpha=0.05, method="binomial", min_reads=3, poisson_cutoff=0.05, plotit=False, w_cutoff=10, windowsize=1000, SloP=False, max_width=None, min_width=None, algorithm="spline", reverse_strand=False, exons=None): """ calls peaks for an individual gene interval - gtf interval describing the gene to query takes bam file or bam file object. Serial uses object parallel uses location (name) max_gap - space between sections for calling new peaks fdr_alpha - false discovery rate, p-value bonferoni correct from peaks script (called in setup) user_threshold - user defined FDR thershold (probably should be factored into fdr_alpha minreads - min reads in section to try and call peaks poisson_cutoff - p-value for signifance cut off for number of reads in peak that gets called - might want to use ashifted distribution plotit - makes figures w_cutoff - width cutoff, peaks narrower than this are discarded windowssize - for super local calculation distance left and right to look SloP - super local p-value instead of gene-wide p-value (+/- 500 b.p. of each section) max_width - int maximum with of classic peak calling algorithm peak min_width - int min width of classic peak calling algorithm peak max_gap - int max gap of classic peak calling algorithm peak returns peak_dict, dictionary containing peak_dict['clusters']: list of Peak objects peak_dict['sections']: key: section ['nreads'] how many reads in this section ['threshold'] = threshold // either be suerlocal threshold, mRNA threshold or pre-mRNA threshold ['tried'] = True ['nPeaks'] = number of peaks peak_dict['nreads']: No. 
reads in gene peak_dict['threshold'] peak_dict['loc']: interval peak_dict['Nclusters']: total peaks in transcript """ ########################################################################### # print("starting call_peaks on gene_no:", gene_no, "interval:", interval) # genecallpeaksloggingperiode = 100 # should_log_gene_call_peaks_this_time = (gene_no % genecallpeaksloggingperiode == 0) ########################################################################### # if should_log_gene_call_peaks_this_time: # logging.info(" starting call_peaks on gene_no {}".format(gene_no)) ########################################################################### if plotit: plt.rcParams['interactive'] = True pass bam_fileobj = pysam.Samfile(bam_file, 'rb') # fixes non-standard chrom file names (without the chr) if not interval.chrom.startswith("chr") and not interval.chrom.startswith( "ERCC") and not interval.chrom.startswith("phiX"): interval.chrom = "chr" + interval.chrom # fetch reads in the genomic region subset_reads = list( bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop)) strand = str(interval.strand) if reverse_strand: if strand == "+": strand = "-" elif strand == "-": strand = "+" # convert pysam to a wiggle vector, junction, positional count(coverage), read lengths, all_reads, location (wiggle, jxns, pos_counts, lengths, allreads, read_locations) = readsToWiggle_pysam(subset_reads, interval.start, interval.stop, strand, "start", False) nreads_in_gene = sum(pos_counts) gene_length = int(gene_length) lengths = [ gene_length - 1 if read >= gene_length else read for read in lengths ] # pre-mRNA Threshold if user_threshold is None: if method == "binomial": # Uses Binomial Distribution to get cutoff if specified by user # print(len(lengths), gene_length, binom_alpha) premRNA_threshold = get_FDR_cutoff_binom(lengths, gene_length, binom_alpha) # print(premRNA_threshold) elif method == "random": premRNA_threshold = 
get_FDR_cutoff_mean(readlengths=lengths, genelength=gene_length, alpha=fdr_alpha) else: raise ValueError("Method %s does not exist" % (method)) else: logging.info("using user threshold") premRNA_threshold = user_threshold # mRNA Threshold exons = pybedtools.BedTool(exons) exons = exons.filter( lambda x: x.name == interval.attrs['gene_id']).saveas() total_exonic_reads = [] total_exonic_length = 0 htseq_exons = HTSeq.GenomicArrayOfSets(chroms="auto", stranded=False) for exon, exon_interval in zip(exons, bed_to_genomic_interval(exons)): exon.stop += 1 exonic_reads = get_reads_in_interval_pysam(exon, interval.start, read_locations) exon_read_lengths = read_lengths_from_pysam(exonic_reads) exon_read_lengths = [ exon_interval.length - 1 if read > exon_interval.length else read for read in exon_read_lengths ] total_exonic_reads += exon_read_lengths total_exonic_length += exon_interval.length htseq_exons[exon_interval] += 'exon' mRNA_threshold = get_FDR_cutoff_binom(total_exonic_reads, total_exonic_length, binom_alpha) if not isinstance(premRNA_threshold, int): raise TypeError # these are what is built in this dict, complicated enough that it might # be worth turning into an object peak_dict = {} peak_dict['clusters'] = [] peak_dict['sections'] = {} peak_dict['nreads'] = int(nreads_in_gene) peak_dict['threshold'] = premRNA_threshold peak_dict['loc'] = interval peak_number = 0 sections = find_sections( wiggle, max_gap) # return list of base with contiguous read > 0 (gap allowed) if plotit: plot_sections(wiggle, sections, premRNA_threshold) # for each section, call peaks for sect in sections: sectstart, sectstop = sect sect_length = sectstop - sectstart + 1 data = wiggle[sectstart:(sectstop + 1)] # make interval for teh section cur_interval = HTSeq.GenomicInterval(str(interval.chrom), sectstart + interval.start, sectstop + interval.start + 1, strand) # Logic to use variable thresholds for exons or introns, still superseded by superLocal logic overlaps_exon = len( 
reduce(set.union, (val for iv, val in htseq_exons[cur_interval].steps()))) > 0 gene_threshold = mRNA_threshold if overlaps_exon else premRNA_threshold # maybe make a function that takes a genomic interval and converts it into a pybedtools interval bed_format = [ interval.chrom, sectstart + interval.start, sectstop + interval.start + 1, interval.name, interval.score, strand ] bed_format = list(map(str, bed_format)) cur_pybedtools_interval = pybedtools.create_interval_from_list( bed_format) Nreads = count_reads_in_interval_pysam(cur_pybedtools_interval, interval.start, read_locations) cts = pos_counts[sectstart:(sectstop + 1)] xvals = arange(len(data)) peak_dict['sections'][sect] = {} peak_dict['sections'][sect]['nreads'] = int(Nreads) # makes sure there are enough reads if Nreads < min_reads: logging.info("""%d is not enough reads, skipping section: %s""" % (Nreads, sect)) peak_dict['sections'][sect]['tried'] = False continue else: logging.info("""Analyzing section %s with %d reads""" % (sect, Nreads)) pass if user_threshold is None: if SloP: # super local p-value: section +/- 500 b.p.'; instead of using whole gene's length and read, use this extended region half_width = 500 section_start = max( 0, sectstart + interval.start - half_width) # aim at -500 offset from section start section_stop = sectstop + interval.start + 1 + half_width # aim at _500 from section stop expanded_sect_length = section_stop - section_start bed_format = [ interval.chrom, section_start, section_stop, interval.name, interval.score, strand ] bed_format = list(map(str, bed_format)) cur_pybedtools_interval = pybedtools.create_interval_from_list( bed_format) expanded_Nreads = get_reads_in_interval_pysam( cur_pybedtools_interval, interval.start, read_locations) sect_read_lengths = read_lengths_from_pysam(expanded_Nreads) sect_read_lengths = [ sect_length - 1 if read > sect_length else read for read in sect_read_lengths ] peak_dict['sections'][sect]['expanded_Nreads'] = len( expanded_Nreads) if 
method == "binomial": # Uses Binomial Distribution to get cutoff if specified by user slop_threshold = get_FDR_cutoff_binom( readlengths=sect_read_lengths, genelength=expanded_sect_length, alpha=binom_alpha) elif method == "random": # use the minimum FDR cutoff between superlocal and gene-wide calculations slop_threshold = get_FDR_cutoff_mean( readlengths=sect_read_lengths, genelength=expanded_sect_length, alpha=fdr_alpha) else: raise ValueError("Method %s does not exist" % (method)) threshold = max(gene_threshold, slop_threshold) logging.info("Using super-local threshold %d" % (threshold)) else: # if not use super local threshold (+/- 500 bp), use mRNA_threshold for exon; premRNA_threshold if section does not overlap with exon threshold = gene_threshold else: threshold = user_threshold # saves threshold for each individual section peak_dict['sections'][sect]['threshold'] = threshold peak_dict['sections'][sect]['nreads'] = int(Nreads) peak_dict['sections'][sect]['tried'] = True peak_dict['sections'][sect]['nPeaks'] = 0 if max(data) < threshold: logging.info("data does not excede threshold, stopping") continue if algorithm == "spline": data = list(map(float, data)) # Magic number for initial smoothing, but it works initial_smoothing_value = ( (sectstop - sectstart + 1)**(1 / 3)) + 10 peak_dict['sections'][sect][ 'smoothing_factor'] = initial_smoothing_value logging.info("initial smoothing value: %.2f" % initial_smoothing_value) fitter = SmoothingSpline( xvals, data, smoothing_factor=initial_smoothing_value, lossFunction="get_turn_penalized_residuals", threshold=threshold, num_reads=Nreads) elif algorithm == "gaussian": cts = list(map(float, cts)) fitter = GaussMix(xvals, cts) elif algorithm == "classic": data = list(map(float, data)) fitter = Classic(xvals, data, max_width, min_width, max_gap) try: peak_definitions = fitter.peaks() logging.info("optimized smoothing value: %.2f" % fitter.smoothing_factor) peak_dict['sections'][sect][ 'final_smoothing_factor'] = 
fitter.smoothing_factor if peak_definitions is None: numpeaks = 0 else: numpeaks = len(peak_definitions) logging.info("I identified %d potential peaks" % (numpeaks)) except Exception as error: logging.error("peak finding failed:, %s, %s" % (interval.name, error)) raise error # subsections that are above threshold # peak center is actually the location where we think binding should # occur, not the average of start and stop # Need to get all ranges, count number of reads in each range and compute from there for peak_start, peak_stop, peak_center in peak_definitions: genomic_start = interval.start + sectstart + peak_start genomic_stop = interval.start + sectstart + peak_stop # save to bedtool bed_format = [ interval.chrom, genomic_start, genomic_stop, interval.name, interval.score, strand ] bed_format = list(map(str, bed_format)) # create_interval_only_take_str cur_pybedtools_interval = pybedtools.create_interval_from_list( bed_format) number_reads_in_peak = count_reads_in_interval_pysam( cur_pybedtools_interval, interval.start, read_locations) peak_length = genomic_stop - genomic_start + 1 logging.info("""Peak %d (%d - %d) has %d reads""" % (peak_number, peak_start, (peak_stop + 1), number_reads_in_peak)) # highest point in start stop genomic_center = interval.start + sectstart + peak_center # makes it thicker so we can see on the browser # error checking logic to keep bed files from breaking thick_start = max(genomic_center - 2, genomic_start) thick_stop = min(genomic_center + 2, genomic_stop) # super local logic area_start = max(0, (peak_center + sectstart) - windowsize) area_stop = min((peak_center + sectstart) + windowsize, len(wiggle)) bed_format = [ interval.chrom, interval.start + area_start, interval.start + area_stop, interval.name, interval.score, strand ] bed_format = list(map(str, bed_format)) cur_pybedtools_interval = pybedtools.create_interval_from_list( bed_format) number_reads_in_area = count_reads_in_interval_pysam( cur_pybedtools_interval, 
interval.start, read_locations) area_length = area_stop - area_start + 1 peak_dict['clusters'].append( Peak( chrom=interval.chrom, genomic_start=genomic_start, genomic_stop=genomic_stop, gene_name=interval.attrs['gene_id'], strand=interval.strand, thick_start=thick_start, thick_stop=thick_stop, peak_number=peak_number, number_reads_in_peak=number_reads_in_peak, size=peak_length, p=0, effective_length=int(interval.attrs['effective_length']), peak_length=peak_length, area_reads=number_reads_in_area, area_size=area_length, nreads_in_gene=nreads_in_gene, # nreads_in_input=input_number_reads_in_peak, )) peak_number += 1 peak_dict['sections'][sect]['nPeaks'] += 1 peak_dict['Nclusters'] = peak_number if plotit: import sys plt.show() v = sys.stdin.read(1) ################################################### # print("returning gene_no:", gene_no, "peak_dict:", peak_dict) #################################################### return peak_dict
def call_peaks(interval, gene_length, bam_fileobj=None, bam_file=None,
               max_gap=25, fdr_alpha=0.05, user_threshold=None,
               binom_alpha=0.001, method="random", minreads=20,
               poisson_cutoff=0.05, plotit=False, w_cutoff=10,
               windowsize=1000, SloP=False, correct_p=False,
               max_width=None, min_width=None, algorithm="spline"):
    """

    Calls peaks for an individual gene.

    interval - gtf interval describing the gene to query
    gene_length - length of the gene, forwarded to peaks_from_info for the
                  threshold calculation
    bam_fileobj - already opened pysam bam file (serial use); used as is and
                  left open for the caller
    bam_file - location (name) of the bam file (parallel use); only opened
               when bam_fileobj is not given, and closed again before returning
    max_gap - space between sections for calling new peaks, also the max gap
              of the classic peak calling algorithm
    fdr_alpha - false discovery rate, p-value bonferroni corrected from peaks
                script (called in setup)
    user_threshold - user defined FDR threshold (probably should be factored
                     into fdr_alpha)
    binom_alpha - alpha handed to the binomial cutoff calculation
    method - "binomial" or "random", selects how the cutoff is computed
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for significance cut off for number of reads in
                     peak that gets called - might want to use a shifted
                     distribution
    plotit - makes figures
    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super local calculation distance left and right to look
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean bonferroni correction of p-values from poisson
    max_width - int maximum width of classic peak calling algorithm peak
    min_width - int min width of classic peak calling algorithm peak
    algorithm - "spline", "gaussian" or "classic" peak fitter

    Returns whatever peaks_from_info returns for this gene (the peak dict).

    Raises ValueError if neither bam_fileobj nor bam_file is supplied.

    Side effect: interval.chrom is prefixed with "chr" in place when the
    prefix is missing.

    """
    if plotit:
        plt.rcParams['interactive'] = True

    logging.info("running on gene %s" % (str(interval)))

    # Honour a handle supplied by the caller; only open (and later close) the
    # file ourselves when we were handed a path instead.
    opened_here = bam_fileobj is None
    if opened_here:
        if bam_file is None:
            raise ValueError(
                "call_peaks needs either bam_fileobj or bam_file")
        bam_fileobj = pysam.Samfile(bam_file, 'rb')

    try:
        # fixes non-standard chrom file names (without the chr)
        if not interval.chrom.startswith("chr"):
            interval.chrom = "chr" + interval.chrom

        subset_reads = bam_fileobj.fetch(reference=interval.chrom,
                                         start=interval.start,
                                         end=interval.stop)

        # un-normalised (last argument False), center-based wiggle for the gene
        wiggle, jxns, pos_counts, read_lengths, allreads = readsToWiggle_pysam(
            subset_reads, interval.start, interval.stop, interval.strand,
            "center", False)

        # TODO have a check to kill this if there aren't any reads in a region
        return peaks_from_info(bam_fileobj=bam_fileobj,
                               wiggle=list(wiggle),
                               pos_counts=pos_counts,
                               lengths=read_lengths,
                               interval=interval,
                               gene_length=gene_length,
                               max_gap=max_gap,
                               fdr_alpha=fdr_alpha,
                               binom_alpha=binom_alpha,
                               method=method,
                               user_threshold=user_threshold,
                               minreads=minreads,
                               poisson_cutoff=poisson_cutoff,
                               plotit=plotit,
                               width_cutoff=w_cutoff,
                               windowsize=windowsize,
                               SloP=SloP,
                               correct_p=correct_p,
                               max_width=max_width,
                               min_width=min_width,
                               algorithm=algorithm)
    finally:
        # never close a handle that belongs to the caller
        if opened_here:
            bam_fileobj.close()
def count_gene(bam_file, gene, flip):
    """

    Gets read counts for the annotated regions of a single gene.

    bam_file - location (name) of the bam file; opened here with pysam and
               closed again before returning, even on error
    gene - annotation for the gene, indexed by key; reads 'chrom', 'start',
           'stop', 'strand', 'gene_id', 'frea' and 'regions' (pairs of
           region start / region stop)
    flip - "flip" counts reads on the strand opposite to the gene,
           "both" hands 0 to readsToWiggle_pysam as the strand,
           anything else keeps the gene's own strand

    Returns a list with one (name, info) tuple per region, where name is
    "<gene_id>:<start>-<stop>" and info holds chrom, start, stop, strand,
    gene_id, frea and counts = count(total over all regions, this region).

    """
    gene_id = gene['gene_id']
    # coordinates may arrive as strings, so normalise them once up front
    gene_start = int(gene["start"])
    gene_stop = int(gene["stop"])
    # materialise so the regions can safely be walked twice below
    regions = list(gene['regions'])

    # determine strand to keep based on flip option
    keep_strand = gene["strand"]
    if str(flip) == "flip":
        if str(keep_strand) == '-':
            keep_strand = '+'
        elif str(keep_strand) == '+':
            keep_strand = '-'
    elif str(flip) == "both":
        keep_strand = 0

    region_counts = {}
    gene_sum = 0

    bam = pysam.Samfile(bam_file, 'rb')
    try:
        # fetch reads from bam file overlapping the gene
        subset_reads = bam.fetch(reference=gene['chrom'],
                                 start=gene_start,
                                 end=gene_stop)

        wig, jxns, nr_counts, read_lengths, reads = readsToWiggle_pysam(
            subset_reads, gene_start, gene_stop, keep_strand, 'center', True)

        for region_start, region_stop in regions:
            # wig is indexed relative to the gene start
            # NOTE(review): a region starting before the gene gives a negative
            # offset, which slices from the end of wig - confirm regions
            # always lie inside the gene
            first = int(region_start) - gene_start
            last = int(region_stop) - gene_start
            region_total = sum(wig[first:last])
            gene_sum += region_total
            region_name = (gene_id + ":" + str(region_start) + "-" +
                           str(region_stop))
            region_counts[region_name] = region_total
    finally:
        bam.close()

    results = []
    for start, stop in regions:
        region_name = gene_id + ":" + str(start) + "-" + str(stop)
        results.append((region_name, {
            "chrom": gene['chrom'],
            "start": start,
            "stop": stop,
            "strand": gene["strand"],
            "gene_id": gene_id,
            'frea': gene["frea"],
            "counts": count(gene_sum, region_counts[region_name]),
        }))
    return results