# NOTE: these tests assume numpy's `ones` and the module's `find_sections`
# are imported at file scope.
def test_find_sections(self):
    #setup
    print("testing find_sections")

    #Null case
    self.assertRaises(TypeError, find_sections, None, 0)

    #Case with all zero coverage
    wiggle = [0] * 20
    result = find_sections(wiggle, 0)
    assert result == []

    #Case with all non-zero coverage
    wiggle = [5] * 20
    result = find_sections(wiggle, 0)
    self.assertEqual(result, [(0, 19)])

    #returns one section
    wiggle = ([5] * 20) + [0] + ([5] * 20)
    result = find_sections(wiggle, 1)
    self.assertEqual(result, [(0, 40)])

    #second case returns two sections
    wiggle = ([5] * 9) + [0] + ([5] * 10)
    result = find_sections(wiggle, 0)
    assert result == [(0, 9), (10, 19)]

    #returns one section
    result = find_sections(wiggle, 1)
    assert result == [(0, 19)]

    #Edge case where margins stop before the end of genes
    wiggle = [0] + ([5] * 10)
    result = find_sections(wiggle, 0)
    assert result == [(1, 10)]

    #Edge case where margins start after the start of genes
    wiggle = ([5] * 10) + [0]
    result = find_sections(wiggle, 0)
    assert result == [(0, 10)]

    #Test non-integer coverage
    wiggle = [.5] * 20
    result = find_sections(wiggle, 0)
    self.assertEqual(result, [(0, 19)])

    #Test numpy arrays
    wiggle = ones((20), dtype='f')
    wiggle = list(wiggle)
    result = find_sections(wiggle, 0)
    self.assertEqual(result, [(0, 19)])
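# The assertions above (and in the two tests that follow) pin down the
# contract of find_sections: inclusive, zero-based (start, stop) tuples;
# gaps longer than `margin` split sections; and a section that is closed by
# a gap ends margin + 1 past its last covered position. Below is a minimal
# sketch consistent with those assertions; it is not the packaged
# implementation, which may pad section boundaries differently.
def find_sections_sketch(wiggle, margin):
    sections = []
    start = None         # index where the current section began
    last_covered = None  # most recent index with coverage > 0
    for i, value in enumerate(wiggle):  # raises TypeError for wiggle=None
        if value > 0:
            if start is None:
                start = i
            last_covered = i
        elif start is not None and i - last_covered > margin:
            # gap exceeded the margin: close the section, padded by margin
            sections.append((start, last_covered + margin + 1))
            start = None
    if start is not None:
        # coverage ran to the end of the wiggle track
        sections.append((start, len(wiggle) - 1))
    return sections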
def test_find_sections_two_sections(self):
    #Case with one region at a margin of one and two regions at a margin of zero
    #returns two sections
    wiggle = ([5] * 20) + [0] + ([5] * 20)
    result = find_sections(wiggle, 0)

    #I believe this is a zero-based, half-open result. Need to think about it more
    self.assertEqual(result, [(0, 20), (21, 40)])
def test_find_sections_no_overlaps(self):
    #verify that returned sections do not overlap
    wiggle = [10, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3,
              3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
              3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
    result = find_sections(wiggle, 15)
    print(result)

    #the second section must start after the first one ends
    self.assertGreater(result[1][0], result[0][1],
                       "first region: %s, second region: %s; "
                       "the second section starts before the first one ends"
                       % (result[0][1], result[1][0]))
def peaks_from_info(bam_fileobj, wiggle, pos_counts, lengths, loc, gene_length,
                    margin=25, fdr_alpha=0.05, binom_alpha=0.001,
                    method="Randomization", user_threshold=None, minreads=20,
                    poisson_cutoff=0.05, plotit=False, width_cutoff=10,
                    windowsize=1000, SloP=False, correct_p=False,
                    max_width=None, min_width=None, max_gap=None,
                    algorithm="spline"):
    """
    same args as before
    wiggle is converted from bam file
    pos_counts - one point per read instead of coverage of entire read
    lengths - lengths of the aligned portions of reads
    rest are the same, fix later

    calls peaks for an individual gene

    gene_length - effective length of gene
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from peaks
    script (called in setup)
    user_threshold - user-defined FDR threshold (probably should be factored
    into fdr_alpha)
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for the number of reads in a
    genomic_center that gets called - might want to use a shifted distribution
    plotit - makes figures
    width_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super-local calculation, distance left and right to look
    SloP - super-local p-value instead of gene-wide p-value
    correct_p - boolean, Bonferroni correction of p-values from Poisson
    algorithm - str, the algorithm to run
    """

    #all the information necessary to record a genomic_center, used later,
    #but declared outside of loops
    peak_dict = {}

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    #peak_dict['clusters'] = {}
    #peak_dict['sections'] = {}
    #peak_dict['nreads'] = int()
    #peak_dict['threshold'] = int()
    #peak_dict['loc'] = loc

    #data munging
    chrom, gene_name, tx_start, tx_end, strand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]

    #used for poisson calculation?
    nreads_in_gene = sum(pos_counts)

    #decides FDR calculation, maybe move getFDRcutoff mean into c code
    gene_threshold = 0
    if user_threshold is None:
        if method == "Binomial":  #Uses Binomial Distribution to get cutoff if specified by user
            gene_threshold = get_Binom_cutoff(lengths, gene_length, binom_alpha)
        else:
            gene_threshold = get_FDR_cutoff_mean(lengths, gene_length,
                                                 alpha=fdr_alpha)
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError

    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = loc
    peak_number = 1

    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        #this cts is alright because we know the reads are bounded
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        peak_dict['sections'][sect] = {}
        threshold = int()
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < minreads:
            logging.info("%d is not enough reads, skipping section: %s"
                         % (Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False
            continue
        else:
            logging.info("Analyzing section %s with %d reads"
                         % (sect, Nreads))

        if user_threshold is None:
            if SloP:
                #gets a random subset of read lengths for calculations on a section
                #not exactly the right way to do this but it should be very close.
                sect_read_lengths = rs(lengths, Nreads)

                #use the minimum FDR cutoff between superlocal and gene-wide calculations
                threshold = min(gene_threshold,
                                get_FDR_cutoff_mean(sect_read_lengths,
                                                    sect_length,
                                                    alpha=fdr_alpha))
                logging.info("Using super-local threshold %d" % (threshold))
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not exceed threshold, stopping")
            continue

        if algorithm == "spline":
            initial_smoothing_value = sectstop - sectstart + 1
            fitter = SmoothingSpline(xvals, data, initial_smoothing_value,
                                     lossFunction="get_norm_penalized_residuals")
        elif algorithm == "gaussian":
            fitter = GaussMix(xvals, data)
        elif algorithm == "classic":
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        try:
            peak_definitions = fitter.peaks(threshold, plotit)
        except Exception as error:
            logging.error(gene_name)
            raise error

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop
        for peak_start, peak_stop, peak_center in peak_definitions:
            genomic_start = tx_start + sectstart + peak_start
            genomic_stop = tx_start + sectstart + peak_stop

            number_reads_in_peak = bam_fileobj.count(chrom,
                                                     start=genomic_start,
                                                     end=genomic_stop)
            #sum(cts[peak_start:(peak_stop + 1)])
            logging.info("Peak %d (%d - %d) has %d reads"
                         % (peak_number, peak_start, (peak_stop + 1),
                            number_reads_in_peak))

            #makes sure there are enough reads
            if (number_reads_in_peak < minreads or
                    max(data[peak_start:(peak_stop + 1)]) < threshold):
                logging.info("skipping genomic_center, %d is not enough reads"
                             % (number_reads_in_peak))
                continue

            #highest point in start stop
            genomic_center = tx_start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            thick_start = genomic_center - 2
            thick_stop = genomic_center + 2

            #error checking logic to keep bed files from breaking
            if thick_start < genomic_start:
                thick_start = genomic_start
            if thick_stop > genomic_stop:
                thick_stop = genomic_stop

            peak_length = genomic_stop - genomic_start + 1

            #skip really small peaks
            if peak_length < width_cutoff:
                continue

            #super-local logic
            #error check to make sure area stays within the gene
            #distance from gene start
            if genomic_center - tx_start - windowsize < 0:
                area_start = 0
            else:
                #for super-local, gets area around genomic_center for calculation
                area_start = genomic_center - tx_start - windowsize
                #area_start = sectstart

            #same thing except for end of gene instead of start
            if genomic_center + windowsize > tx_end:
                #distance to gene stop
                area_stop = tx_end - tx_start + 1
            else:
                area_stop = genomic_center - tx_start + windowsize
                #area_stop = sectstop

            #use area reads + 1/2 all other reads in gene:
            #area_reads = sum(pos_counts[area_start:area_stop]) +
            #0.5*(sum(pos_counts) -
            #sum(pos_counts[area_start:area_stop]))

            #use area reads:
            area_reads = sum(pos_counts[area_start:area_stop])
            area_size = area_stop - area_start + 1

            #area_reads = sum(pos_counts[sectstart:sectstop])
            #area_size = sect_length

            #calculates poisson based on whole gene vs genomic_center
            if algorithm == "classic" and peak_length < min_width:
                peak_length = min_width
            gene_pois_p = poissonP(nreads_in_gene, number_reads_in_peak,
                                   gene_length, peak_length)
            if SloP is True:
                #same thing except based on the super-local p-value
                slop_pois_p = poissonP(area_reads, number_reads_in_peak,
                                       area_size, peak_length)
            else:
                #makes sure slop_pois_p is defined, even if it's just the
                #gene-wide value; something to be removed later, slop should
                #only be used when defined as true
                slop_pois_p = gene_pois_p

            if math.isnan(slop_pois_p):
                slop_pois_p = 1

            #defines the bedline of a genomic_center for returning
            #TODO This should be abstracted out for now... separate model from view
            peak_dict['clusters'].append(Peak(chrom,
                                              genomic_start,
                                              genomic_stop,
                                              gene_name,  #need this as a unique id for later analysis
                                              slop_pois_p,
                                              strand,
                                              thick_start,
                                              thick_stop,
                                              peak_number,
                                              number_reads_in_peak,
                                              gene_pois_p,
                                              peak_length,
                                              0))
            peak_number += 1
            peak_dict['sections'][sect]['nPeaks'] += 1

    #inflate p-values based on # of comparisons
    #Bonferroni corrected
    if correct_p is True:
        #best I can tell this never executes...
        for genomic_center in peak_dict['clusters']:
            #Bonferroni correct p-value for MHT
            genomic_center.p = genomic_center.p * peak_number

    peak_dict['Nclusters'] = peak_number

    if plotit:
        import sys
        plt.show()
        v = sys.stdin.read(1)

    return peak_dict
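# rs and poissonP are imported from elsewhere in the package. Based only on
# how they are called above (rs draws a random subset of read lengths;
# poissonP is an upper-tail Poisson probability for the reads landing in a
# peak-sized window), hedged sketches might look like the following; the
# packaged implementations may handle edge cases (NaNs, zero-length genes)
# differently.
import numpy as np
from scipy.stats import poisson

def rs_sketch(lengths, n):
    # sample n read lengths, with replacement, from the gene-wide pool
    return list(np.random.choice(lengths, size=n, replace=True))

def poissonP_sketch(reads_in_gene, reads_in_peak, gene_length, peak_length):
    # expected reads in a peak-sized window if the gene's reads were uniform
    lam = float(reads_in_gene) * float(peak_length) / float(gene_length)
    # P(X >= reads_in_peak) for X ~ Poisson(lam)
    return float(poisson.sf(reads_in_peak - 1, lam))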
def call_peaks(interval, gene_length, bam_file=None, max_gap=25,
               fdr_alpha=0.05, user_threshold=None, binom_alpha=0.05,
               method="binomial", min_reads=3, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000, SloP=False,
               max_width=None, min_width=None, algorithm="spline",
               reverse_strand=False, input_bam=None):
    """
    calls peaks for an individual gene

    interval - gtf interval describing the gene to query
    takes bam file or bam file object. Serial uses object,
    parallel uses location (name)
    max_gap - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from peaks
    script (called in setup)
    user_threshold - user-defined FDR threshold (probably should be factored
    into fdr_alpha)
    min_reads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for the number of reads in a
    peak that gets called - might want to use a shifted distribution
    plotit - makes figures
    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super-local calculation, distance left and right to look
    SloP - super-local p-value instead of gene-wide p-value
    max_width - int, maximum width of classic peak-calling algorithm peak
    min_width - int, min width of classic peak-calling algorithm peak
    max_gap - int, max gap of classic peak-calling algorithm peak
    """

    if plotit:
        plt.rcParams['interactive'] = True

    bam_fileobj = pysam.Samfile(bam_file, 'rb')

    #fixes non-standard chrom file names (without the chr)
    if not interval.chrom.startswith("chr"):
        interval.chrom = "chr" + interval.chrom

    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom),
                                          start=interval.start,
                                          end=interval.stop))
    strand = str(interval.strand)
    if reverse_strand:
        if strand == "+":
            strand = "-"
        elif strand == "-":
            strand = "+"

    (wiggle, jxns, pos_counts,
     lengths, allreads) = readsToWiggle_pysam(subset_reads, interval.start,
                                              interval.stop, strand, "start",
                                              False)

    #This is the worst of hacks, need to factor out pysam eventually
    bam_fileobj = Robust_BAM_Reader(bam_file)
    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom),
                                          start=interval.start,
                                          end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam:  #if not None
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(input_bam_fileobj.fetch(reference=str(interval.chrom),
                                                          start=interval.start,
                                                          end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads,
                                          interval.start, interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    lengths = [gene_length - 1 if read >= gene_length else read
               for read in lengths]

    if user_threshold is None:
        if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user
            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length,
                                                  binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = interval
    peak_number = 0

    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                             sectstart + interval.start,
                                             sectstop + interval.start + 1,
                                             strand)
        Nreads = count_reads_in_interval(cur_interval, array_of_reads)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        peak_dict['sections'][sect] = {}
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("%d is not enough reads, skipping section: %s"
                         % (Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False
            continue
        else:
            logging.info("Analyzing section %s with %d reads"
                         % (sect, Nreads))

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0, sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start
                cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                     section_start,
                                                     section_stop,
                                                     strand)
                expanded_Nreads = get_reads_in_interval(cur_interval,
                                                        array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                sect_read_lengths = [sect_length - 1 if read > sect_length
                                     else read for read in sect_read_lengths]

                peak_dict['sections'][sect]['expanded_Nreads'] = len(expanded_Nreads)

                if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user
                    threshold = max(gene_threshold,
                                    get_FDR_cutoff_binom(sect_read_lengths,
                                                         expanded_sect_length,
                                                         binom_alpha))
                elif method == "random":
                    #use the minimum FDR cutoff between superlocal and gene-wide calculations
                    threshold = max(gene_threshold,
                                    get_FDR_cutoff_mean(readlengths=sect_read_lengths,
                                                        genelength=expanded_sect_length,
                                                        alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" % (threshold))
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not exceed threshold, stopping")
            continue

        if algorithm == "spline":
            data = list(map(float, data))
            #Magic number for initial smoothing, but it works
            initial_smoothing_value = ((sectstop - sectstart + 1) ** (1.0 / 3)) + 10
            peak_dict['sections'][sect]['smoothing_factor'] = initial_smoothing_value
            logging.info("initial smoothing value: %.2f" % initial_smoothing_value)
            fitter = SmoothingSpline(xvals, data,
                                     smoothing_factor=initial_smoothing_value,
                                     lossFunction="get_turn_penalized_residuals",
                                     threshold=threshold,
                                     num_reads=Nreads)
        elif algorithm == "gaussian":
            cts = list(map(float, cts))
            fitter = GaussMix(xvals, cts)
        elif algorithm == "classic":
            data = list(map(float, data))
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" % fitter.smoothing_factor)
            peak_dict['sections'][sect]['final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                numpeaks = 0
            else:
                numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))
        except Exception as error:
            logging.error("peak finding failed: %s, %s" % (interval.name, error))
            raise error

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop
        #Need to get all ranges, count number of reads in each range and
        #compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:
            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 genomic_start,
                                                 genomic_stop,
                                                 strand)
            number_reads_in_peak = count_reads_in_interval(cur_interval,
                                                           array_of_reads)
            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(cur_interval,
                                                                     input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1
            logging.info("Peak %d (%d - %d) has %d reads"
                         % (peak_number, peak_start, (peak_stop + 1),
                            number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            #super-local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize, len(wiggle))

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 interval.start + area_start,
                                                 interval.start + area_stop,
                                                 strand)
            number_reads_in_area = count_reads_in_interval(cur_interval,
                                                           array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(Peak(
                chrom=interval.chrom,
                genomic_start=genomic_start,
                genomic_stop=genomic_stop,
                gene_name=interval.attrs['gene_id'],
                strand=interval.strand,
                thick_start=thick_start,
                thick_stop=thick_stop,
                peak_number=peak_number,
                number_reads_in_peak=number_reads_in_peak,
                size=peak_length,
                p=0,
                effective_length=int(interval.attrs['effective_length']),
                peak_length=peak_length,
                area_reads=number_reads_in_area,
                area_size=area_length,
                nreads_in_gene=nreads_in_gene,
                #nreads_in_input=input_number_reads_in_peak,
            ))
            peak_number += 1
            peak_dict['sections'][sect]['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        v = sys.stdin.read(1)

    return peak_dict
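# read_array, get_reads_in_interval, count_reads_in_interval and
# read_lengths_from_htseq are imported from elsewhere. The exon logic in the
# next version of call_peaks (reduce(set.union, ...) over .steps()) suggests
# reads are indexed in an HTSeq.GenomicArrayOfSets; what follows is a
# speculative sketch under that assumption, with HTSeq alignments carrying a
# .iv genomic interval. The packaged helpers may be implemented differently.
import HTSeq

def read_array_sketch(reads, start, stop):
    array_of_reads = HTSeq.GenomicArrayOfSets(chroms="auto", stranded=True)
    for read in reads:
        array_of_reads[read.iv] += read  # index each read by the bases it covers
    return array_of_reads

def get_reads_in_interval_sketch(interval, array_of_reads):
    reads = set()
    for iv, step_reads in array_of_reads[interval].steps():
        reads |= step_reads  # union of every read set touching the interval
    return reads

def count_reads_in_interval_sketch(interval, array_of_reads):
    return len(get_reads_in_interval_sketch(interval, array_of_reads))

def read_lengths_from_htseq_sketch(reads):
    return [read.iv.length for read in reads]  # aligned length on the reference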
def call_peaks(interval, gene_length, bam_file=None, max_gap=25,
               fdr_alpha=0.05, user_threshold=None, binom_alpha=0.05,
               method="binomial", min_reads=3, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000, SloP=False,
               max_width=None, min_width=None, algorithm="spline",
               reverse_strand=False, exons=None):
    """
    calls peaks for an individual gene

    interval - gtf interval describing the gene to query
    takes bam file or bam file object. Serial uses object,
    parallel uses location (name)
    max_gap - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from peaks
    script (called in setup)
    user_threshold - user-defined FDR threshold (probably should be factored
    into fdr_alpha)
    min_reads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for the number of reads in a
    peak that gets called - might want to use a shifted distribution
    plotit - makes figures
    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super-local calculation, distance left and right to look
    SloP - super-local p-value instead of gene-wide p-value
    (+/- 500 b.p. of each section)
    max_width - int, maximum width of classic peak-calling algorithm peak
    min_width - int, min width of classic peak-calling algorithm peak
    max_gap - int, max gap of classic peak-calling algorithm peak

    returns peak_dict, a dictionary containing:
        peak_dict['clusters']: list of Peak objects
        peak_dict['sections']: keyed by section
            ['nreads'] - how many reads in this section
            ['threshold'] - either the super-local threshold, the mRNA
                            threshold or the pre-mRNA threshold
            ['tried'] - True
            ['nPeaks'] - number of peaks
        peak_dict['nreads']: No. reads in gene
        peak_dict['threshold']
        peak_dict['loc']: interval
        peak_dict['Nclusters']: total peaks in transcript
    """
    ###########################################################################
    # print("starting call_peaks on gene_no:", gene_no, "interval:", interval)
    # genecallpeaksloggingperiode = 100
    # should_log_gene_call_peaks_this_time = (gene_no % genecallpeaksloggingperiode == 0)
    ###########################################################################
    # if should_log_gene_call_peaks_this_time:
    #     logging.info(" starting call_peaks on gene_no {}".format(gene_no))
    ###########################################################################

    if plotit:
        plt.rcParams['interactive'] = True

    bam_fileobj = pysam.Samfile(bam_file, 'rb')

    # fixes non-standard chrom file names (without the chr)
    if (not interval.chrom.startswith("chr")
            and not interval.chrom.startswith("ERCC")
            and not interval.chrom.startswith("phiX")):
        interval.chrom = "chr" + interval.chrom

    # fetch reads in the genomic region
    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom),
                                          start=interval.start,
                                          end=interval.stop))
    strand = str(interval.strand)
    if reverse_strand:
        if strand == "+":
            strand = "-"
        elif strand == "-":
            strand = "+"

    # convert pysam to a wiggle vector, junctions, positional counts
    # (coverage), read lengths, all reads, and read locations
    (wiggle, jxns, pos_counts, lengths, allreads,
     read_locations) = readsToWiggle_pysam(subset_reads, interval.start,
                                           interval.stop, strand, "start",
                                           False)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    lengths = [gene_length - 1 if read >= gene_length else read
               for read in lengths]

    # pre-mRNA threshold
    if user_threshold is None:
        if method == "binomial":
            # Uses Binomial Distribution to get cutoff if specified by user
            # print(len(lengths), gene_length, binom_alpha)
            premRNA_threshold = get_FDR_cutoff_binom(lengths, gene_length,
                                                     binom_alpha)
            # print(premRNA_threshold)
        elif method == "random":
            premRNA_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                    genelength=gene_length,
                                                    alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        premRNA_threshold = user_threshold

    # mRNA threshold
    exons = pybedtools.BedTool(exons)
    exons = exons.filter(lambda x: x.name == interval.attrs['gene_id']).saveas()

    total_exonic_reads = []
    total_exonic_length = 0
    htseq_exons = HTSeq.GenomicArrayOfSets(chroms="auto", stranded=False)
    for exon, exon_interval in zip(exons, bed_to_genomic_interval(exons)):
        exon.stop += 1
        exonic_reads = get_reads_in_interval_pysam(exon, interval.start,
                                                   read_locations)
        exon_read_lengths = read_lengths_from_pysam(exonic_reads)
        exon_read_lengths = [exon_interval.length - 1
                             if read > exon_interval.length else read
                             for read in exon_read_lengths]
        total_exonic_reads += exon_read_lengths
        total_exonic_length += exon_interval.length
        htseq_exons[exon_interval] += 'exon'

    mRNA_threshold = get_FDR_cutoff_binom(total_exonic_reads,
                                          total_exonic_length, binom_alpha)
    if not isinstance(premRNA_threshold, int):
        raise TypeError

    # these are what is built in this dict, complicated enough that it might
    # be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = premRNA_threshold
    peak_dict['loc'] = interval
    peak_number = 0

    # returns a list of sections with contiguous reads > 0 (gaps allowed)
    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, premRNA_threshold)

    # for each section, call peaks
    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        # make interval for the section
        cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                             sectstart + interval.start,
                                             sectstop + interval.start + 1,
                                             strand)

        # Logic to use variable thresholds for exons or introns,
        # still superseded by the super-local logic
        overlaps_exon = len(reduce(set.union,
                                   (val for iv, val
                                    in htseq_exons[cur_interval].steps()))) > 0
        gene_threshold = mRNA_threshold if overlaps_exon else premRNA_threshold

        # maybe make a function that takes a genomic interval and converts
        # it into a pybedtools interval
        bed_format = [interval.chrom, sectstart + interval.start,
                      sectstop + interval.start + 1, interval.name,
                      interval.score, strand]
        bed_format = list(map(str, bed_format))
        cur_pybedtools_interval = pybedtools.create_interval_from_list(bed_format)
        Nreads = count_reads_in_interval_pysam(cur_pybedtools_interval,
                                               interval.start, read_locations)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        peak_dict['sections'][sect] = {}
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        # makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("%d is not enough reads, skipping section: %s"
                         % (Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False
            continue
        else:
            logging.info("Analyzing section %s with %d reads"
                         % (sect, Nreads))

        if user_threshold is None:
            if SloP:
                # super-local p-value: section +/- 500 b.p.; instead of using
                # the whole gene's length and reads, use this extended region
                half_width = 500
                # aim at -500 offset from section start
                section_start = max(0, sectstart + interval.start - half_width)
                # aim at +500 from section stop
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start

                bed_format = [interval.chrom, section_start, section_stop,
                              interval.name, interval.score, strand]
                bed_format = list(map(str, bed_format))
                cur_pybedtools_interval = pybedtools.create_interval_from_list(bed_format)
                expanded_Nreads = get_reads_in_interval_pysam(
                    cur_pybedtools_interval, interval.start, read_locations)
                sect_read_lengths = read_lengths_from_pysam(expanded_Nreads)
                sect_read_lengths = [sect_length - 1 if read > sect_length
                                     else read for read in sect_read_lengths]

                peak_dict['sections'][sect]['expanded_Nreads'] = len(expanded_Nreads)

                if method == "binomial":
                    # Uses Binomial Distribution to get cutoff if specified by user
                    slop_threshold = get_FDR_cutoff_binom(
                        readlengths=sect_read_lengths,
                        genelength=expanded_sect_length,
                        alpha=binom_alpha)
                elif method == "random":
                    # use the minimum FDR cutoff between superlocal and
                    # gene-wide calculations
                    slop_threshold = get_FDR_cutoff_mean(
                        readlengths=sect_read_lengths,
                        genelength=expanded_sect_length,
                        alpha=fdr_alpha)
                else:
                    raise ValueError("Method %s does not exist" % (method))
                threshold = max(gene_threshold, slop_threshold)
                logging.info("Using super-local threshold %d" % (threshold))
            else:
                # if not using the super-local threshold (+/- 500 bp), use
                # mRNA_threshold for sections that overlap an exon and
                # premRNA_threshold for sections that do not
                threshold = gene_threshold
        else:
            threshold = user_threshold

        # saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not exceed threshold, stopping")
            continue

        if algorithm == "spline":
            data = list(map(float, data))
            # Magic number for initial smoothing, but it works
            initial_smoothing_value = ((sectstop - sectstart + 1) ** (1.0 / 3)) + 10
            peak_dict['sections'][sect]['smoothing_factor'] = initial_smoothing_value
            logging.info("initial smoothing value: %.2f" % initial_smoothing_value)
            fitter = SmoothingSpline(xvals, data,
                                     smoothing_factor=initial_smoothing_value,
                                     lossFunction="get_turn_penalized_residuals",
                                     threshold=threshold,
                                     num_reads=Nreads)
        elif algorithm == "gaussian":
            cts = list(map(float, cts))
            fitter = GaussMix(xvals, cts)
        elif algorithm == "classic":
            data = list(map(float, data))
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" % fitter.smoothing_factor)
            peak_dict['sections'][sect]['final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                numpeaks = 0
            else:
                numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))
        except Exception as error:
            logging.error("peak finding failed: %s, %s" % (interval.name, error))
            raise error

        # subsections that are above threshold
        # peak center is actually the location where we think binding should
        # occur, not the average of start and stop
        # Need to get all ranges, count number of reads in each range and
        # compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:
            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            # save to bedtool
            bed_format = [interval.chrom, genomic_start, genomic_stop,
                          interval.name, interval.score, strand]
            # create_interval_from_list only takes str
            bed_format = list(map(str, bed_format))
            cur_pybedtools_interval = pybedtools.create_interval_from_list(bed_format)
            number_reads_in_peak = count_reads_in_interval_pysam(
                cur_pybedtools_interval, interval.start, read_locations)

            peak_length = genomic_stop - genomic_start + 1
            logging.info("Peak %d (%d - %d) has %d reads"
                         % (peak_number, peak_start, (peak_stop + 1),
                            number_reads_in_peak))

            # highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            # makes it thicker so we can see on the browser
            # error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            # super-local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize, len(wiggle))

            bed_format = [interval.chrom, interval.start + area_start,
                          interval.start + area_stop, interval.name,
                          interval.score, strand]
            bed_format = list(map(str, bed_format))
            cur_pybedtools_interval = pybedtools.create_interval_from_list(bed_format)
            number_reads_in_area = count_reads_in_interval_pysam(
                cur_pybedtools_interval, interval.start, read_locations)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(Peak(
                chrom=interval.chrom,
                genomic_start=genomic_start,
                genomic_stop=genomic_stop,
                gene_name=interval.attrs['gene_id'],
                strand=interval.strand,
                thick_start=thick_start,
                thick_stop=thick_stop,
                peak_number=peak_number,
                number_reads_in_peak=number_reads_in_peak,
                size=peak_length,
                p=0,
                effective_length=int(interval.attrs['effective_length']),
                peak_length=peak_length,
                area_reads=number_reads_in_area,
                area_size=area_length,
                nreads_in_gene=nreads_in_gene,
                # nreads_in_input=input_number_reads_in_peak,
            ))
            peak_number += 1
            peak_dict['sections'][sect]['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        v = sys.stdin.read(1)

    ###################################################
    # print("returning gene_no:", gene_no, "peak_dict:", peak_dict)
    ####################################################
    return peak_dict
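# A hypothetical invocation of the exon-aware call_peaks above. The GTF line,
# the file names and the effective_length attribute are placeholders; gene_id
# and effective_length are the two attributes the function reads from the
# interval, and exons is expected to be a BED file whose name field matches
# the gene_id.
if __name__ == "__main__":
    import pybedtools

    gtf_line = ('chr1\thg19\tgene\t1000\t5000\t.\t+\t.\t'
                'gene_id "GENE1"; effective_length "4001";')
    gene = next(iter(pybedtools.BedTool(gtf_line, from_string=True)))
    peak_dict = call_peaks(gene, gene_length=4001, bam_file="clip_reads.bam",
                           method="binomial", SloP=True, exons="exons.bed")
    for peak in peak_dict['clusters']:
        print(peak)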
def peaks_from_info(wiggle, pos_counts, lengths, loc, gene_length,
                    margin=25, fdr_alpha=0.05, user_threshold=None,
                    minreads=20, poisson_cutoff=0.05, plotit=False,
                    width_cutoff=10, windowsize=1000, SloP=False,
                    correct_p=False):
    """
    same args as before
    wiggle is converted from bam file
    pos_counts - one point per read instead of coverage of entire read
    lengths - lengths of the aligned portions of reads
    rest are the same, fix later

    calls peaks for an individual gene

    gene_length - effective length of gene
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from peaks
    script (called in setup)
    user_threshold - user-defined FDR threshold (probably should be factored
    into fdr_alpha)
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for the number of reads in a
    peak that gets called - might want to use a shifted distribution
    plotit - makes figures
    width_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super-local calculation, distance left and right to look
    SloP - super-local p-value instead of gene-wide p-value
    correct_p - boolean, Bonferroni correction of p-values from Poisson
    """

    peak_dict = {}

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    #peak_dict['clusters'] = {}
    #peak_dict['sections'] = {}
    #peak_dict['nreads'] = int()
    #peak_dict['threshold'] = int()
    #peak_dict['loc'] = loc

    #data munging
    chrom, gene_name, tx_start, tx_end, signstrand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]

    #used for poisson calculation?
    nreads_in_gene = sum(pos_counts)

    #decides FDR calculation, maybe move getFDRcutoff mean into c code
    if user_threshold is None:
        gene_threshold = get_FDR_cutoff_mean(lengths, gene_length,
                                             alpha=fdr_alpha)
    else:
        gene_threshold = user_threshold

    if gene_threshold == "best_error":
        #verboseprint("""I had a hard time with this one: %s.
        # I think I'll use a threshold of 50""" % (loc))
        gene_threshold = 50

    peak_dict['clusters'] = {}
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = loc
    peakn = 1

    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        #gets a random subset of read lengths for calculations on a section
        #not exactly the right way to do this but it should be very close.
        sect_read_lengths = rs(lengths, Nreads)
        peak_dict['sections'][sect] = {}
        threshold = int()

        #makes sure there are enough reads
        if Nreads < minreads:
            #verboseprint("""%d is not enough reads, skipping section:
            # %s""" % (Nreads, sect))
            continue
        else:
            pass
            #verboseprint("""Analyzing section %s with %d reads"""
            # % (sect, Nreads))

        #sets super-local threshold if requested, might be able to factor this
        if user_threshold is None:
            if SloP is True:
                #use the minimum FDR cutoff between superlocal and gene-wide calculations
                threshold = min(gene_threshold,
                                get_FDR_cutoff_mean(sect_read_lengths,
                                                    sect_length,
                                                    alpha=fdr_alpha))
                #verboseprint("Using super-local threshold %d" % (threshold))
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #if wiggle track never exceeds threshold
        if max(data) < threshold:
            #verboseprint("data does not exceed threshold, stopping")
            continue

        #spline-fitting logic, black magic
        try:
            degree = 3  #cubic spline
            weights = None

            #for very large windows with many reads a large smoothing
            #parameter is required. test several different options
            #to determine a reasonable initial estimate

            #Goal is to find the optimal smoothing parameter in multiple steps
            #initial_smoothing_value is the initial estimate of the smoothing
            #parameter

            #step 1, identify a good initial value
            initial_smoothing_value = sectstop - sectstart + 1
            best_smoothing_value = initial_smoothing_value
            best_estimate = 1

            #step 2, refine so as not to run into local minima later;
            #try to come up with a good way of getting the optimal parameter
            best_error = find_spline_residuals(initial_smoothing_value,
                                               xvals, data, degree, weights)
            for i in range(2, 11):
                cur_smoothing_value = initial_smoothing_value * i

                #tries to find the optimal initial smoothing parameter in this loop
                cur_error = find_spline_residuals(cur_smoothing_value,
                                                  xvals, data, degree,
                                                  weights)
                if cur_error < best_error:
                    best_smoothing_value = cur_smoothing_value
                    best_estimate = i

            try:
                #fine optimization of the smoothing parameter
                cutoff = float(0)
                tries = 0

                #shouldn't get smoothing coefficients this small... increase
                #the initial estimate and try again. WARNING: BLACK MAGIC
                while cutoff < 5:
                    tries += 1

                    #increasing this may improve accuracy,
                    #but at the cost of running time.
                    if tries == 3:
                        break

                    spline = optimize.minimize(find_spline_residuals,
                                               best_smoothing_value,
                                               args=(xvals, data, degree,
                                                     weights),
                                               options={'disp': False,
                                                        'maxiter': 10},
                                               #method="Powell",  #old method
                                               method="L-BFGS-B",  #abnormal termination sometimes
                                               #method="COBYLA",
                                               bounds=((.1, None),))

                    #fit a smoothing spline using an optimal parameter
                    #for smoothing and with weights proportional to the
                    #number of reads aligned at each position if weights
                    #is set
                    if spline.success:
                        cutoff = spline.x
                        #print("cutoff is %s" % (cutoff))
                    else:
                        #print("%s failed spline building at section %s" % (loc, sect))
                        #print(spline.message)
                        pass

                    best_smoothing_value += sect_length

            except Exception as best_error:
                print("best smoothing value is:", best_smoothing_value,
                      file=sys.stderr)
                print("%s failed spline fitting at section %s (major crash)"
                      % (loc, sect), file=sys.stderr)
                print(best_error, file=sys.stderr)
                continue

            #verboseprint("optimized smoothing parameter")
            #if we are going to save and output as a pickle fi is %s" % (str(cutoff))

            #final fit spline
            spline = find_univariate_spline(cutoff, xvals, data, degree,
                                            weights)
            spline_values = array([round(x) for x in spline(xvals)])
            if plotit is True:
                plot_spline(spline, data, xvals, peakn, threshold)

            starts_and_stops, starts, stops = get_regions_above_threshold(
                threshold, spline_values)

            #walks along the spline and calls peaks along it
            #for each start, take the next stop and find the peak
            #between the start and the stop. this is where I need to
            #fix: some peak starts begin right after another start,
            #but not on top of it; make sure the next start is after the
            #previous stop

            #subsections that are above threshold
            for p_start, p_stop in starts_and_stops:

                #peaks within this subsection, indexed from section
                #(not subsection) start

                #find all local maxima
                peaks = [x + p_start for x in
                         xvals[find_local_maxima(spline_values[p_start:(p_stop + 1)])]]
                #map(lambda x: x + p_start,
                #    xvals[diff(sign(diff(spline(xvals[p_start:(p_stop + 1)])))) < 0])

                if not len(peaks) in (0, 1):
                    #there should be one or zero peaks in every subsection
                    assert len(peaks) in (0, 1)

                #handles logic if there are multiple peaks between
                #start and stop
                if len(peaks) <= 0:
                    continue
                if len(peaks) == 1:
                    #TODO All this formatting logic doesn't belong here,
                    #should be simplified

                    #gets reads in peak
                    n_reads_in_peak = sum(cts[p_start:(p_stop + 1)])
                    #verboseprint("""Peak %d (%d - %d) has %d reads"""
                    # % (peakn, p_start, (p_stop + 1), n_reads_in_peak))

                    #makes sure there are enough reads
                    if (n_reads_in_peak < minreads or
                            max(data[p_start:(p_stop + 1)]) < threshold):
                        #verboseprint("""skipping peak, %d is not enough reads"""
                        # % (n_reads_in_peak))
                        continue

                    #formatting of bed track
                    #start and stop for bed track to be created
                    g_start = tx_start + sectstart + p_start
                    g_stop = tx_start + sectstart + p_stop

                    #highest point in start stop
                    peak = tx_start + sectstart + peaks[0]

                    #makes it thicker so we can see on the browser
                    thick_start = peak - 2
                    thick_stop = peak + 2

                    #error checking logic to keep bed files from breaking
                    if thick_start < g_start:
                        thick_start = g_start
                    if thick_stop > g_stop:
                        thick_stop = g_stop

                    peak_length = g_stop - g_start + 1

                    #skip really small peaks
                    if peak_length < width_cutoff:
                        continue
                    peak_name = (gene_name + "_" + str(peakn) + "_" +
                                 str(int(n_reads_in_peak)))

                    #super-local logic
                    #error check to make sure area stays within the gene
                    #distance from gene start
                    if peak - tx_start - windowsize < 0:
                        area_start = 0
                    else:
                        #for super-local, gets area around peak for calculation
                        area_start = peak - tx_start - windowsize
                        #area_start = sectstart

                    #same thing except for end of gene instead of start
                    if peak + windowsize > tx_end:
                        #distance to gene stop
                        area_stop = tx_end - tx_start + 1
                    else:
                        area_stop = peak - tx_start + windowsize
                        #area_stop = sectstop

                    #use area reads + 1/2 all other reads in gene:
                    #area_reads = sum(pos_counts[area_start:area_stop]) +
                    #0.5*(sum(pos_counts) -
                    #sum(pos_counts[area_start:area_stop]))

                    #use area reads:
                    area_reads = sum(pos_counts[area_start:area_stop])
                    area_size = area_stop - area_start + 1

                    #area_reads = sum(pos_counts[sectstart:sectstop])
                    #area_size = sect_length

                    #calculates poisson based on whole gene vs peak
                    gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak,
                                           gene_length, peak_length)
                    if SloP is True:
                        #same thing except based on the super-local p-value
                        slop_pois_p = poissonP(area_reads, n_reads_in_peak,
                                               area_size, peak_length)
                    else:
                        #makes sure slop_pois_p is defined, even if it's
                        #just the gene-wide value; something to be removed
                        #later, slop should only be used when defined as true
                        slop_pois_p = gene_pois_p

                    if math.isnan(slop_pois_p):
                        slop_pois_p = 1

                    #remove later
                    if slop_pois_p > poisson_cutoff:
                        #continue
                        pass

                    #defines the bedline of a peak for returning
                    #TODO This should be abstracted out for now... separate model from view
                    bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                        chrom, g_start, g_stop, peak_name, slop_pois_p,
                        signstrand, thick_start, thick_stop)

                    #metadata for the specific bedline
                    peak_dict['clusters'][bedline] = {}
                    peak_dict['clusters'][bedline]['GeneP'] = gene_pois_p
                    peak_dict['clusters'][bedline]['SloP'] = slop_pois_p
                    peak_dict['clusters'][bedline]['Nreads'] = n_reads_in_peak
                    peak_dict['clusters'][bedline]['size'] = peak_length
                    peakn += 1
                else:
                    #there is more than one peak in this window
                    #NO LONGER NECESSARY, SHOULD REMOVE

                    #this handles peaks-within-peaks logic

                    #local minima in subsection, relative to section start
                    valleys = array(list(map(lambda x: x + p_start,
                                             xvals[diff(sign(diff(spline(xvals[p_start:p_stop + 1])))) > 0])))

                    for subpeak in peaks:
                        subpeak_start = int()
                        subpeak_stop = int()
                        if any(valleys < subpeak):
                            subpeak_start = valleys[valleys < subpeak][-1]
                        else:
                            subpeak_start = starts[starts < subpeak][-1]
                        if any(valleys > subpeak):
                            subpeak_stop = valleys[valleys > subpeak][0]
                        else:
                            subpeak_stop = stops[stops > subpeak][0]

                        peak_length = subpeak_stop - subpeak_start + 1

                        #skip really small peaks
                        if peak_length < width_cutoff:
                            continue

                        n_reads_in_peak = sum(cts[subpeak_start:(subpeak_stop + 1)])
                        if (n_reads_in_peak < minreads or
                                max(data[subpeak_start:(subpeak_stop + 1)]) < threshold):
                            continue

                        g_start = tx_start + subpeak_start + sectstart
                        g_stop = tx_start + subpeak_stop + sectstart
                        peak = tx_start + subpeak + sectstart
                        thick_start = peak - 2
                        if thick_start < g_start:
                            thick_start = g_start
                        thick_stop = peak + 2
                        if thick_stop > g_stop:
                            thick_stop = g_stop
                        peak_name = "%s_%s_%s" % (gene_name, peakn,
                                                  int(n_reads_in_peak))

                        #distance from gene start
                        if peak - tx_start - windowsize < 0:
                            area_start = 0
                        else:
                            area_start = peak - tx_start - windowsize

                        if peak + windowsize > tx_end:
                            #distance to gene stop
                            area_stop = tx_end - tx_start + 1
                        else:
                            #area_stop = sectstop
                            area_stop = peak - tx_start + windowsize

                        area_reads = sum(pos_counts[area_start:area_stop])
                        area_size = area_stop - area_start + 1

                        gene_pois_p = poissonP(nreads_in_gene,
                                               n_reads_in_peak,
                                               gene_length, peak_length)
                        if SloP is True:
                            slop_pois_p = poissonP(area_reads,
                                                   n_reads_in_peak,
                                                   area_size, peak_length)
                        else:
                            slop_pois_p = gene_pois_p

                        if math.isnan(slop_pois_p):
                            slop_pois_p = 1

                        #leave these in to allow for BH p-value correction
                        if slop_pois_p > poisson_cutoff:
                            pass

                        #output results again
                        bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                            chrom, g_start, g_stop, peak_name, slop_pois_p,
                            signstrand, thick_start, thick_stop)
                        peak_dict['clusters'][bedline] = {}
                        peak_dict['clusters'][bedline]['SloP'] = slop_pois_p
                        peak_dict['clusters'][bedline]['GeneP'] = gene_pois_p
                        peak_dict['clusters'][bedline]['Nreads'] = n_reads_in_peak
                        peak_dict['clusters'][bedline]['size'] = peak_length
                        peakn += 1

        except NameError as best_error:
            print(best_error, file=sys.stderr)
            print("spline fitting failed for %s" % (loc), file=sys.stderr)
            raise

    #inflate p-values based on # of comparisons
    #Bonferroni corrected
    if correct_p is True:
        for peak in peak_dict['clusters']:
            #Bonferroni correct p-value for MHT
            peak_dict['clusters'][peak]['p'] = (peak_dict['clusters'][peak]['p']
                                                * peakn)

    peak_dict['Nclusters'] = peakn
    return peak_dict
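# get_regions_above_threshold and find_local_maxima are imported from the
# package. From their use above (inclusive (start, stop) index pairs over the
# rounded spline values, plus local-maximum indices within a slice), rough
# sketches follow; the packaged versions handle plateaus and boundary cases
# more carefully.
import numpy as np

def get_regions_above_threshold_sketch(threshold, values):
    above = (np.asarray(values) >= threshold).astype(int)
    # pad with zeros so every region has both a rising and a falling edge
    edges = np.flatnonzero(np.diff(np.concatenate(([0], above, [0]))))
    starts = edges[0::2]      # first index of each region
    stops = edges[1::2] - 1   # last (inclusive) index of each region
    return list(zip(starts, stops)), starts, stops

def find_local_maxima_sketch(values):
    values = np.asarray(values)
    # interior points strictly above both neighbors; plateaus are skipped
    return np.flatnonzero((values[1:-1] > values[:-2]) &
                          (values[1:-1] > values[2:])) + 1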
str(peakn) + "_" + str(int(n_reads_in_peak)) # super local logic # best_error check to make sure area is in area of gene # distance from gene start if peak - tx_start - windowsize < 0: area_start = 0 # for super local gets area around peak for calculation else: area_start = peak - tx_start - windowsize # area_start = sectstart # same thing except for end of gene instead of start if peak + windowsize > tx_end: # distance to gene stop area_stop = tx_start - tx_end + 1 else: area_stop = peak - tx_start + windowsize # area_stop = sectstop # use area reads + 1/2 all other reads in gene: # area_reads = sum(pos_counts[area_start:area_stop]) + # 0.5*(sum(pos_counts) - # sum(pos_counts[area_start:area_stop])) # use area reads: area_reads = sum(pos_counts[area_start:area_stop]) area_size = area_stop - area_start + 1 # area_reads = sum(pos_counts[sectstart:sectstop]) # area_size = sect_length # calcluates poisson based of whole gene vs peak gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak, gene_length, peak_length) if SloP is True: # same thing except for based on super local p-value slop_pois_p = poissonP(area_reads, n_reads_in_peak, area_size, peak_length) # makes sure spop_poisP is defined, even if its # just normal, something to be removed later, # slop should only be used when defined as true else: slop_pois_p = gene_pois_p if math.isnan(slop_pois_p): slop_pois_p = 1 # remove later if slop_pois_p > poisson_cutoff: # continue pass # defines the bedline of a peak for returning # TODO This should be abstracted out for now... seperate model from view bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % ( chrom, g_start, g_stop, peak_name, slop_pois_p, signstrand, thick_start, thick_stop, ) # metadata for the specific bedline peak_dict["clusters"][bedline] = {} peak_dict["clusters"][bedline]["GeneP"] = gene_pois_p peak_dict["clusters"][bedline]["SloP"] = slop_pois_p peak_dict["clusters"][bedline]["Nreads"] = n_reads_in_peak peak_dict["clusters"][bedline]["size"] = peak_length peakn += 1 # there are more than one peaks in this window # NO LONGER NESSESSARY SHOULD REMOVE else: # this handles peaks within peaks logic # local minima in subsection, relative to section start valleys = array( map(lambda x: x + p_start, xvals[diff(sign(diff(spline(xvals[p_start : p_stop + 1])))) > 0]) ) for subpeak in peaks: subpeak_start = int() subpeak_stop = int() if any(valleys < subpeak): subpeak_start = valleys[valleys < subpeak][-1] else: subpeak_start = starts[starts < subpeak][-1] if any(valleys > subpeak): subpeak_stop = valleys[valleys > subpeak][0] else: subpeak_stop = stops[stops > subpeak][0] peak_length = subpeak_stop - subpeak_start + 1 if peak_length < width_cutoff: # skip really small peaks continue n_reads_in_peak = sum(cts[subpeak_start : (subpeak_stop + 1)]) if n_reads_in_peak < minreads or max(data[subpeak_start : (subpeak_stop + 1)]) < threshold: continue g_start = tx_start + subpeak_start + sectstart g_stop = tx_start + subpeak_stop + sectstart peak = tx_start + subpeak + sectstart thick_start = peak - 2 if thick_start < g_start: thick_start = g_start thick_stop = peak + 2 if thick_stop > g_stop: thick_stop = g_stop peak_name = "%s_%s_%s" % (gene_name, peakn, int(n_reads_in_peak)) # distance from gene start if peak - tx_start - windowsize < 0: area_start = 0 else: area_start = peak - tx_start - windowsize if peak + windowsize > tx_end: # distance to gene stop area_stop = tx_start - tx_end + 1 else: # area_stop = sectstop area_stop = peak - tx_start + windowsize area_reads = 
sum(pos_counts[area_start:area_stop]) area_size = area_stop - area_start + 1 gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak, gene_length, peak_length) if SloP is True: slop_pois_p = poissonP(area_reads, n_reads_in_peak, area_size, peak_length) else: slop_pois_p = gene_pois_p if math.isnan(slop_pois_p): slop_pois_p = 1 # leave these in to allow for BH p-value correction if slop_pois_p > poisson_cutoff: pass # output results again bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % ( chrom, g_start, g_stop, peak_name, slop_pois_p, signstrand, thick_start, thick_stop, ) peak_dict["clusters"][bedline] = {} peak_dict["clusters"][bedline]["SloP"] = slop_pois_p peak_dict["clusters"][bedline]["GeneP"] = gene_pois_p peak_dict["clusters"][bedline]["Nreads"] = n_reads_in_peak peak_dict["clusters"][bedline]["size"] = peak_length peakn += 1 except NameError as best_error: print >> sys.stderr, best_error print >> sys.stderr, "spline fitting failed for %s" % (loc) raise # inflate p-values based on # of comparisons #bonferroni corrected if correct_p is True: for peak in peak_dict["clusters"]: peak_dict["clusters"][peak]["p"] = ( peak_dict["clusters"][peak]["p"] * peakn ) # bonferroni correct p-value for MHT peak_dict["Nclusters"] = peakn return peak_dict
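# The significance call above rests entirely on poissonP, which is defined
# elsewhere in this module.  For orientation only, here is a minimal sketch
# of a stand-in consistent with how it is called here --
# poissonP(total_reads, reads_in_region, total_length, region_length) --
# assuming the test is P(X >= reads_in_region) under a uniform-coverage
# Poisson null.  This is an assumption, not the module's implementation.

from scipy.stats import poisson as _poisson_sketch


def poissonP_sketch(total_reads, reads_in_region, total_length,
                    region_length):
    """Hypothetical stand-in for poissonP (illustration only)."""
    # under a uniform null, a region gets a proportional share of the reads
    expected = float(total_reads) * region_length / total_length
    # sf(k - 1) == P(X >= k) for a discrete distribution
    return _poisson_sketch.sf(reads_in_region - 1, expected)

# e.g. 30 of a gene's 50 reads piled into a 50 nt window of a 2000 nt gene
# gives poissonP_sketch(50, 30, 2000, 50), which is vanishingly small, so
# such a peak would survive any reasonable poisson_cutoff.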
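# A minimal usage sketch of peaks_from_info on a made-up single-peak gene.
# All numbers here are hypothetical, and the module helpers it depends on
# (find_sections, get_FDR_cutoff_mean, the spline functions, poissonP)
# must be in scope for the call to succeed.


def _example_peaks_from_info():
    wiggle = ([0] * 900) + ([30] * 50) + ([0] * 1050)     # per-base coverage
    pos_counts = ([0] * 900) + ([1] * 30) + ([0] * 1070)  # read-start counts
    lengths = [50] * 30                                   # aligned read lengths
    loc = ("chr1", "GENE1", "10000", "12000", "+")        # starts/ends may be strings

    result = peaks_from_info(wiggle, pos_counts, lengths, loc,
                             gene_length=2000, minreads=10, SloP=True)
    print "threshold:", result["threshold"]
    for bedline in result["clusters"]:
        print bedline, result["clusters"][bedline]["SloP"]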