Example no. 1
    def test_find_sections(self):
        #setup
        print "testing find sections"
        #Null case: passing None for the wiggle should raise a TypeError
        self.assertRaises(TypeError, find_sections, None, 0)
        
        #Case with all zero coverage
        wiggle = [0] * 20
        result = find_sections(wiggle, 0)
        assert result == []
        
        #Case with all non-zero coverage
        wiggle = [5] * 20
        result = find_sections(wiggle, 0)
        self.assertEqual(result, [(0,19)])
        wiggle = ([5] * 20) + [0] + ([5] * 20)
        #returns one segment
        result = find_sections(wiggle, 1)
        self.assertEqual(result, [(0,40)])
        
        #second case returns two segments
        wiggle = ([5] * 9) + [0] + ([5] * 10)
        result = find_sections(wiggle, 0)
        assert result == [(0,9), (10,19)]
        
        #returns one segment
        result = find_sections(wiggle, 1)
        assert result == [(0,19)]
        
        #Edge case where coverage starts after the start of the gene
        wiggle = [0] + ([5] * 10)
        result = find_sections(wiggle, 0)
        assert result == [(1,10)]
        
        #Edge case where coverage stops before the end of the gene
        wiggle = ([5] * 10) + [0] 
        result = find_sections(wiggle, 0)
        assert result == [(0,10)]
        
        #Test non-integer coverage values
        wiggle = [.5] * 20
        result = find_sections(wiggle, 0)
        self.assertEqual(result, [(0,19)])
        
        #test numpy arrays (assumes numpy's ones has been imported)
        wiggle = ones((20), dtype='f')
        wiggle = list(wiggle)
        result = find_sections(wiggle, 0)
        self.assertEqual(result, [(0,19)])
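
The assertions above pin down the contract of find_sections: it takes a coverage vector and a gap tolerance (margin), and returns (start, stop) index pairs whose stop coordinate behaves like the half-open end of a run, clamped to the last valid index. A minimal sketch that satisfies every assertion in this test; the name find_sections_sketch and the body are assumptions for illustration, not clipper's actual implementation:

def find_sections_sketch(wiggle, margin):
    """Return (start, stop) index pairs for stretches of nonzero coverage,
    merging stretches separated by gaps of at most margin positions.  The
    stop coordinate is the half-open end of the run, clamped to
    len(wiggle) - 1, which reproduces the coordinates asserted above."""
    if wiggle is None:
        raise TypeError("wiggle must be a sequence of coverage values")
    sections = []
    start = None  # start index of the current run, or None when outside a run
    gap = 0       # consecutive zero-coverage positions seen inside the run
    for i, value in enumerate(wiggle):
        if value > 0:
            if start is None:
                start = i
            gap = 0
        elif start is not None:
            gap += 1
            if gap > margin:  # gap too wide to bridge: close the section
                sections.append((start, i - gap + 1))
                start = None
                gap = 0
    if start is not None:  # run reaches the end of the vector
        sections.append((start, min(len(wiggle) - gap, len(wiggle) - 1)))
    return sections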
Example no. 2
    def test_find_sections(self):
        #setup
        print "testing find sections"
        #Null case: passing None for the wiggle should raise a TypeError
        self.assertRaises(TypeError, find_sections, None, 0)

        #Case with all zero coverage
        wiggle = [0] * 20
        result = find_sections(wiggle, 0)
        assert result == []

        #Case with all non-zero coverage
        wiggle = [5] * 20
        result = find_sections(wiggle, 0)
        self.assertEqual(result, [(0, 19)])

        wiggle = ([5] * 20) + [0] + ([5] * 20)
        #returns one segment
        result = find_sections(wiggle, 1)
        self.assertEqual(result, [(0, 40)])

        #second case returns two segments
        wiggle = ([5] * 9) + [0] + ([5] * 10)
        result = find_sections(wiggle, 0)
        assert result == [(0, 9), (10, 19)]

        #returns one segment
        result = find_sections(wiggle, 1)
        assert result == [(0, 19)]

        #Edge case where coverage starts after the start of the gene
        wiggle = [0] + ([5] * 10)
        result = find_sections(wiggle, 0)
        assert result == [(1, 10)]

        #Edge case where coverage stops before the end of the gene
        wiggle = ([5] * 10) + [0]
        result = find_sections(wiggle, 0)
        assert result == [(0, 10)]

        #Test non-integer coverage values
        wiggle = [.5] * 20
        result = find_sections(wiggle, 0)
        self.assertEqual(result, [(0, 19)])

        #test numpy arrays (assumes numpy's ones has been imported)
        wiggle = ones((20), dtype='f')
        wiggle = list(wiggle)
        result = find_sections(wiggle, 0)
        self.assertEqual(result, [(0, 19)])
Example no. 3
    def test_find_sections_two_sections(self):
        #Case with one region at a margin of one and two regions at a margin of zero

        #returns two segments
        wiggle = ([5] * 20) + [0] + ([5] * 20)
        result = find_sections(wiggle, 0)

        #This looks like a zero-based, half-open result; needs more thought
        self.assertEqual(result, [(0, 20), (21, 40)])
Example no. 4
    def test_find_sections_two_sections(self):
        #Case with one region at a margin of one and two regions at a margin of zero

        #returns two segments
        wiggle = ([5] * 20) + [0] + ([5] * 20)
        result = find_sections(wiggle, 0)

        #This looks like a zero-based, half-open result; needs more thought
        self.assertEqual(result, [(0, 20), (21, 40)])
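
A note on the coordinates asserted in Examples no. 3 and 4: the gap sits at index 20, yet the first section is reported as (0, 20), while the second section's stop of 40 is simply the last index of the vector. Both are consistent with a stop computed as the half-open end of a run and clamped to len(wiggle) - 1, as in the hypothetical sketch after Example no. 1:

# hypothetical illustration using find_sections_sketch, not clipper itself
wiggle = ([5] * 20) + [0] + ([5] * 20)
print(find_sections_sketch(wiggle, 0))  # [(0, 20), (21, 40)]
print(find_sections_sketch(wiggle, 1))  # [(0, 40)] -- margin 1 bridges the gap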
Example no. 5
 def test_find_sections_no_overlaps(self):
     #verify there is no overlap
     
     wiggle = [10, 4,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
                3, 3, 3, 3]
     result = find_sections(wiggle, 15)
     print result
     #the second section must start after the first one ends
     self.assertGreater(result[1][0], result[0][1], "first region end: %s, second region start: %s; sections overlap" % (result[0][1], result[1][0]))
Example no. 6
    def test_find_sections_no_overlaps(self):
        #verify there is no overlap

        wiggle = [
            10, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
        ]
        result = find_sections(wiggle, 15)
        print result
        #the second section must start after the first one ends
        self.assertGreater(
            result[1][0], result[0][1],
            "first region end: %s, second region start: %s; sections overlap"
            % (result[0][1], result[1][0]))
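
In Examples no. 5 and 6 the margin of 15 cannot bridge the long run of zero coverage, so the short high-coverage prefix and the run of threes come out as two disjoint sections, which is exactly what the assertGreater checks. A condensed, hypothetical re-run with the sketch from Example no. 1 (the coordinates shown are the sketch's output, not necessarily clipper's):

wiggle = [10, 4] + [0] * 29 + [3] * 33  # same shape as the test data
print(find_sections_sketch(wiggle, 15))  # [(0, 2), (31, 63)]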
Example no. 7
def peaks_from_info(bam_fileobj, wiggle, pos_counts, lengths, loc, gene_length, 
                    margin=25, fdr_alpha=0.05, binom_alpha=0.001, method="Randomization", user_threshold=None,
                    minreads=20, poisson_cutoff=0.05, plotit=False, 
                    width_cutoff=10, windowsize=1000, SloP=False, 
                    correct_p=False, max_width=None, min_width=None, 
                    max_gap=None, algorithm="spline"):

    """
    
    same args as before
    wiggle is converted from bam file
    pos_counts - one point per read instead of coverage of entire read
    lengths - lengths of the aligned portions of reads
    the rest are the same (fix later)

    calls peaks for an individual gene

    gene_length - effective length of gene
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from peaks script (called in setup)
    user_threshold - user defined FDR threshold (probably should be factored into fdr_alpha)
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for the number of reads in a genomic_center that gets called - might want to use a shifted distribution
    plotit - makes figures

    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super local calculation, distance left and right to look
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean Bonferroni correction of p-values from poisson
    algorithm - str the algorithm to run
    """

    peak_dict = {}
    
    #all the information necessary to record a genomic_center, used later, but declared outside of loops

    
    #these are what is built in this dict, complicated enough that it might 
    #be worth turning into an object
    #peak_dict['clusters'] = {}
    #peak_dict['sections'] = {}
    #peak_dict['nreads'] = int()
    #peak_dict['threshold'] = int()
    #peak_dict['loc'] = loc
    
    #data munging
    chrom, gene_name, tx_start, tx_end, strand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]    
    
    #used for poisson calculation?
    nreads_in_gene = sum(pos_counts)

    #decides FDR calculation; maybe move get_FDR_cutoff_mean into C code
    gene_threshold = 0
    
    if user_threshold is None:    
        if method == "Binomial":  #Uses Binomial Distribution to get cutoff if specified by user
            gene_threshold = get_Binom_cutoff(lengths, gene_length, binom_alpha)
        else:
            gene_threshold = get_FDR_cutoff_mean(lengths, gene_length, alpha=fdr_alpha)
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold
    if not isinstance(gene_threshold, int):
        raise TypeError
        
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = loc
    peak_number = 1

 
    sections = find_sections(wiggle, margin)
    if plotit is True:      
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]
        
        #this cts is alright because we know the reads are bounded
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        peak_dict['sections'][sect] = {}
        threshold = int()
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < minreads:
            logging.info("""%d is not enough reads, skipping section: %s""" %(Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False            
            continue
        else:
            logging.info("""Analyzing section %s with %d reads""" %(sect, Nreads))
            pass
        
            
        if user_threshold is None:
            if SloP:
                
                #gets random subset of lengths of reads for calculations on a section
                #not exactly the right way to do this but it should be very close.
                sect_read_lengths = rs(lengths, Nreads) 
                
                #use the minimum FDR cutoff between superlocal and gene-wide calculations
                threshold = min(gene_threshold, get_FDR_cutoff_mean(sect_read_lengths, 
                                                sect_length, 
                                                alpha=fdr_alpha))
                logging.info("Using super-local threshold %d" %(threshold))
                
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0
        
        if max(data) < threshold:
            logging.info("data does not exceed threshold, stopping")
            continue
        
        if algorithm == "spline":
            
            initial_smoothing_value = (sectstop - sectstart + 1)
            fitter = SmoothingSpline(xvals, data, initial_smoothing_value,
                            lossFunction="get_norm_penalized_residuals")
            
        elif algorithm == "gaussian":
            fitter = GaussMix(xvals, data)
            
        elif algorithm == "classic":
            fitter = Classic(xvals, data, max_width, min_width, max_gap)
        try:
            peak_definitions = fitter.peaks(threshold, plotit)

        except Exception as error:
            logging.error(gene_name)
            raise error
            
        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop
        for peak_start, peak_stop, peak_center in peak_definitions: 
 
             genomic_start = tx_start + sectstart + peak_start
             genomic_stop = tx_start + sectstart + peak_stop
             
             number_reads_in_peak = bam_fileobj.count(chrom, start=genomic_start, end=genomic_stop)
             #sum(cts[peak_start:(peak_stop + 1)])
             logging.info("""Peak %d (%d - %d) has %d 
                              reads""" %(peak_number,                                             
                                          peak_start,
                                          (peak_stop + 1),
                                          number_reads_in_peak))

             #makes sure there are enough reads
             if (number_reads_in_peak < minreads or 
                 max(data[peak_start:(peak_stop + 1)]) < threshold):
                 logging.info("""skipping genomic_center, %d is not enough reads"""
                              %(number_reads_in_peak))
                 continue

  
             #highest point in start stop
             genomic_center = tx_start + sectstart + peak_center

             #makes it thicker so we can see on the browser 
             thick_start = genomic_center - 2
             thick_stop = genomic_center + 2

             #error checking logic to keep bed files from breaking
             if thick_start < genomic_start:
                 thick_start = genomic_start
             if thick_stop > genomic_stop:
                 thick_stop = genomic_stop

             peak_length = genomic_stop - genomic_start + 1

             #skip really small peaks
             if peak_length < width_cutoff:
                 continue
           

             #super local logic 
             #error check to make sure the area stays inside the gene

             #distance from gene start
             if genomic_center - tx_start - windowsize < 0: 
                 area_start = 0

             #for super local gets area around genomic_center for calculation
             else:  
                 area_start = genomic_center - tx_start - windowsize
                 #area_start = sectstart

             #same thing except for the end of the gene instead of the start
             if genomic_center + windowsize > tx_end: #distance to gene stop
                 area_stop = tx_end - tx_start + 1  #clamp the area to the gene's end
             else:
                 area_stop = genomic_center - tx_start + windowsize
                 #area_stop = sectstop

             #use area reads + 1/2 all other reads in gene: 
             #area_reads = sum(pos_counts[area_start:area_stop]) + 
             #0.5*(sum(pos_counts) - 
             #sum(pos_counts[area_start:area_stop]))

             #use area reads:
             area_reads = sum(pos_counts[area_start:area_stop])
             area_size = area_stop - area_start + 1

             #area_reads = sum(pos_counts[sectstart:sectstop])
             #area_size = sect_length

             #calculates poisson based on whole gene vs genomic_center
             if algorithm == "classic" and peak_length < min_width:
                 peak_length = min_width
                 
             gene_pois_p = poissonP(nreads_in_gene, 
                                    number_reads_in_peak, 
                                    gene_length, 
                                    peak_length)
             if SloP is True:
                 #same thing except for based on super local p-value
                 slop_pois_p = poissonP(area_reads, 
                                       number_reads_in_peak, 
                                       area_size, 
                                       peak_length)

             #makes sure slop_pois_p is defined, even if it's
             #just the gene-wide value; something to be removed later,
             #slop should only be used when defined as true
             else:
                 slop_pois_p = gene_pois_p


             if math.isnan(slop_pois_p):
                 slop_pois_p = 1

             #defines the bedline of a genomic_center for returning
             #TODO This should be abstracted out for now... separate model from view
             
             peak_dict['clusters'].append(Peak(chrom, 
                                               genomic_start, 
                                               genomic_stop, 
                                               gene_name, #needed as a unique id for later analysis
                                               slop_pois_p, 
                                               strand,
                                               thick_start,
                                               thick_stop,
                                               peak_number,
                                               number_reads_in_peak,
                                               gene_pois_p,
                                               peak_length,
                                               0
                                               )
                                          )

             peak_number += 1
             peak_dict['sections'][sect]['nPeaks'] +=1
           
    #inflate p-values based on the number of comparisons (Bonferroni corrected)
    if correct_p is True:
        #best I can tell this never executes...            
        for genomic_center in peak_dict['clusters']:
            genomic_center.p = genomic_center.p * peak_number  #bonferroni correct p-value for MHT
    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        v = sys.stdin.read(1)
    return peak_dict
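
The poissonP calls in Example no. 7 score a peak against a null in which the surrounding region's reads fall uniformly, so the expected count in a peak scales with the peak's share of the region's length. A plausible sketch of that computation, assuming scipy; the name poisson_p_sketch and the exact tail convention are assumptions, not clipper's code:

from scipy.stats import poisson

def poisson_p_sketch(reads_in_region, reads_in_peak, region_length, peak_length):
    """P-value of seeing reads_in_peak reads in a peak_length-base window
    when reads_in_region reads are spread uniformly over region_length bases."""
    expected = reads_in_region * (float(peak_length) / region_length)
    # upper tail P(X >= reads_in_peak) for X ~ Poisson(expected);
    # sf(k - 1) makes the observed count inclusive
    return poisson.sf(reads_in_peak - 1, expected)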
Example no. 8
def call_peaks(interval,
               gene_length,
               bam_file=None,
               max_gap=25,
               fdr_alpha=0.05,
               user_threshold=None,
               binom_alpha=0.05,
               method="binomial",
               min_reads=3,
               poisson_cutoff=0.05,
               plotit=False,
               w_cutoff=10,
               windowsize=1000,
               SloP=False,
               max_width=None,
               min_width=None,
               algorithm="spline",
               reverse_strand=False,
               input_bam=None):
    """

    calls peaks for an individual gene

    interval - gtf interval describing the gene to query
    takes bam file or bam file object.  Serial uses the object, parallel uses the location (name)
    max_gap - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from peaks script (called in setup)
    user_threshold - user defined FDR threshold (probably should be factored into fdr_alpha)

    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for the number of reads in a peak that gets called - might want to use a shifted distribution
    plotit - makes figures

    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super local calculation, distance left and right to look
    SloP - super local p-value instead of gene-wide p-value
    max_width - int maximum width of classic peak calling algorithm peak
    min_width - int min width of classic peak calling algorithm peak
    max_gap   - int max gap of classic peak calling algorithm peak

    """

    if plotit:
        plt.rcParams['interactive'] = True
        pass

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    #fixes non-standard chrom file names (without the chr)
    if not interval.chrom.startswith("chr"):
        interval.chrom = "chr" + interval.chrom

    subset_reads = list(
        bam_fileobj.fetch(reference=str(interval.chrom),
                          start=interval.start,
                          end=interval.stop))
    strand = str(interval.strand)
    if reverse_strand:
        if strand == "+":
            strand = "-"
        elif strand == "-":
            strand = "+"
    (wiggle, jxns, pos_counts, lengths,
     allreads) = readsToWiggle_pysam(subset_reads, interval.start,
                                     interval.stop, strand, "start", False)

    #This is the worst of hacks, need to factor out pysam eventually
    bam_fileobj = Robust_BAM_Reader(bam_file)
    subset_reads = list(
        bam_fileobj.fetch(reference=str(interval.chrom),
                          start=interval.start,
                          end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam:  #if not None
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(
            input_bam_fileobj.fetch(reference=str(interval.chrom),
                                    start=interval.start,
                                    end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads, interval.start,
                                          interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    lengths = [
        gene_length - 1 if read >= gene_length else read for read in lengths
    ]

    if user_threshold is None:
        if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user

            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length,
                                                  binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = interval

    peak_number = 0

    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:

        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                             sectstart + interval.start,
                                             sectstop + interval.start + 1,
                                             strand)

        Nreads = count_reads_in_interval(cur_interval, array_of_reads)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        peak_dict['sections'][sect] = {}
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" %
                         (Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False
            continue
        else:
            logging.info("""Analyzing section %s with %d reads""" %
                         (sect, Nreads))
            pass

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0, sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start
                cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                     section_start,
                                                     section_stop, strand)
                expanded_Nreads = get_reads_in_interval(
                    cur_interval, array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                sect_read_lengths = [
                    sect_length - 1 if read > sect_length else read
                    for read in sect_read_lengths
                ]
                peak_dict['sections'][sect]['expanded_Nreads'] = len(
                    expanded_Nreads)

                if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_binom(sect_read_lengths,
                                             expanded_sect_length,
                                             binom_alpha))
                elif method == "random":
                    #use the more stringent (maximum) of the superlocal and gene-wide cutoffs
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_mean(readlengths=sect_read_lengths,
                                            genelength=expanded_sect_length,
                                            alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" % (threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not exceed threshold, stopping")
            continue

        if algorithm == "spline":
            data = list(map(float, data))
            #Magic number for initial smoothing, but it works
            #(1.0 / 3 keeps the cube root under Python 2, where 1/3 == 0)
            initial_smoothing_value = (
                (sectstop - sectstart + 1)**(1.0 / 3)) + 10

            peak_dict['sections'][sect][
                'smoothing_factor'] = initial_smoothing_value

            logging.info("initial smoothing value: %.2f" %
                         initial_smoothing_value)
            fitter = SmoothingSpline(
                xvals,
                data,
                smoothing_factor=initial_smoothing_value,
                lossFunction="get_turn_penalized_residuals",
                threshold=threshold,
                num_reads=Nreads)

        elif algorithm == "gaussian":
            cts = list(map(float, cts))
            fitter = GaussMix(xvals, cts)

        elif algorithm == "classic":
            data = list(map(float, data))
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" %
                         fitter.smoothing_factor)
            peak_dict['sections'][sect][
                'final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                numpeaks = 0
            else:
                numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))

        except Exception as error:
            logging.error("peak finding failed:, %s, %s" %
                          (interval.name, error))
            raise error

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop

        #Need to get all ranges, count number of reads in each range and compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:

            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 genomic_start, genomic_stop,
                                                 strand)
            number_reads_in_peak = count_reads_in_interval(
                cur_interval, array_of_reads)

            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(
                    cur_interval, input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1

            logging.info("""Peak %d (%d - %d) has %d
                          reads""" % (peak_number, peak_start,
                                      (peak_stop + 1), number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            #super local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize,
                            len(wiggle))

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 interval.start + area_start,
                                                 interval.start + area_stop,
                                                 strand)
            number_reads_in_area = count_reads_in_interval(
                cur_interval, array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(
                Peak(
                    chrom=interval.chrom,
                    genomic_start=genomic_start,
                    genomic_stop=genomic_stop,
                    gene_name=interval.attrs['gene_id'],
                    strand=interval.strand,
                    thick_start=thick_start,
                    thick_stop=thick_stop,
                    peak_number=peak_number,
                    number_reads_in_peak=number_reads_in_peak,
                    size=peak_length,
                    p=0,
                    effective_length=int(interval.attrs['effective_length']),
                    peak_length=peak_length,
                    area_reads=number_reads_in_area,
                    area_size=area_length,
                    nreads_in_gene=nreads_in_gene,
                    #nreads_in_input=input_number_reads_in_peak,
                ))

            peak_number += 1
            peak_dict['sections'][sect]['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        v = sys.stdin.read(1)

    return peak_dict
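
Example no. 8's "binomial" method asks get_FDR_cutoff_binom for the smallest coverage height that is unlikely under a binomial null built from the read lengths and the effective gene length. A hedged sketch of that idea, assuming scipy; the helper name and the exact null (coverage at a base ~ Binomial(number of reads, mean read length / gene length)) are assumptions about intent, not the library's code:

from scipy.stats import binom

def binom_height_cutoff_sketch(readlengths, genelength, alpha=0.05):
    """Smallest height h with P(coverage >= h) < alpha under a binomial null."""
    if not readlengths:
        return 1
    n = len(readlengths)  # number of reads
    p = float(sum(readlengths)) / n / genelength  # chance a read covers a given base
    height = 1
    while binom.sf(height - 1, n, p) >= alpha:  # P(X >= height) still too large
        height += 1
    return height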
Example no. 9
def call_peaks(interval, gene_length, bam_file=None, max_gap=25,
               fdr_alpha=0.05, user_threshold=None, binom_alpha=0.05, method="binomial",
               min_reads=3, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000, 
               SloP=False, max_width=None, min_width=None,
               algorithm="spline", reverse_strand=False, input_bam=None):
    
    """

    calls peaks for an individual gene

    interval - gtf interval describing the gene to query
    takes bam file or bam file object.  Serial uses the object, parallel uses the location (name)
    max_gap - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from peaks script (called in setup)
    user_threshold - user defined FDR threshold (probably should be factored into fdr_alpha)

    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for the number of reads in a peak that gets called - might want to use a shifted distribution
    plotit - makes figures

    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super local calculation, distance left and right to look
    SloP - super local p-value instead of gene-wide p-value
    max_width - int maximum width of classic peak calling algorithm peak
    min_width - int min width of classic peak calling algorithm peak
    max_gap   - int max gap of classic peak calling algorithm peak

    """
    
    if plotit:
        plt.rcParams['interactive'] = True
        pass

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    #fixes non-standard chrom file names (without the chr)
    if not interval.chrom.startswith("chr"):
        interval.chrom = "chr" + interval.chrom

    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
    strand = str(interval.strand)
    if reverse_strand:
        if strand == "+":
            strand = "-"
        elif strand == "-":
            strand = "+"
    (wiggle, jxns, pos_counts,
     lengths, allreads) = readsToWiggle_pysam(subset_reads, interval.start,
                                              interval.stop, strand, "start", False)

    #This is the worst of hacks, need to factor out pysam eventually
    bam_fileobj = Robust_BAM_Reader(bam_file)
    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam: #if not None
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(input_bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads, interval.start, interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    lengths = [gene_length - 1 if read >= gene_length else read for read in lengths]

    if user_threshold is None:
        if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user

            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length, binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = interval

    peak_number = 0


    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:

        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom), sectstart + interval.start, sectstop + interval.start + 1,
                                         strand)

        Nreads = count_reads_in_interval(cur_interval, array_of_reads)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        peak_dict['sections'][sect] = {}
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" % (Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False
            continue
        else:
            logging.info("""Analyzing section %s with %d reads""" % (sect, Nreads))
            pass

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0, sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start
                cur_interval = HTSeq.GenomicInterval(str(interval.chrom), section_start, section_stop,strand )
                expanded_Nreads = get_reads_in_interval(cur_interval, array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                sect_read_lengths = [sect_length - 1 if read > sect_length else read for read in sect_read_lengths]

                if method == "binomial":  #Uses Binomial Distribution to get cutoff if specified by user
                    threshold = max(gene_threshold, get_FDR_cutoff_binom(sect_read_lengths, expanded_sect_length, binom_alpha))
                elif method == "random":
                    #use the more stringent (maximum) of the superlocal and gene-wide cutoffs
                    threshold = max(gene_threshold, get_FDR_cutoff_mean(readlengths=sect_read_lengths, genelength=expanded_sect_length, alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" %(threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        if user_threshold is None and SloP:
            #expanded_Nreads only exists on the super-local branch above
            peak_dict['sections'][sect]['expanded_Nreads'] = len(expanded_Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not exceed threshold, stopping")
            continue

        if algorithm == "spline":
            data = list(map(float, data))
            #Magic number for initial smoothing, but it works
            #(1.0 / 3 keeps the cube root under Python 2, where 1/3 == 0)
            initial_smoothing_value = ((sectstop - sectstart + 1)**(1.0 / 3)) + 10

            peak_dict['sections'][sect]['smoothing_factor'] = initial_smoothing_value

            logging.info("initial smoothing value: %.2f" % initial_smoothing_value)
            fitter = SmoothingSpline(xvals, data, smoothing_factor=initial_smoothing_value,
                            lossFunction="get_turn_penalized_residuals",
                            threshold=threshold,
                            num_reads=Nreads)

        elif algorithm == "gaussian":
            cts = list(map(float, cts))
            fitter = GaussMix(xvals, cts)

        elif algorithm == "classic":
            data = list(map(float, data))
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" % fitter.smoothing_factor)
            peak_dict['sections'][sect]['final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                numpeaks = 0
            else:
                numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))

        except Exception as error:
            logging.error("peak finding failed:, %s, %s" % (interval.name, error))
            raise error

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop

        #Need to get all ranges, count number of reads in each range and compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:

            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom), genomic_start, genomic_stop,
                                         strand)
            number_reads_in_peak = count_reads_in_interval(cur_interval, array_of_reads)

            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(cur_interval, input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1

            logging.info("""Peak %d (%d - %d) has %d
                          reads""" % (peak_number, peak_start,
                                     (peak_stop + 1), number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)


            #super local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize, len(wiggle))

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom), interval.start + area_start, interval.start + area_stop,
                                         strand)
            number_reads_in_area = count_reads_in_interval(cur_interval, array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(Peak(chrom=interval.chrom,
                                              genomic_start=genomic_start,
                                              genomic_stop=genomic_stop,
                                              gene_name=interval.attrs['gene_id'],
                                              strand=interval.strand,
                                              thick_start=thick_start,
                                              thick_stop=thick_stop,
                                              peak_number=peak_number,
                                              number_reads_in_peak=number_reads_in_peak,
                                              size=peak_length,
                                              p=0,
                                              effective_length=int(interval.attrs['effective_length']),
                                              peak_length=peak_length,
                                              area_reads=number_reads_in_area,
                                              area_size=area_length,
                                              nreads_in_gene=nreads_in_gene,
                                              #nreads_in_input=input_number_reads_in_peak,
                                              ))

            peak_number += 1
            peak_dict['sections'][sect]['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        v = sys.stdin.read(1)

    return peak_dict
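
Examples no. 8 and 9 count reads with count_reads_in_interval over the structure returned by read_array. A hypothetical reading of that helper, assuming the reads live in an HTSeq GenomicArrayOfSets keyed by read: union the sets seen on every step of the query interval, then count the distinct members. The sketch below is an assumption about intent, not the real implementation:

def count_reads_in_interval_sketch(iv, reads_array):
    """Count distinct reads whose stored interval overlaps iv
    (reads_array is assumed to be an HTSeq.GenomicArrayOfSets)."""
    hits = set()
    for step_iv, step_reads in reads_array[iv].steps():
        hits |= step_reads  # accumulate the read ids seen on each step
    return len(hits)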
Example no. 10
def call_peaks(interval,
               gene_length,
               bam_file=None,
               max_gap=25,
               fdr_alpha=0.05,
               user_threshold=None,
               binom_alpha=0.05,
               method="binomial",
               min_reads=3,
               poisson_cutoff=0.05,
               plotit=False,
               w_cutoff=10,
               windowsize=1000,
               SloP=False,
               max_width=None,
               min_width=None,
               algorithm="spline",
               reverse_strand=False,
               exons=None):
    """

    calls peaks for an individual gene

    interval - gtf interval describing the gene to query
    takes bam file or bam file object.  Serial uses the object, parallel uses the location (name)
    max_gap - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from peaks script (called in setup)
    user_threshold - user defined FDR threshold (probably should be factored into fdr_alpha)

    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for the number of reads in a peak that gets called - might want to use a shifted distribution
    plotit - makes figures

    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super local calculation, distance left and right to look
    SloP - super local p-value instead of gene-wide p-value (+/- 500 bp of each section)
    max_width - int maximum width of classic peak calling algorithm peak
    min_width - int min width of classic peak calling algorithm peak
    max_gap   - int max gap of classic peak calling algorithm peak

    returns peak_dict, a dictionary containing
     peak_dict['clusters']: list of Peak objects
     peak_dict['sections']: key: section
        ['nreads'] how many reads in this section
        ['threshold'] = threshold  # either the superlocal threshold, the mRNA threshold or the pre-mRNA threshold
        ['tried'] = True
        ['nPeaks'] = number of peaks
     peak_dict['nreads']: number of reads in gene
     peak_dict['threshold']
     peak_dict['loc']: interval
     peak_dict['Nclusters']: total peaks in transcript
    """
    ###########################################################################
    # print("starting call_peaks on gene_no:", gene_no, "interval:", interval)
    # genecallpeaksloggingperiode = 100
    # should_log_gene_call_peaks_this_time = (gene_no % genecallpeaksloggingperiode == 0)
    ###########################################################################
    # if should_log_gene_call_peaks_this_time:
    #    logging.info(" starting call_peaks on gene_no {}".format(gene_no))
    ###########################################################################

    if plotit:
        plt.rcParams['interactive'] = True
        pass

    bam_fileobj = pysam.Samfile(bam_file, 'rb')

    # fixes non-standard chrom file names (without the chr)
    if not interval.chrom.startswith("chr") and not interval.chrom.startswith(
            "ERCC") and not interval.chrom.startswith("phiX"):
        interval.chrom = "chr" + interval.chrom

    # fetch reads in the genomic region
    subset_reads = list(
        bam_fileobj.fetch(reference=str(interval.chrom),
                          start=interval.start,
                          end=interval.stop))
    strand = str(interval.strand)
    if reverse_strand:
        if strand == "+":
            strand = "-"
        elif strand == "-":
            strand = "+"

    # convert pysam to a wiggle vector, junction, positional count(coverage), read lengths, all_reads, location
    (wiggle, jxns, pos_counts, lengths, allreads,
     read_locations) = readsToWiggle_pysam(subset_reads, interval.start,
                                           interval.stop, strand, "start",
                                           False)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    lengths = [
        gene_length - 1 if read >= gene_length else read for read in lengths
    ]

    # pre-mRNA Threshold
    if user_threshold is None:
        if method == "binomial":  # Uses Binomial Distribution to get cutoff if specified by user
            # print(len(lengths), gene_length, binom_alpha)
            premRNA_threshold = get_FDR_cutoff_binom(lengths, gene_length,
                                                     binom_alpha)
            # print(premRNA_threshold)
        elif method == "random":
            premRNA_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                    genelength=gene_length,
                                                    alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        premRNA_threshold = user_threshold

    # mRNA Threshold
    exons = pybedtools.BedTool(exons)
    exons = exons.filter(
        lambda x: x.name == interval.attrs['gene_id']).saveas()

    total_exonic_reads = []
    total_exonic_length = 0
    htseq_exons = HTSeq.GenomicArrayOfSets(chroms="auto", stranded=False)

    for exon, exon_interval in zip(exons, bed_to_genomic_interval(exons)):
        exon.stop += 1
        exonic_reads = get_reads_in_interval_pysam(exon, interval.start,
                                                   read_locations)

        exon_read_lengths = read_lengths_from_pysam(exonic_reads)
        exon_read_lengths = [
            exon_interval.length - 1 if read > exon_interval.length else read
            for read in exon_read_lengths
        ]
        total_exonic_reads += exon_read_lengths
        total_exonic_length += exon_interval.length
        htseq_exons[exon_interval] += 'exon'

    mRNA_threshold = get_FDR_cutoff_binom(total_exonic_reads,
                                          total_exonic_length, binom_alpha)
    if not isinstance(premRNA_threshold, int):
        raise TypeError

    # these are what is built in this dict, complicated enough that it might
    # be worth turning into an object
    peak_dict = {}
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = premRNA_threshold
    peak_dict['loc'] = interval

    peak_number = 0

    sections = find_sections(
        wiggle,
        max_gap)  # list of contiguous runs of nonzero coverage (gaps up to max_gap allowed)
    if plotit:
        plot_sections(wiggle, sections, premRNA_threshold)

    # for each section, call peaks
    for sect in sections:

        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        # make interval for the section
        cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                             sectstart + interval.start,
                                             sectstop + interval.start + 1,
                                             strand)

        # Logic to use variable thresholds for exons or introns, still superseded by superLocal logic
        overlaps_exon = len(
            reduce(set.union,
                   (val for iv, val in htseq_exons[cur_interval].steps()))) > 0
        gene_threshold = mRNA_threshold if overlaps_exon else premRNA_threshold

        # maybe make a function that takes a genomic interval and converts it into a pybedtools interval
        bed_format = [
            interval.chrom, sectstart + interval.start,
            sectstop + interval.start + 1, interval.name, interval.score,
            strand
        ]
        bed_format = list(map(str, bed_format))
        cur_pybedtools_interval = pybedtools.create_interval_from_list(
            bed_format)

        Nreads = count_reads_in_interval_pysam(cur_pybedtools_interval,
                                               interval.start, read_locations)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        peak_dict['sections'][sect] = {}
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        # makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" %
                         (Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False
            continue
        else:
            logging.info("""Analyzing section %s with %d reads""" %
                         (sect, Nreads))
            pass

        if user_threshold is None:
            if SloP:
                # super local p-value: section +/- 500 bp; instead of using the whole gene's length and reads, use this extended region
                half_width = 500
                section_start = max(
                    0, sectstart + interval.start -
                    half_width)  # aim at -500 offset from the section start
                section_stop = sectstop + interval.start + 1 + half_width  # aim at +500 from the section stop
                expanded_sect_length = section_stop - section_start

                bed_format = [
                    interval.chrom, section_start, section_stop, interval.name,
                    interval.score, strand
                ]
                bed_format = list(map(str, bed_format))
                cur_pybedtools_interval = pybedtools.create_interval_from_list(
                    bed_format)

                expanded_Nreads = get_reads_in_interval_pysam(
                    cur_pybedtools_interval, interval.start, read_locations)
                sect_read_lengths = read_lengths_from_pysam(expanded_Nreads)
                sect_read_lengths = [
                    sect_length - 1 if read > sect_length else read
                    for read in sect_read_lengths
                ]
                peak_dict['sections'][sect]['expanded_Nreads'] = len(
                    expanded_Nreads)

                if method == "binomial":  # Uses Binomial Distribution to get cutoff if specified by user
                    slop_threshold = get_FDR_cutoff_binom(
                        readlengths=sect_read_lengths,
                        genelength=expanded_sect_length,
                        alpha=binom_alpha)
                elif method == "random":
                    # superlocal cutoff; the more stringent of this and the gene-wide cutoff is taken below
                    slop_threshold = get_FDR_cutoff_mean(
                        readlengths=sect_read_lengths,
                        genelength=expanded_sect_length,
                        alpha=fdr_alpha)
                else:
                    raise ValueError("Method %s does not exist" % (method))
                threshold = max(gene_threshold, slop_threshold)

                logging.info("Using super-local threshold %d" % (threshold))

            else:
                # if not use super local threshold (+/- 500 bp), use mRNA_threshold for exon; premRNA_threshold if section does not overlap with exon
                threshold = gene_threshold
        else:
            threshold = user_threshold

        # saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not exceed threshold, stopping")
            continue

        if algorithm == "spline":
            data = list(map(float, data))
            # Magic number for initial smoothing, but it works
            initial_smoothing_value = (
                (sectstop - sectstart + 1)**(1 / 3)) + 10

            peak_dict['sections'][sect][
                'smoothing_factor'] = initial_smoothing_value

            logging.info("initial smoothing value: %.2f" %
                         initial_smoothing_value)
            fitter = SmoothingSpline(
                xvals,
                data,
                smoothing_factor=initial_smoothing_value,
                lossFunction="get_turn_penalized_residuals",
                threshold=threshold,
                num_reads=Nreads)

        elif algorithm == "gaussian":
            cts = list(map(float, cts))
            fitter = GaussMix(xvals, cts)

        elif algorithm == "classic":
            data = list(map(float, data))
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" %
                         fitter.smoothing_factor)
            peak_dict['sections'][sect][
                'final_smoothing_factor'] = fitter.smoothing_factor
            if peak_definitions is None:
                numpeaks = 0
            else:
                numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))

        except Exception as error:
            logging.error("peak finding failed:, %s, %s" %
                          (interval.name, error))
            raise error

        # subsections that are above threshold
        # peak center is actually the location where we think binding should
        # occur, not the average of start and stop

        # Need to get all ranges, count number of reads in each range and compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:
            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            # save to bedtool
            bed_format = [
                interval.chrom, genomic_start, genomic_stop, interval.name,
                interval.score, strand
            ]
            bed_format = list(map(str,
                                  bed_format))  # create_interval_only_take_str
            cur_pybedtools_interval = pybedtools.create_interval_from_list(
                bed_format)

            number_reads_in_peak = count_reads_in_interval_pysam(
                cur_pybedtools_interval, interval.start, read_locations)

            peak_length = genomic_stop - genomic_start + 1

            logging.info("""Peak %d (%d - %d) has %d
                          reads""" % (peak_number, peak_start,
                                      (peak_stop + 1), number_reads_in_peak))

            # highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            # makes it thicker so we can see on the browser
            # error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            # super local logic
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize,
                            len(wiggle))

            bed_format = [
                interval.chrom, interval.start + area_start,
                interval.start + area_stop, interval.name, interval.score,
                strand
            ]
            bed_format = list(map(str, bed_format))
            cur_pybedtools_interval = pybedtools.create_interval_from_list(
                bed_format)

            number_reads_in_area = count_reads_in_interval_pysam(
                cur_pybedtools_interval, interval.start, read_locations)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(
                Peak(
                    chrom=interval.chrom,
                    genomic_start=genomic_start,
                    genomic_stop=genomic_stop,
                    gene_name=interval.attrs['gene_id'],
                    strand=interval.strand,
                    thick_start=thick_start,
                    thick_stop=thick_stop,
                    peak_number=peak_number,
                    number_reads_in_peak=number_reads_in_peak,
                    size=peak_length,
                    p=0,
                    effective_length=int(interval.attrs['effective_length']),
                    peak_length=peak_length,
                    area_reads=number_reads_in_area,
                    area_size=area_length,
                    nreads_in_gene=nreads_in_gene,
                    # nreads_in_input=input_number_reads_in_peak,
                ))

            peak_number += 1
            peak_dict['sections'][sect]['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
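        # block on a keypress so the figure stays on screen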
        v = sys.stdin.read(1)

    return peak_dict
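
For orientation, here is a minimal sketch (my own, not part of the snippet above) of the coordinate arithmetic the peak loop performs: peak positions are section-relative, so they are offset by the interval start plus the section start, and the thickStart/thickStop browser fields are the peak center widened by 2 bp and clamped to the peak boundaries. The function name and argument names are hypothetical; pybedtools is assumed to be installed.

import pybedtools

def peak_to_bed_interval(chrom, interval_start, sectstart,
                         peak_start, peak_stop, peak_center,
                         name="peak", score="0", strand="+"):
    # map section-relative coordinates to genomic coordinates
    genomic_start = interval_start + sectstart + peak_start
    genomic_stop = interval_start + sectstart + peak_stop

    # widen the peak center by +/- 2 bp so it is visible in a genome
    # browser, then clamp so the BED record stays well-formed
    genomic_center = interval_start + sectstart + peak_center
    thick_start = max(genomic_center - 2, genomic_start)
    thick_stop = min(genomic_center + 2, genomic_stop)

    fields = [chrom, genomic_start, genomic_stop, name, score, strand,
              thick_start, thick_stop]
    # create_interval_from_list only accepts strings
    return pybedtools.create_interval_from_list(list(map(str, fields)))
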
Esempio n. 11
0
def peaks_from_info(wiggle,
                    pos_counts,
                    lengths,
                    loc,
                    gene_length,
                    margin=25,
                    fdr_alpha=0.05,
                    user_threshold=None,
                    minreads=20,
                    poisson_cutoff=0.05,
                    plotit=False,
                    width_cutoff=10,
                    windowsize=1000,
                    SloP=False,
                    correct_p=False):
    """
    
    same args as before 
    wiggle is converted from bam file
    pos_counts - one point per read instead of coverage of entire read
    lengths - lengths aligned portions of reads 
    rest are the same fix later


    calls peaks for an individual gene 
    

    gene_length - effective length of gene
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value bonferoni correct from peaks script (called in setup)
    user_threshold - user defined FDR thershold (probably should be factored into fdr_alpha
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for signifance cut off for number of reads in peak that gets called - might want to use ashifted distribution
    plotit - makes figures 
    
    w_cutoff - width cutoff, peaks narrower than this are discarted 
    windowssize - for super local calculation distance left and right to look 
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean bonferoni correction of p-values from poisson
        
    """

    peak_dict = {}

    #this is what gets built in this dict; complicated enough that it might
    #be worth turning into an object
    #peak_dict['clusters'] = {}
    #peak_dict['sections'] = {}
    #peak_dict['nreads'] = int()
    #peak_dict['threshold'] = int()
    #peak_dict['loc'] = loc

    #data munging
    chrom, gene_name, tx_start, tx_end, signstrand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]

    #used for the Poisson calculation
    nreads_in_gene = sum(pos_counts)

    #decides the FDR calculation; maybe move get_FDR_cutoff_mean into C code

    if user_threshold is None:
        gene_threshold = get_FDR_cutoff_mean(lengths,
                                             gene_length,
                                             alpha=fdr_alpha)

    else:
        gene_threshold = user_threshold

    if gene_threshold == "best_error":
        #verboseprint("""I had a hard time with this one: %s.
        #                I think I'll use a threshold of 50""" % (loc))

        threshold = 50

    peak_dict['clusters'] = {}
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = loc
    peakn = 1

    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        #gets a random subset of read lengths for calculations on a section
        #not exactly the right way to do this, but it should be very close.
        sect_read_lengths = rs(lengths, Nreads)
        peak_dict['sections'][sect] = {}
        threshold = int()

        #makes sure there are enough reads
        if Nreads < minreads:
            #verboseprint("""%d is not enough reads, skipping section:
            #                %s""" % (Nreads, sect))
            continue

        else:
            pass
            #verboseprint("""Analyzing section %s with %d reads"""
            #              % (sect, Nreads))

        #sets the super-local threshold if requested, might be able to factor this
        if user_threshold is None:
            if SloP is True:
                #use the minimum FDR cutoff between super-local and gene-wide calculations
                threshold = min(
                    gene_threshold,
                    get_FDR_cutoff_mean(sect_read_lengths,
                                        sect_length,
                                        alpha=fdr_alpha))
                #verboseprint("Using super-local threshold %d" % (threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #if the wiggle track never exceeds the threshold
        if max(data) < threshold:
            #verboseprint("data does not exceed threshold, skipping")
            continue

        #spline-fitting logic, black magic
        try:
            degree = 3  #cubic spline
            weights = None

            #for very large windows with many reads a large smoothing
            #parameter is required.  test several different options
            #to determine a reasonable initial estimate
            #Goal is to find the optimal smoothing parameter in multiple steps
            #initial_smoothing_value is the initial estimate of the smoothing parameter
            #step 1, identify a good initial value
            initial_smoothing_value = (sectstop - sectstart + 1)
            best_smoothing_value = initial_smoothing_value
            best_estimate = 1

            #step 2, refine so as not to run into local minima later,
            #try to come up with a good way of getting the optimal parameter
            best_error = find_spline_residuals(initial_smoothing_value, xvals,
                                               data, degree, weights)

            for i in range(2, 11):
                cur_smoothing_value = initial_smoothing_value * i

                #tries to find the optimal initial smoothing parameter in this loop
                cur_error = find_spline_residuals(cur_smoothing_value, xvals,
                                                  data, degree, weights)
                if cur_error < best_error:
                    best_error = cur_error
                    best_smoothing_value = cur_smoothing_value
                    best_estimate = i

            try:
                #fine optimization of the smoothing parameter
                cutoff = float(0)
                tries = 0

                # shouldn't get smoothing coefficients this small.. increase
                #the initial estimate and try again. WARNING: BLACK MAGIC
                while cutoff < 5:
                    tries += 1

                    # increasing this may improve accuracy,
                    #but at the cost of running time.
                    if tries == 3:
                        break

                    spline = optimize.minimize(
                        find_spline_residuals,
                        best_smoothing_value,
                        args=(xvals, data, degree, weights),
                        options={
                            'disp': False,
                            'maxiter': 10,
                        },
                        #method="Powell", # old method
                        method="L-BFGS-B",  #abnormal termination sometimes
                        #method="COBYLA",
                        bounds=((.1, None), ),
                    )

                    #fit a smoothing spline using an optimal parameter
                    #for smoothing and with weights proportional to the
                    #number of reads aligned at each position if weights
                    #is set
                    if spline.success:
                        cutoff = spline.x
                        #print "cutoff is %s" % (cutoff)
                    else:
                        #print "%s failed spline building at section %s" % (loc, sect)
                        #print spline.message
                        pass

                    best_smoothing_value += sect_length
            except Exception as error:
                print >> sys.stderr, "best smoothing value is:", best_smoothing_value
                print >> sys.stderr, "%s failed spline fitting at section %s (major crash)" % (
                    loc, sect)
                print >> sys.stderr, error
                continue

            #verboseprint ("optimized smoothing parameter")
            #if we are going to save and output as a pickle fi is %s" %(str(cutoff))
            #final fit spline

            spline = find_univariate_spline(cutoff, xvals, data, degree,
                                            weights)

            spline_values = array([round(x) for x in spline(xvals)])
            if plotit is True:
                plot_spline(spline, data, xvals, peakn, threshold)

            starts_and_stops, starts, stops = get_regions_above_threshold(
                threshold, spline_values)

            #walks along the spline and calls peaks:
            #for each start, take the next stop and find the peak
            #between the start and the stop. This is where I need to
            #fix things: some peak starts begin right after another start,
            #but not on top of it; make sure the next start is after the
            #previous stop

            #subsections that are above threshold
            for p_start, p_stop in starts_and_stops:

                #peaks with-in this subsection, indexed from section
                #(not subsection) start
                #find all local maxima
                peaks = [
                    x + p_start
                    for x in xvals[find_local_maxima(spline_values[p_start:(
                        p_stop + 1)])]
                ]
                #map(lambda x: x + p_start,
                #            xvals[diff(sign(diff(spline(xvals[p_start:(p_stop + 1)])))) < 0])

                #there should be one or zero peaks in every section
                assert len(peaks) in (0, 1)

                #handles logic if there are multiple peaks between
                #start and stop
                if len(peaks) <= 0:
                    continue
                if len(peaks) == 1:
                    #TODO All this formatting logic doesn't belong here
                    #and should be simplified
                    #gets reads in peak
                    n_reads_in_peak = sum(cts[p_start:(p_stop + 1)])
                    #verboseprint(""""Peak %d (%d - %d) has %d
                    #                 reads""" % (peakn,
                    #                             p_start,
                    #                             (p_stop + 1),
                    #                             n_reads_in_peak))

                    #makes sure there are enough reads
                    if (n_reads_in_peak < minreads
                            or max(data[p_start:(p_stop + 1)]) < threshold):
                        #    verboseprint("""skipping peak, %d is not enough reads"""
                        #                  % (n_reads_in_peak))
                        continue

                    #formatting of bed track
                    #start and stop for bed track to be created
                    g_start = tx_start + sectstart + p_start
                    g_stop = tx_start + sectstart + p_stop

                    #highest point in start stop
                    peak = tx_start + sectstart + peaks[0]

                    #makes it thicker so we can see on the browser
                    thick_start = peak - 2
                    thick_stop = peak + 2

                    #error checking logic to keep bed files from breaking
                    if thick_start < g_start:
                        thick_start = g_start
                    if thick_stop > g_stop:
                        thick_stop = g_stop

                    peak_length = g_stop - g_start + 1

                    #skip really small peaks
                    if peak_length < width_cutoff:
                        continue
                    peak_name = gene_name + "_" + str(peakn) + "_" + str(
                        int(n_reads_in_peak))

                    #super local logic
                    #error check to make sure the area is within the gene

                    #distance from gene start
                    if peak - tx_start - windowsize < 0:
                        area_start = 0

                    #for super local gets area around peak for calculation
                    else:
                        area_start = peak - tx_start - windowsize
                        #area_start = sectstart

                    #same thing except for the end of the gene instead of the start
                    if peak + windowsize > tx_end:  #distance to gene stop
                        area_stop = tx_end - tx_start + 1
                    else:
                        area_stop = peak - tx_start + windowsize
                        #area_stop = sectstop

                    #use area reads + 1/2 all other reads in gene:
                    #area_reads = sum(pos_counts[area_start:area_stop]) +
                    #0.5*(sum(pos_counts) -
                    #sum(pos_counts[area_start:area_stop]))

                    #use area reads:
                    area_reads = sum(pos_counts[area_start:area_stop])
                    area_size = area_stop - area_start + 1

                    #area_reads = sum(pos_counts[sectstart:sectstop])
                    #area_size = sect_length

                    #calculates a Poisson p-value based on the whole gene vs the peak
                    gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak,
                                           gene_length, peak_length)
                    if SloP is True:
                        #same thing except for based on super local p-value
                        slop_pois_p = poissonP(area_reads, n_reads_in_peak,
                                               area_size, peak_length)

                    #makes sure slop_pois_p is defined, even if it's
                    #just the gene-wide value; something to be removed later,
                    #slop should only be used when defined as true
                    else:
                        slop_pois_p = gene_pois_p

                    if math.isnan(slop_pois_p):
                        slop_pois_p = 1

                    #remove later
                    if slop_pois_p > poisson_cutoff:
                        #continue
                        pass

                    #defines the bedline of a peak for returning
                    #TODO This should be abstracted out for now... separate model from view
                    bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                        chrom, g_start, g_stop, peak_name, slop_pois_p,
                        signstrand, thick_start, thick_stop)

                    #metadata for the specific bedline
                    peak_dict['clusters'][bedline] = {}
                    peak_dict['clusters'][bedline]['GeneP'] = gene_pois_p
                    peak_dict['clusters'][bedline]['SloP'] = slop_pois_p
                    peak_dict['clusters'][bedline]['Nreads'] = n_reads_in_peak
                    peak_dict['clusters'][bedline]['size'] = peak_length

                    peakn += 1

                #there is more than one peak in this window
                #NO LONGER NECESSARY, SHOULD REMOVE
                else:
                    #this handles peaks within peaks logic

                    #local minima in subsection, relative to section start
                    valleys = array(
                        map(
                            lambda x: x + p_start, xvals[diff(
                                sign(diff(spline(xvals[p_start:p_stop +
                                                       1])))) > 0]))

                    for subpeak in peaks:
                        subpeak_start = int()
                        subpeak_stop = int()

                        if any(valleys < subpeak):
                            subpeak_start = valleys[valleys < subpeak][-1]
                        else:
                            subpeak_start = starts[starts < subpeak][-1]

                        if any(valleys > subpeak):
                            subpeak_stop = valleys[valleys > subpeak][0]
                        else:
                            subpeak_stop = stops[stops > subpeak][0]
                        peak_length = subpeak_stop - subpeak_start + 1

                        if peak_length < width_cutoff:  #skip really small peaks
                            continue
                        n_reads_in_peak = sum(cts[subpeak_start:(subpeak_stop +
                                                                 1)])

                        if (n_reads_in_peak < minreads
                                or max(data[subpeak_start:(subpeak_stop + 1)])
                                < threshold):
                            continue

                        g_start = tx_start + subpeak_start + sectstart
                        g_stop = tx_start + subpeak_stop + sectstart
                        peak = tx_start + subpeak + sectstart
                        thick_start = peak - 2

                        if thick_start < g_start:
                            thick_start = g_start
                        thick_stop = peak + 2

                        if thick_stop > g_stop:
                            thick_stop = g_stop
                        peak_name = "%s_%s_%s" % (gene_name, peakn,
                                                  int(n_reads_in_peak))

                        #distance from gene start
                        if peak - tx_start - windowsize < 0:
                            area_start = 0
                        else:
                            area_start = peak - tx_start - windowsize

                        if peak + windowsize > tx_end:  #distance to gene stop
                            area_stop = tx_end - tx_start + 1
                        else:
                            #area_stop = sectstop
                            area_stop = peak - tx_start + windowsize

                        area_reads = sum(pos_counts[area_start:area_stop])
                        area_size = area_stop - area_start + 1

                        gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak,
                                               gene_length, peak_length)

                        if SloP is True:
                            slop_pois_p = poissonP(area_reads, n_reads_in_peak,
                                                   area_size, peak_length)
                        else:
                            slop_pois_p = gene_pois_p

                        if math.isnan(slop_pois_p):
                            slop_pois_p = 1

                        #leave these in to allow for BH p-value correction
                        if slop_pois_p > poisson_cutoff:
                            pass

                        #output results again
                        bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                            chrom, g_start, g_stop, peak_name, slop_pois_p,
                            signstrand, thick_start, thick_stop)

                        peak_dict['clusters'][bedline] = {}
                        peak_dict['clusters'][bedline]['SloP'] = slop_pois_p
                        peak_dict['clusters'][bedline]['GeneP'] = gene_pois_p
                        peak_dict['clusters'][bedline][
                            'Nreads'] = n_reads_in_peak
                        peak_dict['clusters'][bedline]['size'] = peak_length
                        peakn += 1
        except NameError as error:
            print >> sys.stderr, error
            print >> sys.stderr, "spline fitting failed for %s" % (loc)
            raise

    #inflate p-values based on the number of comparisons (Bonferroni correction)
    if correct_p is True:
        for peak in peak_dict['clusters']:
            peak_dict['clusters'][peak]['p'] = peak_dict['clusters'][peak][
                'p'] * peakn  #Bonferroni-correct the p-value for multiple hypothesis testing

    peak_dict['Nclusters'] = peakn

    return peak_dict
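
poissonP is not defined in these snippets. As a hedged sketch of what it plausibly computes (the upper-tail Poisson probability of seeing at least the observed number of reads in a peak, with the expected count given by the background rate scaled to the peak length), assuming scipy is available; the function name is mine:

from scipy.stats import poisson

def poisson_p_sketch(background_reads, reads_in_peak,
                     background_length, peak_length):
    # expected reads in the peak under the null: background rate
    # (reads per base) times the peak length
    mu = (float(background_reads) / background_length) * peak_length
    # P(X >= reads_in_peak) for X ~ Poisson(mu);
    # sf(k - 1) = 1 - cdf(k - 1) = P(X >= k)
    return poisson.sf(reads_in_peak - 1, mu)

Under this reading, the gene-wide call above would correspond to
poisson_p_sketch(nreads_in_gene, n_reads_in_peak, gene_length, peak_length),
and the super-local call to
poisson_p_sketch(area_reads, n_reads_in_peak, area_size, peak_length).
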
Esempio n. 12
0
def peaks_from_info(
    wiggle,
    pos_counts,
    lengths,
    loc,
    gene_length,
    margin=25,
    fdr_alpha=0.05,
    user_threshold=None,
    minreads=20,
    poisson_cutoff=0.05,
    plotit=False,
    width_cutoff=10,
    windowsize=1000,
    SloP=False,
    correct_p=False,
):

    """
    
    same args as before 
    wiggle is converted from bam file
    pos_counts - one point per read instead of coverage of entire read
    lengths - lengths aligned portions of reads 
    rest are the same fix later


    calls peaks for an individual gene 
    

    gene_length - effective length of gene
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value bonferoni correct from peaks script (called in setup)
    user_threshold - user defined FDR thershold (probably should be factored into fdr_alpha
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for signifance cut off for number of reads in peak that gets called - might want to use ashifted distribution
    plotit - makes figures 
    
    w_cutoff - width cutoff, peaks narrower than this are discarted 
    windowssize - for super local calculation distance left and right to look 
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean bonferoni correction of p-values from poisson
        
    """

    peak_dict = {}

    # this is what gets built in this dict; complicated enough that it might
    # be worth turning into an object
    # peak_dict['clusters'] = {}
    # peak_dict['sections'] = {}
    # peak_dict['nreads'] = int()
    # peak_dict['threshold'] = int()
    # peak_dict['loc'] = loc

    # data munging
    chrom, gene_name, tx_start, tx_end, signstrand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]

    # used for the Poisson calculation
    nreads_in_gene = sum(pos_counts)

    # decides the FDR calculation; maybe move get_FDR_cutoff_mean into C code

    if user_threshold is None:
        gene_threshold = get_FDR_cutoff_mean(lengths, gene_length, alpha=fdr_alpha)

    else:
        gene_threshold = user_threshold

    if gene_threshold == "best_error":
        # verboseprint("""I had a hard time with this one: %s.
        #                I think I'll use a threshold of 50""" % (loc))

        threshold = 50

    peak_dict["clusters"] = {}
    peak_dict["sections"] = {}
    peak_dict["nreads"] = int(nreads_in_gene)
    peak_dict["threshold"] = gene_threshold
    peak_dict["loc"] = loc
    peakn = 1

    # verboseprint("Testing %s" % (loc))
    # verboseprint("Gene threshold is: %d" % (gene_threshold))

    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart : (sectstop + 1)]
        cts = pos_counts[sectstart : (sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        # gets a random subset of read lengths for calculations on a section
        # not exactly the right way to do this, but it should be very close.
        sect_read_lengths = rs(lengths, Nreads)
        peak_dict["sections"][sect] = {}
        threshold = int()

        # makes sure there are enough reads
        if Nreads < minreads:
            # verboseprint("""%d is not enough reads, skipping section:
            #                %s""" % (Nreads, sect))
            continue

        else:
            pass
            # verboseprint("""Analyzing section %s with %d reads"""
            #              % (sect, Nreads))

        # sets the super-local threshold if requested, might be able to factor this
        if user_threshold is None:
            if SloP is True:
                threshold = get_FDR_cutoff_mean(sect_read_lengths, sect_length, alpha=fdr_alpha)
                # verboseprint("Using super-local threshold %d" % (threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        # saves threshold for each individual section
        peak_dict["sections"][sect]["threshold"] = threshold
        peak_dict["sections"][sect]["nreads"] = int(Nreads)

        # if the wiggle track never exceeds the threshold
        if max(data) < threshold:
            # verboseprint("data does not exceed threshold, skipping")
            continue

        # spline-fitting logic, black magic
        try:
            degree = 3  # cubic spline
            weights = None

            # for very large windows with many reads a large smoothing
            # parameter is required.  test several different options
            # to determine a reasonable initial estimate
            # Goal is to find the optimal smoothing parameter in multiple steps
            # initial_smoothing_value is the initial estimate of the smoothing parameter
            # step 1, identify a good initial value
            initial_smoothing_value = sectstop - sectstart + 1
            best_smoothing_value = initial_smoothing_value
            best_estimate = 1

            # step 2, refine so as not to run into local minima later,
            # try to come up with a good way of getting the optimal parameter
            best_error = find_spline_residuals(initial_smoothing_value, xvals, data, degree, weights)

            for i in range(2, 11):
                cur_smoothing_value = initial_smoothing_value * i

                # tries to find the optimal initial smoothing parameter in this loop
                cur_error = find_spline_residuals(cur_smoothing_value, xvals, data, degree, weights)
                if cur_error < best_error:
                    best_error = cur_error
                    best_smoothing_value = cur_smoothing_value
                    best_estimate = i

            # verboseprint("""I'm using (region length) * %d as the
            #                initial estimate for the smoothing
            #                parameter""" % (best_estimate))

            try:
                # fine optimization of the smoothing parameter
                cutoff = float(0)
                tries = 0

                # shouldn't get smoothing coefficients this small.. increase
                # the initial estimate and try again. WARNING: BLACK MAGIC
                while cutoff < 5:
                    tries += 1

                    # increasing this may improve accuracy,
                    # but at the cost of running time.
                    if tries == 3:
                        break

                    spline = optimize.minimize(
                        find_spline_residuals,
                        best_smoothing_value,
                        args=(xvals, data, degree, weights),
                        options={"disp": False, "maxiter": 10},
                        # method="Powell", # old method
                        method="L-BFGS-B",  # abnormal termination sometimes
                        # method="COBYLA",
                        bounds=((0.1, None),),
                    )

                    # fit a smoothing spline using an optimal parameter
                    # for smoothing and with weights proportional to the
                    # number of reads aligned at each position if weights
                    # is set
                    if spline.success:
                        cutoff = spline.x
                        # print "cutoff is %s" % (cutoff)
                    else:
                        # print "%s failed spline building at section %s" % (loc, sect)
                        # print spline.message
                        pass

                    best_smoothing_value += sect_length
            except Exception as error:
                print >> sys.stderr, "best smoothing value is:", best_smoothing_value
                print >> sys.stderr, "%s failed spline fitting at section %s (major crash)" % (loc, sect)
                print >> sys.stderr, error
                continue

            # verboseprint ("optimized smoothing parameter")
            # if we are going to save and output as a pickle fi is %s" %(str(cutoff))
            # final fit spline

            spline = find_univariate_spline(cutoff, xvals, data, degree, weights)

            spline_values = array([round(x) for x in spline(xvals)])
            if plotit is True:
                plot_spline(spline, data, xvals, peakn, threshold)

            starts_and_stops, starts, stops = get_regions_above_threshold(threshold, spline_values)

            # walks along the spline and calls peaks:
            # for each start, take the next stop and find the peak
            # between the start and the stop. This is where I need to
            # fix things: some peak starts begin right after another start,
            # but not on top of it; make sure the next start is after the
            # previous stop

            # subsections that are above threshold
            for p_start, p_stop in starts_and_stops:

                # peaks with-in this subsection, indexed from section
                # (not subsection) start
                # find all local maxima
                peaks = [x + p_start for x in xvals[find_local_maxima(spline_values[p_start : (p_stop + 1)])]]
                # map(lambda x: x + p_start,
                #            xvals[diff(sign(diff(spline(xvals[p_start:(p_stop + 1)])))) < 0])

                if len(peaks) not in (0, 1):
                    # print gene_name
                    # print "spline ", spline(xvals)
                    # print "threshold: %s" % (threshold)
                    # print "full spline ", spline_values
                    # print "peaks", peaks
                    # print p_start, p_stop
                    # print starts_and_stops
                    # print "spline values", spline_values[p_start:(p_stop + 1)]
                    # print "peaks at in section", xvals[find_local_maxima(spline_values[p_start:(p_stop + 1)])]
                    assert len(peaks) in (0, 1)  # there should be one or zero peaks in every section

                # handles logic if there are multiple peaks between
                # start and stop
                if len(peaks) <= 0:
                    continue
                if len(peaks) == 1:

                    # gets reads in peak
                    n_reads_in_peak = sum(cts[p_start : (p_stop + 1)])
                    # verboseprint(""""Peak %d (%d - %d) has %d
                    #                 reads""" % (peakn,
                    #                             p_start,
                    #                             (p_stop + 1),
                    #                             n_reads_in_peak))

                    # makes sure there are enough reads
                    if n_reads_in_peak < minreads or max(data[p_start : (p_stop + 1)]) < threshold:
                        #    verboseprint("""skipping peak, %d is not enough reads"""
                        #                  % (n_reads_in_peak))
                        continue

                    # formatting of bed track
                    # start and stop for bed track to be created
                    g_start = tx_start + sectstart + p_start
                    g_stop = tx_start + sectstart + p_stop

                    # highest point in start stop
                    peak = tx_start + sectstart + peaks[0]

                    # makes it thicker so we can see on the browser
                    thick_start = peak - 2
                    thick_stop = peak + 2

                    # error checking logic to keep bed files from breaking
                    if thick_start < g_start:
                        thick_start = g_start
                    if thick_stop > g_stop:
                        thick_stop = g_stop

                    peak_length = g_stop - g_start + 1

                    # skip really small peaks
                    if peak_length < width_cutoff:
                        continue
                    peak_name = gene_name + "_" + str(peakn) + "_" + str(int(n_reads_in_peak))

                    # super local logic
                    # error check to make sure the area is within the gene

                    # distance from gene start
                    if peak - tx_start - windowsize < 0:
                        area_start = 0

                    # for super local gets area around peak for calculation
                    else:
                        area_start = peak - tx_start - windowsize
                        # area_start = sectstart

                    # same thing except for the end of the gene instead of the start
                    if peak + windowsize > tx_end:  # distance to gene stop
                        area_stop = tx_end - tx_start + 1
                    else:
                        area_stop = peak - tx_start + windowsize
                        # area_stop = sectstop

                    # use area reads + 1/2 all other reads in gene:
                    # area_reads = sum(pos_counts[area_start:area_stop]) +
                    # 0.5*(sum(pos_counts) -
                    # sum(pos_counts[area_start:area_stop]))

                    # use area reads:
                    area_reads = sum(pos_counts[area_start:area_stop])
                    area_size = area_stop - area_start + 1

                    # area_reads = sum(pos_counts[sectstart:sectstop])
                    # area_size = sect_length

                    # calculates a Poisson p-value based on the whole gene vs the peak
                    gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak, gene_length, peak_length)
                    if SloP is True:
                        # same thing except for based on super local p-value
                        slop_pois_p = poissonP(area_reads, n_reads_in_peak, area_size, peak_length)

                    # makes sure slop_pois_p is defined, even if it's
                    # just the gene-wide value; something to be removed later,
                    # slop should only be used when defined as true
                    else:
                        slop_pois_p = gene_pois_p

                    if math.isnan(slop_pois_p):
                        slop_pois_p = 1

                    # remove later
                    if slop_pois_p > poisson_cutoff:
                        # continue
                        pass

                    # defines the bedline of a peak for returning
                    # TODO This should be abstracted out for now... separate model from view
                    bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                        chrom,
                        g_start,
                        g_stop,
                        peak_name,
                        slop_pois_p,
                        signstrand,
                        thick_start,
                        thick_stop,
                    )

                    # metadata for the specific bedline
                    peak_dict["clusters"][bedline] = {}
                    peak_dict["clusters"][bedline]["GeneP"] = gene_pois_p
                    peak_dict["clusters"][bedline]["SloP"] = slop_pois_p
                    peak_dict["clusters"][bedline]["Nreads"] = n_reads_in_peak
                    peak_dict["clusters"][bedline]["size"] = peak_length

                    peakn += 1

                # there is more than one peak in this window
                # NO LONGER NECESSARY, SHOULD REMOVE
                else:
                    # this handles peaks within peaks logic

                    # local minima in subsection, relative to section start
                    valleys = array(
                        map(lambda x: x + p_start, xvals[diff(sign(diff(spline(xvals[p_start : p_stop + 1])))) > 0])
                    )

                    for subpeak in peaks:
                        subpeak_start = int()
                        subpeak_stop = int()

                        if any(valleys < subpeak):
                            subpeak_start = valleys[valleys < subpeak][-1]
                        else:
                            subpeak_start = starts[starts < subpeak][-1]

                        if any(valleys > subpeak):
                            subpeak_stop = valleys[valleys > subpeak][0]
                        else:
                            subpeak_stop = stops[stops > subpeak][0]
                        peak_length = subpeak_stop - subpeak_start + 1

                        if peak_length < width_cutoff:  # skip really small peaks
                            continue
                        n_reads_in_peak = sum(cts[subpeak_start : (subpeak_stop + 1)])

                        if n_reads_in_peak < minreads or max(data[subpeak_start : (subpeak_stop + 1)]) < threshold:
                            continue

                        g_start = tx_start + subpeak_start + sectstart
                        g_stop = tx_start + subpeak_stop + sectstart
                        peak = tx_start + subpeak + sectstart
                        thick_start = peak - 2

                        if thick_start < g_start:
                            thick_start = g_start
                        thick_stop = peak + 2

                        if thick_stop > g_stop:
                            thick_stop = g_stop
                        peak_name = "%s_%s_%s" % (gene_name, peakn, int(n_reads_in_peak))

                        # distance from gene start
                        if peak - tx_start - windowsize < 0:
                            area_start = 0
                        else:
                            area_start = peak - tx_start - windowsize

                        if peak + windowsize > tx_end:  # distance to gene stop
                            area_stop = tx_end - tx_start + 1
                        else:
                            # area_stop = sectstop
                            area_stop = peak - tx_start + windowsize

                        area_reads = sum(pos_counts[area_start:area_stop])
                        area_size = area_stop - area_start + 1

                        gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak, gene_length, peak_length)

                        if SloP is True:
                            slop_pois_p = poissonP(area_reads, n_reads_in_peak, area_size, peak_length)
                        else:
                            slop_pois_p = gene_pois_p

                        if math.isnan(slop_pois_p):
                            slop_pois_p = 1

                        # leave these in to allow for BH p-value correction
                        if slop_pois_p > poisson_cutoff:
                            pass

                        # output results again
                        bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                            chrom,
                            g_start,
                            g_stop,
                            peak_name,
                            slop_pois_p,
                            signstrand,
                            thick_start,
                            thick_stop,
                        )

                        peak_dict["clusters"][bedline] = {}
                        peak_dict["clusters"][bedline]["SloP"] = slop_pois_p
                        peak_dict["clusters"][bedline]["GeneP"] = gene_pois_p
                        peak_dict["clusters"][bedline]["Nreads"] = n_reads_in_peak
                        peak_dict["clusters"][bedline]["size"] = peak_length
                        peakn += 1
        except NameError as error:
            print >> sys.stderr, error
            print >> sys.stderr, "spline fitting failed for %s" % (loc)
            raise

    # inflate p-values based on the number of comparisons (Bonferroni correction)
    if correct_p is True:
        for peak in peak_dict["clusters"]:
            peak_dict["clusters"][peak]["p"] = (
                peak_dict["clusters"][peak]["p"] * peakn
            )  # Bonferroni-correct the p-value for multiple hypothesis testing

    peak_dict["Nclusters"] = peakn

    return peak_dict
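
The correct_p branch above multiplies each cluster p-value by the number of peaks (a Bonferroni correction), and the comments mention leaving non-significant peaks in "to allow for BH p-value correction". A minimal sketch of both corrections over a flat list of p-values, using numpy; this is my own illustration, not part of the snippet:

import numpy as np

def bonferroni(pvalues):
    # multiply each p-value by the number of tests, capped at 1
    p = np.asarray(pvalues, dtype=float)
    return np.minimum(p * len(p), 1.0)

def benjamini_hochberg(pvalues):
    # Benjamini-Hochberg step-up adjusted p-values (q-values)
    p = np.asarray(pvalues, dtype=float)
    n = len(p)
    order = np.argsort(p)
    scaled = p[order] * n / np.arange(1, n + 1)
    # enforce monotonicity from the largest p-value downwards
    scaled = np.minimum.accumulate(scaled[::-1])[::-1]
    adjusted = np.empty(n)
    adjusted[order] = np.minimum(scaled, 1.0)
    return adjusted

For example, the SloP values of all clusters could be corrected in one pass:
pvals = [peak_dict["clusters"][bed]["SloP"] for bed in peak_dict["clusters"]]
adjusted = benjamini_hochberg(pvals)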