Ejemplo n.º 1
0
def get_bam_coverage(bamfile):
    """
    Build a coverage array from the aligned reads of a bam file (this is slow).

    Every read yielded by Robust_BAM_Reader(bamfile) that is flagged as
    aligned contributes +1 over the reference interval of each of its
    cigar operations of type "M"; all other cigar operations are ignored.

    bamfile - bam file handed to Robust_BAM_Reader

    returns a stranded HTSeq.GenomicArray with integer typecode ("i")
    holding the accumulated counts
    """
    reader = Robust_BAM_Reader(bamfile)
    cov = HTSeq.GenomicArray("auto", typecode="i", stranded=True)
    for aln in reader:
        # reads without an alignment carry nothing to count
        if not aln.aligned:
            continue
        for op in aln.cigar:
            if op.type == "M":
                cov[op.ref_iv] += 1
    return cov
Ejemplo n.º 2
0
def call_peaks(interval,
               gene_length,
               bam_file=None,
               max_gap=25,
               fdr_alpha=0.05,
               user_threshold=None,
               binom_alpha=0.05,
               method="binomial",
               min_reads=3,
               poisson_cutoff=0.05,
               plotit=False,
               w_cutoff=10,
               windowsize=1000,
               SloP=False,
               max_width=None,
               min_width=None,
               algorithm="spline",
               reverse_strand=False,
               input_bam=None):
    """
    Calls peaks for an individual gene.

    Reads overlapping the interval are fetched from bam_file, converted to a
    per-position wiggle track, split into sections with find_sections and
    every section that has enough reads and exceeds its threshold is handed
    to the selected fitting algorithm. One Peak is recorded per peak found.

    interval - gene interval to query; its chrom, start, stop, strand, name
               and attrs ('gene_id', 'effective_length') are read. chrom is
               rewritten IN PLACE to carry a "chr" prefix when it lacks one
    gene_length - length of the gene (converted with int()), used for the
               gene-wide threshold
    bam_file - bam file name; it is opened here with pysam.Samfile and with
               Robust_BAM_Reader
    max_gap - gap handed to find_sections for splitting the wiggle track
               into sections; also passed to the classic algorithm
    fdr_alpha - alpha handed to get_FDR_cutoff_mean (method "random")
    user_threshold - user defined threshold; when given it replaces both the
               gene-wide and the super-local computed thresholds
    binom_alpha - alpha handed to get_FDR_cutoff_binom (method "binomial")
    method - "binomial" or "random", selects how the threshold is computed
    min_reads - min reads in a section to try and call peaks
    poisson_cutoff - accepted for interface compatibility, not used here
    plotit - makes figures and waits for one character on stdin at the end
    w_cutoff - accepted for interface compatibility, not used here
    windowsize - distance left and right of the peak center that defines the
               surrounding area whose reads are counted for each peak
    SloP - super local threshold: per section, use the larger of the
               gene-wide threshold and one computed on the section expanded
               by 500 positions on each side
    max_width - passed to the classic peak calling algorithm
    min_width - passed to the classic peak calling algorithm
    algorithm - "spline", "gaussian" or "classic"
    reverse_strand - swap "+" and "-" before counting reads
    input_bam - optional input bam file; reads in each peak are counted but
               the count is currently not stored on the Peak

    returns a dict with keys 'clusters' (list of Peak), 'sections' (per
    section statistics keyed by the section itself), 'nreads', 'threshold',
    'loc' (the interval) and 'Nclusters' (number of peaks)

    raises ValueError for an unknown method or algorithm and TypeError when
    the gene-wide threshold is not an int
    """

    if plotit:
        plt.rcParams['interactive'] = True

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    #fixes non-standard chrom file names (without the chr)
    if not interval.chrom.startswith("chr"):
        interval.chrom = "chr" + interval.chrom

    subset_reads = list(
        bam_fileobj.fetch(reference=str(interval.chrom),
                          start=interval.start,
                          end=interval.stop))
    strand = str(interval.strand)
    if reverse_strand:
        if strand == "+":
            strand = "-"
        elif strand == "-":
            strand = "+"
    (wiggle, _jxns, pos_counts, lengths,
     _allreads) = readsToWiggle_pysam(subset_reads, interval.start,
                                      interval.stop, strand, "start", False)

    #HACK: the same region is fetched a second time through
    #Robust_BAM_Reader to build the read array, need to factor out pysam
    #eventually
    bam_fileobj = Robust_BAM_Reader(bam_file)
    subset_reads = list(
        bam_fileobj.fetch(reference=str(interval.chrom),
                          start=interval.start,
                          end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam:  #if not none
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(
            input_bam_fileobj.fetch(reference=str(interval.chrom),
                                    start=interval.start,
                                    end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads, interval.start,
                                          interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    #clip read lengths so that none reaches the gene length
    lengths = [
        gene_length - 1 if read >= gene_length else read for read in lengths
    ]

    if user_threshold is None:
        if method == "binomial":
            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length,
                                                  binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError("gene threshold must be an int, got %r" %
                        (gene_threshold, ))

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {
        'clusters': [],
        'sections': {},
        'nreads': int(nreads_in_gene),
        'threshold': gene_threshold,
        'loc': interval,
    }

    peak_number = 0

    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:

        #section bounds are offsets into wiggle, sectstop is inclusive
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                             sectstart + interval.start,
                                             sectstop + interval.start + 1,
                                             strand)

        Nreads = count_reads_in_interval(cur_interval, array_of_reads)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        section_stats = {'nreads': int(Nreads)}
        peak_dict['sections'][sect] = section_stats

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" %
                         (Nreads, sect))
            section_stats['tried'] = False
            continue

        logging.info("""Analyzing section %s with %d reads""" %
                     (sect, Nreads))

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0, sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start
                cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                     section_start,
                                                     section_stop, strand)
                expanded_Nreads = get_reads_in_interval(
                    cur_interval, array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                #NOTE(review): lengths are clipped against sect_length while
                #the cutoff below is computed over expanded_sect_length (and
                #the gene-wide clipping uses >=, not >) - confirm intended
                sect_read_lengths = [
                    sect_length - 1 if read > sect_length else read
                    for read in sect_read_lengths
                ]
                section_stats['expanded_Nreads'] = len(expanded_Nreads)

                #the super-local cutoff can only raise the threshold: use
                #the larger of the gene-wide and the super-local value
                if method == "binomial":
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_binom(sect_read_lengths,
                                             expanded_sect_length,
                                             binom_alpha))
                elif method == "random":
                    threshold = max(
                        gene_threshold,
                        get_FDR_cutoff_mean(readlengths=sect_read_lengths,
                                            genelength=expanded_sect_length,
                                            alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" % (threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        section_stats['threshold'] = threshold
        section_stats['tried'] = True
        section_stats['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not excede threshold, stopping")
            continue

        #real lists (not lazy map objects) are built below so the fitters
        #get a reusable sequence on Python 3 as well
        if algorithm == "spline":
            data = [float(value) for value in data]
            #Magic number for initial smoothing, but it works
            #NOTE(review): on Python 2 without "from __future__ import
            #division" 1 / 3 is 0 and this always evaluates to 11 - confirm
            initial_smoothing_value = (
                (sectstop - sectstart + 1)**(1 / 3)) + 10

            section_stats['smoothing_factor'] = initial_smoothing_value

            logging.info("initial smoothing value: %.2f" %
                         initial_smoothing_value)
            fitter = SmoothingSpline(
                xvals,
                data,
                smoothing_factor=initial_smoothing_value,
                lossFunction="get_turn_penalized_residuals",
                threshold=threshold,
                num_reads=Nreads)

        elif algorithm == "gaussian":
            cts = [float(value) for value in cts]
            fitter = GaussMix(xvals, cts)

        elif algorithm == "classic":
            data = [float(value) for value in data]
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        else:
            #without this branch an unknown algorithm left fitter unbound
            raise ValueError("Algorithm %s does not exist" % (algorithm))

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" %
                         fitter.smoothing_factor)
            section_stats['final_smoothing_factor'] = fitter.smoothing_factor
            #a fitter may return None when nothing is found; treat it as
            #"no peaks" so the loop below does not fail on it
            if peak_definitions is None:
                peak_definitions = []
            numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))

        except Exception as error:
            logging.error("peak finding failed:, %s, %s" %
                          (interval.name, error))
            #bare raise keeps the original traceback
            raise

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop

        #Need to get all ranges, count number of reads in each range and compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:

            #peak coordinates are relative to the section, shift to genome
            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 genomic_start, genomic_stop,
                                                 strand)
            number_reads_in_peak = count_reads_in_interval(
                cur_interval, array_of_reads)

            #counted but not yet stored, see the commented Peak argument
            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(
                    cur_interval, input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1

            logging.info("""Peak %d (%d - %d) has %d
                          reads""" % (peak_number, peak_start,
                                      (peak_stop + 1), number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            #super local logic: window around the peak center, clamped to
            #the wiggle track
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize,
                            len(wiggle))

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom),
                                                 interval.start + area_start,
                                                 interval.start + area_stop,
                                                 strand)
            number_reads_in_area = count_reads_in_interval(
                cur_interval, array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(
                Peak(
                    chrom=interval.chrom,
                    genomic_start=genomic_start,
                    genomic_stop=genomic_stop,
                    gene_name=interval.attrs['gene_id'],
                    strand=interval.strand,
                    thick_start=thick_start,
                    thick_stop=thick_stop,
                    peak_number=peak_number,
                    number_reads_in_peak=number_reads_in_peak,
                    size=peak_length,
                    p=0,
                    effective_length=int(interval.attrs['effective_length']),
                    peak_length=peak_length,
                    area_reads=number_reads_in_area,
                    area_size=area_length,
                    nreads_in_gene=nreads_in_gene,
                    #nreads_in_input=input_number_reads_in_peak,
                ))

            peak_number += 1
            section_stats['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        #blocks until one character arrives on stdin so the figure stays up
        sys.stdin.read(1)

    return peak_dict
Ejemplo n.º 3
0
def call_peaks(interval, gene_length, bam_file=None, max_gap=25,
               fdr_alpha=0.05, user_threshold=None, binom_alpha=0.05, method="binomial",
               min_reads=3, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000,
               SloP=False, max_width=None, min_width=None,
               algorithm="spline", reverse_strand=False, input_bam=None):

    """
    Calls peaks for an individual gene.

    Reads overlapping the interval are fetched from bam_file, converted to a
    per-position wiggle track, split into sections with find_sections and
    every section that has enough reads and exceeds its threshold is handed
    to the selected fitting algorithm. One Peak is recorded per peak found.

    interval - gene interval to query; its chrom, start, stop, strand, name
               and attrs ('gene_id', 'effective_length') are read. chrom is
               rewritten IN PLACE to carry a "chr" prefix when it lacks one
    gene_length - length of the gene (converted with int()), used for the
               gene-wide threshold
    bam_file - bam file name; it is opened here with pysam.Samfile and with
               Robust_BAM_Reader
    max_gap - gap handed to find_sections for splitting the wiggle track
               into sections; also passed to the classic algorithm
    fdr_alpha - alpha handed to get_FDR_cutoff_mean (method "random")
    user_threshold - user defined threshold; when given it replaces both the
               gene-wide and the super-local computed thresholds
    binom_alpha - alpha handed to get_FDR_cutoff_binom (method "binomial")
    method - "binomial" or "random", selects how the threshold is computed
    min_reads - min reads in a section to try and call peaks
    poisson_cutoff - accepted for interface compatibility, not used here
    plotit - makes figures and waits for one character on stdin at the end
    w_cutoff - accepted for interface compatibility, not used here
    windowsize - distance left and right of the peak center that defines the
               surrounding area whose reads are counted for each peak
    SloP - super local threshold: per section, use the larger of the
               gene-wide threshold and one computed on the section expanded
               by 500 positions on each side
    max_width - passed to the classic peak calling algorithm
    min_width - passed to the classic peak calling algorithm
    algorithm - "spline", "gaussian" or "classic"
    reverse_strand - swap "+" and "-" before counting reads
    input_bam - optional input bam file; reads in each peak are counted but
               the count is currently not stored on the Peak

    returns a dict with keys 'clusters' (list of Peak), 'sections' (per
    section statistics keyed by the section itself), 'nreads', 'threshold',
    'loc' (the interval) and 'Nclusters' (number of peaks)

    raises ValueError for an unknown method or algorithm and TypeError when
    the gene-wide threshold is not an int
    """

    if plotit:
        plt.rcParams['interactive'] = True

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    #fixes non-standard chrom file names (without the chr)
    if not interval.chrom.startswith("chr"):
        interval.chrom = "chr" + interval.chrom

    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
    strand = str(interval.strand)
    if reverse_strand:
        if strand == "+":
            strand = "-"
        elif strand == "-":
            strand = "+"
    (wiggle, _jxns, pos_counts,
     lengths, _allreads) = readsToWiggle_pysam(subset_reads, interval.start,
                                               interval.stop, strand, "start", False)

    #HACK: the same region is fetched a second time through
    #Robust_BAM_Reader to build the read array, need to factor out pysam
    #eventually
    bam_fileobj = Robust_BAM_Reader(bam_file)
    subset_reads = list(bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
    array_of_reads = read_array(subset_reads, interval.start, interval.stop)

    if input_bam: #if not none
        input_bam_fileobj = Robust_BAM_Reader(input_bam)
        input_subset_reads = list(input_bam_fileobj.fetch(reference=str(interval.chrom), start=interval.start, end=interval.stop))
        input_array_of_reads = read_array(input_subset_reads, interval.start, interval.stop)

    nreads_in_gene = sum(pos_counts)
    gene_length = int(gene_length)
    #clip read lengths so that none reaches the gene length
    lengths = [gene_length - 1 if read >= gene_length else read for read in lengths]

    if user_threshold is None:
        if method == "binomial":
            gene_threshold = get_FDR_cutoff_binom(lengths, gene_length, binom_alpha)
        elif method == "random":
            gene_threshold = get_FDR_cutoff_mean(readlengths=lengths,
                                                 genelength=gene_length,
                                                 alpha=fdr_alpha)
        else:
            raise ValueError("Method %s does not exist" % (method))
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError("gene threshold must be an int, got %r" % (gene_threshold,))

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    peak_dict = {
        'clusters': [],
        'sections': {},
        'nreads': int(nreads_in_gene),
        'threshold': gene_threshold,
        'loc': interval,
    }

    peak_number = 0

    sections = find_sections(wiggle, max_gap)
    if plotit:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:

        #section bounds are offsets into wiggle, sectstop is inclusive
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        cur_interval = HTSeq.GenomicInterval(str(interval.chrom), sectstart + interval.start, sectstop + interval.start + 1,
                                             strand)

        Nreads = count_reads_in_interval(cur_interval, array_of_reads)

        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(len(data))
        section_stats = {'nreads': int(Nreads)}
        peak_dict['sections'][sect] = section_stats

        #makes sure there are enough reads
        if Nreads < min_reads:
            logging.info("""%d is not enough reads, skipping section: %s""" % (Nreads, sect))
            section_stats['tried'] = False
            continue

        logging.info("""Analyzing section %s with %d reads""" % (sect, Nreads))

        if user_threshold is None:
            if SloP:
                half_width = 500
                section_start = max(0, sectstart + interval.start - half_width)
                section_stop = sectstop + interval.start + 1 + half_width
                expanded_sect_length = section_stop - section_start
                cur_interval = HTSeq.GenomicInterval(str(interval.chrom), section_start, section_stop, strand)
                expanded_Nreads = get_reads_in_interval(cur_interval, array_of_reads)
                sect_read_lengths = read_lengths_from_htseq(expanded_Nreads)
                #NOTE(review): lengths are clipped against sect_length while
                #the cutoff below is computed over expanded_sect_length (and
                #the gene-wide clipping uses >=, not >) - confirm intended
                sect_read_lengths = [sect_length - 1 if read > sect_length else read for read in sect_read_lengths]
                #recorded here because expanded_Nreads only exists on the
                #super-local path; storing it unconditionally raised
                #NameError (or reused a stale value) when SloP was off
                section_stats['expanded_Nreads'] = len(expanded_Nreads)

                #the super-local cutoff can only raise the threshold: use
                #the larger of the gene-wide and the super-local value
                if method == "binomial":
                    threshold = max(gene_threshold, get_FDR_cutoff_binom(sect_read_lengths, expanded_sect_length, binom_alpha))
                elif method == "random":
                    threshold = max(gene_threshold, get_FDR_cutoff_mean(readlengths=sect_read_lengths, genelength=expanded_sect_length, alpha=fdr_alpha))
                else:
                    raise ValueError("Method %s does not exist" % (method))
                logging.info("Using super-local threshold %d" %(threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        section_stats['threshold'] = threshold
        section_stats['tried'] = True
        section_stats['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not excede threshold, stopping")
            continue

        #real lists (not lazy map objects) are built below so the fitters
        #get a reusable sequence on Python 3 as well
        if algorithm == "spline":
            data = [float(value) for value in data]
            #Magic number for initial smoothing, but it works
            #NOTE(review): on Python 2 without "from __future__ import
            #division" 1/3 is 0 and this always evaluates to 11 - confirm
            initial_smoothing_value = ((sectstop - sectstart + 1)**(1/3)) + 10

            section_stats['smoothing_factor'] = initial_smoothing_value

            logging.info("initial smoothing value: %.2f" % initial_smoothing_value)
            fitter = SmoothingSpline(xvals, data, smoothing_factor=initial_smoothing_value,
                                     lossFunction="get_turn_penalized_residuals",
                                     threshold=threshold,
                                     num_reads=Nreads)

        elif algorithm == "gaussian":
            cts = [float(value) for value in cts]
            fitter = GaussMix(xvals, cts)

        elif algorithm == "classic":
            data = [float(value) for value in data]
            fitter = Classic(xvals, data, max_width, min_width, max_gap)

        else:
            #without this branch an unknown algorithm left fitter unbound
            raise ValueError("Algorithm %s does not exist" % (algorithm))

        try:
            peak_definitions = fitter.peaks()
            logging.info("optimized smoothing value: %.2f" % fitter.smoothing_factor)
            section_stats['final_smoothing_factor'] = fitter.smoothing_factor
            #a fitter may return None when nothing is found; treat it as
            #"no peaks" so the loop below does not fail on it
            if peak_definitions is None:
                peak_definitions = []
            numpeaks = len(peak_definitions)
            logging.info("I identified %d potential peaks" % (numpeaks))

        except Exception as error:
            logging.error("peak finding failed:, %s, %s" % (interval.name, error))
            #bare raise keeps the original traceback
            raise

        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop

        #Need to get all ranges, count number of reads in each range and compute from there
        for peak_start, peak_stop, peak_center in peak_definitions:

            #peak coordinates are relative to the section, shift to genome
            genomic_start = interval.start + sectstart + peak_start
            genomic_stop = interval.start + sectstart + peak_stop

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom), genomic_start, genomic_stop,
                                                 strand)
            number_reads_in_peak = count_reads_in_interval(cur_interval, array_of_reads)

            #counted but not yet stored, see the commented Peak argument
            if input_bam:
                input_number_reads_in_peak = count_reads_in_interval(cur_interval, input_array_of_reads)
            else:
                input_number_reads_in_peak = 0

            peak_length = genomic_stop - genomic_start + 1

            logging.info("""Peak %d (%d - %d) has %d
                          reads""" % (peak_number, peak_start,
                                      (peak_stop + 1), number_reads_in_peak))

            #highest point in start stop
            genomic_center = interval.start + sectstart + peak_center

            #makes it thicker so we can see on the browser
            #error checking logic to keep bed files from breaking
            thick_start = max(genomic_center - 2, genomic_start)
            thick_stop = min(genomic_center + 2, genomic_stop)

            #super local logic: window around the peak center, clamped to
            #the wiggle track
            area_start = max(0, (peak_center + sectstart) - windowsize)
            area_stop = min((peak_center + sectstart) + windowsize, len(wiggle))

            cur_interval = HTSeq.GenomicInterval(str(interval.chrom), interval.start + area_start, interval.start + area_stop,
                                                 strand)
            number_reads_in_area = count_reads_in_interval(cur_interval, array_of_reads)
            area_length = area_stop - area_start + 1

            peak_dict['clusters'].append(Peak(chrom=interval.chrom,
                                              genomic_start=genomic_start,
                                              genomic_stop=genomic_stop,
                                              gene_name=interval.attrs['gene_id'],
                                              strand=interval.strand,
                                              thick_start=thick_start,
                                              thick_stop=thick_stop,
                                              peak_number=peak_number,
                                              number_reads_in_peak=number_reads_in_peak,
                                              size=peak_length,
                                              p=0,
                                              effective_length=int(interval.attrs['effective_length']),
                                              peak_length=peak_length,
                                              area_reads=number_reads_in_area,
                                              area_size=area_length,
                                              nreads_in_gene=nreads_in_gene,
                                              #nreads_in_input=input_number_reads_in_peak,
                                              ))

            peak_number += 1
            section_stats['nPeaks'] += 1

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        #blocks until one character arrives on stdin so the figure stays up
        sys.stdin.read(1)

    return peak_dict