Ejemplo n.º 1
0
    def test_readsToWiggle_pysam(self):
        """Check readsToWiggle_pysam against known values from allup_test.bam.

        Reads are fetched from chr15:91536649-91537641 and converted for the
        window 91537632-91537675 on the '-' strand in 'center' mode, once
        with the last argument False and once with it True.
        """
        bam_path = os.path.join(clipper.test_dir(), "allup_test.bam")
        region = "chr15:91536649-91537641"

        def run(last_flag):
            # Open a fresh handle per call and always release it afterwards.
            bam = pysam.Samfile(bam_path)
            try:
                reads = bam.fetch(region=region)
                return readsToWiggle_pysam(reads, 91537632, 91537675,
                                           '-', 'center', last_flag)
            finally:
                bam.close()

        # --- last argument False: whole-number values ---
        wiggle, jxns, pos_counts, lengths, allreads = run(False)

        # 44 expected values: 8 x 2, 25 x 11, 8 x 9, 3 x 0.
        wiggle_true = [2.] * 8 + [11.] * 25 + [9.] * 8 + [0.] * 3
        # NOTE(review): zip() stops at the shorter sequence, so a wiggle that
        # is shorter than expected would not be detected by this loop.
        for true, test in zip(wiggle_true, wiggle):
            self.assertEqual(test, true)

        # 44 expected values: all zero except 2 at index 16 and 9 at index 24.
        pos_counts_true = [0.] * 16 + [2.] + [0.] * 7 + [9.] + [0.] * 19
        for true, test in zip(pos_counts_true, pos_counts):
            self.assertEqual(test, true)

        # Use a unittest assertion: a bare assert is stripped under -O.
        self.assertEqual(lengths, [33] * 11)

        # --- last argument True: fractional values ---
        # presumably the flag normalises each read's contribution (values
        # below are multiples of 1/33) -- confirm against readsToWiggle_pysam
        wiggle, jxns, pos_counts, lengths, allreads = run(True)

        wiggle_true = ([0.06060606060606061] * 8 +
                       [0.33333333333333326] * 25 +
                       [0.2727272727272727] * 8 +
                       [0.0] * 3)
        for true, test in zip(wiggle_true, wiggle):
            # The expected numbers carry floating-point rounding
            # (0.33333333333333326), so compare with a tolerance rather than
            # demanding bit-for-bit equality.
            self.assertAlmostEqual(test, true)
Ejemplo n.º 2
0
def call_peaks(loc, gene_length, bam_fileobj=None, bam_file=None,
               margin=25, fdr_alpha=0.05, user_threshold=None,
               minreads=20, poisson_cutoff=0.05,
               plotit=False, w_cutoff=10, windowsize=1000,
               SloP=False, correct_p=False):
    """
    Call peaks for an individual gene.

    loc - gene location as a 5-item sequence:
          (chrom, gene_name, tx_start, tx_end, signstrand)
    gene_length - effective length of gene
    bam_fileobj - already-open bam file object (faster for serial
                  processing, but doesn't work in parallel); left open
    bam_file - bam file location (name); only opened when bam_fileobj is
               None, and closed again before returning
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from the
                peaks script (called in setup)
    user_threshold - user defined FDR threshold (probably should be
                     factored into fdr_alpha)
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value significance cutoff for number of reads in a
                     peak that gets called - might want to use a shifted
                     distribution
    plotit - makes figures

    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super local calculation, distance left and right to look
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean Bonferroni correction of p-values from poisson

    Returns whatever peaks_from_info returns for this gene.

    Raises SystemExit when neither bam_file nor bam_fileobj is supplied.
    """

    #setup
    chrom, gene_name, tx_start, tx_end, signstrand = loc

    #logic reading bam files
    if bam_file is None and bam_fileobj is None:
        # Still terminates via SystemExit (as exit() did), but now with a
        # message that matches the actual problem and a non-zero status.
        raise SystemExit("you have to supply either a bam file or a bam "
                         "file object")

    # Only close handles this function opened; a caller-supplied object
    # stays open so it can be reused for the next gene.
    opened_here = bam_fileobj is None
    if opened_here:
        bam_fileobj = pysam.Samfile(bam_file, 'rb')

    try:
        tx_start, tx_end = int(tx_start), int(tx_end)
        subset_reads = bam_fileobj.fetch(reference=chrom,
                                         start=tx_start, end=tx_end)

        #need to document reads to wiggle
        wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
            subset_reads, tx_start, tx_end, signstrand, "center", False)
    finally:
        if opened_here:
            bam_fileobj.close()

    #TODO have a check to kill this if there aren't any reads in a region

    return peaks_from_info(list(wiggle), pos_counts, lengths, loc,
                           gene_length, margin, fdr_alpha, user_threshold,
                           minreads, poisson_cutoff, plotit, w_cutoff,
                           windowsize, SloP, correct_p)
Ejemplo n.º 3
0
def _count_junction_reads(structures, jxns):
    """Sum the counts in jxns for every junction structure in structures.

    A junction structure is a string of the form "j<int>:<int>..."; it is
    converted to a tuple of ints and looked up in jxns. Structures that do
    not start with "j" (e.g. "b..." exon bodies) are deliberately not counted.
    """
    total = 0
    for structure in structures:
        if not structure.startswith("j"):
            continue
        junction = tuple(map(int, structure.lstrip("j").split(":")))
        if junction in jxns:
            total += jxns[junction]
    return total


def assign_reads(gene, splicedict=None, bam_file=None,
                 alignment_slop=10, flip=True, splicetypes=None):
    """Count junction reads (and region RPKs) for one gene's splice events.

    gene - identifier stored under data["descriptor"]
    splicedict - dict with "chromosome", "strand", "tx_start", "tx_end" and
                 optional "SE" / "MXE" sub-dicts keyed by event location
    bam_file - path of the bam file to open
    alignment_slop - accepted for compatibility; not used here
    flip - when True, reads are counted on the strand opposite to
           splicedict["strand"]
    splicetypes - container of event types to process ("SE", "MXE");
                  None is treated as empty, so no events are processed

    Returns a dict holding "descriptor" plus, per requested event type:
      "SE":  {loc: {"IN", "EX", "BODY_RPK", "UP_RPK", "DOWN_RPK",
                    "UPI_RPK", "DOWNI_RPK"}}
      "MXE": {loc: {"A", "B"}}
    If an RPK computation fails for an SE event, a message is printed and
    that event keeps IN = EX = 0 with whichever RPK keys were already set.

    Raises ValueError when splicedict or bam_file is missing.
    """
    if splicedict is None or bam_file is None:
        raise ValueError("assign_reads requires both splicedict and bam_file")
    if splicetypes is None:
        # The default used to crash on '"SE" in None'.
        splicetypes = ()

    chrom = splicedict["chromosome"]
    strand = splicedict["strand"]
    tx_start = splicedict["tx_start"]
    tx_end = splicedict["tx_end"]

    if flip is True:
        usestrand = strand * -1
    else:
        usestrand = strand

    signstrand = None
    if usestrand == 1:
        signstrand = "+"
    elif usestrand == -1:
        signstrand = "-"

    data = {"descriptor": gene}

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        subset_reads = bam_fileobj.fetch(reference=chrom,
                                         start=tx_start, end=tx_end)

        # Only the junction counts are needed below; the window is padded
        # by 1000 on each side of the transcript.
        (_wig, jxns, _nr_counts, _read_lengths,
         _reads) = readsToWiggle_pysam(subset_reads, (tx_start - 1000),
                                       (tx_end + 1000), signstrand,
                                       "center", False)

        if "SE" in splicedict and "SE" in splicetypes:
            data["SE"] = {}
            for loc in splicedict["SE"]:
                event = splicedict["SE"][loc]
                counts = {"IN": 0, "EX": 0}
                data["SE"][loc] = counts

                body_loc = event["BODY"]
                up_loc = event["UP"]
                down_loc = event["DOWN"]
                # Intron coordinates run from the end of one exon to the
                # start of the next; exon order depends on the gene strand.
                if strand == 1:
                    up_intron_loc = (up_loc.split("-")[1] + "-" +
                                     body_loc.split("-")[0])
                    down_intron_loc = (body_loc.split("-")[1] + "-" +
                                       down_loc.split("-")[0])
                else:
                    up_intron_loc = (body_loc.split("-")[1] + "-" +
                                     up_loc.split("-")[0])
                    down_intron_loc = (down_loc.split("-")[1] + "-" +
                                       body_loc.split("-")[0])

                regions = (("BODY_RPK", body_loc),
                           ("UP_RPK", up_loc),
                           ("DOWN_RPK", down_loc),
                           ("UPI_RPK", up_intron_loc),
                           ("DOWNI_RPK", down_intron_loc))
                try:
                    for key, region in regions:
                        counts[key] = region_rpk(
                            pybedtools.Interval(
                                chrom, *map(int, region.split("-")),
                                strand=signstrand),
                            bam_fileobj)
                except Exception:
                    # Best effort: report the event and move on, without
                    # swallowing KeyboardInterrupt / SystemExit.
                    print("uh oh %s" % (gene + loc))
                    continue

                counts["IN"] = _count_junction_reads(event["IN"], jxns)
                counts["EX"] = _count_junction_reads(event["EX"], jxns)

        if "MXE" in splicedict and "MXE" in splicetypes:
            data["MXE"] = {}
            for loc in splicedict["MXE"]:
                event = splicedict["MXE"][loc]
                data["MXE"][loc] = {
                    "A": _count_junction_reads(event["A"], jxns),
                    "B": _count_junction_reads(event["B"], jxns),
                }
    finally:
        bam_fileobj.close()

    return data
Ejemplo n.º 4
0
def _count_junction_reads(structures, jxns):
    """Sum the counts in jxns for every junction structure in structures.

    A junction structure is a string of the form "j<int>:<int>..."; it is
    converted to a tuple of ints and looked up in jxns. Structures that do
    not start with "j" (e.g. "b..." exon bodies) are deliberately not counted.
    """
    total = 0
    for structure in structures:
        if not structure.startswith("j"):
            continue
        junction = tuple(map(int, structure.lstrip("j").split(":")))
        if junction in jxns:
            total += jxns[junction]
    return total


def assign_reads(gene, splicedict=None, bam_file=None, alignment_slop=10, flip=True, splicetypes=None):
    """Count junction reads for one gene's splice events.

    gene - identifier stored under data["descriptor"]
    splicedict - dict with "chromosome", "strand", "tx_start", "tx_end" and
                 optional "SE" / "MXE" sub-dicts keyed by event location
    bam_file - path of the bam file to open
    alignment_slop - accepted for compatibility; not used here
    flip - True: count on the strand opposite to splicedict["strand"];
           None: pass no strand (None) to readsToWiggle_pysam;
           anything else: use splicedict["strand"] as is
    splicetypes - container of event types to process ("SE", "MXE");
                  None is treated as empty, so no events are processed

    Returns a dict holding "descriptor" plus, per requested event type:
      "SE":  {loc: {"IN": count, "EX": count}}
      "MXE": {loc: {"A": count, "B": count}}

    Raises ValueError when splicedict or bam_file is missing.
    """
    if splicedict is None or bam_file is None:
        raise ValueError("assign_reads requires both splicedict and bam_file")
    if splicetypes is None:
        # The default used to crash on '"SE" in None'.
        splicetypes = ()

    chrom = splicedict["chromosome"]
    strand = splicedict["strand"]
    tx_start = splicedict["tx_start"]
    tx_end = splicedict["tx_end"]

    signstrand = None
    if flip is not None:
        if flip is True:
            usestrand = strand * -1
        else:
            usestrand = strand
        if usestrand == 1:
            signstrand = "+"
        elif usestrand == -1:
            signstrand = "-"

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        subset_reads = bam_fileobj.fetch(reference=chrom, start=tx_start, end=tx_end)
        # Only the junction counts are needed below; the window is padded
        # by 1000 on each side of the transcript.
        _wig, jxns, _nr_counts, _read_lengths, _reads = readsToWiggle_pysam(
            subset_reads, (tx_start - 1000), (tx_end + 1000),
            signstrand, "center", False)
    finally:
        # The handle is not used after the reads have been converted.
        bam_fileobj.close()

    data = {"descriptor": gene}

    if "SE" in splicedict and "SE" in splicetypes:
        data["SE"] = {}
        for loc in splicedict["SE"]:
            event = splicedict["SE"][loc]
            data["SE"][loc] = {
                "IN": _count_junction_reads(event["IN"], jxns),
                "EX": _count_junction_reads(event["EX"], jxns),
            }

    if "MXE" in splicedict and "MXE" in splicetypes:
        data["MXE"] = {}
        for loc in splicedict["MXE"]:
            event = splicedict["MXE"][loc]
            data["MXE"][loc] = {
                "A": _count_junction_reads(event["A"], jxns),
                "B": _count_junction_reads(event["B"], jxns),
            }

    return data