def test_readsToWiggle_pysam(self):
    """Check readsToWiggle_pysam against recorded values for allup_test.bam.

    Reads are fetched from chr15:91536649-91537641 and converted over the
    window 91537632-91537675 on the '-' strand with 'center' as the fifth
    argument. The conversion is run twice, with the last argument False and
    then True.
    """
    bam_path = os.path.join(clipper.test_dir(), "allup_test.bam")

    def wiggle_results(last_flag):
        # Keep the file handle in its own name so it can be closed; rebinding
        # it to the fetch() iterator would drop the handle without closing it.
        bam = pysam.Samfile(bam_path)
        try:
            region_reads = bam.fetch(region="chr15:91536649-91537641")
            return readsToWiggle_pysam(region_reads, 91537632, 91537675,
                                       '-', 'center', last_flag)
        finally:
            bam.close()

    wiggle, jxns, pos_counts, lengths, allreads = wiggle_results(False)

    # 44 expected values: 8 x 2, 25 x 11, 8 x 9, 3 x 0.
    wiggle_true = [2.] * 8 + [11.] * 25 + [9.] * 8 + [0.] * 3
    print(wiggle)
    # NOTE(review): zip() stops at the shorter sequence, so a too-short
    # result would pass unnoticed - consider asserting the lengths as well
    # once the expected output length is confirmed.
    for true, test in zip(wiggle_true, wiggle):
        self.assertEqual(test, true)

    # 44 expected values: a 2 at index 16 and a 9 at index 24, zeros elsewhere.
    pos_counts_true = [0.] * 16 + [2.] + [0.] * 7 + [9.] + [0.] * 19
    for true, test in zip(pos_counts_true, pos_counts):
        self.assertEqual(test, true)

    # assertEqual instead of a bare assert, which is stripped under -O.
    self.assertEqual(lengths, [33] * 11)

    wiggle = wiggle_results(True)[0]

    # Same profile with every value scaled by 1/33 (2/33, 11/33, 9/33), kept
    # as the exact floats that were originally recorded.
    # NOTE(review): the last argument appears to switch on per-read
    # fractional weighting - confirm against readsToWiggle_pysam.
    wiggle_true = ([0.06060606060606061] * 8 +
                   [0.33333333333333326] * 25 +
                   [0.2727272727272727] * 8 +
                   [0.0] * 3)
    for true, test in zip(wiggle_true, wiggle):
        # These values are accumulated sums, so compare with a tolerance
        # rather than bit-for-bit.
        self.assertAlmostEqual(test, true)
def test_readsToWiggle_pysam(self):
    """Check readsToWiggle_pysam against recorded values for allup_test.bam.

    Reads are fetched from chr15:91536649-91537641 and converted over the
    window 91537632-91537675 on the '-' strand with 'center' as the fifth
    argument. The conversion is run twice, with the last argument False and
    then True.
    """
    bam_path = os.path.join(clipper.test_dir(), "allup_test.bam")

    def run_conversion(last_flag):
        # The handle gets its own name so it can be closed explicitly; the
        # fetch() iterator alone gives no way to release the file.
        bam = pysam.Samfile(bam_path)
        try:
            fetched = bam.fetch(region="chr15:91536649-91537641")
            return readsToWiggle_pysam(fetched, 91537632, 91537675,
                                       '-', 'center', last_flag)
        finally:
            bam.close()

    wiggle, jxns, pos_counts, lengths, allreads = run_conversion(False)

    # 44 expected values: 8 x 2, 25 x 11, 8 x 9, 3 x 0.
    wiggle_true = [2.] * 8 + [11.] * 25 + [9.] * 8 + [0.] * 3
    print(wiggle)
    # NOTE(review): zip() truncates to the shorter sequence, so a result
    # with too few entries would not be detected here - consider adding a
    # length assertion once the expected output length is confirmed.
    for true, test in zip(wiggle_true, wiggle):
        self.assertEqual(test, true)

    # 44 expected values: a 2 at index 16 and a 9 at index 24, zeros elsewhere.
    pos_counts_true = [0.] * 16 + [2.] + [0.] * 7 + [9.] + [0.] * 19
    for true, test in zip(pos_counts_true, pos_counts):
        self.assertEqual(test, true)

    # A unittest assertion still runs under -O, unlike a bare assert.
    self.assertEqual(lengths, [33] * 11)

    wiggle = run_conversion(True)[0]

    # The first profile scaled by 1/33 (2/33, 11/33, 9/33), kept as the
    # exact floats that were originally recorded.
    # NOTE(review): the last argument appears to enable per-read fractional
    # weighting - confirm against readsToWiggle_pysam.
    wiggle_true = ([0.06060606060606061] * 8 +
                   [0.33333333333333326] * 25 +
                   [0.2727272727272727] * 8 +
                   [0.0] * 3)
    for true, test in zip(wiggle_true, wiggle):
        # Accumulated floating-point sums: compare with a tolerance.
        self.assertAlmostEqual(test, true)
def call_peaks(loc, gene_length, bam_fileobj=None, bam_file=None, margin=25,
               fdr_alpha=0.05, user_threshold=None, minreads=20,
               poisson_cutoff=0.05, plotit=False, w_cutoff=10,
               windowsize=1000, SloP=False, correct_p=False):
    """
    calls peaks for an individual gene

    loc - gene location, unpacked as
          (chrom, gene_name, tx_start, tx_end, signstrand)
    gene_length - effective length of gene
    takes bam file or bam file object. Serial uses object, parallel uses
    location (name). If both are given the object is used. A file opened
    here from bam_file is closed again before returning; an object passed
    in by the caller is left open.
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from
                peaks script (called in setup)
    user_threshold - user defined FDR threshold (probably should be
                     factored into fdr_alpha)
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for significance cut off for number of reads
                     in peak that gets called - might want to use a shifted
                     distribution
    plotit - makes figures
    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super local calculation, distance left and right to
                 look
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean Bonferroni correction of p-values from poisson

    Returns the result of peaks_from_info. Reports through verboseprint
    and calls exit() when neither bam_file nor bam_fileobj is given.
    """
    #setup
    chrom, gene_name, tx_start, tx_end, signstrand = loc

    #logic reading bam files
    #using a file object is faster for serial processing
    #but doesn't work in parallel
    opened_here = False
    if bam_fileobj is None:
        if bam_file is None:
            verboseprint("""you have to provide either a bam file or a bam file object""")
            exit()
        bam_fileobj = pysam.Samfile(bam_file, 'rb')
        opened_here = True

    try:
        tx_start, tx_end = int(tx_start), int(tx_end)
        subset_reads = bam_fileobj.fetch(reference=chrom,
                                         start=tx_start, end=tx_end)

        #need to document reads to wiggle
        wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
            subset_reads, tx_start, tx_end, signstrand, "center", False)
    finally:
        # Only release a handle this call created; a caller-supplied object
        # is reused across genes in serial mode.
        if opened_here:
            bam_fileobj.close()

    #TODO have a check to kill this if there aren't any reads in a region
    result = peaks_from_info(list(wiggle), pos_counts, lengths, loc,
                             gene_length, margin, fdr_alpha, user_threshold,
                             minreads, poisson_cutoff, plotit, w_cutoff,
                             windowsize, SloP, correct_p)
    return result
def _sum_junction_counts(structures, jxns):
    """Add up the jxns values for every 'j<int>:<int>' entry in structures.

    Entries that do not start with "j" (for example exon-body "b..."
    entries) are skipped; junctions missing from jxns contribute nothing.
    """
    total = 0
    for structure in structures:
        if not structure.startswith("j"):
            continue
        junction = tuple(map(int, structure.lstrip("j").split(":")))
        if junction in jxns:
            total += jxns[junction]
    return total


def assign_reads(gene, splicedict=None, bam_file=None, alignment_slop=10,
                 flip=True, splicetypes=None):
    """
    Tally junction counts (and, for SE events, region RPKs) for one gene.

    gene - identifier stored under "descriptor" in the result
    splicedict - mapping with "chromosome", "strand", "tx_start", "tx_end"
                 and optionally "SE" / "MXE" event tables
    bam_file - path opened with pysam.Samfile(bam_file, 'rb'); the handle is
               closed before returning
    alignment_slop - accepted but not used
    flip - when True the strand is multiplied by -1 before being mapped to
           "+" (1) or "-" (-1); any other strand value gives None
    splicetypes - container tested with `in` for "SE" / "MXE". The default
                  None raises TypeError as soon as splicedict holds one of
                  those keys.

    Returns a dict with "descriptor" and, per requested event type, one
    entry per event: SE events get "IN"/"EX" counts plus BODY/UP/DOWN/UPI/
    DOWNI "_RPK" values, MXE events get "A"/"B" counts. Counts are sums of
    the second value returned by readsToWiggle_pysam (jxns), looked up by
    the (int, int) tuple parsed from each "j<a>:<b>" structure. If an RPK
    computation fails for an SE event, a message is printed and that event
    keeps IN = EX = 0.

    Raises Exception when splicedict or bam_file is None.
    """
    if splicedict is None or bam_file is None:
        raise Exception("assign_reads needs both splicedict and bam_file")

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        data = {}
        chrom = splicedict["chromosome"]
        strand = splicedict["strand"]
        tx_start = splicedict["tx_start"]
        tx_end = splicedict["tx_end"]

        usestrand = strand * -1 if flip is True else strand
        signstrand = None
        if usestrand == 1:
            signstrand = "+"
        elif usestrand == -1:
            signstrand = "-"

        subset_reads = bam_fileobj.fetch(reference=chrom,
                                         start=tx_start, end=tx_end)
        # Only the second return value (jxns) is needed below.
        _wig, jxns, _counts, _lengths, _reads = readsToWiggle_pysam(
            subset_reads, (tx_start - 1000), (tx_end + 1000),
            signstrand, "center", False)

        data["descriptor"] = gene

        if "SE" in splicedict and "SE" in splicetypes:
            data["SE"] = {}
            for loc in splicedict["SE"]:
                event = splicedict["SE"][loc]
                counts = data["SE"][loc] = {"IN": 0, "EX": 0}

                bodyLoc = event["BODY"]
                upLoc = event["UP"]
                downLoc = event["DOWN"]
                # Intron spans are built from the facing exon edges; which
                # edge comes first depends on the strand.
                if strand == 1:
                    upIntronLoc = (upLoc.split("-")[1] + "-" +
                                   bodyLoc.split("-")[0])
                    downIntronLoc = (bodyLoc.split("-")[1] + "-" +
                                     downLoc.split("-")[0])
                else:
                    upIntronLoc = (bodyLoc.split("-")[1] + "-" +
                                   upLoc.split("-")[0])
                    downIntronLoc = (downLoc.split("-")[1] + "-" +
                                     bodyLoc.split("-")[0])

                regions = (("BODY_RPK", bodyLoc),
                           ("UP_RPK", upLoc),
                           ("DOWN_RPK", downLoc),
                           ("UPI_RPK", upIntronLoc),
                           ("DOWNI_RPK", downIntronLoc))
                try:
                    for key, region in regions:
                        counts[key] = region_rpk(
                            pybedtools.Interval(
                                chrom, *map(int, region.split("-")),
                                strand=signstrand),
                            bam_fileobj)
                except Exception:
                    # Best effort: report and move on to the next event, but
                    # let KeyboardInterrupt / SystemExit through.
                    print("uh oh %s" % (gene + loc))
                    continue

                counts["IN"] = _sum_junction_counts(event["IN"], jxns)
                counts["EX"] = _sum_junction_counts(event["EX"], jxns)

        if "MXE" in splicedict and "MXE" in splicetypes:
            data["MXE"] = {}
            for loc in splicedict["MXE"]:
                event = splicedict["MXE"][loc]
                # Exon-body ("b...") structures are intentionally not
                # counted; only junction structures contribute.
                data["MXE"][loc] = {
                    "A": _sum_junction_counts(event["A"], jxns),
                    "B": _sum_junction_counts(event["B"], jxns),
                }

        return data
    finally:
        bam_fileobj.close()
def _sum_junction_counts(structures, jxns):
    """Add up the jxns values for every 'j<int>:<int>' entry in structures.

    Entries that do not start with "j" (for example exon-body "b..."
    entries) are skipped; junctions missing from jxns contribute nothing.
    """
    total = 0
    for structure in structures:
        if not structure.startswith("j"):
            continue
        junction = tuple(map(int, structure.lstrip("j").split(":")))
        if junction in jxns:
            total += jxns[junction]
    return total


def assign_reads(gene, splicedict=None, bam_file=None, alignment_slop=10,
                 flip=True, splicetypes=None):
    """
    Tally junction counts for the SE and MXE events of one gene.

    gene - identifier stored under "descriptor" in the result
    splicedict - mapping with "chromosome", "strand", "tx_start", "tx_end"
                 and optionally "SE" / "MXE" event tables
    bam_file - path opened with pysam.Samfile(bam_file, 'rb'); the handle is
               closed once the reads have been converted
    alignment_slop - accepted but not used
    flip - True multiplies the strand by -1, any other non-None value keeps
           it; the result is mapped to "+" (1) or "-" (-1). With flip=None,
           or a strand other than 1 / -1, the strand passed on is None.
    splicetypes - container tested with `in` for "SE" / "MXE". The default
                  None raises TypeError as soon as splicedict holds one of
                  those keys.

    Returns a dict with "descriptor" and, per requested event type, one
    entry per event: "IN"/"EX" counts for SE, "A"/"B" counts for MXE.
    Counts are sums of the second value returned by readsToWiggle_pysam
    (jxns), looked up by the (int, int) tuple parsed from each "j<a>:<b>"
    structure.

    Raises Exception when splicedict or bam_file is None.
    """
    if splicedict is None or bam_file is None:
        raise Exception("assign_reads needs both splicedict and bam_file")

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        chrom = splicedict["chromosome"]
        strand = splicedict["strand"]
        tx_start = splicedict["tx_start"]
        tx_end = splicedict["tx_end"]

        signstrand = None
        if flip is not None:
            usestrand = strand * -1 if flip is True else strand
            if usestrand == 1:
                signstrand = "+"
            elif usestrand == -1:
                signstrand = "-"

        subset_reads = bam_fileobj.fetch(reference=chrom,
                                         start=tx_start, end=tx_end)
        # Only the second return value (jxns) is needed below.
        _wig, jxns, _counts, _lengths, _reads = readsToWiggle_pysam(
            subset_reads, (tx_start - 1000), (tx_end + 1000),
            signstrand, "center", False)
    finally:
        # Nothing after this point touches the BAM file.
        bam_fileobj.close()

    data = {"descriptor": gene}

    # Exon-body ("b...") structures are intentionally not counted for any
    # event type; only junction structures contribute.
    if "SE" in splicedict and "SE" in splicetypes:
        data["SE"] = {}
        for loc in splicedict["SE"]:
            event = splicedict["SE"][loc]
            data["SE"][loc] = {
                "IN": _sum_junction_counts(event["IN"], jxns),
                "EX": _sum_junction_counts(event["EX"], jxns),
            }

    if "MXE" in splicedict and "MXE" in splicetypes:
        data["MXE"] = {}
        for loc in splicedict["MXE"]:
            event = splicedict["MXE"][loc]
            data["MXE"][loc] = {
                "A": _sum_junction_counts(event["A"], jxns),
                "B": _sum_junction_counts(event["B"], jxns),
            }

    return data
def call_peaks(loc, gene_length, bam_fileobj=None, bam_file=None, margin=25,
               fdr_alpha=0.05, user_threshold=None, minreads=20,
               poisson_cutoff=0.05, plotit=False, w_cutoff=10,
               windowsize=1000, SloP=False, correct_p=False):
    """
    calls peaks for an individual gene

    loc - gene location, unpacked as
          (chrom, gene_name, tx_start, tx_end, signstrand)
    gene_length - effective length of gene
    takes bam file or bam file object. Serial uses object, parallel uses
    location (name). If both are given the object is used. A file opened
    here from bam_file is closed again before returning; an object passed
    in by the caller is left open.
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value Bonferroni corrected from
                peaks script (called in setup)
    user_threshold - user defined FDR threshold (probably should be
                     factored into fdr_alpha)
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for significance cut off for number of reads
                     in peak that gets called - might want to use a shifted
                     distribution
    plotit - makes figures
    w_cutoff - width cutoff, peaks narrower than this are discarded
    windowsize - for super local calculation, distance left and right to
                 look
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean Bonferroni correction of p-values from poisson

    Returns the result of peaks_from_info. Reports through verboseprint
    and calls exit() when neither bam_file nor bam_fileobj is given.
    """
    #setup
    chrom, gene_name, tx_start, tx_end, signstrand = loc

    #logic reading bam files
    #using a file object is faster for serial processing
    #but doesn't work in parallel
    owns_handle = False
    if bam_fileobj is None:
        if bam_file is None:
            verboseprint("""you have to provide either a bam file or a bam file object""")
            exit()
        bam_fileobj = pysam.Samfile(bam_file, 'rb')
        owns_handle = True

    try:
        tx_start, tx_end = int(tx_start), int(tx_end)
        subset_reads = bam_fileobj.fetch(reference=chrom,
                                         start=tx_start, end=tx_end)

        #need to document reads to wiggle
        wiggle, jxns, pos_counts, lengths, allreads = readsToWiggle_pysam(
            subset_reads, tx_start, tx_end, signstrand, "center", False)
    finally:
        # Close only what was opened here; a caller-supplied object is
        # shared across genes in serial mode and must stay open.
        if owns_handle:
            bam_fileobj.close()

    #TODO have a check to kill this if there aren't any reads in a region
    result = peaks_from_info(list(wiggle), pos_counts, lengths, loc,
                             gene_length, margin, fdr_alpha, user_threshold,
                             minreads, poisson_cutoff, plotit, w_cutoff,
                             windowsize, SloP, correct_p)
    return result
def _sum_junction_counts(structures, jxns):
    """Add up the jxns values for every 'j<int>:<int>' entry in structures.

    Entries that do not start with "j" (for example exon-body "b..."
    entries) are skipped; junctions missing from jxns contribute nothing.
    """
    total = 0
    for structure in structures:
        if not structure.startswith("j"):
            continue
        junction = tuple(map(int, structure.lstrip("j").split(":")))
        if junction in jxns:
            total += jxns[junction]
    return total


def assign_reads(gene, splicedict=None, bam_file=None, alignment_slop=10,
                 flip=True, splicetypes=None):
    """
    Tally junction counts (and, for SE events, region RPKs) for one gene.

    gene - identifier stored under "descriptor" in the result
    splicedict - mapping with "chromosome", "strand", "tx_start", "tx_end"
                 and optionally "SE" / "MXE" event tables
    bam_file - path opened with pysam.Samfile(bam_file, 'rb'); the handle is
               closed before returning
    alignment_slop - accepted but not used
    flip - when True the strand is multiplied by -1 before being mapped to
           "+" (1) or "-" (-1); any other strand value gives None
    splicetypes - container tested with `in` for "SE" / "MXE". The default
                  None raises TypeError as soon as splicedict holds one of
                  those keys.

    Returns a dict with "descriptor" and, per requested event type, one
    entry per event: SE events get "IN"/"EX" counts plus BODY/UP/DOWN/UPI/
    DOWNI "_RPK" values, MXE events get "A"/"B" counts. Counts are sums of
    the second value returned by readsToWiggle_pysam (jxns), looked up by
    the (int, int) tuple parsed from each "j<a>:<b>" structure. If an RPK
    computation fails for an SE event, a message is printed and that event
    keeps IN = EX = 0.

    Raises Exception when splicedict or bam_file is None.
    """
    if splicedict is None or bam_file is None:
        raise Exception("assign_reads needs both splicedict and bam_file")

    bam_fileobj = pysam.Samfile(bam_file, 'rb')
    try:
        data = {}
        chrom = splicedict["chromosome"]
        strand = splicedict["strand"]
        tx_start = splicedict["tx_start"]
        tx_end = splicedict["tx_end"]

        usestrand = strand * -1 if flip is True else strand
        signstrand = None
        if usestrand == 1:
            signstrand = "+"
        elif usestrand == -1:
            signstrand = "-"

        subset_reads = bam_fileobj.fetch(reference=chrom,
                                         start=tx_start, end=tx_end)
        # Only the second return value (jxns) is needed below.
        _wig, jxns, _counts, _lengths, _reads = readsToWiggle_pysam(
            subset_reads, (tx_start - 1000), (tx_end + 1000),
            signstrand, "center", False)

        data["descriptor"] = gene

        if "SE" in splicedict and "SE" in splicetypes:
            data["SE"] = {}
            for loc in splicedict["SE"]:
                event = splicedict["SE"][loc]
                tallies = data["SE"][loc] = {"IN": 0, "EX": 0}

                bodyLoc = event["BODY"]
                upLoc = event["UP"]
                downLoc = event["DOWN"]
                # Intron spans are built from the facing exon edges; which
                # edge comes first depends on the strand.
                if strand == 1:
                    upIntronLoc = (upLoc.split("-")[1] + "-" +
                                   bodyLoc.split("-")[0])
                    downIntronLoc = (bodyLoc.split("-")[1] + "-" +
                                     downLoc.split("-")[0])
                else:
                    upIntronLoc = (bodyLoc.split("-")[1] + "-" +
                                   upLoc.split("-")[0])
                    downIntronLoc = (downLoc.split("-")[1] + "-" +
                                     bodyLoc.split("-")[0])

                rpk_regions = (("BODY_RPK", bodyLoc),
                               ("UP_RPK", upLoc),
                               ("DOWN_RPK", downLoc),
                               ("UPI_RPK", upIntronLoc),
                               ("DOWNI_RPK", downIntronLoc))
                try:
                    for key, region in rpk_regions:
                        tallies[key] = region_rpk(
                            pybedtools.Interval(
                                chrom, *map(int, region.split("-")),
                                strand=signstrand),
                            bam_fileobj)
                except Exception:
                    # Best effort: report and skip this event, without
                    # trapping KeyboardInterrupt / SystemExit.
                    print("uh oh %s" % (gene + loc))
                    continue

                tallies["IN"] = _sum_junction_counts(event["IN"], jxns)
                tallies["EX"] = _sum_junction_counts(event["EX"], jxns)

        if "MXE" in splicedict and "MXE" in splicetypes:
            data["MXE"] = {}
            for loc in splicedict["MXE"]:
                event = splicedict["MXE"][loc]
                # Exon-body ("b...") structures are intentionally not
                # counted; only junction structures contribute.
                data["MXE"][loc] = {
                    "A": _sum_junction_counts(event["A"], jxns),
                    "B": _sum_junction_counts(event["B"], jxns),
                }

        return data
    finally:
        bam_fileobj.close()