def runBAMtoJunctionBED(paths_to_run):
    """Parse the junction entries of one BAM file.

    paths_to_run must unpack into exactly three items: the BAM file path, a
    BED reference directory and an output BED path. Only the BAM path is
    handed on to BAMtoJunctionBED.parseJunctionEntries (called with
    multi=True); its return value is returned unchanged.
    """
    bam_path, _reference_dir, junction_bed_path = paths_to_run
    ### The supplied output path is replaced by one derived from the BAM name.
    ### Nothing below reads it: the "only run if the file doesn't exist" check
    ### that would have used it is not active.
    junction_bed_path = bam_path.replace('.bam', '__junction.bed')
    return BAMtoJunctionBED.parseJunctionEntries(bam_path, multi=True)
def runBAMtoJunctionBED(paths_to_run):
    """Run junction extraction for a single (bam, reference, output) triple.

    The argument is unpacked into three values. The BAM path is forwarded to
    BAMtoJunctionBED.parseJunctionEntries with multi=True and that call's
    result is returned. The reference directory is not used, and the output
    path is recomputed from the BAM path but not read afterwards.
    """
    source_bam, _unused_reference_dir, derived_bed = paths_to_run
    ### '<name>.bam' -> '<name>__junction.bed'; kept although the existence
    ### check that consumed this path is currently switched off.
    derived_bed = source_bam.replace('.bam', '__junction.bed')
    junction_results = BAMtoJunctionBED.parseJunctionEntries(source_bam, multi=True)
    return junction_results
def _cigar_string_of(alignedread):
    """Return the read's CIGAR string, falling back to its ``cigar`` entries.

    If reading ``cigarstring`` raises, the entries of ``alignedread.cigar``
    are inspected instead: 'N' is returned when any entry starts with the
    code 3 (the code this module treats as a splice), otherwise None.
    """
    try:
        return alignedread.cigarstring
    except Exception:
        if any(operation[0] == 3 for operation in alignedread.cigar):
            return 'N'
        return None


def _spliced_read_touches_exon(alignedread, start, stop):
    """Decide whether a junction ('N') read should still count for the exon.

    The read counts when it starts or ends inside [start, stop], or when one
    of its splice sites (as reported by BAMtoJunctionBED.getSpliceSites)
    coincides with either exon boundary.
    """
    read_start = int(alignedread.pos)
    read_end = int(alignedread.pos + alignedread.alen)
    read_span = sorted([read_start, read_end])
    positions = sorted([read_start, read_end, start, stop])
    ### Hence, the read starts or ends in that interval
    if read_span[0] == positions[1] or read_span[1] == positions[2]:
        return True
    ### Also search for cases where part of the read is contained within the exon.
    ### Imported lazily: only spliced reads that fail the test above need it.
    import BAMtoJunctionBED
    coordinates, _ = BAMtoJunctionBED.getSpliceSites(alignedread.cigar, read_start)
    boundaries = (start, stop)
    for five_prime_ss, three_prime_ss in coordinates:
        five_prime_ss, three_prime_ss = int(five_prime_ss), int(three_prime_ss)
        if five_prime_ss in boundaries or three_prime_ss in boundaries:
            return True
    return False


def parseExonReferences(bam_dir, reference_exon_bed, multi=False):
    """Count BAM reads per reference exon and write them to '<bam>__exon.bed'.

    bam_dir is the path of the BAM file (opened with pysam.Samfile); the
    output path is that path with '.bam' replaced by '__exon.bed'.
    reference_exon_bed is a tab-delimited file whose first six columns are
    chromosome, start, stop, exon ID, a fifth column that is copied through,
    and strand. multi is accepted for interface compatibility and is not
    read here.

    For each reference row the reads returned by bamfile.fetch() are counted.
    A read whose CIGAR contains 'N' is only counted when it starts/ends inside
    the exon or has a splice site on an exon boundary. One output row is
    written per reference row: chromosome, start, stop, exon, column 5,
    strand, read count, '0', stop-start, '0'.

    Rows that raise while being processed (e.g. contigs absent from the BAM,
    non-numeric coordinates) are skipped without output. If the error message
    reports a BAM file without an index, a hint is printed and the process
    exits via sys.exit(). A reference row with fewer than six columns raises
    ValueError.
    """
    bamfile = pysam.Samfile(bam_dir, "rb")
    exon_bed = reference = None
    try:
        exon_bed = open(bam_dir.replace('.bam', '__exon.bed'), "w")
        reference = open(reference_exon_bed, 'rU')
        ### read each line one-at-a-time rather than loading all in memory
        for line in reference:
            #'12', '6998470', '6998522', 'ENSG00000111671:E1.1_ENSE00001754003', '0', '-'
            chrom, start, stop, exon, column5, strand = line.rstrip('\n').split('\t')[:6]
            try:
                ### Convert once so every later use sees integers
                start, stop = int(start), int(stop)
                read_count = 0
                for alignedread in bamfile.fetch(chrom, start, stop):
                    cigarstring = _cigar_string_of(alignedread)
                    ### Exclude junction reads ("N") unless they touch the exon
                    if (cigarstring is None or 'N' not in cigarstring
                            or _spliced_read_touches_exon(alignedread, start, stop)):
                        read_count += 1
                entries = [chrom, str(start), str(stop), exon, column5, strand,
                           str(read_count), '0', str(stop - start), '0']
                exon_bed.write('\t'.join(entries) + '\n')
            except Exception as e:
                ### Occurs also due to non-chromosome contigs in the annotation file.
                ### Search the message text: membership on the exception object
                ### itself never performs a substring match.
                if 'bamfile without index' in str(e):
                    print('Please ensure an index exists for the bam file:' + ' ' + str(bam_dir))
                    sys.exit()
    finally:
        ### Release every handle, including on the sys.exit() path
        for handle in (reference, exon_bed, bamfile):
            if handle is not None:
                handle.close()
def _cigar_string_of(alignedread):
    """Return the read's CIGAR string, falling back to its ``cigar`` entries.

    If reading ``cigarstring`` raises, the entries of ``alignedread.cigar``
    are inspected instead: 'N' is returned when any entry starts with the
    code 3 (the code this module treats as a splice), otherwise None.
    """
    try:
        return alignedread.cigarstring
    except Exception:
        if any(operation[0] == 3 for operation in alignedread.cigar):
            return 'N'
        return None


def _spliced_read_touches_exon(alignedread, start, stop):
    """Decide whether a junction ('N') read should still count for the exon.

    The read counts when it starts or ends inside [start, stop], or when one
    of its splice sites (as reported by BAMtoJunctionBED.getSpliceSites)
    coincides with either exon boundary.
    """
    read_start = int(alignedread.pos)
    read_end = int(alignedread.pos + alignedread.alen)
    read_span = sorted([read_start, read_end])
    positions = sorted([read_start, read_end, start, stop])
    ### Hence, the read starts or ends in that interval
    if read_span[0] == positions[1] or read_span[1] == positions[2]:
        return True
    ### Also search for cases where part of the read is contained within the exon.
    ### Imported lazily: only spliced reads that fail the test above need it.
    import BAMtoJunctionBED
    coordinates, _ = BAMtoJunctionBED.getSpliceSites(alignedread.cigar, read_start)
    boundaries = (start, stop)
    for five_prime_ss, three_prime_ss in coordinates:
        five_prime_ss, three_prime_ss = int(five_prime_ss), int(three_prime_ss)
        if five_prime_ss in boundaries or three_prime_ss in boundaries:
            return True
    return False


def _intron_boundary_side(bamfile, alignedread, start, stop):
    """Classify an unspliced read as spanning the 5' or 3' edge of the interval.

    Returns 'five' or 'three' when the read starts or ends outside
    [start, stop] and its paired-end mate is unspliced and lies entirely
    within [start, stop]; returns None otherwise. Errors from the mate
    lookup are left to the caller.
    """
    read_start = int(alignedread.pos)
    read_end = int(alignedread.pos + alignedread.alen)
    read_span = sorted([read_start, read_end])
    positions = sorted([read_start, read_end, start, stop])
    ### Hence, the read starts or ends OUTSIDE of that interval
    starts_outside = read_span[0] == positions[0]
    ends_outside = read_span[1] == positions[-1]
    if not (starts_outside or ends_outside):
        return None
    mate = bamfile.mate(alignedread)  ### look up the paired-end mate for this read
    mate_cigar = _cigar_string_of(mate)
    if mate_cigar is None or 'N' in mate_cigar:
        return None
    mate_start = int(mate.pos)
    mate_end = int(mate.pos + mate.alen)
    interval = sorted([start, stop])
    mate_positions = sorted([mate_start, mate_end, start, stop])
    if interval[0] == mate_positions[0] and interval[-1] == mate_positions[-1]:
        ### The 5' test takes precedence when both ends fall outside
        if starts_outside:
            return 'five'
        return 'three'
    return None


def _count_reads_for_exon(bamfile, chrom, start, stop, exon, strand):
    """Return (read_count, five_intron_count, three_intron_count) for one row."""
    read_count = 0
    five_intron_junction_count = 0
    three_intron_junction_count = 0
    for alignedread in bamfile.fetch(chrom, start, stop):
        cigarstring = _cigar_string_of(alignedread)
        try:
            ### TopHat/STAR knows which sequences are likely real splice sites
            ### so it assigns a real strand to the read
            read_strand = alignedread.opt('XS')
        except Exception:
            read_strand = None  ### TopHat doesn't predict strand for many reads
        ### Tries to ensure the proper strand reads are considered
        ### (if strand read info available)
        if read_strand is not None and read_strand != strand:
            continue
        if cigarstring is not None:
            if 'N' in cigarstring:
                ### Exclude junction reads ("N") unless they touch the exon
                if not _spliced_read_touches_exon(alignedread, start, stop):
                    continue
            elif 'I' in exon:
                ### Below code is for more accurate estimation of intron retention
                try:
                    side = _intron_boundary_side(bamfile, alignedread, start, stop)
                except Exception:
                    ### Usually an unmapped read; deliberately best-effort
                    side = None
                if side == 'five':
                    five_intron_junction_count += 1
                elif side == 'three':
                    three_intron_junction_count += 1
        read_count += 1
    return read_count, five_intron_junction_count, three_intron_junction_count


def _intron_junction_row(chrom, position, exon, count, strand, increment):
    """Build the junction BED fields for a 20-base window centred on position."""
    outlier_start = position - 10 + increment
    outlier_end = position + 10 + increment
    junction_id = exon + '-' + str(position)
    exon_lengths = '10,10'
    dist = '0,0'
    return [chrom, str(outlier_start), str(outlier_end), junction_id, str(count),
            strand, str(outlier_start), str(outlier_end), '255,0,0\t2',
            exon_lengths, '0,' + dist]


def parseExonReferences(bam_dir, reference_exon_bed, multi=False):
    """Count BAM reads per reference exon and report intron-boundary reads.

    bam_dir is the path of the BAM file (opened with pysam.Samfile).
    reference_exon_bed is a tab-delimited file whose first six columns are
    chromosome, start, stop, exon ID, a fifth column that is copied through,
    and strand. multi is accepted for interface compatibility and is not
    read here.

    Two files are written, both named from bam_dir with '.bam' replaced:
      * '__exon.bed' (opened for writing): one row per reference row with
        chromosome, start, stop, exon, column 5, strand, read count, '0',
        stop-start, '0'. Reads carrying an 'XS' tag that differs from the
        row's strand are ignored; junction ('N') reads only count when they
        touch the exon.
      * '__junction.bed' (opened through AppendOrWrite): for rows whose exon
        ID contains 'I' and that have both 5' and 3' intron-boundary reads,
        two rows are written, one window around start and one around stop.

    Rows that raise while being processed (e.g. contigs absent from the BAM,
    non-numeric coordinates) are skipped. If the error message reports a BAM
    file without an index, a hint is printed and the process exits via
    sys.exit(). A reference row with fewer than six columns raises ValueError.
    """
    bamfile = pysam.Samfile(bam_dir, "rb")
    exon_bed = junction_bed = reference = None
    try:
        exon_bed = open(bam_dir.replace('.bam', '__exon.bed'), "w")
        junction_bed = AppendOrWrite(bam_dir.replace('.bam', '__junction.bed'))
        reference = open(reference_exon_bed, 'rU')
        ### read each line one-at-a-time rather than loading all in memory
        for line in reference:
            #'12', '6998470', '6998522', 'ENSG00000111671:E1.1_ENSE00001754003', '0', '-'
            chrom, start, stop, exon, column5, strand = line.rstrip('\n').split('\t')[:6]
            try:
                ### Convert once so every later use sees integers
                start, stop = int(start), int(stop)
                counts = _count_reads_for_exon(bamfile, chrom, start, stop, exon, strand)
                read_count, five_intron_junction_count, three_intron_junction_count = counts
                entries = [chrom, str(start), str(stop), exon, column5, strand,
                           str(read_count), '0', str(stop - start), '0']
                exon_bed.write('\t'.join(entries) + '\n')
                if 'I' in exon and five_intron_junction_count > 0 and three_intron_junction_count > 0:
                    ### 5' junction: the window is shifted by -1 on either strand
                    entries = _intron_junction_row(chrom, start, exon,
                                                   five_intron_junction_count, strand, -1)
                    junction_bed.write('\t'.join(entries) + '\n')
                    ### 3' junction: no shift on either strand
                    entries = _intron_junction_row(chrom, stop, exon,
                                                   three_intron_junction_count, strand, 0)
                    junction_bed.write('\t'.join(entries) + '\n')
            except Exception as e:
                ### Handler restored to match the other parseExonReferences
                ### definitions in this file; also hit for non-chromosome
                ### contigs in the annotation file. The message text is
                ### searched because membership on the exception object itself
                ### never performs a substring match.
                if 'bamfile without index' in str(e):
                    print('Please ensure an index exists for the bam file:' + ' ' + str(bam_dir))
                    sys.exit()
    finally:
        ### Release every handle, including on the sys.exit() path.
        ### NOTE(review): assumes AppendOrWrite returns a file-like object
        ### that exposes close() -- confirm against its definition.
        for handle in (reference, junction_bed, exon_bed, bamfile):
            if handle is not None:
                handle.close()
def _cigar_string_of(alignedread):
    """Return the read's CIGAR string, falling back to its ``cigar`` entries.

    If reading ``cigarstring`` raises, the entries of ``alignedread.cigar``
    are inspected instead: 'N' is returned when any entry starts with the
    code 3 (the code this module treats as a splice), otherwise None.
    """
    try:
        return alignedread.cigarstring
    except Exception:
        if any(operation[0] == 3 for operation in alignedread.cigar):
            return 'N'
        return None


def _spliced_read_touches_exon(alignedread, start, stop):
    """Decide whether a junction ('N') read should still count for the exon.

    The read counts when it starts or ends inside [start, stop], or when one
    of its splice sites (as reported by BAMtoJunctionBED.getSpliceSites)
    coincides with either exon boundary.
    """
    read_start = int(alignedread.pos)
    read_end = int(alignedread.pos + alignedread.alen)
    read_span = sorted([read_start, read_end])
    positions = sorted([read_start, read_end, start, stop])
    ### Hence, the read starts or ends in that interval
    if read_span[0] == positions[1] or read_span[1] == positions[2]:
        return True
    ### Also search for cases where part of the read is contained within the exon.
    ### Imported lazily: only spliced reads that fail the test above need it.
    import BAMtoJunctionBED
    coordinates, _ = BAMtoJunctionBED.getSpliceSites(alignedread.cigar, read_start)
    boundaries = (start, stop)
    for five_prime_ss, three_prime_ss in coordinates:
        five_prime_ss, three_prime_ss = int(five_prime_ss), int(three_prime_ss)
        if five_prime_ss in boundaries or three_prime_ss in boundaries:
            return True
    return False


def parseExonReferences(bam_dir, reference_exon_bed, multi=False):
    """Count BAM reads per reference exon and write them to '<bam>__exon.bed'.

    bam_dir is the path of the BAM file (opened with pysam.Samfile); the
    output path is that path with '.bam' replaced by '__exon.bed'.
    reference_exon_bed is a tab-delimited file whose first six columns are
    chromosome, start, stop, exon ID, a fifth column that is copied through,
    and strand. multi is accepted for interface compatibility and is not
    read here.

    For each reference row the reads returned by bamfile.fetch() are counted.
    A read whose CIGAR contains 'N' is only counted when it starts/ends inside
    the exon or has a splice site on an exon boundary. One output row is
    written per reference row: chromosome, start, stop, exon, column 5,
    strand, read count, '0', stop-start, '0'.

    Rows that raise while being processed (e.g. contigs absent from the BAM,
    non-numeric coordinates) are skipped without output. If the error message
    reports a BAM file without an index, a hint is printed and the process
    exits via sys.exit(). A reference row with fewer than six columns raises
    ValueError.
    """
    bamfile = pysam.Samfile(bam_dir, "rb")
    exon_bed = reference = None
    try:
        exon_bed = open(bam_dir.replace('.bam', '__exon.bed'), "w")
        reference = open(reference_exon_bed, 'rU')
        ### read each line one-at-a-time rather than loading all in memory
        for line in reference:
            #'12', '6998470', '6998522', 'ENSG00000111671:E1.1_ENSE00001754003', '0', '-'
            chrom, start, stop, exon, column5, strand = line.rstrip('\n').split('\t')[:6]
            try:
                ### Convert once so every later use sees integers
                start, stop = int(start), int(stop)
                read_count = 0
                for alignedread in bamfile.fetch(chrom, start, stop):
                    cigarstring = _cigar_string_of(alignedread)
                    ### Exclude junction reads ("N") unless they touch the exon
                    if (cigarstring is None or 'N' not in cigarstring
                            or _spliced_read_touches_exon(alignedread, start, stop)):
                        read_count += 1
                entries = [chrom, str(start), str(stop), exon, column5, strand,
                           str(read_count), '0', str(stop - start), '0']
                exon_bed.write('\t'.join(entries) + '\n')
            except Exception as e:
                ### Occurs also due to non-chromosome contigs in the annotation file.
                ### Search the message text: membership on the exception object
                ### itself never performs a substring match.
                if 'bamfile without index' in str(e):
                    print('Please ensure an index exists for the bam file:' + ' ' + str(bam_dir))
                    sys.exit()
    finally:
        ### Release every handle, including on the sys.exit() path
        for handle in (reference, exon_bed, bamfile):
            if handle is not None:
                handle.close()