def main():
    """Report the longest exon of every alignment in a PSL file.

    Command line: ``<psl filename> [smoothing parameter]``.  The smoothing
    parameter defaults to 10 and is only read when exactly two arguments
    follow the script name.

    Every input line is parsed with ``psl_basics``, converted to a genepred
    dictionary and passed through ``genepred_basics.smooth_gaps``.  One
    tab-separated line is printed per alignment: tName, start and end of the
    longest exon, qName, qSize, tSize and the length of that exon.  When
    several exons share the maximum length, the first one wins.
    """
    argv = sys.argv
    if len(argv) < 2:
        print(argv[0] + " <psl filename> <smoothing parameter (default 10)>")
        sys.exit()

    smoothing = int(argv[2]) if len(argv) == 3 else 10

    with open(argv[1]) as handle:
        for raw in handle:
            psl_entry = psl_basics.read_psl_entry(raw.rstrip())
            gpd_entry = genepred_basics.genepred_line_to_dictionary(
                psl_basics.convert_entry_to_genepred_line(psl_entry))
            smoothed = genepred_basics.smooth_gaps(gpd_entry, smoothing)

            # Track the single longest exon; all three values stay 0 when
            # the entry reports no exon longer than zero bases.
            best_len, best_start, best_end = 0, 0, 0
            for idx in range(smoothed['exonCount']):
                start = smoothed['exonStarts'][idx]
                end = smoothed['exonEnds'][idx]
                if end - start > best_len:
                    best_len = end - start
                    best_start = start
                    best_end = end

            columns = [
                psl_entry['tName'],
                str(best_start),
                str(best_end),
                psl_entry['qName'],
                str(psl_entry['qSize']),
                str(psl_entry['tSize']),
                str(best_len),
            ]
            print("\t".join(columns))
def make_best_continuous_alignment_bed(alignmentfname, bestalignmentbedfname):
    """Write the longest continuous aligned block per (read, target) pair.

    Each non-comment line of ``alignmentfname`` is parsed as a PSL entry,
    converted to a genepred dictionary and passed through
    ``genepred_basics.smooth_gaps`` with a size of 10.  For every
    combination of genepred ``name`` and ``chrom`` the longest exon seen
    across all lines is remembered, together with the ``tSize``, ``qSize``
    and ``matches`` values of the PSL entry it came from.

    Args:
        alignmentfname: path of the PSL file to read; lines starting with
            ``#`` are skipped.
        bestalignmentbedfname: path of the tab-separated file to write.
            Columns are: chrom, start, end, name, query length, target
            length, best block length, matches.

    Returns:
        None.
    """
    bestcont = {}
    # The output handle is managed by ``with`` so it is flushed and closed
    # even when parsing fails part-way (it was previously never closed).
    with open(bestalignmentbedfname, 'w') as of:
        with open(alignmentfname) as f:
            for line in f:
                if line.startswith('#'):
                    continue
                pe = psl_basics.read_psl_entry(line)
                gl = psl_basics.convert_entry_to_genepred_line(pe)
                ge = genepred_basics.genepred_line_to_dictionary(gl)
                ges = genepred_basics.smooth_gaps(ge, 10)
                name = ges['name']
                chrom = ges['chrom']
                for i in range(len(ges['exonStarts'])):
                    exonlen = ges['exonEnds'][i] - ges['exonStarts'][i]
                    # Entries are created lazily inside the exon loop, so an
                    # alignment without exons leaves no record behind.
                    if name not in bestcont:
                        bestcont[name] = {}
                    if chrom not in bestcont[name]:
                        bestcont[name][chrom] = {
                            'bestlen': 0,
                            'start': 0,
                            'end': 0,
                            'target_length': 0,
                            'query_length': 0,
                            'matches': 0,
                        }
                    best = bestcont[name][chrom]
                    if exonlen > best['bestlen']:
                        best['bestlen'] = exonlen
                        best['start'] = ges['exonStarts'][i]
                        best['end'] = ges['exonEnds'][i]
                        best['target_length'] = pe['tSize']
                        best['query_length'] = pe['qSize']
                        best['matches'] = pe['matches']

        for read in bestcont:
            for tx in bestcont[read]:
                best = bestcont[read][tx]
                of.write("\t".join([
                    tx,
                    str(best['start']),
                    str(best['end']),
                    read,
                    str(best['query_length']),
                    str(best['target_length']),
                    str(best['bestlen']),
                    str(best['matches']),
                ]) + "\n")
    return
def main():
    """Print every exon of every PSL alignment as a four-column BED-like line.

    Command line: ``<psl filename> [smoothing size]``.  Each input line is
    parsed with ``psl_basics`` and converted to a genepred dictionary.  When
    a smoothing size greater than zero is supplied (read only if exactly two
    arguments follow the script name), the entry is first passed through
    ``genepred_basics.smooth_gaps``.

    Output columns: chrom, exon start + 1, exon end, gene_name.
    """
    args = sys.argv
    if len(args) < 2:
        print(args[0] + ' <psl filename> <smoothing size (optional)>')
        sys.exit()

    smooth_size = int(args[2]) if len(args) == 3 else 0

    with open(args[1]) as infile:
        for record in infile:
            parsed = psl_basics.read_psl_entry(record)
            gpd = genepred_basics.genepred_line_to_dictionary(
                psl_basics.convert_entry_to_genepred_line(parsed))
            if smooth_size > 0:
                gpd = genepred_basics.smooth_gaps(gpd, smooth_size)
            for idx, exon_start in enumerate(gpd['exonStarts']):
                fields = [
                    gpd['chrom'],
                    str(exon_start + 1),
                    str(gpd['exonEnds'][idx]),
                    gpd['gene_name'],
                ]
                print("\t".join(fields))
def make_fusion_genepred(psl_filename, min_gap_in_block_size, outfile):
    """Convert paired fusion PSL alignments to renamed genepred entries.

    Lines of ``psl_filename`` are treated as alternating left/right members
    of a pair (even line index = left, odd = right).  Each line is converted
    to a genepred dictionary, its cds bounds are set to its tx bounds, and
    it is passed through ``genepred_basics.smooth_gaps``.  The length field
    embedded in ``gene_name`` is replaced by the summed exon length of the
    result, and the entry is written to ``outfile``.

    Args:
        psl_filename: path of the PSL file holding the paired alignments.
        min_gap_in_block_size: value handed to ``smooth_gaps``.
        outfile: path of the genepred file to write, one entry per line.

    Returns:
        dict mapping fusion id (``F<digits>``) to a two-item list indexed by
        side (0 = left, 1 = right); each filled slot is
        ``[genepred_dict, fusion_side_sign]`` and unfilled slots are 0.

    Raises:
        AttributeError: if a ``gene_name`` does not match the expected
            ``<number>_<length>|F<id><sign>...`` layout (the regex match is
            None).
    """
    # Raw strings keep the backslash escapes literal; compiling once avoids
    # re-resolving the patterns for every input line.
    fusion_tag_re = re.compile(r'\|(F[\d]+)([+-])')
    rename_re = re.compile(r'^([\d\.]+_)\d+(\|F\d+[+-].*)$')

    fusion_gpd = {}
    # ``with`` guarantees the output is closed even if a line fails to parse.
    with open(outfile, 'w') as ofile:
        with open(psl_filename) as infile:
            for i, line in enumerate(infile):
                m = i % 2  # m == 0 is left, m == 1 is right
                entry = psl_basics.read_psl_entry(line)
                gline = psl_basics.convert_entry_to_genepred_line(entry)
                e = genepred_basics.genepred_line_to_dictionary(gline)

                look = fusion_tag_re.search(e['gene_name'])
                fid = look.group(1)
                fsidesign = look.group(2)  # which side is the fusion on

                e['cdsStart'] = e['txStart']
                e['cdsEnd'] = e['txEnd']
                f = genepred_basics.smooth_gaps(e, min_gap_in_block_size)

                # get new transcript length
                newlen = sum(f['exonEnds'][j] - f['exonStarts'][j]
                             for j in range(f['exonCount']))

                # make a new name based on the new length
                look = rename_re.match(f['gene_name'])
                f['gene_name'] = look.group(1) + str(newlen) + look.group(2)

                gpline = genepred_basics.genepred_entry_to_genepred_line(f)
                ofile.write(gpline + "\n")

                if fid not in fusion_gpd:
                    fusion_gpd[fid] = [0, 0]
                # send back the genepred and the side the fusion is on
                fusion_gpd[fid][m] = [f, fsidesign]
    return fusion_gpd
def make_fusion_genepred(psl_filename, min_gap_in_block_size, outfile):
    """Convert paired fusion PSL alignments to renamed genepred entries.

    Lines of ``psl_filename`` are treated as alternating left/right members
    of a pair (even line index = left, odd = right).  Each line is converted
    to a genepred dictionary, its cds bounds are set to its tx bounds, and
    it is passed through ``genepred_basics.smooth_gaps``.  The length field
    embedded in ``gene_name`` is replaced by the summed exon length of the
    result, and the entry is written to ``outfile``.

    Args:
        psl_filename: path of the PSL file holding the paired alignments.
        min_gap_in_block_size: value handed to ``smooth_gaps``.
        outfile: path of the genepred file to write, one entry per line.

    Returns:
        dict mapping fusion id (``F<digits>``) to a two-item list indexed by
        side (0 = left, 1 = right); each filled slot is
        ``[genepred_dict, fusion_side_sign]`` and unfilled slots are 0.

    Raises:
        AttributeError: if a ``gene_name`` does not match the expected
            ``<number>_<length>|F<id><sign>...`` layout (the regex match is
            None).
    """
    # Raw strings keep the backslash escapes literal; compiling once avoids
    # re-resolving the patterns for every input line.
    fusion_tag_re = re.compile(r'\|(F[\d]+)([+-])')
    rename_re = re.compile(r'^([\d\.]+_)\d+(\|F\d+[+-].*)$')

    fusion_gpd = {}
    # ``with`` guarantees the output is closed even if a line fails to parse.
    with open(outfile, 'w') as ofile:
        with open(psl_filename) as infile:
            for i, line in enumerate(infile):
                m = i % 2  # m == 0 is left, m == 1 is right
                entry = psl_basics.read_psl_entry(line)
                gline = psl_basics.convert_entry_to_genepred_line(entry)
                e = genepred_basics.genepred_line_to_dictionary(gline)

                look = fusion_tag_re.search(e['gene_name'])
                fid = look.group(1)
                fsidesign = look.group(2)  # which side is the fusion on

                e['cdsStart'] = e['txStart']
                e['cdsEnd'] = e['txEnd']
                f = genepred_basics.smooth_gaps(e, min_gap_in_block_size)

                # get new transcript length
                newlen = sum(f['exonEnds'][j] - f['exonStarts'][j]
                             for j in range(f['exonCount']))

                # make a new name based on the new length
                look = rename_re.match(f['gene_name'])
                f['gene_name'] = look.group(1) + str(newlen) + look.group(2)

                gpline = genepred_basics.genepred_entry_to_genepred_line(f)
                ofile.write(gpline + "\n")

                if fid not in fusion_gpd:
                    fusion_gpd[fid] = [0, 0]
                # send back the genepred and the side the fusion is on
                fusion_gpd[fid][m] = [f, fsidesign]
    return fusion_gpd
def main():
    """Emit one BED-like line per exon for each alignment in a PSL file.

    Usage: ``<psl filename> [smoothing size]``.  The optional smoothing size
    is honoured only when exactly two arguments follow the script name and
    it is greater than zero; in that case each genepred entry goes through
    ``genepred_basics.smooth_gaps`` before its exons are printed.

    Printed columns (tab separated): chrom, exon start + 1, exon end,
    gene_name.
    """
    if len(sys.argv) < 2:
        usage = sys.argv[0] + ' <psl filename> <smoothing size (optional)>'
        print(usage)
        sys.exit()

    source_path = sys.argv[1]
    gap_size = 0
    if len(sys.argv) == 3:
        gap_size = int(sys.argv[2])

    with open(source_path) as psl_handle:
        for text in psl_handle:
            alignment = psl_basics.read_psl_entry(text)
            gpd_text = psl_basics.convert_entry_to_genepred_line(alignment)
            exons = genepred_basics.genepred_line_to_dictionary(gpd_text)
            if gap_size > 0:
                exons = genepred_basics.smooth_gaps(exons, gap_size)
            for pos, left in enumerate(exons['exonStarts']):
                right = exons['exonEnds'][pos]
                print(exons['chrom'] + "\t" + str(left + 1) + "\t" +
                      str(right) + "\t" + exons['gene_name'])
def make_fusion_genepred(psl_filename, fid_coords, min_gap_in_block_size, outfile):
    """Build genepred entries for fusion alignments, snapped to given junctions.

    Lines of ``psl_filename`` are treated as alternating left/right members
    of a pair (even line index = left, odd = right).  Only fusion ids present
    in ``fid_coords`` are processed.  For each kept line the alignment end
    facing the fusion is moved to the supplied coordinate by extending or
    trimming the genepred (or left untouched when it already matches), the
    cds bounds are set to the tx bounds, and the entry is passed through
    ``genepred_basics.smooth_gaps``.  The length and position fields embedded
    in ``gene_name`` are rewritten and the entry is written to ``outfile``.

    Args:
        psl_filename: path of the PSL file holding the paired alignments.
        fid_coords: dict ``fid -> {'left': triple, 'right': triple}``; the
            second item of each three-item triple is the junction coordinate
            (converted with ``int``).  The other two items are unused here.
        min_gap_in_block_size: value handed to ``smooth_gaps``.
        outfile: path of the genepred file to write, one entry per line.

    Returns:
        dict mapping fusion id to a two-item list indexed by side (0 = left,
        1 = right); each filled slot is ``[genepred_dict, fusion_side_sign]``
        and unfilled slots are 0.

    Raises:
        AttributeError: if a ``gene_name`` does not match the expected
            layout (the regex match is None).
    """
    fusion_tag_re = re.compile(r'\|(F[\d]+)([+-])')
    rename_re = re.compile(r'^([\d\.]+_)\d+(\|F\d+[+-])\d+(\|.*)$')

    fusion_gpd = {}
    # ``with`` guarantees the output is closed even if a line fails to parse.
    with open(outfile, 'w') as ofile:
        with open(psl_filename) as infile:
            # The index advances for skipped lines too, so left/right
            # alternation stays aligned with the file.
            for i, line in enumerate(infile):
                m = i % 2  # m == 0 is left, m == 1 is right
                entry = psl_basics.read_psl_entry(line)
                gline = psl_basics.convert_entry_to_genepred_line(entry)
                d = genepred_basics.genepred_line_to_dictionary(gline)

                look = fusion_tag_re.search(d['gene_name'])
                fid = look.group(1)
                fsidesign = look.group(2)  # which side is the fusion on
                if fid not in fid_coords:
                    # only the fusions that met the short-read criteria
                    continue

                # fix our end points based on short read defined fusions
                side = 'right' if m == 1 else 'left'
                # Names avoid shadowing the builtins ``chr`` and ``dir``.
                _chrom, coord, _strand = fid_coords[fid][side]
                coord = int(coord)

                if fsidesign == '-':
                    # junction sits at the right end of the alignment
                    last_end = d['exonEnds'][d['exonCount'] - 1]
                    if coord == last_end:
                        e = d
                    elif coord > last_end:
                        e = genepred_basics.right_extend_genepred(d, coord)
                    else:
                        e = genepred_basics.right_trim_genepred(d, coord)
                    newpos = e['txEnd']
                else:
                    # junction sits at the left end of the alignment
                    first_start = d['exonStarts'][0]
                    target = coord - 1
                    # ``elif`` (not a second ``if``) so an already-matching
                    # start is kept as is instead of falling through to the
                    # trim branch, mirroring the right-hand side logic.
                    if target == first_start:
                        e = d
                    elif target < first_start:
                        e = genepred_basics.left_extend_genepred(d, target)
                    else:
                        e = genepred_basics.left_trim_genepred(d, target)
                    newpos = e['txStart']

                # we treat cdsStart and cdsEnd the same as tx
                e['cdsStart'] = e['txStart']
                e['cdsEnd'] = e['txEnd']
                f = genepred_basics.smooth_gaps(e, min_gap_in_block_size)

                # get new transcript length
                newlen = sum(f['exonEnds'][j] - f['exonStarts'][j]
                             for j in range(f['exonCount']))

                # make a new name based on the new fusion site and new length
                look = rename_re.match(f['gene_name'])
                f['gene_name'] = (look.group(1) + str(newlen) + look.group(2)
                                  + str(newpos) + look.group(3))

                gpline = genepred_basics.genepred_entry_to_genepred_line(f)
                ofile.write(gpline + "\n")

                if fid not in fusion_gpd:
                    fusion_gpd[fid] = [0, 0]
                # send back the genepred and the side the fusion is on
                fusion_gpd[fid][m] = [f, fsidesign]
    return fusion_gpd
def main():
    """Collect the long and short reads that support one fusion junction.

    Command line:
    ``<IDP temp dir> <junction (i.e. chr1:1000-/chr2:1000+)> <range> <output dir>``

    Reads from the IDP temp dir:
      * ``LR.psl_fusion_pair_filtered`` - long-read PSL lines, consumed two
        at a time as a pair,
      * ``uniqueness/fusion_read.map`` - short-read fusion mappings (read
        name in column 1, location string in column 3),
      * ``uniqueness/unmapped_shortread.fa`` - only the ``>`` header names
        are used,
      * ``uniqueness/txn.map`` - read names in column 1 are discarded.

    Writes into the output dir (created when missing):
    ``fusion_coordiante.txt``, ``long_read_query_locations.txt``,
    ``long_read_left.bed``, ``long_read_right.bed``, ``long_read_left.gpd``,
    ``long_read_right.gpd``, ``short_read_left.bed`` and
    ``short_read_right.bed``.  Progress counts are printed to stdout.

    Exits with status 1 before touching the output dir when the junction
    argument cannot be parsed.
    """
    if len(sys.argv) != 5:
        print(sys.argv[0] + ' <IDP temp dir> <junction (i.e. chr1:1000-/chr2:1000+)> <range> <output dir>')
        sys.exit()

    inputdir = sys.argv[1].rstrip('/')
    psl_filename = inputdir + '/' + 'LR.psl_fusion_pair_filtered'
    map_filename = inputdir + '/uniqueness/fusion_read.map'
    fasta_filename = inputdir + '/uniqueness/unmapped_shortread.fa'
    txn_filename = inputdir + '/uniqueness/txn.map'
    junction_abbreviation = sys.argv[2]
    myrange = int(sys.argv[3])
    outdir = sys.argv[4].rstrip('/')

    # Validate the junction before creating anything on disk; a malformed
    # value used to surface as an AttributeError after files were written.
    m = re.match(r'([^:]+):(\d+)([+-])\/([^:]+):(\d+)([+-])',
                 junction_abbreviation)
    if m is None:
        print("could not parse junction '" + junction_abbreviation
              + "' (expected a form like chr1:1000-/chr2:1000+)")
        sys.exit(1)
    chr1 = m.group(1)
    coo1 = int(m.group(2))
    dir1 = m.group(3)
    chr2 = m.group(4)
    coo2 = int(m.group(5))
    dir2 = m.group(6)
    # Strand signs of the junction when read from the other side.
    opp1 = opposite(dir1)
    opp2 = opposite(dir2)

    if not os.path.isdir(outdir):
        print("make directory " + outdir)
        os.mkdir(outdir)
    with open(outdir + "/fusion_coordiante.txt", 'w') as of1:
        of1.write(junction_abbreviation + "\t" + str(myrange) + "\n")

    smoothing = 30     # value passed to smooth_gaps for long reads
    min_exon_len = 30  # shorter exons are left out of the BED files

    def to_smoothed_gpd(psl_entry):
        # PSL entry -> genepred dictionary passed through smooth_gaps.
        return genepred_basics.smooth_gaps(
            genepred_basics.genepred_line_to_dictionary(
                psl_basics.convert_entry_to_genepred_line(psl_entry)),
            smoothing)

    def write_exons(handle, gpd, qname):
        # One BED line per exon of at least min_exon_len bases.
        for i in range(0, gpd['exonCount']):
            if gpd["exonEnds"][i] - gpd["exonStarts"][i] >= min_exon_len:
                handle.write(gpd["chrom"] + "\t"
                             + str(gpd["exonStarts"][i] + 1) + "\t"
                             + str(gpd["exonEnds"][i]) + "\t" + qname + "\n")

    # Work through psl file for long reads
    lrcnt = 0
    with open(outdir + "/long_read_query_locations.txt", 'w') as of, \
            open(outdir + "/long_read_left.bed", 'w') as oleft, \
            open(outdir + "/long_read_right.bed", 'w') as oright, \
            open(outdir + "/long_read_left.gpd", 'w') as oleftgpd, \
            open(outdir + "/long_read_right.gpd", 'w') as orightgpd, \
            open(psl_filename) as psl:
        while True:
            # Lines come in pairs; an empty/blank line or EOF ends the scan.
            l1 = psl.readline().rstrip()
            if not l1:
                break
            l2 = psl.readline().rstrip()
            if not l2:
                break
            e1 = psl_basics.read_psl_entry(l1)
            e2 = psl_basics.read_psl_entry(l2)
            g1 = to_smoothed_gpd(e1)
            g2 = to_smoothed_gpd(e2)

            if check_coordiantes(e1, e2, chr1, coo1, dir1,
                                 chr2, coo2, dir2, myrange):
                left_gpd, right_gpd = g1, g2
                first_dir, second_dir = dir1, dir2
            elif check_coordiantes(e1, e2, chr2, coo2, opp2,
                                   chr1, coo1, opp1, myrange):
                # Pair is in the reverse orientation: swap the sides.
                left_gpd, right_gpd = g2, g1
                first_dir, second_dir = opp2, opp1
            else:
                continue

            oleftgpd.write(
                genepred_basics.genepred_entry_to_genepred_line(left_gpd) + "\n")
            orightgpd.write(
                genepred_basics.genepred_entry_to_genepred_line(right_gpd) + "\n")
            lrcnt += 1
            of.write(e1["qName"] + "\t" + str(e1['qStart'] + 1) + "\t"
                     + str(e1['qEnd']) + "\t" + first_dir + "\t"
                     + str(e2['qStart'] + 1) + "\t" + str(e2['qEnd']) + "\t"
                     + second_dir + "\n")
            # Both halves are labelled with the first entry's query name.
            write_exons(oleft, left_gpd, e1["qName"])
            write_exons(oright, right_gpd, e1["qName"])
    print(str(lrcnt) + " long reads found supporting the fusion")

    # Work through fusion read map for short reads
    chrom_re = re.compile(r'^([^:]+):.*\/([^:]+):')
    coord_re = re.compile(
        r'[,:](-?\d+)-(-?\d+)([+-])\/[^:]+:(-?\d+)-(-?\d+),?.*([+-])')
    rnames = {}
    seenhit = {}
    with open(map_filename) as inf:
        for line in inf:
            fields = line.rstrip().split("\t")
            srname = fields[0]
            loc = fields[2]
            m = chrom_re.search(loc)
            srchr1 = m.group(1)
            srchr2 = m.group(2)
            m = coord_re.search(loc)
            srcoo1start = int(m.group(1))
            srcoo1finish = int(m.group(2))
            srdir1 = m.group(3)
            srcoo2start = int(m.group(4))
            srcoo2finish = int(m.group(5))
            srdir2 = m.group(6)

            # The junction-facing coordinate depends on the strand sign.
            srcooleft = srcoo1start if srdir1 == '-' else srcoo1finish
            srcooright = srcoo2finish if srdir2 == '-' else srcoo2start

            seg1 = (srchr1 + "\t" + str(srcoo1start) + "\t"
                    + str(srcoo1finish) + "\t" + srname)
            seg2 = (srchr2 + "\t" + str(srcoo2start) + "\t"
                    + str(srcoo2finish) + "\t" + srname)

            if (srdir1 == dir1 and srchr1 == chr1 and srdir2 == dir2
                    and srchr2 == chr2 and srcooleft == coo1
                    and srcooright == coo2):
                rnames[srname] = {'left': seg1, 'right': seg2}
                seenhit[srname] = seenhit.get(srname, 0) + 1
            # Deliberately a second ``if``: a line matching both
            # orientations counts as two hits.
            if (srdir1 == opp1 and srchr1 == chr2 and srdir2 == opp2
                    and srchr2 == chr1 and srcooleft == coo2
                    and srcooright == coo1):
                rnames[srname] = {'left': seg2, 'right': seg1}
                seenhit[srname] = seenhit.get(srname, 0) + 1
    print("found " + str(len(rnames)) + " short reads")

    for srname in seenhit:
        if seenhit[srname] > 1:
            print("removing " + srname)
            del rnames[srname]
    print("found " + str(len(rnames))
          + " short reads with no multihits among fusions")

    validreads = set()
    with open(fasta_filename) as inf:
        for line in inf:
            header = line.rstrip()
            if header.startswith('>'):
                validreads.add(header[1:])
    # Iterate over a snapshot because entries are deleted inside the loop.
    for rname in list(rnames):
        if rname not in validreads:
            print("removing " + rname)
            del rnames[rname]
    print("found " + str(len(rnames))
          + " unique short reads with no hits in the genome")

    with open(txn_filename) as inf:
        for line in inf:
            fields = line.rstrip().split("\t")
            if fields[0] in rnames:
                print("removing " + fields[0])
                del rnames[fields[0]]
    print("found " + str(len(rnames))
          + " unique short reads with no hits in the genome or transcriptome")

    with open(outdir + "/short_read_left.bed", 'w') as oleft, \
            open(outdir + "/short_read_right.bed", 'w') as oright:
        for rname in rnames:
            oleft.write(rnames[rname]['left'] + "\n")
            oright.write(rnames[rname]['right'] + "\n")
def main():
    """Collect the long and short reads that support one fusion junction.

    Command line:
    ``<IDP temp dir> <junction (i.e. chr1:1000-/chr2:1000+)> <range> <output dir>``

    Reads from the IDP temp dir:
      * ``LR.psl_fusion_pair_filtered`` - long-read PSL lines, consumed two
        at a time as a pair,
      * ``uniqueness/fusion_read.map`` - short-read fusion mappings (read
        name in column 1, location string in column 3),
      * ``uniqueness/unmapped_shortread.fa`` - only the ``>`` header names
        are used,
      * ``uniqueness/txn.map`` - read names in column 1 are discarded.

    Writes into the output dir (created when missing):
    ``fusion_coordiante.txt``, ``long_read_query_locations.txt``,
    ``long_read_left.bed``, ``long_read_right.bed``, ``long_read_left.gpd``,
    ``long_read_right.gpd``, ``short_read_left.bed`` and
    ``short_read_right.bed``.  Progress counts are printed to stdout.

    Exits with status 1 before touching the output dir when the junction
    argument cannot be parsed.
    """
    if len(sys.argv) != 5:
        print(sys.argv[0] + ' <IDP temp dir> <junction (i.e. chr1:1000-/chr2:1000+)> <range> <output dir>')
        sys.exit()

    inputdir = sys.argv[1].rstrip('/')
    psl_filename = inputdir + '/' + 'LR.psl_fusion_pair_filtered'
    map_filename = inputdir + '/uniqueness/fusion_read.map'
    fasta_filename = inputdir + '/uniqueness/unmapped_shortread.fa'
    txn_filename = inputdir + '/uniqueness/txn.map'
    junction_abbreviation = sys.argv[2]
    myrange = int(sys.argv[3])
    outdir = sys.argv[4].rstrip('/')

    # Validate the junction before creating anything on disk; a malformed
    # value used to surface as an AttributeError after files were written.
    m = re.match(r'([^:]+):(\d+)([+-])\/([^:]+):(\d+)([+-])',
                 junction_abbreviation)
    if m is None:
        print("could not parse junction '" + junction_abbreviation
              + "' (expected a form like chr1:1000-/chr2:1000+)")
        sys.exit(1)
    chr1 = m.group(1)
    coo1 = int(m.group(2))
    dir1 = m.group(3)
    chr2 = m.group(4)
    coo2 = int(m.group(5))
    dir2 = m.group(6)
    # Strand signs of the junction when read from the other side.
    opp1 = opposite(dir1)
    opp2 = opposite(dir2)

    if not os.path.isdir(outdir):
        print("make directory " + outdir)
        os.mkdir(outdir)
    with open(outdir + "/fusion_coordiante.txt", 'w') as of1:
        of1.write(junction_abbreviation + "\t" + str(myrange) + "\n")

    smoothing = 30     # value passed to smooth_gaps for long reads
    min_exon_len = 30  # shorter exons are left out of the BED files

    def to_smoothed_gpd(psl_entry):
        # PSL entry -> genepred dictionary passed through smooth_gaps.
        return genepred_basics.smooth_gaps(
            genepred_basics.genepred_line_to_dictionary(
                psl_basics.convert_entry_to_genepred_line(psl_entry)),
            smoothing)

    def write_exons(handle, gpd, qname):
        # One BED line per exon of at least min_exon_len bases.
        for i in range(0, gpd['exonCount']):
            if gpd["exonEnds"][i] - gpd["exonStarts"][i] >= min_exon_len:
                handle.write(gpd["chrom"] + "\t"
                             + str(gpd["exonStarts"][i] + 1) + "\t"
                             + str(gpd["exonEnds"][i]) + "\t" + qname + "\n")

    # Work through psl file for long reads
    lrcnt = 0
    with open(outdir + "/long_read_query_locations.txt", 'w') as of, \
            open(outdir + "/long_read_left.bed", 'w') as oleft, \
            open(outdir + "/long_read_right.bed", 'w') as oright, \
            open(outdir + "/long_read_left.gpd", 'w') as oleftgpd, \
            open(outdir + "/long_read_right.gpd", 'w') as orightgpd, \
            open(psl_filename) as psl:
        while True:
            # Lines come in pairs; an empty/blank line or EOF ends the scan.
            l1 = psl.readline().rstrip()
            if not l1:
                break
            l2 = psl.readline().rstrip()
            if not l2:
                break
            e1 = psl_basics.read_psl_entry(l1)
            e2 = psl_basics.read_psl_entry(l2)
            g1 = to_smoothed_gpd(e1)
            g2 = to_smoothed_gpd(e2)

            if check_coordiantes(e1, e2, chr1, coo1, dir1,
                                 chr2, coo2, dir2, myrange):
                left_gpd, right_gpd = g1, g2
                first_dir, second_dir = dir1, dir2
            elif check_coordiantes(e1, e2, chr2, coo2, opp2,
                                   chr1, coo1, opp1, myrange):
                # Pair is in the reverse orientation: swap the sides.
                left_gpd, right_gpd = g2, g1
                first_dir, second_dir = opp2, opp1
            else:
                continue

            oleftgpd.write(
                genepred_basics.genepred_entry_to_genepred_line(left_gpd) + "\n")
            orightgpd.write(
                genepred_basics.genepred_entry_to_genepred_line(right_gpd) + "\n")
            lrcnt += 1
            of.write(e1["qName"] + "\t" + str(e1['qStart'] + 1) + "\t"
                     + str(e1['qEnd']) + "\t" + first_dir + "\t"
                     + str(e2['qStart'] + 1) + "\t" + str(e2['qEnd']) + "\t"
                     + second_dir + "\n")
            # Both halves are labelled with the first entry's query name.
            write_exons(oleft, left_gpd, e1["qName"])
            write_exons(oright, right_gpd, e1["qName"])
    print(str(lrcnt) + " long reads found supporting the fusion")

    # Work through fusion read map for short reads
    chrom_re = re.compile(r'^([^:]+):.*\/([^:]+):')
    coord_re = re.compile(
        r'[,:](-?\d+)-(-?\d+)([+-])\/[^:]+:(-?\d+)-(-?\d+),?.*([+-])')
    rnames = {}
    seenhit = {}
    with open(map_filename) as inf:
        for line in inf:
            fields = line.rstrip().split("\t")
            srname = fields[0]
            loc = fields[2]
            m = chrom_re.search(loc)
            srchr1 = m.group(1)
            srchr2 = m.group(2)
            m = coord_re.search(loc)
            srcoo1start = int(m.group(1))
            srcoo1finish = int(m.group(2))
            srdir1 = m.group(3)
            srcoo2start = int(m.group(4))
            srcoo2finish = int(m.group(5))
            srdir2 = m.group(6)

            # The junction-facing coordinate depends on the strand sign.
            srcooleft = srcoo1start if srdir1 == '-' else srcoo1finish
            srcooright = srcoo2finish if srdir2 == '-' else srcoo2start

            seg1 = (srchr1 + "\t" + str(srcoo1start) + "\t"
                    + str(srcoo1finish) + "\t" + srname)
            seg2 = (srchr2 + "\t" + str(srcoo2start) + "\t"
                    + str(srcoo2finish) + "\t" + srname)

            if (srdir1 == dir1 and srchr1 == chr1 and srdir2 == dir2
                    and srchr2 == chr2 and srcooleft == coo1
                    and srcooright == coo2):
                rnames[srname] = {'left': seg1, 'right': seg2}
                seenhit[srname] = seenhit.get(srname, 0) + 1
            # Deliberately a second ``if``: a line matching both
            # orientations counts as two hits.
            if (srdir1 == opp1 and srchr1 == chr2 and srdir2 == opp2
                    and srchr2 == chr1 and srcooleft == coo2
                    and srcooright == coo1):
                rnames[srname] = {'left': seg2, 'right': seg1}
                seenhit[srname] = seenhit.get(srname, 0) + 1
    print("found " + str(len(rnames)) + " short reads")

    for srname in seenhit:
        if seenhit[srname] > 1:
            print("removing " + srname)
            del rnames[srname]
    print("found " + str(len(rnames))
          + " short reads with no multihits among fusions")

    validreads = set()
    with open(fasta_filename) as inf:
        for line in inf:
            header = line.rstrip()
            if header.startswith('>'):
                validreads.add(header[1:])
    # Iterate over a snapshot because entries are deleted inside the loop.
    for rname in list(rnames):
        if rname not in validreads:
            print("removing " + rname)
            del rnames[rname]
    print("found " + str(len(rnames))
          + " unique short reads with no hits in the genome")

    with open(txn_filename) as inf:
        for line in inf:
            fields = line.rstrip().split("\t")
            if fields[0] in rnames:
                print("removing " + fields[0])
                del rnames[fields[0]]
    print("found " + str(len(rnames))
          + " unique short reads with no hits in the genome or transcriptome")

    with open(outdir + "/short_read_left.bed", 'w') as oleft, \
            open(outdir + "/short_read_right.bed", 'w') as oright:
        for rname in rnames:
            oleft.write(rnames[rname]['left'] + "\n")
            oright.write(rnames[rname]['right'] + "\n")