def process_bp(inFileName, outFileName, coordH, regionL):
    '''Write a browser track of read pairs that overlap any region in regionL.

    Every record of the gsnap result is used as a pair (rL[0], rL[1]).  The
    locus string of the first segment of each read is turned into a
    mygenome.locus and translated through coordH, which is indexed as
    coordH[chrom][0] (replacement chromosome name) and coordH[chrom][1]
    (1-based start that is added to both ends) -- presumably a map from a
    targeted reference back to genome coordinates; confirm with the caller.

    For each pair with at least one locus overlapping a region, the pattern
    '^<read1>.*<rc(read2)>$' is written to stdout and both loci are written
    to outFileName as tab-separated chrom/start/end lines.

    Raises:
        Exception: a pair is not uniquely mapped on both reads with
            pairRel == 'unpaired'.
        KeyError: a locus chromosome is missing from coordH.
    '''
    import sys  # local import; used to write the pattern without an extra newline

    result = mygsnap.gsnapFile(inFileName, True)

    # 'with' closes (and flushes) the track file even when a record raises.
    with open(outFileName, 'w') as outFile:
        outFile.write('browser full knownGene\n')
        outFile.write('track name="%s" visibility=2\n' % inFileName)

        for rL in result:
            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'unpaired'):
                raise Exception('expected a uniquely mapped, unpaired read pair')

            locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]),
                    mygenome.locus(rL[1].matchL()[0].segL[0][2])]

            for loc in locL:
                coord = coordH[loc.chrom]
                shift = coord[1] - 1
                loc.chrSta += shift
                loc.chrEnd += shift
                loc.chrom = coord[0]

            if not any(loc.overlap(region) > 0
                       for loc in locL for region in regionL):
                continue

            # Same bytes as the former "print '...\n' % (...)," statement.
            sys.stdout.write('^%s.*%s$\n' % (rL[0].seq(), mybasic.rc(rL[1].seq())))

            for loc in locL:
                outFile.write('%s\t%s\t%s\n' % (loc.chrom, loc.chrSta, loc.chrEnd))
def process_bed(inFileName, outFileName, coordH):
    '''Write the translated loci of every read pair as a browser track.

    Every record of the gsnap result is used as a pair (rL[0], rL[1]).  The
    first-segment locus of each read is shifted by coordH[chrom][1] - 1 and
    renamed to coordH[chrom][0], then written to outFileName as a
    tab-separated chrom/start/end line below a fixed 'targeted' track header.

    Raises:
        Exception: a pair is not uniquely mapped on both reads with
            pairRel == 'unpaired'.
        KeyError: a locus chromosome is missing from coordH.
    '''
    result = mygsnap.gsnapFile(inFileName, True)

    # Context manager so the track file is always closed.
    with open(outFileName, 'w') as outFile:
        outFile.write('browser full knownGene\n')
        outFile.write('track name="targeted" visibility=2\n')

        for rL in result:
            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'unpaired'):
                raise Exception('expected a uniquely mapped, unpaired read pair')

            for read in (rL[0], rL[1]):
                loc = mygenome.locus(read.matchL()[0].segL[0][2])
                coord = coordH[loc.chrom]
                shift = coord[1] - 1
                outFile.write('%s\t%s\t%s\n' %
                              (coord[0], loc.chrSta + shift, loc.chrEnd + shift))
def filter_transloc(inFileName, outFileName):
    '''Copy well-aligned '(transloc)' gsnap records to outFileName.

    A record is kept when its pairRel contains '(transloc)' and every segment
    of its first match satisfies all of:
      * span - numMatch <= 2
      * percMatch >= 90
      * span >= 5

    Prints 'Results: <kept> <total>' to stdout when done.
    '''
    result = mygsnap.gsnapFile(inFileName, False)
    count_all = 0
    count_transloc = 0

    # Context manager so the output is flushed and closed on every path.
    with open(outFileName, 'w') as outFile:
        for r in result:
            count_all += 1

            if '(transloc)' not in r.pairRel:
                continue

            segObjL = r.matchL()[0].getSegInfo()
            poorSegment = any(
                segObj.span - segObj.numMatch > 2
                or segObj.percMatch < 90
                or segObj.span < 5
                for segObj in segObjL)
            if poorSegment:
                continue

            outFile.write(r.rawText() + '\n')
            count_transloc += 1

    print('Results: %s %s' % (count_transloc, count_all))
def filter_transloc(inFileName, outFileName):
    '''Keep only '(transloc)' records whose segments all align cleanly.

    For each record whose pairRel contains '(transloc)', the segments of the
    first match are inspected; the record is dropped as soon as one segment
    has more than 2 unmatched positions (span - numMatch), percMatch below
    90, or a span shorter than 5.  Surviving records are written verbatim
    (rawText) to outFileName.

    Prints 'Results: <kept> <total>' to stdout when done.
    '''
    def _is_poor(segObj):
        # Thresholds are the ones hard-coded in the original filter.
        return (segObj.span - segObj.numMatch > 2
                or segObj.percMatch < 90
                or segObj.span < 5)

    result = mygsnap.gsnapFile(inFileName, False)
    count_all = 0
    count_transloc = 0

    # 'with' guarantees the output handle is released.
    with open(outFileName, 'w') as outFile:
        for r in result:
            count_all += 1
            if '(transloc)' not in r.pairRel:
                continue
            if any(_is_poor(segObj) for segObj in r.matchL()[0].getSegInfo()):
                continue
            outFile.write(r.rawText() + '\n')
            count_transloc += 1

    print('Results: %s %s' % (count_transloc, count_all))
def filter_strand(inFileName, outFileName):
    '''Split read pairs by the first/last character pattern of their chromosome names.

    The chromosome name of each read is the text before ':' in the locus
    string of its first segment.  A pair is written to outFileName when
      * exactly one of the two names has identical first and last characters, and
      * the names differ once first and last characters are stripped.
    All other pairs are printed to stdout.  (The names presumably carry a
    strand/orientation marker at both ends -- confirm against the reference
    used for mapping.)

    Prints '<written pairs> <all pairs>' at the end.

    Raises:
        Exception: a pair is not uniquely mapped on both reads with
            pairRel == 'unpaired'.
    '''
    result = mygsnap.gsnapFile(inFileName, True)
    count_all = 0
    count_strand = 0

    # Context manager so the output file is always closed.
    with open(outFileName, 'w') as outFile:
        for rL in result:
            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'unpaired'):
                raise Exception('expected a uniquely mapped, unpaired read pair')

            chrom0 = rL[0].matchL()[0].segL[0][2].split(':')[0]
            chrom1 = rL[1].matchL()[0].segL[0][2].split(':')[0]

            sameEnds0 = chrom0[0] == chrom0[-1]
            sameEnds1 = chrom1[0] == chrom1[-1]

            # '!=' on two booleans is the XOR the original spelled out.
            if sameEnds0 != sameEnds1 and chrom0[1:-1] != chrom1[1:-1]:
                for read in (rL[0], rL[1]):
                    outFile.write(read.rawText() + '\n')
                count_strand += 1
            else:
                for read in (rL[0], rL[1]):
                    print(read.rawText())

            count_all += 1

    print('%s %s' % (count_strand, count_all))
def process_bp(inFileName, outFileName, regionL):
    '''Write a browser track of concordant read pairs overlapping regionL.

    Every record of the gsnap result is used as a pair (rL[0], rL[1]).  The
    first-segment locus of each read is built with mygenome.locus.  When at
    least one of the two loci overlaps a region in regionL, the pattern
    '^<read1>.*<rc(read2)>$' is written to stdout and both loci are written
    to outFileName as tab-separated chrom/start/end lines.

    Raises:
        Exception: a pair is not uniquely mapped on both reads with
            pairRel == 'concordant'.
    '''
    import sys  # local import; used to write the pattern without an extra newline

    result = mygsnap.gsnapFile(inFileName, True)

    # 'with' closes the track file even if a record raises.
    with open(outFileName, 'w') as outFile:
        outFile.write('browser full knownGene\n')
        outFile.write('track name="%s" visibility=2\n' % inFileName)

        for rL in result:
            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'concordant'):
                raise Exception('expected a uniquely mapped, concordant read pair')

            locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]),
                    mygenome.locus(rL[1].matchL()[0].segL[0][2])]

            if not any(loc.overlap(region) > 0
                       for loc in locL for region in regionL):
                continue

            # Same bytes as the former "print '...\n' % (...)," statement.
            sys.stdout.write('^%s.*%s$\n' % (rL[0].seq(), mybasic.rc(rL[1].seq())))

            for loc in locL:
                outFile.write('%s\t%s\t%s\n' % (loc.chrom, loc.chrSta, loc.chrEnd))
def filter_strand(inFileName, outFileName):
    '''Write read pairs whose chromosome names show a first/last-character mismatch pattern.

    For each pair, the chromosome name is the part before ':' of the
    first-segment locus string of each read.  The pair goes to outFileName
    when one name starts and ends with the same character while the other
    does not, and the names without their first and last characters differ;
    otherwise both reads are printed to stdout.

    Prints '<written pairs> <all pairs>' at the end.

    Raises:
        Exception: a pair is not uniquely mapped on both reads with
            pairRel == 'unpaired'.
    '''
    def _chrom_name(read):
        # Text before ':' in the locus string of the read's first segment.
        return read.matchL()[0].segL[0][2].split(':')[0]

    result = mygsnap.gsnapFile(inFileName, True)
    count_all = 0
    count_strand = 0

    # Context manager so the output file is closed on every path.
    with open(outFileName, 'w') as outFile:
        for rL in result:
            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'unpaired'):
                raise Exception('expected a uniquely mapped, unpaired read pair')

            chrom0 = _chrom_name(rL[0])
            chrom1 = _chrom_name(rL[1])

            # Exactly one name has matching first/last characters (XOR).
            oneSided = (chrom0[0] == chrom0[-1]) != (chrom1[0] == chrom1[-1])

            if oneSided and chrom0[1:-1] != chrom1[1:-1]:
                outFile.write(rL[0].rawText() + '\n')
                outFile.write(rL[1].rawText() + '\n')
                count_strand += 1
            else:
                print(rL[0].rawText())
                print(rL[1].rawText())

            count_all += 1

    print('%s %s' % (count_strand, count_all))
def process_bed(inFileName, outFileName, coordH):
    '''Translate both loci of every read pair through coordH and write a track.

    coordH is indexed as coordH[chrom][0] (replacement chromosome name) and
    coordH[chrom][1] (1-based start; start - 1 is added to both locus ends).
    One tab-separated chrom/start/end line is written per read, below a
    fixed 'targeted' browser-track header.

    Raises:
        Exception: a pair is not uniquely mapped on both reads with
            pairRel == 'unpaired'.
        KeyError: a locus chromosome is missing from coordH.
    '''
    result = mygsnap.gsnapFile(inFileName, True)

    # 'with' releases the file handle that was previously never closed.
    with open(outFileName, 'w') as outFile:
        outFile.write('browser full knownGene\n')
        outFile.write('track name="targeted" visibility=2\n')

        for rL in result:
            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'unpaired'):
                raise Exception('expected a uniquely mapped, unpaired read pair')

            locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]),
                    mygenome.locus(rL[1].matchL()[0].segL[0][2])]

            for loc in locL:
                coord = coordH[loc.chrom]
                loc.chrSta += coord[1] - 1
                loc.chrEnd += coord[1] - 1
                loc.chrom = coord[0]
                outFile.write('%s\t%s\t%s\n' % (loc.chrom, loc.chrSta, loc.chrEnd))
def process_bp(inGsnapFileName, outBpFileName):
    '''Group translocation reads by junction and write one line per junction.

    Each record must be a '(transloc)' alignment with exactly two segments
    on the same strand.  The junction key is built from (chromosome, end of
    segment 1) and (chromosome, start of segment 2); for '-' strand reads
    the read is reverse-complemented and the two sides are swapped.  The
    (offset, sequence) of every read is collected per junction through
    mybasic.addHash.

    Output line per junction:
        <chr>:<pos-blockSize>-<pos>,<chr>:<pos>-<pos+blockSize>,<off:seq|off:seq...>
    where <chr> is the first junction chromosome name up to the first '_',
    <pos> is the first junction position, reads are sorted by offset and
    each offset is rewritten as blockSize - offset + 1.  blockSize is a
    module-level name defined outside this function.

    NOTE(review): only the first junction side (j1) is used for both
    intervals; the second side is ignored -- confirm this is intended.

    Raises:
        Exception: record is not '(transloc)', does not have two segments,
            or its segments are on different strands.
    '''
    result = mygsnap.gsnapFile(inGsnapFileName, False)
    seqH = {}

    # Opened up front (as before) but now reliably closed.
    with open(outBpFileName, 'w') as outBpFile:
        for r in result:
            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a (transloc) alignment')
            if len(match.segL) != 2:
                raise Exception('expected exactly two segments')

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]
            if s1[0] != s2[0]:
                raise Exception('segments are on different strands')

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            # Read coordinate at which the first segment ends.
            readEnd = int(match.segL[0][1].split('..')[1])

            if s1[0] == '+':
                seq = r.seq()
                offset = readEnd
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - readEnd
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((j1, _j2), vL) in seqH.items():
            vL.sort(key=lambda v: v[0])  # ascending offset, stable like the cmp sort
            vL_mod = ['%s:%s' % (blockSize - offset + 1, seq) for (offset, seq) in vL]

            chrom = j1[0].split('_')[0]
            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' %
                            (chrom, int(j1[1]) - blockSize, j1[1],
                             chrom, j1[1], int(j1[1]) + blockSize,
                             '|'.join(vL_mod)))
def process_bp(inGsnapFileName):
    '''Print translocation reads grouped by junction, aligned at the breakpoint.

    Each record must be a '(transloc)' alignment with exactly two segments.
    The 'dir:' tag of the first segment decides orientation: for 'sense'
    the read is used as is, otherwise it is reverse-complemented and the two
    breakpoints are swapped.  Reads are collected per breakpoint pair through
    mybasic.addHash.

    Junctions are printed from most to fewest reads.  Within a junction the
    reads are sorted by descending offset and left-padded so the breakpoint
    (shown as a single space inside the read) lines up in one column.

    Raises:
        Exception: record is not '(transloc)' or does not have two segments.
    '''
    result = mygsnap.gsnapFile(inGsnapFileName, False)
    seqH = {}

    for r in result:
        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception('record is not a (transloc) alignment')
        if len(match.segL) != 2:
            raise Exception('expected exactly two segments')

        s1 = match.segL[0][2]
        s2 = match.segL[1][2]
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)

        bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
        bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

        # Read coordinate at which the first segment ends.
        readEnd = int(match.segL[0][1].split('..')[1])

        if direction == 'sense':
            seq = r.seq()
            offset = readEnd
            bp12 = (bp1, bp2)
        else:
            seq = mybasic.rc(r.seq(), 'DNA')
            offset = len(seq) - readEnd
            bp12 = (bp2, bp1)

        mybasic.addHash(seqH, bp12, (offset, seq))

    # Key-based sorts replace the cmp lambdas; reverse=True keeps ties stable.
    seqL = sorted(seqH.items(), key=lambda item: len(item[1]), reverse=True)

    for ((bp1, bp2), vL) in seqL:
        vL.sort(key=lambda v: v[0], reverse=True)
        maxOffset = vL[0][0]

        # Byte-for-byte the output of: print '\n', bp1, bp2, len(vL), '\n'
        print('\n%s %s %s \n' % (bp1, bp2, len(vL)))

        for (offset, seq) in vL:
            print('%s%s %s' % (' ' * (maxOffset - offset), seq[:offset], seq[offset:]))
def main(inGsnapFileName, outReportFileName, sampN, geneNL=None, overlap=10):
    '''Count uniquely mapped read segments that span annotated positions.

    loadAnnot(geneNL) supplies three per-chromosome structures:
      * eiH[chrom][pos]      -- counter that is incremented here
      * ei_keyH[chrom]       -- iterable of positions; the early 'break'
                                below assumes ascending order
      * juncInfoH[chrom][pos] -- strings joined with ',' in the report

    A segment covering [start-1, end] counts for a position when the
    position lies at least `overlap` bases inside both ends.

    Args:
        inGsnapFileName: gsnap result file to read.
        outReportFileName: tab-separated report (sample, chrom:pos, info, count).
        sampN: sample name written in the first column.
        geneNL: gene-name list handed to loadAnnot; None means an empty list.
        overlap: minimum number of bases required on each side of a position.

    Raises:
        KeyError: a segment maps to a chromosome absent from the annotation.
    '''
    if geneNL is None:
        # Avoids sharing one mutable default list across calls.
        geneNL = []

    eiH, ei_keyH, juncInfoH = loadAnnot(geneNL)
    print('Finished loading refFlat')

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    for r in result:
        if r.nLoci != 1:
            continue

        for seg in r.matchL()[0].segL:
            rm = re.match('([+-])([^:]+):([0-9,]+)..([0-9,]+)', seg[2])
            chrom = rm.group(2)
            chrPosL = [int(rm.group(3)), int(rm.group(4))]
            lower = min(chrPosL) - 1 + overlap
            upper = max(chrPosL) - overlap

            for pos in ei_keyH[chrom]:
                if lower <= pos <= upper:
                    eiH[chrom][pos] += 1
                elif upper < pos:
                    break  # positions beyond the window cannot match

    # Context manager so the report is flushed and closed.
    with open(outReportFileName, 'w') as outReportFile:
        for chrom in ei_keyH:
            for e in ei_keyH[chrom]:
                if eiH[chrom][e] == []:
                    continue
                outReportFile.write('%s\t%s\t%s\t%s\n' %
                                    (sampN, '%s:%s' % (chrom, e),
                                     ','.join(juncInfoH[chrom][e]), eiH[chrom][e]))
def process_bp(inGsnapFileName, outBpFileName):
    '''Group translocation reads by junction and write one line per junction.

    Each record must be a '(transloc)' alignment with exactly two segments
    on the same strand.  The junction key is ((chrom, end of segment 1),
    (chrom, start of segment 2)); for '-' strand reads the read is
    reverse-complemented and the two sides are swapped.

    Output line per junction:
        <chrom1:pos1>,<chrom2:pos2>,<offset:seq|offset:seq|...>
    with reads sorted by descending offset.

    The previous version called re.match() without arguments and carried
    unfinished placeholder assignments in the output loop, which raised
    TypeError on the first junction; those stubs are removed.

    Raises:
        Exception: record is not '(transloc)', does not have two segments,
            or its segments are on different strands.
    '''
    result = mygsnap.gsnapFile(inGsnapFileName, False)
    seqH = {}

    # Opened up front (as before) but now reliably closed.
    with open(outBpFileName, 'w') as outBpFile:
        for r in result:
            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a (transloc) alignment')
            if len(match.segL) != 2:
                raise Exception('expected exactly two segments')

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]
            if s1[0] != s2[0]:
                raise Exception('segments are on different strands')

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            # Read coordinate at which the first segment ends.
            readEnd = int(match.segL[0][1].split('..')[1])

            if s1[0] == '+':
                seq = r.seq()
                offset = readEnd
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - readEnd
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((k1, k2), v) in seqH.items():
            v.sort(key=lambda item: item[0], reverse=True)  # descending offset
            readText = '|'.join(['%s:%s' % (offset, seq) for (offset, seq) in v])
            outBpFile.write('%s,%s,%s\n' % (':'.join(k1), ':'.join(k2), readText))
def process_bp(inGsnapFileName):
    '''Print translocation reads per junction with the breakpoint column aligned.

    Records must be '(transloc)' alignments with exactly two segments.  The
    'dir:' tag of the first segment selects the orientation: 'sense' reads
    are kept as they are; any other value reverse-complements the read and
    swaps the two breakpoints.  (offset, sequence) tuples are gathered per
    breakpoint pair with mybasic.addHash.

    Output (stdout): junctions ordered by decreasing read count; for each,
    a header line with both breakpoints and the read count, followed by the
    reads in decreasing offset order, padded so that the space marking the
    breakpoint is in the same column for every read.

    Raises:
        Exception: record is not '(transloc)' or does not have two segments.
    '''
    def _read_count(item):
        # item is a (junction key, list of (offset, seq)) pair.
        return len(item[1])

    def _offset(entry):
        return entry[0]

    result = mygsnap.gsnapFile(inGsnapFileName, False)
    seqH = {}

    for r in result:
        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception('record is not a (transloc) alignment')
        if len(match.segL) != 2:
            raise Exception('expected exactly two segments')

        s1 = match.segL[0][2]
        s2 = match.segL[1][2]
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)

        bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
        bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

        # Read coordinate at which the first segment ends.
        readEnd = int(match.segL[0][1].split('..')[1])

        if direction == 'sense':
            seq = r.seq()
            offset = readEnd
            bp12 = (bp1, bp2)
        else:
            seq = mybasic.rc(r.seq(), 'DNA')
            offset = len(seq) - readEnd
            bp12 = (bp2, bp1)

        mybasic.addHash(seqH, bp12, (offset, seq))

    # Key-based sorting; reverse=True preserves the order of ties, like the
    # cmp(y, x) lambdas it replaces.
    for ((bp1, bp2), vL) in sorted(seqH.items(), key=_read_count, reverse=True):
        vL.sort(key=_offset, reverse=True)
        maxOffset = vL[0][0]

        # Byte-for-byte the output of: print '\n', bp1, bp2, len(vL), '\n'
        print('\n%s %s %s \n' % (bp1, bp2, len(vL)))

        for (offset, seq) in vL:
            print('%s%s %s' % (' ' * (maxOffset - offset), seq[:offset], seq[offset:]))
def process_bp(inGsnapFileName, outBpFileName):
    '''Write, per junction, the flanking intervals and the supporting reads.

    Each record must be a '(transloc)' alignment with exactly two segments
    on the same strand.  Junction keys are ((chrom, end of segment 1),
    (chrom, start of segment 2)); '-' strand reads are reverse-complemented
    and their two sides swapped.  Reads are grouped per junction with
    mybasic.addHash.

    Output line per junction:
        <chr>:<pos-blockSize>-<pos>,<chr>:<pos>-<pos+blockSize>,<off:seq|...>
    <chr> is the first junction chromosome name cut at the first '_', <pos>
    the first junction position; reads are ordered by ascending offset and
    every offset is rewritten as blockSize - offset + 1.  blockSize is a
    module-level name defined outside this function.

    NOTE(review): both intervals are derived from the first junction side
    only; the second side is unused -- confirm this is intended.

    Raises:
        Exception: record is not '(transloc)', does not have two segments,
            or its segments are on different strands.
    '''
    result = mygsnap.gsnapFile(inGsnapFileName, False)
    seqH = {}

    # The handle used to be left open; 'with' closes it on every path.
    with open(outBpFileName, 'w') as outBpFile:
        for r in result:
            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a (transloc) alignment')
            if len(match.segL) != 2:
                raise Exception('expected exactly two segments')

            (seg1, seg2) = match.segL
            s1 = seg1[2]
            s2 = seg2[2]
            if s1[0] != s2[0]:
                raise Exception('segments are on different strands')

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            # Read coordinate at which the first segment ends.
            readEnd = int(seg1[1].split('..')[1])

            if s1[0] == '+':
                seq = r.seq()
                offset = readEnd
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - readEnd
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for (junction, vL) in seqH.items():
            j1 = junction[0]
            vL.sort(key=lambda v: v[0])  # ascending offset

            vL_mod = []
            for (offset, seq) in vL:
                vL_mod.append('%s:%s' % (blockSize - offset + 1, seq))

            chrom = j1[0].split('_')[0]
            pos = j1[1]
            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' %
                            (chrom, int(pos) - blockSize, pos,
                             chrom, pos, int(pos) + blockSize,
                             '|'.join(vL_mod)))
def main(inGsnapFileName, outReportFileName, sampN, geneNL=None, overlap=10):
    '''Tally read segments spanning annotated positions and write a report.

    loadAnnot(geneNL) returns (eiH, ei_keyH, juncInfoH), all keyed by
    chromosome: ei_keyH[chrom] lists positions (scanned in order, with an
    early exit that assumes ascending order), eiH[chrom][pos] is the counter
    updated here and juncInfoH[chrom][pos] holds strings for the report.

    Only reads with nLoci == 1 are used.  For every segment of the first
    match, each position p with
        start - 1 + overlap <= p <= end - overlap
    is counted once.

    Args:
        inGsnapFileName: gsnap result file to read.
        outReportFileName: tab-separated report (sample, chrom:pos, info, count).
        sampN: sample name written in the first column.
        geneNL: gene-name list handed to loadAnnot; None means an empty list.
        overlap: minimum number of bases required on each side of a position.

    Raises:
        KeyError: a segment maps to a chromosome absent from the annotation.
    '''
    # A None default replaces the shared mutable [] default.
    geneNL = [] if geneNL is None else geneNL

    eiH, ei_keyH, juncInfoH = loadAnnot(geneNL)
    print('Finished loading refFlat')

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    for r in result:
        if r.nLoci != 1:
            continue

        match = r.matchL()[0]

        for seg in match.segL:
            rm = re.match('([+-])([^:]+):([0-9,]+)..([0-9,]+)', seg[2])
            chrom = rm.group(2)
            posA = int(rm.group(3))
            posB = int(rm.group(4))
            chrSta = min(posA, posB) - 1
            chrEnd = max(posA, posB)

            counterH = eiH[chrom]
            for pos in ei_keyH[chrom]:
                if chrSta + overlap <= pos <= chrEnd - overlap:
                    counterH[pos] += 1
                elif chrEnd - overlap < pos:
                    break  # past the window; later positions cannot match

    # 'with' so the report file is always closed.
    with open(outReportFileName, 'w') as outReportFile:
        for chrom in ei_keyH:
            for e in ei_keyH[chrom]:
                if eiH[chrom][e] == []:
                    continue
                outReportFile.write('%s\t%s\t%s\t%s\n' %
                                    (sampN, '%s:%s' % (chrom, e),
                                     ','.join(juncInfoH[chrom][e]), eiH[chrom][e]))
def main(inGsnapFileName, outReportFileName, sampN, geneNL=None, overlap=10):
    '''Tally read segments spanning annotated positions, using binary search.

    loadAnnot(geneNL) returns (eiH, ei_keyH, juncInfoH, ei_cntH), all keyed
    by chromosome.  ei_keyH[chrom] is searched with bisect and therefore
    must be sorted ascending; eiH[chrom][pos] is the counter updated here.
    findCut(ei_cntH[chrom], ei_keyH[chrom], x) is evaluated just below and at
    the top of the window; equal results are taken to mean that no position
    lies inside the window (findCut is defined elsewhere -- semantics not
    visible here).

    For a segment locus, the window is
        [chrSta + overlap, chrEnd - overlap]
    and every annotated position inside it is counted once.

    Fix: the lower index used to be bisect_right(...) - 1, which is -1 when
    no position precedes the window; range(-1, n) then also visited the
    last list element through negative indexing and could count it twice.
    The scan now starts at the first position inside the window.

    Args:
        inGsnapFileName: gsnap result file to read.
        outReportFileName: tab-separated report (sample, chrom:pos, info, count).
        sampN: sample name written in the first column.
        geneNL: gene-name list handed to loadAnnot; None means an empty list.
        overlap: minimum number of bases required on each side of a position.
    '''
    if geneNL is None:
        # Avoids sharing one mutable default list across calls.
        geneNL = []

    eiH, ei_keyH, juncInfoH, ei_cntH = loadAnnot(geneNL)
    print('Finished loading refFlat')

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    for r in result:
        if r.nLoci != 1:
            continue

        for seg in r.matchL()[0].segL:
            loc = mygenome.locus(seg[2])
            lower = loc.chrSta + overlap
            upper = loc.chrEnd - overlap

            if lower > upper:
                continue  # segment too short to leave a window

            keyL = ei_keyH[loc.chrom]
            cnt_s = findCut(ei_cntH[loc.chrom], keyL, lower - 1)
            cnt_e = findCut(ei_cntH[loc.chrom], keyL, upper)

            if cnt_e < 1 or cnt_s == cnt_e:
                continue  # no annotated position inside the window

            # First index with key > lower - 1, one past last with key <= upper.
            pos_min = bisect.bisect_right(keyL, lower - 1)
            pos_max = bisect.bisect_right(keyL, upper)

            for key in keyL[pos_min:pos_max]:
                if lower <= key <= upper:
                    eiH[loc.chrom][key] += 1

    # Context manager so the report is flushed and closed.
    with open(outReportFileName, 'w') as outReportFile:
        for chrom in ei_keyH:
            for e in ei_keyH[chrom]:
                if eiH[chrom][e] == []:
                    continue
                outReportFile.write('%s\t%s\t%s\t%s\n' %
                                    (sampN, '%s:%s' % (chrom, e),
                                     ','.join(juncInfoH[chrom][e]), eiH[chrom][e]))
def filter_annot1(inFileName, outFileName):
    '''Keep '(transloc)' records whose segments share no annotated transcript.

    For each segment of the first match, the 'label_1:'/'label_2:' tag is
    parsed into a set of names (each '|'-separated entry cut at '.exon');
    a segment without such a tag contributes an empty set.  A record is
    written to outFileName when the intersection of all segment sets is
    empty -- which includes every record with an unlabelled segment.

    Prints 'Results: <kept> <total>' at the end.

    Raises:
        Exception: a record's pairRel does not contain '(transloc)'.
    '''
    result = mygsnap.gsnapFile(inFileName, False)
    count_all = 0
    count_include = 0

    # 'with' closes the output file, which was previously left open.
    with open(outFileName, 'w') as outFile:
        for r in result:
            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a (transloc) alignment')

            geneSetL = []
            for seg in r.matchL()[0].segL:
                rm = re.search('label_[12]:([^,\t]*)', seg[3])
                if rm:
                    geneSetL.append(set(x.split('.exon')[0]
                                        for x in rm.group(1).split('|')))
                else:
                    geneSetL.append(set())

            geneSetCommon = geneSetL[0].intersection(*geneSetL[1:])

            if not geneSetCommon:
                outFile.write(r.rawText() + '\n')
                count_include += 1

            count_all += 1

    print('Results: %s %s' % (count_include, count_all))
def filter_annot2(inFileName, outFileName):
    '''Keep '(transloc)' records whose segments are all labelled yet share no transcript.

    For each segment of the first match, the 'label_1:'/'label_2:' tag is
    parsed into a set of names (each '|'-separated entry cut at '.exon').
    A record is written to outFileName only when every segment carries such
    a tag and the intersection of the sets is empty; all other records are
    printed to stdout.

    Prints '<kept> <total>' at the end.

    Raises:
        Exception: a record's pairRel does not contain "(transloc)".
    '''
    result = mygsnap.gsnapFile(inFileName, False)
    count_all = 0
    count_include = 0

    # Context manager so the output file is always closed.
    with open(outFileName, "w") as outFile:
        for r in result:
            if "(transloc)" not in r.pairRel:
                raise Exception("record is not a (transloc) alignment")

            geneSetL = []
            isThereEmptySet = False
            for seg in r.matchL()[0].segL:
                rm = re.search("label_[12]:([^,\t]*)", seg[3])
                if rm:
                    geneSetL.append(set(x.split(".exon")[0]
                                        for x in rm.group(1).split("|")))
                else:
                    geneSetL.append(set())
                    isThereEmptySet = True

            geneSetCommon = geneSetL[0].intersection(*geneSetL[1:])

            if not geneSetCommon and not isThereEmptySet:
                outFile.write(r.rawText() + "\n")
                count_include += 1
            else:
                print(r.rawText())

            count_all += 1

    print("%s %s" % (count_include, count_all))
def filter_crossMap(inFileName, outFileName):
    '''Copy read pairs that are uniquely mapped on both reads but 'unpaired'.

    Every record of the gsnap result is used as a pair (rL[0], rL[1]).  A
    pair is written (rawText of both reads) to outFileName when both reads
    have nLoci == 1 and the first read's pairRel is 'unpaired'.

    Prints '<written pairs> <all pairs>' at the end.
    '''
    result = mygsnap.gsnapFile(inFileName, True)
    count_all = 0
    count_crossMap = 0

    # 'with' releases the output handle that was previously never closed.
    with open(outFileName, 'w') as outFile:
        for rL in result:
            count_all += 1

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'unpaired'):
                continue

            for read in (rL[0], rL[1]):
                outFile.write(read.rawText() + '\n')
            count_crossMap += 1

    print('%s %s' % (count_crossMap, count_all))
def main(inGsnapFileName, outReportFileName, sampN, geneNL=None, overlap=10):
    '''Tally read segments spanning annotated positions via a database lookup.

    loadAnnot(geneNL) returns (eiH, ei_keyH, juncInfoH), all keyed by
    chromosome; eiH[chrom][pos] is the counter updated here.  For each
    segment of a uniquely mapped read, temp_table is queried through the
    module-level `cursor` for positions inside
        [chrSta + overlap, chrEnd - overlap]
    and the counter of every returned position is incremented.

    Fix: the query used to select the constant 1 and the code then
    incremented eiH[chrom][pos] with `pos` never assigned (NameError on the
    first hit).  The query now selects `pos` and each returned row is
    counted.

    NOTE(review): the statement is still built by string formatting because
    the parameter style of `cursor` is not visible here; switch to bound
    parameters once the driver is known.  Positions returned by the query
    are assumed to be keys of eiH[chrom] -- confirm how temp_table is filled.

    A progress counter is printed every 10000 processed reads.

    Args:
        inGsnapFileName: gsnap result file to read.
        outReportFileName: tab-separated report (sample, chrom:pos, info, count).
        sampN: sample name written in the first column.
        geneNL: gene-name list handed to loadAnnot; None means an empty list.
        overlap: minimum number of bases required on each side of a position.
    '''
    if geneNL is None:
        # Avoids sharing one mutable default list across calls.
        geneNL = []

    eiH, ei_keyH, juncInfoH = loadAnnot(geneNL)
    print('Finished loading refFlat')

    result = mygsnap.gsnapFile(inGsnapFileName, False)
    count = 0

    for r in result:
        if r.nLoci != 1:
            continue

        for seg in r.matchL()[0].segL:
            loc = mygenome.locus(seg[2])

            cursor.execute(
                'select pos from temp_table where chrom="%s" and pos>=%s and pos<=%s'
                % (loc.chrom, loc.chrSta + overlap, loc.chrEnd - overlap))

            for row in cursor.fetchall():
                eiH[loc.chrom][row[0]] += 1

        count += 1
        if count % 10000 == 0:
            print(count)

    # Context manager so the report is flushed and closed.
    with open(outReportFileName, 'w') as outReportFile:
        for chrom in ei_keyH:
            for e in ei_keyH[chrom]:
                if eiH[chrom][e] == []:
                    continue
                outReportFile.write('%s\t%s\t%s\t%s\n' %
                                    (sampN, '%s:%s' % (chrom, e),
                                     ','.join(juncInfoH[chrom][e]), eiH[chrom][e]))
def filter_crossMap(inFileName, outFileName):
    '''Write read pairs with unique hits on both reads and pairRel 'unpaired'.

    The gsnap result yields pairs (rL[0], rL[1]).  When both reads have
    nLoci == 1 and rL[0].pairRel == 'unpaired', the raw text of both reads
    is appended to outFileName.

    Prints '<written pairs> <all pairs>' at the end.
    '''
    result = mygsnap.gsnapFile(inFileName, True)
    count_all = 0
    count_crossMap = 0

    # Context manager so the output file is closed on every path.
    with open(outFileName, 'w') as outFile:
        for rL in result:
            uniqueBoth = rL[0].nLoci == 1 and rL[1].nLoci == 1

            if uniqueBoth and rL[0].pairRel == 'unpaired':
                outFile.write(rL[0].rawText() + '\n')
                outFile.write(rL[1].rawText() + '\n')
                count_crossMap += 1

            count_all += 1

    print('%s %s' % (count_crossMap, count_all))
def bp_filter(inFileName, outFileName, expSize):
    '''Keep uniquely mapped, unspliced, fully matching pairs with a 1000..10000 pair distance.

    A pair (rL[0], rL[1]) is kept when both reads have nLoci == 1 and, for
    the first match of each read:
      * the match has a single segment,
      * 1000 <= match.pairInfo()[0] <= 10000,
      * the first segment has len == numMatch.
    Kept reads are echoed to stdout (raw text, then '<len> <numMatch>') and
    written to outFileName.

    NOTE(review): expSize is not used by this version; the bounds are
    hard-coded.  The parameter is kept for call compatibility.

    Prints '<kept pairs> <all pairs>' at the end.
    '''
    result = mygsnap.gsnapFile(inFileName, True)
    count_all = 0
    count_include = 0

    # 'with' closes the output file, which was previously left open.
    with open(outFileName, 'w') as outFile:
        for rL in result:
            count_all += 1

            if rL[0].nLoci != 1 or rL[1].nLoci != 1:
                continue

            skip = False
            for read in (rL[0], rL[1]):
                match = read.matchL()[0]
                seg = match.getSegInfo()[0]
                if (len(match.segL) > 1
                        or match.pairInfo()[0] < 1000
                        or match.pairInfo()[0] > 10000
                        or (seg.len - seg.numMatch) > 0):
                    skip = True
                    break
            if skip:
                continue

            for read in (rL[0], rL[1]):
                seg = read.matchL()[0].getSegInfo()[0]
                print(read.rawText())
                print('%s %s' % (seg.len, seg.numMatch))
                outFile.write(read.rawText() + '\n')

            count_include += 1

    print('%s %s' % (count_include, count_all))
def filter_annot1(inFileName, outFileName):
    '''Write '(transloc)' records whose segments have no transcript in common.

    The 'label_1:'/'label_2:' tag of every segment of the first match is
    split on '|' and each entry is cut at '.exon'; untagged segments yield
    an empty set.  Records whose segment sets have an empty intersection
    are written to outFileName (so any record with an untagged segment is
    written as well).

    Prints 'Results: <kept> <total>' at the end.

    Raises:
        Exception: a record's pairRel does not contain '(transloc)'.
    '''
    def _transcripts(seg):
        # Names from the segment's label tag; empty set when absent.
        rm = re.search('label_[12]:([^,\t]*)', seg[3])
        if not rm:
            return set()
        return set(x.split('.exon')[0] for x in rm.group(1).split('|'))

    result = mygsnap.gsnapFile(inFileName, False)
    count_all = 0
    count_include = 0

    # Context manager so the output file is always closed.
    with open(outFileName, 'w') as outFile:
        for r in result:
            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a (transloc) alignment')

            geneSetL = [_transcripts(seg) for seg in r.matchL()[0].segL]
            geneSetCommon = geneSetL[0].intersection(*geneSetL[1:])

            if len(geneSetCommon) == 0:
                outFile.write(r.rawText() + '\n')
                count_include += 1

            count_all += 1

    print('Results: %s %s' % (count_include, count_all))
def bp_filter(inFileName, outFileName, expSize):
    '''Keep clean concordant/paired reads whose pair distance differs from expSize.

    A pair (rL[0], rL[1]) is kept when both reads have nLoci == 1, the first
    read's pairRel is 'concordant' or 'paired', and for the first match of
    each read:
      * the match has a single segment,
      * match.pairInfo()[0] != int(expSize),
      * the first segment has numMismatch == 0.
    Kept reads are written (raw text) to outFileName.

    NOTE(review): the earlier docstring described the size criterion as
    'insert_length > N-nt'; the code tests for inequality with expSize.

    Prints '<kept pairs> <all pairs>' at the end.
    '''
    result = mygsnap.gsnapFile(inFileName, True)
    count_all = 0
    count_include = 0

    # 'with' closes the output file, which was previously left open.
    with open(outFileName, 'w') as outFile:
        for rL in result:
            count_all += 1

            if (rL[0].nLoci != 1 or rL[1].nLoci != 1
                    or rL[0].pairRel not in ('concordant', 'paired')):
                continue

            skip = False
            for read in (rL[0], rL[1]):
                match = read.matchL()[0]
                if (len(match.segL) > 1
                        or match.pairInfo()[0] == int(expSize)
                        or match.getSegInfo()[0].numMismatch > 0):
                    skip = True
                    break
            if skip:
                continue

            for read in (rL[0], rL[1]):
                outFile.write(read.rawText() + '\n')
            count_include += 1

    print('%s %s' % (count_include, count_all))
def bp_filter(inFileName, outFileName, expSize):
    '''Write concordant/paired read pairs that are unspliced, mismatch-free and off-size.

    Requirements for a pair (rL[0], rL[1]) to be written to outFileName:
      * nLoci == 1 for both reads,
      * rL[0].pairRel is 'concordant' or 'paired',
      * for the first match of each read: one segment only,
        pairInfo()[0] different from int(expSize), and numMismatch == 0
        on the first segment.

    NOTE(review): the earlier docstring described the size criterion as
    'insert_length > N-nt'; the code tests for inequality with expSize.

    Prints '<kept pairs> <all pairs>' at the end.
    '''
    def _rejects(read):
        # True when this read disqualifies the pair.
        match = read.matchL()[0]
        return (len(match.segL) > 1
                or match.pairInfo()[0] == int(expSize)
                or match.getSegInfo()[0].numMismatch > 0)

    result = mygsnap.gsnapFile(inFileName, True)
    count_all = 0
    count_include = 0

    # Context manager so the output file is always closed.
    with open(outFileName, 'w') as outFile:
        for rL in result:
            count_all += 1

            if rL[0].nLoci != 1 or rL[1].nLoci != 1:
                continue
            if rL[0].pairRel not in ('concordant', 'paired'):
                continue

            # any() stops at the first rejecting read, like the original break.
            if any(_rejects(read) for read in (rL[0], rL[1])):
                continue

            outFile.write(rL[0].rawText() + '\n')
            outFile.write(rL[1].rawText() + '\n')
            count_include += 1

    print('%s %s' % (count_include, count_all))
def process_bp(inFileName, outFileName, regionL):
    '''Write the gap between the two reads of region-overlapping concordant pairs.

    Every record of the gsnap result is used as a pair (rL[0], rL[1]).
    Pairs whose int(rL[0].pairInfo()[0]) is below 76 are skipped.  When the
    first-segment locus of either read overlaps a region in regionL, one
    tab-separated line is written to outFileName:
        <chrom of read 2> <end of the upstream read> <start of the downstream read>
    below a browser-track header named after inFileName.

    Raises:
        Exception: a pair is not uniquely mapped on both reads with
            pairRel == 'concordant'.
    '''
    result = mygsnap.gsnapFile(inFileName, True)

    # 'with' closes the track file even if a record raises.
    with open(outFileName, 'w') as outFile:
        outFile.write('browser full knownGene\n')
        outFile.write('track name="%s" visibility=2\n' % inFileName)

        for rL in result:
            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'concordant'):
                raise Exception('expected a uniquely mapped, concordant read pair')

            if int(rL[0].pairInfo()[0]) < 76:
                continue

            locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]),
                    mygenome.locus(rL[1].matchL()[0].segL[0][2])]

            if not any(loc.overlap(region) > 0
                       for loc in locL for region in regionL):
                continue

            # The original took the chromosome from the loop variable left
            # over after iterating locL, i.e. the second read's locus.
            chrom = locL[1].chrom

            if locL[0].chrEnd < locL[1].chrSta:
                outFile.write('%s\t%s\t%s\n' % (chrom, locL[0].chrEnd, locL[1].chrSta))
            else:
                outFile.write('%s\t%s\t%s\n' % (chrom, locL[1].chrEnd, locL[0].chrSta))
def fusion_proc_sort(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    '''Group translocation reads by stranded junction, then report and re-emit them.

    Each record must be a '(transloc)' alignment with exactly two segments.
    From the first segment's annotation the 'splice_type:' and 'dir:' tags
    are read; the read offset is the number after '..' in the segment's
    read-coordinate field.  Transcript labels ('label_1:'/'label_2:') are
    taken per segment with '|' replaced by ','.

    The transcript strand of each side is the genomic strand for
    direction 'sense' and the opposite strand for 'antisense'.  The junction
    key is ((strand, chrom, pos), (strand, chrom, pos)); for 'antisense'
    reads the two sides (and their transcript labels) are swapped.

    Junctions are ordered by decreasing number of distinct
    (direction, offset) pairs.  For each junction one tab-separated report
    line is written (splice type, sample, both junction sides, both
    transcript labels, read count, distinct sequences, distinct positions)
    and all supporting reads are written to outGsnapFileName.

    Raises:
        Exception: record is not '(transloc)', does not have two segments,
            or carries a direction other than 'sense'/'antisense'.
    '''
    # (genomic strand, direction) -> transcript strand
    transStrandH = {('+', 'sense'): '+', ('-', 'antisense'): '+',
                    ('+', 'antisense'): '-', ('-', 'sense'): '-'}

    result = mygsnap.gsnapFile(inGsnapFileName, False)
    juncHH = {}

    for r in result:
        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception('record is not a (transloc) alignment')
        if len(match.segL) != 2:
            raise Exception('expected exactly two segments')

        splice_type = re.search('splice_type:([^,\t]*)', match.segL[0][3]).group(1)
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        transcriptL = []
        for seg in match.segL:
            rm = re.search('label_[12]:([^,\t]*)', seg[3])
            transcriptL.append(rm.group(1).replace('|', ',') if rm else '')

        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', match.segL[0][2])
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', match.segL[1][2])

        trans_strand1 = transStrandH.get((bp1.group(1), direction))
        if trans_strand1 is None:
            raise Exception('unexpected strand/direction: %r' % direction)
        trans_strand2 = transStrandH.get((bp2.group(1), direction))
        if trans_strand2 is None:
            raise Exception('unexpected strand/direction: %r' % direction)

        side1 = (trans_strand1,) + bp1.groups()[1:]
        side2 = (trans_strand2,) + bp2.groups()[1:]

        if direction == 'sense':
            key = (side1, side2)
        else:
            # antisense: report the junction in transcript order
            key = (side2, side1)
            transcriptL = transcriptL[::-1]

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['pos'].append((direction, offset))
        else:
            juncHH[key] = {'match': [r], 'splice_type': splice_type,
                           'seq': [r.seq()], 'pos': [(direction, offset)],
                           'transcript': transcriptL}

    # Most distinct supporting positions first; reverse=True keeps ties stable.
    juncKH = sorted(juncHH.items(),
                    key=lambda item: len(set(item[1]['pos'])), reverse=True)

    # Nested 'with' blocks close both output files on every path.
    with open(outGsnapFileName, 'w') as outGsnapFile:
        with open(outReportFileName, 'w') as outReportFile:
            for (key, juncH) in juncKH:
                outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                    (juncH['splice_type'], sampN,
                     key[0][0] + ':'.join(key[0][1:]),
                     key[1][0] + ':'.join(key[1][1:]),
                     juncH['transcript'][0], juncH['transcript'][1],
                     len(juncH['match']), len(set(juncH['seq'])),
                     len(set(juncH['pos']))))

                for m in juncH['match']:
                    outGsnapFile.write(m.rawText() + '\n')
def gsnap_process_junction(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    '''Group translocation reads by junction, then report and re-emit them.

    Each record must be a '(transloc)' alignment with exactly two segments.
    From the first segment's annotation the 'splice_type:' and 'dir:' tags
    are read; the read offset is the number after '..' in the segment's
    read-coordinate field.  Transcript names come from the
    'label_1:'/'label_2:' tag of each segment ('|'-separated, each cut at
    '.exon'); a segment without the tag yields an empty tuple.

    The junction key is ((chrom, pos), (chrom, pos)); for direction
    'antisense' the two sides (and their transcripts) are swapped.

    Junctions are ordered by decreasing number of distinct
    (direction, offset) pairs.  For each junction one tab-separated report
    line is written (splice type, sample, both junction sides, both
    ';'-joined transcript lists, read count, distinct sequences, distinct
    positions) and all supporting reads are written to outGsnapFileName.

    Raises:
        Exception: record is not '(transloc)', does not have two segments,
            or carries a direction other than 'sense'/'antisense'.
    '''
    def _transcripts(seg):
        # Transcript names from the segment's label tag; () when absent.
        rm = re.search('label_[12]:([^,\t]*)', seg[3])
        if not rm:
            return ()
        return tuple([x.split('.exon')[0] for x in rm.group(1).split('|')])

    result = mygsnap.gsnapFile(inGsnapFileName, False)
    juncHH = {}

    for r in result:
        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception('record is not a (transloc) alignment')
        if len(match.segL) != 2:
            raise Exception('expected exactly two segments')

        splice_type = re.search('splice_type:([^,\t]*)', match.segL[0][3]).group(1)
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        transcript1 = _transcripts(match.segL[0])
        transcript2 = _transcripts(match.segL[1])

        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', match.segL[0][2])
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', match.segL[1][2])

        side1 = bp1.groups()[1:]
        # The per-side transcript strands were computed but never used; the
        # only observable effect was rejecting unknown directions, kept here.
        if direction not in ('sense', 'antisense'):
            raise Exception('unexpected direction: %r' % direction)
        side2 = bp2.groups()[1:]

        if direction == 'sense':
            key = (side1, side2)
            transcript = (transcript1, transcript2)
        else:
            key = (side2, side1)
            transcript = (transcript2, transcript1)

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['reg'].append((direction, offset))
        else:
            juncHH[key] = {'match': [r], 'splice_type': splice_type,
                           'seq': [r.seq()], 'reg': [(direction, offset)],
                           'transcript': transcript}

    # Most distinct supporting positions first; reverse=True keeps ties stable.
    juncKH = sorted(juncHH.items(),
                    key=lambda item: len(set(item[1]['reg'])), reverse=True)

    # Nested 'with' blocks close both output files on every path.
    with open(outGsnapFileName, 'w') as outGsnapFile:
        with open(outReportFileName, 'w') as outReportFile:
            for (key, juncH) in juncKH:
                outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                    (juncH['splice_type'], sampN,
                     ':'.join(key[0]), ':'.join(key[1]),
                     ';'.join(juncH['transcript'][0]),
                     ';'.join(juncH['transcript'][1]),
                     len(juncH['match']), len(set(juncH['seq'])),
                     len(set(juncH['reg']))))

                for m in juncH['match']:
                    outGsnapFile.write(m.rawText() + '\n')
def make_samse(ifileN, ofileN):
    '''Print SAM-style records for a single-end gsnap result to stdout.

    Header lines come from make_header() and are printed first.  Every read
    is then printed as a 12-column tab-separated line
    (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra):

      * r.nLoci > 1: one line with flag bit 0x4 set and '*' as CIGAR.
      * r.pairRel == '(transloc)': one line per segment, each carrying the
        part of the read/quality selected by seg.toCIGAR_trans().
      * otherwise: one line whose CIGAR column is match.toCIGAR().

    NOTE(review): neither ifileN nor ofileN is used -- the input path and
    the chrom.sizes path are hard-coded and all output goes to stdout.
    NOTE(review): in the last branch, `cigar` and `new_cigar` are assembled
    segment by segment but only `cigar2` is printed; they look like earlier
    or experimental CIGAR builders kept for comparison -- confirm before
    removing.
    '''
    headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')

    #    ofile = open(ofileN, 'w')
    for header in headerL:
        print header
    #        ofile.write('%s\n' % header)

    result = mygsnap.gsnapFile(
        '/pipeline/test_ini_gsnap2sam/S022_single.gsnap', False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/test.gsnap',False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_pair.gsnap',True)

    ## for unpaired
    for r in result:
        # Per-read defaults for the output columns.
        qname = r.rid()
        flag = 0x0
        rname = '*'
        pos = 0
        mapq = 0
        cigar = ''
        rnext = '*'  ## assume --npath=1 (maximum 1 alignment per read)
        pnext = 0  ## assume --npath=1 (maximum 1 alignment per read)
        tlen = 0  ## assume --npath=1 (maximum 1 alignment per read)
        seq = r.seq()
        qual = r.qual()
        extra = 'NH:i:1\tHI:i:1'  ## assume --npath=1 (maximum 1 alignment per read)

        if r.nLoci > 1:
            # Reads with several loci are emitted with flag bit 0x4
            # (unmapped in SAM) and no alignment details.
            flag = flag | 0x4
            cigar = '*'
            new_cigar = '*'
            extra = ''
            print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                  (qname, flag, rname, pos, mapq, new_cigar, rnext, pnext,
                   tlen, seq, qual, extra))
        else:
            if r.pairRel == '(transloc)':
                # One output line per segment of the translocation alignment.
                match = r.matchL()[0]
                segL = match.getSegInfo()
                mapq = segL[0].mapq
                seq = r.seq()
                qual = r.qual()

                for seg in segL:
                    flag = 0x0
                    (strand, rname, pos1, pos2) = re.search(
                        '([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                        seg.seg[2]).groups()
                    pos = min(int(pos1), int(pos2))
                    (cigar, clip) = seg.toCIGAR_trans()

                    # The sign of clip selects which side of the read this
                    # segment keeps.
                    if clip < 0:  ## first half
                        seq2 = seq[:clip]
                        qual2 = qual[:clip]
                    else:  ## second half
                        seq2 = seq[clip:]
                        qual2 = qual[clip:]

                    if strand == '-':
                        # Flag bit 0x10 (reverse strand in SAM); sequence and
                        # qualities are turned around to match.
                        flag = flag | 0x10
                        seq2 = mybasic.rc(seq2)
                        qual2 = mybasic.rev(qual2)

                    #                    print qname,seg.toCIGAR_trans()
                    print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                          (qname, flag, rname, pos, mapq, cigar, rnext, pnext,
                           tlen, seq2, qual2, extra))
            else:
                match = r.matchL()[0]  ## assume --npath=1 (maximum 1 alignment per read)
                segL = match.getSegInfo()
                (strand, rname, pos1, pos2) = re.search(
                    '([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                    segL[0].seg[2]).groups()
                pos = min(int(pos1), int(pos2))
                mapq = segL[0].mapq
                seg_nm = segL[0].numSub  # running substitution total for NM:i
                cigar2 = match.toCIGAR()  # the CIGAR that is actually printed

                #                print qname, match.toCIGAR()

                # --- hand-built CIGAR (`cigar`), first segment ---
                # Leading soft clip, then the M block (and optional I) of
                # the first segment; '-' strand pieces are prepended.
                if segL[0].start != '' and segL[0].start != '0':
                    cigar = str(segL[0].start) + 'S'

                if strand == '-':
                    cigar = str(segL[0].numMatch + segL[0].numSub) + 'M' + cigar
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = str(segL[0].ins) + 'I' + cigar
                else:
                    cigar = cigar + str(segL[0].numMatch + segL[0].numSub) + 'M'
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = cigar + str(segL[0].ins) + 'I'

                # --- per-segment CIGAR (`new_cigar`), first segment ---
                if len(segL) == 1:
                    new_cigar = segL[0].toCIGAR(True)
                else:
                    new_cigar = segL[0].toCIGAR()

                prev_cigar = new_cigar
                index = 0

                for seg in segL[1:]:
                    index = index + 1

                    # `final` marks the last segment for seg.toCIGAR().
                    if index == (len(segL) - 1):
                        final = True
                    else:
                        final = False

                    rm = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                                   seg.seg[2]).groups()
                    # NOTE: `match` is rebound from the match object to a
                    # CIGAR fragment string from here on.
                    match = str(seg.numMatch + seg.numSub) + 'M'

                    if seg.ins != '' and seg.ins != '0':
                        ins = str(seg.ins) + 'I'
                    else:
                        ins = ''

                    # Track the smallest coordinate over all segments.
                    if pos == 0 or pos > min(int(rm[2]), int(rm[3])):
                        pos = min(int(rm[2]), int(rm[3]))

                    # Gap between the previous segment's end (pos2) and this
                    # segment's start becomes an 'N' operation when positive.
                    if strand == '-':
                        dist = int(pos2) - int(rm[2]) - 1
                        if dist > 0:
                            cigar = match + ins + str(dist) + 'N' + cigar
                        else:
                            cigar = match + ins + cigar
                    else:
                        dist = int(rm[2]) - int(pos2) - 1
                        if dist > 0:
                            cigar = cigar + str(dist) + 'N' + match + ins
                        else:
                            cigar = cigar + match + ins

                    seg_nm = seg_nm + seg.numSub
                    pos1 = rm[2]
                    pos2 = rm[3]

                    cur_cigar = seg.toCIGAR(final)

                    # An 'N' is inserted only when the previous segment's
                    # CIGAR contains neither 'D' nor 'I'.
                    if strand == '-':
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
                        else:
                            new_cigar = cur_cigar + new_cigar
                    else:
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
                        else:
                            new_cigar = new_cigar + cur_cigar

                    prev_cigar = cur_cigar

                if segL[-1].end != '' and segL[-1].end != '0':  ## last segment
                    # Trailing soft clip of the hand-built CIGAR.
                    if strand == '-':
                        cigar = str(segL[-1].end) + 'S' + cigar
                    else:
                        cigar = cigar + str(segL[-1].end) + 'S'

                extra = extra + ('\tNM:i:%s' % seg_nm)

                if strand == '-':
                    flag = flag | 0x10
                    seq = mybasic.rc(seq)
                    qual = mybasic.rev(qual)

                ##                print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra))
                print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                      (qname, flag, rname, pos, mapq, cigar2, rnext, pnext,
                       tlen, seq, qual, extra))
def exonSkip_filter(inFileName, outFileName):
    '''Filter in exon-skipping candidates from splice-mapped gsnap records.

    Only reads with nLoci == 1 and a two-segment first match are considered
    (and counted in the total).  A read is rejected when any segment has
    span - numMatch > 2, percMatch < 90 or span < 5.  Segment labels
    ('|'-separated entries of the form '<transcript>.exon<N>/<total>') are
    collected into transcript -> exon numbers through mybasic.addHash;
    collection stops at the first segment with an empty label.

    A read is written to outFileName when at least one transcript has more
    than one exon number and the smallest exon-number distance among
    transcripts with exactly two exon numbers (capped at 100) is greater
    than 1, i.e. no transcript joins two adjacent exons.

    Prints 'Results: <kept> <considered>' at the end.
    '''
    result = mygsnap.gsnapFile(inFileName, False)
    count_all = 0
    count_include = 0

    # Context manager so the output file is always closed.
    with open(outFileName, 'w') as outFile:
        for r in result:
            if r.nLoci != 1:
                continue

            match = r.matchL()[0]
            if len(match.segL) != 2:
                continue

            jncH = {}
            skip = False

            for segObj in match.getSegInfo():
                if (segObj.span - segObj.numMatch > 2
                        or segObj.percMatch < 90
                        or segObj.span < 5):
                    skip = True
                    break

                if segObj.label == '':
                    break  # unlabelled segment: stop collecting, do not reject

                for b in segObj.label.split('|'):
                    rm2 = re.match(r'(.*)\.exon([0-9]+)\/[0-9]+', b)
                    mybasic.addHash(jncH, rm2.group(1), int(rm2.group(2)))

            if skip:
                continue

            exonLL = list(jncH.values())

            if exonLL and max(len(exonL) for exonL in exonLL) > 1:
                minDist = 100
                for exonL in exonLL:
                    if len(exonL) == 2:
                        minDist = min(minDist, abs(exonL[0] - exonL[1]))

                if minDist > 1:
                    outFile.write(r.rawText() + '\n')
                    count_include += 1

            count_all += 1

    print('Results: %s %s' % (count_include, count_all))
def exonSkip_proc(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    '''
    Group two-segment GSNAP reads by junction and write an annotated report.

    inGsnapFileName   -- single-end GSNAP result file; every record must be
                         flagged '(transloc)' and have exactly two segments
    outGsnapFileName  -- receives the raw GSNAP text of all reads, junction
                         by junction, in report order
    outReportFileName -- receives one tab-separated line per junction
    sampN             -- sample name, written as the third report column

    Junctions are keyed by the (chromosome, position) pair of the two
    breakpoints, oriented by the 'dir:' tag, and sorted by decreasing number
    of distinct (direction, offset) pairs among their reads.

    Raises Exception for a record that is not '(transloc)', does not have
    two segments, or carries an unexpected strand/direction value.
    '''

    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH, geneSetH)

    refFlatH = mygenome.loadRefFlatByChr()

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    juncHH = {}

    for r in result:

        match = r.matchL()[0]

        if not '(transloc)' in r.pairRel:
            raise Exception

        if len(match.segL) != 2:
            raise Exception

        splice_type = re.search('splice_type:([^,\t]*)', match.segL[0][3]).group(1)
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        # Per segment: annotated transcript ids and the gene names they map to.
        transcriptL = []
        geneSetL = []

        for seg in match.segL:

            rm = re.search('label_[12]:([^,\t]*)', seg[3])
            geneS = set()

            if rm:
                transcripts = tuple([x.split('.exon')[0] for x in rm.group(1).split('|')])

                for t in transcripts:
                    g = mygenome.gene(t, geneNameH, geneSetH, geneInfoH)

                    if g.geneName:
                        geneS.add(g.geneName)
            else:
                transcripts = ()

            transcriptL.append(transcripts)
            geneSetL.append(geneS)

        (transcript1, transcript2) = transcriptL
        (gene1, gene2) = geneSetL

        s1 = match.segL[0][2]
        s2 = match.segL[1][2]

        # Breakpoints: end coordinate of segment 1, start coordinate of segment 2.
        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', s1)
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', s2)

        # Transcript strand = alignment strand, flipped for antisense reads.
        strandL = []

        for bp in (bp1, bp2):
            if (bp.group(1), direction) in (('+', 'sense'), ('-', 'antisense')):
                strandL.append('+')
            elif (bp.group(1), direction) in (('+', 'antisense'), ('-', 'sense')):
                strandL.append('-')
            else:
                raise Exception

        (trans_strand1, trans_strand2) = strandL

        bp_gene1 = mygenome.locus('%s:%s-%s%s' % (bp1.group(2), int(bp1.group(3)) - 1, bp1.group(3), trans_strand1)).overlappingGeneL(
            refFlatH=refFlatH, strand_sensitive=True)
        bp_gene2 = mygenome.locus('%s:%s-%s%s' % (bp2.group(2), int(bp2.group(3)) - 1, bp2.group(3), trans_strand2)).overlappingGeneL(
            refFlatH=refFlatH, strand_sensitive=True)

        if direction == 'sense':
            key = (bp1.groups()[1:], bp2.groups()[1:])
            transcript = (transcript1, transcript2)
            gene = (tuple(gene1), tuple(gene2))
            bp_gene = (bp_gene1, bp_gene2)
        elif direction == 'antisense':
            # Antisense reads see the junction from the other side: swap ends.
            key = (bp2.groups()[1:], bp1.groups()[1:])
            transcript = (transcript2, transcript1)
            gene = (tuple(gene2), tuple(gene1))
            bp_gene = (bp_gene2, bp_gene1)
        else:
            raise Exception

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['reg'].append((direction, offset))
        else:
            # Annotation is taken from the first read seen for a junction.
            juncHH[key] = {
                'match': [r],
                'splice_type': splice_type,
                'seq': [r.seq()],
                'reg': [(direction, offset)],
                'transcript': transcript,
                'gene': gene,
                'bp_gene': bp_gene
            }

    # Stable sort, most distinct (direction, offset) pairs first.
    juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['reg'])), reverse=True)

    # 'with' closes both outputs even if writing a junction fails.
    with open(outGsnapFileName, 'w') as outGsnapFile, open(outReportFileName, 'w') as outReportFile:

        for (key, juncH) in juncKH:

            if key[0][0] == key[1][0]:
                juncType = 'intra'
            else:
                juncType = 'inter'

            # Description and census strings for the genes on each side.
            geneInfoLL = []
            censusInfoLL = []

            for side in (0, 1):

                geneInfoL = []
                censusInfoL = []

                for geneName in juncH['gene'][side]:
                    geneObj = mygenome.gene(geneName, geneNameH, geneSetH, geneInfoH)
                    geneInfoL.append('%s:%s:%s' % (geneName, geneObj.getAttr('desc'), geneObj.getAttr('summary')))
                    censusInfoL.append('%s:%s:%s:%s' % (geneObj.getAttr('census_somatic'), geneObj.getAttr('census_germline'),
                                                        geneObj.getAttr('census_mutType'), geneObj.getAttr('census_translocPartners')))

                geneInfoLL.append(geneInfoL)
                censusInfoLL.append(censusInfoL)

            outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                (juncType, juncH['splice_type'], sampN, ':'.join(key[0]), ':'.join(key[1]),
                 ';'.join(juncH['transcript'][0]), ';'.join(juncH['transcript'][1]),
                 ';'.join(juncH['gene'][0]), ';'.join(juncH['gene'][1]),
                 ';'.join(geneInfoLL[0]), ';'.join(geneInfoLL[1]),
                 ';'.join(censusInfoLL[0]), ';'.join(censusInfoLL[1]),
                 ','.join(juncH['bp_gene'][0]), ','.join(juncH['bp_gene'][1]),
                 len(juncH['match']), len(set(juncH['seq'])), len(set(juncH['reg']))))

            for m in juncH['match']:
                outGsnapFile.write(m.rawText() + '\n')
def exonSkip_proc_sort(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    '''
    Group uniquely mapped two-segment reads by junction and write a report.

    inGsnapFileName   -- single-end GSNAP result file; every record must have
                         nLoci == 1, two segments and a 'label_N:' tag on each
    outGsnapFileName  -- receives the raw GSNAP text of all reads, junction
                         by junction, in report order
    outReportFileName -- receives one tab-separated line per junction:
                         sample, the two breakpoints (strand + chrom:pos),
                         the two exon label lists, read count, distinct
                         sequence count, distinct (direction, offset) count
    sampN             -- sample name, written as the first report column

    Junctions are sorted by decreasing number of distinct (direction, offset)
    pairs.

    Raises Exception for a record violating the requirements above or
    carrying an unexpected strand/direction value.
    '''

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    juncHH = {}

    for r in result:

        if r.nLoci != 1:
            raise Exception

        match = r.matchL()[0]

        if len(match.segL) != 2:
            raise Exception

        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        # Exon labels of both segments, '|' separators turned into ','.
        exonLP = []

        for seg in match.segL:
            rm = re.search('label_[12]:([^,\t]*)', seg[3])

            if not rm:
                raise Exception

            exonLP.append(rm.group(1).replace('|', ','))

        s1 = match.segL[0][2]
        s2 = match.segL[1][2]

        # Breakpoints: end coordinate of segment 1, start coordinate of segment 2.
        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', s1)
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', s2)

        # Transcript strand = alignment strand, flipped for antisense reads.
        strandL = []

        for bp in (bp1, bp2):
            if (bp.group(1), direction) in (('+', 'sense'), ('-', 'antisense')):
                strandL.append('+')
            elif (bp.group(1), direction) in (('+', 'antisense'), ('-', 'sense')):
                strandL.append('-')
            else:
                raise Exception

        (trans_strand1, trans_strand2) = strandL

        if direction == 'sense':
            key = ((trans_strand1,) + bp1.groups()[1:], (trans_strand2,) + bp2.groups()[1:])
        elif direction == 'antisense':
            # Antisense reads see the junction from the other side: swap ends.
            key = ((trans_strand2,) + bp2.groups()[1:], (trans_strand1,) + bp1.groups()[1:])
            exonLP = exonLP[::-1]
        else:
            raise Exception

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['reg'].append((direction, offset))
        else:
            # Exon labels are taken from the first read seen for a junction.
            juncHH[key] = {'match': [r], 'seq': [r.seq()], 'reg': [(direction, offset)], 'exonLP': exonLP}

    # Stable sort, most distinct (direction, offset) pairs first.
    juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['reg'])), reverse=True)

    # 'with' closes both outputs even if writing a junction fails.
    with open(outGsnapFileName, 'w') as outGsnapFile, open(outReportFileName, 'w') as outReportFile:

        for (key, juncH) in juncKH:

            outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                (sampN, key[0][0] + ':'.join(key[0][1:]), key[1][0] + ':'.join(key[1][1:]),
                 juncH['exonLP'][0], juncH['exonLP'][1],
                 len(juncH['match']), len(set(juncH['seq'])), len(set(juncH['reg']))))

            for m in juncH['match']:
                outGsnapFile.write(m.rawText() + '\n')
#!/usr/bin/python import sys, re, numpy import mygsnap if len(sys.argv) >= 4: inFileName = sys.argv[1] outFileName_matches = sys.argv[2] outFileName_mmPos = sys.argv[3] else: inFileName = 'GH.txt' outFileName_matches = 'GH_matches.dst' outFileName_mmPos = 'GH_mmPos.txt' result = mygsnap.gsnapFile(inFileName) out_matches = open(outFileName_matches, 'w') out_mmPos = open(outFileName_mmPos, 'w') matches_count = {'unpaired': [], 'concordant': []} matches_score = {'unpaired': [], 'concordant': []} totalPairs = {'unpaired': 0, 'concordant': 0} mmPos = {'unpaired': {0: None, 1: None}, 'concordant': {0: None, 1: None}} for rL in result: if rL[0].nLoci == 1 and rL[1].nLoci == 1 and not '(transloc)' in rL[ 0].pairRel: # unique, no-within-read-splicing mL = [rL[0].matchL()[0], rL[1].matchL()[0]]
def gsnap_process_junction(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    '''
    Group '(transloc)' GSNAP reads by junction and write an annotated report.

    inGsnapFileName   -- single-end GSNAP result file; every record must be
                         flagged '(transloc)' and have exactly two segments
    outGsnapFileName  -- receives the raw GSNAP text of all reads, junction
                         by junction, in report order
    outReportFileName -- receives one tab-separated line per junction
    sampN             -- sample name, written as the third report column

    For each side of a junction the report lists the genes named by the
    segment's transcript labels and, after a ';', any further genes that
    overlap the breakpoint locus itself. Junctions are sorted by decreasing
    number of distinct (direction, offset) pairs among their reads.

    Raises Exception for a record that is not '(transloc)', does not have
    two segments, or carries an unexpected strand/direction value.
    '''

    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH, geneSetH)

    refFlatH = mygenome.loadRefFlatByChr()

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    juncHH = {}

    for r in result:

        match = r.matchL()[0]

        if not '(transloc)' in r.pairRel:
            raise Exception

        if len(match.segL) != 2:
            raise Exception

        splice_type = re.search('splice_type:([^,\t]*)', match.segL[0][3]).group(1)
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        # Per segment: transcript/exon labels and the gene names they map to.
        # Each segment is parsed from its OWN annotation field; the previous
        # code read segL[0] twice, so side 2 merely duplicated side 1.
        transExonL = []
        geneSetL = []

        for seg in match.segL:

            rm = re.search('label_[12]:([^,\t]*)', seg[3])
            geneS = set()

            if rm:
                trans_exon = rm.group(1).split('|')

                for t in trans_exon:
                    g = mygenome.gene(t.split('.exon')[0], geneNameH, geneSetH, geneInfoH)

                    if g.geneName:
                        geneS.add(g.geneName)
            else:
                trans_exon = ()

            transExonL.append(trans_exon)
            geneSetL.append(geneS)

        (trans_exon1, trans_exon2) = transExonL
        (gene1, gene2) = geneSetL

        s1 = match.segL[0][2]
        s2 = match.segL[1][2]

        # Breakpoints: end coordinate of segment 1, start coordinate of segment 2.
        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', s1)
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', s2)

        # Transcript strand = alignment strand, flipped for antisense reads.
        strandL = []

        for bp in (bp1, bp2):
            if (bp.group(1), direction) in (('+', 'sense'), ('-', 'antisense')):
                strandL.append('+')
            elif (bp.group(1), direction) in (('+', 'antisense'), ('-', 'sense')):
                strandL.append('-')
            else:
                raise Exception

        (trans_strand1, trans_strand2) = strandL

        # One-base loci at each breakpoint. Both sides use pos-1..pos; the
        # second side previously used pos-2, unlike the first side and
        # unlike exonSkip_proc.
        locus1 = mygenome.locus('%s:%s-%s%s' % (bp1.group(2), int(bp1.group(3)) - 1, bp1.group(3), trans_strand1))
        bp_gene1 = list(set(locus1.overlappingGeneL(refFlatH=refFlatH, strand_sensitive=True)).difference(gene1))

        locus2 = mygenome.locus('%s:%s-%s%s' % (bp2.group(2), int(bp2.group(3)) - 1, bp2.group(3), trans_strand2))
        bp_gene2 = list(set(locus2.overlappingGeneL(refFlatH=refFlatH, strand_sensitive=True)).difference(gene2))

        if direction == 'sense':
            key = (bp1.groups()[1:], bp2.groups()[1:])
            trans_exon = (trans_exon1, trans_exon2)
            gene = (list(gene1), list(gene2))
            bp_gene = (bp_gene1, bp_gene2)
        elif direction == 'antisense':
            # Antisense reads see the junction from the other side: swap ends.
            key = (bp2.groups()[1:], bp1.groups()[1:])
            trans_exon = (trans_exon2, trans_exon1)
            gene = (list(gene2), list(gene1))
            bp_gene = (bp_gene2, bp_gene1)
        else:
            raise Exception

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['reg'].append((direction, offset))
        else:
            # Annotation is taken from the first read seen for a junction.
            juncHH[key] = {'match': [r], 'splice_type': splice_type, 'seq': [r.seq()], 'reg': [(direction, offset)],
                           'trans_exon': trans_exon, 'gene': gene, 'bp_gene': bp_gene}

    # Stable sort, most distinct (direction, offset) pairs first.
    juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['reg'])), reverse=True)

    # 'with' closes both outputs even if writing a junction fails.
    with open(outGsnapFileName, 'w') as outGsnapFile, open(outReportFileName, 'w') as outReportFile:

        for (key, juncH) in juncKH:

            if key[0][0] == key[1][0]:
                juncType = 'intra'
            else:
                juncType = 'inter'

            # Description and census strings for label genes + breakpoint genes.
            geneInfoLL = []
            censusInfoLL = []

            for side in (0, 1):

                geneInfoL = []
                censusInfoL = []

                for geneName in juncH['gene'][side] + juncH['bp_gene'][side]:
                    geneObj = mygenome.gene(geneName, geneNameH, geneSetH, geneInfoH)
                    geneInfoL.append('%s:%s:%s' % (geneName, geneObj.getAttr('desc'), geneObj.getAttr('summary')))
                    censusInfoL.append('%s:%s:%s:%s' % (geneObj.getAttr('census_somatic'), geneObj.getAttr('census_germline'),
                                                        geneObj.getAttr('census_mutType'), geneObj.getAttr('census_translocPartners')))

                geneInfoLL.append(geneInfoL)
                censusInfoLL.append(censusInfoL)

            outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s;%s\t%s;%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                (juncType, juncH['splice_type'], sampN, ':'.join(key[0]), ':'.join(key[1]),
                 ','.join(juncH['trans_exon'][0]), ','.join(juncH['trans_exon'][1]),
                 ','.join(juncH['gene'][0]), ','.join(juncH['bp_gene'][0]),
                 ','.join(juncH['gene'][1]), ','.join(juncH['bp_gene'][1]),
                 ';'.join(geneInfoLL[0]), ';'.join(geneInfoLL[1]),
                 ';'.join(censusInfoLL[0]), ';'.join(censusInfoLL[1]),
                 len(juncH['match']), len(set(juncH['seq'])), len(set(juncH['reg']))))

            for m in juncH['match']:
                outGsnapFile.write(m.rawText() + '\n')
def fusion_proc_sort(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    '''
    Group '(transloc)' GSNAP reads by junction and write a compact report.

    inGsnapFileName   -- single-end GSNAP result file; every record must be
                         flagged '(transloc)' and have exactly two segments
    outGsnapFileName  -- receives the raw GSNAP text of all reads, junction
                         by junction, in report order
    outReportFileName -- receives one tab-separated line per junction:
                         splice type, sample, the two breakpoints
                         (strand + chrom:pos), the two transcript label
                         lists, read count, distinct sequence count,
                         distinct (direction, offset) count
    sampN             -- sample name, written as the second report column

    Junctions are sorted by decreasing number of distinct (direction, offset)
    pairs.

    Raises Exception for a record that is not '(transloc)', does not have
    two segments, or carries an unexpected strand/direction value.
    '''

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    juncHH = {}

    for r in result:

        match = r.matchL()[0]

        if not '(transloc)' in r.pairRel:
            raise Exception

        if len(match.segL) != 2:
            raise Exception

        splice_type = re.search('splice_type:([^,\t]*)', match.segL[0][3]).group(1)
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        # Transcript labels of both segments ('' when a segment has none).
        transcriptL = []

        for i in range(2):
            rm = re.search('label_[12]:([^,\t]*)', match.segL[i][3])

            if rm:
                transcriptL.append(rm.group(1).replace('|', ','))
            else:
                transcriptL.append('')

        s1 = match.segL[0][2]
        s2 = match.segL[1][2]

        # Breakpoints: end coordinate of segment 1, start coordinate of segment 2.
        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', s1)
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', s2)

        # Transcript strand = alignment strand, flipped for antisense reads.
        strandL = []

        for bp in (bp1, bp2):
            if (bp.group(1), direction) in (('+', 'sense'), ('-', 'antisense')):
                strandL.append('+')
            elif (bp.group(1), direction) in (('+', 'antisense'), ('-', 'sense')):
                strandL.append('-')
            else:
                raise Exception

        (trans_strand1, trans_strand2) = strandL

        if direction == 'sense':
            key = ((trans_strand1,) + bp1.groups()[1:], (trans_strand2,) + bp2.groups()[1:])
        elif direction == 'antisense':
            # Antisense reads see the junction from the other side: swap ends.
            key = ((trans_strand2,) + bp2.groups()[1:], (trans_strand1,) + bp1.groups()[1:])
            transcriptL = transcriptL[::-1]
        else:
            raise Exception

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['pos'].append((direction, offset))
        else:
            # Annotation is taken from the first read seen for a junction.
            juncHH[key] = {
                'match': [r],
                'splice_type': splice_type,
                'seq': [r.seq()],
                'pos': [(direction, offset)],
                'transcript': transcriptL
            }

    # Stable sort, most distinct (direction, offset) pairs first.
    juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['pos'])), reverse=True)

    # 'with' closes both outputs even if writing a junction fails.
    with open(outGsnapFileName, 'w') as outGsnapFile, open(outReportFileName, 'w') as outReportFile:

        for (key, juncH) in juncKH:

            outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                (juncH['splice_type'], sampN, key[0][0] + ':'.join(key[0][1:]), key[1][0] + ':'.join(key[1][1:]),
                 juncH['transcript'][0], juncH['transcript'][1],
                 len(juncH['match']), len(set(juncH['seq'])), len(set(juncH['pos']))))

            for m in juncH['match']:
                outGsnapFile.write(m.rawText() + '\n')
def make_samse(ifileN, ofileN):
    '''
    Print single-end GSNAP records as tab-separated 12-column SAM-style lines.

    Header lines come from make_header(); one line is printed per read, or
    one per segment for '(transloc)' reads.

    NOTE(review): ifileN and ofileN are never used -- the input path is
    hard-coded below and all output goes to stdout via print. Confirm
    whether this is leftover debugging state.
    '''
    headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
    # ofile = open(ofileN, 'w')
    for header in headerL:
        print header
        # ofile.write('%s\n' % header)
    result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_single.gsnap',False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/test.gsnap',False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_pair.gsnap',True)
    ## for unpaired
    for r in result:
        # Per-read defaults; overwritten below for aligned reads.
        qname = r.rid()
        flag = 0x0
        rname = '*'
        pos = 0
        mapq = 0
        cigar = ''
        rnext = '*' ## assume --npath=1 (maximum 1 alignment per read)
        pnext = 0 ## assume --npath=1 (maximum 1 alignment per read)
        tlen = 0 ## assume --npath=1 (maximum 1 alignment per read)
        seq = r.seq()
        qual = r.qual()
        extra = 'NH:i:1\tHI:i:1' ## assume --npath=1 (maximum 1 alignment per read)
        # NOTE(review): reads with more than one locus are emitted with flag
        # 0x4 and '*' CIGAR. A read with nLoci == 0 would take the else
        # branch and index matchL()[0] -- confirm whether '< 1' was intended.
        if r.nLoci > 1:
            flag = flag | 0x4
            cigar = '*'
            new_cigar = '*'
            extra = ''
            print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, new_cigar, rnext, pnext, tlen, seq, qual, extra))
        else:
            if r.pairRel == '(transloc)':
                # One output line per segment, each with its own slice of
                # the read sequence and quality string.
                match = r.matchL()[0]
                segL = match.getSegInfo()
                mapq = segL[0].mapq
                seq = r.seq()
                qual = r.qual()
                for seg in segL:
                    flag = 0x0
                    (strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()
                    pos = min(int(pos1), int(pos2))
                    (cigar,clip) = seg.toCIGAR_trans()
                    if clip < 0: ## first half
                        seq2 = seq[:clip]
                        qual2 = qual[:clip]
                    else: ## second half
                        seq2 = seq[clip:]
                        qual2 = qual[clip:]
                    if strand == '-':
                        # Minus strand: set 0x10, reverse-complement the
                        # sequence and reverse the quality string.
                        flag = flag | 0x10
                        seq2 = mybasic.rc(seq2)
                        qual2 = mybasic.rev(qual2)
                    # print qname,seg.toCIGAR_trans()
                    print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq2, qual2, extra))
            else:
                match = r.matchL()[0] ## assume --npath=1 (maximum 1 alignment per read)
                segL = match.getSegInfo()
                (strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', segL[0].seg[2]).groups()
                pos = min(int(pos1), int(pos2))
                mapq = segL[0].mapq
                seg_nm = segL[0].numSub
                # cigar2 is the CIGAR that is actually printed; 'cigar' and
                # 'new_cigar' below are built but only referenced by the
                # commented-out print at the end.
                cigar2 = match.toCIGAR()
                # print qname, match.toCIGAR()
                # Hand-built CIGAR: leading soft clip, then first segment.
                if segL[0].start != '' and segL[0].start != '0':
                    cigar = str(segL[0].start) + 'S'
                if strand == '-':
                    # Minus strand: operations are prepended.
                    cigar = str(segL[0].numMatch + segL[0].numSub) + 'M' + cigar
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = str(segL[0].ins) + 'I' + cigar
                else:
                    cigar = cigar + str(segL[0].numMatch + segL[0].numSub) + 'M'
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = cigar + str(segL[0].ins) + 'I'
                if len(segL) == 1:
                    new_cigar = segL[0].toCIGAR(True)
                else:
                    new_cigar = segL[0].toCIGAR()
                prev_cigar = new_cigar
                index = 0
                for seg in segL[1:]:
                    index = index + 1
                    # 'final' marks the last segment of the alignment.
                    if index == (len(segL) - 1):
                        final = True
                    else:
                        final = False
                    rm = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()
                    # NOTE: 'match' is rebound from the match object to a
                    # CIGAR string here; cigar2 was computed before the loop.
                    match = str(seg.numMatch + seg.numSub) + 'M'
                    if seg.ins != '' and seg.ins != '0':
                        ins = str(seg.ins) + 'I'
                    else:
                        ins = ''
                    # Reported position is the smallest coordinate over all segments.
                    if pos == 0 or pos > min(int(rm[2]), int(rm[3])):
                        pos = min(int(rm[2]), int(rm[3]))
                    # dist = gap between the previous segment's end and this
                    # segment's start; a positive gap becomes an 'N' operation.
                    if strand == '-':
                        dist = int(pos2) - int(rm[2]) - 1
                        if dist > 0:
                            cigar = match + ins + str(dist) + 'N' + cigar
                        else:
                            cigar = match + ins + cigar
                    else:
                        dist = int(rm[2]) - int(pos2) - 1
                        if dist > 0:
                            cigar = cigar + str(dist) + 'N' + match + ins
                        else:
                            cigar = cigar + match + ins
                    seg_nm = seg_nm + seg.numSub
                    pos1 = rm[2]
                    pos2 = rm[3]
                    cur_cigar = seg.toCIGAR(final)
                    # The 'N' gap is added only when the previous segment's
                    # CIGAR contains no 'D' or 'I'.
                    if strand == '-':
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
                        else:
                            new_cigar = cur_cigar + new_cigar
                    else:
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
                        else:
                            new_cigar = new_cigar + cur_cigar
                    prev_cigar = cur_cigar
                if segL[-1].end != '' and segL[-1].end != '0': ## last segment
                    # Trailing soft clip.
                    if strand == '-':
                        cigar = str(segL[-1].end) + 'S' + cigar
                    else:
                        cigar = cigar + str(segL[-1].end) + 'S'
                # NM tag = substitutions summed over all segments.
                extra = extra + ('\tNM:i:%s' % seg_nm)
                if strand == '-':
                    flag = flag | 0x10
                    seq = mybasic.rc(seq)
                    qual = mybasic.rev(qual)
                ## print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra))
                print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar2, rnext, pnext, tlen, seq, qual, extra))
def exonSkip_filter(inFileName, outFileName):
    '''
    filters-in exon-skipping candidates in splice-mapped gsnap

    inFileName  -- single-end GSNAP result file
    outFileName -- receives the raw GSNAP text of every accepted read;
                   written gzip-compressed when the name ends in '.gz'

    A read is considered only when it maps to exactly one locus with exactly
    two segments, and every segment passes the quality thresholds below.
    This variant writes the read when the smallest exon-number difference
    between its two segments is exactly 1 (consecutive exons).

    Prints 'Results: <written> <examined>' when done; reads rejected by the
    locus/segment/quality filters are not included in <examined>.
    '''

    # Segment labels look like '<transcript>.exon<N>/<total>'.
    exonLabelRe = re.compile(r'(.*)\.exon([0-9]+)\/[0-9]+')

    result = mygsnap.gsnapFile(inFileName, False)

    if outFileName[-3:] == '.gz':
        outFile = gzip.open(outFileName, 'wb')
    else:
        outFile = open(outFileName, 'w')

    count_all = 0
    count_include = 0

    # try/finally closes the output (needed to finish the gzip stream) even
    # if a record fails to parse.
    try:

        for r in result:

            if r.nLoci != 1:
                continue

            match = r.matchL()[0]

            if len(match.segL) != 2:
                continue

            jncH = {}  # transcript id -> exon numbers seen on the segments
            skip = False

            for segObj in match.getSegInfo():

                # Reject the read on >2 non-matching bases, <90% identity
                # or a span shorter than 5 in any segment.
                if segObj.span - segObj.numMatch > 2 or segObj.percMatch < 90 or segObj.span < 5:
                    skip = True
                    break

                # An unlabeled segment ends label collection for this read.
                if segObj.label == '':
                    break

                for b in segObj.label.split('|'):
                    rm2 = exonLabelRe.match(b)
                    mybasic.addHash(jncH, rm2.group(1), int(rm2.group(2)))

            if skip:
                continue

            exonLL = list(jncH.values())

            if len(exonLL) > 0 and max(len(exonL) for exonL in exonLL) > 1:

                # 100 acts as "no transcript carried exactly two exons".
                minDist = 100

                for exonL in exonLL:
                    if len(exonL) == 2 and abs(exonL[0] - exonL[1]) < minDist:
                        minDist = abs(exonL[0] - exonL[1])

                if minDist == 1:  # only difference
                    outFile.write(r.rawText() + '\n')
                    count_include += 1

            count_all += 1

    finally:
        outFile.close()

    print('Results: %s %s' % (count_include, count_all))
import sys import mygsnap if len(sys.argv) >= 3: inFileName = sys.argv[1] outFileName = sys.argv[2] else: inFileName = '/Data2/RNASeq_SMC1_S02_result.txt' outFileName = 'GH_S02_matchfilter2.txt' matchCutOff = 90 result = mygsnap.gsnapFile(inFileName) outFile = open(outFileName, 'w') for rL in result: if not (rL[0].nLoci==1 and rL[1].nLoci==1 and rL[0].pairRel=='unpaired') or '(transloc)' in rL[0].pairRel: continue if not (len(rL[0].matchL()[0].mergedLocusL())==1 and len(rL[1].matchL()[0].mergedLocusL())==1): continue if not (rL[0].matchL()[0].numMatch()>=matchCutOff and rL[1].matchL()[0].numMatch()>=matchCutOff): continue for i in (0,1):