def extractSVReadPairs(bamFilePath, outputFilePath, juncChr1, juncPos1, juncDir1,
                       juncChr2, juncPos2, juncDir2, max_depth, search_length,
                       search_margin):
    """Extract read pairs around an SV candidate and write them as FASTA.

    (yshira 2015/04/23)
    Reads are fetched within `search_length` of each break point.  Unmapped
    reads, reads whose mate is unmapped, secondary/supplementary alignments
    and duplicates are ignored.  A read name is selected when one of its
    reads satisfies any of:

    1. the read, extended by `search_margin` on both sides, contains the
       break point;
    2. the read is on the forward strand, its mate is on the reverse strand
       of the same chromosome, and the break point lies between the two
       start positions;
    3. the mate maps to the chromosome of the other break point and both
       start positions lie on the side selected by the junction directions
       ('+': at or before the break point, '-': at or after it).

    Only names for which both a first-in-pair and a non-first read were
    collected are written, as ``>name/1`` and ``>name/2`` records.  The
    sequence of a reverse-strand alignment is passed through
    ``utils.reverseComplement`` before being written.

    Concerns noted by the original author:

    1. Depending on whether the read start or end is used, the distance
       between read and break point differs, which can introduce a slight
       bias; a sufficient margin and careful summarising of the alignment
       result is expected to compensate.
    2. For some read pairs the alignment result may be obvious; it is an
       open question whether they should be re-aligned.

    Args:
        bamFilePath: path of the indexed BAM file to read.
        outputFilePath: path of the FASTA file to create.
        juncChr1, juncPos1, juncDir1: chromosome, position (int or numeric
            string) and direction ('+' or '-') of the first break point.
        juncChr2, juncPos2, juncDir2: same for the second break point.
        max_depth: if the read count over either break point reaches this
            value the candidate is skipped.
        search_length: half-width of the window fetched around each break point.
        search_margin: margin added to both ends of a read in condition 1.

    Returns:
        1 if the depth threshold was reached (a message is written to
        stderr and no output file is created), 0 otherwise.
    """
    pos1 = int(juncPos1)
    pos2 = int(juncPos2)

    bamfile = pysam.Samfile(bamFilePath, 'rb')
    try:
        # if the number of reads over a break point reaches `max_depth`,
        # the candidate is ignored
        if (bamfile.count(juncChr1, pos1 - 1, pos1 + 1) >= max_depth
                or bamfile.count(juncChr2, pos2 - 1, pos2 + 1) >= max_depth):
            # str() so that integer positions do not break the join
            sys.stderr.write("sequence depth exceeds the threshould for: " + ','.join(
                [juncChr1, str(juncPos1), juncDir1,
                 juncChr2, str(juncPos2), juncDir2]) + "\n")
            return 1

        with open(outputFilePath, 'w') as hOUT:
            # first pass: names of the read pairs to extract
            read_ids = set()
            _collect_sv_read_ids(bamfile, read_ids, juncChr1, pos1, juncDir1,
                                 juncChr2, pos2, juncDir2, search_length, search_margin)
            _collect_sv_read_ids(bamfile, read_ids, juncChr2, pos2, juncDir2,
                                 juncChr1, pos1, juncDir1, search_length, search_margin)

            # second pass: sequences of both reads of every selected pair
            readID2seq1 = {}
            readID2seq2 = {}
            for chrom, pos in ((juncChr1, pos1), (juncChr2, pos2)):
                _collect_sv_read_seqs(bamfile, read_ids, chrom, pos, search_length,
                                      readID2seq1, readID2seq2)

            for readID in readID2seq1:
                if readID in readID2seq2:
                    hOUT.write('>%s/1\n%s\n' % (readID, readID2seq1[readID]))
                    hOUT.write('>%s/2\n%s\n' % (readID, readID2seq2[readID]))
    finally:
        # close the BAM handle on every exit path, including the early return
        bamfile.close()

    return 0


# SAM flag bits of reads that are never used: unmapped (0x4), mate unmapped
# (0x8), secondary (0x100), duplicate (0x400) and supplementary (0x800).
_SV_READ_SKIP_FLAGS = 0x4 | 0x8 | 0x100 | 0x400 | 0x800


def _sv_side_matches(pos, junc_pos, junc_dir):
    """Return True if pos is at or before ('+') / at or after ('-') junc_pos."""
    if junc_dir == "+":
        return pos <= junc_pos
    if junc_dir == "-":
        return pos >= junc_pos
    return False


def _collect_sv_read_ids(bamfile, read_ids, chr_a, pos_a, dir_a, chr_b, pos_b, dir_b,
                         search_length, search_margin):
    """Add to read_ids the names of reads near (chr_a, pos_a) that meet a selection rule."""
    for read in bamfile.fetch(chr_a, max(0, pos_a - search_length), pos_a + search_length):
        flag = int(read.flag)
        if flag & _SV_READ_SKIP_FLAGS:
            continue

        pos_current = int(read.pos + 1)
        dir_current = "-" if flag & 0x10 else "+"
        chr_pair = bamfile.getrname(read.rnext)
        pos_pair = int(read.pnext + 1)
        dir_pair = "-" if flag & 0x20 else "+"

        # the read (with margin) contains the break point
        contains = pos_current - search_margin <= pos_a <= (read.aend - 1) + search_margin

        # forward read / reverse mate on the same chromosome around the break point
        spans = (chr_pair == chr_a and pos_current <= pos_a <= pos_pair
                 and dir_current == "+" and dir_pair == "-")

        # mate lies on the other break point's chromosome and both starts are
        # on the side selected by the junction directions
        bridges = (chr_pair == chr_b
                   and _sv_side_matches(pos_current, pos_a, dir_a)
                   and _sv_side_matches(pos_pair, pos_b, dir_b))

        if contains or spans or bridges:
            read_ids.add(read.qname)


def _collect_sv_read_seqs(bamfile, read_ids, chrom, pos, search_length, seq1, seq2):
    """Store sequences of selected reads near (chrom, pos) into seq1 (first in pair) or seq2."""
    for read in bamfile.fetch(chrom, max(0, pos - search_length), pos + search_length):
        if read.qname not in read_ids:
            continue
        flag = int(read.flag)
        if flag & _SV_READ_SKIP_FLAGS:
            continue

        if flag & 0x10:
            # alignment is on the reverse strand
            tempSeq = utils.reverseComplement(str(read.seq))
        else:
            tempSeq = read.seq

        if flag & 0x40:
            seq1[read.qname] = tempSeq
        else:
            seq2[read.qname] = tempSeq
def clusterJunction(inputFilePath, outputFilePath, check_margin_size):
    """Merge and summarise junction read pairs.

    Every non-blank input line must have at least 16 tab-separated fields.
    The fields used are: 0-2 chromosome/start/end of the first break point,
    3-5 the same for the second break point, 6 read id, 7 inserted sequence
    ("---" when there is none), 8-9 junction directions and 10-15 per-read
    annotations that are carried through unchanged.

    A line is merged into a pooled cluster when chromosomes and directions
    agree and the second break point is shifted consistently with the shift
    of the first one and the change of the inserted-sequence length.  A
    cluster is written out as soon as a line is on another chromosome or
    starts more than `check_margin_size` after the end position of the
    cluster's first line; remaining clusters are written at end of input.
    NOTE(review): this presumably relies on the input being sorted by
    chromosome and start position -- confirm with the caller.

    Each output line holds the most frequent junction of the cluster
    (chr1, end1 - 1, end1, chr2, end2 - 1, end2), the ';'-joined read ids,
    the junction's inserted sequence and directions, the ';'-joined fields
    10-15 and finally the ';'-joined list of all junctions of the cluster.

    Args:
        inputFilePath: path of the tab-separated junction file to read.
        outputFilePath: path of the file to create.
        check_margin_size: maximum distance used to decide that a cluster
            can no longer receive new lines.

    Raises:
        ValueError: if a non-blank line has fewer than 16 fields.
    """
    mergedBedpeInfo = {}
    mergedJunction = {}

    with open(inputFilePath, 'r') as hIN, open(outputFilePath, 'w') as hOUT:
        for line in hIN:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            F = line.rstrip('\n').split('\t')
            if len(F) < 16:
                raise ValueError(
                    "expected at least 16 tab-separated fields, got %d" % len(F))

            # check whether the inserted sequence should be reverse-complemented
            tinseq = F[7]
            if F[7] != "---" and F[8] == F[9] and F[15] == "2":
                tinseq = utils.reverseComplement(F[7])
            junction = ','.join([F[0], F[2], F[8], F[3], F[5], F[9], tinseq])

            matched = False
            delList = []
            for key in sorted(mergedBedpeInfo):
                tchr1, tstart1, tend1, tchr2, tstart2, tend2, tdir1, tdir2, inseqSize = \
                    key.split('\t')

                # the pooled key is sufficiently far from the current line, so no
                # further line can be merged into it: flush it
                if F[0] != tchr1 or int(F[1]) > int(tend1) + check_margin_size:
                    _write_junction_cluster(hOUT, mergedBedpeInfo[key], mergedJunction[key])
                    # deleted after the loop so the dictionaries are not
                    # modified while their keys are being visited
                    delList.append(key)
                    continue

                # chromosomes and directions have to agree
                if not (F[3] == tchr2 and F[8] == tdir1 and F[9] == tdir2):
                    continue

                # detailed check on the junction position considering inserted sequences
                if F[8] == "+":
                    diff = (int(F[2]) - int(tend1)) + (len(F[7]) - int(inseqSize))
                    expected_end2 = {"+": int(tend2) - diff, "-": int(tend2) + diff}
                else:
                    diff = (int(F[2]) - int(tend1)) + (int(inseqSize) - len(F[7]))
                    expected_end2 = {"+": int(tend2) + diff, "-": int(tend2) - diff}

                if F[9] not in expected_end2 or int(F[5]) != expected_end2[F[9]]:
                    continue

                # junction position and direction match: append to the cluster
                matched = True
                tids, tinseqs, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds = \
                    mergedBedpeInfo[key].split('\t')
                mergedBedpeInfo[key] = '\t'.join([
                    tids + ';' + F[6],
                    tinseqs + ';' + F[7],
                    tmqs1 + ';' + F[10],
                    talns1 + ';' + F[11],
                    tmqs2 + ';' + F[12],
                    talns2 + ';' + F[13],
                    tpinds + ';' + F[14],
                    tcinds + ';' + F[15],
                ])
                mergedJunction[key] = mergedJunction[key] + ";" + junction

            for item in delList:
                del mergedBedpeInfo[item]
                del mergedJunction[item]

            # the current line does not match any pooled key: start a new cluster
            if not matched:
                newKey = '\t'.join([F[0], F[1], F[2], F[3], F[4], F[5],
                                    F[8], F[9], str(len(F[7]))])
                mergedBedpeInfo[newKey] = '\t'.join([F[6], F[7]] + F[10:16])
                mergedJunction[newKey] = junction

        # last treatment: flush whatever is still pooled
        for key in sorted(mergedBedpeInfo):
            _write_junction_cluster(hOUT, mergedBedpeInfo[key], mergedJunction[key])


def _write_junction_cluster(hOUT, info, junctions):
    """Write one cluster, represented by its most frequent junction, as one line."""
    tids, tinseqs, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds = info.split('\t')

    # obtain the most frequent junction
    best_junc = collections.Counter(junctions.split(';')).most_common(1)[0][0]
    btchr1, btend1, btdir1, btchr2, btend2, btdir2, btinseq = best_junc.split(',')
    btstart1 = str(int(btend1) - 1)
    btstart2 = str(int(btend2) - 1)

    hOUT.write('\t'.join([btchr1, btstart1, btend1, btchr2, btstart2, btend2,
                          tids, btinseq, btdir1, btdir2, tmqs1, talns1,
                          tmqs2, talns2, tpinds, tcinds]) + '\t' + junctions + '\n')
def getRefAltForSV(outputFilePath, juncChr1, juncPos1, juncDir1, juncChr2, juncPos2,
                   juncDir2, juncSeq, reference_genome, split_refernece_thres,
                   validate_sequence_length):
    """Write reference and alternative (junction) sequences of an SV as FASTA.

    For a short SV (both break points on the same chromosome, at most
    `split_refernece_thres` bp apart and with different directions, i.e. a
    mid-range deletion or tandem duplication) two records are written: one
    reference sequence covering both break points (``_ref``) and the
    junction sequence (``_alt``).

    For every other SV three records are written: the reference sequences
    around each break point (``_ref1``, ``_ref2``) and the junction
    sequence joining both sides (``_alt``).

    The only concern is short inversions (are there somatic short
    inversions?); these are expected to be removed beforehand by the
    "cover filter", so detecting this class of SV may have to be given up.

    Args:
        outputFilePath: path of the FASTA file to create.
        juncChr1, juncPos1, juncDir1: chromosome, position (int or numeric
            string) and direction ('+' or '-') of the first break point.
        juncChr2, juncPos2, juncDir2: same for the second break point.
        juncSeq: inserted sequence at the junction; "---" means none.
        reference_genome: path of the faidx-indexed reference FASTA.
        split_refernece_thres: maximum break point distance handled as a
            short SV.
        validate_sequence_length: number of flanking bases taken on each
            side of a break point.
    """
    if juncSeq == "---":
        juncSeq = ""

    pos1 = int(juncPos1)
    pos2 = int(juncPos2)
    flank = validate_sequence_length
    label = '>' + ','.join([juncChr1, str(juncPos1), juncDir1,
                            juncChr2, str(juncPos2), juncDir2])

    def fetch(chrom, start, end):
        return _fetch_reference_sequence(reference_genome, chrom, start, end)

    with open(outputFilePath, 'w') as hOUT:
        # for mid-range deletion or tandem duplication
        if (juncChr1 == juncChr2 and abs(pos1 - pos2) <= split_refernece_thres
                and juncDir1 != juncDir2):
            hOUT.write(label + "_ref\n" + fetch(juncChr1, pos1 - flank, pos2 + flank) + "\n")

            if juncDir1 == "+" and juncDir2 == "-":
                # mid-range deletion
                seq = (fetch(juncChr1, pos1 - flank, pos1) + juncSeq
                       + fetch(juncChr2, pos2, pos2 + flank))
            else:
                # mid-range tandem duplication
                seq = (fetch(juncChr2, pos2 - flank, pos2) + juncSeq
                       + fetch(juncChr1, pos1, pos1 + flank))
            hOUT.write(label + "_alt\n" + seq + "\n")

        else:
            hOUT.write(label + "_ref1\n" + fetch(juncChr1, pos1 - flank, pos1 + flank) + "\n")
            hOUT.write(label + "_ref2\n" + fetch(juncChr2, pos2 - flank, pos2 + flank) + "\n")

            # first side: bases before the break point for '+', otherwise the
            # reverse complement of the bases after it
            if juncDir1 == "+":
                left = fetch(juncChr1, pos1 - flank, pos1)
            else:
                left = utils.reverseComplement(fetch(juncChr1, pos1, pos1 + flank))

            # second side: bases after the break point for '-', otherwise the
            # reverse complement of the bases before it
            if juncDir2 == "-":
                right = fetch(juncChr2, pos2, pos2 + flank)
            else:
                right = utils.reverseComplement(fetch(juncChr2, pos2 - flank, pos2))

            hOUT.write(label + "_alt\n" + left + juncSeq + right + "\n")


def _fetch_reference_sequence(reference_genome, chrom, start, end):
    """Return the upper-cased sequence of chrom:start-end obtained via pysam.faidx."""
    # faidx regions are 1-based; a window reaching past the contig start would
    # otherwise produce a region string with a zero or negative start
    start = max(1, int(start))
    region = chrom + ":" + str(start) + "-" + str(int(end))

    result = pysam.faidx(reference_genome, region)
    # NOTE(review): newer pysam releases appear to return the command output as
    # a single string instead of a list of lines; accept both -- confirm
    # against the pysam version in use.
    lines = result.splitlines() if hasattr(result, "splitlines") else result

    return "".join(item.rstrip('\n').upper() for item in lines
                   if not item.startswith(">"))
def extractSVReadPairs(bamFilePath, outputFilePath, juncChr1, juncPos1, juncDir1,
                       juncChr2, juncPos2, juncDir2, max_depth, search_length,
                       search_margin):
    """Extract read pairs around an SV candidate and write them as FASTA.

    (yshira 2015/04/23)
    Reads are fetched within `search_length` of each break point.  Unmapped
    reads, reads whose mate is unmapped, secondary/supplementary alignments
    and duplicates are ignored.  A read name is selected when one of its
    reads satisfies any of:

    1. the read, extended by `search_margin` on both sides, contains the
       break point;
    2. the read is on the forward strand, its mate is on the reverse strand
       of the same chromosome, and the break point lies between the two
       start positions;
    3. the mate maps to the chromosome of the other break point and both
       start positions lie on the side selected by the junction directions
       ('+': at or before the break point, '-': at or after it).

    Only names for which both a first-in-pair and a non-first read were
    collected are written, as ``>name/1`` and ``>name/2`` records.  The
    sequence of a reverse-strand alignment is passed through
    ``utils.reverseComplement`` before being written.

    Concerns noted by the original author:

    1. Depending on whether the read start or end is used, the distance
       between read and break point differs, which can introduce a slight
       bias; a sufficient margin and careful summarising of the alignment
       result is expected to compensate.
    2. For some read pairs the alignment result may be obvious; it is an
       open question whether they should be re-aligned.

    Args:
        bamFilePath: path of the indexed BAM file to read.
        outputFilePath: path of the FASTA file to create.
        juncChr1, juncPos1, juncDir1: chromosome, position (int or numeric
            string) and direction ('+' or '-') of the first break point.
        juncChr2, juncPos2, juncDir2: same for the second break point.
        max_depth: if the read count over either break point reaches this
            value the candidate is skipped.
        search_length: half-width of the window fetched around each break point.
        search_margin: margin added to both ends of a read in condition 1.

    Returns:
        1 if the depth threshold was reached (a message is written to
        stderr and no output file is created), 0 otherwise.
    """
    pos1 = int(juncPos1)
    pos2 = int(juncPos2)

    bamfile = pysam.Samfile(bamFilePath, 'rb')
    try:
        # if the number of reads over a break point reaches `max_depth`,
        # the candidate is ignored
        if (bamfile.count(juncChr1, pos1 - 1, pos1 + 1) >= max_depth
                or bamfile.count(juncChr2, pos2 - 1, pos2 + 1) >= max_depth):
            # str() so that integer positions do not break the join
            sys.stderr.write("sequence depth exceeds the threshould for: " + ','.join(
                [juncChr1, str(juncPos1), juncDir1,
                 juncChr2, str(juncPos2), juncDir2]) + "\n")
            return 1

        with open(outputFilePath, 'w') as hOUT:
            # first pass: names of the read pairs to extract
            read_ids = set()
            _collect_sv_read_ids(bamfile, read_ids, juncChr1, pos1, juncDir1,
                                 juncChr2, pos2, juncDir2, search_length, search_margin)
            _collect_sv_read_ids(bamfile, read_ids, juncChr2, pos2, juncDir2,
                                 juncChr1, pos1, juncDir1, search_length, search_margin)

            # second pass: sequences of both reads of every selected pair
            readID2seq1 = {}
            readID2seq2 = {}
            for chrom, pos in ((juncChr1, pos1), (juncChr2, pos2)):
                _collect_sv_read_seqs(bamfile, read_ids, chrom, pos, search_length,
                                      readID2seq1, readID2seq2)

            for readID in readID2seq1:
                if readID in readID2seq2:
                    hOUT.write('>%s/1\n%s\n' % (readID, readID2seq1[readID]))
                    hOUT.write('>%s/2\n%s\n' % (readID, readID2seq2[readID]))
    finally:
        # close the BAM handle on every exit path, including the early return
        bamfile.close()

    return 0


# SAM flag bits of reads that are never used: unmapped (0x4), mate unmapped
# (0x8), secondary (0x100), duplicate (0x400) and supplementary (0x800).
_SV_READ_SKIP_FLAGS = 0x4 | 0x8 | 0x100 | 0x400 | 0x800


def _sv_side_matches(pos, junc_pos, junc_dir):
    """Return True if pos is at or before ('+') / at or after ('-') junc_pos."""
    if junc_dir == "+":
        return pos <= junc_pos
    if junc_dir == "-":
        return pos >= junc_pos
    return False


def _collect_sv_read_ids(bamfile, read_ids, chr_a, pos_a, dir_a, chr_b, pos_b, dir_b,
                         search_length, search_margin):
    """Add to read_ids the names of reads near (chr_a, pos_a) that meet a selection rule."""
    for read in bamfile.fetch(chr_a, max(0, pos_a - search_length), pos_a + search_length):
        flag = int(read.flag)
        if flag & _SV_READ_SKIP_FLAGS:
            continue

        pos_current = int(read.pos + 1)
        dir_current = "-" if flag & 0x10 else "+"
        chr_pair = bamfile.getrname(read.rnext)
        pos_pair = int(read.pnext + 1)
        dir_pair = "-" if flag & 0x20 else "+"

        # the read (with margin) contains the break point
        contains = pos_current - search_margin <= pos_a <= (read.aend - 1) + search_margin

        # forward read / reverse mate on the same chromosome around the break point
        spans = (chr_pair == chr_a and pos_current <= pos_a <= pos_pair
                 and dir_current == "+" and dir_pair == "-")

        # mate lies on the other break point's chromosome and both starts are
        # on the side selected by the junction directions
        bridges = (chr_pair == chr_b
                   and _sv_side_matches(pos_current, pos_a, dir_a)
                   and _sv_side_matches(pos_pair, pos_b, dir_b))

        if contains or spans or bridges:
            read_ids.add(read.qname)


def _collect_sv_read_seqs(bamfile, read_ids, chrom, pos, search_length, seq1, seq2):
    """Store sequences of selected reads near (chrom, pos) into seq1 (first in pair) or seq2."""
    for read in bamfile.fetch(chrom, max(0, pos - search_length), pos + search_length):
        if read.qname not in read_ids:
            continue
        flag = int(read.flag)
        if flag & _SV_READ_SKIP_FLAGS:
            continue

        if flag & 0x10:
            # alignment is on the reverse strand
            tempSeq = utils.reverseComplement(str(read.seq))
        else:
            tempSeq = read.seq

        if flag & 0x40:
            seq1[read.qname] = tempSeq
        else:
            seq2[read.qname] = tempSeq
def getRefAltForSV(outputFilePath, juncChr1, juncPos1, juncDir1, juncChr2, juncPos2,
                   juncDir2, juncSeq, reference_genome, split_refernece_thres,
                   validate_sequence_length):
    """Write reference and alternative (junction) sequences of an SV as FASTA.

    For a short SV (both break points on the same chromosome, at most
    `split_refernece_thres` bp apart and with different directions, i.e. a
    mid-range deletion or tandem duplication) two records are written: one
    reference sequence covering both break points (``_ref``) and the
    junction sequence (``_alt``).

    For every other SV three records are written: the reference sequences
    around each break point (``_ref1``, ``_ref2``) and the junction
    sequence joining both sides (``_alt``).

    The only concern is short inversions (are there somatic short
    inversions?); these are expected to be removed beforehand by the
    "cover filter", so detecting this class of SV may have to be given up.

    Args:
        outputFilePath: path of the FASTA file to create.
        juncChr1, juncPos1, juncDir1: chromosome, position (int or numeric
            string) and direction ('+' or '-') of the first break point.
        juncChr2, juncPos2, juncDir2: same for the second break point.
        juncSeq: inserted sequence at the junction; "---" means none.
        reference_genome: path of the faidx-indexed reference FASTA.
        split_refernece_thres: maximum break point distance handled as a
            short SV.
        validate_sequence_length: number of flanking bases taken on each
            side of a break point.
    """
    if juncSeq == "---":
        juncSeq = ""

    pos1 = int(juncPos1)
    pos2 = int(juncPos2)
    flank = validate_sequence_length
    label = '>' + ','.join([juncChr1, str(juncPos1), juncDir1,
                            juncChr2, str(juncPos2), juncDir2])

    def fetch(chrom, start, end):
        return _fetch_reference_sequence(reference_genome, chrom, start, end)

    with open(outputFilePath, 'w') as hOUT:
        # for mid-range deletion or tandem duplication
        if (juncChr1 == juncChr2 and abs(pos1 - pos2) <= split_refernece_thres
                and juncDir1 != juncDir2):
            hOUT.write(label + "_ref\n" + fetch(juncChr1, pos1 - flank, pos2 + flank) + "\n")

            if juncDir1 == "+" and juncDir2 == "-":
                # mid-range deletion
                seq = (fetch(juncChr1, pos1 - flank, pos1) + juncSeq
                       + fetch(juncChr2, pos2, pos2 + flank))
            else:
                # mid-range tandem duplication
                seq = (fetch(juncChr2, pos2 - flank, pos2) + juncSeq
                       + fetch(juncChr1, pos1, pos1 + flank))
            hOUT.write(label + "_alt\n" + seq + "\n")

        else:
            hOUT.write(label + "_ref1\n" + fetch(juncChr1, pos1 - flank, pos1 + flank) + "\n")
            hOUT.write(label + "_ref2\n" + fetch(juncChr2, pos2 - flank, pos2 + flank) + "\n")

            # first side: bases before the break point for '+', otherwise the
            # reverse complement of the bases after it
            if juncDir1 == "+":
                left = fetch(juncChr1, pos1 - flank, pos1)
            else:
                left = utils.reverseComplement(fetch(juncChr1, pos1, pos1 + flank))

            # second side: bases after the break point for '-', otherwise the
            # reverse complement of the bases before it
            if juncDir2 == "-":
                right = fetch(juncChr2, pos2, pos2 + flank)
            else:
                right = utils.reverseComplement(fetch(juncChr2, pos2 - flank, pos2))

            hOUT.write(label + "_alt\n" + left + juncSeq + right + "\n")


def _fetch_reference_sequence(reference_genome, chrom, start, end):
    """Return the upper-cased sequence of chrom:start-end obtained via pysam.faidx."""
    # faidx regions are 1-based; a window reaching past the contig start would
    # otherwise produce a region string with a zero or negative start
    start = max(1, int(start))
    region = chrom + ":" + str(start) + "-" + str(int(end))

    result = pysam.faidx(reference_genome, region)
    # NOTE(review): newer pysam releases appear to return the command output as
    # a single string instead of a list of lines; accept both -- confirm
    # against the pysam version in use.
    lines = result.splitlines() if hasattr(result, "splitlines") else result

    return "".join(item.rstrip('\n').upper() for item in lines
                   if not item.startswith(">"))
def clusterJunction(inputFilePath, outputFilePath, check_margin_size):
    """Merge and summarise junction read pairs.

    Every non-blank input line must have at least 16 tab-separated fields.
    The fields used are: 0-2 chromosome/start/end of the first break point,
    3-5 the same for the second break point, 6 read id, 7 inserted sequence
    ("---" when there is none), 8-9 junction directions and 10-15 per-read
    annotations that are carried through unchanged.

    A line is merged into a pooled cluster when chromosomes and directions
    agree and the second break point is shifted consistently with the shift
    of the first one and the change of the inserted-sequence length.  A
    cluster is written out as soon as a line is on another chromosome or
    starts more than `check_margin_size` after the end position of the
    cluster's first line; remaining clusters are written at end of input.
    NOTE(review): this presumably relies on the input being sorted by
    chromosome and start position -- confirm with the caller.

    Each output line holds the most frequent junction of the cluster
    (chr1, end1 - 1, end1, chr2, end2 - 1, end2), the ';'-joined read ids,
    the junction's inserted sequence and directions, the ';'-joined fields
    10-15 and finally the ';'-joined list of all junctions of the cluster.

    Args:
        inputFilePath: path of the tab-separated junction file to read.
        outputFilePath: path of the file to create.
        check_margin_size: maximum distance used to decide that a cluster
            can no longer receive new lines.

    Raises:
        ValueError: if a non-blank line has fewer than 16 fields.
    """
    mergedBedpeInfo = {}
    mergedJunction = {}

    with open(inputFilePath, 'r') as hIN, open(outputFilePath, 'w') as hOUT:
        for line in hIN:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            F = line.rstrip('\n').split('\t')
            if len(F) < 16:
                raise ValueError(
                    "expected at least 16 tab-separated fields, got %d" % len(F))

            # check whether the inserted sequence should be reverse-complemented
            tinseq = F[7]
            if F[7] != "---" and F[8] == F[9] and F[15] == "2":
                tinseq = utils.reverseComplement(F[7])
            junction = ','.join([F[0], F[2], F[8], F[3], F[5], F[9], tinseq])

            matched = False
            delList = []
            for key in sorted(mergedBedpeInfo):
                tchr1, tstart1, tend1, tchr2, tstart2, tend2, tdir1, tdir2, inseqSize = \
                    key.split('\t')

                # the pooled key is sufficiently far from the current line, so no
                # further line can be merged into it: flush it
                if F[0] != tchr1 or int(F[1]) > int(tend1) + check_margin_size:
                    _write_junction_cluster(hOUT, mergedBedpeInfo[key], mergedJunction[key])
                    # deleted after the loop so the dictionaries are not
                    # modified while their keys are being visited
                    delList.append(key)
                    continue

                # chromosomes and directions have to agree
                if not (F[3] == tchr2 and F[8] == tdir1 and F[9] == tdir2):
                    continue

                # detailed check on the junction position considering inserted sequences
                if F[8] == "+":
                    diff = (int(F[2]) - int(tend1)) + (len(F[7]) - int(inseqSize))
                    expected_end2 = {"+": int(tend2) - diff, "-": int(tend2) + diff}
                else:
                    diff = (int(F[2]) - int(tend1)) + (int(inseqSize) - len(F[7]))
                    expected_end2 = {"+": int(tend2) + diff, "-": int(tend2) - diff}

                if F[9] not in expected_end2 or int(F[5]) != expected_end2[F[9]]:
                    continue

                # junction position and direction match: append to the cluster
                matched = True
                tids, tinseqs, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds = \
                    mergedBedpeInfo[key].split('\t')
                mergedBedpeInfo[key] = '\t'.join([
                    tids + ';' + F[6],
                    tinseqs + ';' + F[7],
                    tmqs1 + ';' + F[10],
                    talns1 + ';' + F[11],
                    tmqs2 + ';' + F[12],
                    talns2 + ';' + F[13],
                    tpinds + ';' + F[14],
                    tcinds + ';' + F[15],
                ])
                mergedJunction[key] = mergedJunction[key] + ";" + junction

            for item in delList:
                del mergedBedpeInfo[item]
                del mergedJunction[item]

            # the current line does not match any pooled key: start a new cluster
            if not matched:
                newKey = '\t'.join([F[0], F[1], F[2], F[3], F[4], F[5],
                                    F[8], F[9], str(len(F[7]))])
                mergedBedpeInfo[newKey] = '\t'.join([F[6], F[7]] + F[10:16])
                mergedJunction[newKey] = junction

        # last treatment: flush whatever is still pooled
        for key in sorted(mergedBedpeInfo):
            _write_junction_cluster(hOUT, mergedBedpeInfo[key], mergedJunction[key])


def _write_junction_cluster(hOUT, info, junctions):
    """Write one cluster, represented by its most frequent junction, as one line."""
    tids, tinseqs, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds = info.split('\t')

    # obtain the most frequent junction
    best_junc = collections.Counter(junctions.split(';')).most_common(1)[0][0]
    btchr1, btend1, btdir1, btchr2, btend2, btdir2, btinseq = best_junc.split(',')
    btstart1 = str(int(btend1) - 1)
    btstart2 = str(int(btend2) - 1)

    hOUT.write('\t'.join([btchr1, btstart1, btend1, btchr2, btstart2, btend2,
                          tids, btinseq, btdir1, btdir2, tmqs1, talns1,
                          tmqs2, talns2, tpinds, tcinds]) + '\t' + junctions + '\n')
'TTA': 'L', 'TTG': 'L', 'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*', 'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W', } for header, sequence in utils.parse_fasta(sequenceFile): positiveStrand = "" longestCDS = 0 strands = [sequence, utils.reverseComplement(sequence)] for strand in strands: for frame in range(3): proteinSequence = "" for fragment in range(frame, len(strand), 3): codon = strand[fragment:fragment + 3] if len(codon) != 3: continue try: proteinSequence += codon2aminoacid[codon] except KeyError: proteinSequence += 'X' matches = regex_orf.findall(proteinSequence) allORFs = "".join([x for x in matches if x]) if len(allORFs) / float(len(strand)) > longestCDS: longestCDS = len(allORFs) / float(len(strand))