def makeTypeWig(tranFN, wigDir, chrom, strand, species):
    '''Write a 'tType' wig labelling every covered position with id:type.

    Uses the 14th column (index 13) of the transcript file for the type
    info; a different column might turn out to be more appropriate.

    tranFN  -- tab-delimited transcript file; columns 0-4 are read as
               id, chrom, strand, start, end
    wigDir  -- handed to writeWigDictToWig unchanged
    chrom, strand -- only rows matching both are used (the row's strand
               is compared after bioLibCG.switchStrandFormat)
    species -- handed to writeWigDictToWig unchanged
    '''
    coord_id = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tIDType = '%s:%s' % (ls[0], ls[13])

            # 0BASE CONVERSION: end column minus one, used as inclusive end.
            # NOTE(review): it might have to be 0BASE for making wig -- confirm.
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1

            label = '%s ' % tIDType
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + label

    # unique, string
    # NOTE(review): labels are split on spaces, so an id or type that itself
    # contains a space ends up as several entries.
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'tType', wigDir, 'None')
def makeTranscriptome(tranFN, outFN):
    '''Write the spliced sequence of every transcript in tranFN to outFN.

    For each row the exon intervals (columns 8 and 9, comma-terminated
    lists) are fetched from the hg19 genome and concatenated in file order.
    Each record is written as a '> id:geneID:length' header line, the
    sequence, and a blank line.

    tranFN -- tab-delimited transcript file
    outFN  -- output path (overwritten)
    '''
    gf = GenomeFetch.GenomeFetch('hg19')

    # Output is opened first, so outFN is created even if tranFN is missing.
    with open(outFN, 'w') as fOut:
        with open(tranFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])

                # [:-1] drops the trailing comma; starts are shifted by +1
                exonStarts = [int(x) + 1 for x in ls[8][:-1].split(',')]
                exonEnds = [int(x) for x in ls[9][:-1].split(',')]
                tID, gID = ls[0], ls[10]

                mRNA = ''.join([
                    gf.getSequence(bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd))
                    for eStart, eEnd in zip(exonStarts, exonEnds)
                ])

                # reverse direction if negative strand
                # NOTE(review): this is a plain character reversal of the joined
                # exons; whether getSequence already accounts for strand is not
                # visible here -- confirm.
                if tStrand == '-1':
                    mRNA = mRNA[::-1]

                fOut.write('> %s:%s:%s\n' % (tID, gID, len(mRNA)))
                fOut.write(mRNA + '\n\n')
def convertEnsemblBiomart(fN, outFN):
    '''Rewrite a tab-delimited BioMart export as single-block noncoding rows.

    Every input row yields one 16-column output row: id, 'chr'-prefixed
    chromosome, converted strand, start, end, the end value twice more
    (coding start / coding end slots), a block count of 1, comma-terminated
    start and end, input column 7, two 'none' placeholders, input column 8
    suffixed with '_noncoding', 'None' and 'noncoding_noncoding'.
    '''
    out = open(outFN, 'w')
    src = open(fN, 'r')

    for rawLine in src:
        cols = rawLine.strip().split('\t')

        chromName = 'chr' + cols[1]
        strandCode = bioLibCG.switchStrandFormat(cols[2])

        # both coding-boundary slots take the transcript end value
        cols[5] = cols[6] = cols[4]

        fields = (
            cols[0],
            chromName,
            strandCode,
            cols[3],
            cols[4],
            cols[5],
            cols[6],
            1,                       # numBlocks
            cols[3] + ',',           # exon starts
            cols[4] + ',',           # exon ends
            cols[7],
            'none',                  # cs
            'none',                  # ce
            cols[8] + '_noncoding',  # transcript type
            'None',                  # intType
            'noncoding_noncoding',   # gene type
        )
        out.write('\t'.join(map(str, fields)) + '\n')

    src.close()
    out.close()
def get3UTRFromTranscriptome(tranFN, outFN, wholeGene=False):
    '''Write one tcc per transcript: its 3' UTR span, or the whole transcript.

    tranFN    -- tab-delimited transcript file; columns 1-6 are read as
                 chrom, strand, transcript start/end, coding start/end
    outFN     -- output path (overwritten), one tcc per line
    wholeGene -- when true, the full transcript span is written instead

    End columns are decremented by one and used as inclusive ends.  For
    strand '1' the 3' UTR is (coding end + 1, transcript end); otherwise it
    is (transcript start, coding start - 1).

    NOTE(review): transcripts without a 3' UTR are not filtered; they yield
    a span whose start lies one past its end.
    '''
    with open(outFN, 'w') as fOut:
        with open(tranFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
                tStart, tEnd = int(ls[3]), int(ls[4]) - 1
                # coding columns are parsed even in wholeGene mode, so a
                # malformed row still fails loudly
                cStart, cEnd = int(ls[5]), int(ls[6]) - 1

                if wholeGene:
                    span = (tStart, tEnd)
                elif tStrand == '1':
                    span = (cEnd + 1, tEnd)
                else:
                    span = (tStart, cStart - 1)

                utrTcc = bioLibCG.makeTcc(tChrom, tStrand, span[0], span[1])
                fOut.write('%s\n' % utrTcc)
def makeTypeWig(tranFN, wigDir, chrom, strand, species):
    '''Write a 'tType' wig labelling every covered position with id:type.

    Uses the 14th column (index 13) of the transcript file for the type
    info; a different column might turn out to be more appropriate.

    tranFN  -- tab-delimited transcript file; columns 0-4 are read as
               id, chrom, strand, start, end
    wigDir  -- handed to writeWigDictToWig unchanged
    chrom, strand -- only rows matching both are used (the row's strand
               is compared after bioLibCG.switchStrandFormat)
    species -- handed to writeWigDictToWig unchanged
    '''
    coord_id = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tIDType = '%s:%s' % (ls[0], ls[13])

            # 0BASE CONVERSION: end column minus one, used as inclusive end.
            # NOTE(review): it might have to be 0BASE for making wig -- confirm.
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1

            label = '%s ' % tIDType
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + label

    # unique, string
    # NOTE(review): labels are split on spaces, so an id or type that itself
    # contains a space ends up as several entries.
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'tType', wigDir, 'None')
def makeGeneWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write an 'ALL' wig labelling every covered position with gene ids.

    tranFN  -- tab-delimited transcript file; columns 1-4 are read as
               chrom, strand, start, end and column 10 as the gene id
    wigDir  -- handed to writeWigDictToWig unchanged
    chrom, strand -- only rows matching both are used (the row's strand
               is compared after bioLibCG.switchStrandFormat)
    species -- handed to writeWigDictToWig; defaults to 'hg19'
    '''
    coord_id = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end column minus one, used as an inclusive end below
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1

            # $ is used as the separator because gene ids may contain spaces
            label = '%s$' % ls[10]
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + label

    # unique, string; the trailing '$' leaves an empty piece that is dropped
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(
            [x for x in set(ids.strip().split('$')) if x != ''])

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'ALL', wigDir, 'None')
def convertEnsemblBiomart(fN, outFN):
    '''Rewrite a tab-delimited BioMart export as single-block noncoding rows.

    Every input row yields one 16-column output row: id, 'chr'-prefixed
    chromosome, converted strand, start, end, the end value twice more
    (coding start / coding end slots), a block count of 1, comma-terminated
    start and end, input column 7, two 'none' placeholders, input column 8
    suffixed with '_noncoding', 'None' and 'noncoding_noncoding'.
    '''
    out = open(outFN, 'w')
    src = open(fN, 'r')

    for rawLine in src:
        cols = rawLine.strip().split('\t')

        chromName = 'chr' + cols[1]
        strandCode = bioLibCG.switchStrandFormat(cols[2])

        # both coding-boundary slots take the transcript end value
        cols[5] = cols[6] = cols[4]

        fields = (
            cols[0],
            chromName,
            strandCode,
            cols[3],
            cols[4],
            cols[5],
            cols[6],
            1,                       # numBlocks
            cols[3] + ',',           # exon starts
            cols[4] + ',',           # exon ends
            cols[7],
            'none',                  # cs
            'none',                  # ce
            cols[8] + '_noncoding',  # transcript type
            'None',                  # intType
            'noncoding_noncoding',   # gene type
        )
        out.write('\t'.join(map(str, fields)) + '\n')

    src.close()
    out.close()
def getTccs(fN):
    '''Print one tcc for every row of a tab-delimited file.

    Columns 0, 1, 2 and 5 are read as chrom, start, end and strand; the
    strand is passed through bioLibCG.switchStrandFormat first.  Start and
    end are handed to bioLibCG.makeTcc as the raw strings from the file.
    '''
    # with-block: the handle is released even when a short row raises
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, start, end = ls[0], ls[1], ls[2]
            strand = bioLibCG.switchStrandFormat(ls[5])
            print(bioLibCG.makeTcc(chrom, strand, start, end))
def getTccs(fN):
    '''Print one tcc for every row of a tab-delimited file.

    Columns 0, 1, 2 and 5 are read as chrom, start, end and strand; the
    strand is passed through bioLibCG.switchStrandFormat first.  Start and
    end are handed to bioLibCG.makeTcc as the raw strings from the file.
    '''
    # with-block: the handle is released even when a short row raises
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, start, end = ls[0], ls[1], ls[2]
            strand = bioLibCG.switchStrandFormat(ls[5])
            print(bioLibCG.makeTcc(chrom, strand, start, end))
def collectIDs2(fN, fN2, fN3):
    '''Used for getting the # repeat reads on target results.

    fN  -- tab-delimited file whose column 9 holds comma-separated ids
    fN2 -- tab-delimited file; rows with column 18 equal to "F" are
           skipped, and rows whose column 0 is one of the ids from fN
           contribute their column 11 value
    fN3 -- tab-delimited alignment file; columns 0-3 are read as read
           name, strand, chrom, start

    The values collected from fN2 are turned into a
    chrom -> strand -> coordinates map by getHitMap.  Every read in fN3
    with a position near such a coordinate is selected, and for each
    selected read name the number of lines it occupies in fN3 is printed
    as 'name<TAB>count'.
    '''
    idSet = set()
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            idSet.update(ls[9].split(','))

    idSetDeg = set()
    with open(fN2, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if ls[18] == "F":
                continue
            if ls[0] in idSet:
                idSetDeg.add(ls[11])

    chrom_strand_coord = getHitMap(list(idSetDeg))

    # first pass: which reads land next to a hit coordinate
    readNames = set()
    with open(fN3, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start = ls[2], ls[1], int(ls[3])
            strand = bioLibCG.switchStrandFormat(strand)
            try:
                hits = chrom_strand_coord[chrom][strand]
            except KeyError:
                # nothing mapped on this chrom/strand
                continue
            # window is start-3 .. start+2
            # NOTE(review): asymmetric window -- confirm this is intended
            if any(pos in hits for pos in range(start - 3, start + 3)):
                readNames.add(ls[0])

    # now go back through and count the times the read appears
    readName_count = {}
    with open(fN3, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if ls[0] in readNames:
                readName_count[ls[0]] = readName_count.get(ls[0], 0) + 1

    for read, count in readName_count.iteritems():
        print('%s\t%s' % (read, count))
def getCoords(fN, rn=None, tn=None):
    '''Print one tab-separated row for every tcc stored in the Nexus file fN.

    Row layout: chrom, start, end, 'None' (name), '0' (score), converted
    strand, then start and end again (thick start / thick end).
    rn and tn are forwarded to Nexus.load unchanged.
    '''
    nexus = cgNexusFlat.Nexus(fN, cgOriginRNAFlat.OriginRNA)
    nexus.load(['tcc'], [rn, tn])

    for rnaID in nexus.tcc:
        chrom, tccStrand, start, end = bioLibCG.tccSplit(nexus.tcc[rnaID])
        outStrand = bioLibCG.switchStrandFormat(tccStrand)
        # the thick range simply repeats the full range
        fields = (chrom, start, end, 'None', '0', outStrand, start, end)
        print('\t'.join(map(str, fields)))
def getCoords(fN, rn=None, tn=None):
    '''Print one tab-separated row for every tcc stored in the Nexus file fN.

    Row layout: chrom, start, end, 'None' (name), '0' (score), converted
    strand, then start and end again (thick start / thick end).
    rn and tn are forwarded to Nexus.load unchanged.
    '''
    nexus = cgNexusFlat.Nexus(fN, cgOriginRNAFlat.OriginRNA)
    nexus.load(['tcc'], [rn, tn])

    for rnaID in nexus.tcc:
        chrom, tccStrand, start, end = bioLibCG.tccSplit(nexus.tcc[rnaID])
        outStrand = bioLibCG.switchStrandFormat(tccStrand)
        # the thick range simply repeats the full range
        fields = (chrom, start, end, 'None', '0', outStrand, start, end)
        print('\t'.join(map(str, fields)))
def get20mers(aluFN):
    '''Count every 20-long frame found in the regions listed in aluFN.

    aluFN is tab-delimited: column 0 is a 'chrom:start-end' coordinate and
    column 2 the strand.  Each region's hg19 sequence is cut into frames by
    bioLibCG.returnFrames(seq, 20); the totals are printed as
    'frame<TAB>count', one line per distinct frame.
    '''
    gf = GenomeFetch.GenomeFetch('hg19')
    seq_count = {}

    # with-block: the handle is released even when a malformed row raises
    with open(aluFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            # 'chrom:start-end' -> chrom, start, end (kept as strings)
            coordParts = ls[0].split(':')
            chrom = coordParts[0]
            bounds = coordParts[1].split('-')
            start, end = bounds[0], bounds[1]

            strand = bioLibCG.switchStrandFormat(ls[2])
            tcc = bioLibCG.makeTcc(chrom, strand, start, end)
            frames = bioLibCG.returnFrames(gf.getSequence(tcc), 20)

            # NOTE(review): returnFrames apparently answers with the integer
            # 1 when no frames can be produced -- confirm against bioLibCG.
            if frames == 1:
                continue

            for smallSeq in frames:
                seq_count[smallSeq] = seq_count.get(smallSeq, 0) + 1

    for seq, count in seq_count.items():
        print('%s\t%s' % (seq, count))
def get20mers(aluFN):
    '''Count every 20-long frame found in the regions listed in aluFN.

    aluFN is tab-delimited: column 0 is a 'chrom:start-end' coordinate and
    column 2 the strand.  Each region's hg19 sequence is cut into frames by
    bioLibCG.returnFrames(seq, 20); the totals are printed as
    'frame<TAB>count', one line per distinct frame.
    '''
    gf = GenomeFetch.GenomeFetch('hg19')
    seq_count = {}

    # with-block: the handle is released even when a malformed row raises
    with open(aluFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            # 'chrom:start-end' -> chrom, start, end (kept as strings)
            coordParts = ls[0].split(':')
            chrom = coordParts[0]
            bounds = coordParts[1].split('-')
            start, end = bounds[0], bounds[1]

            strand = bioLibCG.switchStrandFormat(ls[2])
            tcc = bioLibCG.makeTcc(chrom, strand, start, end)
            frames = bioLibCG.returnFrames(gf.getSequence(tcc), 20)

            # NOTE(review): returnFrames apparently answers with the integer
            # 1 when no frames can be produced -- confirm against bioLibCG.
            if frames == 1:
                continue

            for smallSeq in frames:
                seq_count[smallSeq] = seq_count.get(smallSeq, 0) + 1

    for seq, count in seq_count.items():
        print('%s\t%s' % (seq, count))
def makeTranscriptWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write a 'transcript' wig labelling every covered position with ids.

    tranFN  -- tab-delimited transcript file; columns 0-4 are read as
               id, chrom, strand, start, end
    wigDir  -- handed to writeWigDictToWig unchanged
    chrom, strand -- only rows matching both are used (the row's strand
               is compared after bioLibCG.switchStrandFormat)
    species -- handed to writeWigDictToWig; defaults to 'hg19'
    '''
    coord_id = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end column minus one, used as an inclusive end below
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1

            label = '%s ' % ls[0]
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + label

    # unique, string
    # NOTE(review): ids are split on spaces, so an id containing a space
    # ends up as several entries.
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'transcript', wigDir, 'None')
def updateMultipleTccs(oDir, mappedFN):
    '''Attach every mapped location from mappedFN to its origin RNA in oDir.

    mappedFN is tab-delimited (the original comment calls it bowtie
    output): column 0 is an integer origin-RNA id, columns 1-3 are strand,
    chrom and first coordinate, and the length of column 4 gives the
    extent.  All tccs collected for an id are stored on that record's
    `tccs` attribute and the records are committed back.

    Raises KeyError when a stored record has no line in mappedFN.
    '''
    # parse bowtie
    oID_tccs = {}
    with open(mappedFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            oID = int(ls[0])
            strand = bioLibCG.switchStrandFormat(ls[1])
            chrom, firstCoord = ls[2], int(ls[3])
            # inclusive end: first coordinate plus sequence length minus one
            secondCoord = firstCoord + len(ls[4]) - 1
            tcc = bioLibCG.makeTcc(chrom, strand, firstCoord, secondCoord)
            oID_tccs.setdefault(oID, []).append(tcc)

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oID, oRNA in id_oRNA.items():
        oRNA.tccs = oID_tccs[oID]

    oDC.commit(id_oRNA)
def updateMultipleTccs(oDir, mappedFN):
    '''Attach every mapped location from mappedFN to its origin RNA in oDir.

    mappedFN is tab-delimited (the original comment calls it bowtie
    output): column 0 is an integer origin-RNA id, columns 1-3 are strand,
    chrom and first coordinate, and the length of column 4 gives the
    extent.  All tccs collected for an id are stored on that record's
    `tccs` attribute and the records are committed back.

    Raises KeyError when a stored record has no line in mappedFN.
    '''
    # parse bowtie
    oID_tccs = {}
    with open(mappedFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            oID = int(ls[0])
            strand = bioLibCG.switchStrandFormat(ls[1])
            chrom, firstCoord = ls[2], int(ls[3])
            # inclusive end: first coordinate plus sequence length minus one
            secondCoord = firstCoord + len(ls[4]) - 1
            tcc = bioLibCG.makeTcc(chrom, strand, firstCoord, secondCoord)
            oID_tccs.setdefault(oID, []).append(tcc)

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oID, oRNA in id_oRNA.items():
        oRNA.tccs = oID_tccs[oID]

    oDC.commit(id_oRNA)
def makeGeneWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write an 'ALL' wig labelling every covered position with gene ids.

    tranFN  -- tab-delimited transcript file; columns 1-4 are read as
               chrom, strand, start, end and column 10 as the gene id
    wigDir  -- handed to writeWigDictToWig unchanged
    chrom, strand -- only rows matching both are used (the row's strand
               is compared after bioLibCG.switchStrandFormat)
    species -- handed to writeWigDictToWig; defaults to 'hg19'
    '''
    coord_id = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end column minus one, used as an inclusive end below
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1

            # $ is used as the separator because gene ids may contain spaces
            label = '%s$' % ls[10]
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + label

    # unique, string; the trailing '$' leaves an empty piece that is dropped
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(
            [x for x in set(ids.strip().split('$')) if x != ''])

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'ALL', wigDir, 'None')
def makeTranscriptWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write a 'transcript' wig labelling every covered position with ids.

    tranFN  -- tab-delimited transcript file; columns 0-4 are read as
               id, chrom, strand, start, end
    wigDir  -- handed to writeWigDictToWig unchanged
    chrom, strand -- only rows matching both are used (the row's strand
               is compared after bioLibCG.switchStrandFormat)
    species -- handed to writeWigDictToWig; defaults to 'hg19'
    '''
    coord_id = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end column minus one, used as an inclusive end below
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1

            label = '%s ' % ls[0]
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + label

    # unique, string
    # NOTE(review): ids are split on spaces, so an id containing a space
    # ends up as several entries.
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'transcript', wigDir, 'None')
def updateTccAndSNR(mFN, alignmentFN):
    '''Overwrite tcc and snrSS in the Nexus file mFN from an alignment file.

    The n-th line of alignmentFN (tab-delimited; columns 1-4 read as
    strand, chrom, start, sequence) updates the record keyed by n, counting
    from 0.  The tcc is rebuilt from the alignment and snrSS is set to the
    constant 10.0.  The Nexus file is saved afterwards.
    '''
    mNX = cgNexusFlat.Nexus(mFN, cgOriginRNAFlat.OriginRNA)
    mNX.load(['tcc', 'snrSS'])

    # with-block: the handle is released even when a malformed row raises
    with open(alignmentFN, 'r') as f:
        for i, line in enumerate(f):
            ls = line.strip().split('\t')
            chrom = ls[2]
            strand = bioLibCG.switchStrandFormat(ls[1])
            start = int(ls[3]) + 1  # 1BASE conversion
            # NOTE(review): updateMultipleTccs uses start + len - 1 for the
            # end; here the end is start + len -- confirm which is intended.
            end = start + len(ls[4])

            mNX.tcc[i] = bioLibCG.makeTcc(chrom, strand, start, end)
            mNX.snrSS[i] = 10.0

    mNX.save()
def checkMessy(tranFN):
    '''Count transcripts whose coding region leaves no room for a UTR.

    Prints five space-separated totals on one line:
        transcripts read,
        transcripts without a 5' UTR,
        transcripts without a 3' UTR,
        transcripts without a 5' UTR that are flagged coding,
        transcripts without a 3' UTR that are flagged coding.

    A side counts as empty when the coding boundary on that side coincides
    with the transcript start or lies one past the transcript end.  The
    coding flag is '_coding' occurring in column 15 of the row.
    '''
    p = bioLibCG.cgPrint()
    p.show = False  # debug

    total = no5 = no3 = no5Coding = no3Coding = 0

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand = bioLibCG.switchStrandFormat(ls[2])
            # end columns minus one, i.e. inclusive ends
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            codingStatus = '_coding' in ls[15]

            # nothing upstream / downstream of the coding region in
            # genomic orientation
            leftEmpty = cStart == tStart or cStart == tEnd + 1
            rightEmpty = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1

            # strand '1': 5' end is on the left; otherwise it is on the right
            if strand == '1':
                missing5, missing3 = leftEmpty, rightEmpty
            else:
                missing5, missing3 = rightEmpty, leftEmpty

            if missing5:
                p.tell('5 is none')
                no5 += 1
                if codingStatus:
                    no5Coding += 1

            if missing3:
                p.tell('3 is none')
                no3 += 1
                if codingStatus:
                    no3Coding += 1

            total += 1

    print('%s %s %s %s %s' % (total, no5, no3, no5Coding, no3Coding))
def createCollapsedGeneSets(tranFN, outFN, acceptableTypes='EXON', onlyCoding=True):
    '''Get the areas occupied by all transcripts in a gene.

    tranFN          -- tab-delimited transcript file
    outFN           -- output path (overwritten); one row per gene:
                       name, chrom, strand, comma-joined interval lower
                       bounds, comma-joined interval upper bounds
    acceptableTypes -- comma-separated subset of EXON, INTRON, 5UTR, 3UTR;
                       only regions of these kinds are collected
    onlyCoding      -- when true, transcripts without '_coding' in column
                       13 contribute no regions

    Regions are merged per gene in an IntervalSet, each stored as
    Interval(start, end + 1).  A gene seen on more than one chrom/strand
    combination is dropped and reported with a 'FAILED' line on stdout.
    '''
    acceptableTypes = acceptableTypes.strip().split(',')
    geneName_intervalSet = {}
    geneName_info = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            # end columns minus one, i.e. inclusive ends
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = zip(exonStarts, exonEnds)
            codingStatus = '_coding' in ls[13]
            geneName = ls[10]

            # calculate intron pairs: the gaps between consecutive exons
            intronPairs = [(left[1] + 1, right[0] - 1)
                           for left, right in zip(exonPairs, exonPairs[1:])]

            # take care of messy UTRs: a side is empty when the coding
            # boundary sits on the transcript start or one past its end
            if cStart == tStart or cStart == tEnd + 1:
                leftRange = ()
            else:
                leftRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightRange = ()
            else:
                rightRange = (cEnd + 1, tEnd)

            # strand '1': 5' UTR is the left flank; otherwise the right one
            if tStrand == '1':
                range5, range3 = leftRange, rightRange
            else:
                range5, range3 = rightRange, leftRange

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            # recorded for every transcript, coding or not
            geneName_info.setdefault(geneName, set()).add((tChrom, tStrand))

            if onlyCoding and not codingStatus:
                continue

            pairs__kind = [
                (exonPairs, 'EXON'),
                (intronPairs, 'INTRON'),
                (utr5, '5UTR'),
                (utr3, '3UTR'),
            ]
            for pairs, kind in pairs__kind:
                if kind not in acceptableTypes:
                    continue
                for pair in pairs:
                    # create geneset if it does not exist
                    if geneName not in geneName_intervalSet:
                        geneName_intervalSet[geneName] = IntervalSet()
                    geneName_intervalSet[geneName].add(
                        Interval(pair[0], pair[1] + 1))

    for geneName, info in geneName_info.iteritems():
        if len(info) > 1:
            # it spans different chromosomes/strands...
            if geneName in geneName_intervalSet:
                del geneName_intervalSet[geneName]
            print('%s %s FAILED' % (geneName, info))

    with open(outFN, 'w') as fOut:
        for geneName, iSet in geneName_intervalSet.iteritems():
            gStarts = [str(interv.lower_bound) for interv in iSet]
            gEnds = [str(interv.upper_bound) for interv in iSet]
            # exactly one (chrom, strand) is left for every surviving gene
            chrom, strand = geneName_info[geneName].pop()
            outString = [geneName, chrom, strand,
                         ','.join(gStarts), ','.join(gEnds)]
            fOut.write('\t'.join([str(x) for x in outString]) + '\n')
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    '''Get the number of spots in each data set, and the number that overlap.

    Only coding-exon positions (UTR flanks removed) of transcripts on the
    given chrom/strand are examined, each position once.  For every value
    v in 1 .. maxCut-1 one tab-separated line is printed:

        num1  num2  numOverlap  chrom:strand  v

    where num1 / num2 count positions whose wig value equals v in wigDir1 /
    wigDir2 and numOverlap counts positions where both do.

    wigDir1 is loaded for `strand`, wigDir2 for the opposite strand
    (wigDir2 has to be the hela set because of the strand flip).
    maxCut may be given as a string; it is converted with int().
    Intron positions are not tallied.
    '''
    maxCut = int(maxCut)
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    covered = set()
    # cut -> [num1, num2, numOverlap]
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end columns minus one, i.e. inclusive ends
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = zip(exonStarts, exonEnds)
            codingStatus = '_coding' in ls[13]

            # take care of messy UTRs: a side is empty when the coding
            # boundary sits on the transcript start or one past its end
            if cStart == tStart or cStart == tEnd + 1:
                leftRange = ()
            else:
                leftRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightRange = ()
            else:
                rightRange = (cEnd + 1, tEnd)

            # strand '1': 5' UTR is the left flank; otherwise the right one
            if tStrand == '1':
                range5, range3 = leftRange, rightRange
            else:
                range5, range3 = rightRange, leftRange

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            if not codingStatus:
                continue

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    # multiple transcripts will have same exons
                    if i in covered:
                        continue
                    covered.add(i)

                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)
                    for cut in cuts:
                        # exact match on the value, not a >= threshold
                        in1 = (val1 == cut)
                        in2 = (val2 == cut)
                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1
                        if in1:
                            cutoff_overlap[cut][0] += 1
                        if in2:
                            cutoff_overlap[cut][1] += 1

    for cut in cuts:
        row = cutoff_overlap[cut] + ['%s:%s' % (chrom, strand), cut]
        print('\t'.join([str(x) for x in row]))
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write a 'context' wig labelling each position as UTR, exon or intron.

    For every transcript on the given chrom/strand, positions are labelled
    5UTR, EXON, INTRON or 3UTR, prefixed with 'C_' when column 13 contains
    '_coding' and 'NC_' otherwise.  A position covered by several regions
    gets the comma-joined set of distinct labels.

    tranFN  -- tab-delimited transcript file
    wigDir  -- handed to writeWigDictToWig unchanged
    species -- handed to writeWigDictToWig; defaults to 'hg19'
    '''
    p = bioLibCG.cgPrint()
    p.show = False  # debug output off
    debugSpot = 23631989  # coordinate singled out in the debug messages
    coord_id = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end columns minus one, i.e. inclusive ends
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = zip(exonStarts, exonEnds)
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            # introns are the gaps between consecutive exons
            intronPairs = [(left[1] + 1, right[0] - 1)
                           for left, right in zip(exonPairs, exonPairs[1:])]

            # take care of messy UTRs: a side is empty when the coding
            # boundary sits on the transcript start or one past its end
            if cStart == tStart or cStart == tEnd + 1:
                leftRange = ()
            else:
                leftRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightRange = ()
            else:
                rightRange = (cEnd + 1, tEnd)

            # strand '1': 5' UTR is the left flank; otherwise the right one
            if strand == '1':
                range5, range3 = leftRange, rightRange
            else:
                range5, range3 = rightRange, leftRange

            if range5 == ():
                p.tell('5 is none')
            if range3 == ():
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            p.tell('utr', utr5, utr3)
            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            # (regions, label stem, debug message per region, debug message
            # at debugSpot); filled in this order
            regions = [
                (utr5, '5UTR', 'filling utr5', '*** 5UTR'),
                (exonPairs, 'EXON', 'filling exons', '*** exon'),
                (intronPairs, 'INTRON', 'filling introns', '*** INTRON'),
                (utr3, '3UTR', 'filling utr3', ' *** 3UTR'),
            ]
            prefix = 'C_' if codingStatus else 'NC_'
            for pairs, stem, fillMsg, spotMsg in regions:
                label = prefix + stem + ' '
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot:
                            p.tell(spotMsg, codingStatus, tID)
                        coord_id[i] = coord_id.get(i, '') + label

    # uniqify, stringify
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir, 'INTER')
def getSplicingUnitLengths(tranFN, wigDir, chrom, strand):
    '''Print the number of distinct positions covered by coding exons.

    Every row of tranFN is used: the chrom/strand filter is disabled, so
    wigDir, chrom and strand are accepted for interface compatibility but
    not consulted.  Positions are collected per (chrom, strand) of the row
    after removing the UTR flanks, and only for transcripts with '_coding'
    in column 13.

    Intron positions are not collected, so the intron total printed is
    always 0.
    '''
    exonChr_strand_coord = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])

            # end columns minus one, i.e. inclusive ends
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = zip(exonStarts, exonEnds)
            codingStatus = '_coding' in ls[13]

            # take care of messy UTRs: a side is empty when the coding
            # boundary sits on the transcript start or one past its end
            if cStart == tStart or cStart == tEnd + 1:
                leftRange = ()
            else:
                leftRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightRange = ()
            else:
                rightRange = (cEnd + 1, tEnd)

            # Orientation follows the row's own strand, since rows of every
            # strand pass through this loop.
            if tStrand == '1':
                range5, range3 = leftRange, rightRange
            else:
                range5, range3 = rightRange, leftRange

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            if not codingStatus:
                continue

            for pair in exonPairs:
                coords = exonChr_strand_coord.setdefault(
                    tChrom, {}).setdefault(tStrand, set())
                coords.update(xrange(pair[0], pair[1] + 1))

    eLength = 0
    for exonChrom in exonChr_strand_coord:
        for exonStrand in exonChr_strand_coord[exonChrom]:
            eLength += len(exonChr_strand_coord[exonChrom][exonStrand])

    # intron collection is switched off, so there is nothing to add up
    iLength = 0

    print('total Exon Length (all exons overlapped) %s' % eLength)
    print('total intron Length (all introns overlapped) %s' % iLength)
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    """Get the number of spots in each data set, and the number that overlap.

    Only coding-exon positions (UTR flanks removed) of transcripts on the
    given chrom/strand are examined, each position once.  For every value
    v in 1 .. maxCut-1 one tab-separated line is printed:

        num1  num2  numOverlap  chrom:strand  v

    where num1 / num2 count positions whose wig value equals v in wigDir1 /
    wigDir2 and numOverlap counts positions where both do.

    wigDir1 is loaded for `strand`, wigDir2 for the opposite strand
    (wigDir2 has to be the hela set because of the strand flip).
    maxCut may be given as a string; it is converted with int().
    Intron positions are not tallied.
    """
    maxCut = int(maxCut)
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, "ALL")
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, "ALL")

    covered = set()
    # cut -> [num1, num2, numOverlap]
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end columns minus one, i.e. inclusive ends
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]
            exonPairs = zip(exonStarts, exonEnds)
            codingStatus = "_coding" in ls[13]

            # take care of messy UTRs: a side is empty when the coding
            # boundary sits on the transcript start or one past its end
            if cStart == tStart or cStart == tEnd + 1:
                leftRange = ()
            else:
                leftRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightRange = ()
            else:
                rightRange = (cEnd + 1, tEnd)

            # strand "1": 5' UTR is the left flank; otherwise the right one
            if tStrand == "1":
                range5, range3 = leftRange, rightRange
            else:
                range5, range3 = rightRange, leftRange

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            if not codingStatus:
                continue

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    # multiple transcripts will have same exons
                    if i in covered:
                        continue
                    covered.add(i)

                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)
                    for cut in cuts:
                        # exact match on the value, not a >= threshold
                        in1 = val1 == cut
                        in2 = val2 == cut
                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1
                        if in1:
                            cutoff_overlap[cut][0] += 1
                        if in2:
                            cutoff_overlap[cut][1] += 1

    for cut in cuts:
        row = cutoff_overlap[cut] + ["%s:%s" % (chrom, strand), cut]
        print("\t".join([str(x) for x in row]))
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write a 'context' wig labelling each position as UTR, exon or intron.

    For every transcript on the given chrom/strand, positions are labelled
    5UTR, EXON, INTRON or 3UTR, prefixed with 'C_' when column 13 contains
    '_coding' and 'NC_' otherwise.  A position covered by several regions
    gets the comma-joined set of distinct labels.

    tranFN  -- tab-delimited transcript file
    wigDir  -- handed to writeWigDictToWig unchanged
    species -- handed to writeWigDictToWig; defaults to 'hg19'
    '''
    p = bioLibCG.cgPrint()
    p.show = False  # debug output off
    debugSpot = 23631989  # coordinate singled out in the debug messages
    coord_id = {}

    # with-block: the handle is released even when a malformed row raises
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end columns minus one, i.e. inclusive ends
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = zip(exonStarts, exonEnds)
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            # introns are the gaps between consecutive exons
            intronPairs = [(left[1] + 1, right[0] - 1)
                           for left, right in zip(exonPairs, exonPairs[1:])]

            # take care of messy UTRs: a side is empty when the coding
            # boundary sits on the transcript start or one past its end
            if cStart == tStart or cStart == tEnd + 1:
                leftRange = ()
            else:
                leftRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightRange = ()
            else:
                rightRange = (cEnd + 1, tEnd)

            # strand '1': 5' UTR is the left flank; otherwise the right one
            if strand == '1':
                range5, range3 = leftRange, rightRange
            else:
                range5, range3 = rightRange, leftRange

            if range5 == ():
                p.tell('5 is none')
            if range3 == ():
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            p.tell('utr', utr5, utr3)
            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            # (regions, label stem, debug message per region, debug message
            # at debugSpot); filled in this order
            regions = [
                (utr5, '5UTR', 'filling utr5', '*** 5UTR'),
                (exonPairs, 'EXON', 'filling exons', '*** exon'),
                (intronPairs, 'INTRON', 'filling introns', '*** INTRON'),
                (utr3, '3UTR', 'filling utr3', ' *** 3UTR'),
            ]
            prefix = 'C_' if codingStatus else 'NC_'
            for pairs, stem, fillMsg, spotMsg in regions:
                label = prefix + stem + ' '
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot:
                            p.tell(spotMsg, codingStatus, tID)
                        coord_id[i] = coord_id.get(i, '') + label

    # uniqify, stringify
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir, 'INTER')
def makeContextDB(tranFN, chrom, strand, outFN):
    '''Write one "id <TAB> TYPE <TAB> tcc" line per context region of one chrom/strand.

    NOTE: this file defines makeContextDB a second time further down; at
    import time the later definition replaces this one.

    tranFN is a tab-separated transcript file.  The columns read here are:
    1 chromosome, 2 strand (passed through bioLibCG.switchStrandFormat),
    3/4 transcript start/end, 5/6 coding start/end, 8/9 comma-terminated
    exon starts/ends and 13 the transcript type (a type containing '_coding'
    marks a coding transcript).

    Only lines whose chromosome and converted strand equal chrom/strand are
    used.  For each of them the 5' UTR pieces, exons (without UTR), introns
    and 3' UTR pieces are written to outFN in that order.  TYPE is C_/NC_ +
    5UTR, EXON, INTRON or 3UTR, the id is a running counter starting at 0 and
    the tcc comes from bioLibCG.makeTcc with both coordinates shifted by +1.
    '''
    nextID = 0

    # 'with' closes both files even when a line fails to parse
    with open(tranFN, 'r') as f, open(outFN, 'w') as fOut:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end columns are decremented so that every range is inclusive
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]

            # an intron is the gap between two consecutive exons
            intronPairs = [(left[1] + 1, right[0] - 1)
                           for left, right in zip(exonPairs, exonPairs[1:])]

            # take care of messy UTRs: a coding boundary that sits on either
            # end of the transcript means there is no UTR on that side
            if cStart == tStart or cStart == tEnd + 1:
                lowRange = ()
            else:
                lowRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highRange = ()
            else:
                highRange = (cEnd + 1, tEnd)

            # the low-coordinate side is the 5' UTR only on strand '1'
            if strand == '1':
                range5, range3 = lowRange, highRange
            else:
                range5, range3 = highRange, lowRange

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            prefix = 'C_' if codingStatus else 'NC_'
            regions = (
                ('5UTR', utr5),
                ('EXON', exonPairs),
                ('INTRON', intronPairs),
                ('3UTR', utr3),
            )
            for suffix, pairs in regions:
                label = prefix + suffix
                for pair in pairs:
                    myTcc = bioLibCG.makeTcc(chrom, strand, pair[0] + 1, pair[1] + 1)
                    fOut.write('\t'.join([str(nextID), label, myTcc]) + '\n')
                    nextID += 1
def getSplicingUnitLengths(tranFN, wigDir, chrom, strand):
    '''Print how many distinct coordinates are covered by coding exons (and introns).

    Every line of tranFN is used: there is no chromosome/strand filter here,
    so wigDir, chrom and strand are accepted only to keep the signature.

    tranFN is a tab-separated transcript file.  The columns read here are:
    1 chromosome, 2 strand (passed through bioLibCG.switchStrandFormat),
    3/4 transcript start/end, 5/6 coding start/end, 8/9 comma-terminated
    exon starts/ends and 13 the transcript type (a type containing '_coding'
    marks a coding transcript).

    For coding transcripts the exons, with both UTR ranges subtracted, are
    collected per chromosome and strand as a set of coordinates, so
    overlapping exons are counted once.  Intron coordinates are not
    collected, therefore the intron total that is printed is always 0.
    '''
    exonChr_strand_coord = {}
    intronChr_strand_coord = {}  # stays empty: intron collection is switched off

    # 'with' guarantees the transcript file is closed, also on a parse error
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])

            # end columns are decremented so that every range is inclusive
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]

            # only coding transcripts contribute to the totals
            if '_coding' not in ls[13]:
                continue
            exonPairs = list(zip(exonStarts, exonEnds))

            # take care of messy UTRs: a coding boundary that sits on either
            # end of the transcript means there is no UTR on that side
            if cStart == tStart or cStart == tEnd + 1:
                lowRange = ()
            else:
                lowRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highRange = ()
            else:
                highRange = (cEnd + 1, tEnd)

            # both UTR ranges are removed, so which one is 5' and which one
            # is 3' does not matter here
            for utrRange in (lowRange, highRange):
                exonPairs = compareData.subtractTwoRanges(exonPairs, [utrRange])

            covered = exonChr_strand_coord.setdefault(tChrom, {}).setdefault(tStrand, set())
            for pair in exonPairs:
                covered.update(xrange(pair[0], pair[1] + 1))

    iLength = sum(len(coords)
                  for byStrand in intronChr_strand_coord.values()
                  for coords in byStrand.values())
    eLength = sum(len(coords)
                  for byStrand in exonChr_strand_coord.values()
                  for coords in byStrand.values())

    print('total Exon Length (all exons overlapped) %s' % eLength)
    print('total intron Length (all introns overlapped) %s' % iLength)
def checkMessy(tranFN):
    '''Count the transcripts of tranFN that have no 5' and/or no 3' UTR.

    tranFN is a tab-separated transcript file.  The columns read here are:
    2 strand (passed through bioLibCG.switchStrandFormat), 3/4 transcript
    start/end, 5/6 coding start/end and 15, which marks a transcript as
    coding when it contains '_coding'.

    A UTR counts as missing when the coding boundary on that side sits on
    either end of the transcript.  One line with five space-separated numbers
    is printed: all transcripts, transcripts without 5' UTR, transcripts
    without 3' UTR, coding transcripts without 5' UTR and coding transcripts
    without 3' UTR.
    '''
    p = bioLibCG.cgPrint()

    total = 0
    no5 = 0
    no3 = 0
    codingNo5 = 0
    codingNo3 = 0

    # 'with' guarantees the transcript file is closed, also on a parse error
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand = bioLibCG.switchStrandFormat(ls[2])

            # end columns are decremented so that every range is inclusive
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # NOTE(review): the other context functions of this file test
            # column 13 -- confirm that column 15 is intended here
            codingStatus = '_coding' in ls[15]

            #debug
            p.show = False

            lowMissing = cStart == tStart or cStart == tEnd + 1
            highMissing = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1

            # the low-coordinate side is the 5' UTR only on strand '1'
            if strand == '1':
                missing5, missing3 = lowMissing, highMissing
            else:
                missing5, missing3 = highMissing, lowMissing

            if missing5:
                p.tell('5 is none')
                no5 += 1
                if codingStatus:
                    codingNo5 += 1
            if missing3:
                p.tell('3 is none')
                no3 += 1
                if codingStatus:
                    codingNo3 += 1

            total += 1

    print('%s %s %s %s %s' % (total, no5, no3, codingNo5, codingNo3))
def makeContextDB(tranFN, chrom, strand, outFN):
    '''outputs cID TYPE TCC, used for making wigs

    NOTE: an earlier definition of makeContextDB exists in this file; being
    the later one, this definition is the one that is in effect.

    tranFN is a tab-separated transcript file.  The columns read here are:
    1 chromosome, 2 strand (passed through bioLibCG.switchStrandFormat),
    3/4 transcript start/end, 5/6 coding start/end, 8/9 comma-terminated
    exon starts/ends and 13 the transcript type (a type containing '_coding'
    marks a coding transcript).

    Only lines whose chromosome and converted strand equal chrom/strand are
    used.  For each of them the 5' UTR pieces, exons (without UTR), introns
    and 3' UTR pieces are written to outFN in that order, one tab-separated
    line each.  TYPE is C_/NC_ + 5UTR, EXON, INTRON or 3UTR, cID is a running
    counter starting at 0 and TCC comes from bioLibCG.makeTcc with both
    coordinates shifted by +1.
    '''
    nextID = 0

    # 'with' closes both files even when a line fails to parse
    with open(tranFN, 'r') as f, open(outFN, 'w') as fOut:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # end columns are decremented so that every range is inclusive
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]

            # an intron is the gap between two consecutive exons
            intronPairs = [(left[1] + 1, right[0] - 1)
                           for left, right in zip(exonPairs, exonPairs[1:])]

            # take care of messy UTRs: a coding boundary that sits on either
            # end of the transcript means there is no UTR on that side
            if cStart == tStart or cStart == tEnd + 1:
                lowRange = ()
            else:
                lowRange = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highRange = ()
            else:
                highRange = (cEnd + 1, tEnd)

            # the low-coordinate side is the 5' UTR only on strand '1'
            if strand == '1':
                range5, range3 = lowRange, highRange
            else:
                range5, range3 = highRange, lowRange

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            prefix = 'C_' if codingStatus else 'NC_'
            regions = (
                ('5UTR', utr5),
                ('EXON', exonPairs),
                ('INTRON', intronPairs),
                ('3UTR', utr3),
            )
            for suffix, pairs in regions:
                label = prefix + suffix
                for pair in pairs:
                    myTcc = bioLibCG.makeTcc(chrom, strand, pair[0] + 1, pair[1] + 1)
                    fOut.write('\t'.join([str(nextID), label, myTcc]) + '\n')
                    nextID += 1