def makeTranscriptome(tranFN, outFN):
    """Write the concatenated exon sequence of every transcript to outFN.

    tranFN is read as tab-separated text, one transcript per line. The
    columns used are: 0 transcript ID, 1 chromosome, 2 strand, 8 exon
    starts, 9 exon ends, 10 gene ID. The exon lists are comma-separated and
    their final character is dropped before splitting (presumably a
    trailing comma -- confirm against the input format).

    For each transcript the exon sequences are fetched from the 'hg19'
    genome, joined in file order, and written as a '> tID:gID:length'
    header line followed by the sequence and one blank line.
    """
    # Debug printer, silenced; nothing else in this function calls it.
    p = bioLibCG.cgPrint()
    p.show = False

    gf = GenomeFetch.GenomeFetch('hg19')

    # Context managers close both files even when a malformed line or a
    # failed sequence fetch raises part-way through.
    with open(outFN, 'w') as fOut:
        with open(tranFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])

                # Starts are shifted by +1 while ends are used as given
                # (presumably 0-based half-open -> 1-based inclusive; TODO
                # confirm against what makeTcc/getSequence expect).
                exonStarts = [int(x) + 1 for x in ls[8][:-1].split(',')]
                exonEnds = [int(x) for x in ls[9][:-1].split(',')]
                tID = ls[0]
                gID = ls[10]

                seqList = [
                    gf.getSequence(
                        bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd))
                    for eStart, eEnd in zip(exonStarts, exonEnds)
                ]
                mRNA = ''.join(seqList)

                # Reverse direction if negative strand. This only reverses
                # the string; it does not complement it.
                # NOTE(review): whether getSequence already returns
                # strand-aware sequence is not visible here -- confirm the
                # result is the intended orientation.
                if tStrand == '-1':
                    mRNA = mRNA[::-1]

                fOut.write('> %s:%s:%s\n' % (tID, gID, len(mRNA)))
                fOut.write(mRNA + '\n\n')
def checkMessy(tranFN):
    """Count transcripts in tranFN that lack a 5' and/or 3' UTR.

    tranFN is read as tab-separated text, one transcript per line. The
    columns used are: 2 strand, 3/4 transcript start/end, 5/6 coding
    start/end, and 15, which marks the transcript as coding when it
    contains the substring '_coding'. End columns are decremented by one
    on read.

    Prints five space-separated counters on one line:
        total transcripts,
        transcripts without a 5' UTR,
        transcripts without a 3' UTR,
        coding transcripts without a 5' UTR,
        coding transcripts without a 3' UTR.

    NOTE: a second definition of checkMessy appears later in this file and
    replaces this one at import time.
    NOTE(review): makeContextWig in this file reads the coding flag from
    column 13 rather than 15 -- confirm which column matches the input.
    """
    p = bioLibCG.cgPrint()
    p.show = False  # debug output silenced

    total = 0
    no5 = 0
    no3 = 0
    codingNo5 = 0
    codingNo3 = 0

    # The context manager guarantees the input file is closed, including
    # when a malformed line raises.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand = bioLibCG.switchStrandFormat(ls[2])
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            codingStatus = '_coding' in ls[15]

            # A flank is empty when the coding boundary sits on either
            # transcript boundary.
            lowFlankEmpty = cStart == tStart or cStart == tEnd + 1
            highFlankEmpty = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1

            # On strand '1' the low-coordinate flank is the 5' UTR; for any
            # other strand value the two flanks swap roles.
            if strand == '1':
                missing5, missing3 = lowFlankEmpty, highFlankEmpty
            else:
                missing5, missing3 = highFlankEmpty, lowFlankEmpty

            if missing5:
                p.tell('5 is none')
                no5 += 1
                if codingStatus:
                    codingNo5 += 1

            if missing3:
                p.tell('3 is none')
                no3 += 1
                if codingStatus:
                    codingNo3 += 1

            total += 1

    # Single pre-formatted argument: prints the same text whether print is
    # a statement or a function.
    print('%s %s %s %s %s' % (total, no5, no3, codingNo5, codingNo3))
def checkMessy(tranFN):
    """Count transcripts in tranFN that lack a 5' and/or 3' UTR.

    tranFN is read as tab-separated text, one transcript per line. The
    columns used are: 2 strand, 3/4 transcript start/end, 5/6 coding
    start/end, and 15, which marks the transcript as coding when it
    contains the substring '_coding'. End columns are decremented by one
    on read.

    Prints five space-separated counters on one line:
        total transcripts,
        transcripts without a 5' UTR,
        transcripts without a 3' UTR,
        coding transcripts without a 5' UTR,
        coding transcripts without a 3' UTR.

    NOTE: an earlier definition of checkMessy precedes this one in the
    file; this later definition is the one bound at import time.
    NOTE(review): makeContextWig in this file reads the coding flag from
    column 13 rather than 15 -- confirm which column matches the input.
    """
    p = bioLibCG.cgPrint()
    p.show = False  # debug output silenced

    total = 0
    no5 = 0
    no3 = 0
    codingNo5 = 0
    codingNo3 = 0

    # The context manager guarantees the input file is closed, including
    # when a malformed line raises.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand = bioLibCG.switchStrandFormat(ls[2])
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            codingStatus = '_coding' in ls[15]

            # A flank is empty when the coding boundary sits on either
            # transcript boundary.
            lowFlankEmpty = cStart == tStart or cStart == tEnd + 1
            highFlankEmpty = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1

            # On strand '1' the low-coordinate flank is the 5' UTR; for any
            # other strand value the two flanks swap roles.
            if strand == '1':
                missing5, missing3 = lowFlankEmpty, highFlankEmpty
            else:
                missing5, missing3 = highFlankEmpty, lowFlankEmpty

            if missing5:
                p.tell('5 is none')
                no5 += 1
                if codingStatus:
                    codingNo5 += 1

            if missing3:
                p.tell('3 is none')
                no3 += 1
                if codingStatus:
                    codingNo3 += 1

            total += 1

    # Single pre-formatted argument: prints the same text whether print is
    # a statement or a function.
    print('%s %s %s %s %s' % (total, no5, no3, codingNo5, codingNo3))
def makeContextWig(tranFN, wigDir, chrom, strand, species = 'hg19'):
    """Label every transcribed coordinate on one chromosome/strand and
    write the labels out through writeWigDictToWig.

    tranFN is read as tab-separated text, one transcript per line. The
    columns used are: 0 transcript ID, 1 chromosome, 2 strand, 3/4
    transcript start/end, 5/6 coding start/end, 8/9 exon starts/ends
    (comma-separated, final character dropped before splitting), and 13,
    which marks the transcript as coding when it contains '_coding'. End
    columns are decremented by one on read. Lines whose chromosome or
    converted strand differ from `chrom` / `strand` are skipped.

    Each covered coordinate collects the tags 5UTR, EXON, INTRON and 3UTR,
    prefixed 'C_' for coding and 'NC_' for non-coding transcripts. After
    all transcripts are processed, the tags of each coordinate are
    de-duplicated and comma-joined before writing.

    NOTE: a second definition of makeContextWig appears later in this file
    and replaces this one at import time.
    """
    p = bioLibCG.cgPrint()
    p.show = False  # debug output silenced
    debugSpot = 23631989  # coordinate singled out in the debug messages
    coord_id = {}

    # The context manager guarantees the input file is closed, including
    # when a malformed line raises.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            # An intron is the gap between each pair of consecutive exons.
            intronPairs = [
                (prevEnd + 1, nextStart - 1)
                for (_, prevEnd), (nextStart, _)
                in zip(exonPairs, exonPairs[1:])
            ]

            # Take care of messy UTRs: a flank is empty when the coding
            # boundary sits on either transcript boundary.
            lowFlankEmpty = cStart == tStart or cStart == tEnd + 1
            highFlankEmpty = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
            lowRange = () if lowFlankEmpty else (tStart, cStart - 1)
            highRange = () if highFlankEmpty else (cEnd + 1, tEnd)

            # On strand '1' the low-coordinate flank is the 5' UTR; for any
            # other strand value the two flanks swap roles.
            if strand == '1':
                range5, range3 = lowRange, highRange
            else:
                range5, range3 = highRange, lowRange

            if not range5:
                p.tell('5 is none')
            if not range3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)

            # NOTE(review): an absent UTR is passed on as [()]; this relies
            # on subtractTwoRanges tolerating an empty tuple -- confirm.
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            p.tell('utr', utr5, utr3)

            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            # Tag every coordinate of each region, in this fixed order.
            prefix = 'C_' if codingStatus else 'NC_'
            regions = (
                ('filling utr5', '*** 5UTR', '5UTR ', utr5),
                ('filling exons', '*** exon', 'EXON ', exonPairs),
                ('filling introns', '*** INTRON', 'INTRON ', intronPairs),
                ('filling utr3', ' *** 3UTR', '3UTR ', utr3),
            )
            for fillMsg, debugMsg, suffix, pairs in regions:
                tag = prefix + suffix
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot:
                            p.tell(debugMsg, codingStatus, tID)
                        coord_id[i] = coord_id.get(i, '') + tag

    # Uniqify, stringify. Only existing keys are reassigned, so the dict
    # can be updated while it is being iterated.
    for i in coord_id:
        coord_id[i] = ','.join(set(coord_id[i].strip().split(' ')))

    # Write wig to file.
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir, 'INTER')
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    """Label every transcribed coordinate on one chromosome/strand and
    write the labels out through writeWigDictToWig.

    tranFN is read as tab-separated text, one transcript per line. The
    columns used are: 0 transcript ID, 1 chromosome, 2 strand, 3/4
    transcript start/end, 5/6 coding start/end, 8/9 exon starts/ends
    (comma-separated, final character dropped before splitting), and 13,
    which marks the transcript as coding when it contains '_coding'. End
    columns are decremented by one on read. Lines whose chromosome or
    converted strand differ from `chrom` / `strand` are skipped.

    Each covered coordinate collects the tags 5UTR, EXON, INTRON and 3UTR,
    prefixed 'C_' for coding and 'NC_' for non-coding transcripts. After
    all transcripts are processed, the tags of each coordinate are
    de-duplicated and comma-joined before writing.

    NOTE: an earlier definition of makeContextWig precedes this one in the
    file; this later definition is the one bound at import time.
    """
    p = bioLibCG.cgPrint()
    p.show = False  # debug output silenced
    debugSpot = 23631989  # coordinate singled out in the debug messages
    coord_id = {}

    # The context manager guarantees the input file is closed, including
    # when a malformed line raises.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            # An intron is the gap between each pair of consecutive exons.
            intronPairs = [
                (prevEnd + 1, nextStart - 1)
                for (_, prevEnd), (nextStart, _)
                in zip(exonPairs, exonPairs[1:])
            ]

            # Take care of messy UTRs: a flank is empty when the coding
            # boundary sits on either transcript boundary.
            lowFlankEmpty = cStart == tStart or cStart == tEnd + 1
            highFlankEmpty = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
            lowRange = () if lowFlankEmpty else (tStart, cStart - 1)
            highRange = () if highFlankEmpty else (cEnd + 1, tEnd)

            # On strand '1' the low-coordinate flank is the 5' UTR; for any
            # other strand value the two flanks swap roles.
            if strand == '1':
                range5, range3 = lowRange, highRange
            else:
                range5, range3 = highRange, lowRange

            if not range5:
                p.tell('5 is none')
            if not range3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)

            # NOTE(review): an absent UTR is passed on as [()]; this relies
            # on subtractTwoRanges tolerating an empty tuple -- confirm.
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            p.tell('utr', utr5, utr3)

            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            # Tag every coordinate of each region, in this fixed order.
            prefix = 'C_' if codingStatus else 'NC_'
            regions = (
                ('filling utr5', '*** 5UTR', '5UTR ', utr5),
                ('filling exons', '*** exon', 'EXON ', exonPairs),
                ('filling introns', '*** INTRON', 'INTRON ', intronPairs),
                ('filling utr3', ' *** 3UTR', '3UTR ', utr3),
            )
            for fillMsg, debugMsg, suffix, pairs in regions:
                tag = prefix + suffix
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot:
                            p.tell(debugMsg, codingStatus, tID)
                        coord_id[i] = coord_id.get(i, '') + tag

    # Uniqify, stringify. Only existing keys are reassigned, so the dict
    # can be updated while it is being iterated.
    for i in coord_id:
        coord_id[i] = ','.join(set(coord_id[i].strip().split(' ')))

    # Write wig to file.
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir, 'INTER')