for achrom in regionDict: print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) for region in regionDict[achrom]: border = 0 if usePeaks: (rstart, rstop, rlen, peakPos, peakHeight) = region border = 200 else: (rstart, rstop, rlen) = region if rlen > maxsize: print "%s:%d-%d length %d > %d max region size - skipping" % ( achrom, rstart, rstop, rlen, maxsize) continue try: seq = hg.sequence(achrom, rstart - border, rlen + 2 * border) except: print "problem with %s" % str((rstart, rstop, rlen)) continue if usePeaks: topPos = peakPos - rstart if peakHeight > minHitThresh: ncregions[achrom].append( (rstart, rstop, rlen, [topPos], peakHeight)) index += 1 elif doDataset: thechrom = 'chr' + achrom print '.' hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True,
for (start, stop, length) in regions[chrom]: regionList.append((chrom, start, length)) if usePeak: regionList.sort() regionList.reverse() notFoundIndex = 0 currentChrom = '' count = 0 for tuple in regionList: if usePeak: (rpeakheight, rchrom, start, length, rpeakpos) = tuple else: (rchrom, start, length) = tuple try: seq = hg.sequence(rchrom, start, length) except: print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length) continue count += 1 numHits = -1 if usePeak: peakpos = rpeakpos if useRank: numHits = count else: numHits = rpeakheight elif doRDS: if rchrom != currentChrom: fullchrom = 'chr' + rchrom hitDict = hitRDS.getReadsDict(chrom=fullchrom)
def _fetch_by_base(hg, chrom, start, length, reverse=False):
    """Fetch a region one base at a time, writing 'N' where a base is missing.

    Fallback for when a single ``hg.sequence`` call for a whole exon fails.

    Args:
        hg: genome object exposing ``sequence(chrom, start, length)``.
        chrom: chromosome name in the form ``hg`` expects.
        start: first position to fetch, in ``hg.sequence`` coordinates.
        length: number of consecutive positions to fetch.
        reverse: when true, walk the positions from last to first and
            reverse-complement each retrieved base, so the result is the
            reverse complement of the region.

    Returns:
        ``(sequence, missed)`` where ``missed`` is the number of positions
        that could not be retrieved and were replaced by 'N'.
    """
    positions = range(start, start + length)
    if reverse:
        positions = reversed(positions)
    bases = []
    missed = 0
    for pos in positions:
        try:
            base = hg.sequence(chrom, pos, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    return ''.join(bases), missed


def _read_transcript_exons(gtf):
    """Group the exon records of a GTF file by transcript.

    Comment lines, non-exon records and lines with fewer than nine
    tab-separated columns are ignored.  A transcript is keyed by its
    ``transcript_name`` attribute when present, else by ``transcript_id``.

    Args:
        gtf: path of the GTF file.

    Returns:
        dict mapping transcript key to a list of
        ``(chromosome, left, right, strand)`` tuples.
    """
    transcripts = {}
    with open(gtf) as lines:
        for count, line in enumerate(lines, 1):
            if count % 100000 == 0:
                print('%d lines processed' % count)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or malformed lines have too few columns to be exon records.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if 'transcript_name "' in attributes:
                key = 'transcript_name "'
            else:
                key = 'transcript_id "'
            transcript_id = attributes.split(key)[1].split('";')[0]
            transcripts.setdefault(transcript_id, []).append(
                (fields[0], int(fields[3]), int(fields[4]), fields[6]))
    return transcripts


def main(argv):
    """Write the spliced sequence of every transcript in a GTF file as FASTA.

    Command line: ``genome gtf outfilename [-polyA length]``.

    Exons are concatenated in transcript order: ascending for '+'/'F',
    descending and reverse-complemented for '-'/'R'.  Each record header is
    ``>transcript:chromosome:leftmost-rightmost-sense``.  With ``-polyA n``
    a tail of ``n`` 'A' characters is appended to every sequence.

    GTF coordinates are 1-based and inclusive, so every exon is fetched as
    ``hg.sequence(chrom, left - 1, right - left + 1)`` on both strands.
    NOTE(review): this follows the original minus-strand code and presumes
    ``Genome.sequence`` takes a 0-based start -- confirm against cistematic.

    Transcripts whose strand is none of '+', 'F', '-', 'R' are reported and
    skipped.  Bases that cannot be retrieved are written as 'N' and counted.

    Args:
        argv: full argument vector, program name first.

    Raises:
        SystemExit: with status 1 when required arguments are missing.
        ValueError: when the ``-polyA`` length is not an integer.
    """
    usage = 'usage: python %s genome gtf outfilename [-polyA length]' % argv[0]
    if len(argv) < 4:
        print(usage)
        sys.exit(1)

    genome, gtf, outputfilename = argv[1:4]

    tail = ''
    if '-polyA' in argv:
        try:
            tailsize = int(argv[argv.index('-polyA') + 1])
        except IndexError:
            # '-polyA' was the last argument, so there is no length to read.
            print(usage)
            sys.exit(1)
        tail = 'A' * tailsize
        print('will add a polyA tail of  %d nt' % tailsize)

    hg = Genome(genome)
    transcripts = _read_transcript_exons(gtf)
    print('Found %d transcripts' % len(transcripts))

    missed = 0
    with open(outputfilename, 'w') as outfile:
        for done, transcript in enumerate(transcripts, 1):
            if done % 1000 == 0:
                print('%d transcripts sequences processed' % done)

            exons = sorted(transcripts[transcript])
            orientation = exons[0][3]
            if orientation in ('+', 'F'):
                sense = 'plus_strand'
                minus = False
            elif orientation in ('-', 'R'):
                sense = 'minus_strand'
                minus = True
                # Minus-strand transcripts read from the rightmost exon.
                exons.reverse()
            else:
                print('skipping %s: unsupported strand %r'
                      % (transcript, orientation))
                continue

            pieces = []
            for (chrom, left, right, _strand) in exons:
                name = chrom[3:]  # drop the leading 'chr'
                start = left - 1
                length = right - left + 1
                try:
                    piece = hg.sequence(name, start, length)
                    if minus:
                        piece = getReverseComplement(piece)
                except Exception:
                    print("can't retrieve sequence")
                    piece, lost = _fetch_by_base(hg, name, start, length,
                                                 reverse=minus)
                    missed += lost
                pieces.append(piece)

            left_end = min(e[1] for e in exons)
            right_end = max(e[2] for e in exons)
            # 'chrom' is the chromosome of the last exon processed.
            outfile.write('>%s:%s:%d-%d-%s\n'
                          % (transcript, chrom, left_end, right_end, sense))
            outfile.write(''.join(pieces) + tail + '\n')

    if missed:
        print('%d bases could not be retrieved and were written as N' % missed)
def _fetch_by_base(hg, chrom, start, length, reverse=False):
    """Fetch a region one base at a time, writing 'N' where a base is missing.

    Fallback for when a single ``hg.sequence`` call for a whole exon fails.

    Args:
        hg: genome object exposing ``sequence(chrom, start, length)``.
        chrom: chromosome name in the form ``hg`` expects.
        start: first position to fetch, in ``hg.sequence`` coordinates.
        length: number of consecutive positions to fetch.
        reverse: when true, walk the positions from last to first and
            reverse-complement each retrieved base, so the result is the
            reverse complement of the region.

    Returns:
        ``(sequence, missed)`` where ``missed`` is the number of positions
        that could not be retrieved and were replaced by 'N'.
    """
    positions = range(start, start + length)
    if reverse:
        positions = reversed(positions)
    bases = []
    missed = 0
    for pos in positions:
        try:
            base = hg.sequence(chrom, pos, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    return ''.join(bases), missed


def _read_transcript_exons(gtf):
    """Group the exon records of a GTF file by transcript.

    Comment lines, non-exon records and lines with fewer than nine
    tab-separated columns are ignored.  A transcript is keyed by its
    ``transcript_name`` attribute when present, else by ``transcript_id``.

    Args:
        gtf: path of the GTF file.

    Returns:
        dict mapping transcript key to a list of
        ``(chromosome, left, right, strand)`` tuples.
    """
    transcripts = {}
    with open(gtf) as lines:
        for count, line in enumerate(lines, 1):
            if count % 100000 == 0:
                print('%d lines processed' % count)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or malformed lines have too few columns to be exon records.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if 'transcript_name "' in attributes:
                key = 'transcript_name "'
            else:
                key = 'transcript_id "'
            transcript_id = attributes.split(key)[1].split('";')[0]
            transcripts.setdefault(transcript_id, []).append(
                (fields[0], int(fields[3]), int(fields[4]), fields[6]))
    return transcripts


def main(argv):
    """Write the spliced sequence of every transcript in a GTF file as FASTA.

    Command line: ``genome gtf outfilename [-polyA length]``.

    Exons are concatenated in transcript order: ascending for '+'/'F',
    descending and reverse-complemented for '-'/'R'.  Each record header is
    ``>transcript:chromosome:leftmost-rightmost-sense``.  With ``-polyA n``
    a tail of ``n`` 'A' characters is appended to every sequence.

    GTF coordinates are 1-based and inclusive, so every exon is fetched as
    ``hg.sequence(chrom, left - 1, right - left + 1)`` on both strands.
    NOTE(review): this follows the original minus-strand code and presumes
    ``Genome.sequence`` takes a 0-based start -- confirm against cistematic.

    Transcripts whose strand is none of '+', 'F', '-', 'R' are reported and
    skipped.  Bases that cannot be retrieved are written as 'N' and counted.

    Args:
        argv: full argument vector, program name first.

    Raises:
        SystemExit: with status 1 when required arguments are missing.
        ValueError: when the ``-polyA`` length is not an integer.
    """
    usage = 'usage: python %s genome gtf outfilename [-polyA length]' % argv[0]
    if len(argv) < 4:
        print(usage)
        sys.exit(1)

    genome, gtf, outputfilename = argv[1:4]

    tail = ''
    if '-polyA' in argv:
        try:
            tailsize = int(argv[argv.index('-polyA') + 1])
        except IndexError:
            # '-polyA' was the last argument, so there is no length to read.
            print(usage)
            sys.exit(1)
        tail = 'A' * tailsize
        print('will add a polyA tail of  %d nt' % tailsize)

    hg = Genome(genome)
    transcripts = _read_transcript_exons(gtf)
    print('Found %d transcripts' % len(transcripts))

    missed = 0
    with open(outputfilename, 'w') as outfile:
        for done, transcript in enumerate(transcripts, 1):
            if done % 1000 == 0:
                print('%d transcripts sequences processed' % done)

            exons = sorted(transcripts[transcript])
            orientation = exons[0][3]
            if orientation in ('+', 'F'):
                sense = 'plus_strand'
                minus = False
            elif orientation in ('-', 'R'):
                sense = 'minus_strand'
                minus = True
                # Minus-strand transcripts read from the rightmost exon.
                exons.reverse()
            else:
                print('skipping %s: unsupported strand %r'
                      % (transcript, orientation))
                continue

            pieces = []
            for (chrom, left, right, _strand) in exons:
                name = chrom[3:]  # drop the leading 'chr'
                start = left - 1
                length = right - left + 1
                try:
                    piece = hg.sequence(name, start, length)
                    if minus:
                        piece = getReverseComplement(piece)
                except Exception:
                    print("can't retrieve sequence")
                    piece, lost = _fetch_by_base(hg, name, start, length,
                                                 reverse=minus)
                    missed += lost
                pieces.append(piece)

            left_end = min(e[1] for e in exons)
            right_end = max(e[2] for e in exons)
            # 'chrom' is the chromosome of the last exon processed.
            outfile.write('>%s:%s:%d-%d-%s\n'
                          % (transcript, chrom, left_end, right_end, sense))
            outfile.write(''.join(pieces) + tail + '\n')

    if missed:
        print('%d bases could not be retrieved and were written as N' % missed)
regionstart = exonStops[index] - maxBorder alreadySeen[chrom].append((exonStops[index], exonStarts[index + 1])) beforeLen = exonLengths[index] afterLen = exonLengths[index + 1] if (beforeLen + afterLen) < maxBorder + spacer: #print 'splice chr%s:%d-%d too short: %d' % (chrom, exonStops[index], exonStarts[index + 1], beforeLen + afterLen) missedCount += 1 continue if (beforeLen + afterLen) < 2 * maxBorder: depressedCount += 1 if beforeLen > maxBorder: beforeLen = maxBorder if afterLen > maxBorder: afterLen = maxBorder try: beforeSplice = hg.sequence(chrom, exonStops[index] - maxBorder, maxBorder) afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder) #beforeSplice = hg.sequence(chrom, exonStops[index] - beforeLen, beforeLen) #afterSplice = hg.sequence(chrom, exonStarts[index + 1], afterLen) except: if doVerbose: print "could not get chr%s:%d-%d" % (chrom, exonStops[index], exonStarts[index + 1]) continue outstring = '>%s%s%d%s%d\n%s\n' % ( name, delimiter, index, delimiter, regionstart, spacerseq + beforeSplice.upper() + afterSplice.upper() + spacerseq) outfile.write(outstring) splicefileindex += 1 spliceCounter += 1 if spliceCounter > 10000:
countList = [] posList = [] index = 0 regionList = [] for rchrom in regions: if 'rand' in rchrom or 'M' in rchrom or 'hap' in rchrom: continue for (start, stop, length) in regions[rchrom]: regionList.append((rchrom, start, length)) notFoundIndex = 0 currentChrom = '' for (rchrom, start, length) in regionList: seq = hg.sequence(rchrom, start, length) if doDataset: if rchrom != currentChrom: fullchrom = 'chr' + rchrom hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True) currentChrom = rchrom (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True) if len(topPos) == 0: print 'topPos error' peakpos = topPos[0] peakscore = smoothArray[peakpos] if peakscore == 0.: peakscore = -1. if normalize: numHits /= normalizeBy peakscore /= normalizeBy
import sys

from cistematic.genomes import Genome

# Dump every merlen-long subsequence of one genomic region, upper-cased,
# one per line.
# Command line: genome merlen chrAny:start-stop outfile
print('%s: version 1.1' % sys.argv[0])
if len(sys.argv) < 5:
    print('usage: python %s genome merlen chrAny:start-stop outfile'
          % sys.argv[0])
    sys.exit(1)

genome = sys.argv[1]
merlen = int(sys.argv[2])
location = sys.argv[3]
outfilename = sys.argv[4]

# Location looks like "chr5:1000-2000"; the genome object wants the
# chromosome name without its first three characters ("chr").
(chrom, pos) = location.split(':')
chrom = chrom[3:]
(start, stop) = pos.split('-')
start = int(start)
# start and stop are both treated as part of the region.
regionlength = int(stop) - start + 1

hg = Genome(genome)
seq = hg.sequence(chrom, start, regionlength)

# A region of L bases holds L - k + 1 windows of length k; the last window
# starts at L - k.  Clamp at zero for regions shorter than one window.
mercount = max(regionlength - merlen + 1, 0)
print('writing %d %d-mers' % (mercount, merlen))
with open(outfilename, 'w') as outfile:
    for index in range(mercount):
        outfile.write(seq[index:index + merlen].upper() + '\n')
# RNA codon -> one-letter amino acid; stop codons are written as '.'.
_CODON_TABLE = {
    'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'UUA': 'L', 'UUG': 'L', 'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
    'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R',
    'AAA': 'K', 'AAG': 'K',
    'AAU': 'N', 'AAC': 'N',
    'AUG': 'M',
    'GAU': 'D', 'GAC': 'D',
    'UUU': 'F', 'UUC': 'F',
    'UGU': 'C', 'UGC': 'C',
    'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAA': 'Q', 'CAG': 'Q',
    'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'AGU': 'S', 'AGC': 'S',
    'GAA': 'E', 'GAG': 'E',
    'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    'UGG': 'W',
    'CAU': 'H', 'CAC': 'H',
    'UAU': 'Y', 'UAC': 'Y',
    'AUU': 'I', 'AUC': 'I', 'AUA': 'I',
    'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
    'UAA': '.', 'UGA': '.', 'UAG': '.',
}


def _translate_frame(rna, offset):
    """Translate ``rna`` codon by codon starting at ``offset``.

    Every complete codon from ``offset`` to the end of the sequence is
    translated.  Stop codons, codons containing 'N' and any other codon
    absent from the table are written as '.'.

    Args:
        rna: upper-case RNA string (U instead of T).
        offset: index of the first base of the first codon (0, 1 or 2).

    Returns:
        The translated protein string.
    """
    residues = []
    # The last complete codon starts at len(rna) - 3.
    for i in range(offset, len(rna) - 2, 3):
        residues.append(_CODON_TABLE.get(rna[i:i + 3], '.'))
    return ''.join(residues)


def _fetch_by_base(hg, chrom, start, length, reverse=False):
    """Fetch a region one base at a time, writing 'N' where a base is missing.

    Fallback for when a single ``hg.sequence`` call for a whole exon fails.

    Args:
        hg: genome object exposing ``sequence(chrom, start, length)``.
        chrom: chromosome name in the form ``hg`` expects.
        start: first position to fetch, in ``hg.sequence`` coordinates.
        length: number of consecutive positions to fetch.
        reverse: when true, walk the positions from last to first and
            reverse-complement each retrieved base, so the result is the
            reverse complement of the region.

    Returns:
        ``(sequence, missed)`` where ``missed`` is the number of positions
        that could not be retrieved and were replaced by 'N'.
    """
    positions = range(start, start + length)
    if reverse:
        positions = reversed(positions)
    bases = []
    missed = 0
    for pos in positions:
        try:
            base = hg.sequence(chrom, pos, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    return ''.join(bases), missed


def _read_exons_by_class_code(gtf, class_code=None):
    """Group the distinct exon records of a GTF file by transcript.

    Comment lines, non-exon records and lines with fewer than nine
    tab-separated columns are ignored.  A transcript is keyed by its
    ``transcript_name`` attribute when present, else by ``transcript_id``.

    Args:
        gtf: path of the GTF file.
        class_code: when not None, keep only records whose ``class_code``
            attribute equals this value; records without the attribute are
            dropped.

    Returns:
        dict mapping transcript key to a set of
        ``(chromosome, left, right, strand)`` tuples.
    """
    transcripts = {}
    with open(gtf) as lines:
        for count, line in enumerate(lines, 1):
            if count % 100000 == 0:
                print('%d lines processed' % count)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or malformed lines have too few columns to be exon records.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if class_code is not None:
                if 'class_code "' not in attributes:
                    continue
                found = attributes.split('class_code "')[1].split('";')[0]
                if found != class_code:
                    continue
            if 'transcript_name "' in attributes:
                key = 'transcript_name "'
            else:
                key = 'transcript_id "'
            transcript_id = attributes.split(key)[1].split('";')[0]
            transcripts.setdefault(transcript_id, set()).add(
                (fields[0], int(fields[3]), int(fields[4]), fields[6]))
    return transcripts


def main(argv):
    """Write reading-frame translations of every transcript in a GTF file.

    Command line: ``genome gtf outfilename [-spliced] [-class_code symbol]``.

    For '+' and '-' transcripts the spliced sequence is translated in its
    three forward frames (frame1-frame3).  For '.' transcripts the forward
    sequence gives frame1-frame3 and its reverse complement frame4-frame6.
    Each frame is written as a header
    ``>transcript:chromosome:left-right-sense::frameN`` followed by the
    protein, with '.' for stop codons and unresolvable codons.

    ``-spliced`` drops transcripts with a single distinct exon;
    ``-class_code x`` keeps only exon records whose class_code is ``x``.

    GTF coordinates are 1-based and inclusive, so every exon is fetched as
    ``hg.sequence(chrom, left - 1, right - left + 1)`` on all strands.
    NOTE(review): this follows the original minus-strand code and presumes
    ``Genome.sequence`` takes a 0-based start -- confirm against cistematic.

    Transcripts whose strand is none of '+', '-', '.' are reported and
    skipped.  Bases that cannot be retrieved are written as 'N' and counted.

    Args:
        argv: full argument vector, program name first.

    Raises:
        SystemExit: with status 1 when required arguments are missing.
    """
    def usage_and_exit():
        print('usage: python %s genome gtf outfilename [-spliced] [-class_code symbol]' % argv[0])
        print(' this script will output the translation of all three possible reading frames; stop codons will be converted to a .')
        sys.exit(1)

    # genome, gtf and outfilename are all required: argv[3] is read below.
    if len(argv) < 4:
        usage_and_exit()

    genome, gtf, outputfilename = argv[1:4]

    doSpliced = '-spliced' in argv
    if doSpliced:
        print('will only look at transciprs with more than one exon')

    class_code = None
    if '-class_code' in argv:
        try:
            class_code = argv[argv.index('-class_code') + 1]
        except IndexError:
            # '-class_code' was the last argument, so no symbol follows it.
            usage_and_exit()
        print('will only look at transciprs if class code %s' % class_code)

    senses = {'+': 'plus_strand', '-': 'minus_strand', '.': 'unknown_strand'}

    hg = Genome(genome)
    transcripts = _read_exons_by_class_code(gtf, class_code)
    print('Found %d transcripts' % len(transcripts))

    missed = 0
    with open(outputfilename, 'w') as outfile:
        for done, transcript in enumerate(transcripts, 1):
            if done % 1000 == 0:
                print('%d transcripts sequences processed' % done)

            exons = sorted(transcripts[transcript])
            if doSpliced and len(exons) == 1:
                continue

            orientation = exons[0][3]
            if orientation not in senses:
                print('skipping %s: unsupported strand %r'
                      % (transcript, orientation))
                continue
            sense = senses[orientation]
            minus = orientation == '-'
            if orientation == '+':
                failure = "can't retrieve sequence"
            else:
                failure = "can not retrieve sequence"

            # Minus-strand transcripts read from the rightmost exon.
            ordered = exons[::-1] if minus else exons
            pieces = []
            for (chrom, left, right, strand) in ordered:
                name = chrom[3:]  # drop the leading 'chr'
                start = left - 1
                length = right - left + 1
                try:
                    piece = hg.sequence(name, start, length)
                    if minus:
                        piece = getReverseComplement(piece)
                except Exception:
                    print('%s %s %d %d %s'
                          % (failure, chrom, left, right, strand))
                    piece, lost = _fetch_by_base(hg, name, start, length,
                                                 reverse=minus)
                    missed += lost
                pieces.append(piece)
            sequence = ''.join(pieces)

            # Header coordinates come from the first and last sorted exons;
            # 'chrom' is the chromosome of the last exon processed.
            prefix = '>%s:%s:%d-%d-%s::frame' % (
                transcript, chrom, exons[0][1], exons[-1][2], sense)

            strands = [sequence.upper().replace('T', 'U')]
            if orientation == '.':
                # Strand unknown: also translate the opposite strand.
                strands.append(
                    getReverseComplement(sequence).upper().replace('T', 'U'))

            frame = 0
            for rna in strands:
                for offset in range(3):
                    frame += 1
                    outfile.write('%s%d\n' % (prefix, frame))
                    outfile.write(_translate_frame(rna, offset) + '\n')

    if missed:
        print('%d bases could not be retrieved and were written as N' % missed)