Esempio n. 1
0
# Walk every region of every chromosome in regionDict, fetch its genomic
# sequence from hg, and either record peak regions (usePeaks) or load read
# data for the chromosome (doDataset).
for achrom in regionDict:
    print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
    for region in regionDict[achrom]:
        border = 0
        if usePeaks:
            # Peak regions carry two extra fields and are fetched with 200
            # extra bases on each side (see the hg.sequence call below).
            (rstart, rstop, rlen, peakPos, peakHeight) = region
            border = 200
        else:
            (rstart, rstop, rlen) = region
        # Regions longer than maxsize are reported and skipped.
        if rlen > maxsize:
            print "%s:%d-%d length %d > %d max region size - skipping" % (
                achrom, rstart, rstop, rlen, maxsize)
            continue
        try:
            seq = hg.sequence(achrom, rstart - border, rlen + 2 * border)
        except:
            # Any failure to fetch the sequence skips the region.
            # NOTE(review): the bare except also swallows KeyboardInterrupt.
            print "problem with %s" % str((rstart, rstop, rlen))
            continue
        if usePeaks:
            # Peak position expressed as an offset from the region start.
            topPos = peakPos - rstart
            # Only peaks strictly above minHitThresh are kept.
            if peakHeight > minHitThresh:
                ncregions[achrom].append(
                    (rstart, rstop, rlen, [topPos], peakHeight))
                index += 1
        elif doDataset:
            # The read dataset is queried with a 'chr'-prefixed name.
            thechrom = 'chr' + achrom
            print '.'
            hitDict = hitRDS.getReadsDict(chrom=thechrom,
                                          withWeight=True,
                                          doMulti=True,
Esempio n. 2
0
        # Flatten the per-chromosome regions into (chrom, start, length).
        for (start, stop, length) in regions[chrom]:
            regionList.append((chrom, start, length))

# With usePeak the list is ordered descending by its tuples' natural order.
# NOTE(review): the loop below unpacks 5-field tuples led by rpeakheight when
# usePeak is set, so presumably the list is built elsewhere in that case and
# this sort ranks by peak height - confirm against the unseen code.
if usePeak:
    regionList.sort()
    regionList.reverse()
notFoundIndex = 0
currentChrom = ''
count = 0
# NOTE(review): the loop variable 'tuple' shadows the builtin of that name.
for tuple in regionList:
    if usePeak:
        (rpeakheight, rchrom, start, length, rpeakpos) = tuple
    else:
        (rchrom, start, length) = tuple
    try:
        seq = hg.sequence(rchrom, start, length)
    except:
        # Regions whose sequence cannot be fetched are reported and skipped;
        # they do not advance 'count'.
        print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length)
        continue
    count += 1
    numHits = -1
    if usePeak:
        peakpos = rpeakpos
        # useRank replaces the peak height with the 1-based position of the
        # region among the successfully retrieved ones.
        if useRank:
            numHits = count
        else:
            numHits = rpeakheight
    elif doRDS:
        # Reads are reloaded only when the chromosome changes.
        if rchrom != currentChrom:
            fullchrom = 'chr' + rchrom
            hitDict = hitRDS.getReadsDict(chrom=fullchrom)
Esempio n. 3
0
def _fetch_exon(hg, chrom, left, right, reverse):
    """Fetch one exon, falling back to base-by-base retrieval on failure.

    left and right are the exon's GTF coordinates (1-based, inclusive), so
    the exon covers right - left + 1 bases starting at offset left - 1.
    When reverse is true the exon is returned reverse-complemented.

    Returns a (sequence, missed) pair; missed counts the bases that could
    not be retrieved and were replaced by 'N'.
    """
    name = chrom
    if name.startswith('chr'):
        # The genome is addressed without the 'chr' prefix.
        name = name[3:]
    start = left - 1
    length = right - left + 1
    try:
        exon = hg.sequence(name, start, length)
        if reverse:
            exon = getReverseComplement(exon)
        return exon, 0
    except Exception:
        print("can't retrieve sequence %s:%d-%d" % (chrom, left, right))

    bases = []
    missed = 0
    for position in range(start, start + length):
        try:
            base = hg.sequence(name, position, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    if reverse:
        # Complementing base by base keeps genomic order; flip it so the
        # result really is the reverse complement.
        bases.reverse()
    return ''.join(bases), missed


def main(argv):
    """Write the spliced sequence of every transcript in a GTF file as FASTA.

    argv: [script, genome, gtf, outfilename, optional '-polyA', length].
    Exons are grouped by transcript_name (transcript_id when the name is
    absent), concatenated 5'->3' on the transcript strand, and optionally
    followed by a poly-A tail of the requested length.
    """
    usage = 'usage: python %s genome gtf outfilename [-polyA length]' % argv[0]
    if len(argv) < 4:
        print(usage)
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]
    tail = ''
    if '-polyA' in argv:
        position = argv.index('-polyA') + 1
        if position >= len(argv):
            print(usage)
            sys.exit(1)
        tailsize = int(argv[position])
        tail = 'A' * tailsize
        print('will add a polyA tail of  %d nt' % tailsize)

    hg = Genome(genome)

    # transcript -> list of (chromosome, left, right, strand) exon tuples
    transcripts = {}
    linecount = 0
    gtffile = open(gtf)
    try:
        for line in gtffile:
            linecount += 1
            if linecount % 100000 == 0:
                print('%d lines processed' % linecount)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or malformed lines have fewer than the 9 GTF columns.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if 'transcript_name "' in attributes:
                key = 'transcript_name "'
            elif 'transcript_id "' in attributes:
                key = 'transcript_id "'
            else:
                continue
            transcript = attributes.split(key)[1].split('";')[0]
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            transcripts.setdefault(transcript, []).append(exon)
    finally:
        gtffile.close()

    print('Found %d transcripts' % len(transcripts))
    done = 0
    missed = 0
    outfile = open(outputfilename, 'w')
    try:
        for transcript in transcripts:
            done += 1
            if done % 1000 == 0:
                print('%d transcripts sequences processed' % done)
            exons = sorted(transcripts[transcript])
            strand = exons[0][3]
            if strand in ('+', 'F'):
                reverse = False
                sense = 'plus_strand'
            elif strand in ('-', 'R'):
                reverse = True
                sense = 'minus_strand'
                # Minus-strand transcripts run from the rightmost exon.
                exons.reverse()
            else:
                print('skipping %s: unsupported strand %s' % (transcript, strand))
                continue
            pieces = []
            for (chrom, left, right, exonstrand) in exons:
                (exonsequence, lost) = _fetch_exon(hg, chrom, left, right, reverse)
                pieces.append(exonsequence)
                missed += lost
            leftend = min([exon[1] for exon in exons])
            rightend = max([exon[2] for exon in exons])
            outfile.write('>%s:%s:%d-%d-%s\n' % (transcript, exons[-1][0],
                                                 leftend, rightend, sense))
            outfile.write(''.join(pieces) + tail + '\n')
    finally:
        outfile.close()

    if missed:
        print('%d bases could not be retrieved and were written as N' % missed)
Esempio n. 4
0
def _fetch_exon(hg, chrom, left, right, reverse):
    """Fetch one exon, falling back to base-by-base retrieval on failure.

    left and right are the exon's GTF coordinates (1-based, inclusive), so
    the exon covers right - left + 1 bases starting at offset left - 1.
    When reverse is true the exon is returned reverse-complemented.

    Returns a (sequence, missed) pair; missed counts the bases that could
    not be retrieved and were replaced by 'N'.
    """
    name = chrom
    if name.startswith('chr'):
        # The genome is addressed without the 'chr' prefix.
        name = name[3:]
    start = left - 1
    length = right - left + 1
    try:
        exon = hg.sequence(name, start, length)
        if reverse:
            exon = getReverseComplement(exon)
        return exon, 0
    except Exception:
        print("can't retrieve sequence %s:%d-%d" % (chrom, left, right))

    bases = []
    missed = 0
    for position in range(start, start + length):
        try:
            base = hg.sequence(name, position, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    if reverse:
        # Complementing base by base keeps genomic order; flip it so the
        # result really is the reverse complement.
        bases.reverse()
    return ''.join(bases), missed


def main(argv):
    """Write the spliced sequence of every transcript in a GTF file as FASTA.

    argv: [script, genome, gtf, outfilename, optional '-polyA', length].
    Exons are grouped by transcript_name (transcript_id when the name is
    absent), concatenated 5'->3' on the transcript strand, and optionally
    followed by a poly-A tail of the requested length.
    """
    usage = 'usage: python %s genome gtf outfilename [-polyA length]' % argv[0]
    if len(argv) < 4:
        print(usage)
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]
    tail = ''
    if '-polyA' in argv:
        position = argv.index('-polyA') + 1
        if position >= len(argv):
            print(usage)
            sys.exit(1)
        tailsize = int(argv[position])
        tail = 'A' * tailsize
        print('will add a polyA tail of  %d nt' % tailsize)

    hg = Genome(genome)

    # transcript -> list of (chromosome, left, right, strand) exon tuples
    transcripts = {}
    linecount = 0
    gtffile = open(gtf)
    try:
        for line in gtffile:
            linecount += 1
            if linecount % 100000 == 0:
                print('%d lines processed' % linecount)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or malformed lines have fewer than the 9 GTF columns.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if 'transcript_name "' in attributes:
                key = 'transcript_name "'
            elif 'transcript_id "' in attributes:
                key = 'transcript_id "'
            else:
                continue
            transcript = attributes.split(key)[1].split('";')[0]
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            transcripts.setdefault(transcript, []).append(exon)
    finally:
        gtffile.close()

    print('Found %d transcripts' % len(transcripts))
    done = 0
    missed = 0
    outfile = open(outputfilename, 'w')
    try:
        for transcript in transcripts:
            done += 1
            if done % 1000 == 0:
                print('%d transcripts sequences processed' % done)
            exons = sorted(transcripts[transcript])
            strand = exons[0][3]
            if strand in ('+', 'F'):
                reverse = False
                sense = 'plus_strand'
            elif strand in ('-', 'R'):
                reverse = True
                sense = 'minus_strand'
                # Minus-strand transcripts run from the rightmost exon.
                exons.reverse()
            else:
                print('skipping %s: unsupported strand %s' % (transcript, strand))
                continue
            pieces = []
            for (chrom, left, right, exonstrand) in exons:
                (exonsequence, lost) = _fetch_exon(hg, chrom, left, right, reverse)
                pieces.append(exonsequence)
                missed += lost
            leftend = min([exon[1] for exon in exons])
            rightend = max([exon[2] for exon in exons])
            outfile.write('>%s:%s:%d-%d-%s\n' % (transcript, exons[-1][0],
                                                 leftend, rightend, sense))
            outfile.write(''.join(pieces) + tail + '\n')
    finally:
        outfile.close()

    if missed:
        print('%d bases could not be retrieved and were written as N' % missed)
Esempio n. 5
0
     # Emit one FASTA record for the junction between exon 'index' and exon
     # 'index + 1': maxBorder bases ending at the first exon's stop followed
     # by maxBorder bases from the next exon's start, padded with spacerseq.
     regionstart = exonStops[index] - maxBorder
     alreadySeen[chrom].append((exonStops[index], exonStarts[index + 1]))
     beforeLen = exonLengths[index]
     afterLen = exonLengths[index + 1]
     # Junctions whose two exons together are shorter than maxBorder + spacer
     # are counted as missed and skipped.
     if (beforeLen + afterLen) < maxBorder + spacer:
         #print 'splice chr%s:%d-%d too short: %d' % (chrom, exonStops[index], exonStarts[index + 1], beforeLen + afterLen)
         missedCount += 1
         continue
     # Junctions shorter than two full borders are only counted.
     if (beforeLen + afterLen) < 2 * maxBorder:
         depressedCount += 1
     # NOTE(review): the clamped beforeLen/afterLen are only used by the
     # commented-out calls below; the live calls always fetch maxBorder bases.
     if beforeLen > maxBorder:
         beforeLen = maxBorder
     if afterLen > maxBorder:
         afterLen = maxBorder
     try:
         beforeSplice = hg.sequence(chrom, exonStops[index] - maxBorder,
                                    maxBorder)
         afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder)
         #beforeSplice = hg.sequence(chrom, exonStops[index] - beforeLen, beforeLen)
         #afterSplice = hg.sequence(chrom, exonStarts[index + 1], afterLen)
     except:
         # A failed fetch skips the junction, reporting it only if doVerbose.
         if doVerbose:
             print "could not get chr%s:%d-%d" % (chrom, exonStops[index],
                                                  exonStarts[index + 1])
         continue
     # Header fields: name, exon index and regionstart joined by delimiter.
     outstring = '>%s%s%d%s%d\n%s\n' % (
         name, delimiter, index, delimiter, regionstart,
         spacerseq + beforeSplice.upper() + afterSplice.upper() + spacerseq)
     outfile.write(outstring)
 splicefileindex += 1
 spliceCounter += 1
 if spliceCounter > 10000:
Esempio n. 6
0
# Collect the regions of every usable chromosome and, when doDataset is set,
# locate and score the read peak inside each region.
countList = []
posList = []

index = 0
regionList = []

for rchrom in regions:
    # Skip chromosomes whose name contains 'rand', 'M' or 'hap'.
    if 'rand' in rchrom or 'M' in rchrom or 'hap' in rchrom:
        continue
    for (start, stop, length) in regions[rchrom]:
        regionList.append((rchrom, start, length))
	
notFoundIndex = 0
currentChrom = ''
for (rchrom, start, length) in regionList:
    # NOTE(review): this fetch is not guarded, so a failure stops the script.
    seq = hg.sequence(rchrom, start, length)
    if doDataset:
        # Reads are reloaded only when the chromosome changes.
        if rchrom != currentChrom:
            fullchrom = 'chr' + rchrom
            hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
            currentChrom = rchrom
        (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True)
        # NOTE(review): an empty topPos is reported but not skipped, so the
        # topPos[0] lookup below raises IndexError in that case.
        if len(topPos) == 0:
            print 'topPos error'
        peakpos = topPos[0]
        peakscore = smoothArray[peakpos]
        # A zero peak score is replaced by -1.
        if peakscore == 0.:
            peakscore = -1.
        if normalize:
            numHits /= normalizeBy
            peakscore /= normalizeBy
Esempio n. 7
0
from cistematic.genomes import Genome

# Write every merlen-long window of a genomic region, one per line, in
# upper case.
print('%s: version 1.1' % sys.argv[0])
if len(sys.argv) < 5:
    print('usage: python %s genome merlen chrAny:start-stop outfile' % sys.argv[0])
    sys.exit(1)

genome = sys.argv[1]
merlen = int(sys.argv[2])
location = sys.argv[3]
outfilename = sys.argv[4]

if merlen < 1:
    print('merlen must be a positive integer')
    sys.exit(1)

(chrom, pos) = location.split(':')
# Drop the leading 'chr' of the chrAny:start-stop location.
chrom = chrom[3:]
(start, stop) = pos.split('-')
start = int(start)
# start and stop are both inclusive.
regionlength = int(stop) - start + 1

hg = Genome(genome)

seq = hg.sequence(chrom, start, regionlength)

# A sequence of length n holds n - merlen + 1 windows; the original loop ran
# one short and dropped the last one. len(seq) is used so that a sequence
# shorter than requested never produces truncated windows.
mercount = max(0, len(seq) - merlen + 1)

outfile = open(outfilename, 'w')
try:
    print('writing %d %d-mers' % (mercount, merlen))
    for index in range(mercount):
        outfile.write(seq[index:index + merlen].upper() + '\n')
finally:
    outfile.close()
Esempio n. 8
0
def _fetch_exon(hg, chrom, left, right, reverse):
    """Fetch one exon, falling back to base-by-base retrieval on failure.

    left and right are the exon's GTF coordinates (1-based, inclusive), so
    the exon covers right - left + 1 bases starting at offset left - 1.
    When reverse is true the exon is returned reverse-complemented.

    Returns a (sequence, missed) pair; missed counts the bases that could
    not be retrieved and were replaced by 'N'.
    """
    name = chrom
    if name.startswith('chr'):
        # The genome is addressed without the 'chr' prefix.
        name = name[3:]
    start = left - 1
    length = right - left + 1
    try:
        exon = hg.sequence(name, start, length)
        if reverse:
            exon = getReverseComplement(exon)
        return exon, 0
    except Exception:
        print("can't retrieve sequence %s:%d-%d" % (chrom, left, right))

    bases = []
    missed = 0
    for position in range(start, start + length):
        try:
            base = hg.sequence(name, position, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    if reverse:
        # Complementing base by base keeps genomic order; flip it so the
        # result really is the reverse complement.
        bases.reverse()
    return ''.join(bases), missed


def _translate_frame(rna, offset, codon_table):
    """Translate rna starting at offset (0, 1 or 2), one symbol per codon.

    Every complete codon is translated, including the last one. Codons that
    are not in codon_table (for example those containing N) become '.'.
    """
    residues = []
    for position in range(offset, len(rna) - 2, 3):
        residues.append(codon_table.get(rna[position:position + 3], '.'))
    return ''.join(residues)


def main(argv):
    """Write the reading-frame translations of every transcript in a GTF file.

    argv: [script, genome, gtf, outfilename, optional '-spliced', optional
    '-class_code', symbol]. Transcripts on the '+' or '-' strand get three
    frames on their own strand; transcripts with strand '.' get six (three
    forward, three on the reverse complement). Stop codons are written as '.'.
    """
    if len(argv) < 4:
        print('usage: python %s genome gtf outfilename [-spliced] [-class_code symbol]' % argv[0])
        print('     this script will output the translation of all three possible reading frames; stop codons will be converted to a .')
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]

    doSpliced = False
    if '-spliced' in argv:
        doSpliced = True
        print('will only look at transciprs with more than one exon')

    doClassCode = False
    class_code = None
    if '-class_code' in argv:
        doClassCode = True
        class_code = argv[argv.index('-class_code') + 1]
        print('will only look at transciprs if class code %s' % class_code)

    CodonDict = {'GCU':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
                 'UUA':'L', 'UUG':'L', 'CUU':'L', 'CUC':'L', 'CUA':'L', 'CUG':'L',
                 'CGU':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R',
                 'AAA':'K', 'AAG':'K',
                 'AAU':'N', 'AAC':'N',
                 'AUG':'M',
                 'GAU':'D', 'GAC':'D',
                 'UUU':'F', 'UUC':'F',
                 'UGU':'C', 'UGC':'C',
                 'CCU':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
                 'CAA':'Q', 'CAG':'Q',
                 'UCU':'S', 'UCC':'S', 'UCA':'S', 'UCG':'S', 'AGU':'S', 'AGC':'S',
                 'GAA':'E', 'GAG':'E',
                 'ACU':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
                 'GGU':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
                 'UGG':'W',
                 'CAU':'H', 'CAC':'H',
                 'UAU':'Y', 'UAC':'Y',
                 'AUU':'I', 'AUC':'I', 'AUA':'I',
                 'GUU':'V', 'GUC':'V', 'GUA':'V', 'GUG':'V',
                 'START':'AUG',
                 'UAA':'.',
                 'UGA':'.',
                 'UAG':'.'}

    hg = Genome(genome)

    # transcript -> list of (chromosome, left, right, strand) exon tuples
    transcripts = {}
    linecount = 0
    gtffile = open(gtf)
    try:
        for line in gtffile:
            linecount += 1
            if linecount % 100000 == 0:
                print('%d lines processed' % linecount)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or malformed lines have fewer than the 9 GTF columns.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if doClassCode:
                if 'class_code "' not in attributes:
                    continue
                if attributes.split('class_code "')[1].split('";')[0] != class_code:
                    continue
            if 'transcript_name "' in attributes:
                key = 'transcript_name "'
            elif 'transcript_id "' in attributes:
                key = 'transcript_id "'
            else:
                continue
            transcript = attributes.split(key)[1].split('";')[0]
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            transcripts.setdefault(transcript, []).append(exon)
    finally:
        gtffile.close()

    print('Found %d transcripts' % len(transcripts))
    done = 0
    missed = 0
    outfile = open(outputfilename, 'w')
    try:
        for transcript in transcripts:
            done += 1
            if done % 1000 == 0:
                print('%d transcripts sequences processed' % done)
            # Duplicate exon records are collapsed before sorting.
            exons = sorted(set(transcripts[transcript]))
            if doSpliced and len(exons) == 1:
                continue
            chrom = exons[0][0]
            strand = exons[0][3]
            leftend = exons[0][1]
            rightend = max([exon[2] for exon in exons])
            if strand == '+':
                reverse = False
                sense = 'plus_strand'
            elif strand == '-':
                reverse = True
                sense = 'minus_strand'
                # Minus-strand transcripts run from the rightmost exon.
                exons.reverse()
            elif strand == '.':
                reverse = False
                sense = 'unknown_strand'
            else:
                print('skipping %s: unsupported strand %s' % (transcript, strand))
                continue

            pieces = []
            for (exonchrom, left, right, exonstrand) in exons:
                (exonsequence, lost) = _fetch_exon(hg, exonchrom, left, right, reverse)
                pieces.append(exonsequence)
                missed += lost
            sequence = ''.join(pieces)

            strands = [sequence.upper().replace('T', 'U')]
            if sense == 'unknown_strand':
                # Without strand information the opposite strand is translated
                # too, as frames 4-6.
                strands.append(getReverseComplement(sequence).upper().replace('T', 'U'))

            header = '>%s:%s:%d-%d-%s' % (transcript, chrom, leftend, rightend, sense)
            frame = 0
            for rna in strands:
                for offset in range(3):
                    frame += 1
                    outfile.write('%s::frame%d\n' % (header, frame))
                    outfile.write(_translate_frame(rna, offset, CodonDict) + '\n')
    finally:
        outfile.close()

    if missed:
        print('%d bases could not be retrieved and were translated as .' % missed)