Python Genome Examples, cistematic.genomes.Genome Python Examples

Example #1

0

Show file

    doCDS = False

# Flanking regions are limited by neighboring genes unless -force is given on
# the command line.  The previous test, `if '-force':`, checked a non-empty
# string literal, which is always true, so the limit was disabled on every run.
limitNeighbor = '-force' not in sys.argv

# Open the read dataset named by hitfile; caching follows the doCache flag.
hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
readlen = hitRDS.getReadSize()
# Default of 1.0 leaves values unscaled when normalizeBins is off.
normalizationFactor = 1.0
if normalizeBins:
    # len(hitRDS) is divided by one million (float division because of the
    # trailing dot), i.e. the factor is expressed in millions of entries.
    # NOTE(review): presumably len() is the total read count - confirm.
    totalCount = len(hitRDS)
    normalizationFactor = totalCount / 1000000.

hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)

hg = Genome(genome)
idb = geneinfoDB(cache=doCache)

gidBins = {}
gidLen = {}
geneinfoDict = idb.getallGeneInfo(genome)
if doFlank:
    locusByChromDict = getLocusByChromDict(hg,
                                           upstream=upstreamBp,
                                           downstream=downstreamBp,
                                           useCDS=doCDS,
                                           additionalRegionsDict=acceptDict,
                                           keepSense=True,
                                           adjustToNeighbor=limitNeighbor)
else:
    locusByChromDict = getLocusByChromDict(hg,

Example #2

0

Show file

    # The usage text carries a %s placeholder for the script name; without the
    # format operand it was printed literally as "%s".
    print('usage: python %s genome GOID1 [GOID2 ....] [-outfile outfilename] [-append] [-restrict genefile]' % sys.argv[0])
    sys.exit(1)

genome = sys.argv[1]

# Optional "-outfile NAME": the value is the argument right after the flag.
writeOut = '-outfile' in sys.argv
if writeOut:
    outfilename = sys.argv[sys.argv.index('-outfile') + 1]

# Optional "-restrict NAME": same layout as -outfile.
restrict = '-restrict' in sys.argv
if restrict:
    restrictfilename = sys.argv[sys.argv.index('-restrict') + 1]

hg = Genome(genome)
idb = geneinfoDB()

# Every command-line argument containing "GO:" is taken as a GO identifier.
GOIDlist = [arg for arg in sys.argv if 'GO:' in arg]

print sys.argv
print GOIDlist

firstGeneList = []
for GOID in GOIDlist:
    testList = hg.allGIDsbyGOID(GOID)
    print 'GOID: %s (%d)' % (GOID, len(testList))
    firstGeneList += testList

Example #3

0

Show file

    pass

#Main program
# Three positional arguments are required (genome, snpsfile, outfile), so
# sys.argv must hold at least four entries.  The previous "< 3" guard let a
# call without the output file through and failed on sys.argv[3] below.
if len(sys.argv) < 4:
    print('usage: python2.5 %s genome snpsfile nondbsnp_geneinfo_outfile' % sys.argv[0])
    sys.exit(1)

outStr = ""
genome = sys.argv[1]
snpfile = sys.argv[2]
outfilename = sys.argv[3]

infile = file(snpfile, 'r')

hg = Genome(genome)
additionalDict = {}

outS = ""
outfile = open(outfilename, 'w')
outfile.write(
    "#Sl\tCl\tchrom\tmis pos\t\tmatch\tuniq_mis\ttot_mis\tbase_chg\tknown_snp\tfunction\tgene\tgeneId\trpkm\n"
)
for line in infile:
    if line[0] == '#':
        continue
    fields = line.split()
    if fields[8].find('N\A') == -1:
        outfile.write(line)
    else:
        outS = ''

Example #4

0

Show file

File: getallsites.py Project: komorowskilab/PFMS

if '-cache' in sys.argv:
    doCache = True

# True only when -printseq was passed on the command line.
printSeq = '-printseq' in sys.argv
maxPvalue = 0.0001

mot = Motif('', motifFile=motfilename)
motLen = len(mot)
bestScore = mot.bestConsensusScore()

if hasMotifExtension:
    print "will use cistematic.core.motif C-extension to speed up motif search"

hg = Genome(genome)

# minHits=-1 will force regions to be used regardless
# maxDist= 0 prevents merging of non-overlapping regions
if '-nomerge' in sys.argv:
    regions = getMergedRegions(infilename,
                               maxDist=0,
                               minHits=-1,
                               verbose=True,
                               doMerge=False,
                               keepPeak=usePeak)
else:
    regions = getMergedRegions(infilename,
                               maxDist=0,
                               minHits=-1,
                               verbose=True,

Example #5

0

Show file

File: gtf-to-fasta.py Project: brianpenghe/gtfgff-scripts

def _transcript_label(attributes):
    """Return the transcript_name in a GTF attribute column, else its transcript_id."""
    marker = 'transcript_name "'
    if marker not in attributes:
        marker = 'transcript_id "'
    return attributes.split(marker)[1].split('";')[0]


def _exon_sequence(hg, chrom, start, length, reverse):
    """Fetch one exon from hg and return (sequence, bases_written_as_N).

    The whole span is requested first.  If that fails the span is re-read one
    base at a time and every base that still cannot be read becomes 'N'.
    When reverse is true the result is reverse-complemented.
    """
    # The first three characters of the GTF chromosome name are dropped before
    # querying the genome (presumably a 'chr' prefix - confirm for other GTFs).
    name = chrom[3:]
    try:
        whole = hg.sequence(name, start, length)
        if reverse:
            whole = getReverseComplement(whole)
        return whole, 0
    except Exception:
        print("can't retrieve sequence")

    # Visit the same positions as the failed request; walk them backwards for
    # the reverse strand so the complemented bases come out in 5'->3' order.
    positions = list(range(start, start + length))
    if reverse:
        positions.reverse()
    bases = []
    lost = 0
    for p in positions:
        try:
            base = hg.sequence(name, p, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            lost += 1
        bases.append(base)
    return ''.join(bases), lost


def main(argv):
    """Write the concatenated exon sequence of each GTF transcript as FASTA.

    argv: [script, genome, gtf, outfilename, optional '-polyA', length].
    Exits with status 1 after printing usage when fewer than three positional
    arguments are given.  Transcripts whose strand is not one of '+', 'F',
    '-' or 'R' are reported and skipped.
    """
    if len(argv) < 4:
        print('usage: python %s genome gtf outfilename [-polyA length]' % argv[0])
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]

    # An empty tail is a no-op when appended, so no separate flag is needed.
    tail = ''
    if '-polyA' in argv:
        tailsize = int(argv[argv.index('-polyA') + 1])
        tail = 'A' * tailsize
        print('will add a polyA tail of  %d nt' % tailsize)

    hg = Genome(genome)

    # transcript label -> list of (chrom, left, right, strand) exon records
    TranscriptDict = {}
    with open(gtf) as lineslist:
        for j, line in enumerate(lineslist, 1):
            if j % 100000 == 0:
                print('%d lines processed' % j)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or malformed lines lack the nine GTF columns read below.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            TranscriptDict.setdefault(_transcript_label(fields[8]), []).append(exon)

    strands = {'+': ('plus_strand', False), 'F': ('plus_strand', False),
               '-': ('minus_strand', True), 'R': ('minus_strand', True)}
    missed = 0
    print('Found %d transcripts' % len(TranscriptDict))
    with open(outputfilename, 'w') as outfile:
        for g, transcript in enumerate(TranscriptDict, 1):
            if g % 1000 == 0:
                print('%d transcripts sequences processed' % g)
            exons = sorted(TranscriptDict[transcript])
            strand = exons[0][3]
            if strand not in strands:
                print('skipping %s: unsupported strand %r' % (transcript, strand))
                continue
            sense, reverse = strands[strand]
            if reverse:
                # Minus-strand transcripts are assembled from the last exon back.
                exons.reverse()

            pieces = []
            for (chrom, left, right, _strand) in exons:
                if reverse:
                    seq, lost = _exon_sequence(hg, chrom, left - 1, right - left + 1, True)
                else:
                    # NOTE(review): this start/length is one base shorter than
                    # the minus-strand request above; kept as in the original
                    # until the Genome.sequence convention is confirmed.
                    seq, lost = _exon_sequence(hg, chrom, left, right - left, False)
                pieces.append(seq)
                missed += lost

            LeftEnd = min(exon[1] for exon in exons)
            RightEnd = max(exon[2] for exon in exons)
            outline = ('>' + transcript + ':' + exons[-1][0] + ':' + str(LeftEnd)
                       + '-' + str(RightEnd) + '-' + sense)
            outfile.write(outline + '\n')
            outfile.write(''.join(pieces) + tail + '\n')

    if missed:
        print('%d bases could not be retrieved and were written as N' % missed)

Example #6

0

Show file

from cistematic.genomes import Genome

print('%s: version 1.1' % sys.argv[0])
if len(sys.argv) < 5:
    print('usage: python %s genome merlen chrAny:start-stop outfile' % sys.argv[0])
    sys.exit(1)

genome = sys.argv[1]
merlen = int(sys.argv[2])
location = sys.argv[3]
outfilename = sys.argv[4]

# location has the form chrAny:start-stop; the first three characters of the
# chromosome name are dropped before querying the genome.
(chrom, pos) = location.split(':')
chrom = chrom[3:]
(start, stop) = pos.split('-')
start = int(start)
regionlength = int(stop) - start + 1

hg = Genome(genome)

seq = hg.sequence(chrom, start, regionlength)

# A region of length L contains L - merlen + 1 windows of length merlen.  The
# previous loop stopped one short and never wrote the last window.  Clamp at
# zero so a region shorter than merlen reports and writes nothing.
mercount = max(0, regionlength - merlen + 1)
print('writing %d %d-mers' % (mercount, merlen))
outfile = open(outfilename, 'w')
try:
    for index in range(mercount):
        outfile.write(seq[index:index + merlen].upper() + '\n')
finally:
    # Close the file even when a write fails part-way.
    outfile.close()

Example #7

0

Show file

File: gtf-to-fasta.py Project: saupchurch/gtfgff-scripts

def main(argv):
    """Write the concatenated exon sequence of each GTF transcript as FASTA.

    argv: [script, genome, gtf, outfilename, optional '-polyA', length].
    Prints usage and exits with status 1 when a positional argument is
    missing.  Exons are grouped by transcript_name (or transcript_id when no
    name is present), sorted, fetched from the genome and joined; minus-strand
    transcripts are reverse-complemented exon by exon, last exon first.
    Transcripts with any other strand symbol than '+', 'F', '-' or 'R' are
    reported and skipped.
    """
    if len(argv) < 4:
        print('usage: python %s genome gtf outfilename [-polyA length]' %
              argv[0])
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]
    doPolyA = False
    tail = ''
    if '-polyA' in argv:
        doPolyA = True
        tailsize = int(argv[argv.index('-polyA') + 1])
        tail = 'A' * tailsize
        print('will add a polyA tail of  %d nt' % tailsize)

    outfile = open(outputfilename, 'w')
    try:
        hg = Genome(genome)

        # transcript label -> list of (chrom, left, right, strand) exons
        TranscriptDict = {}
        lineslist = open(gtf)
        try:
            j = 0
            for line in lineslist:
                j += 1
                if j % 100000 == 0:
                    print('%d lines processed' % j)
                if line.startswith('#'):
                    continue
                fields = line.strip().split('\t')
                # Skip blank or short lines before indexing the GTF columns.
                if len(fields) < 9:
                    continue
                if fields[2] != 'exon':
                    continue
                if 'transcript_name "' in fields[8]:
                    key = 'transcript_name "'
                else:
                    key = 'transcript_id "'
                TranscriptID = fields[8].split(key)[1].split('";')[0]
                if TranscriptID not in TranscriptDict:
                    TranscriptDict[TranscriptID] = []
                TranscriptDict[TranscriptID].append(
                    (fields[0], int(fields[3]), int(fields[4]), fields[6]))
        finally:
            lineslist.close()

        # Counts bases written as 'N'; it was incremented without ever being
        # initialised, so the per-base fallback could never run.
        missed = 0
        g = 0
        print('Found %d transcripts' % len(TranscriptDict))
        for transcript in TranscriptDict:
            g += 1
            if g % 1000 == 0:
                print('%d transcripts sequences processed' % g)

            exons = sorted(TranscriptDict[transcript])
            strand = exons[0][3]
            if strand == '+' or strand == 'F':
                reverse = False
                sense = 'plus_strand'
            elif strand == '-' or strand == 'R':
                reverse = True
                sense = 'minus_strand'
                exons.reverse()
            else:
                print('skipping %s: unsupported strand %r' %
                      (transcript, strand))
                continue

            pieces = []
            leftEnds = []
            rightEnds = []
            for (chrom, left, right, _strand) in exons:
                leftEnds.append(left)
                rightEnds.append(right)
                # First three characters of the GTF chromosome name are
                # dropped for the genome lookup.
                name = chrom[3:]
                if reverse:
                    start, length = left - 1, right - left + 1
                else:
                    # NOTE(review): one base shorter than the minus-strand
                    # request; kept as in the original until the
                    # Genome.sequence convention is confirmed.
                    start, length = left, right - left
                try:
                    exon = hg.sequence(name, start, length)
                    if reverse:
                        exon = getReverseComplement(exon)
                except Exception:
                    # The message used to sit in the try body and was printed
                    # after every successful fetch.
                    print("can't retrieve sequence")
                    # Re-read the same span base by base; reverse-strand
                    # positions are walked backwards so the complemented
                    # bases come out in the right order.
                    positions = list(range(start, start + length))
                    if reverse:
                        positions.reverse()
                    bases = []
                    for p in positions:
                        try:
                            base = hg.sequence(name, p, 1)
                            if reverse:
                                base = getReverseComplement(base)
                        except Exception:
                            base = 'N'
                            missed += 1
                        bases.append(base)
                    exon = ''.join(bases)
                pieces.append(exon)

            sequence = ''.join(pieces)
            LeftEnd = min(leftEnds)
            RightEnd = max(rightEnds)
            outline = '>' + transcript + ':' + exons[-1][0] + ':' + str(
                LeftEnd) + '-' + str(RightEnd) + '-' + sense
            outfile.write(outline + '\n')
            if doPolyA:
                outfile.write(sequence + tail + '\n')
            else:
                outfile.write(sequence + '\n')

        if missed:
            print('%d bases could not be retrieved and were written as N' %
                  missed)
    finally:
        outfile.close()

Example #8

0

Show file

# True only when -fullOnly was passed on the command line.
fullOnly = '-fullOnly' in sys.argv
    
#mot = Motif('',motifFile = motifDir + 'NRSE2.mot')
#motL = Motif('',motifFile = motifDir + 'NRSE2left.mot')
#motR = Motif('',motifFile = motifDir + 'NRSE2right.mot')
mot = Motif('',motifFile = motifDir + 'NRSE3.mot')
motL = Motif('',motifFile = motifDir + 'NRSE3left.mot')
motR = Motif('',motifFile = motifDir + 'NRSE3right.mot')
bestScore = mot.bestConsensusScore()
bestLeft = motL.bestConsensusScore()
bestRight = motR.bestConsensusScore()

hg = Genome(genome)

regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False)

# Three header lines: run parameters, filter settings, then column names.
outfile = open(outfilename,'w')
outfile.write('#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n' % (chipfilename, infilename, normalize, doMarkov1))
# fullOnly is a boolean flag, not a distance: it was formatted as "%d bp"
# (copied from the peakdist field) and showed up as "1 bp" / "0 bp".
outfile.write('#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %s\n' % (enforcePeakDist, maxpeakdist, fullOnly))
outfile.write('#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n')
		
countList = []
posList = []

index = 0
regionList = []

for rchrom in regions:

Example #9

0

Show file

# True only when -cache was passed on the command line.
doCache = '-cache' in sys.argv

bins = 10
standardMinThresh = standardMinDist / bins

hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
readlen = hitRDS.getReadSize()
normalizationFactor = 1.0
if normalize:
    totalCount = len(hitRDS)
    normalizationFactor = totalCount / 1000000.

hg = Genome(genome)
idb = geneinfoDB(cache=True)

gidDict = {}
geneinfoDict = idb.getallGeneInfo(genome)
featuresDict = hg.getallGeneFeatures()

#infile = open(infilename)
outfile = open(outfilename, 'w')

gidList = hg.allGIDs()
gidList.sort()
for gid in gidList:
    symbol = 'LOC' + gid
    geneinfo = ''
    featureList = []

Example #10

0

Show file

    altPosList.append(altPos)
    posLine[pos] = line
    if trackStrand:
        if 'RNAFARP' in line:
            posStrand[pos] = '+'
            posStrand[altPos] = '+'
        else:
            posStrand[pos] = '-'
            posStrand[altPos] = '-'

geneList = []
geneDict = {}
if maxRadius < step:
    step = maxRadius - 2

hg = Genome(genome, inRAM=True)
if extendGenome != '':
    hg.extendFeatures(extendGenome, replace=replaceModels)

geneannotDict = hg.allAnnotInfo()
#featureTypes = ['CDS'] + hg.getFeatureTypes('UT%')
featureTypes = ['CDS', 'UTR']
for radius in range(1, maxRadius, step):
    print 'radius %d' % radius
    print len(posList)
    if radius == 1:
        posDict = genesIntersecting(genome,
                                    posList,
                                    extendGen=extendGenome,
                                    replaceMod=replaceModels)
    else:

Example #11

0

Show file

File: rnafarPairs.py Project: komorowskilab/PFMS

RDS = readDataset(rdsfile, verbose=True, cache=doCache)
rdsChromList = RDS.getChromosomes()

if doVerbose:
    print time.ctime()

distinct = 0
total = 0
outfile = open(outfilename, 'w')

# Load gene info for the genome; dmelanogaster is the one genome for which
# getallGeneInfo is called with infoKey='locus'.
idb = geneinfoDB()
if genome == 'dmelanogaster':
    geneinfoDict = idb.getallGeneInfo(genome, infoKey='locus')
else:
    geneinfoDict = idb.getallGeneInfo(genome)
# Genome handle and its annotation table, both for the same genome name.
hg = Genome(genome)
geneannotDict = hg.allAnnotInfo()

assigned = {}
farConnected = {}
for achrom in rdsChromList:
    if achrom == 'chrM':
        continue
    print achrom
    uniqDict = RDS.getReadsDict(fullChrom=True,
                                chrom=achrom,
                                noSense=True,
                                withFlag=True,
                                withPairID=True,
                                doUniqs=True,
                                readIDDict=True)

Example #12

0

Show file

extendGenome = ''
replaceModels = False
if '-models' in sys.argv:
    extendGenome = sys.argv[sys.argv.index('-models') + 1]
    if '-replacemodels' in sys.argv:
        replaceModels = True
        print "will replace gene models with %s" % extendGenome
    else:
        print "will extend gene models with %s" % extendGenome

# Defaults when -cache is absent: no caching, zero cache pages requested.
doCache = False
cachePages = 0
if '-cache' in sys.argv:
    # With -cache the gene database is cached first, then the genome is opened
    # from the file chosen by chooseDB and the gene info DB is opened cached.
    cacheGeneDB(genome)
    hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
    idb = geneinfoDB(cache=True)
    print '%s cached' % genome
    doCache = True
    # The argument following -cache is read as an integer page count.
    cachePages = int(sys.argv[sys.argv.index('-cache') + 1])
else:
    hg = Genome(genome, inRAM=True)
    idb = geneinfoDB()

if extendGenome != '':
    hg.extendFeatures(extendGenome, replace=replaceModels)

hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
if cachePages > hitRDS.getDefaultCacheSize():
    hitRDS.setDBcache(cachePages)

Example #13

0

Show file

File: getgosig.py Project: komorowskilab/PFMS

    psyco.full()
except:
    pass
from cistematic.genomes import Genome
from math import log
import os.path
import sys

print '%s: version 2.1' % sys.argv[0]

if len(sys.argv) < 6:
    print 'usage: python %s genome outimage gofileroot1 title1 cohortsize1 [gofileroot2 title2 cohortsize2 ...] [-fontsize pts] [-length in] [-width in]' % sys.argv[
        0]
    sys.exit(1)

hg = Genome(sys.argv[1])
allgodesc = hg.allGOterms()
godesc = []

import matplotlib
matplotlib.use('Agg')

from pylab import *
doGray = False

rootdir = './'

imagename = sys.argv[2]

options = 0
fontSize = 5

Example #14

0

Show file

extendGenome = ''
replaceModels = False
if '-models' in sys.argv:
    extendGenome = sys.argv[sys.argv.index('-models') + 1]
    if '-replacemodels' in sys.argv:
        replaceModels = True
        print "will replace gene models with %s" % extendGenome
    else:
        print "will extend gene models with %s" % extendGenome

# -cache switches the genome handle to the database file chosen by chooseDB
# after cacheGeneDB has run; otherwise the default database is used.  Both
# paths open the genome with inRAM=True.
doCache = False
if '-cache' in sys.argv:
    doCache = True
    cacheGeneDB(genome)
    hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
    print '%s cached' % genome
else:
    hg = Genome(genome, inRAM=True)

if extendGenome != '':
    hg.extendFeatures(extendGenome, replace=replaceModels)

RDS = readDataset(hitfile, verbose=True, cache=doCache, reportCount=False)
uniqcount = RDS.getUniqsCount()
print '%d unique reads' % uniqcount

splicecount = 0
countDict = {}
gidList = []
farList = []

Example #15

0

Show file

for line in infile:
    if line[0] == '#':
        continue
    fields = line.split('\t')
    chrom = fields[2][3:]
    start = int(fields[3])
    pos = (chrom, start)

    posList.append(pos)
    posLine[pos] = line

geneList = []
geneDict = {}
geneSense = {}

hg = Genome(genome)
#featureTypes = ['CDS'] + hg.getFeatureTypes('UT%')
featureTypes = ['CDS', 'UTR']
for ftype in featureTypes:
    if flankBP > 0:
        posDict = genesIntersecting(genome, posList, flank=flankBP)
    else:
        posDict = genesIntersecting(genome, posList)
    for pos in posDict:
        #print pos
        geneID = posDict[pos][0][0]
        try:
            symbol = geneinfoDict[geneID][0][0]
        except:
            symbol = 'LOC' + geneID
        try:

Example #16

0

Show file

outfilename = sys.argv[3]
# maxBorder should be readlen - 4
maxBorder = int(sys.argv[4])

# Verbose output is opt-in via -verbose.
doVerbose = '-verbose' in sys.argv

# "-spacer N" overrides the default spacer length of 2; the spacer itself is
# a run of that many N characters.
spacer = int(sys.argv[sys.argv.index('-spacer') + 1]) if '-spacer' in sys.argv else 2
spacerseq = spacer * 'N'

datafile = open(datafilename)
#seqfile = open('knownGeneMrna.txt')
hg = Genome(genome)

spliceCountDict = {}
exonStartDict = {}
exonStopDict = {}
exonLengthDict = {}
nameToChromDict = {}
nameToComplementDict = {}
alreadySeen = {}
counter = 0

for line in datafile:
    fields = line.split()
    name = fields[0]
    spliceCount = int(fields[7]) - 1
    if spliceCount < 1:

Example #17

0

Show file

doDataset = False
if '-dataset' in sys.argv:
    if usePeaks:
        print "ignoring dataset and relying on peak data"
    else:
        hitfile = sys.argv[sys.argv.index('-dataset') + 1]
        doDataset = True
        hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
        readlen = hitRDS.getReadSize()

doCompact = False
if '-compact' in sys.argv:
    doCompact = True

hg = Genome(genome)

outfile = open(outfilename, 'w')

#readlen = readSize(hitfile)
#hitDict = getReadDict(hitfile)
if doCompact:
    regionDict = getMergedRegions(regionfile,
                                  minHits=minHitThresh,
                                  verbose=True,
                                  chromField=0,
                                  compact=True,
                                  keepPeak=usePeaks,
                                  returnTop=topRegions)
else:
    regionDict = getMergedRegions(regionfile,

Example #18

0

Show file

def _translate_frame(rna, offset, codon_table):
    """Translate rna from offset in steps of three and return the protein string.

    A codon containing 'N' becomes '.'.  Every complete codon is translated,
    including the one that ends exactly at the end of rna.
    """
    residues = []
    # i + 3 <= len(rna) for each start i, hence the len(rna) - 2 bound.
    for i in range(offset, len(rna) - 2, 3):
        codon = rna[i:i + 3]
        if 'N' in codon:
            residues.append('.')
        else:
            residues.append(codon_table[codon])
    return ''.join(residues)


def _fetch_span(hg, exon, start, length, reverse, warning):
    """Fetch one exon span from hg and return (sequence, bases_written_as_N).

    exon is the (chrom, left, right, strand) record, used for the lookup name
    and for the warning text.  If the whole span cannot be read it is re-read
    one base at a time and unreadable bases become 'N'.  When reverse is true
    the result is reverse-complemented.
    """
    # First three characters of the GTF chromosome name are dropped.
    name = exon[0][3:]
    try:
        whole = hg.sequence(name, start, length)
        if reverse:
            whole = getReverseComplement(whole)
        return whole, 0
    except Exception:
        print('%s %s %s %s %s' % ((warning,) + tuple(exon)))

    # Same positions as the failed request, walked backwards for the reverse
    # strand so the complemented bases end up in 5'->3' order.
    positions = list(range(start, start + length))
    if reverse:
        positions.reverse()
    bases = []
    lost = 0
    for p in positions:
        try:
            base = hg.sequence(name, p, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            lost += 1
        bases.append(base)
    return ''.join(bases), lost


def main(argv):
    """Write reading-frame translations of every GTF transcript as FASTA.

    argv: [script, genome, gtf, outfilename, optional '-spliced',
    optional '-class_code', symbol].  Prints usage and exits with status 1
    when a positional argument is missing.

    Transcripts on '+' or '-' get three frames of their own strand;
    transcripts with strand '.' get three frames of the sequence plus three
    frames of its reverse complement (frame4-frame6).  Stop codons and codons
    containing N are written as '.'.  Transcripts with any other strand symbol
    are reported and skipped.
    """
    # argv[3] is read below, so four entries are required.  The previous
    # "< 3" guard let a call without the output file through.
    if len(argv) < 4:
        print('usage: python %s genome gtf outfilename [-spliced] [-class_code symbol]' % argv[0])
        print('     this script will output the translation of all three possible reading frames; stop codons will be converted to a .')
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]

    doSpliced = False
    if '-spliced' in argv:
        doSpliced = True
        print('will only look at transciprs with more than one exon')

    doClassCode = False
    if '-class_code' in argv:
        doClassCode = True
        class_code = argv[argv.index('-class_code') + 1]
        print('will only look at transciprs if class code %s' % class_code)

    CodonDict={'GCU':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
               'UUA':'L', 'UUG':'L', 'CUU':'L', 'CUC':'L', 'CUA':'L', 'CUG':'L',
               'CGU':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R',
               'AAA':'K', 'AAG':'K',
               'AAU':'N', 'AAC':'N',
               'AUG':'M',
               'GAU':'D', 'GAC':'D',
               'UUU':'F', 'UUC':'F',
               'UGU':'C', 'UGC':'C',
               'CCU':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
               'CAA':'Q', 'CAG':'Q',
               'UCU':'S', 'UCC':'S', 'UCA':'S', 'UCG':'S', 'AGU':'S', 'AGC':'S',
               'GAA':'E', 'GAG':'E',
               'ACU':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
               'GGU':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
               'UGG':'W',
               'CAU':'H', 'CAC':'H',
               'UAU':'Y', 'UAC':'Y',
               'AUU':'I', 'AUC':'I', 'AUA':'I',
               'GUU':'V', 'GUC':'V', 'GUA':'V', 'GUG':'V',
               'START':'AUG',
               'UAA':'.',
               'UGA':'.',
               'UAG':'.'}

    # strand symbol -> (header label, reverse-complement?, warning text)
    strands = {'+': ('plus_strand', False, "can't retrieve sequence"),
               '-': ('minus_strand', True, 'can not retrieve sequence'),
               '.': ('unknown_strand', False, 'can not retrieve sequence')}

    hg = Genome(genome)

    # transcript label -> list of (chrom, left, right, strand) exon records
    TranscriptDict = {}
    with open(gtf) as lineslist:
        for j, line in enumerate(lineslist, 1):
            if j % 100000 == 0:
                print('%d lines processed' % j)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or malformed lines lack the nine GTF columns read below.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            if doClassCode:
                if 'class_code "' not in fields[8]:
                    continue
                if fields[8].split('class_code "')[1].split('";')[0] != class_code:
                    continue
            if 'transcript_name "' in fields[8]:
                marker = 'transcript_name "'
            else:
                marker = 'transcript_id "'
            TranscriptID = fields[8].split(marker)[1].split('";')[0]
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            TranscriptDict.setdefault(TranscriptID, []).append(exon)

    missed = 0
    print('Found %d transcripts' % len(TranscriptDict))
    with open(outputfilename, 'w') as outfile:
        for g, transcript in enumerate(TranscriptDict, 1):
            if g % 1000 == 0:
                print('%d transcripts sequences processed' % g)
            # De-duplicate exon records with the builtin set, then sort.
            exons = sorted(set(TranscriptDict[transcript]))
            if doSpliced and len(exons) == 1:
                continue
            strand = exons[0][3]
            if strand not in strands:
                print('skipping %s: unsupported strand %r' % (transcript, strand))
                continue
            sense, reverse, warning = strands[strand]

            # Header coordinates come from the sorted list: left end of the
            # first exon and right end of the last one.
            LeftEnd = exons[0][1]
            RightEnd = exons[-1][2]

            ordered = exons[::-1] if reverse else exons
            pieces = []
            for exon in ordered:
                left, right = exon[1], exon[2]
                if reverse:
                    seq, lost = _fetch_span(hg, exon, left - 1, right - left + 1, True, warning)
                else:
                    # NOTE(review): one base shorter than the minus-strand
                    # request; kept as in the original until the
                    # Genome.sequence convention is confirmed.
                    seq, lost = _fetch_span(hg, exon, left, right - left, False, warning)
                pieces.append(seq)
                missed += lost
            sequence = ''.join(pieces)

            # Frames 1-3 read the sequence as fetched.  For unknown strand the
            # reverse complement is translated too, giving frames 4-6.
            templates = [sequence.upper().replace('T', 'U')]
            if strand == '.':
                templates.append(getReverseComplement(sequence).upper().replace('T', 'U'))

            header = ('>' + transcript + ':' + ordered[-1][0] + ':' + str(LeftEnd)
                      + '-' + str(RightEnd) + '-' + sense)
            frame = 0
            for template in templates:
                for offset in range(3):
                    frame += 1
                    outfile.write(header + '::frame%d' % frame + '\n')
                    outfile.write(_translate_frame(template, offset, CodonDict) + '\n')

    if missed:
        print('%d bases could not be retrieved and were written as N' % missed)