Ejemplo n.º 1
0
 def run(self):  ### Main run method
     '''
     Main run method: interactive pump counter for worms.

     Presents the WormPump menu and then loops on rje.choice() prompts. Entering an ID starts a new worm, an empty
     <ENTER> records one pump for the current worm and X stops counting. Every entry except the final X is stamped
     with time.time(). The counts are then written to self.info['OutFile'] as a delimited table with the fields
     Worm, Count, WormTime (seconds since that worm's ID was entered) and AbsTime (local HH:MM:SS).

     Pumps entered before any worm ID has been given are ignored. Errors are logged and re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         counter = ['>>']    # Flat list: '>>' sentinel, then (entry,timestamp) pairs, then the final 'X'
         menulist = [('F','Change output file name','outfile','OutFile'),('X','Exit','return',''),('R','Run','return','')]
         mchoice = rje_menu.menu(self,'WormPump Menu',menulist,choicetext='Please select:',changecase=True,default='R')
         if mchoice == 'X': return
         self.printLog('#OUT','Output will be to %s' % self.info['OutFile'])
         self.printLog('#START','Initialising counter...')
         ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         wormid = None
         while counter[-1] != 'X':
             if wormid: prompt = 'ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count' % wormid
             else: prompt = 'ID <ENTER> for new worm | X <ENTER> to exit'
             counter.append(rje.choice(prompt,default='').upper())
             if counter[-1]:     # Non-empty entry = new worm ID (or exit)
                 wormid = counter[-1]
                 if wormid == 'X': break     # The exit marker is not given a timestamp
                 self.printLog('#WORM','Worm "%s"' % wormid)
             counter.append(time.time())
             self.deBug(counter)
         ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         head = ['Worm','Count','WormTime','AbsTime']
         rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,rje_backup=True)
         # Walk the list as explicit (entry,timestamp) pairs. Testing each popped value against the sentinels let a
         # skipped pump leave its timestamp behind to be read as a worm ID, and dropped any worm actually named '>>'.
         events = counter[1:]                                # Drop the leading '>>' sentinel
         if events and events[-1] == 'X': events.pop()       # Drop the unpaired exit marker
         wormstart = 0.0
         wormid = None
         wx = 0
         wtot = 0
         for ex in range(0,len(events)-1,2):
             (x,t) = (events[ex],events[ex+1])
             if x:       # New worm: restart its pump count and clock
                 wormid = x
                 wormstart = t
                 wx = 0
                 wtot += 1
             elif not wormid: continue   # Pump recorded before any worm was named
             else: wx += 1
             tt = time.localtime(t)
             wdata = {'Worm':wormid,'Count':wx,'WormTime':t-wormstart,
                      'AbsTime':'%s:%s:%s' % (rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
             rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,datadict=wdata)
         self.printLog('#OUT','Counts for %d worms output to %s' % (wtot,self.info['OutFile']))
         rje.choice('<ENTER> to exit')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Ejemplo n.º 2
0
    def enrichment(self):   ### Performs final enrichment analysis on SLiMDIP and Random datasets.
        '''
        Final enrichment analysis of the SLiMDIP predictions against randomised PPI datasets.

        The "real" predicted DMI come from the slimDIP() method and the background from the randomised PPI datasets
        made by randomisePPI(). The random datasets are to be run through slimDIP() too, giving a background
        distribution for enrichment "p-values" and a summary file for plotting histograms etc. with slimdip.R.

        Required data:
        - slimdip table (or *.slimdip.tdt output file to load).
        - randbase.XX.tdt files.

        << returns True if the required data is in place, else False (including when an error is logged).
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] Make sure the required data exists, running earlier steps where needed ~~~~~ ##
            # Real predicted DMI: use the 'slimdip' table if present, otherwise fall back on slimDIP().
            havedmi = self.db('slimdip') or self.slimDIP()
            if not havedmi: return False
            # Randomised PPI datasets, named randbase.XXX.tdt: a single missing file triggers randomisePPI().
            for rx in range(self.getInt('RandPPI')):
                rbase = self.getStr('RandBase')
                rmax = self.getInt('RandPPI') - 1
                rfile = '%s.%s.tdt' % (rbase,rje.preZero(rx,rmax))
                if rje.exists(rfile): continue
                if not self.randomisePPI(): return False
                break

            ### ~ [1] Perform Enrichment Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Perform SLiMDIP analysis of each random PPI dataset ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # (Not implemented in this method yet.)

            return True
        except:
            self.errorLog('%s.enrichment() error' % self.prog())
            return False
Ejemplo n.º 3
0
    def enrichment(self):  ### Performs final enrichment analysis on SLiMDIP and Random datasets.
        '''
        Performs the final enrichment analysis on the SLiMDIP and Random datasets.

        Needs the "real" predicted DMI from slimDIP() and the randomised PPI datasets from randomisePPI(). The
        randomised datasets are themselves to be processed by slimDIP() to build the background distribution used
        for enrichment "p-values" and for a summary output file (histograms etc. with slimdip.R).

        This method needs:
        - slimdip table (or *.slimdip.tdt output file to load).
        - randbase.XX.tdt files.

        << returns False if a required step fails or an error is logged, else True.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] Check for required data else run preceding step(s) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # No 'slimdip' table yet: slimDIP() must then succeed for the analysis to go ahead.
            if not self.db('slimdip'):
                if not self.slimDIP():
                    return False
            # Look for each randbase.XXX.tdt file; the first one missing triggers randomisePPI() and ends the scan.
            for rnum in range(self.getInt('RandPPI')):
                prefix = self.getStr('RandBase')
                suffix = rje.preZero(rnum, self.getInt('RandPPI') - 1)
                randfile = '%s.%s.tdt' % (prefix, suffix)
                if rje.exists(randfile):
                    continue
                remade = self.randomisePPI()
                if remade:
                    break
                return False

            ### ~ [1] Perform Enrichment Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Perform SLiMDIP analysis of each random PPI dataset ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # (Not implemented in this method yet.)

            return True
        except:
            self.errorLog('%s.enrichment() error' % self.prog())
            return False
Ejemplo n.º 4
0
    def makeFlySeq(self):  ### Main run method
        '''
        Main run method. Makes a FlyBase gene fasta file annotated with relative CDS and exon positions.

        Loads the dmel r5.5 gene, CDS and exon fasta files from the hard-coded FlyBase directory and parses the loc=
        field of each sequence name for its scaffold position. Each CDS and exon is attached to its (first) parent
        gene and its position converted to an offset from the gene start (reversed for complement-strand genes). Genes
        are renamed with Length, CDS and Exons lists and saved to flybase_DROME.genes.fas.

        Errors are logged with self.log.errorLog() and are not re-raised.
        '''
        def parseSpan(pos,patterns):    ### Returns (start,end) integers parsed from a loc= position string
            '''
            >> pos:str = position part of the loc= field.
            >> patterns:list of (regex,complement) tuples, tried in order. Each regex captures two numbers.
            << (start,end) tuple of integers. Complement matches are swapped so that start > end.
            '''
            for (regex,complement) in patterns:
                span = rje.matchExp(regex,pos)
                if not span: continue
                (first,last) = (int(span[0]),int(span[1]))
                if complement: return (last,first)
                return (first,last)
            raise ValueError('Cannot parse sequence position "%s"' % pos)

        def relativePos(gene,seqlist,glen): ### Returns sorted unique 'start-end' offsets of seqlist within gene
            '''
            >> gene:Sequence object with stat['Start'] and opt['Complement'] set.
            >> seqlist:list of Sequence objects (CDS or exons) with stat['Start'] and stat['End'] set.
            >> glen:int = gene length, used by rje.preZero() to pad the numbers.
            '''
            gstart = gene.stat['Start']
            plist = []
            for seq in seqlist:
                if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                    (start,end) = (gstart - seq.stat['Start'],gstart - seq.stat['End'])
                else: (start,end) = (seq.stat['Start'] - gstart,seq.stat['End'] - gstart)
                plist.append('%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen)))
            return rje.sortUnique(plist,xreplace=False)

        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F','seqnr=F','gnspacc=F']
            genes = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-gene-r5.5.fasta' % flybase]+scmd)
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
            exons = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-exon-r5.5.fasta' % flybase]+scmd)
            # Position formats: genes are simple ranges; CDS and exons may span several ranges (first..last used).
            genespan = [(r'^complement\((\d+)\.\.(\d+)\)',True),(r'^(\d+)\.\.(\d+)',False)]
            partspan = [(r'^complement\((\d+)\..*\.(\d+)\)',True),(r'^join\((\d+)\..*\.(\d+)\)',False),
                        (r'^(\d+)\.\.(\d+)',False)]

            ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
            genedict = {}   # Dictionary of {ID:Sequence object}
            (gx,gtot) = (0.0,genes.seqNum())
            for gene in genes.seq:
                self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
                gx += 100
                (acc,scaffold,pos,name,glen) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name'])
                if int(glen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % acc, printerror=False)
                genedict[acc] = gene
                gene.setInfo({'Scaffold':scaffold,'Gene':name})
                (start,end) = parseSpan(pos,genespan)
                gene.opt['Complement'] = start > end        # Sequence on "lagging" strand
                gene.setStat({'Start':start,'End':end})
                gene.list['CDS'] = []       # Will add CDS sequences here
                gene.list['Exon'] = []      # Will add exon sequences here
            self.log.printLog('\r#GENE','Processing Gene Annotation complete!')

            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (cx,ctot) = (0.0,cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (cx/ctot),newline=False,log=False)
                cx += 100
                try: (acc,scaffold,pos,name,glen,parent) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])     # Report the name that could not be parsed
                    raise
                if int(glen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' % acc, printerror=False)
                seq.obj['Parent'] = gene = genedict[parent]
                (start,end) = parseSpan(pos,partspan)
                seq.opt['Complement'] = start > end        # Sequence on "lagging" strand
                seq.setStat({'Start':start,'End':end})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS','Processing CDS Annotation complete!')

            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (ex,etot) = (0.0,exons.seqNum())
            for seq in exons.seq:
                self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (ex/etot),newline=False,log=False)
                ex += 100
                try: (acc,scaffold,pos,name,parent) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])     # Report the name that could not be parsed
                    raise
                seq.obj['Parent'] = gene = genedict[parent.split(',')[0]]  # Only the first listed parent is used
                (start,end) = parseSpan(pos,partspan)
                seq.opt['Complement'] = start > end        # Sequence on "lagging" strand
                seq.setStat({'Start':start,'End':end})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON','Processing Exon Annotation complete!')

            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (gx,gtot) = (0.0,genes.seqNum())
            for gene in genes.seq:
                glen = gene.aaLen()
                self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
                gx += 100
                clist = relativePos(gene,gene.list['CDS'],glen)
                elist = relativePos(gene,gene.list['Exon'],glen)
                gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),','.join(clist),','.join(elist))
            self.log.printLog('\r#GENE','Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except: self.log.errorLog(rje_zen.Zen().wisdom())
Ejemplo n.º 5
0
    def alignmentToLocal(self,alignment=[],protqry=False):    ### Converts alignment into local hits table
        '''
        Converts alignment into local hits table.
        >> alignment:list of alignment text strings parsed from exonerate output. (Copied here, never modified.)
        >> protqry:bool[False] = Whether query is protein
        << returns local database table.

        Each "Query:" line starts a new hit. The aligned query, match and target lines are concatenated into the
        QrySeq, AlnSeq and HitSeq fields, which are then split into one 'local' table entry per exon wherever a
        ">>>> Target Intron N >>>>" marker is found. The combined hit sequence of each alignment is added to
        self.obj['DNAHits'] and its translation to self.obj['ProtHits']. A 'unique' table is also generated with
        self.reduceLocal(byqry=True).

        Raises ValueError for truncated alignments, inconsistent alignment string lengths or unexpected split codon
        brackets. All errors are logged and re-raised.
        '''
        def checkSeqLen(table): ### Raises ValueError if the Qry/Aln/Hit strings of any entry differ in length
            for centry in table.entries():
                (qlen,alen,hlen) = (len(centry['QrySeq']),len(centry['AlnSeq']),len(centry['HitSeq']))
                if qlen != alen or qlen != hlen:
                    self.debug(centry)
                    raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (qlen,alen,hlen))

        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            vfields = ['Qry','Hit','AlnID','Score','Expect','Length','Identity','Positives','QryStart','QryEnd','HitStart','HitEnd','QrySeq','HitSeq','AlnSeq','Rank','Phase','HitStrand']
            vdb = self.db().addEmptyTable('local',vfields,['Qry','Hit','AlnID'])
            alnre = r'^\s+(\d+) : (.+) :\s+(\d+)'              # Numbered sequence line: start : alignment : end
            intronre = r'(\s+>>>> Target Intron \d+ >>>>\s+)'  # Intron marker (with flanking space) in query line
            seqfields = ['HitSeq','QrySeq','AlnSeq']

            ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Example exonerate output:
            #
            #          Query: FAXD1_NOTSC (P82807) Venom prothrombin activator notecarin-D1 [Notechis scutatus scutatus]
            #         Target: ahap_PSETE__EBS10XV2AHAP187 haploidB edges=694320..157489 left=833615 right=281503 ver=1.9 style=4:[revcomp]
            #          Model: protein2genome:local
            #      Raw score: 1170
            #    Query range: 19 -> 295
            #   Target range: 12312786 -> 12307250
            #
            #        20 : AlaGluSerAsnValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg :       37
            #             ..!...|||   ||||||||||||||||||||||||||||||||||||||||||
            #             CysSerSerLeuValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg
            #  12312786 : TGTTCTTCTTTAGTATTCTTAAAAAGCAAAGTGGCAAATAGATTTTTGCAAAGA : 12312735
            #
            #       264 : {G}  >>>> Target Intron 7 >>>>  {ly}GluIleAspIleSerArg :      270
            #             {|}           1304 bp           {||}|||||||||||||||!!!
            #             {G}++                         ++{ly}GluIleAspIleSerSer
            #  12308652 : {G}gt.........................ag{GG}GAAATAGACATATCAAGC : 12307328
            #
            #       289 : ValProProAsnTyrTyrTyr :      295
            #             |||||| !!!..||| !!|||
            #             ValProAlaThrTyrAspTyr
            #  12307273 : GTTCCTGCCACGTATGACTAT : 12307251
            alncount = {}   # Number of alignments seen for each (Qry,Hit) pair
            ventry = {}
            parsing = alignment[0:]
            rank = 1

            while parsing:
                line = parsing.pop(0)
                # Query
                qmatch = rje.matchExp(r'Query: (\S+)',line)
                if qmatch:
                    if ventry: vdb.addEntry(ventry)     # Store previous alignment before starting a new one
                    ventry = {'Qry':qmatch[0],'QrySeq':'','HitSeq':'','AlnSeq':'','Rank':rank}
                    rank += 1
                # Hit
                hmatch = rje.matchExp(r'Target: (\S+)',line)
                if hmatch:
                    ventry['Hit'] = hmatch[0]
                    qh = (ventry['Qry'],ventry['Hit'])
                    if qh in alncount: alncount[qh] += 1
                    else: alncount[qh] = 1
                    ventry['AlnID'] = alncount[qh]
                # Score
                smatch = rje.matchExp(r'core: (\S+)',line)
                if smatch: ventry['Score'] = int(smatch[0])
                # Alignment
                adata = rje.matchExp(alnre,line)
                if adata:
                    start = int(adata[0])
                    end = int(adata[2])
                    aln = adata[1]
                    x = line.find(aln)      # Column at which the alignment text starts
                    if 'QryStart' not in ventry: ventry['QryStart'] = start
                    ventry['QryEnd'] = end
                    ventry['QrySeq'] += aln
                    # The next line is the match line: take the same columns as the query alignment
                    line = parsing.pop(0)
                    ventry['AlnSeq'] += line[x:x+len(aln)]
                    # The numbered target line follows directly, or after one extra un-numbered line
                    adata = rje.matchExp(alnre,parsing.pop(0))
                    if not adata: adata = rje.matchExp(alnre,parsing.pop(0))
                    if not adata: raise ValueError('Partial alignment! Truncated output?')
                    start = int(adata[0])
                    end = int(adata[2])
                    aln = adata[1]
                    if 'HitStart' not in ventry: ventry['HitStart'] = start
                    ventry['HitEnd'] = end
                    ventry['HitSeq'] += aln
            if ventry: vdb.addEntry(ventry)
            ## Seq Check
            checkSeqLen(vdb)

            ### ~ [3] Split on introns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.obj['DNAHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F','dna=T'])
            self.obj['ProtHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F'])

            #i# Protein Position Conversion
            if protqry:
                for ventry in vdb.entries():
                    # 1->1, 2->4, 3->7 = 1+3*(n-1)
                    ventry['QryStart'] = 1+3*(ventry['QryStart']-1)
                    if ventry['QrySeq'].startswith('{'):
                        codend = ventry['QrySeq'].find('}')
                        # {X} = phase 2, find = 2
                        if codend == 2: ventry['QryStart'] += 2
                        # {XX} = phase 1, find = 3
                        elif codend == 3: ventry['QryStart'] += 1
                        else: raise ValueError('QrySeq {} bracket mismatch!: %s' % ventry)
                    ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - ventry['QrySeq'].count('-') - 1

            vdb.newKey(['Qry','Rank','Hit','AlnID'])
            for vkey in vdb.dataKeys():
                ventry = vdb.data(vkey)
                #i# Make a combined hitseq to output to fasta
                #># phap_PSETE__EBS10XV2PHAP187.FAXD1_NOTSC.XXX
                hitname = '%s.ex%s %s %s-%s' % (ventry['Qry'],ventry['Rank'],ventry['Hit'],rje.iStr(ventry['HitStart']),rje.iStr(ventry['HitEnd']))
                hitseq = ''
                phase = (ventry['QryStart'] + 2) % 3    # Number of N to prefix so that translation is in frame
                blockx = 1                              # Counter of alignment blocks (exons) for this hit
                vkeyentries = [ventry]
                dirn = 1
                if ventry['HitEnd'] < ventry['HitStart']:
                    dirn = -1
                    ventry['HitStrand'] = '-'
                else: ventry['HitStrand'] = '+'
                for seq in seqfields:   # Strip split codon brackets
                    ventry[seq] = ventry[seq].replace('}','').replace('{','')
                imatch = rje.matchExp(intronre,ventry['QrySeq'])
                while imatch:
                    intron = imatch[0]
                    x = ventry['QrySeq'].find(intron)
                    y = x + len(intron)
                    intronlen = int(rje.matchExp(r'(\d+) bp',ventry['AlnSeq'][x:y])[0])
                    #i# Create a new entry of the first exon
                    newentry = rje.combineDict({},ventry)
                    for seq in seqfields: newentry[seq] = newentry[seq][:x]
                    newentry['AlnID'] = '%s.%d' % (ventry['AlnID'],blockx); blockx += 1
                    newentry['QryEnd'] = newentry['QryStart'] + len(newentry['QrySeq']) - newentry['QrySeq'].count('-') - 1
                    newentry['HitEnd'] = newentry['HitStart'] + (len(newentry['HitSeq']) - newentry['HitSeq'].count('-') - 1) * dirn
                    newentry['Length'] = x
                    newentry['Identity'] = newentry['AlnSeq'].count('|')
                    vkeyentries.append(vdb.addEntry(newentry))
                    hitseq += newentry['HitSeq']
                    #i# Update ventry to be the rest of the hit
                    for seq in seqfields: ventry[seq] = ventry[seq][y:]
                    ventry['QryStart'] = newentry['QryEnd'] + 1
                    if protqry: ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - ventry['QrySeq'].count('-') - 1
                    # NOTE(review): if HitEnd is the last exon base (inclusive), the next exon might be expected
                    # to start at HitEnd + (intronlen + 1) * dirn - confirm against exonerate coordinates.
                    ventry['HitStart'] = newentry['HitEnd'] + intronlen * dirn
                    imatch = rje.matchExp(intronre,ventry['QrySeq'])
                #i# Calculate length and identity of final exon
                ventry['AlnID'] = '%s.%d' % (ventry['AlnID'],blockx)
                ventry['Length'] = len(ventry['AlnSeq'])
                ventry['Identity'] = ventry['AlnSeq'].count('|')
                #i# Add sequence hits
                hitname += ' (%d alignment blocks)' % blockx
                hitseq += ventry['HitSeq']
                hitseq = hitseq.replace('-','')
                protseq = rje_sequence.dna2prot('%s%s' % ('N' * phase,hitseq))
                self.obj['ProtHits']._addSeq(hitname,protseq)
                if ventry['HitStart'] > ventry['HitEnd']: hitseq = rje_sequence.reverseComplement(hitseq)
                self.obj['DNAHits']._addSeq(hitname,hitseq)

                #i# Update AlnID for proper float sorting
                for kentry in vkeyentries:
                    (vcore,vx) = kentry['AlnID'].split('.')
                    kentry['AlnID'] = '%s.%s' % (vcore,rje.preZero(int(vx),blockx))
            vdb.dataFormat({'AlnID':'string'})
            vdb.remakeKeys()
            self.debug(vdb.dataKeys())

            ## Seq Check
            checkSeqLen(vdb)

            udb = self.reduceLocal(byqry=True)
            udb.rename('unique')
            udb.newKey(['Qry','Rank','Hit','AlnID'])
            self.debug(vdb.dataKeys())

            #i# Calculate exon phase
            for ventry in vdb.entries() + udb.entries(): ventry['Phase'] = (ventry['QryStart'] - 1) % 3

            #i# Protein Position Conversion
            if protqry:     # 1-3 -> 1, 4-6 -> 2 etc. Floor division keeps positions as integers.
                for ventry in vdb.entries():
                    ventry['QryStart'] = (ventry['QryStart']+2)//3
                    ventry['QryEnd'] = (ventry['QryEnd']+2)//3
                for ventry in udb.entries():
                    ventry['QryStart'] = (ventry['QryStart']+2)//3
                    ventry['QryEnd'] = (ventry['QryEnd']+2)//3

            return vdb

        except: self.errorLog('%s.alignmentToLocal error' % self.prog()); raise
Ejemplo n.º 6
0
 def seqSubset2(self):    ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''
     Extracts sequence subset from MOUSE cDNA and Peptide libraries.

     [1-2] Loads the basefile.map.tdt table if it exists. Otherwise maps the genes of the 'starts' table onto MGI
     genes and EnsEMBL gene IDs, writes unmapped genes to ingolia.bad.txt, makes the Ingolia.cdna/pep.all.fa
     sequence subsets (if missing or Force is set) and builds a new 'map' table. EnsEMBL transcript IDs are added
     to the map table as an ENST field if it does not have one.
     [3] For each entry of the 'starts' table, looks for a mapped transcript with the recorded initiation context
     at the recorded start position and keeps the one giving the longest peptide. Transcripts and peptides of at
     least 20 aa are written to IngExact.cdna.all.fa and IngExact.pep.all.fa and the updated table is saved to
     basefile.mapped_exact.tdt.

     Errors are logged with self.errorLog() and are not re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         mapfile = '%s.map.tdt' % self.baseFile()
         if os.path.exists(mapfile):
             mdb = self.db().addTable(mapfile,mainkeys=['Ingolia'],name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             self.db().addTable(xfile,mainkeys=['Gene'],name='xref')    # Was the undefined name db.addTable()
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
             #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
             self.obj['Map'].loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
             ing_genes = ' '.join(self.db('starts').index('Gene').keys()).upper().split()
             genemap = self.obj['Map']
             ing_map = {}
             for gene in ing_genes: ing_map[gene] = genemap.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:    # Loop over a copy: unmapped genes are removed from ing_mgi
                 if gene not in xdb.data():
                     self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene); ing_mgi.remove(gene)
             self.printLog('#BAD','Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
             BAD = open('ingolia.bad.txt','w')
             try: BAD.write(' '.join(bad_genes))
             finally: BAD.close()
             ### ~ [2b] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna','pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 seqout = 'Ingolia.%s.all.fa' % stype
                 # (Re)generate when forced or when the subset is missing. The existence test was previously made
                 # on the input library, so an existing library meant the subset was never made without Force.
                 if self.getBool('Force') or not os.path.exists(seqout):
                     seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autoload=T','seqmode=file','gooddesc=%s' % ','.join(ing_musg)]
                     rje_seqlist.SeqList(self.log,seqcmd)
             # Key must be the 'Ingolia' field, matching the table when reloaded from file (was misspelt 'Ignolia')
             mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autoload=T','seqmode=file']
         iseq = rje_seqlist.SeqList(self.log,seqcmd)
         if 'ENST' not in mdb.fields():
             mdb.addField('ENST',evalue='')
             while iseq.nextSeq():
                 (iname,icdna) = iseq.getSeq()
                 musg = rje.matchExp(r'gene:(\S+)',iname)[0]
                 enst = iname.split()[0]     # Transcript ID = first word of sequence name
                 for entry in mdb.indexEntries('EnsEMBL',musg):
                     if entry['ENST']: entry['ENST'] += ',%s' % enst
                     else: entry['ENST'] = enst
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]':'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI')
         ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
         minpep = 20     # Minimum peptide length for output
         ENST = ENSP = None
         try:    # Output files are closed even if mapping fails part way through
             ENST = open('IngExact.cdna.all.fa','w')
             ENSP = open('IngExact.pep.all.fa','w')
             for entry in sdb.entries():
                 self.progLog('\r#ING','Mapping Ignolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
                 entry[icon] = entry[icon].upper()
                 gene = entry['Gene'].upper()
                 mentry = mdb.data(gene)
                 entry['ENST'] = entry['ENSI'] = ''
                 cdnaseq = peptseq = ''
                 if not mentry or not mentry['ENST']: fx += 1; continue
                 mtype = 'fail'
                 cstart = entry[icod] - 3    # Start of the -3 to +4 context around the start codon position
                 for trans in mentry['ENST'].split(','):
                     (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                     # A negative start would wrap around to the end of the sequence: no context possible
                     if cstart >= 0: context = tseq[cstart:cstart+7]
                     else: context = ''
                     self.deBug('%s vs %s' % (context,entry[icon]))
                     if cstart >= 0 and context == entry[icon]:
                         ipept = rje_sequence.dna2prot(tseq[entry[icod]:]).split('*')[0]
                         self.deBug(ipept)
                         if len(ipept) > len(peptseq):   # Keep the transcript giving the longest peptide
                             entry['ENST'] = trans
                             cdnaseq = tseq
                             peptseq = ipept
                             mtype = 'exact'
                 if not entry['ENST']:
                     self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                     fx += 1; continue
                 elif len(peptseq) < minpep:
                     self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                     fx += 1; continue
                 ingid = rje.preZero(int(ex/100),etot)   # Zero-padded entry number
                 entry['ENSI'] = 'ENSINGT%s' % ingid
                 entry['ENSP'] = 'ENSINGP%s' % ingid
                 ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
                 ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],ingid,entry['Gene'],mentry['Gene'],peptseq))
                 sx += 1
         finally:
             for handle in (ENST,ENSP):
                 if handle: handle.close()
         sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
         return
     except: self.errorLog('%s.method error' % self)
Ejemplo n.º 7
0
 def run(self):  ### Main run method
     '''
     Main run method: interactively time-stamps worm pump counts and outputs them as a delimited table.

     After the start-up menu (choosing X returns immediately) the user is prompted repeatedly. A non-empty entry
     starts a new worm with that (upper-cased) ID, an empty entry counts one pump for the current worm and X stops
     counting. Every entry except the closing X is paired with a time.time() stamp. One row per worm start or pump
     (Worm, Count, WormTime, AbsTime) is then passed to rje.delimitedFileOutput() for self.info['OutFile'].
     Empty entries made before any worm ID has been given are ignored. Errors are logged and re-raised.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Flat event log: '>>' start marker, then <entry>,<timestamp> pairs, then a closing 'X' without timestamp
         counter = ['>>']
         menulist = [('F', 'Change output file name', 'outfile', 'OutFile'),
                     ('X', 'Exit', 'return', ''),
                     ('R', 'Run', 'return', '')]
         mchoice = rje_menu.menu(self, 'WormPump Menu', menulist, choicetext='Please select:',
                                 changecase=True, default='R')
         if mchoice == 'X': return
         self.printLog('#OUT', 'Output will be to %s' % self.info['OutFile'])
         self.printLog('#START', 'Initialising counter...')
         ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         wormid = None
         while counter[-1] != 'X':
             if wormid:
                 prompt = 'ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count' % wormid
             else:
                 prompt = 'ID <ENTER> for new worm | X <ENTER> to exit'
             counter.append(rje.choice(prompt, default='').upper())
             if counter[-1]:
                 wormid = counter[-1]
                 if wormid == 'X': break  # Closing X is deliberately left without a timestamp
                 self.printLog('#WORM', 'Worm "%s"' % wormid)
             counter.append(time.time())
             self.deBug(counter)
         ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         head = ['Worm', 'Count', 'WormTime', 'AbsTime']
         rje.delimitedFileOutput(self, self.info['OutFile'], headers=head, rje_backup=True)
         wormstart = 0.0
         wormid = None
         wx = 0
         wtot = 0
         # Walk the log strictly as (entry, timestamp) pairs. Consuming the timestamp together with its entry
         # means a skipped entry can never leave its timestamp behind to be misread as a worm ID.
         events = counter[1:]  # Drop the '>>' start marker
         for i in range(0, len(events) - 1, 2):  # len - 1 excludes the closing, unpaired 'X'
             (entry, t) = (events[i], events[i + 1])
             if entry:  # New worm: its own timestamp is the zero point for WormTime
                 wormid = entry
                 wormstart = t
                 wx = 0
                 wtot += 1
             elif not wormid:
                 continue  # Pump entered before any worm ID: nothing to attribute it to
             else:
                 wx += 1
             tt = time.localtime(t)
             wdata = {
                 'Worm': wormid,
                 'Count': wx,
                 'WormTime': t - wormstart,
                 # Local wall-clock time as hour:minute:second, zero-padded by rje.preZero()
                 'AbsTime': '%s:%s:%s' % (rje.preZero(tt[3], 24), rje.preZero(tt[4], 60), rje.preZero(tt[5], 60))
             }
             rje.delimitedFileOutput(self, self.info['OutFile'], headers=head, datadict=wdata)
         self.printLog('#OUT', 'Counts for %d worms output to %s' % (wtot, self.info['OutFile']))
         rje.choice('<ENTER> to exit')
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Ejemplo n.º 8
0
    def qsub(self):  ### Creates job and calls with qsub
        '''
        Creates a PBS job script and submits it with qsub.

        Return values:
        - the result of self.report() if the Report option is set (nothing is submitted);
        - False in test mode (script is printed, not queued) or when any error is caught and logged;
        - 0 if JobWait is set and the last line of the job's output file contains "Job complete";
        - otherwise the job ID, taken as the text before the first "." in the qsub output.
        '''
        try:  ### Basics ###
            hr = int(self.stat['Walltime'])
            mins = int((0.5 + (self.stat['Walltime'] - hr) * 60.0))
            if mins >= 60:  # Rounding the fractional hour up must carry into hr, not give e.g. "1:60:00"
                hr += 1
                mins -= 60
            if self.opt['Report']: return self.report()
            jobstr = ('%s.job' % self.info['Job']).replace('.job', '')  # Job name stripped of any ".job"
            jlist = [
                '#!/bin/bash',
                '#PBS -N %s' % jobstr,
                '#PBS -l nodes=%d:ppn=%d' % (self.stat['Nodes'], self.stat['PPN']),
                '#PBS -l walltime=%d:%s:00' % (hr, rje.preZero(mins, 60)),
                '#PBS -l vmem=%dgb' % self.getInt('VMem'),
                '#PBS -l mem=%dgb' % self.getInt('VMem'),
                ''
            ]
            if self.getBool('Monitor'):
                if self.getBool('JobWait'):
                    self.warnLog('Cannot run with wait=T and monitor=T: switched monitor=F')
                    self.setBool({'Monitor': False})
                else:
                    jlist += ['#PBS -k oed']
            if self.getStr('Email'):
                jlist += ['#PBS -M %s' % self.getStr('Email'), '#PBS -m ae']
                if self.getBool('MailStart'): jlist[-1] = '#PBS -m bae'
            jlist += [
                '### Define number of processors',
                'NPROCS=`wc -l < $PBS_NODEFILE`',
                'echo Running on host `hostname`',
                'echo Time is `date`',
                'echo Directory is `pwd`',
                'echo This jobs runs on the following processors:',
                'echo `cat $PBS_NODEFILE`',
                '',
                'echo This job has allocated $NPROCS cpus',
                ''
            ]
            self.printLog('#PPN', '%d Node(s) requested: %d PPN.' % (self.getInt('Nodes'), self.getInt('PPN')))
            self.printLog('#VMEM', '%s GB VMem requested.' % (self.getStat('VMem')))
            if self.getBool('ModPurge'):
                jlist.append('module purge')
                self.printLog('#MOD', 'Modules purged (modpurge=T)')
            for mod in self.list['Modules']:
                if mod.lower() not in ['', 'none']:
                    jlist.append('module add %s' % mod)
            if self.list['Modules']:
                self.printLog('#MOD', 'Modules added: %s' % '; '.join(self.list['Modules']))
            for pcall in self.list['PreCall']:
                self.printLog('#PCALL', pcall)
                jlist.append(pcall)
            ### Directory & Program ###
            jlist.append('cd %s' % self.info['QPath'])
            pcall = self.info['Program']
            if self.opt['RjePy']:
                pcall = 'python ' + self.info['PyPath'] + pcall
            jlist.append(pcall)
            ### Completion message
            jlist += ['', 'echo ---', 'qstat -f $PBS_JOBID', 'echo ---']
            # "Job complete" must stay the final line: the JobWait check below reads the tail of the output file
            jlist += ['', 'echo', 'echo Time is `date`', 'echo Job complete']
            ### Output and call ###
            job = '{0}.job'.format(jobstr)
            # Close the script explicitly so it is fully flushed before qsub reads it
            with open(job, 'w') as jobfile:
                jobfile.write('\n'.join(jlist))
            self.printLog('#DIR', self.info['QPath'])
            self.printLog('#RUN', pcall)
            qsub = 'qsub'
            if self.getBool('StartBash'): qsub += ' -S /bin/bash'
            if self.list['Depend']:
                qsub += ' -W depend=afterany'
                myhost = self.getStr('DependHPC')
                if not self.getStrLC('DependHPC'):
                    myhost = os.popen('hostname').read().split()[0]
                for depid in self.list['Depend']:
                    qsub += ':%s.%s' % (depid, myhost)
            qsub += ' %s' % (job)
            self.printLog('#JOB', qsub)
            if self.test():
                self.printLog('#TEST', 'Test mode: will not place job in queue.')
                self.verbose(0, 1, '\n'.join(['>>>>>'] + jlist + ['<<<<<', '']))
                return False
            qrun = os.popen(qsub).read()
            self.printLog('#QSUB', qrun)
            qid = qrun.split('.')[0]
            showstart = 'qstat -T'
            if os.popen('hostname').read().startswith('katana.science.unsw.edu.au'):
                showstart = 'showstart'
            self.printLog('#SHOW', 'Attempt %s %s in %s sec' % (showstart, qrun, self.stat['Pause']), log=False)
            time.sleep(self.stat['Pause'])
            for qline in os.popen('%s %s' % (showstart, qrun)):
                if rje.chomp(qline):
                    self.printLog('#INFO', qline, timeout=False)

            ### Wait for job to be completed
            if self.getBool('JobWait'):
                if self.getBool('Monitor'):
                    raise ValueError('Cannot run with wait=T and monitor=T')
                self.printLog('#WAIT', 'Waiting for job {0} to finish'.format(qid))
                ofile = '{0}.o{1}'.format(jobstr, qid)
                running = False
                while not rje.exists(ofile):
                    # Number of qstat lines starting with the job ID: zero means the job has left the queue
                    qcount = int(os.popen("qstat | grep '^{0}' -c".format(qid)).read().split()[0])
                    if not qcount:
                        self.printLog('#QSTAT', 'Job {0} disappeared from qstat'.format(qid))
                        break
                    elif not running:
                        try:
                            # Fifth whitespace-separated field of the job's qstat line is compared with 'R'.
                            # It must stay a plain string: wrapping it in a list can never equal 'R'.
                            qstate = os.popen("qstat | grep '^{0}'".format(qid)).read().split()[4]
                            if qstate == 'R':
                                running = True
                                self.printLog('#QSTAT', 'Job {0} running...'.format(qid))
                        except Exception:
                            pass  # Best-effort probe only: the job may vanish between the two qstat calls
                    time.sleep(max(1, self.getInt('Pause')))
                # Allow up to 300 further one-second checks for the output file to appear
                owait = 300
                while owait and not rje.exists(ofile):
                    owait -= 1
                    time.sleep(1)
                if rje.exists(ofile):
                    if 'Job complete' in os.popen('tail -n 1 {0}'.format(ofile)).read():
                        self.printLog('#DONE', '{0} job ({1}) complete.'.format(jobstr, qid))
                        return 0
                    else:
                        self.printLog('#FAIL', '{0} job ({1}) failed to finish.'.format(jobstr, qid))
                        return qid
                else:
                    self.printLog('#FAIL', '{0} job ({1}) failed to generate {2}.'.format(jobstr, qid, ofile))

            return qid
        except:
            self.errorLog('Error in qsub()')
            return False
Ejemplo n.º 9
0
    def makeFlySeq(self):  ### Main run method
        '''
        Main run method: renames FlyBase gene sequences with their relative CDS and exon positions.

        Loads the r5.5 gene, CDS and exon fasta files from the hard-coded FlyBase directory, parses the loc= field
        of each sequence name, attaches CDS and exon sequences to their parent gene and renames each gene as
        "<Gene>_<SpecCode>__<AccNum> Length=..; CDS=..; Exons=..;" before saving all genes to
        flybase_DROME.genes.fas. Any exception is logged through self.log.errorLog() and is not re-raised.
        '''
        def parseSpan(pos, patterns):
            '''
            Returns (start, end) as integers parsed from location string pos.
            patterns is a list of (regex, swapped) pairs tried in order; swapped means the regex captures
            (end, start). The last pattern is applied unguarded so an unparseable location still raises.
            '''
            for (regex, swapped) in patterns[:-1]:
                try: (first, last) = rje.matchExp(regex, pos)
                except Exception: continue  # No usable match: try the next pattern
                break
            else:
                (regex, swapped) = patterns[-1]
                (first, last) = rje.matchExp(regex, pos)
            if swapped: (first, last) = (last, first)
            return (int(first), int(last))

        def relPos(gene, seq, glen):
            '''Returns "start-end" of seq relative to the gene Start, zero-padded with rje.preZero(x, glen).'''
            if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                start = gene.stat['Start'] - seq.stat['Start']
                end = gene.stat['Start'] - seq.stat['End']
            else:
                start = seq.stat['Start'] - gene.stat['Start']
                end = seq.stat['End'] - gene.stat['Start']
            return '%s-%s' % (rje.preZero(start, glen), rje.preZero(end, glen))

        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            genes = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%sdmel-all-gene-r5.5.fasta' % flybase] + scmd)
            cds = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            exons = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%sdmel-all-exon-r5.5.fasta' % flybase] + scmd)
            # Location patterns as raw strings; runtime values are unchanged. Complement spans capture (end, start).
            genespan = [(r'^complement\((\d+)\.\.(\d+)\)', True),
                        (r'^(\d+)\.\.(\d+)', False)]
            partspan = [(r'^complement\((\d+)\..*\.(\d+)\)', True),
                        (r'^join\((\d+)\..*\.(\d+)\)', False),
                        (r'^(\d+)\.\.(\d+)', False)]

            ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
            genedict = {}  # Dictionary of {ID:Sequence object}
            (gx, gtot) = (0.0, genes.seqNum())
            for gene in genes.seq:
                self.log.printLog('\r#GENE', 'Processing Gene Annotation: %.1f%%' % (gx / gtot),
                                  newline=False, log=False)
                gx += 100
                (gid, scaffold, pos, name, glen) = rje.matchExp(
                    r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);', gene.info['Name'])
                if int(glen) != gene.aaLen():
                    self.log.errorLog('%s Length mismatch!' % gid, printerror=False)
                genedict[gid] = gene
                gene.setInfo({'Scaffold': scaffold, 'Gene': name})
                (start, end) = parseSpan(pos, genespan)
                gene.opt['Complement'] = start > end  # Sequence on "lagging" strand
                gene.setStat({'Start': start, 'End': end})
                gene.list['CDS'] = []  # Will add CDS sequences here
                gene.list['Exon'] = []  # Will add exon sequences here
            self.log.printLog('\r#GENE', 'Processing Gene Annotation complete!')

            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (cx, ctot) = (0.0, cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#CDS', 'Processing CDS Annotation: %.1f%%' % (cx / ctot),
                                  newline=False, log=False)
                cx += 100
                try:
                    (sid, scaffold, pos, name, glen, parent) = rje.matchExp(
                        r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])  # Report the offending sequence name before aborting
                    raise
                if int(glen) != seq.aaLen():
                    self.log.errorLog('%s Length mismatch!' % sid, printerror=False)
                seq.obj['Parent'] = gene = genedict[parent]
                (start, end) = parseSpan(pos, partspan)
                seq.opt['Complement'] = start > end  # Sequence on "lagging" strand
                seq.setStat({'Start': start, 'End': end})
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS', 'Processing CDS Annotation complete!')

            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (ex, etot) = (0.0, exons.seqNum())
            for seq in exons.seq:
                self.log.printLog('\r#EXON', 'Processing Exon Annotation: %.1f%%' % (ex / etot),
                                  newline=False, log=False)
                ex += 100
                try:
                    (sid, scaffold, pos, name, parent) = rje.matchExp(
                        r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);', seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])  # Report the offending sequence name before aborting
                    raise
                seq.obj['Parent'] = gene = genedict[parent.split(',')[0]]  # First listed parent only
                (start, end) = parseSpan(pos, partspan)
                seq.opt['Complement'] = start > end  # Sequence on "lagging" strand
                seq.setStat({'Start': start, 'End': end})
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON', 'Processing Exon Annotation complete!')

            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (gx, gtot) = (0.0, genes.seqNum())
            for gene in genes.seq:
                glen = gene.aaLen()
                self.log.printLog('\r#GENE', 'Generating new Gene Annotation: %.1f%%' % (gx / gtot),
                                  newline=False, log=False)
                gx += 100
                clist = rje.sortUnique([relPos(gene, seq, glen) for seq in gene.list['CDS']], xreplace=False)
                elist = rje.sortUnique([relPos(gene, seq, glen) for seq in gene.list['Exon']], xreplace=False)
                gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (
                    gene.info['Gene'], gene.info['SpecCode'], gene.info['AccNum'], gene.aaLen(),
                    ','.join(clist), ','.join(elist))
            self.log.printLog('\r#GENE', 'Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile='flybase_DROME.genes.fas')

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Ejemplo n.º 10
0
 def seqSubset2(self):  ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''
     Extracts sequence subset from MOUSE cDNA and Peptide libraries.

     Steps:
     1. Load the 'map' table (keyed on Ingolia gene) from <basefile>.map.tdt if that file exists. Otherwise build
        it from the MGI xref table and GeneMap aliases, and extract the Ingolia.cdna/pep.all.fa subsets from the
        MOUSE EnsEMBL fasta files (when forced or when the subset file is missing).
     2. If the map table has no ENST field, fill it with the comma-separated names of the Ingolia.cdna.all.fa
        transcripts of each EnsEMBL gene and save the table.
     3. For each entry of the 'starts' table, look for mapped transcripts whose 7 nt context at the initiation
        codon equals the reported context and keep the one giving the longest peptide. Entries with a peptide of
        at least 20 aa are written to IngExact.cdna.all.fa / IngExact.pep.all.fa; the table is saved as
        <basefile>.mapped_exact.tdt.
     Any exception is logged through self.errorLog() and is not re-raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         mapfile = '%s.map.tdt' % self.baseFile()
         if os.path.exists(mapfile):
             mdb = self.db().addTable(mapfile, mainkeys=['Ingolia'], name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             # Was a call on the undefined name "db"; the table is fetched again by name as xdb below
             self.db().addTable(xfile, mainkeys=['Gene'], name='xref')
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
             self.obj['Map'].loadData(['sourcedata=%s' % xfile, 'aliases=%s' % afile])
             # Upper-cased gene names, split on whitespace
             ing_genes = ' '.join(self.db('starts').index('Gene').keys()).upper().split()
             genemap = self.obj['Map']
             ing_map = {}
             for gene in ing_genes:
                 ing_map[gene] = genemap.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog('#MUSG', '%s Ingolia genes mapped onto %s MGI genes' %
                           (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:  # Iterate over a copy: unmapped genes are removed from ing_mgi itself
                 if gene not in xdb.data():
                     self.printLog('#MAP', 'Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene)
                     ing_mgi.remove(gene)
             self.printLog('#BAD', 'Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
             with open('ingolia.bad.txt', 'w') as badfile:
                 badfile.write(' '.join(bad_genes))
             ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi), 'EnsEMBL', sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog('#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' %
                           (rje.iLen(ing_genes), rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError('No EnsEMBL genes mapped from Ingolia data')
             self.deBug(ing_musg[:10])
             for stype in ['cdna', 'pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 seqout = 'Ingolia.%s.all.fa' % stype
                 # (Re)generate when forced or when the OUTPUT is missing. Testing the input file here meant
                 # the subset was never built unless forced.
                 if self.getBool('Force') or not os.path.exists(seqout):
                     # NOTE(review): option was spelt "autload"; "autoload" assumed to be the intended
                     # rje_seqlist option name - confirm against rje_seqlist.
                     seqcmd = self.cmd_list + [
                         'seqin=%s' % seqfile,
                         'seqout=%s' % seqout, 'autofilter=T', 'autoload=T',
                         'seqmode=file',
                         'gooddesc=%s' % ','.join(ing_musg)
                     ]
                     rje_seqlist.SeqList(self.log, seqcmd)
             # Key field must be an existing field name: 'Ingolia' (was misspelt 'Ignolia')
             mdb = self.db().addEmptyTable('map', ['Ingolia', 'Gene', 'EnsEMBL'], ['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         # NOTE(review): "autoload" spelling assumed, as above - confirm against rje_seqlist.
         seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'autofilter=F', 'autoload=T', 'seqmode=file']
         iseq = rje_seqlist.SeqList(self.log, seqcmd)
         if 'ENST' not in mdb.fields():
             mdb.addField('ENST', evalue='')
             while iseq.nextSeq():
                 (iname, icdna) = iseq.getSeq()
                 musg = rje.matchExp(r'gene:(\S+)', iname)[0]
                 tname = iname.split()[0]  # First word of the sequence name
                 for entry in mdb.indexEntries('EnsEMBL', musg):
                     if entry['ENST']: entry['ENST'] += ',%s' % tname
                     else: entry['ENST'] = tname
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]': 'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST')
         sdb.addField('ENSP')
         sdb.addField('ENSI')
         ex = 0.0
         etot = sdb.entryNum()
         sx = 0  # Entries written to the output fasta files
         fx = 0  # Entries that could not be mapped or gave too short a peptide
         minpep = 20  # Minimum peptide length for output
         ENST = ENSP = None
         try:  # Guarantee both output handles are closed even if mapping fails part-way
             ENST = open('IngExact.cdna.all.fa', 'w')
             ENSP = open('IngExact.pep.all.fa', 'w')
             for entry in sdb.entries():
                 self.progLog('\r#ING', 'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
                 ex += 100.0
                 entry[icon] = entry[icon].upper()
                 gene = entry['Gene'].upper()
                 mentry = mdb.data(gene)
                 entry['ENST'] = entry['ENSI'] = ''
                 cdnaseq = peptseq = ''
                 if not mentry or not mentry['ENST']:
                     fx += 1
                     continue
                 mtype = 'fail'
                 start = entry[icod]
                 for trans in mentry['ENST'].split(','):
                     (tname, tseq) = iseq.getDictSeq(trans, format='tuple')
                     # NOTE(review): a start below 3 makes the slice index negative (counts from the end of
                     # the transcript); presumably such starts do not occur - confirm against the input data.
                     context = tseq[start - 3:][:7]
                     self.deBug('%s vs %s' % (context, entry[icon]))
                     if context == entry[icon]:
                         # Translate from the start position and keep the peptide up to the first '*'
                         ipept = rje_sequence.dna2prot(tseq[start:]).split('*')[0]
                         self.deBug(ipept)
                         if len(ipept) > len(peptseq):  # Keep the transcript giving the longest peptide
                             entry['ENST'] = trans
                             cdnaseq = tseq
                             peptseq = ipept
                             mtype = 'exact'
                 if not entry['ENST']:
                     self.printLog('\r#ING', 'Unable to find Harrington start for %s %s (%s)' %
                                   (gene, entry[icod], entry[icon]), screen=False)
                     fx += 1
                     continue
                 elif len(peptseq) < minpep:
                     self.printLog('\r#ING', 'Peptide from mapped Harrington start for %s %s (%s) too short!' %
                                   (gene, entry[icod], entry[icon]), screen=False)
                     fx += 1
                     continue
                 ingid = rje.preZero(int(ex / 100), etot)  # ex was already advanced: 1-based entry number
                 entry['ENSI'] = 'ENSINGT%s' % ingid
                 entry['ENSP'] = 'ENSINGP%s' % ingid
                 ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' %
                            (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], entry['Gene'], mentry['Gene'],
                             cdnaseq))
                 ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' %
                            (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], ingid, entry['Gene'],
                             mentry['Gene'], peptseq))
                 sx += 1
         finally:
             for handle in (ENST, ENSP):
                 if handle: handle.close()
         sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         self.printLog('\r#ING', 'Output %s Ingolia peptides and transcripts. %s failed.' %
                       (rje.iStr(sx), rje.iStr(fx)))
         return
     except:
         self.errorLog('%s.method error' % self)