Exemple #1
0
 def restAPI(self):  ### Make a rest call and update RestIn with JobID if successful
     '''
     Submit the REST call built from RestIn, then poll the check URL until the job leaves the queue.
     Returns the job ID string if the server reports 'Finished' (RestIn, and RestBase if unset, are updated
     to the job ID); returns False if any other status is reported or an error is caught.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         submiturl = '%s&rest=jobid' % self.getStr('RestIn')
         self.printLog('#REST',submiturl)
         wait = self.getInt('Refresh')
         ### ~ [1] Set job running ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         jobid = rje.chomp(urllib2.urlopen(submiturl).read())
         self.printLog('#JOBID',jobid)
         ### ~ [2] Wait for completion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         statusurl = '%scheck&jobid=%s' % (self.getStr('RestURL'),jobid)
         self.printLog('#CHECK',statusurl)
         while True:
             status = rje.chomp(urllib2.urlopen(statusurl).read())
             if status not in ['Queued','Running']: break
             self.progLog('\r#RUN',status)
             time.sleep(wait)
             wait = min(self.getInt('MaxRefresh'),wait*2)    # Double the pause each round, capped at MaxRefresh
         ### ~ [3] Return JobID if finished ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if status != 'Finished':
             self.printLog('#FAIL','REST check error: %s' % status)
             return False
         self.printLog('\r#RUN','REST call complete: restin=%s' % jobid)
         self.setStr({'RestIn':jobid})
         if not self.getStrLC('RestBase'): self.setStr({'RestBase':jobid})
         return jobid
     except: self.errorLog('%s.restAPI error' % self)
     return False
Exemple #2
0
    def parseOMIM(self):    ### Main parsing method
        '''
        Main parsing method. Reads the OMIM file named by self.info['Name'] and populates:
        - self.dict['Records'] = {gene:[record IDs]}
        - self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}
        Only variants whose description line matches "GENE, XXX123YYY", with both three-letter codes present in
        the (upper-cased) values of rje_sequence.aa_code_3, are stored. Calls self.saveMutations() when finished.
        Any error is logged and re-raised.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Records'] = {}
            self.dict['Mutations'] = {}
            aas = string.split(string.join(rje_sequence.aa_code_3.values()).upper())
            omimfile = self.info['Name']
            # Count lines for the progress percentage; the counting handle is closed immediately.
            with open(omimfile,'r') as COUNT: olen = len(COUNT.readlines())
            if olen < 1: olen = 1   # Empty file: avoid ZeroDivisionError in the progress message
            (ox,mx) = (0.0,0)
            OMIM = open(omimfile,'r')
            try:
                ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                record = gene = subid = disease = mutation = ''
                av = False      # Whether reading *FIELD* AV for mutation data
                oline = True
                while oline:
                    oline = OMIM.readline()
                    self.log.printLog('\r#OMIM','Processing OMIM: %.2f%% (%s genes)' % (ox/olen,rje.integerString(len(self.dict['Records']))),newline=False,log=False)
                    ox += 100.0
                    # Outside an AV field only '*'-prefixed marker lines are of interest
                    if not av and oline[:1] != '*': continue
                    line = rje.chomp(oline)
                    while line[-1:] == ' ': line = line[:-1]
                    ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if line == '*RECORD*': (record,av) = ('',False)
                    elif line == '*FIELD* NO':    # New record: ID is on the following line
                        record = rje.chomp(OMIM.readline())
                        gene = ''
                        ox += 100.0
                    ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* TI':      # New gene: last word of the following line
                        gene = string.split(rje.chomp(OMIM.readline()))[-1]
                        subid = ''
                        av = False
                        ox += 100.0
                    ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* AV': av = True        # Start of mutation records
                    elif av and rje.matchExp('^(\.\d+)',line):  # New subid mutation record
                        subid = rje.matchExp('^(\.\d+)',line)[0]
                        disease = rje.chomp(OMIM.readline())
                        ox += 100.0
                        try: mutation = rje.matchExp('^%s, (\D\D\D\d+\D\D\D)' % gene,rje.chomp(OMIM.readline()))[0]
                        except: continue    # No mutation or not coding change
                        ox += 100.0
                        subaa = rje.matchExp('(\D\D\D)\d+(\D\D\D)',mutation)
                        if subaa[0] not in aas or subaa[1] not in aas: continue
                        if gene not in self.dict['Records']: self.dict['Records'][gene] = [record]
                        if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record]
                        if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {}
                        mx += 1
                        self.dict['Mutations'][gene][subid] = (disease,mutation)
            finally: OMIM.close()   # Close the file even if parsing raised

            ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.log.printLog('\r#OMIM','Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len(self.dict['Records'])),rje.integerString(mx)))
            self.saveMutations()
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible
Exemple #3
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.
     Loads optional protein descriptions into self.dict['ProtDesc'], then looks for existing 'taxa' and 'taxamap'
     tables (falling back to TaxBase.*.tdt files unless force is set). If both tables are found their 'boot'
     fields are formatted as numbers and setup ends; otherwise a Taxonomy object is created and set up.
     Returns True on success, False if an error was caught.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [0a] Protein descriptions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['ProtDesc'] = {}
         if self.getStrLC('ProtDesc'):
             with open(self.getStr('ProtDesc'),'r') as DESC:
                 for fline in DESC.readlines():
                     if not string.split(rje.chomp(fline)): continue    # Blank lines should not abort setup
                     [prot,desc] = string.split(rje.chomp(fline),maxsplit=1)
                     self.dict['ProtDesc'][prot] = desc
             #self.db().addTable(self.getStr('ProtDesc'),mainkeys=['protein'],datakeys='All',headers=['protein','description'],ignore=['#'],name='protdesc',expect=True)
         ## ~ [0b] Look for previous run results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         db = self.db()  # Database object for addTable(); was previously used without being bound in this method
         taxdb = self.db('taxa',add=True,forcecheck=True,mainkeys=['spcode'])
         if not taxdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.taxa.tdt' % self.getStr('TaxBase')
             taxdb = db.addTable(spfile,mainkeys=['spcode'],name='taxa',expect=False)
         mapdb = self.db('taxamap',add=True,forcecheck=True,mainkeys=['protein'])
         if not mapdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.taxamap.tdt' % self.getStr('TaxBase')
             mapdb = db.addTable(spfile,mainkeys=['protein'],name='taxamap',expect=False)
         if taxdb and mapdb:
             taxdb.dataFormat({'boot':'num'})
             mapdb.dataFormat({'boot':'num'})
             return True
         ## ~ [0c] Taxonomy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.obj['Taxonomy'] = rje_taxonomy.Taxonomy(self.log,self.cmd_list)
         self.obj['Taxonomy'].setup(force=False)
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
Exemple #4
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method. Normalises the RestIn setting.
     - If RestIn starts with 'http:', each '&'-separated opt=file:FILE element is replaced by opt=CONTENT,
       where CONTENT is the file's lines joined with a literal backslash-n and any '&' converted to '+'.
     - Otherwise RestIn is passed through rje.makePath(...,True).
     Returns True on success, False if an error was caught.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] Direct Parsing of output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not self.getStr('RestIn').startswith('http:'):   # Convert to file
             self.setStr({'RestIn':rje.makePath(self.getStr('RestIn'),True)})
             return True     # Setup successful
         ## ~ [1b] Check and modify URL if required ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #!# Check for rest URL and add if missing
         #!# Split on &
         urlparts = string.split(self.getStr('RestIn'),'&')
         for (px,part) in enumerate(urlparts):
             if '=' not in part: continue
             (opt,value) = string.split(part,'=',1)
             if not value.startswith('file:'): continue
             # Conversion of cmd=file:FILE into cmd=CONTENT
             rfile = string.split(value,':',1)[1]
             #!# Consider adding max size constraint. Probably a URL size limit.
             if not rje.exists(rfile):
                 self.warnLog('File "%s" not found.' % rfile,quitchoice=True)
                 continue
             newpart = '%s=%s' % (opt,rje.chomp(string.join(open(rfile,'r').readlines(),'\\n')))
             if '&' in newpart:  # '&' would be read as a URL option separator
                 self.warnLog('%s "&" => "+" conversions for %s.' % (rje.iStr(newpart.count('&')),rfile))
                 newpart = string.replace(newpart,'&','+')
             urlparts[px] = newpart
         self.setStr({'RestIn':string.join(urlparts,'&')})
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Exemple #5
0
 def report(self):  ### Run qstat to get job list then showstart on each job
     '''
     Lists queued jobs by parsing `qstat` output, then logs the non-blank `showstart` output for each job ID.
     Errors are logged rather than raised; nothing is returned.
     '''
     try:  ### ~ [1] ~ Read in List of IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         jobids = []     # Job IDs in qstat order
         jobnames = {}   # {job ID: job name}
         for statline in os.popen('qstat'):
             try:
                 (qid, job) = rje.matchExp('^(\d+)\.\S+\s+(\S+)', statline)
             except:
                 continue    # Not a job line
             jobids.append(qid)
             jobnames[qid] = job
         ### ~ [2] ~ Report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#QSTAT', '%d jobs in queue.' % len(jobids))
         for qid in jobids:
             self.printLog('#JOB', '%s = %s' % (qid, jobnames[qid]), timeout=False)
             for showline in os.popen('showstart %s' % qid):
                 if not rje.chomp(showline):
                     continue    # Skip blank output lines
                 self.printLog('#INFO', showline, timeout=False)
         self.printLog('#ZEN', rje_zen.Zen().wisdom())
     except:
         self.errorLog('QSub.report problem')
Exemple #6
0
 def classify(self):  ### Generate summary tables for each protein class
     '''
     Generate summary tables for each protein class.
     For every file in self.list['Classify'], the first whitespace-delimited word of each line is read as a
     protein ID. The 'taxamap' table is copied under the file's base name, reduced to those proteins and passed
     to self.summaryScores(). Files that give no proteins, or no matching table entries, are skipped with a
     warning. Errors are logged rather than raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         rankdb = self.db('taxamap')
         for cfile in self.list['Classify']:
             pclass = rje.baseFile(cfile, strip_path=True)
             clist = []
             with open(cfile, 'r') as CFILE:
                 for fline in CFILE.readlines():
                     # Blank lines split to an empty list: skip them rather than fail on [0]
                     fields = string.split(rje.chomp(fline), maxsplit=1)
                     if fields: clist.append(fields[0])
             self.printLog(
                 '#CLASS', '%s "%s" class proteins read from %s' %
                 (rje.iLen(clist), pclass, cfile))
             if not clist:
                 self.warnLog('No proteins read from %s' % (cfile))
                 continue
             classdb = db.copyTable(rankdb, pclass)
             classdb.dropEntriesDirect('protein', clist, inverse=True)
             if not classdb.entries():
                 self.warnLog('No "%s" proteins found in TaxaMap table' %
                              (pclass))
                 continue
             self.summaryScores(classdb, pclass, 'MinClass')
     except:
         self.errorLog('%s.classify() error' % self.prog())
Exemple #7
0
 def readAAProp(self, filename=None):  ### Reads AA Property Matrix from file
     '''
     Reads AA Property Matrix from file into self.alphabet (list) and self.prop ({property:{aa:value}}).
     >> filename:str = Filename. If None, will use self.info['Name']
     An IOError is logged and re-raised; any other error is logged and the method returns early.
     On success 'X' and '-' are added to the alphabet if missing and self.makePropDif() is called.
     '''
     try:
         ### <a> ### Load and read
         if not filename:
             filename = self.info['Name']
         else:
             self.info['Name'] = filename
         readtxt = 'Reading AA Properties from %s...' % filename
         self.progLog('\r#AAPROP', readtxt)
         proplines = self.loadFromFile(filename, v=2)
         ### <b> ### Process
         self.alphabet = []
         self.prop = {}
         ## <i> ## Properties and alphabet
         for rawline in proplines:
             line = rje.chomp(rawline)
             if line.startswith('#'):  # Comment line
                 continue
             if line.startswith('PROP'):  # Header line - has amino acids
                 rest = rje.matchExp('^\S+(\s.+)', line)[0]
                 # Peel one single-character column off the front at a time
                 while re.search('^\s+\S.*', rest):
                     (aa, rest) = rje.matchExp('^\s+(\S)(.*)', rest)
                     self.alphabet.append(aa)
                 readtxt += ' ...%s' % string.join(self.alphabet)
                 self.progLog('\r#AAPROP', readtxt)
                 continue
             if not (re.search('^\S', line) and self.alphabet):
                 continue  # Not a property line, or no header read yet
             # Property line: name followed by one single-character value per alphabet letter
             (aaproperty, rest) = rje.matchExp('^(\S+)(\s.+)', line)
             readtxt += ' ...%s' % aaproperty
             self.progLog('\r#AAPROP', readtxt)
             values = {}
             self.prop[aaproperty] = values
             for aa in self.alphabet:
                 (values[aa], rest) = rje.matchExp('^\s+(\S)(.*)', rest)
         readtxt += ' ...Done!'
         self.printLog('\r#AAPROP', readtxt)
     except IOError:
         self.log.errorLog(
             'AA Property matrix file %s missing?' % self.info['Name'],
             True)
         raise
     except:
         self.log.errorLog(
             'Major Problem reading AA Property matrix(%s)' %
             self.info['Name'], True)
         return
     # Make sure the unknown ('X') and gap ('-') symbols are part of the alphabet
     missing = [symbol for symbol in ['X', '-'] if symbol not in self.alphabet]
     if missing:
         self.useAlphabet(alphabet=self.alphabet + missing)
     self.makePropDif()
Exemple #8
0
 def qsub(self):      ### Creates job and calls with qsub
     '''
     Creates a PBS job script and submits it with qsub.
     Returns the result of self.report() if the Report option is set; False in test mode or if an error was
     caught; otherwise the job ID taken from the qsub output (the text before the first '.').
     '''
     try:### Basics ###
         # Walltime is in decimal hours. Rounding the fractional part can give 60 minutes (e.g. 1.999 hours),
         # which must roll over into the hour rather than be written as "1:60:00".
         hr = int(self.stat['Walltime'])
         mins = int((0.5+(self.stat['Walltime'] - hr)*60.0))
         if mins >= 60: (hr,mins) = (hr + 1,mins - 60)
         if self.opt['Report']: return self.report()
         jlist = ['#!/bin/bash',
                  '#PBS -N %s' % string.replace('%s.job' % self.info['Job'],'.job',''),  #,'#PBS -q batch',
                  '#PBS -l nodes=%d:ppn=%d' % (self.stat['Nodes'],self.stat['PPN']),
                  '#PBS -l walltime=%d:%d:00' % (hr,mins),'#PBS -l vmem=%dgb' % self.getInt('VMem'),'']     #10
         if self.getStr('Email'):
             jlist += ['#PBS -M %s' % self.getStr('Email'),'#PBS -m ae']
             if self.getBool('MailStart'): jlist[-1] = '#PBS -m bae'
         jlist += ['### Define number of processors','NPROCS=`wc -l < $PBS_NODEFILE`',
                   'echo Running on host `hostname`','echo Time is `date`','echo Directory is `pwd`', #2
                   'echo This jobs runs on the following processors:','echo `cat $PBS_NODEFILE`','',                #5
                   'echo This job has allocated $NPROCS cpus','']
         self.printLog('#PPN','%d Node(s) requested: %d PPN.' % (self.getInt('Nodes'),self.getInt('PPN')))
         self.printLog('#VMEM','%s GB VMem requested.' % (self.getStat('VMem')))
         if self.getBool('ModPurge'):
             jlist.append('module purge')
             self.printLog('#MOD','Modules purged (modpurge=T)')
         for mod in self.list['Modules']:
             if mod.lower() not in ['','none']: jlist.append('module add %s' % mod)
         if self.list['Modules']: self.printLog('#MOD','Modules added: %s' % string.join(self.list['Modules'],'; '))
         for pcall in self.list['PreCall']:
             self.printLog('#PCALL',pcall)
             jlist.append(pcall)
         #x#jlist = ['#!/bin/sh']   # New Iridis shell script method!
         ### Directory & Program ###
         jlist.append('cd %s' % self.info['QPath'])
         pcall = self.info['Program']
         if self.opt['RjePy']: pcall = 'python ' + self.info['PyPath'] + pcall
         jlist.append(pcall)
         ### Output and call ###
         job = string.replace('%s.job' % self.info['Job'],'.job.job','.job')
         # Close the script explicitly so it is fully written before qsub reads it
         with open(job,'w') as JOB: JOB.write(string.join(jlist,'\n'))
         self.printLog('#DIR',self.info['QPath'])
         self.printLog('#RUN',jlist[-1])
         #qsub = 'qsub %s -S /bin/sh -l walltime=%d:%d:00,nodes=%d:ppn=2' % (job,hr,min,self.stat['Nodes'])
         qsub = 'qsub %s -S /bin/bash' % (job)
         if self.list['Depend']:
             qsub += ' -W depend=afterany'
             #for id in self.list['Depend']: qsub += ':%s.bio-server' % id
             for depid in self.list['Depend']: qsub += ':%s.%s' % (depid,self.getStr('DependHPC'))
         self.printLog('#JOB',qsub)
         if self.test():
             self.printLog('#TEST','Test mode: will not place job in queue.')
             self.verbose(0,1,string.join(['>>>>>']+jlist+['<<<<<',''],'\n'))
             return False
         qrun = os.popen(qsub).read()
         self.printLog('#QSUB',qrun)
         qid = string.split(qrun,'.')[0]
         self.printLog('#SHOW','Attempt showstart %s in %s sec' % (qid,self.stat['Pause']),log=False)
         time.sleep(self.stat['Pause'])
         for qline in os.popen('showstart %s' % qrun):   #qid):
             if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False)
         return qid
     except: self.errorLog('Error in qsub()'); return False
Exemple #9
0
def parseTMHMM(tmline):     ### Returns a dictionary of TMHMM data from a TMHMM line
    '''
    Returns a dictionary of TMHMM data from a TMHMM line.
    >> tmline:str = whitespace-delimited line: first word is stored under 'Seq', the rest are key=value pairs.
    '''
    fields = string.split(rje.chomp(tmline))
    tmdict = {'Seq':fields[0]}
    for field in fields[1:]:
        keyval = string.split(field,'=')
        (tkey,tval) = keyval    # Exactly one '=' is expected per field
        tmdict[tkey] = tval
    return tmdict
Exemple #10
0
 def readAAProp(self,filename=None): ### Reads AA Property Matrix from file
     '''
     Reads AA Property Matrix from file, filling self.alphabet (list) and self.prop ({property:{aa:value}}).
     >> filename:str = Filename. If None, will use self.info['Name']
     IOError is logged and re-raised; other errors are logged and the method returns without further action.
     Otherwise 'X' and '-' are added to the alphabet when absent and self.makePropDif() is called.
     '''
     try:
         ### <a> ### Load and read
         if filename: self.info['Name'] = filename
         else: filename = self.info['Name']
         readtxt = 'Reading AA Properties from %s...' % filename
         self.progLog('\r#AAPROP',readtxt)
         proplines = self.loadFromFile(filename,v=2)
         ### <b> ### Process
         self.alphabet = []
         self.prop = {}
         ## <i> ## Properties and alphabet
         for propline in proplines:
             text = rje.chomp(propline)
             is_comment = text.find('#') == 0
             is_header = text.find('PROP') == 0
             if is_comment: continue     # Comment line
             if is_header:   # Header line - has amino acids
                 text = rje.matchExp('^\S+(\s.+)',text)[0]
                 while re.search('^\s+\S.*',text):   # One single-character column per pass
                     (aa,text) = rje.matchExp('^\s+(\S)(.*)',text)
                     self.alphabet.append(aa)
                 readtxt += ' ...%s' % string.join(self.alphabet)
                 self.progLog('\r#AAPROP',readtxt)
             elif re.search('^\S',text) and self.alphabet:   # Property line
                 (aaproperty,text) = rje.matchExp('^(\S+)(\s.+)',text)
                 readtxt += ' ...%s' % aaproperty
                 self.progLog('\r#AAPROP',readtxt)
                 self.prop[aaproperty] = {}
                 for aa in self.alphabet:    # Values are read in header column order
                     (value,text) = rje.matchExp('^\s+(\S)(.*)',text)
                     self.prop[aaproperty][aa] = value
         readtxt += ' ...Done!'
         self.printLog('\r#AAPROP',readtxt)
     except IOError:
         self.log.errorLog('AA Property matrix file %s missing?' % self.info['Name'],True)
         raise
     except:
         self.log.errorLog('Major Problem reading AA Property matrix(%s)' % self.info['Name'],True)
         return
     # Extend the alphabet with the unknown ('X') and gap ('-') symbols where absent
     extra = []
     for symbol in ['X','-']:
         if symbol not in self.alphabet: extra.append(symbol)
     if extra: self.useAlphabet(alphabet=self.alphabet + extra)
     self.makePropDif()
Exemple #11
0
 def loadPPI(self):  ### Load pairwise interaction data
     '''
     Load pairwise interaction data from self.info['PPIFile'] into self.dict['PPI'].
     The first two whitespace-delimited words of each line are an interacting pair, stored in both directions as
     {protein:[partners]}. Lines with fewer than two words are skipped. Returns False if the file check fails;
     errors are logged and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppifile = self.info['PPIFile']
         if not rje.checkForFile(ppifile): return False
         ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for line in open(ppifile,'r').readlines():
             try: [pa,pb] = string.split(rje.chomp(line))[:2]
             except: continue
             for (hub,spoke) in [(pa,pb),(pb,pa)]:   # Store the interaction in both directions
                 if hub not in self.dict['PPI']: self.dict['PPI'][hub] = []
                 partners = self.dict['PPI'][hub]
                 if spoke not in partners: partners.append(spoke)
             px = rje.integerString(len(self.dict['PPI']))
             self.progLog('\r#PPI','Loading PPI data: %s proteins' % px)
         px = rje.integerString(len(self.dict['PPI']))
         self.printLog('\r#PPI','Loaded PPI data for %s proteins' % px)
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
Exemple #12
0
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method.
     Loads optional protein descriptions into self.dict['ProtDesc'], then looks for existing 'taxa' and 'taxamap'
     tables (falling back to TaxBase.*.tdt files unless force is set). If both tables are found their 'boot'
     fields are formatted as numbers and setup ends; otherwise a Taxonomy object is created and set up.
     Returns True on success, False if an error was caught.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [0a] Protein descriptions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['ProtDesc'] = {}
         if self.getStrLC('ProtDesc'):
             with open(self.getStr('ProtDesc'), 'r') as DESC:
                 for fline in DESC.readlines():
                     if not string.split(rje.chomp(fline)):
                         continue  # Blank lines should not abort setup
                     [prot, desc] = string.split(rje.chomp(fline), maxsplit=1)
                     self.dict['ProtDesc'][prot] = desc
             #self.db().addTable(self.getStr('ProtDesc'),mainkeys=['protein'],datakeys='All',headers=['protein','description'],ignore=['#'],name='protdesc',expect=True)
         ## ~ [0b] Look for previous run results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # Database object for addTable(); was previously used without being bound in this method
         db = self.db()
         taxdb = self.db('taxa',
                         add=True,
                         forcecheck=True,
                         mainkeys=['spcode'])
         if not taxdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.taxa.tdt' % self.getStr('TaxBase')
             taxdb = db.addTable(spfile,
                                 mainkeys=['spcode'],
                                 name='taxa',
                                 expect=False)
         mapdb = self.db('taxamap',
                         add=True,
                         forcecheck=True,
                         mainkeys=['protein'])
         if not mapdb and self.getStrLC('TaxBase') and not self.force():
             spfile = '%s.taxamap.tdt' % self.getStr('TaxBase')
             mapdb = db.addTable(spfile,
                                 mainkeys=['protein'],
                                 name='taxamap',
                                 expect=False)
         if taxdb and mapdb:
             taxdb.dataFormat({'boot': 'num'})
             mapdb.dataFormat({'boot': 'num'})
             return True
         ## ~ [0c] Taxonomy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.obj['Taxonomy'] = rje_taxonomy.Taxonomy(
             self.log, self.cmd_list)
         self.obj['Taxonomy'].setup(force=False)
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self.prog())
         return False  # Setup failed
Exemple #13
0
 def report(self):   ### Run qstat to get job list then showstart on each job
     '''
     Parses `qstat` output for job IDs and names, then logs the non-blank `showstart` output of each job.
     Errors are logged rather than raised; nothing is returned.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (queued,jobname) = ([],{})  # Job IDs in qstat order; {job ID: job name}
         ### ~ [2] ~ Read in List of IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for statline in os.popen('qstat'):
             try: (qid,job) = rje.matchExp('^(\d+)\.\S+\s+(\S+)',statline)
             except: continue    # Line does not describe a job
             queued.append(qid)
             jobname[qid] = job
         ### ~ [3] ~ Report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#QSTAT','%d jobs in queue.' % len(queued))
         for qid in queued:
             self.printLog('#JOB', '%s = %s' % (qid,jobname[qid]), timeout=False)
             showlines = os.popen('showstart %s' % qid)
             for showline in showlines:
                 if rje.chomp(showline): self.printLog('#INFO', showline, timeout=False)
         self.printLog('#ZEN',rje_zen.Zen().wisdom())
     except: self.errorLog('QSub.report problem')            
Exemple #14
0
 def mapEnsGO(self,spec='HUMAN',gokey='EnsGO',fixhead=True):   ### Extracts EnsEMBL GO mapping data from a BioMart download
     '''
     Extracts EnsEMBL GO mapping data from a BioMart download.
     >> spec:str = species code used in the mapping file names (ens_SPEC.GOTYPE.tdt in EnsGOPath)
     >> gokey:str = key of self.dict passed on to self.addGeneGO() for each mapping
     >> fixhead:bool = whether to pick the gene and GO key columns from each file's header line
     Returns False if no mapping file exists; otherwise falls off the end (returns None).
     '''
     ### ~ [1] ~ Setup paths and files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     if gokey not in self.dict: self.dict[gokey] = {}
     candidates = [self.info['EnsGOPath'] + 'ens_%s.%s.tdt' % (spec,gtype) for gtype in ['GO','GO.BP','GO.CC','GO.MF']]
     ensmap = [cfile for cfile in candidates if os.path.exists(cfile)]
     if not ensmap:
         self.errorLog('EnsEMBL-GO mapping file (%s) missing' % self.info['EnsGOPath'],printerror=False)
         return False             
     ### ~ [2] ~ Parse Gene-GO Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     mainkeys = ['Ensembl Gene ID','GO ID']
     goheads = ['GO Term Accession','GO Term Accession (bp)','GO Term Accession (mf)','GO Term Accession (cc)','GO ID']
     for gfile in ensmap:
         if fixhead:
             headers = string.split(rje.chomp(open(gfile,'r').readlines()[0]),'\t')
             if 'Ensembl Gene ID' in headers: mainkeys = ['Ensembl Gene ID']
             else: mainkeys = headers[:1]
             # First recognised GO column wins; otherwise fall back on the third column
             for gohead in goheads:
                 if gohead in headers:
                     mainkeys.append(gohead)
                     break
             else: mainkeys.append(headers[2])
             self.printLog('#HEAD','%s' % (string.join(mainkeys,' / ')))
         self.progLog('\r#GO','Mapping EnsEMBL GO...')
         ensdata = rje.dataDict(self,gfile,mainkeys)
         (mx,mtot) = (0.0,len(ensdata))
         obselete_go = []    # GO IDs found in neither self.go() nor the AltID dictionary
         for genego in ensdata:
             self.progLog('\r#GO','Mapping EnsEMBL GO: %.2f%%' % (mx/mtot)); mx += 100.0
             try: (gene,go) = string.split(genego)
             except: continue    # no GO!
             ## Update dictionaries ##
             if go[:3] == 'GO:': go = go[3:]
             if go in self.go():
                 self.addGeneGO(gene,go,gokey)
                 continue
             if go in self.dict['AltID']:
                 for altid in self.dict['AltID'][go]: self.addGeneGO(gene,altid,gokey)
                 continue
             if go not in obselete_go: obselete_go.append(go)
         self.printLog('\r#GO','Mapping EnsEMBL GO from %s complete.' % os.path.basename(gfile))
Exemple #15
0
 def classify(self): ### Generate summary tables for each protein class
     '''
     Generate summary tables for each protein class.
     For every file in self.list['Classify'], the first whitespace-delimited word of each line is read as a
     protein ID. The 'taxamap' table is copied under the file's base name, reduced to those proteins and passed
     to self.summaryScores(). Files that give no proteins, or no matching table entries, are skipped with a
     warning. Errors are logged rather than raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         rankdb = self.db('taxamap')
         for cfile in self.list['Classify']:
             pclass = rje.baseFile(cfile,strip_path=True)
             clist = []
             with open(cfile,'r') as CFILE:
                 for fline in CFILE.readlines():
                     # Blank lines split to an empty list: skip them rather than fail on [0]
                     fields = string.split(rje.chomp(fline),maxsplit=1)
                     if fields: clist.append(fields[0])
             self.printLog('#CLASS','%s "%s" class proteins read from %s' % (rje.iLen(clist),pclass,cfile))
             if not clist:
                 self.warnLog('No proteins read from %s' % (cfile))
                 continue
             classdb = db.copyTable(rankdb,pclass)
             classdb.dropEntriesDirect('protein',clist,inverse=True)
             if not classdb.entries():
                 self.warnLog('No "%s" proteins found in TaxaMap table' % (pclass))
                 continue
             self.summaryScores(classdb,pclass,'MinClass')
     except: self.errorLog('%s.classify() error' % self.prog())
Exemple #16
0
 def loadPPI(self):  ### Load pairwise interaction data
     '''
     Load pairwise interaction data from self.info['PPIFile'] into self.dict['PPI'].
     Each line's first two whitespace-delimited words form an interacting pair, recorded in both directions as
     {protein:[partners]}; lines with fewer than two words are ignored. Returns False if the file check fails.
     Errors are logged and re-raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not rje.checkForFile(self.info['PPIFile']):
             return False
         ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppilines = open(self.info['PPIFile'], 'r').readlines()
         for ppiline in ppilines:
             try:
                 [pa, pb] = string.split(rje.chomp(ppiline))[:2]
             except:
                 continue
             # Record the interaction in both directions, without duplicates
             for (source, target) in [(pa, pb), (pb, pa)]:
                 if source not in self.dict['PPI']:
                     self.dict['PPI'][source] = []
                 if target in self.dict['PPI'][source]:
                     continue
                 self.dict['PPI'][source].append(target)
             loadtxt = 'Loading PPI data: %s proteins' % rje.integerString(
                 len(self.dict['PPI']))
             self.progLog('\r#PPI', loadtxt)
         donetxt = 'Loaded PPI data for %s proteins' % rje.integerString(
             len(self.dict['PPI']))
         self.printLog('\r#PPI', donetxt)
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Exemple #17
0
 def readHMMPFamSearch(self,resfile=None,readaln=False):  ### Reads HMM PFam Search Results into objects
     '''
     Reads hmmpfam search results into objects: one Search object per PFam domain name (self._addSearch()), with a
     Hit (named after the query sequence) and an Alignment (start, end, score, e-value) added for every domain line.
     If self.opt['CleanRes'] is set, the results file is then replaced with a reduced version that keeps only the
     sequences with domain hits, plus a footer line recording the number of sequences in the original file.
     >> resfile:str = Results File (set as self.info['OutFile'])
     >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently ignored !!!
     << returns True if the file was parsed, else False (file missing, not hmmpfam output, or error while parsing)
     '''
     RESFILE = None
     try:
         ### Setup ###
         if not resfile or not os.path.exists(resfile):
             self.log.errorLog('Results file "%s" missing!' % resfile,printerror=False)
             return False
         ## Regular expressions for the lines of interest (built once, not once per line) ##
         re_query = r'^Query sequence:\s+(\S+)'
         re_domain = r'\s+'.join([r'^(\S+)',r'\S+',r'(\d+)',r'(\d+)\D.+',r'(\S+)',r'(\S+)\s*$'])
         # NB. The footer pattern previously used "(%d)", which never matched, so the original sequence count was lost
         re_footer = r'End of rje_hmm reduced results file: (\d+) sequences in original'
         ## Search dictionary as results come back per sequence, not per HMM! ##
         pfam = {}   # Dictionary of {PFam name:search}
         hitx = 0    # Total number of hits
         ### Check file type ###
         with open(resfile,'r') as CHECK: firstline = CHECK.readline()
         if not firstline.startswith('hmmpfam'):
             self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile,printerror=False)
             if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)',default='N'):
                 os.unlink(resfile)
                 self.printLog('#DEL','Dodgy results file "%s" deleted.' % resfile)
             return False
         ### Read in Search results ###
         cleanres = self.opt['CleanRes']
         newresfile = '%s.partial' % resfile
         if os.path.exists(newresfile): os.unlink(newresfile)
         def _savePartial(lines):    # Appends lines to the reduced results file, closing the handle straight away
             with open(newresfile,'a') as PARTIAL: PARTIAL.write('\n'.join(lines))
         hitname = None
         hx = 0; seqx = 0    # Sequences read from this file; sequences in original file (from footer, if any)
         RESFILE = open(resfile,'r')
         line = RESFILE.readline()
         newres = [rje.chomp(line)]; newresout = True    # Lines for current sequence; whether they should be kept
         while line:
             self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
             line = rje.chomp(line)
             newquery = rje.matchExp(re_query,line)
             ## New Sequence ##
             if newquery:
                 if newres and newresout and cleanres: _savePartial(newres)
                 newres = ['',line]; newresout = False
                 hitname = newquery[0]; hx += 1
             ## One Line Data for hits ##
             elif line.find('Parsed for domains:') == 0:
                 # Keep the two table header lines, then read the first data line
                 newres += [line,rje.chomp(RESFILE.readline()),rje.chomp(RESFILE.readline())]
                 line = rje.chomp(RESFILE.readline()); newres.append(line)
                 #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                 #--------        ------- ----- -----    ----- -----      -----  -------
                 #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                 # ... else ...
                 #         [no hits above thresholds]
                 domdata = rje.matchExp(re_domain,line)
                 while domdata:
                     newresout = True    # This sequence has hits, so it is kept in any reduced file
                     (dom,start,end,score,evalue) = domdata
                     if dom not in pfam:
                         pfam[dom] = self._addSearch()
                         pfam[dom].info['Name'] = dom
                     hit = pfam[dom]._addHit()
                     hit.info['Name'] = hitname
                     aln = hit._addAln()
                     aln.setStat({'SbjStart':int(start),'SbjEnd':int(end),'Expect':float(evalue),'BitScore':float(score)})
                     hitx += 1
                     self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                     line = rje.chomp(RESFILE.readline()); newres.append(line)
                     domdata = rje.matchExp(re_domain,line)
             ## End of Protein ##
             elif line[:2] == '//': hitname = None; newres.append(line)
             ## Footer of a previously reduced file: recover the original sequence count ##
             elif rje.matchExp(re_footer,line):
                 seqx = int(rje.matchExp(re_footer,line)[0])
             elif newres: newres.append(line)
             line = RESFILE.readline()
         RESFILE.close(); RESFILE = None     # Must be closed before the file is deleted/replaced below
         if newres and newresout and cleanres: _savePartial(newres)
         if not seqx: seqx = hx
         if cleanres:
             _savePartial(['','End of rje_hmm reduced results file: %d sequences in original' % seqx])
             os.unlink(resfile)
             os.rename(newresfile,resfile)
             self.printLog('\r#RED','Results file %s replaced with reduced version (%s Hits only)' % (resfile,rje.integerString(hitx)))
         self.printLog('\r#RES','Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(seqx),rje.integerString(len(pfam)),rje.integerString(hitx)))
         return True
     except:
         self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
         return False
     finally:
         if RESFILE: RESFILE.close()
Exemple #18
0
    def qsub(self):  ### Creates job and calls with qsub
        '''
        Creates job and calls with qsub. Writes a PBS job script to <Job>.job, submits it with qsub and, if
        jobwait=T, waits for the job output file (<Job>.o<jobid>) to appear.
        << Returns:
           - self.report() if self.opt['Report'] is set;
           - False in test mode, if qsub returned no job ID, or if an error was caught;
           - 0 if jobwait=T and the last line of the job output file contains "Job complete";
           - otherwise the qsub job ID (qsub output up to the first ".").
        '''
        try:  ### Basics ###
            hr = int(self.stat['Walltime'])
            mins = int((0.5 + (self.stat['Walltime'] - hr) * 60.0))
            if mins >= 60:  # Rounding to the nearest minute can give 60: roll over into the next hour
                hr += 1
                mins -= 60
            if self.opt['Report']: return self.report()
            jobstr = ('%s.job' % self.info['Job']).replace('.job', '')
            jlist = [
                '#!/bin/bash',
                '#PBS -N %s' % jobstr,  #,'#PBS -q batch',
                '#PBS -l nodes=%d:ppn=%d' %
                (self.stat['Nodes'], self.stat['PPN']),
                '#PBS -l walltime=%d:%s:00' % (hr, rje.preZero(mins, 60)),
                '#PBS -l vmem=%dgb' % self.getInt('VMem'),
                '#PBS -l mem=%dgb' % self.getInt('VMem'),
                ''
            ]  #10
            if self.getBool('Monitor'):
                if self.getBool('JobWait'):
                    self.warnLog(
                        'Cannot run with wait=T and monitor=T: switched monitor=F'
                    )
                    self.setBool({'Monitor': False})
                else:
                    jlist += ['#PBS -k oed']
            if self.getStr('Email'):
                jlist += ['#PBS -M %s' % self.getStr('Email'), '#PBS -m ae']
                if self.getBool('MailStart'): jlist[-1] = '#PBS -m bae'
            jlist += [
                '### Define number of processors',
                'NPROCS=`wc -l < $PBS_NODEFILE`',
                'echo Running on host `hostname`',
                'echo Time is `date`',
                'echo Directory is `pwd`',  #2
                'echo This jobs runs on the following processors:',
                'echo `cat $PBS_NODEFILE`',
                '',  #5
                'echo This job has allocated $NPROCS cpus',
                ''
            ]
            self.printLog(
                '#PPN', '%d Node(s) requested: %d PPN.' %
                (self.getInt('Nodes'), self.getInt('PPN')))
            self.printLog('#VMEM',
                          '%s GB VMem requested.' % (self.getStat('VMem')))
            if self.getBool('ModPurge'):
                jlist.append('module purge')
                self.printLog('#MOD', 'Modules purged (modpurge=T)')
            for mod in self.list['Modules']:
                if mod.lower() not in ['', 'none']:
                    jlist.append('module add %s' % mod)
            if self.list['Modules']:
                self.printLog(
                    '#MOD',
                    'Modules added: %s' % '; '.join(self.list['Modules']))
            for pcall in self.list['PreCall']:
                self.printLog('#PCALL', pcall)
                jlist.append(pcall)
            ### Directory & Program ###
            jlist.append('cd %s' % self.info['QPath'])
            pcall = self.info['Program']
            if self.opt['RjePy']:
                pcall = 'python ' + self.info['PyPath'] + pcall
            jlist.append(pcall)
            ### Completion message
            # NB. "Job complete" must stay the last line: the jobwait check below looks for it with tail -n 1
            jlist += ['', 'echo ---', 'qstat -f $PBS_JOBID', 'echo ---']
            jlist += ['', 'echo', 'echo Time is `date`', 'echo Job complete']
            ### Output and call ###
            job = '{0}.job'.format(jobstr)
            # Close the script explicitly so that it is fully written before qsub reads it
            with open(job, 'w') as JOB:
                JOB.write('\n'.join(jlist))
            self.printLog('#DIR', self.info['QPath'])
            self.printLog('#RUN', pcall)
            qcmd = 'qsub'
            if self.getBool('StartBash'): qcmd += ' -S /bin/bash'
            if self.list['Depend']:
                qcmd += ' -W depend=afterany'
                myhost = self.getStr('DependHPC')
                if not self.getStrLC('DependHPC'):
                    myhost = os.popen('hostname').read().split()[0]
                for depid in self.list['Depend']:
                    qcmd += ':%s.%s' % (depid, myhost)
            qcmd += ' %s' % (job)
            self.printLog('#JOB', qcmd)
            if self.test():
                self.printLog('#TEST',
                              'Test mode: will not place job in queue.')
                self.verbose(
                    0, 1, '\n'.join(['>>>>>'] + jlist + ['<<<<<', '']))
                return False
            qrun = os.popen(qcmd).read()
            self.printLog('#QSUB', qrun)
            qid = qrun.split('.')[0]
            if not qid.strip():
                # No job ID means the submission failed; an empty ID would also make the qstat grep
                # below match every job in the queue.
                self.printLog('#FAIL', 'qsub returned no job ID for %s.' % job)
                return False
            showstart = 'qstat -T'
            if os.popen('hostname').read().startswith(
                    'katana.science.unsw.edu.au'):
                showstart = 'showstart'
            self.printLog('#SHOW',
                          'Attempt %s %s in %s sec' %
                          (showstart, qrun, self.stat['Pause']),
                          log=False)
            time.sleep(self.stat['Pause'])
            for qline in os.popen('%s %s' % (showstart, qrun)):  #qid):
                if rje.chomp(qline):
                    self.printLog('#INFO', qline, timeout=False)

            ### Wait for job to be completed
            if self.getBool('JobWait'):
                if self.getBool('Monitor'):
                    raise ValueError('Cannot run with wait=T and monitor=T')
                self.printLog('#WAIT',
                              'Waiting for job {0} to finish'.format(qid))
                ofile = '{0}.o{1}'.format(jobstr, qid)
                running = False
                while not rje.exists(ofile):
                    # Number of qstat lines for this job: zero means it has left the queue
                    qcount = int(
                        os.popen("qstat | grep '^{0}' -c".format(
                            qid)).read().split()[0])
                    if not qcount:
                        self.printLog(
                            '#QSTAT',
                            'Job {0} disappeared from qstat'.format(qid))
                        break
                    elif not running:
                        try:
                            # Fifth whitespace-separated field of the qstat line is taken as the job state.
                            # (This used to be wrapped in string.split(), giving a list that never equalled 'R'.)
                            jobstate = os.popen("qstat | grep '^{0}'".format(
                                qid)).read().split()[4]
                            if jobstate == 'R':
                                running = True
                                self.printLog('#QSTAT',
                                              'Job {0} running...'.format(qid))
                        except (IndexError, EnvironmentError):
                            pass  # Best effort only: job may have left the queue between the two qstat calls
                    time.sleep(max(1, self.getInt('Pause')))
                # Allow up to 300 seconds for the output file to appear once the job has left the queue
                owait = 300
                while owait and not rje.exists(ofile):
                    owait -= 1
                    time.sleep(1)
                if rje.exists(ofile):
                    if 'Job complete' in os.popen(
                            'tail -n 1 {0}'.format(ofile)).read():
                        self.printLog(
                            '#DONE',
                            '{0} job ({1}) complete.'.format(jobstr, qid))
                        return 0
                    else:
                        self.printLog(
                            '#FAIL', '{0} job ({1}) failed to finish.'.format(
                                jobstr, qid))
                        return qid
                else:
                    self.printLog(
                        '#FAIL',
                        '{0} job ({1}) failed to generate {2}.'.format(
                            jobstr, qid, ofile))

            return qid
        except:
            self.errorLog('Error in qsub()')
            return False
Exemple #19
0
    def parseOMIM(self):  ### Main parsing method
        '''
        Main parsing method. Reads the OMIM file named by self.info['Name'] and populates:
        - self.dict['Records'] = {gene:[record lines read after "*FIELD* NO"]}
        - self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}
        Only "*FIELD* AV" sub-records whose third line reads "<gene>, XXXnnnYYY" are kept, where XXX and YYY are
        both three-letter amino acid codes from rje_sequence.aa_code_3. Finishes by calling self.saveMutations().
        Errors are logged and then re-raised.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Records'] = {}
            self.dict['Mutations'] = {}
            aas = ' '.join(rje_sequence.aa_code_3.values()).upper().split()
            omimfile = self.info['Name']
            # Count lines for the progress percentage without holding the whole file in memory
            with open(omimfile, 'r') as OMIM:
                olen = sum(1 for countline in OMIM)
            olen = max(1, olen)  # An empty file would otherwise give a division by zero in the progress log
            (ox, mx) = (0.0, 0)  # ox = 100 x lines read; mx = mutations stored

            ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            record = gene = subid = disease = mutation = ''
            av = False  # Whether reading *FIELD* AV for mutation data
            with open(omimfile, 'r') as OMIM:
                oline = True
                while oline:
                    oline = OMIM.readline()
                    self.log.printLog(
                        '\r#OMIM',
                        'Processing OMIM: %.2f%% (%s genes)' %
                        (ox / olen,
                         rje.integerString(len(self.dict['Records']))),
                        newline=False,
                        log=False)
                    ox += 100.0
                    # Outside of the AV field, only lines starting with "*" are of interest
                    if not av and oline[:1] != '*': continue
                    line = rje.chomp(oline).rstrip(' ')
                    ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if line == '*RECORD*': (record, av) = ('', False)
                    elif line == '*FIELD* NO':  # New record
                        record = rje.chomp(OMIM.readline())
                        gene = ''
                        ox += 100.0
                    ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* TI':  # New gene = last word of the following line
                        gene = rje.chomp(OMIM.readline()).split()[-1]
                        subid = ''
                        av = False
                        ox += 100.0
                    ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* AV':
                        av = True  # Start of mutation records
                    elif av and rje.matchExp(r'^(\.\d+)', line):  # New subid mutation record
                        subid = rje.matchExp(r'^(\.\d+)', line)[0]
                        disease = rje.chomp(OMIM.readline())
                        mutline = rje.chomp(OMIM.readline())
                        ox += 200.0  # Two extra lines read (counted even when the mutation is then skipped)
                        try:
                            mutation = rje.matchExp(
                                r'^%s, (\D\D\D\d+\D\D\D)' % gene, mutline)[0]
                        except Exception:
                            continue  # No mutation or not coding change
                        subaa = rje.matchExp(r'(\D\D\D)\d+(\D\D\D)', mutation)
                        if subaa[0] not in aas or subaa[1] not in aas: continue
                        if gene not in self.dict['Records']:
                            self.dict['Records'][gene] = [record]
                        if record not in self.dict['Records'][gene]:
                            self.dict['Records'][gene] += [record]
                        if gene not in self.dict['Mutations']:
                            self.dict['Mutations'][gene] = {}
                        mx += 1
                        self.dict['Mutations'][gene][subid] = (disease, mutation)

            ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.log.printLog(
                '\r#OMIM',
                'Processing OMIM complete! (%s genes; %s mutations)' %
                (rje.integerString(len(
                    self.dict['Records'])), rje.integerString(mx)))
            self.saveMutations()
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise  # Delete this if method error not terrible
Exemple #20
0
    def mapSeq(self,seqlist,blast,search,outputmap=True): ### Performs actual mapping of sequence
        '''
        Performs actual mapping of sequence. Each method in self.list['Mapping'] is tried in turn via self.mapHit()
        until one returns a hit. If none does, an optional grep of the sequence against the BLAST database file is
        tried ('grep' in self.list['Mapping']).
        >> seqlist:SeqList object containing Sequence Object to be mapped
        >> blast:BLAST_Run object to perform BLAST and GABLAM
        >> search:Current BLAST search object for mapping
        >> outputmap:boolean = Whether to output mapping into a file [True]
        << returns shortName() of mapped sequence (or None if none). Returns False if an error was caught.
        '''
        seq = None  # Set before the try so that the error handler can always build its message
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            def appendFasta(fasfile,name,sequence):     # Appends one fasta entry, closing the file straight away
                with open(fasfile,'a') as FAS: FAS.write('>%s\n%s\n' % (name,sequence))
            seq = seqlist.getSeq(format='tuple')    # (name,sequence) tuple
            mapseq = self.obj['MapDB']
            hits = blast.db('Hit').indexEntries('Query',search)
            self.printLog('#HITS','%s vs %s = %d hits' % (search,blast.str['DBase'],len(hits)))
            hitseq = {}; hitdata = {}   # {hit name:(name,sequence)} and {hit name:BLAST table entry}
            for entry in hits:
                hitseq[entry['Hit']] = mapseq.getDictSeq(entry['Hit'],format='tuple')
                hitdata[entry['Hit']] = entry
            resdict = {'Query':search,'Hit':None,'Method':'Failed','Query_Species':rje_sequence.specCodeFromName(seq[0])}
            ### ~ [1] Order Hits and Check Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (hits,hitdict) = self.orderHits(seq,hits,hitseq)
            self.debug(hits)
            self.debug(hitdict)
            ### ~ [2] Attempt mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for method in self.list['Mapping']:
                resdict['Hit'] = self.mapHit(seq,hits,hitdict,method.lower())
                if resdict['Hit']:
                    resdict['Method'] = method[:1].upper() + method[1:].lower()
                    break
                # Compared case-insensitively, matching the lower-cased method passed to mapHit() above
                elif method.lower() == 'gablam' and (len(hits) > 0):
                    resdict['Method'] = 'Rejected'
            self.debug(resdict)
            ### ~[3] Output! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if resdict['Hit']:  #hitdict[hit]['Data']['ShortName']
                hit = resdict['Hit']['Hit']     # resdict['Hit'] is the BLAST table entry for Hit
                shortname = hitdict[hit]['Data']['ShortName']   # This is just hit!
                self.printLog('#MAP','%s mapped to %s (by %s)' % (seq[0].split()[0],shortname,resdict['Method']))
                ## Update Stats ##
                self.debug('')
                resdict['BlastRank'] = hitdata[hit]['Rank']
                for key in hitdict[hit]: resdict[key] = hitdict[hit][key]
                ## Fasta and Redundancy ##
                if shortname in self.list['Mapped']: self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                else:
                    self.list['Mapped'].append(shortname)
                    if outputmap: appendFasta(self.str['MapFas'],hitseq[hit][0],hitseq[hit][1])
                resdict['Hit_Species'] = hitdict[hit]['Data']['SpecCode']
                resdict['Hit'] = shortname
            else:
                ### ~ [2] GREP-based search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                if 'grep' in self.list['Mapping']:
                    greplist = []; grepseq = ''     # Names of sequences hit by grep; first matching sequence line
                    self.printLog('#GREP','grep %s %s -B 1' % (seq[1],blast.str['DBase']),log=False)
                    for line in os.popen('grep %s %s -B 1' % (seq[1],blast.str['DBase'])).readlines():
                        if line[:1] == '>': greplist.append(line[1:].split()[0])
                        elif not grepseq: grepseq = rje.chomp(line)
                    if greplist:
                        shortname = greplist.pop(0)
                        resdict['Hit'] = shortname
                        resdict['Method'] = 'Grep'
                        resdict['Qry_ID'] = '100.0'
                        resdict['Qry_Len'] = len(seq[1])
                        resdict['Hit_Len'] = len(grepseq)
                        # NOTE(review): this is hit length over query length, so it is >= 100 whenever the query
                        # is contained in the hit line - confirm whether the inverse ratio was intended.
                        resdict['Hit_ID'] = 100.0 * len(grepseq) / len(seq[1])
                        # Species code is taken as the second "_"-separated part of the name, if there is one
                        try: resdict['Hit_Species'] = shortname.split('_')[1]
                        except IndexError: pass
                        if shortname in self.list['Mapped']:
                            self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                        else:
                            self.list['Mapped'].append(shortname)
                            if outputmap: appendFasta(self.str['MapFas'],shortname,grepseq)
                    for extra in greplist: self.printLog('#GREP','Warning! Query "%s" also hit "%s" with grep!' % (seq[0].split()[0],extra))
                if not resdict['Hit'] and self.bool['Combine']:
                    ## Fasta and Redundancy ##
                    shortname = seq[0].split()[0]
                    if shortname in self.list['Mapped']:
                        self.printLog('#FAS','%s already in output - not duplicating in %s' % (shortname,self.str['MapFas']))
                    else:
                        self.list['Mapped'].append(shortname)
                        if outputmap: appendFasta(self.str['MapFas'],seq[0],seq[1])
                # NOTE(review): this branch (and the #MISS log below) is also reached when grep found a hit, so
                # grep-mapped queries are written to MissFas too - confirm whether that is intended.
                elif outputmap: appendFasta(self.str['MissFas'],seq[0],seq[1])
                self.printLog('#MISS','%s mapping %s' % (resdict['Query'],resdict['Method']))
            if outputmap:
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
            return resdict['Hit']

        except:
            # seq is still None if seqlist.getSeq() itself failed: fall back on the search name for the message
            if seq: seqname = seq[0]
            else: seqname = search
            self.errorLog('Fudgesticks! SeqMapper.mapSeq(%s) has died!' % seqname,quitchoice=True)
            return False
Exemple #21
0
 def readHMMPFamSearch(
         self,
         resfile=None,
         readaln=False):  ### Reads HMM PFam Search Results into objects
     '''
     Reads hmmpfam search results into objects: one Search object per PFam domain name (self._addSearch()), with a
     Hit (named after the query sequence) and an Alignment (start, end, score, e-value) added for every domain line.
     If self.opt['CleanRes'] is set, the results file is then replaced with a reduced version that keeps only the
     sequences with domain hits, plus a footer line recording the number of sequences in the original file.
     >> resfile:str = Results File (set as self.info['OutFile'])
     >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently ignored !!!
     << returns True if the file was parsed, else False (file missing, not hmmpfam output, or error while parsing)
     '''
     RESFILE = None
     try:
         ### Setup ###
         if not resfile or not os.path.exists(resfile):
             self.log.errorLog('Results file "%s" missing!' % resfile,
                               printerror=False)
             return False
         ## Regular expressions for the lines of interest (built once, not once per line) ##
         re_query = r'^Query sequence:\s+(\S+)'
         re_domain = r'\s+'.join([
             r'^(\S+)', r'\S+', r'(\d+)', r'(\d+)\D.+', r'(\S+)',
             r'(\S+)\s*$'
         ])
         # NB. The footer pattern previously used "(%d)", which never matched, so the original sequence count was lost
         re_footer = r'End of rje_hmm reduced results file: (\d+) sequences in original'
         ## Search dictionary as results come back per sequence, not per HMM! ##
         pfam = {}  # Dictionary of {PFam name:search}
         hitx = 0  # Total number of hits
         ### Check file type ###
         with open(resfile, 'r') as CHECK:
             firstline = CHECK.readline()
         if not firstline.startswith('hmmpfam'):
             self.errorLog(
                 'File "%s" does not appear to be an hmmpfam results file' %
                 resfile,
                 printerror=False)
             if rje.yesNo(
                     'Delete incorrect results file? (Check that hmmpfam=T is right!)',
                     default='N'):
                 os.unlink(resfile)
                 self.printLog('#DEL',
                               'Dodgy results file "%s" deleted.' % resfile)
             return False
         ### Read in Search results ###
         cleanres = self.opt['CleanRes']
         newresfile = '%s.partial' % resfile
         if os.path.exists(newresfile): os.unlink(newresfile)

         def _savePartial(lines):
             # Appends lines to the reduced results file, closing the handle straight away
             with open(newresfile, 'a') as PARTIAL:
                 PARTIAL.write('\n'.join(lines))

         hitname = None
         hx = 0  # Sequences read from this file
         seqx = 0  # Sequences in original file (from footer of a reduced file, if any)
         RESFILE = open(resfile, 'r')
         line = RESFILE.readline()
         newres = [rje.chomp(line)]  # Lines read for the current sequence
         newresout = True  # Whether the current lines should be kept in a reduced file
         while line:
             self.progLog(
                 '\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' %
                 (resfile, rje.integerString(hx),
                  rje.integerString(len(pfam)), rje.integerString(hitx)))
             line = rje.chomp(line)
             newquery = rje.matchExp(re_query, line)
             ## New Sequence ##
             if newquery:
                 if newres and newresout and cleanres:
                     _savePartial(newres)
                 newres = ['', line]
                 newresout = False
                 hitname = newquery[0]
                 hx += 1
             ## One Line Data for hits ##
             elif line.find('Parsed for domains:') == 0:
                 # Keep the two table header lines, then read the first data line
                 newres += [
                     line,
                     rje.chomp(RESFILE.readline()),
                     rje.chomp(RESFILE.readline())
                 ]
                 line = rje.chomp(RESFILE.readline())
                 newres.append(line)
                 #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                 #--------        ------- ----- -----    ----- -----      -----  -------
                 #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                 # ... else ...
                 #         [no hits above thresholds]
                 domdata = rje.matchExp(re_domain, line)
                 while domdata:
                     newresout = True  # This sequence has hits, so it is kept in any reduced file
                     (dom, start, end, score, evalue) = domdata
                     if dom not in pfam:
                         pfam[dom] = self._addSearch()
                         pfam[dom].info['Name'] = dom
                     hit = pfam[dom]._addHit()
                     hit.info['Name'] = hitname
                     aln = hit._addAln()
                     aln.setStat({
                         'SbjStart': int(start),
                         'SbjEnd': int(end),
                         'Expect': float(evalue),
                         'BitScore': float(score)
                     })
                     hitx += 1
                     self.progLog(
                         '\r#RES',
                         'Reading %s: %s Seqs; %s Domains; %s Hits' %
                         (resfile, rje.integerString(hx),
                          rje.integerString(
                              len(pfam)), rje.integerString(hitx)))
                     line = rje.chomp(RESFILE.readline())
                     newres.append(line)
                     domdata = rje.matchExp(re_domain, line)
             ## End of Protein ##
             elif line[:2] == '//':
                 hitname = None
                 newres.append(line)
             ## Footer of a previously reduced file: recover the original sequence count ##
             elif rje.matchExp(re_footer, line):
                 seqx = int(rje.matchExp(re_footer, line)[0])
             elif newres:
                 newres.append(line)
             line = RESFILE.readline()
         # Must be closed before the file is deleted/replaced below
         RESFILE.close()
         RESFILE = None
         if newres and newresout and cleanres:
             _savePartial(newres)
         if not seqx: seqx = hx
         if cleanres:
             _savePartial([
                 '',
                 'End of rje_hmm reduced results file: %d sequences in original'
                 % seqx
             ])
             os.unlink(resfile)
             os.rename(newresfile, resfile)
             self.printLog(
                 '\r#RED',
                 'Results file %s replaced with reduced version (%s Hits only)'
                 % (resfile, rje.integerString(hitx)))
         self.printLog(
             '\r#RES', 'Reading %s complete: %s Seqs; %s Domains; %s Hits' %
             (resfile, rje.integerString(seqx), rje.integerString(
                 len(pfam)), rje.integerString(hitx)))
         return True
     except:
         self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
         return False
     finally:
         if RESFILE: RESFILE.close()
Exemple #22
0
 def exonerate(self,qryfas, genome, model,exonerate='exonerate',bestn=0):
     '''
     Runs exonerate and parses output into lists for processing.
     >> qryfas:str = query fasta file, passed to exonerate as first argument
     >> genome:str = target file, passed to exonerate as second argument
     >> model:str = exonerate model (added as --model if set; also used as the memsaver output file extension)
     >> exonerate:str = exonerate command to call ['exonerate']
     >> bestn:int = added as --bestn if non-zero [0]
     << returns dictionary of parsed output per query:
     { query: {'gff':[outputlines], 'cigar':[outputlines], 'alignment':[outputlines], 'vulgar':[[headerlist], {header:value}, {header:value}, ...] }
     In memsaver mode, exonerate output is written to (or re-used from) <basefile>.<model> and parsed from that
     file; IOError is raised if the exonerate call returns a non-zero exit code. Errors are logged and re-raised.
     '''
     EXFILE = None   # Read handle for the memsaver output file (if used)
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         exfile = '%s.%s' % (self.baseFile(),model)  # Used in memsaver mode
         query_dic = {}
         header_list = ['query_id', 'query_start', 'query_end', 'query_strand', 'target_id', 'target_start', 'target_end', 'target_strand', 'score', '<label, query_length, target_length> triplets']
         excmd = [exonerate, qryfas, genome, '--showtargetgff', '--showcigar']
         if model: excmd += ['--model', model]
         if bestn: excmd += ['--bestn', '%d' % bestn]
         if self.getStrLC('ExOpt'): excmd += self.getStr('ExOpt').split()
         self.printLog('#RUN',' '.join(excmd))
         if self.getBool('MemSaver'):
             gzfile = '%s.gz' % exfile
             if rje.exists(gzfile): self.gUnzip(gzfile)
             if rje.exists(exfile) and not self.force():
                 self.printLog('#EXFILE','Found %s (force=F). Assuming complete.' % exfile)
             else:
                 rje.backup(self,exfile)
                 self.printLog('#SAVER','memsaver=T: Exonerate output directed to %s.' % exfile)
                 # The handle is closed even when the call fails. NB. A partial file is still left on disk.
                 with open(exfile,'w') as EXOUT:
                     if subprocess.call(excmd, stdout=EXOUT): raise IOError('Exonerate call did not complete!')
                 self.printLog('#EXFILE','%s generated.' % exfile)
             EXFILE = open(exfile,'r')
             exlines = iter(EXFILE.readline,'')  # Reads one line at a time until end of file
         else:
             exproc = Popen(excmd, stdout=PIPE)
             extext = exproc.stdout.readlines()
             exproc.stdout.close()
             exproc.wait()   # Reap the child process; the exit code is not checked in this mode
             exlines = iter(extext)  # Iterated rather than popped from the front, which was O(n^2)
         ### ~ [2] Parse output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         output_format = ''  # Type of output section currently being read
         for line in exlines:
             line = rje.chomp(line)
             if line:
                 if line.startswith('         Query:'):
                     # Query ID = first word after the colon. (The line itself is still stored below.)
                     query = line.split(':', 1)[1].split(' ')[1]
                 if line == 'C4 Alignment:':
                     output_format = 'alignment'
                 elif line == '# --- START OF GFF DUMP ---':
                     output_format = 'gff'
                 elif line.startswith('vulgar:'):
                     output_format = 'vulgar'
                     fields = line.split(' ', 10)[1:]
                     if output_format in query_dic[query]:
                         query_dic[query][output_format].append({})
                     else:
                         query_dic[query][output_format] = [header_list, {}]
                     for header, field in zip(header_list, fields):
                         query_dic[query][output_format][-1][header] = field
                 elif line.startswith('cigar:'):
                     output_format = 'cigar'
                     if output_format in query_dic[query]:
                         query_dic[query][output_format].append(line.replace('cigar: ', ''))
                     else:
                         query_dic[query][output_format] = [line.replace('cigar: ', '')]
                 elif line == '------------' or line.startswith('Command line:') or line.startswith('Hostname:') or line == '# --- END OF GFF DUMP ---' or line == '#' or line.startswith('-- completed exonerate analysis'):
                     pass
                 elif output_format:
                     if query in query_dic:
                         if output_format in query_dic[query]:
                             query_dic[query][output_format].append(line)
                         else:
                             query_dic[query][output_format] = [line]
                     else:
                         query_dic[query] = {output_format:[line]}
             elif output_format == 'alignment':
                 # Blank lines are kept within alignments, but only once the query has some alignment output
                 try: query_dic[query][output_format].append(line)
                 except (NameError, KeyError): pass
             self.vPrint(line,v=1)
         ### ~ [3] Tidy up ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if EXFILE:
             EXFILE.close(); EXFILE = None
             if self.getBool('Cleanup'):
                 os.unlink(exfile)
                 self.printLog('#CLEAN','%s deleted.' % exfile)
             elif self.getBool('GZip'): self.gZip(exfile)
         return query_dic
     except: self.errorLog('%s.exonerate error' % self.prog()); raise
     finally:
         if EXFILE: EXFILE.close()
Exemple #23
0
 def loadTimePoints(self,filename):  ### Load TimePoints from file of various formats
     '''
     Load TimePoints from file of various formats.
     >> filename:str = Path of the file to load TimePoints from
     << returns True once entries have been added and summarised. A missing file returns the result of
        self.errorLog(); any other failure (including an empty file) is logged with self.errorLog() and returns False.

     The first line of the file selects the parser: a 'TimePoint Name' header (delimited table), a leading '('
     (database strings, one entry per line) or, failing both, glossary text where only '(TimePoint)' lines are read.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not os.path.exists(filename): return self.errorLog('File %s missing!' % filename)
         with open(filename,'r') as TPFILE: data = TPFILE.readlines()    # Handle is closed as soon as the lines are read
         db = self.db('TimePoints')
         if not data: raise ValueError('No lines read from %s' % filename)   # Explicit failure rather than an IndexError below

         ### ~ [2] Load from File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Delimited File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # NOTE(review): a whitespace split can never yield 'TimePoint Name' (it contains a space), so this branch
         # looks unreachable as written. Confirm the intended header test before relying on delimited input.
         if data[0].split()[0] == 'TimePoint Name':
             ftype = 'delimited text file'
             temp = self.db().addTable(filename,mainkeys=['TimePoint Name'],name='temp')
             for entry in temp.entries(): db.addEntry(entry)
             db.deleteTable(temp)
         ## ~ [2b] File of Database Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         elif data[0][0] == '(':
             ftype = 'database string'
             for line in data:
                 line = rje.chomp(line).rstrip(' ')
                 # Trim the wrapping characters, close up ", " and split on the quote-comma-quote separators
                 pdata = line[2:-3].replace(', ',',').split("','")
                 if not pdata: continue
                 if rje.matchExp(r'^(\d+)$',pdata[0]): pdata.pop(0)   # Database output with key ID numbers
                 entry = {}
                 for (f,field) in enumerate(db.fields()): entry[field] = pdata[f]    # Values are matched to fields by position
                 db.addEntry(entry)
         ## ~ [2c] Glossary Text File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             ftype = 'glossary text file'
             for line in data:
                 if '(TimePoint)' not in line: continue
                 # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                 pdata = line.split('. ')
                 # A third piece ending in 'ya' is merged back into the time field (pdata[1])
                 if pdata[2][-2:] == 'ya':
                     pdata[1] = '%s. %s' % (pdata[1],pdata.pop(2))
                 entry = {'TimePoint Name':pdata[0]}
                 try: entry['Source URL'] = rje.matchExp(r'Source: <(\S+)>',line)[0]
                 except: self.errorLog('Cannot read Source URL')
                 try: entry['TimePoint Description'] = rje.matchExp(r'^(\S.+\S) Source: <','. '.join(pdata[2:]))[0]
                 except: self.errorLog('Cannot read TimePoint Description: %s' % line)
                 if pdata[1][-2:] == 'ya':
                     [entry['Year'],entry['yearUnit']] = pdata[1].split()[-2:]
                 else:
                     try:
                         # NOTE(review): for the example line above ("1776 AD, 4 July") this stores '4' under 'month'
                         # and 'July' under 'day' - confirm the intended field order.
                         ydata = rje.matchExp(r'(\d+) (\S+), (\d+) (\S+)$',pdata[1])
                         if ydata:
                             for (key,value) in zip(['Year','yearUnit','month','day'],ydata): entry[key] = value
                         else: (entry['Year'],entry['yearUnit']) = rje.matchExp(r'(\d+) (\S+)$',pdata[1])
                     except: self.errorLog('Cannot parse time from %s' % pdata[1])
                 kfield = ['keyword1','keyword2','keyword3','keyword4','keyword5']
                 try:
                     keywords = rje.matchExp(r'\(Keywords: (\S.+)\)',pdata[-1])[0].split(', ')
                     while keywords and kfield:
                         entry[kfield.pop(0)] = keywords.pop(0)
                     while kfield: entry[kfield.pop(0)] = 'blank'     # Unused keyword fields are padded
                     if keywords: self.printLog('#ERR','%d extra Keywords (%s)!' % (len(keywords),', '.join(keywords)))
                 except: self.errorLog('Cannot read Keywords (%s)' % pdata[-1])
                 db.addEntry(entry)
         ### ~ [3] Summarise Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#TP','Timepoints read from %s: %s TimePoints total.' % (ftype,db.entryNum()))
         return True
     except: self.errorLog('%s.loadTimePoints(%s) error' % (self,filename)); return False
Exemple #24
0
 def parseGO(self,glines,clear=True,obselete=False):   ### Parses GO Data from list of glines from OBO file
     '''
     Parses GO Data from list of glines from OBO file.
     >> glines:list of text lines read from OBO file. This list is emptied once its lines have been parsed.
     >> clear:opt [True] = Whether to clear self.dict before reading in data
     >> obselete:opt [False] = Whether to read in obselete terms. When False, a term flagged "is_obsolete: true"
        is removed from self.dict['GO'] and the rest of its stanza is skipped.
     << returns True/False depending on success
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if clear:
             self.dict['AltID'] = {}
             self.dict['GO'] = {}
             self.dict['Subset'] = {}
         goid = 'subsets'       # Current term being parsed. Non-empty so that lines before the first term are still read
         ### ~ [2] ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gtot = len(glines)
         for (gx,rawline) in enumerate(glines):     # Walk the list in place: popping from the front is quadratic
             self.printLog('\r#PARSE','Parsing %s GO terms: %.1f%%' % (rje.integerString(len(self.dict['GO'])),100.0*gx/gtot),newline=False,log=False)
             ## ~ [2a] ~ Establish ID of current GO terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             gline = rje.chomp(rawline)
             idmatch = rje.matchExp(r'^id:\s+GO:(\d+)',gline)
             if idmatch:
                 goid = idmatch[0]
                 self.dict['GO'][goid] = {}
                 continue
             if not goid: continue      # Between terms (or inside a skipped stanza)
             tagmatch = rje.matchExp(r'^(\S+):\s+(\S.+)$',gline)
             if not tagmatch:
                 if gline[:1] in ['[','']: goid = ''    # Blank line or "[...]" stanza header ends the current term
                 continue    # Never fall through with the tag and value left over from an earlier line
             (tag,value) = tagmatch
             ## ~ [2b] ~ Parse details ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             try:
                 if tag == 'is_obsolete' and value.lower()[:4] == 'true':
                     if not obselete:
                         self.dict['GO'].pop(goid)
                         goid = ''
                 elif tag in ['name','def']: self.dict['GO'][goid][tag] = value
                 elif tag == 'subsetdef':
                     subsetdef = rje.matchExp(r'^subsetdef: (\S+) "(\S.+)"',gline)
                     if subsetdef:
                         (subset,desc) = subsetdef
                         self.dict['Subset'][subset] = {'name':desc,'terms':[]}
                 elif tag == 'namespace':
                     g = value.split('_')
                     self.dict['GO'][goid]['type'] = '%s%s' % (g[0][0],g[1][0])     # Initials of the first two words
                 elif tag in ['is_a','relationship']:
                     parent = rje.matchExp(r'GO:(\d+)',value)[0]
                     if tag != 'is_a': tag = value.split()[0]      # Relationships are stored under their first word
                     if tag not in self.list['ParentTerms']: self.list['ParentTerms'].append(tag)
                     if tag not in self.dict['GO'][goid]: self.dict['GO'][goid][tag] = []
                     self.dict['GO'][goid][tag].append(parent)
                 elif tag == 'subset': self.dict['Subset'][gline.split()[1]]['terms'].append(goid)
                 elif tag == 'alt_id':
                     alt_id = rje.matchExp(r'GO:(\d+)',value)[0]
                     if alt_id in self.dict['AltID']: self.dict['AltID'][alt_id].append(goid)
                     else: self.dict['AltID'][alt_id] = [goid]
                 elif tag in ['xref','synonym']:
                     if tag not in self.dict['GO'][goid]: self.dict['GO'][goid][tag] = []
                     self.dict['GO'][goid][tag].append(value)
             except: self.errorLog('GO.parseGO(%s) error' % gline)
         del glines[:]      # Callers have always got their list back empty: keep that side effect
         self.printLog('\r#PARSE','Parsed %s GO terms and %d subsets.' % (rje.integerString(len(self.dict['GO'])),len(self.dict['Subset'])))
         ### ~ [3] ~ Tidy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.makeChildren()
         self.makeGOSlim()
         for subset in self.dict['Subset']: self.dict['Subset'][subset]['terms'].sort()
         self.list['ParentTerms'].sort()
         return True
     except: self.log.errorLog('GO.parseGO() failed')
     return False
Exemple #25
0
    def loadTimePoints(
            self, filename):  ### Load TimePoints from file of various formats
        '''
        Load TimePoints from file of various formats.
        >> filename:str = Path of the file to load TimePoints from
        << returns True once entries have been added and summarised. A missing file returns the result of
           self.errorLog(); any other failure (including an empty file) is logged with self.errorLog() and
           returns False.

        The first line of the file selects the parser: a 'TimePoint Name' header (delimited table), a leading '('
        (database strings, one entry per line) or, failing both, glossary text where only '(TimePoint)' lines
        are read.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not os.path.exists(filename):
                return self.errorLog('File %s missing!' % filename)
            # Handle is closed as soon as the lines are read
            with open(filename, 'r') as TPFILE:
                data = TPFILE.readlines()
            db = self.db('TimePoints')
            if not data:
                # Explicit failure rather than an IndexError below
                raise ValueError('No lines read from %s' % filename)

            ### ~ [2] Load from File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [2a] Delimited File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # NOTE(review): a whitespace split can never yield 'TimePoint Name' (it contains a space), so this
            # branch looks unreachable as written. Confirm the intended header test before relying on
            # delimited input.
            if data[0].split()[0] == 'TimePoint Name':
                ftype = 'delimited text file'
                temp = self.db().addTable(filename,
                                          mainkeys=['TimePoint Name'],
                                          name='temp')
                for entry in temp.entries():
                    db.addEntry(entry)
                db.deleteTable(temp)
            ## ~ [2b] File of Database Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif data[0][0] == '(':
                ftype = 'database string'
                for line in data:
                    line = rje.chomp(line).rstrip(' ')
                    # Trim the wrapping characters, close up ", " and split on the quote-comma-quote separators
                    pdata = line[2:-3].replace(', ', ',').split("','")
                    if not pdata: continue
                    if rje.matchExp(r'^(\d+)$', pdata[0]):
                        pdata.pop(0)  # Database output with key ID numbers
                    entry = {}
                    # Values are matched to fields by position
                    for (f, field) in enumerate(db.fields()):
                        entry[field] = pdata[f]
                    db.addEntry(entry)
            ## ~ [2c] Glossary Text File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            else:
                ftype = 'glossary text file'
                for line in data:
                    if '(TimePoint)' not in line: continue
                    # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                    pdata = line.split('. ')
                    # A third piece ending in 'ya' is merged back into the time field (pdata[1])
                    if pdata[2][-2:] == 'ya':
                        pdata[1] = '%s. %s' % (pdata[1], pdata.pop(2))
                    entry = {'TimePoint Name': pdata[0]}
                    try:
                        entry['Source URL'] = rje.matchExp(
                            r'Source: <(\S+)>', line)[0]
                    except:
                        self.errorLog('Cannot read Source URL')
                    try:
                        entry['TimePoint Description'] = rje.matchExp(
                            r'^(\S.+\S) Source: <',
                            '. '.join(pdata[2:]))[0]
                    except:
                        self.errorLog('Cannot read TimePoint Description: %s' %
                                      line)
                    if pdata[1][-2:] == 'ya':
                        [entry['Year'],
                         entry['yearUnit']] = pdata[1].split()[-2:]
                    else:
                        try:
                            # NOTE(review): for the example line above ("1776 AD, 4 July") this stores '4'
                            # under 'month' and 'July' under 'day' - confirm the intended field order.
                            ydata = rje.matchExp(r'(\d+) (\S+), (\d+) (\S+)$',
                                                 pdata[1])
                            if ydata:
                                for (key, value) in zip(
                                        ['Year', 'yearUnit', 'month', 'day'],
                                        ydata):
                                    entry[key] = value
                            else:
                                (entry['Year'],
                                 entry['yearUnit']) = rje.matchExp(
                                     r'(\d+) (\S+)$', pdata[1])
                        except:
                            self.errorLog('Cannot parse time from %s' %
                                          pdata[1])
                    kfield = [
                        'keyword1', 'keyword2', 'keyword3', 'keyword4',
                        'keyword5'
                    ]
                    try:
                        keywords = rje.matchExp(r'\(Keywords: (\S.+)\)',
                                                pdata[-1])[0].split(', ')
                        while keywords and kfield:
                            entry[kfield.pop(0)] = keywords.pop(0)
                        while kfield:  # Unused keyword fields are padded
                            entry[kfield.pop(0)] = 'blank'
                        if keywords:
                            self.printLog(
                                '#ERR', '%d extra Keywords (%s)!' %
                                (len(keywords), ', '.join(keywords)))
                    except:
                        self.errorLog('Cannot read Keywords (%s)' % pdata[-1])
                    db.addEntry(entry)
            ### ~ [3] Summarise Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.printLog(
                '#TP', 'Timepoints read from %s: %s TimePoints total.' %
                (ftype, db.entryNum()))
            return True
        except:
            self.errorLog('%s.loadTimePoints(%s) error' % (self, filename))
            return False
Exemple #26
0
    def mapSeq(self,seqlist,blast,search,outputmap=True): ### Performs actual mapping of sequence
        '''
        Performs actual mapping of sequence.
        >> seqlist:SeqList object containing Sequence Object to be mapped
        >> blast:BLAST_Run object to perform BLAST and GABLAM
        >> search:Current BLAST search object for mapping
        >> outputmap:boolean = Whether to output mapping into a file [True]
        << returns shortName() of mapped sequence (or None if none). Returns False if an error was caught and
           logged with self.errorLog().
        '''
        def appendFasta(fasfile,name,sequence):     # Appends one fasta record and closes the file again
            with open(fasfile,'a') as FAS: FAS.write('>%s\n%s\n' % (name,sequence))
        seq = None      # Bound before the try so that the error handler below can always build its message
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seq = seqlist.getSeq(format='tuple')
            mapseq = self.obj['MapDB']
            hits = blast.db('Hit').indexEntries('Query',search)
            self.printLog('#HITS','%s vs %s = %d hits' % (search,blast.str['DBase'],len(hits)))
            hitseq = {}; hitdata = {}
            for entry in hits:
                hitseq[entry['Hit']] = mapseq.getDictSeq(entry['Hit'],format='tuple')
                hitdata[entry['Hit']] = entry
            resdict = {'Query':search,'Hit':None,'Method':'Failed','Query_Species':rje_sequence.specCodeFromName(seq[0])}
            ### ~ [1] Order Hits and Check Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (hits,hitdict) = self.orderHits(seq,hits,hitseq)
            self.debug(hits)
            self.debug(hitdict)
            ### ~ [2] Attempt mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for method in self.list['Mapping']:
                resdict['Hit'] = self.mapHit(seq,hits,hitdict,method.lower())
                if resdict['Hit']:
                    resdict['Method'] = method[:1].upper() + method[1:].lower()
                    break
                # Compared in lower case, matching how the method name is passed to mapHit() above
                elif method.lower() == 'gablam' and (len(hits) > 0):
                    resdict['Method'] = 'Rejected'
            self.debug(resdict)
            ### ~[3] Output! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if resdict['Hit']:  #hitdict[hit]['Data']['ShortName']
                hit = resdict['Hit']['Hit']     # resdict['Hit'] is the BLAST table entry for Hit
                shortname = hitdict[hit]['Data']['ShortName']   # This is just hit!
                self.printLog('#MAP','%s mapped to %s (by %s)' % (seq[0].split()[0],shortname,resdict['Method']))
                ## Update Stats ##
                self.debug('')
                resdict['BlastRank'] = hitdata[hit]['Rank']
                for key in hitdict[hit]: resdict[key] = hitdict[hit][key]
                ## Fasta and Redundancy ##
                if shortname in self.list['Mapped']: self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                else:
                    self.list['Mapped'].append(shortname)
                    if outputmap: appendFasta(self.str['MapFas'],hitseq[hit][0],hitseq[hit][1])
                resdict['Hit_Species'] = hitdict[hit]['Data']['SpecCode']
                resdict['Hit'] = shortname
            else:
                ### ~ [2] GREP-based search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                if 'grep' in self.list['Mapping']:
                    import subprocess   # Local import: only needed for the grep fallback
                    greplist = []; hitseq = ''
                    self.printLog('#GREP','grep %s %s -B 1' % (seq[1],blast.str['DBase']),log=False)
                    # Run grep from an argument list (no shell), so the sequence and the database path reach grep
                    # verbatim whatever characters they contain. -e marks the sequence as the pattern.
                    grep = subprocess.Popen(['grep','-B','1','-e',seq[1],blast.str['DBase']],stdout=subprocess.PIPE,universal_newlines=True)
                    for line in grep.communicate()[0].splitlines(True):
                        if line[:1] == '>': greplist.append(line[1:].split()[0])
                        elif not hitseq: hitseq = rje.chomp(line)   # Only the first non-header line is kept
                    if greplist:
                        shortname = greplist.pop(0)
                        resdict['Hit'] = shortname
                        resdict['Method'] = 'Grep'
                        resdict['Qry_ID'] = '100.0'
                        resdict['Qry_Len'] = len(seq[1])
                        resdict['Hit_Len'] = len(hitseq)
                        # NOTE(review): ratio is hit length / query length (not query / hit) - confirm this is the intended Hit_ID
                        resdict['Hit_ID'] = 100.0 * len(hitseq) / len(seq[1])
                        try: resdict['Hit_Species'] = shortname.split('_')[1]
                        except IndexError: pass     # Name without a "_" part: Hit_Species is simply left unset
                        if shortname in self.list['Mapped']:
                            self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                        else:
                            self.list['Mapped'].append(shortname)
                            if outputmap: appendFasta(self.str['MapFas'],shortname,hitseq)
                    for extra in greplist: self.printLog('#GREP','Warning! Query "%s" also hit "%s" with grep!' % (seq[0].split()[0],extra))
                if not resdict['Hit'] and self.bool['Combine']:
                    ## Fasta and Redundancy ##
                    shortname = seq[0].split()[0]
                    if shortname in self.list['Mapped']:
                        self.printLog('#FAS','%s already in output - not duplicating in %s' % (shortname,self.str['MapFas']))
                    else:
                        self.list['Mapped'].append(shortname)
                        if outputmap: appendFasta(self.str['MapFas'],seq[0],seq[1])
                # NOTE(review): a Grep-mapped query also reaches this branch (resdict['Hit'] is set, so the test above
                # fails) and is written to MissFas - confirm whether that is intended.
                elif outputmap: appendFasta(self.str['MissFas'],seq[0],seq[1])
                self.printLog('#MISS','%s mapping %s' % (resdict['Query'],resdict['Method']))
            if outputmap:
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
            return resdict['Hit']

        except:
            if seq: seqname = seq[0]
            else: seqname = search      # Sequence was never loaded: report the search name instead
            self.errorLog('Fudgesticks! SeqMapper.mapSeq(%s) has died!' % seqname,quitchoice=True)
            return False
Exemple #27
0
 def splitMascot(self):  ### Reads the MASCOT file and splits into header, hits and unmatched files.
     '''
     Reads the MASCOT file and splits into header, hits and unmatched files.
     << returns True when the file has been split and the tables saved. Returns the result of self.printLog() if
        the existing hits file is used instead (force=F). Any failure is logged with self.errorLog() and
        returns False.

     Lines before the results header ('prot_hit_num') are written to the header file. Later lines fill the 'mascot'
     table until a 'Peptide matches' line switches to the 'nohits' table; iTRAQ ratios go to a separate 'itraq' table.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         infile = self.getStr('MASCOT')
         if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(infile))
         #x#self.deBug(self.basefile())
         headfile = '%s.header.txt' % self.basefile()
         hitsfile = '%s.mascot.csv' % self.basefile()
         if rje.isYounger(infile,hitsfile) == hitsfile and not self.force():
             return self.printLog('#FILE','%s file found (force=F)' % hitsfile)
         ### ~ [1] Split MASCOT~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Read once and keep the lines: the iTRAQ header needs to look ahead at the line after the results header
         with open(infile,'r') as MASCOT: mlines = MASCOT.readlines()
         headlines = []
         csvhead = []
         mdb = None
         itraq = []
         prot_data = {}
         for (mx,mline) in enumerate(mlines,1):     # mx = Index of next line in case needed for iTRAQ reading!
             ## ~ [1a] Skip down until Header found ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not headlines and mline.find('Header') < 0: continue
             ## ~ [1b] Add Header lines to headlines until results headers found ~~~~~~~~~~~~~~~ ##
             if not csvhead and mline.find('prot_hit_num') < 0: headlines.append(mline); continue
             ## ~ [1c] Sort out MASCOT results headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if mline.find('prot_hit_num') >= 0:
                 ## ~ Read Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 with open(headfile,'w') as HEAD: HEAD.writelines(headlines)
                 csvhead = rje.readDelimit(' '.join(rje.chomp(mline).split()),',')
                 while '' in csvhead: csvhead.remove('')
                 ## ~ Sort out iTRAQ headers (missing) ~~~~~~~~~ ##
                 if self.getBool('iTRAQ'):
                     iline = mlines[mx]
                     for isplit in rje.readDelimit(iline,',')[len(csvhead):]:  # Should be start of iTRAQ data
                         if '/' in isplit: itraq.append(isplit)
                     self.printLog('#ITRAQ',' '.join(itraq))
                     csvhead += itraq
                     idb = db.addEmptyTable('itraq',['prot_hit_num','prot_acc','prot_desc','itraq','ratio','n','geomean','summary'],keys=['prot_hit_num','itraq'])
                     idb.info['Delimit'] = ','
                 ## ~ Add emPAI header (also missing) ~~~~~~~~~~ ##
                 if self.getBool('emPAI'): csvhead.append('empai')
                 ## ~ Set up Database Table ~~~~~~~~~~~~~~~~~~~~ ##
                 self.printLog('#HEAD','; '.join(csvhead))
                 mdb = db.addEmptyTable('mascot',csvhead,keys=['prot_hit_num','pep_query'])
                 mdb.info['Delimit'] = ','
             elif mline.find('Peptide matches') >= 0:
                 mdb.saveToFile()
                 if self.getBool('emPAI'): csvhead.remove('empai')
                 mdb = db.addEmptyTable('nohits',csvhead,keys=['pep_query'])
                 for field in list(mdb.fields()):    # Loop over a copy: dropField() may edit the list being looped over
                     if field[:4] == 'prot': mdb.dropField(field)
                 mdb.info['Delimit'] = ','
                 continue
             elif rje.chomp(mline):
                 #self.deBug('%s ... %s' % (mline[:20],mline.find('Peptide matches')))
                 data = rje.readDelimit(mline,',')
                 entry = {}; pretraq = True     # pretraq: still in the plain columns, before any emPAI/iTRAQ label
                 #self.deBug(csvhead); self.deBug(itraq);
                 for d in range(len(csvhead)+len(itraq)):
                     if d >= len(data): break
                     # dhead carries over between columns: after a label it names the field for the next value
                     if data[d] in itraq: dhead = data[d]; pretraq = False
                     elif data[d] == 'emPAI': entry['empai'] = data[d+1]; pretraq = False
                     elif pretraq and d < len(csvhead): dhead = csvhead[d]
                     elif pretraq: continue      # Unmatched peptides will not have emPAI or iTRAQ data
                     #self.deBug('%s > %s' % (data[d],dhead))
                     if d and data[d-1] == 'emPAI': continue
                     elif data[d] in itraq + ['emPAI']: continue
                     elif dhead not in entry: entry[dhead] = data[d]
                     #self.deBug('%s = %s' % (dhead,entry[dhead]))
                 if entry['prot_acc']: prot_data[entry['prot_hit_num']] = {'prot_acc':entry['prot_acc'],'prot_desc':entry['prot_desc']}
                 if self.getBool('iTRAQ') and 'Quantitation summary for protein' in data:
                     d = data.index('Quantitation summary for protein') + 1
                     # Use the accession/description remembered for this hit number where there is one
                     if entry['prot_hit_num'] in prot_data:
                         pacc = prot_data[entry['prot_hit_num']]['prot_acc']
                         pdesc = prot_data[entry['prot_hit_num']]['prot_desc']
                     else:
                         pacc = entry['prot_acc']
                         pdesc = entry['prot_desc']
                     while d < len(data):
                         if data[d] in itraq:
                             idb.addEntry({'prot_hit_num':entry['prot_hit_num'],'prot_acc':pacc,'prot_desc':pdesc,
                                           'itraq':data[d],'ratio':data[d+1],'n':data[d+2],'geomean':data[d+3],'summary':data[d+4]})
                         d += 1
                 #self.deBug(entry)
                 if entry['prot_hit_num'] or entry['pep_query']: mdb.addEntry(entry)
         # Say what went wrong instead of failing on None.saveToFile()
         if mdb is None: raise ValueError('No "prot_hit_num" results header found in %s' % infile)
         mdb.saveToFile()
         if self.getBool('iTRAQ'): idb.saveToFile()
         self.deBug('')
         return True
     except: self.errorLog('Error reading MASCOT file'); return False
 def parsePileup(self,tname,filename,wtdb=None):  ### Extracts, filters and processes PileUp data
     '''Extracts, filters and processes PileUp data.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         table = self.db().addEmptyTable(tname,['Locus','Pos','Seq','N','QN','Major','MajFreq'],keys=['Locus','Pos'])
         qc = []
         if wtdb: table.addField('WTFreq')
         PILEUP = open(filename,'r'); px = 0; ex = 0
         PILEOUT = open('%s.%s.tdt' % (self.baseFile(),tname),'w')
         rje.writeDelimit(PILEOUT,outlist=table.fields(),delimit='\t')
         locus = None
         refseq = ''     #? What is this used for?
         majors = []     #? What is this used for?
         ### ~ [2] Process each entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for line in PILEUP:
             # Split line up into data. Should be: locus, position, reference, no. reads, read data, qualscores
             data = string.split(rje.chomp(line))
             if not data: break
             self.progLog('\r#PARSE','Parsing %s: %s pos...' % (filename,rje.iStr(px)),rand=0.01); px += 1
             ## ~ [2a] Extract Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             entry = {'Locus':data[0],'Pos':int(data[1]),'Seq':data[2],'N':int(data[3]),'QN':0}
             if entry['Locus'] != locus: locus = entry['Locus']; refseq = ''; majors = []
             refseq += data[2]
             #entry => 'Ref','Pos','Seq','N','Reads','Qual'
             rseq = data[4]
             reads = []
             delx = 0
             while rseq:                    
                 try:
                     if rseq[:1] in ['.',',']: reads.append(entry['Seq']); rseq = rseq[1:]
                     elif rseq[:1] == '^': rseq = rseq[2:]
                     #elif rseq[:1] == '*':
                     #    reads.append('-1%s' % entry['Seq'].upper())
                     #    rseq = rseq[1:]
                     elif rseq[:1] in ['-','+']:
                         ilen = string.atoi(rje.matchExp('^(\d+)',rseq[1:])[0])
                         indel = rseq[len('%s' % ilen)+1:][:ilen]
                         #self.deBug('%s: %s' % (rseq,indel))
                         if rseq[:1] == '-':
                             delx += 1
                             reads.append(rseq[:len('%s' % ilen)+ilen+1].upper())
                         else:
                             reads[-1] += indel.upper()
                         #self.deBug(reads[-1])
                         rseq = rseq[len('%s' % ilen)+ilen+1:]
                     elif rseq[:1] in ['$']: rseq = rseq[1:]
                     else:
                         if rseq[0].upper() not in 'ATGCN*': print ' ???', rseq[0].upper(), '???'
                         reads.append(rseq[0].upper()); rseq = rseq[1:]
                 except:
                     self.errorLog('!')
                     self.deBug(rseq)
                     raise ValueError
             if len(reads) != (entry['N'] + delx):
                 self.deBug('%s = %d' % (data[4],entry['N']))
                 self.deBug('%s = %d' % (reads,len(reads)))
                 self.errorLog('Read versus Read Count mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                 raise ValueError
             ## ~ [2b] Convert Quality Scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             qual = []
             for q in data[5]:
                 # Gaps do not have a quality score, so fill these in first
                 while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
                 # Then append actual qv
                 qual.append(ord(q) - 33)
                 qc += [0] * (qual[-1] - len(qc)); qc[qual[-1]-1] += 1
             while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
             while '*' in reads: reads[reads.index('*')] = '-'   #'-1%s' % entry['Seq'].upper()
             if len(reads) != len(qual):
                 self.deBug('%s = %d' % (reads,len(reads)))
                 self.deBug('%s = %d' % (qual,len(qual)))
                 self.deBug(data)
                 self.errorLog('Read versus Quality length mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                 raise ValueError
             ## ~ [2c] Filter low quality ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if entry['Pos'] in [190359]:    #100,98901,183697,169284,
                 self.deBug(qual)
                 self.deBug(reads)
                 self.deBug(qc)
             # Remove (from back) any reads that do not meet QV cutoff
             for r in range(len(qual)-1,-1,-1):
                 if qual[r] < self.getInt('QCut'): qual.pop(r); reads.pop(r)
             entry['QN'] = len(reads)
             ## ~ [2d] Major Allele ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             alleles = {}    # Dictionary of {nt:count}
             # Setup major allele
             if reads: major = reads[0]
             else: major = '-'; alleles[major] = 0
             # Cycle through reads. Keep most abundant allele as major - or reference allele if tied.
             for read in reads:
                 if read in alleles: alleles[read] += 1
                 else: alleles[read] = 1
                 if alleles[read] > alleles[major] or (read == entry['Seq'] and alleles[read] == alleles[major]): major = read
             entry['Major'] = major
             majors.append(major)
             if reads: entry['MajFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[major]) / float(len(reads)))
             else: entry['MajFreq'] = 0.0
             if wtdb:
                 try:
                     wtmajor = self.dict['WTMajor'][locus][entry['Pos']-1]
                     if wtmajor in alleles and reads: entry['WTFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[wtmajor]) / float(len(reads)))
                     else: entry['WTFreq'] = 0.0
                     if wtmajor != major: self.debug(entry)
                     elif locus == 'chrIV_S288C__BK006938.2' and entry['Pos'] == 271733: self.debug(entry)
                 except: self.warnLog('WTFreq Error (%s:Pos=%d) [Probably no WT read mapped]' % (locus,entry['Pos'])); entry['WTFreq'] = 0.0
             if entry['Pos'] in [190359]:    #100,98901,183697,169284,
                 self.deBug(qual)
                 self.deBug(reads)
                 self.deBug(alleles)
                 self.deBug(entry)
                 self.deBug(line)
             #table.addEntry(entry)
             outlist = []
             for field in table.fields(): outlist.append(entry[field])
             rje.writeDelimit(PILEOUT,outlist,delimit='\t'); ex += 1
         self.printLog('\r#PARSE','Parsed %s: %s entries from %s lines.' % (filename,rje.iStr(ex),rje.iStr(px)))
         PILEOUT.close()
         PILEUP.close()
         ### ~ [3] Save QC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         QC = open('%s.%s.QC.tdt' % (self.baseFile(),tname),'w')
         QC.write('Qual\tCount\n')
         for q in range(len(qc)):
             try: QC.write('%d\t%d\n' % (q+1,qc[q]))
             except: self.errorLog('!')
         QC.close()
         return table
     except: self.errorLog('%s.parsePileup(%s) error' % (self,filename)); return None